├── .test ├── qc │ ├── results │ │ └── genome1 │ │ │ ├── test_qc_raw.vcf.gz.tbi │ │ │ ├── data │ │ │ └── genome │ │ │ │ └── genome1.fna.fai │ │ │ ├── test_qc_raw.vcf.gz │ │ │ └── summary_stats │ │ │ └── test_qc_bam_sumstats.txt │ └── config │ │ ├── test.csv │ │ ├── samples.csv │ │ ├── resources.yaml │ │ ├── config.yaml │ │ └── test_qc_gls_config.yaml ├── postprocess │ ├── results │ │ └── genome1 │ │ │ ├── test_postprocess_raw.vcf.gz.tbi │ │ │ ├── test_postprocess_raw.vcf.gz │ │ │ └── data │ │ │ └── genome │ │ │ └── genome1.fna.fai │ └── config │ │ ├── test.csv │ │ ├── samples.csv │ │ ├── resources.yaml │ │ ├── test_qc_gls_config.yaml │ │ └── config.yaml ├── trackhub │ ├── results │ │ └── genome1 │ │ │ ├── data │ │ │ └── genome │ │ │ │ └── genome1.fna.fai │ │ │ ├── test_postprocess_clean_snps.vcf.gz │ │ │ └── test_postprocess_clean_snps.vcf.gz.tbi │ └── config │ │ ├── test.csv │ │ ├── samples.csv │ │ ├── resources.yaml │ │ ├── test_qc_gls_config.yaml │ │ └── config.yaml ├── ecoli │ ├── data │ │ ├── local_genome │ │ │ └── local_genome.fna.gz │ │ └── local_fastq │ │ │ ├── my_sample1_1.fastq.gz │ │ │ ├── my_sample1_2.fastq.gz │ │ │ ├── my_sample2_1.fastq.gz │ │ │ └── my_sample2_2.fastq.gz │ ├── config │ │ ├── ecoli_config_genome.csv │ │ ├── local_and_sra.csv │ │ ├── ecoli_samples.csv │ │ ├── resources.yaml │ │ └── config.yaml │ └── workflow │ │ └── scripts │ │ └── samples_to_keep.py └── ci │ └── config │ ├── samples.csv │ ├── resources.yaml │ └── config.yaml ├── docs ├── img │ └── logo.png ├── requirements.txt ├── datasets.md ├── conf.py ├── index.md ├── modules.md ├── executing.md └── examples.md ├── workflow ├── envs │ ├── bcftools.yml │ ├── sambamba.yml │ ├── sentieon.yml │ ├── mappability.yml │ ├── ucsc.yml │ ├── angsd.yml │ ├── cov_filter.yml │ ├── bam2vcf.yml │ └── fastq2bam.yml ├── modules │ ├── mk │ │ ├── envs │ │ │ ├── mk.yml │ │ │ └── ncbi.yml │ │ ├── config │ │ │ └── config.yaml │ │ ├── common.smk │ │ └── Snakefile │ ├── qc │ │ ├── config │ │ │ ├── test.csv │ │ │ ├── config.yaml │ │ │ └── test_qc_gls_config.yaml │ │ ├── envs │ │ │ ├── admixture.yml │ │ │ ├── subsample_snps.yml │ │ │ ├── vcftools_individuals.yml │ │ │ ├── plink.yml │ │ │ └── qc.yml │ │ ├── common.smk │ │ ├── scripts │ │ │ ├── contigs4admixture.py │ │ │ └── qc_dashboard_render.R │ │ └── Snakefile │ ├── postprocess │ │ ├── envs │ │ │ ├── bed.yml │ │ │ └── filter.yml │ │ ├── config │ │ │ └── config.yaml │ │ └── Snakefile │ ├── template │ │ ├── config │ │ │ └── config.yaml │ │ └── Snakefile │ └── trackhub │ │ ├── envs │ │ └── trackhub.yml │ │ ├── config │ │ └── config.yaml │ │ ├── scripts │ │ ├── vcftools_out_to_bg.py │ │ └── write_hub_files.py │ │ ├── html │ │ └── hub_description.html │ │ └── Snakefile ├── scripts │ ├── samples_to_keep.py │ ├── make_intervals.py │ ├── create_coverage_thresholds.py │ └── create_coverage_bed.py ├── Snakefile ├── rules │ ├── fastq2bam.smk │ ├── mappability.smk │ ├── fastq.smk │ ├── reference.smk │ ├── sumstats.smk │ ├── intervals.smk │ ├── cov_filter.smk │ ├── bam2vcf_gatk.smk │ ├── sentieon.smk │ └── bam2vcf_gatk_intervals.smk └── snparcher_utils │ ├── __init__.py │ └── write_samples.py ├── .readthedocs.yaml ├── .github └── workflows │ ├── ci.yaml │ └── main.yaml ├── .gitignore ├── LICENSE ├── README.md ├── workflow-profiles └── default │ └── config.yaml └── config └── config.yaml /.test/qc/results/genome1/test_qc_raw.vcf.gz.tbi: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.test/postprocess/results/genome1/test_postprocess_raw.vcf.gz.tbi: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.test/trackhub/results/genome1/data/genome/genome1.fna.fai: -------------------------------------------------------------------------------- 1 | JAKDEW010000001.1 53793026 30 60 61 2 | -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/docs/img/logo.png -------------------------------------------------------------------------------- /.test/qc/results/genome1/data/genome/genome1.fna.fai: -------------------------------------------------------------------------------- 1 | JAKDEW010000001.1 1 3 2 | JAKDEW010000002.1 1 3 3 | -------------------------------------------------------------------------------- /.test/qc/config/test.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject, 2 | sample,sample1,genome1,1,test,x 3 | 4 | -------------------------------------------------------------------------------- /.test/trackhub/config/test.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject, 2 | sample,sample1,genome1,1,test,x 3 | 4 | -------------------------------------------------------------------------------- /workflow/envs/bcftools.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bcftools==1.10 -------------------------------------------------------------------------------- /workflow/modules/mk/envs/mk.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - degenotate -------------------------------------------------------------------------------- /.test/postprocess/config/test.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject, 2 | sample,sample1,genome1,1,test,x 3 | 4 | -------------------------------------------------------------------------------- /workflow/modules/qc/config/test.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject, 2 | sample,sample1,genome1,1,test,x 3 | 4 | -------------------------------------------------------------------------------- /workflow/modules/qc/envs/admixture.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - admixture==1.3.0 -------------------------------------------------------------------------------- /.test/qc/results/genome1/test_qc_raw.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/qc/results/genome1/test_qc_raw.vcf.gz -------------------------------------------------------------------------------- 
/workflow/modules/postprocess/envs/bed.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bedtools==2.30 7 | -------------------------------------------------------------------------------- /workflow/modules/postprocess/envs/filter.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bcftools==1.16 7 | -------------------------------------------------------------------------------- /workflow/modules/qc/envs/subsample_snps.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bcftools==1.12 7 | -------------------------------------------------------------------------------- /.test/ecoli/data/local_genome/local_genome.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_genome/local_genome.fna.gz -------------------------------------------------------------------------------- /.test/qc/config/samples.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject 2 | testENA,EK7.12,GCA_000008865.2,ERR699557,Escherichia coli,PRJNA563564 3 | -------------------------------------------------------------------------------- /workflow/envs/sambamba.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - sambamba==0.8.0 7 | - python==3.11.4 8 | -------------------------------------------------------------------------------- /workflow/modules/qc/envs/vcftools_individuals.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - vcftools==0.1.16 7 | -------------------------------------------------------------------------------- /.test/ecoli/data/local_fastq/my_sample1_1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample1_1.fastq.gz -------------------------------------------------------------------------------- /.test/ecoli/data/local_fastq/my_sample1_2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample1_2.fastq.gz -------------------------------------------------------------------------------- /.test/ecoli/data/local_fastq/my_sample2_1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample2_1.fastq.gz -------------------------------------------------------------------------------- /.test/ecoli/data/local_fastq/my_sample2_2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample2_2.fastq.gz 
-------------------------------------------------------------------------------- /.test/trackhub/config/samples.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject 2 | testENA,EK7.12,GCA_000008865.2,ERR699557,Escherichia coli,PRJNA563564 3 | -------------------------------------------------------------------------------- /.test/postprocess/config/samples.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject 2 | testENA,EK7.12,GCA_000008865.2,ERR699557,Escherichia coli,PRJNA563564 3 | -------------------------------------------------------------------------------- /workflow/modules/qc/envs/plink.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - plink2==2.00a2.3 7 | - plink==1.90b6.21 8 | -------------------------------------------------------------------------------- /workflow/envs/sentieon.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - sentieon 7 | - python==3.11.4 8 | - samtools>=1.12 9 | -------------------------------------------------------------------------------- /workflow/envs/mappability.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bedtools==2.30.0 7 | - genmap>=1.3.0 8 | - python==3.11.4 -------------------------------------------------------------------------------- /.test/postprocess/results/genome1/test_postprocess_raw.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/postprocess/results/genome1/test_postprocess_raw.vcf.gz -------------------------------------------------------------------------------- /workflow/modules/mk/envs/ncbi.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - ncbi-datasets-cli==11.25.1 7 | - p7zip==16.02 8 | - pigz==2.6 -------------------------------------------------------------------------------- /.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Defining the exact version will make sure things don't break 2 | sphinx==5.3.0 3 | sphinx_rtd_theme==1.1.1 4 | readthedocs-sphinx-search==0.1.1 5 | myst-parser==1.0.0 6 | -------------------------------------------------------------------------------- /.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz.tbi 
-------------------------------------------------------------------------------- /workflow/envs/ucsc.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python==3.11.4 7 | - ucsc-fatotwobit==377 8 | - ucsc-twobitinfo==377 9 | 10 | -------------------------------------------------------------------------------- /workflow/modules/template/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/samples.csv" # path to the sample metadata CSV 6 | 7 | -------------------------------------------------------------------------------- /.test/postprocess/results/genome1/data/genome/genome1.fna.fai: -------------------------------------------------------------------------------- 1 | SCAF_1 122379970 8 60 61 2 | SCAF_2 108119840 124419653 60 61 3 | SCAF_3 107133695 234341499 60 61 4 | SCAF_4 104519870 343260764 60 61 5 | SCAF_5 94801293 449522640 60 61 6 | -------------------------------------------------------------------------------- /workflow/envs/angsd.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - angsd==0.937 7 | - samtools>=1.12 8 | - python>=3.6 9 | - numpy 10 | - scipy 11 | - cython 12 | - gxx 13 | -------------------------------------------------------------------------------- /workflow/envs/cov_filter.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bedtools==2.30.0 7 | - mosdepth==0.3.10 8 | - d4tools>=0.3.10 9 | - clam>=0.1.2 10 | - bedtk==0.0.r25.dirty 11 | 12 | -------------------------------------------------------------------------------- /workflow/modules/mk/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/samples.csv" # path to the sample metadata CSV 6 | final_prefix: "" # prefix for final output files 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /workflow/modules/trackhub/envs/trackhub.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - bedtools==2.30 7 | - bcftools==1.12 8 | - vcftools==0.1.16 9 | - ucsc-bedgraphtobigwig==377 10 | - ucsc-bedtobigbed==377 11 | - ucsc-bedsort==466 -------------------------------------------------------------------------------- /workflow/modules/trackhub/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/samples.csv" # path to the sample metadata CSV 6 | final_prefix: "" # prefix for final output files 7 | trackhub_email: "email@website.com" 8 | -------------------------------------------------------------------------------- /workflow/envs/bam2vcf.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 
5 | dependencies: 6 | - gatk4==4.1.8.0 7 | - freebayes==1.3.2 8 | - picard==2.22.8 9 | - samtools==1.11 10 | - vcftools==0.1.16 11 | - bedtools==2.29.2 12 | - pyyaml==5.3.1 13 | - htslib==1.11 14 | - bzip2==1.0.8 15 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.11" 7 | 8 | # Build from the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Explicitly set the version of Python and its requirements 13 | python: 14 | install: 15 | - requirements: docs/requirements.txt 16 | -------------------------------------------------------------------------------- /workflow/modules/qc/envs/qc.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - r-base==4.1.3 7 | - r-tidyverse==1.3.1 8 | - r-plotly==4.9.4.1 9 | - r-flexdashboard==0.5.2 10 | - r-ape==5.5 11 | - r-reshape2==1.4.4 12 | - bioconductor-ggtree==3.2.0 13 | - r-ggmap=3.0.0 14 | - r-ggplot2=3.3.5 15 | -------------------------------------------------------------------------------- /.test/ecoli/config/ecoli_config_genome.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,Run,Organism,BioProject,fq1,fq2 2 | SAMN12676327,EK7.12,SRR10058855,Escherichia coli,PRJNA563564,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz 3 | SAMN12676342,EK7.30,SRR10058838,Escherichia coli,PRJNA563564,data/local_fastq/my_sample2_1.fastq.gz,data/local_fastq/my_sample2_2.fastq.gz 4 | -------------------------------------------------------------------------------- /workflow/envs/fastq2bam.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - samtools==1.14 7 | - fastp==0.20.1 8 | - bwa==0.7.17 9 | - sra-tools==3.0.0 10 | - ncbi-datasets-cli>=17.1.0 11 | - p7zip==16.02 12 | - pigz==2.6 13 | - curl>7.73.0 14 | - pip==22.0.4 15 | - bbmap==38.96 16 | - pip: 17 | - ffq 18 | -------------------------------------------------------------------------------- /.test/ecoli/config/local_and_sra.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,fq1,fq2,refPath 2 | SAMN12676327,EK7.12,GCA_000008865.2,SRR10058855,Escherichia coli,PRJNA563564,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz,data/local_genome/local_genome.fna.gz 3 | SAMN12676342,EK7.30,GCA_003018455.1,SRR10058838,Escherichia coli,PRJNA563564 4 | -------------------------------------------------------------------------------- /workflow/scripts/samples_to_keep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | infile = sys.argv[1] 5 | DEPTH_CUTOFF = 2 6 | samps = [] 7 | with open(infile, "r") as f: 8 | next(f) 9 | for line in f: 10 | line = line.strip().split() 11 | if float(line[2]) >= DEPTH_CUTOFF: 12 | samps.append(line[0]) 13 | 14 | for s in samps: 15 | print(s) 16 | -------------------------------------------------------------------------------- /.test/ci/config/samples.csv: 
-------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,SampleType 2 | test1,test1lib,GCF_000146045.2,SRR22893406,yeast,project,exclude 3 | test2,test2lib,GCF_000146045.2,SRR22893439,yeast,project,exclude 4 | test3,test3lib,GCF_000146045.2,SRR22893395,yeast,project 5 | test4,test4lib,GCF_000146045.2,SRR22893419,yeast,project 6 | test5,test5lib,GCF_000146045.2,SRR22893436,yeast,project -------------------------------------------------------------------------------- /.test/ecoli/workflow/scripts/samples_to_keep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | infile = sys.argv[1] 5 | DEPTH_CUTOFF = 2 6 | samps = [] 7 | with open(infile, "r") as f: 8 | next(f) 9 | for line in f: 10 | line = line.strip().split() 11 | if float(line[2]) >= DEPTH_CUTOFF: 12 | samps.append(line[0]) 13 | 14 | for s in samps: 15 | print(s) 16 | -------------------------------------------------------------------------------- /.test/ecoli/config/ecoli_samples.csv: -------------------------------------------------------------------------------- 1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,fq1,fq2,refPath 2 | SAMN12676327,EK7.12,GCA_000008865.2,SRR10058855,Escherichia coli,PRJNA563564,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz,data/local_genome/local_genome.fna.gz 3 | SAMN12676342,EK7.30,GCA_003018455.1,SRR10058838,Escherichia coli,PRJNA563564,data/local_fastq/my_sample2_1.fastq.gz,data/local_fastq/my_sample2_2.fastq.gz 4 | -------------------------------------------------------------------------------- /workflow/modules/qc/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | final_prefix: "" # prefix for final output files 7 | 8 | ############################## 9 | # Variables you *might* need to change 10 | ############################## 11 | 12 | ## QC options ## 13 | nClusters: 3 14 | GoogleAPIKey: 15 | min_depth: 2 16 | -------------------------------------------------------------------------------- /workflow/modules/template/Snakefile: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | 5 | # Get utils. 
This is not great, but we can move to setup.py and install via pip later if want 6 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve() 7 | if str(utils_path) not in sys.path: 8 | sys.path.append(str(utils_path)) 9 | 10 | import pandas as pd 11 | import snparcher_utils 12 | 13 | configfile: "config/config.yaml" 14 | wildcard_constraints: 15 | window="\d+" 16 | 17 | samples = snparcher_utils.parse_sample_sheet(config) 18 | 19 | # Define rules here 20 | rule all: 21 | pass -------------------------------------------------------------------------------- /docs/datasets.md: -------------------------------------------------------------------------------- 1 | # Datasets Produced by snpArcher 2 | A number of resequencing datasets have been run with snpArcher generating consistent variant calls, available via [Globus](https://www.globus.org/) in the [Comparative Population Genomics Data collection](https://app.globus.org/file-manager?origin_id=d2b75419-85ad-4871-8f34-003d73bbae7d&origin_path=%2F). Details of data processing are described [here](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1). If you use any of these datasets in your projects, please cite both the snpArcher paper and the original data producers. 3 | 4 | If you would like to contribute datasets you have created using snpArcher, please get in touch! 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | paths-ignore: 5 | - "docs/**" 6 | - "**.md" 7 | branches: 8 | - main 9 | 10 | 11 | jobs: 12 | Testing: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Test whole workflow 17 | uses: snakemake/snakemake-github-action@v1.25.1 18 | with: 19 | directory: .test/ci/ 20 | snakefile: workflow/Snakefile 21 | args: "--use-conda --show-failed-logs -j 1 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 22 | stagein: "conda config --set channel_priority strict" 23 | -------------------------------------------------------------------------------- /.test/qc/results/genome1/summary_stats/test_qc_bam_sumstats.txt: -------------------------------------------------------------------------------- 1 | Sample Total_Reads Percent_mapped Num_duplicates Percent_properly_paired Fraction_reads_pass_filter Num_filtered_reads 2 | test_A01 74002570 90.94 0 84.18 0.02587990396094019 2240784 3 | test_B01 80220874 95.94 0 89.27 0.6775766480787638 60496912 4 | test_C01 110928249 68.51 0 62.83 0.05437316857757944 6623216 5 | test_D01 87593075 96.97 0 88.18 0.9819319302461245 97194908 6 | test_E01 102503748 97.45 0 88.18 0.9798935985581342 112458212 7 | test_F01 118251228 97.90 0 90.86 0.9858274422405399 136304548 8 | test_G01 63596400 97.38 0 90.66 0.043010646075266695 3166694 9 | test_H01 102382268 97.54 0 88.44 0.08283396693724794 9375764 10 | test_A02 49840099 91.05 0 84.07 0.019503543101684753 1130540 -------------------------------------------------------------------------------- /workflow/modules/postprocess/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/samples.csv" # name of the sample metadata CSV 6 | final_prefix: "" # prefix for final output files 7 | 8 | 
############################## 9 | # Variables you *might* need to change 10 | ############################## 11 | 12 | ## Filtering options ## 13 | 14 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable. 15 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable. 16 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable. 17 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | 3 | # -- Project information 4 | 5 | project = 'snpArcher' 6 | copyright = '2023, Cade Mirchandani' 7 | author = 'Cade Mirchandani' 8 | 9 | release = '0.1' 10 | version = '0.1.0' 11 | 12 | # -- General configuration 13 | 14 | extensions = [ 15 | 'sphinx.ext.duration', 16 | 'sphinx.ext.doctest', 17 | 'sphinx.ext.autodoc', 18 | 'sphinx.ext.autosummary', 19 | 'sphinx.ext.intersphinx', 20 | 'myst_parser' 21 | ] 22 | 23 | intersphinx_mapping = { 24 | 'python': ('https://docs.python.org/3/', None), 25 | 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), 26 | } 27 | intersphinx_disabled_domains = ['std'] 28 | 29 | templates_path = ['_templates'] 30 | myst_enable_extensions = [ 31 | "html_image" 32 | ] 33 | # -- Options for HTML output 34 | 35 | html_theme = 'sphinx_rtd_theme' 36 | 37 | # -- Options for EPUB output 38 | epub_show_urls = 'footnote' 39 | -------------------------------------------------------------------------------- /workflow/modules/qc/common.smk: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # Get utils. 
This is not great, but we can move to setup.py and install later if want 5 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve() 6 | if str(utils_path) not in sys.path: 7 | sys.path.append(str(utils_path)) 8 | 9 | import pandas as pd 10 | import snparcher_utils 11 | 12 | def get_coords_if_available(wildcards): 13 | if 'lat' in samples.columns and 'long' in samples.columns: 14 | return "results/{refGenome}/QC/{prefix}.coords.txt" 15 | return [] 16 | 17 | def check_contig_names(fai, touch_file): 18 | dffai = pd.read_table(fai, sep='\t', header = None) 19 | fai_result=pd.to_numeric(dffai[0], errors='coerce').notnull().all() 20 | if fai_result==True: 21 | print("QC plots not generated because contig names are numeric and plink does not accept numeric contig names") 22 | elif fai_result==False: 23 | with open(touch_file, "w") as writer: 24 | writer.write("contigs are strings") 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | pixi.lock 3 | pixi.toml 4 | data_preparation/ 5 | .snakemake* 6 | template_slurm.sh 7 | slurm_logs/* 8 | # indexing and creating a sequence dictionary gets done within pipeline 9 | data/zebraFinch/genome/*.sa 10 | data/zebraFinch/genome/*.pac 11 | data/zebraFinch/genome/*.bwt 12 | data/zebraFinch/genome/*.ann 13 | data/zebraFinch/genome/*.amb 14 | data/zebraFinch/genome/*.fai 15 | data/zebraFinch/genome/*.dict 16 | rules/.snakemake 17 | data/BHduck/genome/*.sa 18 | data/BHduck/genome/*.pac 19 | data/BHduck/genome/*.bwt 20 | data/BHduck/genome/*.ann 21 | data/BHduck/genome/*.amb 22 | data/BHduck/genome/*.fai 23 | data/BHduck/genome/*.dict 24 | /data/ 25 | fastp.* 26 | intervalFiles/ 27 | out 28 | err 29 | __pycache__ 30 | log/ 31 | fastq2bam/ 32 | intervalFiles/ 33 | freebayes/ 34 | gatk/ 35 | logs/ 36 | *_dryrun.txt 37 | results/ 38 | tmp/ 39 | .test/ecoli/benchmarks/ 40 | .test/ecoli/logs/ 41 | .test/ecoli/results/ 42 | .test/ecoli/data/ 43 | *.lic 44 | .test/ci/results 45 | .test/ci/benchmarks 46 | .test/ci/logs 47 | .vscode 48 | .test/trackhub/*.sizes 49 | .test/trackhub/out.log 50 | # pixi environments 51 | .pixi 52 | *.egg-info 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2021 Harvard Informatics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /workflow/modules/qc/scripts/contigs4admixture.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import shutil 3 | 4 | def generate_mapping(input_file, bim_file, output_file): 5 | 6 | conversion_dict = {} 7 | with open(input_file, 'r') as f: 8 | for line in f: 9 | line = line.strip().split() 10 | conversion_dict[line[0]] = line[1] 11 | 12 | # Copy original bim file to a new file with ".orig" appended to its name 13 | orig_bim_file = bim_file + ".orig" 14 | shutil.copyfile(bim_file, orig_bim_file) 15 | 16 | # read bim file and replace the scaffold names with numbering 1:n (n = number of scaffolds) 17 | updated_lines = [] 18 | with open(bim_file, 'r') as f: 19 | for line in f: 20 | elements = line.strip().split('\t') 21 | scaffold = elements[0] 22 | if scaffold in conversion_dict: 23 | elements[0] = conversion_dict[scaffold] 24 | updated_lines.append('\t'.join(elements)) 25 | 26 | with open(output_file, 'w') as f: 27 | for line in updated_lines: 28 | f.write(line + '\n') 29 | 30 | input_file = snakemake.input.fai 31 | bim_file = snakemake.input.bim 32 | output_file = snakemake.output.bim 33 | generate_mapping(input_file, bim_file, output_file) 34 | -------------------------------------------------------------------------------- /workflow/Snakefile: -------------------------------------------------------------------------------- 1 | from snakemake.utils import min_version 2 | min_version("7.0") 3 | 4 | configfile: "config/config.yaml" 5 | include: "rules/common.smk" 6 | include: "rules/sumstats.smk" 7 | include: "rules/fastq.smk" 8 | include: "rules/reference.smk" 9 | include: "rules/mappability.smk" 10 | 11 | setup_curlrc() 12 | onerror: cleanup_curlrc() 13 | onsuccess: cleanup_curlrc() 14 | 15 | 16 | if config['sentieon']: 17 | include: "rules/sentieon.smk" 18 | else: 19 | include: "rules/fastq2bam.smk" 20 | if config['intervals']: 21 | include: "rules/bam2vcf_gatk_intervals.smk" 22 | include: "rules/intervals.smk" 23 | else: 24 | include: "rules/bam2vcf_gatk.smk" 25 | 26 | if config['cov_filter']: 27 | include: "rules/cov_filter.smk" 28 | 29 | module qc: 30 | snakefile: 31 | "modules/qc/Snakefile" 32 | config: 33 | config 34 | 35 | use rule * from qc as qc_* 36 | 37 | module mk: 38 | snakefile: 39 | "modules/mk/Snakefile" 40 | config: 41 | config 42 | 43 | use rule * from mk as mk_* 44 | 45 | module postprocess: 46 | snakefile: 47 | "modules/postprocess/Snakefile" 48 | config: 49 | config 50 | 51 | use rule * from postprocess as postprocess_* 52 | 53 | module trackhub: 54 | snakefile: 55 | "modules/trackhub/Snakefile" 56 | config: 57 | config 58 | 59 | use rule * from trackhub as trackhub_* 60 | 61 | rule all: 62 | input: 63 | get_output() 64 | default_target: True 65 | -------------------------------------------------------------------------------- /workflow/modules/mk/common.smk: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import sys 4 | import os 5 | from pathlib import Path 6 | 7 | # Get utils. 
This is not great, but we can move to setup.py and install via pip later if want 8 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve() 9 | if str(utils_path) not in sys.path: 10 | sys.path.append(str(utils_path)) 11 | 12 | import pandas as pd 13 | import snparcher_utils 14 | 15 | samples = snparcher_utils.parse_sample_sheet(config) 16 | 17 | def get_ref(wildcards): 18 | if 'refPath' in samples.columns: 19 | _refs = samples.loc[(samples['refGenome'] == wildcards.refGenome)]['refPath'].dropna().unique().tolist() 20 | for ref in _refs: 21 | if not os.path.exists(ref): 22 | raise WorkflowError(f"Reference genome {ref} does not exist") 23 | elif ref.rsplit(".", 1)[1] == '.gz': 24 | raise WorkflowError(f"Reference genome {ref} must be unzipped first.") 25 | return _refs 26 | else: 27 | return [] 28 | 29 | def get_gff(wildcards): 30 | if 'refGFF' in samples.columns: 31 | _refs = samples.loc[(samples['refGenome'] == wildcards.refGenome)]['refGFF'].dropna().unique().tolist() 32 | for ref in _refs: 33 | if not os.path.exists(ref): 34 | raise WorkflowError(f"Reference gff {ref} does not exist") 35 | elif ref.rsplit(".", 1)[1] == '.gz': 36 | raise WorkflowError(f"Reference gff {ref} must be unzipped first.") 37 | return _refs 38 | else: 39 | return [] 40 | -------------------------------------------------------------------------------- /workflow/modules/qc/scripts/qc_dashboard_render.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | render_qcplots <- function(prefix, nClusters, GMKey){ 4 | #specify the snakemake pipeline working d to knit with 5 | workd <- getwd() 6 | output.path <- gsub(".idepth", "_qc.html", normalizePath(paste0(prefix, ".idepth"))) #generate full path of output - brute force because I had issues with relative paths 7 | 8 | script.in <- paste0(snakemake@scriptdir, "/qc_dashboard_interactive.Rmd") #get real path of dashboard script 9 | script.out <- gsub(".Rmd", ".html", paste0(snakemake@scriptdir, "/qc_dashboard_interactive.Rmd")) #get name of future html 10 | 11 | rmarkdown::render(script.in, #knit the markdown file to html 12 | params = list(prefix = prefix, nClusters = nClusters, GMKey = GMKey), #pass the path to the QC files that are plotted (via snakemake params) 13 | knit_root_dir = workd) #make sure to knit in the working directory of the snakemake run 14 | 15 | #move the default html output to the QC folder. This is an inconvenience of knitr, and 16 | #the output.file 17 | copy_successful <- file.copy(script.out, output.path) 18 | 19 | # Check if the copy was successful 20 | if (copy_successful) { 21 | # If the copy succeeded, delete the original file 22 | file.remove(script.out) 23 | } else { 24 | # If the copy failed, print an error message 25 | cat("snpArcher: Failed to move the qc dashboard html.\n") 26 | } 27 | } 28 | 29 | render_qcplots(snakemake@params[[1]], snakemake@params[[2]], snakemake@params[[3]]) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # snpArcher 2 | 3 | snpArcher logo 4 | 5 | 6 | snpArcher is a reproducible workflow optimized for nonmodel organisms and comparisons across datasets, built on the [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html#) workflow management system. It provides a streamlined approach to dataset acquisition, variant calling, quality control, and downstream analysis. 
7 | 8 | ### Usage 9 | For usage instructions and complete documentation, please visit our [docs](https://snparcher.readthedocs.io/en/latest/). 10 | 11 | ### Datasets generated by snpArcher 12 | A number of resequencing datasets have been run with snpArcher generating consistent variant calls, available via [Globus](https://www.globus.org/) in the [Comparative Population Genomics Data collection](https://app.globus.org/file-manager?origin_id=a6580c44-09fd-11ee-be16-195c41bc0be4&origin_path=%2F). Details of data processing are described [in our manuscript](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1). If you use any of these datasets in your projects, please cite both the [snpArcher paper](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1) and the original data producers. 13 | 14 | ### Citing snpArcher 15 | - Cade D Mirchandani, Allison J Shultz, Gregg W C Thomas, Sara J Smith, Mara Baylis, Brian Arnold, Russ Corbett-Detig, Erik Enbody, Timothy B Sackton, A fast, reproducible, high-throughput variant calling workflow for population genomics, Molecular Biology and Evolution, 2023;, msad270, https://doi.org/10.1093/molbev/msad270 16 | - Also, make sure to cite the tools you used within snpArcher. 17 | -------------------------------------------------------------------------------- /workflow/modules/trackhub/scripts/vcftools_out_to_bg.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | def chrom_dict(chrom_sizes_file): 4 | chroms = {} 5 | with open(chrom_sizes_file, "r") as f: 6 | for line in f: 7 | line = line.strip().split() 8 | chroms[line[0]] = int(line[1]) 9 | return chroms 10 | 11 | def parse_stat_file(stat_file, out_file, chrom_sizes): 12 | stat_file = Path(stat_file) 13 | file_type = stat_file.suffix 14 | window = int(stat_file.stem) 15 | 16 | with open(out_file, "w") as out: 17 | results = [] 18 | with open(stat_file, "r") as inp: 19 | next(inp) 20 | for line in inp: 21 | 22 | line = line.strip().split() 23 | chrom = line[0] 24 | if chrom not in chrom_sizes: 25 | 26 | continue 27 | else: 28 | start = int(line[1]) 29 | end = start + (window-1) 30 | if end >= chrom_sizes[chrom]: 31 | end = chrom_sizes[chrom]-1 32 | 33 | if file_type == ".Tajima": 34 | value = line[3] 35 | elif file_type == ".SNP-Density": 36 | value = line[2] 37 | elif file_type == ".Pi": 38 | value = line[4] 39 | else: 40 | raise(ValueError(f"Unknown file type: {file_type}")) 41 | 42 | results.append((chrom,start,end,value)) 43 | 44 | sorted_results = sorted(results, key=lambda x: (x[0], x[1])) 45 | 46 | for chrom, start, end, value in sorted_results: 47 | print(f"{chrom}\t{start}\t{end}\t{value}\n", file=out) 48 | def main(): 49 | chrom_sizes = chrom_dict(snakemake.input["chrom_sizes"]) 50 | parse_stat_file(stat_file=snakemake.input["stat_file"], out_file=snakemake.output[0], chrom_sizes=chrom_sizes) 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /workflow/rules/fastq2bam.smk: -------------------------------------------------------------------------------- 1 | rule bwa_map: 2 | input: 3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 4 | r1 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_1.fastq.gz", 5 | r2 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_2.fastq.gz", 6 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 7 | output: 8 | bam = 
temp("results/{refGenome}/bams/preMerge/{sample}/{run}.bam"), 9 | bai = temp("results/{refGenome}/bams/preMerge/{sample}/{run}.bam.bai"), 10 | params: 11 | rg = get_read_group 12 | conda: 13 | "../envs/fastq2bam.yml" 14 | log: 15 | "logs/{refGenome}/bwa_mem/{sample}/{run}.txt" 16 | benchmark: 17 | "benchmarks/{refGenome}/bwa_mem/{sample}_{run}.txt" 18 | shell: 19 | "bwa mem -M -t {threads} -R {params.rg} {input.ref} {input.r1} {input.r2} 2> {log} | samtools sort -o {output.bam} - && samtools index {output.bam} {output.bai}" 20 | 21 | rule merge_bams: 22 | input: 23 | merge_bams_input 24 | output: 25 | bam = temp("results/{refGenome}/bams/postMerge/{sample}.bam"), 26 | bai = temp("results/{refGenome}/bams/postMerge/{sample}.bam.bai") 27 | conda: 28 | "../envs/fastq2bam.yml" 29 | log: 30 | "logs/{refGenome}/merge_bams/{sample}.txt" 31 | benchmark: 32 | "benchmarks/{refGenome}/merge_bams/{sample}.txt" 33 | shell: 34 | "samtools merge {output.bam} {input} && samtools index {output.bam} > {log}" 35 | 36 | rule dedup: 37 | input: 38 | unpack(dedup_input) 39 | output: 40 | dedupBam = "results/{refGenome}/bams/{sample}_final.bam", 41 | dedupBai = "results/{refGenome}/bams/{sample}_final.bam.bai", 42 | conda: 43 | "../envs/sambamba.yml" 44 | log: 45 | "logs/{refGenome}/sambamba_dedup/{sample}.txt" 46 | benchmark: 47 | "benchmarks/{refGenome}/sambamba_dedup/{sample}.txt" 48 | shell: 49 | "sambamba markdup -t {threads} {input.bam} {output.dedupBam} 2> {log}" -------------------------------------------------------------------------------- /workflow/rules/mappability.smk: -------------------------------------------------------------------------------- 1 | rule genmap: 2 | input: 3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 4 | output: 5 | bg = temp("results/{refGenome}/genmap/{refGenome}.genmap.bedgraph"), 6 | sorted_bg = "results/{refGenome}/genmap/sorted_mappability.bg" 7 | params: 8 | indir = os.path.join(DEFAULT_STORAGE_PREFIX, "results/{refGenome}/genmap_index"), 9 | outdir = os.path.join(DEFAULT_STORAGE_PREFIX, "results/{refGenome}/genmap"), 10 | kmer = config['mappability_k'] 11 | log: 12 | "logs/{refGenome}/genmap/log.txt" 13 | benchmark: 14 | "benchmarks/{refGenome}/genmap/benchmark.txt" 15 | conda: 16 | "../envs/mappability.yml" 17 | shell: 18 | # snakemake creates the output directory before the shell command, but genmap doesnt like this. so we remove the directory first. 
19 | """ 20 | rm -rf {params.indir} && genmap index -F {input.ref} -I {params.indir} &> {log} 21 | genmap map -K {params.kmer} -E 0 -I {params.indir} -O {params.outdir} -bg -T {threads} -v &> {log} 22 | sort -k1,1 -k2,2n {output.bg} > {output.sorted_bg} 2>> {log} 23 | """ 24 | 25 | rule mappability_bed: 26 | input: 27 | map = "results/{refGenome}/genmap/sorted_mappability.bg" 28 | output: 29 | callable_sites = "results/{refGenome}/callable_sites/{prefix}_callable_sites_map.bed" if config['cov_filter'] else "results/{refGenome}/{prefix}_callable_sites.bed", 30 | tmp_map = temp("results/{refGenome}/callable_sites/{prefix}_temp_map.bed") 31 | conda: 32 | "../envs/mappability.yml" 33 | benchmark: 34 | "benchmarks/{refGenome}/mapbed/{prefix}_benchmark.txt" 35 | params: 36 | merge = config['mappability_merge'], 37 | mappability = config['mappability_min'] 38 | shell: 39 | """ 40 | awk 'BEGIN{{OFS="\\t";FS="\\t"}} {{ if($4>={params.mappability}) print $1,$2,$3 }}' {input.map} > {output.tmp_map} 41 | bedtools sort -i {output.tmp_map} | bedtools merge -d {params.merge} -i - > {output.callable_sites} 42 | """ -------------------------------------------------------------------------------- /workflow/scripts/make_intervals.py: -------------------------------------------------------------------------------- 1 | from re import I 2 | from snakemake.exceptions import WorkflowError 3 | import os 4 | 5 | """ 6 | Reads output file from ScatterIntervalsByNs and puts intervals into (relatively) specified num of equal groups. 7 | Writes intervals groups to individual files for use by HaplotypeCaller 8 | """ 9 | 10 | 11 | def make_intervals( 12 | in_file: str, num_intervals: int, output_dir: str, int_output_file: str 13 | ) -> None: 14 | 15 | intervals = [] 16 | 17 | with open(in_file, "r") as f: 18 | for line in f: 19 | if not line.startswith("@"): 20 | line = line.strip().split() 21 | chrom, start, end, = ( 22 | line[0], 23 | int(line[1]), 24 | int(line[2]), 25 | ) 26 | size = end - start 27 | intervals.append((chrom, start, end, size)) 28 | 29 | if num_intervals > len(intervals): 30 | num_intervals = len(intervals) 31 | 32 | groups = [[] for i in range(num_intervals)] 33 | sums = {i: 0 for i in range(num_intervals)} 34 | c = 0 35 | for chrom, start, end, size in sorted(intervals, key=lambda x: x[3]): 36 | for i in sums: 37 | if c == sums[i]: 38 | groups[i].append((chrom, start, end)) 39 | break 40 | sums[i] += size 41 | c = min(sums.values()) 42 | 43 | if not os.path.exists(output_dir): 44 | os.mkdir(output_dir) 45 | 46 | with open(int_output_file, "w") as out: 47 | for i, group in enumerate(groups): 48 | file = os.path.join(output_dir, f"{i}.list") 49 | with open(file, "w") as f: 50 | for chrom, start, end in group: 51 | print(f"{chrom}:{start}-{end}", file=f) 52 | print(f"{chrom}:{start}-{end}", file=out) 53 | 54 | 55 | def main(): 56 | make_intervals( 57 | snakemake.input["in_file"], 58 | snakemake.params["max_intervals"], 59 | snakemake.output["out_dir"], 60 | snakemake.output["intervals"], 61 | ) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /workflow-profiles/default/config.yaml: -------------------------------------------------------------------------------- 1 | use-conda: True 2 | 3 | # These resources will be applied to all rules. Can be overriden on a per-rule basis below. 
4 | default-resources: 5 | mem_mb: attempt * 16000 6 | mem_mb_reduced: (attempt * 16000) * 0.9 # Mem allocated to java for GATK rules (tries to prevent OOM errors) 7 | # Uncomment and edit following options for slurm execution: 8 | # slurm_partition: "" 9 | # slurm_account: # Same as sbatch -A. Not all clusters use this. 10 | # runtime: 720 # In minutes 11 | 12 | # Control number of threads each rule will use. 13 | set-threads: 14 | # Mappability 15 | genmap: 1 16 | 17 | # Fastq Processing 18 | get_fastq_pe: 6 19 | fastp: 6 20 | # Alignment 21 | bwa_map: 16 22 | dedup: 16 23 | 24 | # GVCF 25 | bam2gvcf: 1 # Does not benefit from more than 2 threads 26 | gvcf2DB: 1 # Does not benefit from more than 2 threads 27 | 28 | # VCF 29 | DB2vcf: 1 # Does not benefit from more than 2 threads 30 | filterVcfs: 1 # Does not benefit from more than 2 threads 31 | sort_gatherVcfs: 1 # Does not benefit from more than 2 threads 32 | 33 | # Callable Bed 34 | compute_d4: 6 35 | clam_loci: 6 36 | 37 | # Sentieon Tools 38 | sentieon_map: 16 39 | sentieon_dedup: 16 40 | sentieon_haplotyper: 32 41 | sentieon_combine_gvcf: 32 42 | # Control/overwrite resources per rule. 43 | # To use this feature, uncomment "set-resources:" below and add rules you want to customize. 44 | # Examples: 45 | # 46 | # set-resources: 47 | # # Example 1: Increase memory for bam2gvcf rule 48 | # bam2gvcf: 49 | # mem_mb: attempt * 64000 # Customize memory allocation 50 | # mem_mb_reduced: (attempt * 64000) * 0.9 # Customize Java memory allocation 51 | # 52 | # # Example 2: Set slurm parameters for a resource-intensive rule 53 | # sentieon_haplotyper: 54 | # mem_mb: attempt * 8000 55 | # mem_mb_reduced: (attempt * 8000) * 0.9 56 | # slurm_partition: high-mem 57 | # runtime: "24:00:00" 58 | # cpus_per_task: 32 59 | # 60 | # To customize a rule, copy one of the example blocks above, paste it under "set-resources:", 61 | # replace the rule name with your target rule, and adjust the resource parameters as needed. 
62 | -------------------------------------------------------------------------------- /.test/qc/config/resources.yaml: -------------------------------------------------------------------------------- 1 | ### 2 | # fastq2bam workflow 3 | ## 4 | 5 | # fastq download 6 | get_fastq_pe: 7 | threads: 10 8 | mem: 5000 9 | # compress fastq 10 | gzip_fastq: 11 | mem: 4000 12 | # fastp program 13 | fastp: 14 | threads: 10 15 | mem: 5000 16 | # index reference 17 | index_ref: 18 | mem: 10000 19 | # genmap map 20 | genmap: 21 | threads: 10 22 | mem: 20000 23 | genmap_sort: 24 | mem: 4000 25 | # bwa mapping 26 | bwa_map: 27 | threads: 31 28 | mem: 15000 29 | # sort bam with picard's SortSam tool 30 | sort_bam: 31 | mem: 25000 32 | # mark duplicates with picard's MarkDuplicates 33 | dedup: 34 | threads: 31 35 | mem: 9000 36 | # calculate BAM summaries with samtools and picard 37 | bam_sumstats: 38 | mem: 9000 39 | merge_bams: 40 | mem: 9000 41 | bedtools: 42 | mem: 4000 43 | 44 | ### 45 | # Intervals workflow 46 | ### 47 | 48 | # preprocess genome, create intervals 49 | # picard's create CreateSequenceDictionary, samtools faidx 50 | process_ref: 51 | mem: 15000 52 | # custom python algo to create intervals 53 | create_db_intervals: 54 | mem: 5000 55 | 56 | create_gvcf_intervals: 57 | mem: 5000 58 | 59 | ## Callable sites workflow 60 | 61 | # genmap map 62 | genmap: 63 | threads: 10 64 | mem: 20000 65 | genmap_sort: 66 | mem: 4000 67 | compute_d4: 68 | mem: 4000 69 | threads: 4 70 | merge_d4: 71 | mem: 10000 72 | callable_bed: 73 | mem: 10000 74 | 75 | ### 76 | # bam2vcf workflows 77 | ### 78 | 79 | # gatk HaplotypeCaller 80 | bam2gvcf: 81 | mem: 30000 82 | # gatk GenomicsDBImport 83 | gvcf2DB: 84 | mem: 30000 85 | # gatk GenotypeGVCFs 86 | DB2vcf: 87 | mem: 30000 88 | ## freebayes program only! 
## 89 | bam2vcf: 90 | mem: 30000 91 | # gatk filterVcfs 92 | filterVcfs: 93 | mem: 30000 94 | # gatk GatherVcfs 95 | gatherVcfs: 96 | mem: 30000 97 | # picard SortVcf 98 | sortVcf: 99 | mem: 30000 100 | # vcftools program 101 | vcftools: 102 | mem: 30000 103 | # bedtools program 104 | bedtools: 105 | mem: 30000 106 | # plink 107 | plink: 108 | threads: 5 109 | admixture: 110 | mem: 4000 111 | -------------------------------------------------------------------------------- /.test/postprocess/config/resources.yaml: -------------------------------------------------------------------------------- 1 | ### 2 | # fastq2bam workflow 3 | ## 4 | 5 | # fastq download 6 | get_fastq_pe: 7 | threads: 10 8 | mem: 5000 9 | # compress fastq 10 | gzip_fastq: 11 | mem: 4000 12 | # fastp program 13 | fastp: 14 | threads: 10 15 | mem: 5000 16 | # index reference 17 | index_ref: 18 | mem: 10000 19 | # genmap map 20 | genmap: 21 | threads: 10 22 | mem: 20000 23 | genmap_sort: 24 | mem: 4000 25 | # bwa mapping 26 | bwa_map: 27 | threads: 31 28 | mem: 15000 29 | # sort bam with picard's SortSam tool 30 | sort_bam: 31 | mem: 25000 32 | # mark duplicates with picard's MarkDuplicates 33 | dedup: 34 | threads: 31 35 | mem: 9000 36 | # calculate BAM summaries with samtools and picard 37 | bam_sumstats: 38 | mem: 9000 39 | merge_bams: 40 | mem: 9000 41 | bedtools: 42 | mem: 4000 43 | 44 | ### 45 | # Intervals workflow 46 | ### 47 | 48 | # preprocess genome, create intervals 49 | # picard's create CreateSequenceDictionary, samtools faidx 50 | process_ref: 51 | mem: 15000 52 | # custom python algo to create intervals 53 | create_db_intervals: 54 | mem: 5000 55 | 56 | create_gvcf_intervals: 57 | mem: 5000 58 | 59 | ## Callable sites workflow 60 | 61 | # genmap map 62 | genmap: 63 | threads: 10 64 | mem: 20000 65 | genmap_sort: 66 | mem: 4000 67 | compute_d4: 68 | mem: 4000 69 | threads: 4 70 | merge_d4: 71 | mem: 10000 72 | callable_bed: 73 | mem: 10000 74 | 75 | ### 76 | # bam2vcf workflows 77 | ### 78 | 79 | # gatk HaplotypeCaller 80 | bam2gvcf: 81 | mem: 30000 82 | # gatk GenomicsDBImport 83 | gvcf2DB: 84 | mem: 30000 85 | # gatk GenotypeGVCFs 86 | DB2vcf: 87 | mem: 30000 88 | ## freebayes program only! 
## 89 | bam2vcf: 90 | mem: 30000 91 | # gatk filterVcfs 92 | filterVcfs: 93 | mem: 30000 94 | # gatk GatherVcfs 95 | gatherVcfs: 96 | mem: 30000 97 | # picard SortVcf 98 | sortVcf: 99 | mem: 30000 100 | # vcftools program 101 | vcftools: 102 | mem: 30000 103 | # bedtools program 104 | bedtools: 105 | mem: 30000 106 | # plink 107 | plink: 108 | threads: 5 109 | admixture: 110 | mem: 4000 111 | -------------------------------------------------------------------------------- /.test/trackhub/config/resources.yaml: -------------------------------------------------------------------------------- 1 | ### 2 | # fastq2bam workflow 3 | ## 4 | 5 | # fastq download 6 | get_fastq_pe: 7 | threads: 10 8 | mem: 5000 9 | # compress fastq 10 | gzip_fastq: 11 | mem: 4000 12 | # fastp program 13 | fastp: 14 | threads: 10 15 | mem: 5000 16 | # index reference 17 | index_ref: 18 | mem: 10000 19 | # genmap map 20 | genmap: 21 | threads: 10 22 | mem: 20000 23 | genmap_sort: 24 | mem: 4000 25 | # bwa mapping 26 | bwa_map: 27 | threads: 31 28 | mem: 15000 29 | # sort bam with picard's SortSam tool 30 | sort_bam: 31 | mem: 25000 32 | # mark duplicates with picard's MarkDuplicates 33 | dedup: 34 | threads: 31 35 | mem: 9000 36 | # calculate BAM summaries with samtools and picard 37 | bam_sumstats: 38 | mem: 9000 39 | merge_bams: 40 | mem: 9000 41 | bedtools: 42 | mem: 4000 43 | 44 | ### 45 | # Intervals workflow 46 | ### 47 | 48 | # preprocess genome, create intervals 49 | # picard's create CreateSequenceDictionary, samtools faidx 50 | process_ref: 51 | mem: 15000 52 | # custom python algo to create intervals 53 | create_db_intervals: 54 | mem: 5000 55 | 56 | create_gvcf_intervals: 57 | mem: 5000 58 | 59 | ## Callable sites workflow 60 | 61 | # genmap map 62 | genmap: 63 | threads: 10 64 | mem: 20000 65 | genmap_sort: 66 | mem: 4000 67 | compute_d4: 68 | mem: 4000 69 | threads: 4 70 | merge_d4: 71 | mem: 10000 72 | callable_bed: 73 | mem: 10000 74 | 75 | ### 76 | # bam2vcf workflows 77 | ### 78 | 79 | # gatk HaplotypeCaller 80 | bam2gvcf: 81 | mem: 30000 82 | # gatk GenomicsDBImport 83 | gvcf2DB: 84 | mem: 30000 85 | # gatk GenotypeGVCFs 86 | DB2vcf: 87 | mem: 30000 88 | ## freebayes program only! 
## 89 | bam2vcf: 90 | mem: 30000 91 | # gatk filterVcfs 92 | filterVcfs: 93 | mem: 30000 94 | # gatk GatherVcfs 95 | gatherVcfs: 96 | mem: 30000 97 | # picard SortVcf 98 | sortVcf: 99 | mem: 30000 100 | # vcftools program 101 | vcftools: 102 | mem: 30000 103 | # bedtools program 104 | bedtools: 105 | mem: 30000 106 | # plink 107 | plink: 108 | threads: 5 109 | admixture: 110 | mem: 4000 111 | -------------------------------------------------------------------------------- /.test/ci/config/resources.yaml: -------------------------------------------------------------------------------- 1 | ### 2 | # fastq2bam rules 3 | ## 4 | 5 | # fastq download 6 | get_fastq_pe: 7 | threads: 8 8 | mem: 4000 9 | 10 | # index reference 11 | index_ref: 12 | mem: 10000 13 | 14 | # fastp program 15 | fastp: 16 | threads: 8 17 | mem: 4000 18 | 19 | # bwa mapping 20 | bwa_map: 21 | threads: 31 22 | mem: 15000 23 | # sort bam with picard's SortSam tool 24 | sort_bam: 25 | threads: 4 26 | mem_per_thread: 1000 27 | 28 | #merge bams 29 | merge_bams: 30 | mem: 9000 31 | threads: 2 32 | 33 | # mark duplicates with picard's MarkDuplicates 34 | dedup: 35 | threads: 31 36 | mem: 9000 37 | # calculate BAM summaries with samtools and picard 38 | bam_sumstats: 39 | mem: 9000 40 | merge_bams: 41 | mem: 9000 42 | bedtools: 43 | mem: 4000 44 | 45 | # Sentieon tools 46 | sentieon_map: 47 | machine_type: "n2d-standard-32" 48 | threads: 31 49 | mem: 15000 50 | 51 | sentieon_dedup: 52 | machine_type: "n2d-standard-32" 53 | threads: 31 54 | mem: 15000 55 | 56 | sentieon_haplotyper: 57 | machine_type: "n2d-standard-32" 58 | threads: 31 59 | mem: 15000 60 | 61 | sentieon_combine_gvcf: 62 | machine_type: "n2d-standard-32" 63 | threads: 31 64 | mem: 15000 65 | disk_mb: 2000000 66 | 67 | ### 68 | # Intervals workflow 69 | ### 70 | 71 | # preprocess genome, create intervals 72 | # picard's create CreateSequenceDictionary, samtools faidx 73 | process_ref: 74 | mem: 15000 75 | # custom python algo to create intervals 76 | create_db_intervals: 77 | mem: 5000 78 | 79 | create_gvcf_intervals: 80 | mem: 5000 81 | 82 | ## Callable sites workflow 83 | 84 | # genmap map 85 | genmap: 86 | threads: 10 87 | mem: 20000 88 | genmap_sort: 89 | mem: 4000 90 | compute_d4: 91 | mem: 4000 92 | threads: 4 93 | merge_d4: 94 | mem: 10000 95 | callable_bed: 96 | mem: 10000 97 | 98 | ### 99 | # bam2vcf workflows 100 | ### 101 | 102 | # gatk HaplotypeCaller 103 | bam2gvcf: 104 | mem: 4000 105 | # gatk GenomicsDBImport 106 | gvcf2DB: 107 | mem: 4000 108 | # gatk GenotypeGVCFs 109 | DB2vcf: 110 | mem: 4000 111 | # gatk filterVcfs 112 | filterVcfs: 113 | mem: 4000 114 | # gatk GatherVcfs 115 | gatherVcfs: 116 | mem: 4000 117 | # vcftools program 118 | vcftools: 119 | mem: 8000 120 | # plink 121 | plink: 122 | threads: 5 123 | admixture: 124 | mem: 4000 125 | -------------------------------------------------------------------------------- /workflow/modules/trackhub/html/hub_description.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | snpArcher Track Hub Description 5 | 30 | 31 | 32 |

snpArcher Track Hub Description

Introduction

To facilitate downstream data exploration and as an example of the module development components of this work, we developed a module to generate UCSC Genome Browser track files to explore population variation data (see preprint for details).

Track Descriptions

Tajima’s D

This track provides windowed estimates of Tajima’s D, a population genetic statistic that measures the departure from neutral evolution in a DNA sequence.

SNP Density

This track displays the density of single nucleotide polymorphisms (SNPs) across the genome, showing regions with high or low levels of genetic variation.

Pi

The Pi track represents the average number of nucleotide differences per site between any two sequences in a population, providing an estimate of genetic diversity.

Minor Allele Frequency

This track shows the frequency of the less common allele at a SNP locus, providing insights into the genetic variation within a population.

SNP Depth

The SNP Depth track displays the number of reads or sequencing depth at each SNP position, indicating the coverage and quality of the variant calls.

Non Callable Sites

The Non Callable Sites track highlights regions in the genome that are considered non-callable, meaning that they have low sequencing coverage or other technical limitations that make it difficult to accurately determine genetic variation in those regions.
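The diversity tracks above (Pi, SNP Density, Tajima’s D) summarize standard population-genetic quantities. As general background only (these are textbook definitions in conventional notation, not formulas taken from the snpArcher track hub page or its code), the usual per-window estimators are:

```latex
% Background definitions (standard notation; not taken from snpArcher itself).
% n: sequences in the window, d_{ij}: nucleotide differences between sequences i and j,
% S: segregating sites (SNPs) in the window, a_n = \sum_{k=1}^{n-1} 1/k.
\begin{align*}
\pi      &= \binom{n}{2}^{-1} \sum_{i<j} d_{ij} && \text{mean pairwise differences; divide by window length for the per-site value in the Pi track}\\
\theta_W &= S / a_n && \text{Watterson's estimator}\\
D        &= \frac{\pi - \theta_W}{\sqrt{\widehat{\mathrm{Var}}\left(\pi - \theta_W\right)}} && \text{Tajima's D}
\end{align*}
```

Negative D values indicate an excess of rare variants relative to neutral expectations (as after a population expansion or selective sweep), while positive values indicate an excess of intermediate-frequency variants.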

58 | 59 | 60 | -------------------------------------------------------------------------------- /.test/ecoli/config/resources.yaml: -------------------------------------------------------------------------------- 1 | ######################################## 2 | ## RESOURCES ## 3 | ######################################## 4 | 5 | # fastq download 6 | get_fastq_pe: 7 | threads: 1 8 | mem: 4000 9 | # compress fastq 10 | gzip_fastq: 11 | mem: 4000 12 | # fastp program 13 | fastp: 14 | threads: 1 15 | mem: 4000 16 | # index reference 17 | index_ref: 18 | mem: 4000 19 | # genmap map 20 | genmap: 21 | threads: 1 22 | mem: 4000 23 | genmap_sort: 24 | mem: 4000 25 | # bwa mapping 26 | bwa_map: 27 | threads: 1 28 | mem: 4000 29 | # sort bam with picard's SortSam tool 30 | sort_bam: 31 | mem: 4000 32 | # mark duplicates with picard's MarkDuplicates 33 | dedup: 34 | threads: 1 35 | mem: 4000 36 | # calculate BAM summaries with samtools and picard 37 | bam_sumstats: 38 | mem: 4000 39 | merge_bams: 40 | mem: 4000 41 | bedtools: 42 | mem: 4000 43 | 44 | ### 45 | # Intervals workflow 46 | ### 47 | 48 | # preprocess genome, create intervals 49 | # picard's create CreateSequenceDictionary, samtools faidx 50 | process_ref: 51 | mem: 4000 52 | # custom python algo to create intervals 53 | create_db_intervals: 54 | mem: 5000 55 | 56 | create_gvcf_intervals: 57 | mem: 5000 58 | 59 | ## Callable sites workflow 60 | 61 | # genmap map 62 | genmap: 63 | threads: 1 64 | mem: 4000 65 | genmap_sort: 66 | mem: 4000 67 | compute_d4: 68 | mem: 4000 69 | threads: 1 70 | merge_d4: 71 | mem: 4000 72 | callable_bed: 73 | mem: 4000 74 | 75 | 76 | ## Callable sites workflow 77 | 78 | # genmap map 79 | genmap: 80 | threads: 10 81 | mem: 10000 82 | genmap_sort: 83 | mem: 4000 84 | compute_d4: 85 | mem: 4000 86 | threads: 4 87 | merge_d4: 88 | mem: 10000 89 | callable_bed: 90 | mem: 10000 91 | 92 | ### 93 | # bam2vcf workflows 94 | ### 95 | 96 | # gatk HaplotypeCaller 97 | bam2gvcf: 98 | mem: 4000 99 | # gatk GenomicsDBImport 100 | gvcf2DB: 101 | mem: 4000 102 | # gatk GenotypeGVCFs 103 | DB2vcf: 104 | mem: 4000 105 | ## freebayes program only! ## 106 | bam2vcf: 107 | mem: 4000 108 | # gatk filterVcfs 109 | filterVcfs: 110 | mem: 4000 111 | # gatk GatherVcfs 112 | gatherVcfs: 113 | mem: 4000 114 | # picard SortVcf 115 | sortVcf: 116 | mem: 4000 117 | # vcftools program 118 | vcftools: 119 | mem: 4000 120 | # bedtools program 121 | bedtools: 122 | mem: 4000 123 | # plink 124 | plink: 125 | threads: 1 126 | admixture: 127 | mem: 4000 128 | -------------------------------------------------------------------------------- /workflow/rules/fastq.smk: -------------------------------------------------------------------------------- 1 | rule get_fastq_pe: 2 | output: 3 | temp("results/data/fastq/{refGenome}/{sample}/{run}_1.fastq.gz"), 4 | temp("results/data/fastq/{refGenome}/{sample}/{run}_2.fastq.gz") 5 | params: 6 | outdir = os.path.join(DEFAULT_STORAGE_PREFIX, "results/data/fastq/{refGenome}/{sample}/") 7 | conda: 8 | "../envs/fastq2bam.yml" 9 | benchmark: 10 | "benchmarks/{refGenome}/getfastq/{sample}_{run}.txt" 11 | resources: 12 | tmpdir = get_big_temp 13 | shell: 14 | """ 15 | set +e 16 | #delete existing prefetch file in case of previous run failure 17 | rm -rf {wildcards.run} 18 | ##attempt to get SRA file from NCBI (prefetch) or ENA (wget) 19 | prefetch --max-size 1T {wildcards.run} 20 | prefetchExit=$? 
21 | if [[ $prefetchExit -ne 0 ]] 22 | then 23 | ffq --ftp {wildcards.run} | grep -Eo '"url": "[^"]*"' | grep -o '"[^"]*"$' | grep "fastq" | xargs curl --remote-name-all --output-dir {params.outdir} 24 | else 25 | fasterq-dump {wildcards.run} -O {params.outdir} -e {threads} -t {resources.tmpdir} 26 | pigz -p {threads} {params.outdir}{wildcards.run}*.fastq 27 | fi 28 | rm -rf {wildcards.run} 29 | """ 30 | 31 | rule sort_reads: 32 | input: 33 | unpack(get_reads) 34 | output: 35 | r1 = temp("results/{refGenome}/sorted_reads/{sample}/{run}_1.fastq.gz"), 36 | r2 = temp("results/{refGenome}/sorted_reads/{sample}/{run}_2.fastq.gz"), 37 | conda: 38 | "../envs/fastq2bam.yml" 39 | log: 40 | "logs/{refGenome}/sort_reads/{sample}/{run}.txt" 41 | benchmark: 42 | "benchmarks/{refGenome}/sort_reads/{sample}_{run}.txt" 43 | shell: 44 | """ 45 | sortbyname.sh in={input.r1} out={output.r1} &> {log} 46 | sortbyname.sh in={input.r2} out={output.r2} &>> {log} 47 | """ 48 | 49 | rule fastp: 50 | input: 51 | unpack(get_reads_fastp) 52 | output: 53 | r1 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_1.fastq.gz", 54 | r2 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_2.fastq.gz", 55 | summ = "results/{refGenome}/summary_stats/{sample}/{run}.fastp.out" 56 | conda: 57 | "../envs/fastq2bam.yml" 58 | log: 59 | "logs/{refGenome}/fastp/{sample}/{run}.txt" 60 | benchmark: 61 | "benchmarks/{refGenome}/fastp/{sample}_{run}.txt" 62 | shell: 63 | """ 64 | fastp --in1 {input.r1} --in2 {input.r2} \ 65 | --out1 {output.r1} --out2 {output.r2} \ 66 | --thread {threads} \ 67 | --detect_adapter_for_pe \ 68 | -j {output.summ} -h /dev/null \ 69 | &>{log} 70 | """ -------------------------------------------------------------------------------- /workflow/rules/reference.smk: -------------------------------------------------------------------------------- 1 | ruleorder: download_reference > index_reference 2 | localrules: download_reference 3 | 4 | # This does not work with SLURM as of 4/3/24. See here for more info:https://github.com/snakemake/snakemake-executor-plugin-slurm/issues/60 5 | # rule copy_reference: 6 | # """Copies user-specified reference genome path to results dir to maintain refGenome wildcard""" 7 | # input: 8 | # ref = get_ref 9 | # output: 10 | # ref = "results/{refGenome}/data/genome/{refGenome}.fna" 11 | # log: 12 | # "logs/{refGenome}/copy_ref/log.txt" 13 | # shell: 14 | # #probably don't need to unzip but might as well. 
15 | # """ 16 | # gunzip -c {input.ref} 2> {log} > {output.ref} || cp {input.ref} {output.ref} &> {log} 17 | # """ 18 | 19 | rule download_reference: 20 | input: 21 | ref = get_ref 22 | output: 23 | ref = "results/{refGenome}/data/genome/{refGenome}.fna" 24 | params: 25 | dataset = "results/{refGenome}/data/genome/{refGenome}_dataset.zip", 26 | outdir = "results/{refGenome}/data/genome/{refGenome}" 27 | conda: 28 | "../envs/fastq2bam.yml" 29 | log: 30 | "logs/{refGenome}/download_ref/log.txt" 31 | benchmark: 32 | "benchmarks/{refGenome}/download_ref/benchmark.txt" 33 | shell: 34 | """ 35 | if [ -z "{input.ref}" ] # check if this is empty 36 | then 37 | mkdir -p {params.outdir} 38 | datasets download genome accession {wildcards.refGenome} --include genome --filename {params.dataset} \ 39 | && (7z x {params.dataset} -aoa -o{params.outdir} || unzip -o {params.dataset} -d {params.outdir}) \ 40 | && cat {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/*.fna > {output.ref} 41 | else 42 | gunzip -c {input.ref} 2> {log} > {output.ref} || cp {input.ref} {output.ref} &> {log} 43 | fi 44 | """ 45 | 46 | rule index_reference: 47 | input: 48 | ref = "results/{refGenome}/data/genome/{refGenome}.fna" 49 | output: 50 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb"]), 51 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 52 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict" 53 | conda: 54 | "../envs/fastq2bam.yml" 55 | log: 56 | "logs/{refGenome}/index_ref/log.txt" 57 | benchmark: 58 | "benchmarks/{refGenome}/index_ref/benchmark.txt" 59 | shell: 60 | """ 61 | bwa index {input.ref} 2> {log} 62 | samtools faidx {input.ref} --output {output.fai} >> {log} 63 | samtools dict {input.ref} -o {output.dictf} >> {log} 2>&1 64 | """ 65 | -------------------------------------------------------------------------------- /workflow/scripts/create_coverage_thresholds.py: -------------------------------------------------------------------------------- 1 | from snakemake.script import snakemake 2 | from snakemake.exceptions import WorkflowError 3 | import math 4 | 5 | # read chrom coverage values and compute min/max 6 | cov_thresh = {} 7 | stdv_scale = snakemake.params["cov_threshold_stdev"] 8 | rel_scale = snakemake.params["cov_threshold_rel"] 9 | mean_lower = snakemake.params["cov_threshold_lower"] 10 | mean_upper = snakemake.params["cov_threshold_upper"] 11 | 12 | # check that correct settings are set 13 | 14 | if stdv_scale: 15 | if rel_scale: 16 | raise WorkflowError( 17 | "Both cov_threshold_stdev and cov_threshold_rel are set, please choose one and make sure the other variable is empty in the config file." 18 | ) 19 | elif mean_lower or mean_upper: 20 | raise WorkflowError( 21 | "Both cov_threshold_stdev and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file." 22 | ) 23 | elif rel_scale: 24 | if mean_lower or mean_upper: 25 | raise WorkflowError( 26 | "Both cov_threshold_rel and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file." 27 | ) 28 | elif mean_lower: 29 | if not mean_upper: 30 | mean_upper = 50000 31 | elif mean_upper: 32 | if not mean_lower: 33 | mean_lower = 1 34 | else: 35 | raise WorkflowError( 36 | "Use coverage filter is True, but you did not specify coverage filtering options in the config. Please check." 
37 | ) 38 | 39 | with open(snakemake.input["stats"]) as stats: 40 | for line in stats: 41 | if "mean" in line: 42 | continue 43 | 44 | fields = line.split() 45 | mean = float(fields[1]) 46 | stdev = math.sqrt(mean) 47 | # 0 is chr, 1 is mean 48 | if stdv_scale: 49 | cov_thresh[fields[0]] = { 50 | "low": mean - (stdev * float(stdv_scale)), 51 | "high": mean + (stdev * float(stdv_scale)), 52 | } 53 | elif rel_scale: 54 | cov_thresh[fields[0]] = { 55 | "low": mean / float(rel_scale), 56 | "high": mean * float(rel_scale), 57 | } 58 | else: 59 | cov_thresh[fields[0]] = { 60 | "low": float(mean_lower), 61 | "high": float(mean_upper), 62 | } 63 | 64 | # Write the thresholds to a TSV file 65 | with open(snakemake.output[0], "w") as output_file: 66 | output_file.write("chrom\tmin\tmax\n") # Header line, if needed 67 | for chrom, thresholds in cov_thresh.items(): 68 | if chrom == "total": 69 | continue 70 | output_file.write(f"{chrom}\t{thresholds['low']}\t{thresholds['high']}\n") 71 | -------------------------------------------------------------------------------- /workflow/snparcher_utils/__init__.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | 4 | 5 | try: 6 | # Snakemake 8.x.x 7 | from snakemake_interface_common.exceptions import WorkflowError 8 | except ImportError: 9 | # Snakmake 7.x.x 10 | from snakemake.exceptions import WorkflowError 11 | 12 | def parse_sample_sheet(config: dict) -> pd.DataFrame: 13 | samples = ( 14 | pd.read_table(config["samples"], sep=",", dtype=str) 15 | .replace(" ", "_", regex=True) 16 | .infer_objects( 17 | copy=False 18 | ) # needed to maintain same behavior in future pandas versions 19 | ) 20 | config_genomes = get_config_genomes(config, samples) 21 | refGenome = 'refGenome' in samples.columns and samples['refGenome'].notna().any() 22 | refPath = 'refPath' in samples.columns and samples['refPath'].notna().any() 23 | if not any([config_genomes, refGenome, refPath]): 24 | raise WorkflowError("No 'refGenome' or 'refPath' found in config or sample sheet.") 25 | if config_genomes is not None: 26 | config_refGenome, config_refPath = config_genomes 27 | samples["refGenome"] = config_refGenome 28 | samples["refPath"] = config_refPath 29 | if 'refPath' in samples.columns and samples['refPath'].notna().any(): 30 | check_ref_paths(samples) 31 | return samples 32 | 33 | def get_config_genomes(config: dict, samples: pd.DataFrame): 34 | refGenome = config.get("refGenome", False) 35 | refPath = config.get("refPath", False) 36 | 37 | if refGenome and refPath: 38 | if 'refGenome' in samples.columns and samples['refGenome'].notna().any(): 39 | raise WorkflowError("'refGenome' is set in sample sheet AND in config. These are mutually exclusive.") 40 | return refGenome, refPath 41 | elif refGenome and not refPath: 42 | raise WorkflowError("'refGenome' is set in config, but 'refPath' is not. Both are required to use these settings.") 43 | elif refPath and not refGenome: 44 | raise WorkflowError("'refPath' is set in config, but 'refGenome' is not. Both are required to use these settings.") 45 | return None 46 | 47 | def check_ref_paths(samples: pd.DataFrame) -> None: 48 | """ 49 | Checks reference paths to make sure they exist, otherwise we might try to download them based on refGenome. 50 | Also make sure only one refPath per refGenome. 
51 | """ 52 | for refname in samples["refGenome"].dropna().tolist(): 53 | refs = samples[samples["refGenome"] == refname]["refPath"].dropna().unique().tolist() 54 | if len(refs) > 1: 55 | raise WorkflowError(f"refGenome '{refname}' has more than one unique 'refPath' specified: {refs}") 56 | for ref in refs: 57 | if not Path(ref).exists(): 58 | raise WorkflowError(f"refPath: '{ref}' was specified in sample sheet, but could not be found.") 59 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # 🚀 snpArcher 2 | 3 | snpArcher logo 4 | 5 | Have resequencing data as fastq files and a reference genome? Want a VCF file of genotypes? Use snpArcher as your one-stop shop to quickly and efficiently produce an analysis-ready dataset. No need to create a hand-tailored workflow cobbled together with tape and error-ridden ChatGPT code; use snpArcher for all your variant calling needs on your laptop, on your server, or up in the clouds. 6 | 7 | snpArcher is a reproducible workflow optimized for nonmodel organisms and comparisons across datasets, built on the [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html#) workflow management system. It provides a streamlined approach to dataset acquisition, variant calling, quality control, and downstream analysis. 8 | 9 | Snakemake makes it easy to bundle together the many steps involved in running a bioinformatics pipeline. The workflow involves mapping reads to a reference genome, calling variants per sample with GATK's HaplotypeCaller, and joint-genotyping at the population level with GATK's GenomicsDBImport and GenotypeGVCFs. Each of these steps can be slow and tiresome to run on its own, so the workflow has been carefully designed and tested to maximize efficiency. We use intervals to break jobs into smaller chunks so that time- and memory-hungry steps like HaplotypeCaller run quickly. If you have access to a Sentieon license for accelerated variant calling, we include options for this. An example invocation is sketched at the end of this page. 10 | 11 | Finally, the pipeline makes it easy to evaluate how the data look. Review the HTML file in the QC folder at the end of a run to see how your samples relate to each other, along with a number of metrics for evaluating variant-calling quality. 12 | 13 | Remember to examine the config.yaml file to edit options for each step. We have carefully chosen default options that should work for most users, but they can be tweaked there. 14 | 15 | ## Requirements 16 | - Illumina paired-end fastq files for one or more individuals 17 | - A reference genome 18 | - A sample sheet with sample names matched to the read names 19 | - Snakemake and Mamba installed on your system 20 | - If using Google Cloud, you will need to have set up an account on the GCP console 21 | 22 | ## Using snpArcher 23 | - To get started quickly, check out the quick start tutorial! 24 | - Otherwise start [here](./setup.md). 25 | 26 | ## Citing 27 | - Please cite our preprint [here](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1) 28 | - Also, make sure to cite the tools you used within snpArcher. 29 | 30 | ## Contributing to snpArcher 31 | - If you encounter a bug or want to request a feature, please open an issue on our [GitHub page](https://github.com/harvardinformatics/snpArcher). 32 | - If you'd like to contribute a module, check out our [module contribution guidelines](./modules.md#module-contribution-guidelines).
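As a concrete companion to the pointers above, here is a minimal, hypothetical launch command for running the main workflow locally. This is an editorial sketch rather than text from the repository docs: the flags mirror the CI invocations in .github/workflows/main.yaml, while the core count and the assumption that config/config.yaml and your sample sheet are already filled in are placeholders to adapt.

```bash
# Editorial sketch of a local run (not an official command from the docs).
# Flags mirror the CI invocation in .github/workflows/main.yaml:
#   --use-conda          build per-rule conda environments from workflow/envs/*.yml
#   --workflow-profile   use the bundled default profile in workflow-profiles/default
#   -j                   maximum number of cores/concurrent jobs (adjust to your machine)
# Run from the repository root after editing config/config.yaml and your sample sheet.
snakemake -s workflow/Snakefile \
    --workflow-profile workflow-profiles/default \
    --use-conda --conda-frontend mamba \
    --show-failed-logs \
    -j 10
```

The same pattern, pointed at workflow/modules/qc/Snakefile, workflow/modules/postprocess/Snakefile, or workflow/modules/trackhub/Snakefile, is how the CI exercises the individual modules.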
33 | 34 | ```{toctree} 35 | :hidden: True 36 | ./setup.md 37 | ./executing.md 38 | ./examples.md 39 | ./modules.md 40 | ./datasets.md 41 | ``` 42 | -------------------------------------------------------------------------------- /workflow/rules/sumstats.smk: -------------------------------------------------------------------------------- 1 | rule bam_sumstats: 2 | input: 3 | unpack(get_bams), 4 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 5 | output: 6 | cov = "results/{refGenome}/summary_stats/{sample}_coverage.txt", 7 | alnSum = "results/{refGenome}/summary_stats/{sample}_AlnSumMets.txt", 8 | conda: 9 | "../envs/fastq2bam.yml" 10 | shell: 11 | """ 12 | samtools coverage --output {output.cov} {input.bam} 13 | samtools flagstat -O tsv {input.bam} > {output.alnSum} 14 | """ 15 | 16 | rule sentieon_bam_stats: 17 | input: 18 | unpack(get_bams), 19 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 20 | ref = "results/{refGenome}/data/genome/{refGenome}.fna" 21 | params: 22 | lic = config['sentieon_lic'] 23 | output: 24 | insert_file = "results/{refGenome}/summary_stats/{sample}_insert_metrics.txt", 25 | qd = "results/{refGenome}/summary_stats/{sample}_qd_metrics.txt", 26 | gc = "results/{refGenome}/summary_stats/{sample}_gc_metrics.txt", 27 | gc_summary = "results/{refGenome}/summary_stats/{sample}_gc_summary.txt", 28 | mq = "results/{refGenome}/summary_stats/{sample}_mq_metrics.txt" 29 | conda: 30 | "../envs/sentieon.yml" 31 | shell: 32 | """ 33 | export SENTIEON_LICENSE={params.lic} 34 | sentieon driver -r {input.ref} \ 35 | -t {threads} -i {input.bam} \ 36 | --algo MeanQualityByCycle {output.mq} \ 37 | --algo QualDistribution {output.qd} \ 38 | --algo GCBias --summary {output.gc_summary} {output.gc} \ 39 | --algo InsertSizeMetricAlgo {output.insert_file} 40 | """ 41 | 42 | rule collect_fastp_stats: 43 | input: 44 | collect_fastp_stats_input 45 | output: 46 | "results/{refGenome}/summary_stats/{sample}_fastp.out" 47 | run: 48 | combine_fastp_files(input, output) 49 | 50 | rule collect_sumstats: 51 | input: 52 | unpack(get_input_sumstats) 53 | output: 54 | "results/{refGenome}/summary_stats/{prefix}_bam_sumstats.txt" 55 | run: 56 | if not config['sentieon']: 57 | FractionReadsPassFilter, NumReadsPassFilter = collectFastpOutput(input.fastpFiles) 58 | aln_metrics = collectAlnSumMets(input.alnSumMetsFiles) 59 | SeqDepths, CoveredBases = collectCoverageMetrics(input.coverageFiles) 60 | printBamSumStats(SeqDepths, CoveredBases, aln_metrics, FractionReadsPassFilter, NumReadsPassFilter, output[0]) 61 | else: 62 | FractionReadsPassFilter, NumReadsPassFilter = collectFastpOutput(input.fastpFiles) 63 | aln_metrics = collectAlnSumMets(input.alnSumMetsFiles) 64 | SeqDepths, CoveredBases = collectCoverageMetrics(input.coverageFiles) 65 | median_inserts, median_insert_std = collect_inserts(input.insert_files) 66 | printBamSumStats(SeqDepths, CoveredBases, aln_metrics, FractionReadsPassFilter, NumReadsPassFilter, output[0], median_inserts, median_insert_std) -------------------------------------------------------------------------------- /.test/qc/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | resource_config: "config/resources.yaml" # path to resources yaml config 7 | final_prefix: 
"test_qc" # prefix for final output files 8 | intervals: True #Set to True if you want to perform variant calling using interval approach. 9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 10 | sentieon_lic: "" #set to path of sentieon license 11 | remote_reads: False # Set True if reads are in a Google Bucket seperate from --default-remote-prefix. 12 | remote_reads_prefix: "" # set to google bucket prefix where reads live 13 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 14 | trackhub_email: "hi@website.com" 15 | ############################## 16 | # Variables you *might* need to change 17 | ############################## 18 | 19 | # Interval approach options, only applicable if intervals is True 20 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 21 | num_gvcf_intervals: 1 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps. 22 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Reccomend <1 23 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper 24 | 25 | ## Coverage options ## 26 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 27 | min_depth: 2 28 | # low coverage options (< 10x) 29 | minP: 1 30 | minD: 1 31 | 32 | # high coverage options (> 10x) 33 | #minP: 2 34 | #minD: 4 35 | 36 | het_prior: 0.005 #prior probabilty of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods 37 | 38 | ######################################## 39 | ## callable sites bed file thresholds ## 40 | ######################################## 41 | 42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file 43 | cov_threshold: 2 #regions of the genome with coverage above/below cov_thresh standard deviations will be filtered 44 | 45 | #this ignores small regions of abberatant coverage/mappability as often these are just below the threshold 46 | #to do strict filtering, set to 0 47 | 48 | callable_merge: 100 #merge callable regions separated by this or fewer bp into a single region 49 | 50 | 51 | ## QC options ## 52 | nClusters: 3 53 | GoogleAPIKey: 54 | 55 | ## Filtering options ## 56 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. 
57 | -------------------------------------------------------------------------------- /workflow/modules/mk/Snakefile: -------------------------------------------------------------------------------- 1 | configfile: "config/config.yaml" 2 | include: "common.smk" 3 | 4 | 5 | REFGENOME = samples['refGenome'].unique().tolist() 6 | 7 | rule all: 8 | input: 9 | expand("results/{refGenome}/mk/{prefix}_mk_table.tsv", refGenome=REFGENOME, prefix=config['final_prefix']) 10 | 11 | rule prep_genome: 12 | """ 13 | Gets the needed information (fasta and gff) from NCBI 14 | """ 15 | input: 16 | ref = get_ref, 17 | gff = get_gff 18 | output: 19 | ref = "results/{refGenome}/mk/{refGenome}.fna", 20 | gff = "results/{refGenome}/mk/{refGenome}.gff" 21 | params: 22 | dataset = "results/{refGenome}/mk/{refGenome}_dataset.zip", 23 | outdir = "results/{refGenome}/mk/{refGenome}" 24 | conda: 25 | "envs/ncbi.yml" 26 | shell: 27 | """ 28 | set +e 29 | #if genome is local, datasets will fail, we will just continue 30 | mkdir -p {params.outdir} 31 | datasets download genome accession --exclude-protein --exclude-rna --filename {params.dataset} {wildcards.refGenome} \ 32 | && 7z x {params.dataset} -aoa -o{params.outdir} 33 | 34 | if [ -z "{input.ref}" ] 35 | then 36 | cat {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/*.fna > {output.ref} 37 | else 38 | cp {input.ref} {output.ref} 39 | fi 40 | 41 | if [ -z "{input.gff}" ] 42 | then 43 | cp {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/genomic.gff {output.gff} 44 | else 45 | cp {input.gff} {output.gff} 46 | fi 47 | """ 48 | 49 | rule split_samples: 50 | """ 51 | Splits sample sheet to make ingroup and outgroup files 52 | """ 53 | output: 54 | exclude = "results/{refGenome}/mk/{prefix}_exclude.txt", 55 | outgroups = "results/{refGenome}/mk/{prefix}_ougroups.txt" 56 | run: 57 | out_df = samples[["BioSample", "SampleType"]] 58 | out_df.drop_duplicates("BioSample", inplace=True) 59 | exclude =out_df[~out_df.SampleType.isin(["ingroup", "outgroup"])].BioSample 60 | outgroups = out_df[out_df.SampleType.isin(["outgroup"])].BioSample 61 | exclude.to_csv(output[0], index=False, sep="\t", header=False) 62 | outgroups.to_csv(output[1], index=False, sep="\t", header=False) 63 | 64 | rule degenotate: 65 | """ 66 | Runs degenotate to compute MK tables 67 | """ 68 | input: 69 | vcf = "results/{refGenome}/{prefix}_clean_snps.vcf.gz", 70 | genome = "results/{refGenome}/mk/{refGenome}.fna", 71 | gff = "results/{refGenome}/mk/{refGenome}.gff", 72 | exclude = "results/{refGenome}/mk/{prefix}_exclude.txt", 73 | outgroups = "results/{refGenome}/mk/{prefix}_ougroups.txt" 74 | output: 75 | "results/{refGenome}/mk/{prefix}_mk_table.tsv" 76 | params: 77 | delim = "space" 78 | log: 79 | "logs/{refGenome}/mk/{prefix}_degenotate.txt" 80 | conda: 81 | "envs/mk.yml" 82 | shell: 83 | """ 84 | if [ -s {input.exclude} ] 85 | then 86 | degenotate.py --overwrite -a {input.gff} -g {input.genome} -u {input.outgroups} -e {input.exclude} -d {params.delim} -o "results/{wildcards.refGenome}/mk/{wildcards.prefix}_degen_raw" -v {input.vcf} 87 | else 88 | degenotate.py --overwrite -a {input.gff} -g {input.genome} -u {input.outgroups} -d {params.delim} -o "results/{wildcards.refGenome}/mk/{wildcards.prefix}_degen_raw" -v {input.vcf} 89 | fi 90 | cp results/{wildcards.refGenome}/mk/{wildcards.prefix}_degen_raw/mk.tsv {output} 91 | """ 92 | -------------------------------------------------------------------------------- /workflow/rules/intervals.smk: 
-------------------------------------------------------------------------------- 1 | rule picard_intervals: 2 | input: 3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 4 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 5 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict" 6 | output: 7 | intervals = temp("results/{refGenome}/intervals/picard_interval_list.list") 8 | params: 9 | minNmer = int(config['minNmer']) 10 | conda: 11 | '../envs/bam2vcf.yml' 12 | log: 13 | "logs/{refGenome}/picard_intervals/log.txt" 14 | benchmark: 15 | "benchmarks/{refGenome}/picard_intervals/benchmark.txt" 16 | shell: 17 | "picard ScatterIntervalsByNs -Xmx{resources.mem_mb_reduced}m REFERENCE={input.ref} OUTPUT={output.intervals} MAX_TO_MERGE={params.minNmer} OUTPUT_TYPE=ACGT &> {log}\n" 18 | 19 | rule format_interval_list: 20 | input: 21 | intervals = "results/{refGenome}/intervals/picard_interval_list.list" 22 | output: 23 | intervals = "results/{refGenome}/intervals/master_interval_list.list" 24 | run: 25 | with open(output.intervals, "w") as out: 26 | with open(input.intervals, "r") as inp: 27 | for line in inp: 28 | if not line.startswith("@"): 29 | line = line.strip().split("\t") 30 | chrom, start, end = line[0], line[1], line[2] 31 | print(f"{chrom}:{start}-{end}", file=out) 32 | 33 | 34 | checkpoint create_db_intervals: 35 | input: 36 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 37 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 38 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 39 | intervals = "results/{refGenome}/intervals/master_interval_list.list" 40 | output: 41 | fof = "results/{refGenome}/intervals/db_intervals/intervals.txt", 42 | out_dir = directory("results/{refGenome}/intervals/db_intervals"), 43 | params: 44 | max_intervals = get_db_interval_count 45 | log: 46 | "logs/{refGenome}/db_intervals/log.txt" 47 | benchmark: 48 | "benchmarks/{refGenome}/db_intervals/benchmark.txt" 49 | conda: 50 | '../envs/bam2vcf.yml' 51 | shell: 52 | """ 53 | gatk SplitIntervals --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' -L {input.intervals} \ 54 | -O {output.out_dir} -R {input.ref} -scatter {params} \ 55 | -mode INTERVAL_SUBDIVISION \ 56 | --interval-merging-rule OVERLAPPING_ONLY &> {log} 57 | ls -l {output.out_dir}/*scattered.interval_list > {output.fof} 58 | """ 59 | 60 | checkpoint create_gvcf_intervals: 61 | input: 62 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 63 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 64 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 65 | intervals = "results/{refGenome}/intervals/master_interval_list.list" 66 | output: 67 | fof = "results/{refGenome}/intervals/gvcf_intervals/intervals.txt", 68 | out_dir = directory("results/{refGenome}/intervals/gvcf_intervals"), 69 | params: 70 | max_intervals = config["num_gvcf_intervals"] 71 | log: 72 | "logs/{refGenome}/gvcf_intervals/log.txt" 73 | benchmark: 74 | "benchmarks/{refGenome}/gvcf_intervals/benchmark.txt" 75 | conda: 76 | '../envs/bam2vcf.yml' 77 | shell: 78 | """ 79 | gatk SplitIntervals --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' -L {input.intervals} \ 80 | -O {output.out_dir} -R {input.ref} -scatter {params} \ 81 | -mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION \ 82 | --interval-merging-rule OVERLAPPING_ONLY &> {log} 83 | ls -l {output.out_dir}/*scattered.interval_list > {output.fof} 84 | """ 
-------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | paths: 6 | - "**/Snakefile" 7 | - "**/*.smk" 8 | - "**/*.py" 9 | - ".github/workflows/*" 10 | 11 | 12 | jobs: 13 | 14 | Testing_local-fastq: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Test workflow (Local Fastq > VCF) 19 | uses: snakemake/snakemake-github-action@v1.25.1 20 | with: 21 | directory: .test/ecoli/ 22 | snakefile: workflow/Snakefile 23 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 24 | stagein: "conda config --set channel_priority strict" 25 | Testing_config-genomes: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | - name: Test workflow (Local Fastq > VCF) 30 | uses: snakemake/snakemake-github-action@v1.25.1 31 | with: 32 | directory: .test/ecoli/ 33 | snakefile: workflow/Snakefile 34 | args: "--config samples='config/ecoli_config_genome.csv' refGenome='ecoli' refPath='data/local_genome/local_genome.fna.gz' --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 35 | Testing_local-fastq-sentieon-dryrun: 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v2 39 | - name: Test workflow (Local Fastq > VCF) 40 | uses: snakemake/snakemake-github-action@v1.25.1 41 | with: 42 | directory: .test/ecoli/ 43 | snakefile: workflow/Snakefile 44 | args: "--config sentieon=True --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default --dry-run" 45 | stagein: "conda config --set channel_priority strict" 46 | Testing_local-fastq_and_sra: 47 | runs-on: ubuntu-latest 48 | steps: 49 | - uses: actions/checkout@v2 50 | - name: Test workflow (Local Fastq > VCF) 51 | uses: snakemake/snakemake-github-action@v1.25.1 52 | with: 53 | directory: .test/ecoli/ 54 | snakefile: workflow/Snakefile 55 | args: "--config samples='config/local_and_sra.csv' --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 56 | Testing_QC: 57 | runs-on: ubuntu-latest 58 | steps: 59 | - uses: actions/checkout@v2 60 | - name: Test workflow (QC rules) 61 | uses: snakemake/snakemake-github-action@v1.25.1 62 | with: 63 | snakefile: workflow/modules/qc/Snakefile 64 | directory: .test/qc/ 65 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 66 | # stagein: "mamba install -y -c conda-forge 'python==3.11.4'" 67 | Testing_Postprocess: 68 | runs-on: ubuntu-latest 69 | steps: 70 | - uses: actions/checkout@v2 71 | - name: Test workflow (Postprocess) 72 | uses: snakemake/snakemake-github-action@v1.25.1 73 | with: 74 | snakefile: workflow/modules/postprocess/Snakefile 75 | directory: .test/postprocess/ 76 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 77 | # stagein: "mamba install -y -c conda-forge 'python==3.11.4'" 78 | Testing_Trackhub: 79 | runs-on: ubuntu-latest 80 | steps: 81 | - uses: actions/checkout@v2 82 | - name: Test workflow (Trackhubs) 83 | uses: snakemake/snakemake-github-action@v1.25.1 84 | with: 85 | 
snakefile: workflow/modules/trackhub/Snakefile 86 | directory: .test/trackhub/ 87 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default" 88 | # stagein: "mamba install -y -c conda-forge 'python==3.11.4'" 89 | -------------------------------------------------------------------------------- /workflow/rules/cov_filter.smk: -------------------------------------------------------------------------------- 1 | rule compute_d4: 2 | input: 3 | unpack(get_bams) 4 | output: 5 | dist = "results/{refGenome}/callable_sites/{sample}.mosdepth.global.dist.txt", 6 | d4="results/{refGenome}/callable_sites/{sample}.per-base.d4.gz", 7 | d4gzi ="results/{refGenome}/callable_sites/{sample}.per-base.d4.gz.gzi", 8 | summary="results/{refGenome}/callable_sites/{sample}.mosdepth.summary.txt" 9 | conda: 10 | "../envs/cov_filter.yml" 11 | log: 12 | "logs/{refGenome}/compute_d4/{sample}.txt" 13 | benchmark: 14 | "benchmarks/{refGenome}/compute_d4/{sample}.txt" 15 | params: 16 | prefix = subpath(output.summary, strip_suffix=".mosdepth.summary.txt"), 17 | d4 = subpath(output.d4, strip_suffix=".gz") 18 | shell: 19 | """ 20 | mosdepth --d4 -t {threads} {params.prefix} {input.bam} &> {log} 21 | bgzip --index {params.d4} 22 | """ 23 | 24 | 25 | 26 | rule collect_covstats: 27 | input: 28 | unpack(get_input_covstats) 29 | output: 30 | "results/{refGenome}/summary_stats/all_cov_sumstats.txt" 31 | run: 32 | covStats = collectCovStats(input.covStatFiles) 33 | with open(output[0], "w") as f: 34 | print("chrom\tmean_cov\tstdev_cov", file=f) 35 | for chrom in covStats: 36 | print(chrom, covStats[chrom]['mean'], covStats[chrom]['stdev'], sep="\t", file=f) 37 | 38 | rule create_cov_thresholds: 39 | input: 40 | stats = "results/{refGenome}/summary_stats/all_cov_sumstats.txt", 41 | output: 42 | thresholds = "results/{refGenome}/callable_sites/{prefix}_callable_sites_thresholds.tsv" 43 | 44 | params: 45 | cov_threshold_stdev = config["cov_threshold_stdev"], 46 | cov_threshold_lower = config["cov_threshold_lower"], 47 | cov_threshold_upper = config["cov_threshold_upper"], 48 | cov_threshold_rel = config["cov_threshold_rel"] 49 | conda: 50 | "../envs/cov_filter.yml" 51 | script: 52 | "../scripts/create_coverage_thresholds.py" 53 | 54 | rule clam_loci: 55 | input: 56 | unpack(get_input_for_coverage), 57 | thresholds = "results/{refGenome}/callable_sites/{prefix}_callable_sites_thresholds.tsv" 58 | output: 59 | cov = "results/{refGenome}/callable_sites/{prefix}/callable_sites.d4", 60 | bed = "results/{refGenome}/callable_sites/{prefix}/callable_sites.bed", 61 | tmp_bed = temp("results/{refGenome}/callable_sites/{prefix}/callable_sites_temp.bed") # temp fix until clam produces better bed files cm 62 | params: 63 | outdir = subpath(output.cov, parent=True) 64 | conda: 65 | "../envs/cov_filter.yml" 66 | log: 67 | "logs/{refGenome}/covbed/{prefix}.txt" 68 | benchmark: 69 | "benchmarks/{refGenome}/covbed/{prefix}_benchmark.txt" 70 | shell: 71 | """ 72 | clam loci -t {threads} --bed --thresholds-file {input.thresholds} -o {params.outdir} {input.d4} 2> {log} 73 | bedtk merge {output.bed} > {output.tmp_bed} 2>> {log} 74 | cp {output.tmp_bed} {output.bed} 2>> {log} 75 | """ 76 | 77 | rule callable_bed: 78 | input: 79 | cov = "results/{refGenome}/callable_sites/{prefix}/callable_sites.bed", 80 | map = "results/{refGenome}/callable_sites/{prefix}_callable_sites_map.bed" 81 | output: 82 | callable_sites = "results/{refGenome}/{prefix}_callable_sites.bed", 83 | tmp_cov 
= temp("results/{refGenome}/callable_sites/{prefix}_temp_cov.bed") 84 | conda: 85 | "../envs/cov_filter.yml" 86 | benchmark: 87 | "benchmarks/{refGenome}/callable_bed/{prefix}_benchmark.txt" 88 | params: 89 | merge = config['cov_merge'] 90 | shell: 91 | """ 92 | bedtools merge -d {params.merge} -i {input.cov} > {output.tmp_cov} 93 | bedtools intersect -a {output.tmp_cov} -b {input.map} | bedtools sort -i - | bedtools merge -i - > {output.callable_sites} 94 | """ 95 | -------------------------------------------------------------------------------- /workflow/scripts/create_coverage_bed.py: -------------------------------------------------------------------------------- 1 | from pyd4 import D4File,D4Builder 2 | import math 3 | from snakemake.exceptions import WorkflowError 4 | #read chrom coverage values and compute min/max 5 | cov_thresh = {} 6 | stdv_scale = snakemake.params["cov_threshold_stdev"] 7 | rel_scale = snakemake.params["cov_threshold_rel"] 8 | mean_lower = snakemake.params["cov_threshold_lower"] 9 | mean_upper = snakemake.params["cov_threshold_upper"] 10 | 11 | #check that correct settings are set 12 | 13 | if stdv_scale: 14 | if rel_scale: 15 | raise WorkflowError(f"Both cov_threshold_stdev and cov_threshold_rel are set, please choose one and make sure the other variable is empty in the config file.") 16 | elif mean_lower or mean_upper: 17 | raise WorkflowError(f"Both cov_threshold_stdev and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file.") 18 | elif rel_scale: 19 | if mean_lower or mean_upper: 20 | raise WorkflowError(f"Both cov_threshold_rel and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file.") 21 | elif mean_lower: 22 | if not mean_upper: 23 | mean_upper = 50000 24 | elif mean_upper: 25 | if not mean_lower: 26 | mean_lower = 1 27 | else: 28 | raise WorkflowError(f"Use coverage filter is True, but you did not specify coverage filtering options in the config.
Please check.") 29 | 30 | with open(snakemake.input["stats"]) as stats: 31 | for line in stats: 32 | if "mean" in line: 33 | continue 34 | 35 | fields=line.split() 36 | mean = float(fields[1]) 37 | stdev = math.sqrt(mean) 38 | #0 is chr, 1 is mean 39 | if stdv_scale: 40 | cov_thresh[fields[0]] = { 41 | 'low' : mean - (stdev * float(stdv_scale)), 42 | 'high' : mean + (stdev * float(stdv_scale)) 43 | } 44 | elif rel_scale: 45 | cov_thresh[fields[0]] = { 46 | 'low' : mean / float(rel_scale), 47 | 'high' : mean * float(rel_scale) 48 | } 49 | else: 50 | cov_thresh[fields[0]] = { 51 | 'low' : float(mean_lower), 52 | 'high' : float(mean_upper) 53 | } 54 | 55 | #read d4 file into python, convert to 56 | covfile = D4File(snakemake.input["d4"]) 57 | covmat = covfile.open_all_tracks() 58 | 59 | with open(snakemake.output["covbed"], mode='w') as covbed: 60 | good_interval = False 61 | for chrom in covfile.chroms(): 62 | 63 | try: 64 | thresh_high = cov_thresh[chrom[0]]['high'] 65 | except KeyError: 66 | thresh_high = cov_thresh['total']['high'] 67 | 68 | try: 69 | thresh_low = cov_thresh[chrom[0]]['low'] 70 | except KeyError: 71 | thresh_low = cov_thresh['total']['low'] 72 | 73 | for values in covmat.enumerate_values(chrom[0],0,chrom[1]): 74 | covs=values[2] 75 | pos=values[1] 76 | chr=values[0] 77 | #get mean coverage for window 78 | res1=math.fsum(covs)/len(covs) 79 | 80 | if res1 <= thresh_high and res1 >= thresh_low and good_interval == False: 81 | # we are starting a new interval, print chr and pos 82 | print(chr, pos, file=covbed, sep="\t", end="") 83 | # set good interval to True 84 | good_interval = True 85 | elif (res1 > thresh_high or res1 < thresh_low) and good_interval: 86 | # we are ending a good interval 87 | print("\t", pos, file=covbed, sep="") 88 | good_interval = False 89 | else: 90 | # we are either in a good interval, or in a bad interval, so just keep going 91 | continue 92 | # if at this stage we are in a good interval, that means the good interval goes ot the end of the chromosome 93 | if good_interval: 94 | endpos = chrom[1]+1 95 | print("\t", endpos, file=covbed, sep="") 96 | good_interval = False 97 | 98 | -------------------------------------------------------------------------------- /workflow/snparcher_utils/write_samples.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | import argparse 4 | from pathlib import Path 5 | from typing import TextIO 6 | # User provides list of sample names, 1 per line 7 | # User provides path to where fastq files are. Assume paired end and that file name has sample name in it 8 | # User provides path to reference genome. Need to copy this to proper path 9 | 10 | def read_sample_list(sample_fh: TextIO) -> list: 11 | return sample_fh.read().splitlines() 12 | 13 | def find_sample_fastqs(samples: list, fastq_dir: Path) -> dict: 14 | """Searches fastq_dir for sample names and associates in a dict""" 15 | sample_fastq_paths = {} 16 | cant_find = [] 17 | for samp_name in samples: 18 | fqs = sorted(list(fastq_dir.glob(f"*{samp_name}*"))) # Hoping that sorting will make fq1 first. 
19 | if len(fqs) != 2: 20 | cant_find.append(samp_name) 21 | else: 22 | sample_fastq_paths[samp_name] = fqs 23 | return sample_fastq_paths, cant_find 24 | 25 | def copy_reference(ref: Path) -> str: 26 | exts = ['.fna', '.fa', '.fasta'] 27 | for ext in exts: 28 | if ext in ref.name: 29 | ref_name = ref.name.split(ext)[0] 30 | if Path('..', 'data', 'genome', ref_name + ".fna").exists(): 31 | return ref_name 32 | if not Path("../data/genome").exists(): 33 | Path("../data/genome").mkdir(parents=True) 34 | if ref.suffix == ".gz": 35 | with gzip.open(ref, 'rb') as f_in: 36 | with open(Path('..', 'data', 'genome', ref_name + ".fna"), 'wb') as f_out: 37 | shutil.copyfileobj(f_in, f_out) 38 | else: 39 | shutil.copyfile(ref, Path('data', 'genome', ref_name + ".fna")) 40 | return ref_name 41 | 42 | def write_sample_sheet(sample_dict: dict, ref_name: str, ref_path: str, ncbi_ref: bool) -> None: 43 | """Writes the sample sheet""" 44 | with open(Path("../config", "samples.csv"), "w") as out: 45 | if (ncbi_ref): 46 | out.write("BioSample,LibraryName,refGenome,Run,BioProject,fq1,fq2\n") 47 | for i, (k, v) in enumerate(sample_dict.items()): 48 | out.write(f"{k},lib_{k},{ref_name},{i},NaN,{v[0]},{v[1]}\n") 49 | else: 50 | out.write("BioSample,LibraryName,refGenome,refPath,Run,BioProject,fq1,fq2\n") 51 | for i, (k, v) in enumerate(sample_dict.items()): 52 | out.write(f"{k},lib_{k},{ref_name},{ref_path},{i},NaN,{v[0]},{v[1]}\n") 53 | 54 | 55 | def main() -> None: 56 | 57 | parser = argparse.ArgumentParser(description='Write sample files.') 58 | parser.add_argument('-s', '--sample_list', dest='samp', required=True, help="Specify path to sample list") 59 | parser.add_argument('-f', '--fastq_dir', dest='fastq', required=True, help="Specify path to fastq dir") 60 | parser.add_argument('-c', '--copy', dest='copyref', required=False, default=False, help="Copy reference genome to data/genome dir and unzip.") 61 | 62 | group = parser.add_mutually_exclusive_group(required=True) 63 | group.add_argument('-r', '--ref', dest='ref', help="Specify path to reference genome. Mutually exclusive with -a/--acc.") 64 | group.add_argument('-a', '--acc', dest='acc', help="Specify reference genome accession. 
Mutually exclusive with -r/--ref") 65 | args = parser.parse_args() 66 | 67 | sample_list = args.samp 68 | fastq_dir = Path(args.fastq) 69 | 70 | 71 | with open(sample_list, "r") as f: 72 | samples = read_sample_list(f) 73 | 74 | sample_dict, cant_find = find_sample_fastqs(samples, fastq_dir) 75 | ncbi_ref = True 76 | 77 | if (args.ref): 78 | ref = Path(args.ref) 79 | ncbi_ref = False 80 | if args.copyref: 81 | ref_name = copy_reference(ref) 82 | ref_path = "../data/genome/" + ref_name + ".fna" 83 | else: 84 | ref_name = ref.stem 85 | ref_path = args.ref 86 | else: 87 | ref_name = args.acc 88 | ref_path = "" 89 | 90 | 91 | write_sample_sheet(sample_dict, ref_name, ref_path, ncbi_ref) 92 | 93 | if cant_find: 94 | print("Couldnt' find fastqs for these files:") 95 | for name in cant_find: 96 | print(name) 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /.test/qc/config/test_qc_gls_config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | tmp_dir: "tmp/" # directory path for a temp dir 7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds. 8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 9 | sentieon_lic: "" #set to path of sentieon license 10 | remote_reads: False # set if you want reads to be on google cloud storage remote 11 | remote_reads_prefix: "" # set to google bucket name where reads live 12 | ############################## 13 | # Variables you *might* need to change 14 | ############################## 15 | 16 | # if using the BAM -> VCF workflows 17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. If you use the fastq->BAM workflow above, simply keep the default value 20 | 21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome 22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds. 23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point 25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here 26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good. 
27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes, site with more than this number are ignored, change according to expected depth 28 | 29 | ## Coverage options ## 30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 31 | 32 | # low coverage options (< 10x) 33 | minP: 1 34 | minD: 1 35 | 36 | # high coverage options (> 10x) 37 | #minP: 2 38 | #minD: 4 39 | 40 | ## QC options ## 41 | nClusters: 3 42 | 43 | ############################## 44 | # Variables you DO NOT need to change 45 | # if you do, ensure all path/directory names are followed by a "/" 46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name! 47 | ############################## 48 | 49 | output: "results/" 50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded 51 | refGenomeDir: "data/genome/" # where reference genomes go 52 | fastq2bamDir: "fastq2bam/" 53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads 54 | bamDir: "01_mappedReads/" # new directory created for BAM files 55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics 56 | 57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!) 58 | gatkDir: "gatk/" 59 | gvcfDir: "03_gvcfs/" 60 | dbDir: "04_genomicsDB/" 61 | vcfDir_gatk: "05_vcfs/" 62 | qcDir: "06_QC/" 63 | intDir: "intervalFiles/" 64 | 65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!) 66 | fbDir: "freebayes/" 67 | intervalDir: "00_intervals/" 68 | vcfDir_fb: "01_vcfs_byinterval/" 69 | 70 | #information for read groups 71 | flowcell: "FLOWCELL" 72 | platform: "ILLUMINA" 73 | 74 | cluster_config: "profiles/slurm/cluster_config.yml" 75 | test_qc: True 76 | -------------------------------------------------------------------------------- /.test/postprocess/config/test_qc_gls_config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | tmp_dir: "tmp/" # directory path for a temp dir 7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds. 8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 9 | sentieon_lic: "" #set to path of sentieon license 10 | remote_reads: False # set if you want reads to be on google cloud storage remote 11 | remote_reads_prefix: "" # set to google bucket name where reads live 12 | ############################## 13 | # Variables you *might* need to change 14 | ############################## 15 | 16 | # if using the BAM -> VCF workflows 17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. 
If you use the fastq->BAM workflow above, simply keep the default value 20 | 21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome 22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds. 23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point 25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here 26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good. 27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes, site with more than this number are ignored, change according to expected depth 28 | 29 | ## Coverage options ## 30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 31 | 32 | # low coverage options (< 10x) 33 | minP: 1 34 | minD: 1 35 | 36 | # high coverage options (> 10x) 37 | #minP: 2 38 | #minD: 4 39 | 40 | ## QC options ## 41 | nClusters: 3 42 | 43 | ############################## 44 | # Variables you DO NOT need to change 45 | # if you do, ensure all path/directory names are followed by a "/" 46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name! 47 | ############################## 48 | 49 | output: "results/" 50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded 51 | refGenomeDir: "data/genome/" # where reference genomes go 52 | fastq2bamDir: "fastq2bam/" 53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads 54 | bamDir: "01_mappedReads/" # new directory created for BAM files 55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics 56 | 57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!) 58 | gatkDir: "gatk/" 59 | gvcfDir: "03_gvcfs/" 60 | dbDir: "04_genomicsDB/" 61 | vcfDir_gatk: "05_vcfs/" 62 | qcDir: "06_QC/" 63 | intDir: "intervalFiles/" 64 | 65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!) 
66 | fbDir: "freebayes/" 67 | intervalDir: "00_intervals/" 68 | vcfDir_fb: "01_vcfs_byinterval/" 69 | 70 | #information for read groups 71 | flowcell: "FLOWCELL" 72 | platform: "ILLUMINA" 73 | 74 | cluster_config: "profiles/slurm/cluster_config.yml" 75 | test_qc: True 76 | -------------------------------------------------------------------------------- /.test/trackhub/config/test_qc_gls_config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | tmp_dir: "tmp/" # directory path for a temp dir 7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds. 8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 9 | sentieon_lic: "" #set to path of sentieon license 10 | remote_reads: False # set if you want reads to be on google cloud storage remote 11 | remote_reads_prefix: "" # set to google bucket name where reads live 12 | ############################## 13 | # Variables you *might* need to change 14 | ############################## 15 | 16 | # if using the BAM -> VCF workflows 17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. If you use the fastq->BAM workflow above, simply keep the default value 20 | 21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome 22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds. 23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point 25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here 26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good. 
27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes, site with more than this number are ignored, change according to expected depth 28 | 29 | ## Coverage options ## 30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 31 | 32 | # low coverage options (< 10x) 33 | minP: 1 34 | minD: 1 35 | 36 | # high coverage options (> 10x) 37 | #minP: 2 38 | #minD: 4 39 | 40 | ## QC options ## 41 | nClusters: 3 42 | 43 | ############################## 44 | # Variables you DO NOT need to change 45 | # if you do, ensure all path/directory names are followed by a "/" 46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name! 47 | ############################## 48 | 49 | output: "results/" 50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded 51 | refGenomeDir: "data/genome/" # where reference genomes go 52 | fastq2bamDir: "fastq2bam/" 53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads 54 | bamDir: "01_mappedReads/" # new directory created for BAM files 55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics 56 | 57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!) 58 | gatkDir: "gatk/" 59 | gvcfDir: "03_gvcfs/" 60 | dbDir: "04_genomicsDB/" 61 | vcfDir_gatk: "05_vcfs/" 62 | qcDir: "06_QC/" 63 | intDir: "intervalFiles/" 64 | 65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!) 66 | fbDir: "freebayes/" 67 | intervalDir: "00_intervals/" 68 | vcfDir_fb: "01_vcfs_byinterval/" 69 | 70 | #information for read groups 71 | flowcell: "FLOWCELL" 72 | platform: "ILLUMINA" 73 | 74 | cluster_config: "profiles/slurm/cluster_config.yml" 75 | test_qc: True 76 | -------------------------------------------------------------------------------- /workflow/modules/qc/config/test_qc_gls_config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | tmp_dir: "tmp/" # directory path for a temp dir 7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds. 8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 9 | sentieon_lic: "" #set to path of sentieon license 10 | remote_reads: False # set if you want reads to be on google cloud storage remote 11 | remote_reads_prefix: "" # set to google bucket name where reads live 12 | ############################## 13 | # Variables you *might* need to change 14 | ############################## 15 | 16 | # if using the BAM -> VCF workflows 17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/" 19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. 
If you use the fastq->BAM workflow above, simply keep the default value 20 | 21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome 22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds. 23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point 25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here 26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good. 27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes, site with more than this number are ignored, change according to expected depth 28 | 29 | ## Coverage options ## 30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 31 | 32 | # low coverage options (< 10x) 33 | minP: 1 34 | minD: 1 35 | 36 | # high coverage options (> 10x) 37 | #minP: 2 38 | #minD: 4 39 | 40 | ## QC options ## 41 | nClusters: 3 42 | 43 | ############################## 44 | # Variables you DO NOT need to change 45 | # if you do, ensure all path/directory names are followed by a "/" 46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name! 47 | ############################## 48 | 49 | output: "results/" 50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded 51 | refGenomeDir: "data/genome/" # where reference genomes go 52 | fastq2bamDir: "fastq2bam/" 53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads 54 | bamDir: "01_mappedReads/" # new directory created for BAM files 55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics 56 | 57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!) 58 | gatkDir: "gatk/" 59 | gvcfDir: "03_gvcfs/" 60 | dbDir: "04_genomicsDB/" 61 | vcfDir_gatk: "05_vcfs/" 62 | qcDir: "06_QC/" 63 | intDir: "intervalFiles/" 64 | 65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!) 66 | fbDir: "freebayes/" 67 | intervalDir: "00_intervals/" 68 | vcfDir_fb: "01_vcfs_byinterval/" 69 | 70 | #information for read groups 71 | flowcell: "FLOWCELL" 72 | platform: "ILLUMINA" 73 | 74 | cluster_config: "profiles/slurm/cluster_config.yml" 75 | test_qc: True 76 | -------------------------------------------------------------------------------- /docs/modules.md: -------------------------------------------------------------------------------- 1 | # Modules 2 | A key goal in the design of the snpArcher pipeline is to allow seamless extensibility with downstream processing. We implement this using Snakemake modules, which allow additional rules to easily extend the main pipeline. 
We present several modular extensions of snpArcher here, and we also hope that user-developed modules will grow the set of tools linked to snpArcher to facilitate diverse analyses. 3 | ## Module Contribution Guidelines 4 | We developed a set of criteria for including additional user-developed modules into snpArcher. This project is designed to be modular and easily extensible as we and workflow users develop additional features and downstream analysis tools. To ensure that contributed modules are reproducible and easily implemented, we propose the following evaluation criteria: 5 | 6 | 1. Each module must include a Snakemake workflow that defines the necessary environments using Conda. 7 | 2. The module must be freely distributed via Github with sufficient documentation that users can adapt it to their needs. 8 | 3. The module must provide a unit test based on either the existing test datasets available from the main snpArcher workflow or a module-specific minimal test dataset. 9 | 4. Each module should be registered within the main project page to enhance discoverability and ensure the above criteria are met. 10 | 11 | If you are interested in developing a module, please reach out via email or Github; we'd love to hear about it and chat. 12 | ## Quality Control 13 | The quality control module aggregates various statistics from the workflow and produces preliminary analyses and plots in an interactive HTML file, offering visualizations of summary statistics related to population structure, batch effects, sequencing depth, genetic relatedness, geography, and admixture. Most summaries are based on a random sample of 100,000 SNPs, while others provide high-level summaries of the full variant dataset. These visualizations help identify outliers, potential biases, and sequencing artifacts, such as cryptic genetic variation, batch effects, and reference bias. Additionally, an interactive heatmap aids in quickly identifying close relatives within the dataset, and spatial maps provide a visualization of PCA clusters in space. 14 | ### Config Options 15 | | Option | Description | Type | 16 | | ---- | -------------| ------ | 17 | |`nClusters`| Number of clusters for PCA| `int`| 18 | |`GoogleAPIKey`| Google Maps API key (optional).| `str`| 19 | |`min_depth`| Samples with average depth below this will be excluded from QC analysis| `int`| 20 | 21 | ```{note} 22 | To generate the QC dashboard, you must have at least 3 samples specified in your sample sheet. 23 | ``` 24 | ```{note} 25 | The output of the QC module should not be considered a final analysis and is solely intended to direct quality control of the dataset. 26 | ``` 27 | ## Postprocessing 28 | The postprocessing module is designed to be run after snpArcher has initially been run and you have determined whether there are samples that you would like to exclude from downstream analyses. To trigger this module, you must add the `SampleType` column to your sample sheet, and mark samples for inclusion with the value `include` and for exclusion with the value `exclude`. 29 | 30 | This module produces a filtered VCF by removing excluded samples as well as sites that do not pass the default and user-defined filters.
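As a rough illustration (not part of the module itself), the `SampleType` column could be appended to an existing sample sheet from the command line; the sample name `samp_3` and the decision to exclude it are made up here:

```shell
# Append a SampleType column to the sample sheet: default every sample to
# "include", then mark the hypothetical sample samp_3 as "exclude".
awk 'BEGIN {FS=OFS=","}
     NR == 1        {print $0, "SampleType"; next}
     $1 == "samp_3" {print $0, "exclude"; next}
                    {print $0, "include"}' config/samples.csv > config/samples_with_type.csv
```

The same thing can be done in a spreadsheet editor; what matters is that each sample is marked `include` or `exclude` as described above.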
31 | ### Config Options 32 | | Option | Description | Type | 33 | | ---- | -------------| ------ | 34 | |`contig_size`| SNPs on contigs this size or smaller will be excluded from 'clean' VCF | `int`| 35 | |`maf`| SNPs with MAF below this will be excluded from clean VCF| `float`| 36 | |`missingness`| SNPs with missingness greater than this will be excluded from clean VCF| `float`| 37 | |`scaffolds_to_exclude` | Comma-separated (no spaces) list of scaffolds/contigs to exclude from clean VCF| `str`| 38 | 39 | ```{hint} 40 | If you'd like to run the postprocessing module by default, you can add the `SampleType` column to your sample sheet, and mark all samples as `include`. 41 | ``` 42 | ## Trackhubs 43 | The trackhub module generates UCSC Genome Browser track files to explore population variation data from the VCF produced by snpArcher. This module computes and generates genome browser tracks for traditional population genomic summary statistics, such as windowed estimates of Tajima’s D, SNP density, Pi, minor allele frequency, and SNP depth. To trigger this module, you must set the `generate_trackhub` [config](./setup.md#core-configuration) option to `True` and supply an email address (a requirement for tracks displayed on the UCSC Genome Browser). 44 | 45 | ```{warning} 46 | The Trackhubs module is dependent on the postprocessing module. 47 | ``` 48 | -------------------------------------------------------------------------------- /.test/postprocess/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | resource_config: "config/resources.yaml" # path to resources yaml config 7 | final_prefix: "test_postprocess" # prefix for final output files 8 | intervals: True #Set to True if you want to perform variant calling using interval approach. 9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 10 | sentieon_lic: "" #set to path of sentieon license 11 | remote_reads: False # Set True if reads are in a Google Bucket separate from --default-remote-prefix. 12 | remote_reads_prefix: "" # set to google bucket prefix where reads live 13 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 14 | trackhub_email: "hi@website.com" 15 | ############################## 16 | # Variables you *might* need to change 17 | ############################## 18 | 19 | # Interval approach options, only applicable if intervals is True 20 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 21 | num_gvcf_intervals: 1 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps. 22 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create.
Reccomend <1 23 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper 24 | 25 | ## Coverage options ## 26 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 27 | min_depth: 2 28 | # low coverage options (< 10x) 29 | minP: 1 30 | minD: 1 31 | 32 | # high coverage options (> 10x) 33 | #minP: 2 34 | #minD: 4 35 | 36 | het_prior: 0.005 #prior probabilty of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods 37 | 38 | ######################################## 39 | ## callable sites bed file thresholds ## 40 | ######################################## 41 | 42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file 43 | cov_threshold: 2 #regions of the genome with coverage above/below cov_thresh standard deviations will be filtered 44 | 45 | #this ignores small regions of abberatant coverage/mappability as often these are just below the threshold 46 | #to do strict filtering, set to 0 47 | 48 | callable_merge: 100 #merge callable regions separated by this or fewer bp into a single region 49 | 50 | 51 | ## QC options ## 52 | nClusters: 3 53 | GoogleAPIKey: 54 | min_depth: 2 55 | 56 | ## Filtering options ## 57 | 58 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable. 59 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable. 60 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable. 61 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. 62 | 63 | ######################################## 64 | ## coverage thresholds ## 65 | ######################################## 66 | 67 | ## If cov_filter is True, use these parameters to control how coverage filtering is done 68 | ## Three options are provided for coverage-based filtering. The default option is to just filter 69 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit 70 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit, 71 | ## and make sure all other coverage variables are empty 72 | 73 | cov_threshold_lower: 1 74 | cov_threshold_upper: 10000 75 | 76 | ## Alternatively, filtering can be done based on standard deviations 77 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage), 78 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed. 79 | ## To use this option, set the variables below to the desired X 80 | ## and make sure all other coverage variables are empty 81 | 82 | cov_threshold_stdev: 83 | 84 | ## Finally, filtering can be done based on absolute scaling of the mean, 85 | ## where regions of the genome with mean coverage < (global mean coverge / X) or > (global mean coverge * X) are removed. 
86 | ## To use this option, set the variable below to the desired X 87 | ## and make sure all other coverage variables are empty 88 | 89 | cov_threshold_rel: 90 | 91 | -------------------------------------------------------------------------------- /.test/trackhub/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/test_coords.csv" # name of the sample metadata CSV 6 | resource_config: "config/resources.yaml" # path to resources yaml config 7 | final_prefix: "test_postprocess" # prefix for final output files 8 | intervals: True #Set to True if you want to perform variant calling using interval approach. 9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 10 | sentieon_lic: "" #set to path of sentieon license 11 | remote_reads: False # Set True if reads are in a Google Bucket seperate from --default-remote-prefix. 12 | remote_reads_prefix: "" # set to google bucket prefix where reads live 13 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 14 | trackhub_email: "hi@website.com" 15 | ############################## 16 | # Variables you *might* need to change 17 | ############################## 18 | 19 | # Interval approach options, only applicable if intervals is True 20 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 21 | num_gvcf_intervals: 1 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps. 22 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Reccomend <1 23 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper 24 | 25 | ## Coverage options ## 26 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 27 | min_depth: 2 28 | # low coverage options (< 10x) 29 | minP: 1 30 | minD: 1 31 | 32 | # high coverage options (> 10x) 33 | #minP: 2 34 | #minD: 4 35 | 36 | het_prior: 0.005 #prior probabilty of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods 37 | 38 | ######################################## 39 | ## callable sites bed file thresholds ## 40 | ######################################## 41 | 42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file 43 | cov_threshold: 2 #regions of the genome with coverage above/below cov_thresh standard deviations will be filtered 44 | 45 | #this ignores small regions of abberatant coverage/mappability as often these are just below the threshold 46 | #to do strict filtering, set to 0 47 | 48 | callable_merge: 100 #merge callable regions separated by this or fewer bp into a single region 49 | 50 | 51 | ## QC options ## 52 | nClusters: 3 53 | GoogleAPIKey: 54 | min_depth: 2 55 | 56 | ## Filtering options ## 57 | 58 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. 
Set to 0 to disable. 59 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable. 60 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable. 61 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. 62 | 63 | ######################################## 64 | ## coverage thresholds ## 65 | ######################################## 66 | 67 | ## If cov_filter is True, use these parameters to control how coverage filtering is done 68 | ## Three options are provided for coverage-based filtering. The default option is to just filter 69 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit 70 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit, 71 | ## and make sure all other coverage variables are empty 72 | 73 | cov_threshold_lower: 1 74 | cov_threshold_upper: 10000 75 | 76 | ## Alternatively, filtering can be done based on standard deviations 77 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage), 78 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed. 79 | ## To use this option, set the variables below to the desired X 80 | ## and make sure all other coverage variables are empty 81 | 82 | cov_threshold_stdev: 83 | 84 | ## Finally, filtering can be done based on absolute scaling of the mean, 85 | ## where regions of the genome with mean coverage < (global mean coverge / X) or > (global mean coverge * X) are removed. 86 | ## To use this option, set the variable below to the desired X 87 | ## and make sure all other coverage variables are empty 88 | 89 | cov_threshold_rel: 90 | 91 | 92 | 93 | ###TRACKHUB### 94 | trackhub_windows: [1000, 10000, 100000] -------------------------------------------------------------------------------- /.test/ecoli/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/ecoli_samples.csv" # name of the sample metadata CSV 6 | resource_config: "config/resources.yaml" # path to resources yaml config 7 | final_prefix: "ecoli_test" # prefix for final output files 8 | intervals: True #Set to True if you want to perform variant calling using interval (split by ns) approach. 9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 10 | sentieon_lic: ".lic" #set to path of sentieon license 11 | remote_reads: False # set if you want reads to be on google cloud storage remote 12 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty 13 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only) 14 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 
15 | trackhub_email: "hi@website.com" 16 | mark_duplicates: True 17 | sort_reads: False 18 | ############################## 19 | # Variables you *might* need to change 20 | ############################## 21 | 22 | #refGenome: 23 | #refPath: 24 | 25 | # Interval approach options, only applicable if intervals is True 26 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 27 | num_gvcf_intervals: 3 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps. 28 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Reccomend <1 29 | ploidy: 1 # Ploidy for HaplotypeCaller and Sentieon Haplotyper 30 | ## Coverage options ## 31 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 32 | 33 | # low coverage options (< 10x) 34 | minP: 1 35 | minD: 1 36 | 37 | # high coverage options (> 10x) 38 | #minP: 2 39 | #minD: 4 40 | 41 | het_prior: 0.005 #prior probabilty of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods 42 | 43 | ## callable sites bed file options ## 44 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file 45 | mappability_k: 150 #the kmer used to compute mappability with genmap; you should not need to change this except in special cases 46 | 47 | #this ignores small regions of abberatant coverage/mappability as often these are just below the threshold 48 | #to do strict filtering, set to 0 49 | 50 | mappability_merge: 100 #merge passing mappability regions separated by this or fewer bp into a single region 51 | cov_merge: 100 #merge passing coverage regions separate by this or fewer bp into a single region 52 | 53 | ## QC options ## 54 | nClusters: 3 55 | GoogleAPIKey: 56 | min_depth: 2 57 | 58 | ## Filtering options ## 59 | 60 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable. 61 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable. 62 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable. 63 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. 64 | 65 | ######################################## 66 | ## coverage thresholds ## 67 | ######################################## 68 | 69 | ## If cov_filter is True, use these parameters to control how coverage filtering is done 70 | ## Three options are provided for coverage-based filtering. 71 | 72 | ## The default option is to filter 73 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit. 74 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit, 75 | ## and make sure all other coverage variables are empty. 
76 | 77 | cov_threshold_lower: 1 78 | cov_threshold_upper: 50000 79 | 80 | ## Alternatively, filtering can be done based on standard deviations 81 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage), 82 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed. 83 | ## To use this option, set the variables below to the desired X 84 | ## and make sure all other coverage variables are empty 85 | 86 | cov_threshold_stdev: 87 | 88 | ## Finally, filtering can be done based on absolute scaling of the mean, 89 | ## where regions of the genome with mean coverage < (global mean coverge / X) or > (global mean coverge * X) are removed. 90 | ## To use this option, set the variable below to the desired X 91 | ## and make sure all other coverage variables are empty 92 | 93 | cov_threshold_rel: 94 | 95 | 96 | -------------------------------------------------------------------------------- /.test/ci/config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/samples.csv" # path to the sample metadata CSV 6 | resource_config: "config/resources.yaml" # path to resources yaml config 7 | final_prefix: "test_ci" # prefix for final output files 8 | intervals: True #Set to True if you want to perform variant calling using interval approach. 9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 10 | sentieon_lic: "" #set to path of sentieon license 11 | remote_reads: False # Set True if reads are in a Google Bucket seperate from --default-remote-prefix. 12 | remote_reads_prefix: "" # set to google bucket prefix where reads live 13 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty 14 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only) 15 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 16 | trackhub_email: "hi@website.com" 17 | mark_duplicates: True 18 | sort_reads: False 19 | ############################## 20 | # Variables you *might* need to change 21 | ############################## 22 | 23 | # Interval approach options, only applicable if intervals is True 24 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 25 | num_gvcf_intervals: 50 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps. 26 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. 
Reccomend <1 27 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper 28 | ## Coverage options ## 29 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 30 | 31 | # low coverage options (< 10x) 32 | minP: 1 33 | minD: 1 34 | 35 | # high coverage options (> 10x) 36 | #minP: 2 37 | #minD: 4 38 | 39 | het_prior: 0.005 #prior probabilty of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods 40 | 41 | ## callable sites bed file options ## 42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file 43 | mappability_k: 150 #the kmer used to compute mappability with genmap; you should not need to change this except in special cases 44 | 45 | #this ignores small regions of abberatant coverage/mappability as often these are just below the threshold 46 | #to do strict filtering, set to 0 47 | 48 | mappability_merge: 100 #merge passing mappability regions separated by this or fewer bp into a single region 49 | cov_merge: 100 #merge passing coverage regions separate by this or fewer bp into a single region 50 | 51 | ## QC options ## 52 | nClusters: 3 53 | GoogleAPIKey: 54 | min_depth: 2 55 | 56 | ## Filtering options ## 57 | 58 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable. 59 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable. 60 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable. 61 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. 62 | 63 | ######################################## 64 | ## coverage thresholds ## 65 | ######################################## 66 | 67 | ## If cov_filter is True, use these parameters to control how coverage filtering is done 68 | ## Three options are provided for coverage-based filtering. The default option is to just filter 69 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit 70 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit, 71 | ## and make sure all other coverage variables are empty 72 | 73 | cov_threshold_lower: 1 74 | cov_threshold_upper: 10000 75 | 76 | ## Alternatively, filtering can be done based on standard deviations 77 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage), 78 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed. 79 | ## To use this option, set the variables below to the desired X 80 | ## and make sure all other coverage variables are empty 81 | 82 | cov_threshold_stdev: 83 | 84 | ## Finally, filtering can be done based on absolute scaling of the mean, 85 | ## where regions of the genome with mean coverage < (global mean coverge / X) or > (global mean coverge * X) are removed. 
86 | ## To use this option, set the variable below to the desired X 87 | ## and make sure all other coverage variables are empty 88 | 89 | cov_threshold_rel: 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Variables you need to change 3 | ############################## 4 | 5 | samples: "config/samples.csv" # path to the sample metadata CSV 6 | final_prefix: "" # prefix for final output files 7 | intervals: True #Set to True if you want to perform variant calling using interval approach. 8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 9 | sentieon_lic: "" #set to path of sentieon license 10 | remote_reads: False # Set True if reads are in a location seperate from --default-remote-prefix. 11 | remote_reads_prefix: "" # set to google bucket prefix where reads live. FOR SNAKEMAKE 7.X.X ONLY. 12 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty 13 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only) 14 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 15 | trackhub_email: "" 16 | mark_duplicates: True 17 | sort_reads: False 18 | ############################## 19 | # Variables you *might* need to change 20 | ############################## 21 | 22 | # Set reference genome here if you would like to you use the same reference genome for all samples in sample sheet. See docs for more info. 23 | # refGenome: 24 | # refPath: 25 | 26 | # Interval approach options, only applicable if intervals is True 27 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50. 28 | num_gvcf_intervals: 50 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps. 29 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. 
Reccomend <1 30 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper 31 | ## Coverage options ## 32 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options 33 | 34 | # low coverage options (< 10x) 35 | minP: 1 36 | minD: 1 37 | 38 | # high coverage options (> 10x) 39 | #minP: 2 40 | #minD: 4 41 | 42 | het_prior: 0.005 #prior probabilty of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods 43 | 44 | ## callable sites bed file options ## 45 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file 46 | mappability_k: 150 #the kmer used to compute mappability with genmap; you should not need to change this except in special cases 47 | 48 | #this ignores small regions of abberatant coverage/mappability as often these are just below the threshold 49 | #to do strict filtering, set to 0 50 | 51 | mappability_merge: 100 #merge passing mappability regions separated by this or fewer bp into a single region 52 | cov_merge: 100 #merge passing coverage regions separate by this or fewer bp into a single region 53 | 54 | ## QC options ## 55 | nClusters: 3 56 | GoogleAPIKey: 57 | min_depth: 2 58 | 59 | ## Filtering options ## 60 | 61 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable. 62 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable. 63 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable. 64 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable. 65 | 66 | ######################################## 67 | ## coverage thresholds ## 68 | ######################################## 69 | 70 | ## If cov_filter is True, use these parameters to control how coverage filtering is done 71 | ## Three options are provided for coverage-based filtering. The default option is to just filter 72 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit 73 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit, 74 | ## and make sure all other coverage variables are empty 75 | 76 | cov_threshold_lower: 1 77 | cov_threshold_upper: 10000 78 | 79 | ## Alternatively, filtering can be done based on standard deviations 80 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage), 81 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed. 82 | ## To use this option, set the variables below to the desired X 83 | ## and make sure all other coverage variables are empty 84 | 85 | cov_threshold_stdev: 86 | 87 | ## Finally, filtering can be done based on absolute scaling of the mean, 88 | ## where regions of the genome with mean coverage < (global mean coverge / X) or > (global mean coverge * X) are removed. 
89 | ## To use this option, set the variable below to the desired X 90 | ## and make sure all other coverage variables are empty 91 | 92 | cov_threshold_rel: 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /workflow/modules/trackhub/scripts/write_hub_files.py: -------------------------------------------------------------------------------- 1 | from os.path import basename 2 | import shutil 3 | import os 4 | 5 | # https://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#UseOneFile 6 | hub_text = """hub {genome} 7 | shortLabel {genome} snpArcher Track Hub 8 | longLabel {genome} snpArcher Track Hub 9 | useOneFile on 10 | descriptionUrl index.html 11 | email {email}\n 12 | genome {genome}\n""" 13 | 14 | vcf_track_txt = """track VCF 15 | bigDataUrl {vcf_file} 16 | shortLabel VCF 17 | longLabel VCF 18 | visibility squish 19 | html index.html 20 | type vcfTabix\n""" 21 | 22 | window_parent_txt = """track {track_type} 23 | compositeTrack on 24 | shortLabel {track_type} 25 | longLabel {track_type} 26 | color {color} 27 | altColor 0,102,204 28 | autoScale on 29 | type bigWig 30 | allButtonPair on 31 | html index.html 32 | visibility full\n""" 33 | 34 | window_track_txt = """track {track_name} 35 | parent {parent} on 36 | bigDataUrl {data_url} 37 | type bigWig 38 | visibility {vis} 39 | shortLabel {label} 40 | longLabel {label}\n""" 41 | 42 | allele_freq_txt = """track MinorAlleleFrequency 43 | bigDataUrl {data_url} 44 | type bigWig 45 | color 88,85,120 46 | altColor 0,102,204 47 | autoScale on 48 | visibility full 49 | shortLabel Minor Allele Frequency 50 | html index.html 51 | longLabel Minor Allele Frequency\n""" 52 | 53 | snp_depth_txt = """track SNPDepth 54 | bigDataUrl {data_url} 55 | type bigWig 56 | color 120,172,145 57 | altColor 0,102,204 58 | autoScale on 59 | visibility full 60 | shortLabel SNP Depth 61 | html index.html 62 | longLabel SNP Depth\n""" 63 | 64 | coverage_track_txt = """track NonCallableSites 65 | bigDataUrl {cov_file} 66 | shortLabel Non Callable Sites 67 | type bigBed 68 | longLabel Non Callable Sites 69 | color 0,0,0 70 | html index.html 71 | visibility dense\n""" 72 | 73 | COLORS = { 74 | "Tajima": "(70,130,180)", 75 | "SNP-Density": "(186,85,211)", 76 | "Pi": "(248,174,51)", 77 | } 78 | 79 | html = """ 80 | 81 | 82 | snpArcher Track Hub Description 83 | 108 | 109 | 110 |

<h1>snpArcher Track Hub Description</h1>

<h2>Introduction</h2>
<p>To facilitate downstream data exploration and as an example of the module development components of this work, we developed a module to generate UCSC Genome Browser track files to explore population variation data (see paper for details).</p>

<h2>Track Descriptions</h2>

<h3>Tajima’s D</h3>
<p>This track provides windowed estimates of Tajima’s D, a population genetic statistic that measures the departure from neutral evolution in a DNA sequence.</p>

<h3>SNP Density</h3>
<p>This track displays the density of single nucleotide polymorphisms (SNPs) across the genome, showing regions with high or low levels of genetic variation.</p>

<h3>Pi</h3>
<p>The Pi track represents the average number of nucleotide differences per site between any two sequences in a population, providing an estimate of genetic diversity.</p>

<h3>Minor Allele Frequency</h3>
<p>This track shows the frequency of the less common allele at a SNP locus, providing insights into the genetic variation within a population.</p>

<h3>SNP Depth</h3>
<p>The SNP Depth track displays the number of reads or sequencing depth at each SNP position, indicating the coverage and quality of the variant calls.</p>

<h3>Non Callable Sites</h3>
<p>The Non Callable Sites track highlights regions in the genome that are considered non-callable, meaning that they have low sequencing coverage or other technical limitations that make it difficult to accurately determine genetic variation in those regions.</p>

136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | """ 144 | 145 | 146 | def human_format(num): 147 | num = float("{:.3g}".format(num)) 148 | magnitude = 0 149 | while abs(num) >= 1000: 150 | magnitude += 1 151 | num /= 1000.0 152 | return "{}{}".format( 153 | "{:f}".format(num).rstrip("0").rstrip("."), ["", "K", "M", "B", "T"][magnitude] 154 | ) 155 | 156 | 157 | def main(): 158 | file_types = snakemake.params["file_types"] # noqa: F821 159 | email = snakemake.params["email"] # noqa: F821 160 | trackhub_windows = snakemake.params["windows"] # noqa: F821 161 | vcf_file = basename(snakemake.input["vcf"][0]) # noqa: F821 162 | cov_file = basename(snakemake.input["callable_sites"][0]) # noqa: F821 163 | freq_file = basename(snakemake.input["allele_freq"][0]) # noqa: F821 164 | depth_file = basename(snakemake.input["depth"][0]) # noqa: F821 165 | genome = snakemake.params["refGenome"] # noqa: F821 166 | trackhub_file = snakemake.output["trackhub_file"] # noqa: F821 167 | html_file = snakemake.output["html"] # noqa: F821 168 | 169 | with open(html_file, "w") as f: 170 | f.write(html) 171 | 172 | with open(trackhub_file, "w") as out: 173 | print(hub_text.format(genome=genome, email=email), file=out) 174 | print(vcf_track_txt.format(vcf_file=vcf_file), file=out) 175 | print(coverage_track_txt.format(cov_file=cov_file), file=out) 176 | print(allele_freq_txt.format(data_url=freq_file), file=out) 177 | print(snp_depth_txt.format(data_url=depth_file), file=out) 178 | 179 | for file in file_types: 180 | print( 181 | window_parent_txt.format(track_type=file, color=COLORS[file]), file=out 182 | ) 183 | for window in trackhub_windows: 184 | track_name = f"{file}_{human_format(window)}_bp_bins" 185 | label = f"{file}_{human_format(window)}_bp bins" 186 | url = f"{file}_{window}.bw" 187 | if window == 1000: 188 | vis = "full" 189 | else: 190 | vis = "hide" 191 | print( 192 | window_track_txt.format( 193 | track_name=track_name, 194 | label=label, 195 | parent=file, 196 | data_url=url, 197 | vis=vis, 198 | ), 199 | file=out, 200 | ) 201 | 202 | 203 | if __name__ == "__main__": 204 | main() 205 | -------------------------------------------------------------------------------- /workflow/modules/postprocess/Snakefile: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | 5 | # Get utils. 
This is not great, but we can move to setup.py and install via pip later if want 6 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve() 7 | if str(utils_path) not in sys.path: 8 | sys.path.append(str(utils_path)) 9 | 10 | import pandas as pd 11 | import snparcher_utils 12 | configfile: "config/config.yaml" 13 | 14 | samples = snparcher_utils.parse_sample_sheet(config) 15 | REFGENOME = samples['refGenome'].unique().tolist() 16 | 17 | rule all: 18 | input: 19 | expand("results/{refGenome}/{prefix}_filtered.vcf.gz", refGenome=REFGENOME, prefix=config['final_prefix']), 20 | expand("results/{refGenome}/{prefix}_clean_snps.vcf.gz", refGenome=REFGENOME, prefix=config['final_prefix']), 21 | expand("results/{refGenome}/{prefix}_clean_indels.vcf.gz", refGenome=REFGENOME, prefix=config['final_prefix']) 22 | 23 | rule filter_individuals: 24 | """ 25 | make list of individuals to exclude based on sampleType column 26 | """ 27 | output: 28 | include = "results/{refGenome}/postprocess/{prefix}_samps.txt", 29 | run: 30 | out_df = samples[["BioSample", "SampleType"]] 31 | out_df.drop_duplicates("BioSample", inplace=True) 32 | include =out_df[~out_df.SampleType.isin(["exclude"])].BioSample 33 | include_clean = include.dropna() 34 | include_clean.to_csv(output[0], index=False, sep="\t", header=False) 35 | 36 | rule basic_filter: 37 | """ 38 | Filters a vcf file to remove samples marked exclude, sites that don't pass filters, 39 | sites with reference equal to N or alt equal to ., and sites with AF == 0. 40 | """ 41 | input: 42 | vcf = "results/{refGenome}/{prefix}_raw.vcf.gz", 43 | include = "results/{refGenome}/postprocess/{prefix}_samps.txt" 44 | output: 45 | filtered = "results/{refGenome}/{prefix}_filtered.vcf.gz", 46 | filtered_idx = "results/{refGenome}/{prefix}_filtered.vcf.gz.csi" 47 | conda: 48 | "envs/filter.yml" 49 | shell: 50 | """ 51 | bcftools view -S {input.include} -f .,PASS {input.vcf} -a -U -O u | bcftools +fill-tags -Ou | 52 | bcftools view -m2 -e 'AF==0 | ref="N" | ALT="."' -O z -o {output.filtered} 53 | bcftools index {output.filtered} 54 | """ 55 | 56 | rule update_bed: 57 | """ 58 | Updates callable sites bed file to add contigs less than threshold to regions to exclude 59 | """ 60 | input: 61 | bed = "results/{refGenome}/{prefix}_callable_sites.bed", 62 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai" 63 | output: 64 | bed = "results/{refGenome}/postprocess/{prefix}_exclude_sites.bed", 65 | tmp_bed = temp("results/{refGenome}/postprocess/{prefix}_tmp.bed") 66 | conda: 67 | "envs/bed.yml" 68 | params: 69 | size_filter = config["contig_size"], 70 | shell: 71 | """ 72 | awk 'BEGIN{{OFS="\\t"}}{{if ($2<{params.size_filter}) {{print $1,0,$2}}}}' {input.fai} > {output.tmp_bed} 73 | cat {output.tmp_bed} {input.bed} | bedtools sort -i - | bedtools merge -i - > {output.bed} 74 | """ 75 | 76 | rule strict_filter: 77 | input: 78 | bed = "results/{refGenome}/postprocess/{prefix}_exclude_sites.bed", 79 | vcf = "results/{refGenome}/{prefix}_filtered.vcf.gz", 80 | filtered_idx = "results/{refGenome}/{prefix}_filtered.vcf.gz.csi" 81 | output: 82 | vcf = temp("results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz"), 83 | idx = temp("results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz.csi") 84 | conda: 85 | "envs/filter.yml" 86 | params: 87 | miss = config["missingness"], 88 | maf = config["maf"], 89 | upper_bound = lambda wildcards: 1 - float(config["maf"]), 90 | chr_ex = config["scaffolds_to_exclude"], 91 | shell: 92 | """ 93 | if [ -z 
"{params.chr_ex}" ] 94 | then 95 | bcftools view -R {input.bed} -m2 -M2 \ 96 | -e 'F_MISSING > {params.miss} | AF<{params.maf} | AF>{params.upper_bound}' \ 97 | {input.vcf} -O u -o {output.vcf} 98 | else 99 | bcftools view -t ^{params.chr_ex} -R {input.bed} -m2 -M2 \ 100 | -e 'F_MISSING > {params.miss} | AF<{params.maf} | AF>{params.upper_bound}' \ 101 | {input.vcf} -O u -o {output.vcf} 102 | fi 103 | bcftools index {output.vcf} 104 | """ 105 | 106 | rule subset_indels: 107 | """ 108 | Produce a clean vcf with only indels variants. 109 | """ 110 | input: 111 | vcf = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz", 112 | idx = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz.csi", 113 | output: 114 | vcf = "results/{refGenome}/{prefix}_clean_indels.vcf.gz", 115 | idx = "results/{refGenome}/{prefix}_clean_indels.vcf.gz.tbi" 116 | conda: 117 | "envs/filter.yml" 118 | log: 119 | "logs/{refGenome}/postprocess/{prefix}_subset_indels.txt" 120 | shell: 121 | """ 122 | bcftools view -v indels -O z -o {output.vcf} {input.vcf} 123 | bcftools index -t {output.vcf} 124 | """ 125 | 126 | rule subset_snps: 127 | """ 128 | Produce a clean vcf with only simple snps. 129 | """ 130 | input: 131 | vcf = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz", 132 | idx = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz.csi" 133 | output: 134 | vcf = temp("results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz"), 135 | idx = temp("results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz.tbi") 136 | conda: 137 | "envs/filter.yml" 138 | log: 139 | "logs/{refGenome}/postprocess/{prefix}_subset_snps.txt" 140 | shell: 141 | """ 142 | bcftools view -v snps -e 'TYPE ~ "indel"' -O z -o {output.vcf} {input.vcf} 143 | bcftools index -t {output.vcf} 144 | """ 145 | 146 | rule drop_indel_SNPs: 147 | """ 148 | identify and remove SNPs that overlapped with indels and are coded as genotype length > 1 149 | """ 150 | input: 151 | vcf = "results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz", 152 | idx = "results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz.tbi" 153 | output: 154 | keep_snps = temp("results/{refGenome}/postprocess/{prefix}_keep_snp_positions.txt"), 155 | vcf = "results/{refGenome}/{prefix}_clean_snps.vcf.gz", 156 | idx = "results/{refGenome}/{prefix}_clean_snps.vcf.gz.tbi" 157 | conda: 158 | "envs/filter.yml" 159 | log: 160 | "logs/{refGenome}/postprocess/{prefix}_drop_indel_snps.txt" 161 | shell: 162 | """ 163 | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {input.vcf} | awk 'length($3) == 1 {{print $1"\t"$2}}' | bgzip -c > {output.keep_snps} 164 | tabix -s1 -b2 -e2 {output.keep_snps} 165 | bcftools view -T {output.keep_snps} {input.vcf} -Oz -o {output.vcf} 166 | bcftools index -t {output.vcf} 167 | """ -------------------------------------------------------------------------------- /docs/executing.md: -------------------------------------------------------------------------------- 1 | # Running snpArcher 2 | ## Setup 3 | Please refer to our [setup instructions](./setup.md) to prepare the snpArcher environment and requisite files. 4 | ## Test datasets 5 | To test that your environment is properly setup, you can run a quick test with the following command: 6 | ``` 7 | snakemake -d .test/ecoli --cores 1 --use-conda --workflow-profile workflow-profiles/default 8 | ``` 9 | If this runs without errors, you are ready to go! 
10 | ## Using the Dry-run option 11 | Snakemake offers the `--dry-run (-n)` CLI option to perform a dry-run of the workflow to show what jobs would be run. We recommend doing this before executing snpArcher to ensure that the sample sheet was setup correctly, and Snakemake has correctly built the workflow DAG. 12 | ## Local Execution 13 | Once you have setup the requisite configuration files and sample sheet, executing snpArcher on your local machine is as simple as running the Snakemake command with the number of cores you would like to use. For example, to use 8 cores you would run: 14 | ``` 15 | snakemake --cores 8 --use-conda --workflow-profile workflow-profiles/default 16 | ``` 17 | 18 | ### Optional directory setup 19 | To maintain organization across many different projects, you may consider creating a new directory for each project you run using snpArcher. This way, each of your project directories will contain the configuration files used for that run. Below is an example directory structure: 20 | 21 | ``` 22 | . 23 | ├── snpArcher 24 | ├── project_1/ 25 | │ ├── config/ 26 | │ │ ├── config.yaml 27 | │ │ └── samples.csv 28 | │ ├── data 29 | │ └── results 30 | └── project_2/ 31 | ├── config/ 32 | │ ├── config.yaml 33 | │ └── samples.csv 34 | └── data 35 | ``` 36 | 37 | When creating a new directory for an analysis, ensure that you copy the `config` directory from the snpArcher directory to your new directory. 38 | 39 | Then, to run snpArcher on `project_2` from our example, we would execute the command: 40 | ``` 41 | snakemake -s ./snpArcher/workflow/Snakefile -d ./project_2 42 | ``` 43 | 44 | ## Cluster Execution 45 | Snakemake [supports most cluster schedulers](https://snakemake.github.io/snakemake-plugin-catalog/) via executor plugins. Here, we provide documentation for SLURM, however please refer to Snakemake's documentation for further details on using other plugins. 46 | ### SLURM 47 | #### Install plugin 48 | To execute snpArcher on a SLURM cluster, you will need to install the [SLURM executor plugin](https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/slurm.html) into the snpArcher environment. 49 | ```shell 50 | conda activate snparcher 51 | pip install snakemake-executor-plugin-slurm 52 | ``` 53 | #### Profile Setup 54 | To specify resources for the workflow to SLURM, you must use a workflow profile. We have provided a SLURM profile template (`workflow-profiles/slurm/config.yaml`) which you can modify to specify SLURM partitions, memory allocation, etc. Please refer to the [profiles setup section](./setup.md#resources) for more details. Also see the [Snakemake documentation on profiles](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles) for more info. 55 | 56 | An example SLURM profile specifies required and recommended Snakemake options: 57 | ```yaml 58 | executor: slurm 59 | use-conda: True 60 | jobs: 100 # Have up to N jobs submitted at any given time 61 | latency-wait: 20 # Wait N seconds for output files due to latency 62 | retries: 3 # Retry jobs N times. 63 | ``` 64 | 65 | #### Running the workflow 66 | Once you have modified the SLURM profile appropriately, you can run snpArcher with the following command: 67 | ```shell 68 | snakemake --workflow-profile 69 | ``` 70 | Depending on your cluster, you can run this command on the head node and Snakemake will submit jobs to the SLURM queue. You can also submit this command via `srun` or `sbatch`. 
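For example, a minimal `sbatch` submission script for the Snakemake controller job might look like the sketch below. The partition, wall time, and memory values are placeholders to adapt to your cluster, and it assumes the `snparcher` conda environment is already active and that the SLURM profile described above lives at `workflow-profiles/slurm/`:

```shell
#!/bin/bash
#SBATCH --job-name=snparcher
#SBATCH --partition=general      # placeholder partition name
#SBATCH --time=7-00:00:00        # generous wall time; the controller must outlive the jobs it submits
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1

# The controller job is lightweight: it builds the DAG and submits the actual
# work to SLURM via the executor plugin configured in the workflow profile.
snakemake --workflow-profile workflow-profiles/slurm
```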
71 | 72 | ## Cloud Execution 73 | ```{warning} 74 | Google Lifesciences execution is not supported by Snakemake versions 8 and later. Please use Snakemake v7.32.4 if you would like to use this execution mode. 75 | ``` 76 | Like cluster execution, Snakemake [supports a number of cloud providers](https://snakemake.readthedocs.io/en/stable/executing/cloud.html). Here we provide documentation for executing using Snakemake's Google Lifesciences integration. Please refer to Snakemake's documentation for details on using other cloud providers. 77 | ### Google Lifesciences 78 | Snakemake's integration with the Google Lifesciences (GLS) API allows you to easily run snpArcher on the Google Cloud Platform (GCP). Using this execution mode allows you to take advantage of hundreds or thousands of GCP virtual machine instances. Snakemake manages deploying instances, running jobs, and deleting instances with finished jobs. Furthermore, you can use preemptible instances, which are offered at a large cost discount but can only run for a maximum of 24 hours. 79 | 80 | We include profiles for GLS using preemptible instances so that you can get up and running quickly. 81 | #### Google Credential Setup 82 | In order to use the Google Lifesciences execution option, you must first set up your Google Cloud credentials. Please refer [here](https://snakemake.readthedocs.io/en/stable/executor_tutorial/google_lifesciences.html#credentials) for full details. 83 | #### Data setup 84 | To use this execution mode, you must have a Google Storage bucket with your raw data files. This can be achieved by using Google's web interface, or at the command line using [`gsutil`](https://cloud.google.com/storage/docs/gsutil). For example, if we have some data locally like so: 85 | ``` 86 | . 87 | └── data/ 88 | ├── raw_reads/ 89 | │ ├── samp_1_R1.fq.gz 90 | │ ├── samp_1_R2.fq.gz 91 | │ ├── samp_2_R1.fq.gz 92 | │ ├── samp_2_R2.fq.gz 93 | │ ├── samp_3_R1.fq.gz 94 | │ ├── samp_3_R2.fq.gz 95 | │ ├── samp_4_R1.fq.gz 96 | │ ├── samp_4_R2.fq.gz 97 | │ └── ... 98 | └── ref_genome/ 99 | └── genome.fa 100 | ``` 101 | 102 | We can copy this data to our bucket like so: 103 | ``` 104 | gsutil cp -r ./data gs://<your_bucket_name> 105 | ``` 106 | 107 | ```{note} 108 | When using cloud execution, do not include the bucket name in any path fields of the sample sheet, such as fq1, fq2, or refPath. 109 | ``` 110 | ```{note} 111 | If you are using data hosted on NCBI, you do not need to upload those data to your bucket; snpArcher will handle this for you. However, you still need to create a storage bucket to be used for the workflow. 112 | ``` 113 | 114 | Some users may want to store their raw reads in a separate bucket from where the workflow will store files. To do so, you can specify the remote prefix in `config/config.yaml`. 115 | 116 | #### Running the workflow 117 | Once your credentials and data are set up, you can run snpArcher using the included profile `profiles/google_lifesciences/config.yaml`. You should modify this profile to set resources for the workflow such as instance type, threads, preemptible rules, etc. 118 | 119 | To run the workflow, execute the following command: 120 | ``` 121 | snakemake --workflow-profile profiles/google_lifesciences --default-remote-prefix <your_bucket_name> 122 | ``` 123 | 124 | As the workflow runs, Snakemake will print out logging information to the terminal. Please refer [here](https://snakemake.readthedocs.io/en/stable/executor_tutorial/google_lifesciences.html#step-5-debugging) for further details.
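If jobs fail early because input files cannot be found, a common cause is the bucket layout not matching the paths in your sample sheet. A quick sanity check (a sketch only; the bucket name is a placeholder) is to list what actually landed in the bucket:

```
# Recursively list the uploaded data to confirm the paths match the sample sheet
gsutil ls -r gs://<your_bucket_name>/data
```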
125 | -------------------------------------------------------------------------------- /workflow/modules/trackhub/Snakefile: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | 5 | # Get utils. This is not great, but we can move to setup.py and install via pip later if want 6 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve() 7 | if str(utils_path) not in sys.path: 8 | sys.path.append(str(utils_path)) 9 | 10 | import pandas as pd 11 | import snparcher_utils 12 | 13 | configfile: "config/config.yaml" 14 | wildcard_constraints: 15 | window="\d+" 16 | 17 | samples = snparcher_utils.parse_sample_sheet(config) 18 | REFGENOME = samples['refGenome'].unique().tolist() 19 | WINDOWS = [1000, 10000, 100000] 20 | FILE_TYPES = ["Tajima", "SNP-Density", "Pi"] # dont change this unless you add rules to generate more stats. 21 | 22 | rule all: 23 | input: 24 | trackhub_file = expand("results/{refGenome}/trackhub/hub.txt", refGenome=REFGENOME), 25 | trackhub_html = expand("results/{refGenome}/trackhub/index.html", refGenome=REFGENOME), 26 | 27 | rule write_hub_files: 28 | input: 29 | window_stat_files = expand("results/{{refGenome}}/trackhub/{file_type}_{window}.bw", file_type=FILE_TYPES, window=WINDOWS), 30 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", prefix=config['final_prefix']), 31 | callable_sites = "results/{refGenome}/trackhub/non_callable_sites.bb", 32 | allele_freq = "results/{refGenome}/trackhub/allele_freq.bw", 33 | depth = "results/{refGenome}/trackhub/depth.bw", 34 | 35 | output: 36 | trackhub_file = "results/{refGenome}/trackhub/hub.txt", 37 | html = "results/{refGenome}/trackhub/index.html" 38 | params: 39 | refGenome = "{refGenome}", 40 | file_types = FILE_TYPES, 41 | windows = WINDOWS, 42 | email = config["trackhub_email"] 43 | script: 44 | "scripts/write_hub_files.py" 45 | 46 | rule strip_vcf: 47 | """ 48 | Strips vcf of all info/filters to reduce size 49 | """ 50 | input: 51 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", prefix=config['final_prefix']), 52 | output: 53 | vcf = "results/{refGenome}/trackhub/info_stripped_snps.vcf.gz", 54 | tbi = "results/{refGenome}/trackhub/info_stripped_snps.vcf.gz.tbi" 55 | log: 56 | "logs/{refGenome}/trackhub/strip_vcf.log" 57 | conda: 58 | "envs/trackhub.yml" 59 | shell: 60 | """ 61 | bcftools annotate -x INFO,FORMAT/DP,FORMAT/GQ,FORMAT/PL {input.vcf} -O z -o {output.vcf} 62 | tabix -p vcf {output.vcf} 63 | """ 64 | 65 | 66 | rule calc_tajima: 67 | input: 68 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", prefix=config['final_prefix']), 69 | output: 70 | temp("results/{refGenome}/trackhub/{window}.Tajima") 71 | log: 72 | "logs/{refGenome}/trackhub/tajima/{window}.log" 73 | conda: 74 | "envs/trackhub.yml" 75 | shell: 76 | """ 77 | vcftools --gzvcf {input} --TajimaD {wildcards.window} --stdout > {output} 2>> {log} 78 | """ 79 | 80 | rule calc_snpden: 81 | input: 82 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", prefix=config['final_prefix']), 83 | output: 84 | temp("results/{refGenome}/trackhub/{window}.SNP-Density") 85 | log: 86 | "logs/{refGenome}/trackhub/SNP-Density/{window}.log" 87 | conda: 88 | "envs/trackhub.yml" 89 | shell: 90 | """ 91 | vcftools --gzvcf {input} --SNPdensity {wildcards.window} --stdout > {output} 2> {log} 92 | """ 93 | 94 | rule calc_pi: 95 | input: 96 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", 
prefix=config['final_prefix']), 97 | output: 98 | temp( "results/{refGenome}/trackhub/{window}.Pi") 99 | log: 100 | "logs/{refGenome}/trackhub/Pi/{window}.log" 101 | conda: 102 | "envs/trackhub.yml" 103 | shell: 104 | """ 105 | vcftools --gzvcf {input} --window-pi {wildcards.window} --stdout > {output} 2> {log} 106 | """ 107 | 108 | rule chrom_sizes: 109 | input: 110 | "results/{refGenome}/data/genome/{refGenome}.fna.fai" 111 | output: 112 | "results/{refGenome}/trackhub/chrom.sizes" 113 | shell: 114 | "cut -f1,2 {input} > {output}" 115 | 116 | rule bcftools_depth: 117 | input: 118 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", prefix=config['final_prefix']), 119 | chrom_sizes = "results/{refGenome}/trackhub/chrom.sizes" 120 | output: 121 | bg = temp("results/{refGenome}/trackhub/depth.bg"), 122 | bw = "results/{refGenome}/trackhub/depth.bw" 123 | 124 | conda: 125 | "envs/trackhub.yml" 126 | shell: 127 | """ 128 | bcftools query -f '%CHROM\t%POS\t%POS\t%DP\n' {input.vcf} | awk -v OFS='\t' '{{print $1,$2-1,$2,$4}}' > {output.bg} 129 | bedGraphToBigWig {output.bg} {input.chrom_sizes} {output.bw} 130 | """ 131 | 132 | rule vcftools_freq: 133 | input: 134 | vcf = expand("results/{{refGenome}}/{prefix}_clean_snps.vcf.gz", prefix=config['final_prefix']), 135 | chrom_sizes = "results/{refGenome}/trackhub/chrom.sizes" 136 | output: 137 | bg = temp("results/{refGenome}/trackhub/allele_freq.bg"), 138 | bw = "results/{refGenome}/trackhub/allele_freq.bw" 139 | 140 | conda: 141 | "envs/trackhub.yml" 142 | shell: 143 | """ 144 | vcftools --gzvcf {input.vcf} --freq2 --stdout | cut -f 1,2,6 | tail -n +2 | awk -v OFS='\t' '{{print $1,$2-1,$2,$3}}' > {output.bg} 145 | bedGraphToBigWig {output.bg} {input.chrom_sizes} {output.bw} 146 | """ 147 | 148 | rule convert_to_bedgraph: 149 | input: 150 | stat_file = "results/{refGenome}/trackhub/{window}.{file_type}", 151 | chrom_sizes = "results/{refGenome}/trackhub/chrom.sizes" 152 | output: 153 | temp("results/{refGenome}/trackhub/{file_type}_{window}.bg") 154 | script: 155 | "scripts/vcftools_out_to_bg.py" 156 | 157 | rule bedgraph_to_bigwig: 158 | input: 159 | bg = "results/{refGenome}/trackhub/{file_type}_{window}.bg", 160 | chrom_sizes = "results/{refGenome}/trackhub/chrom.sizes" 161 | output: 162 | "results/{refGenome}/trackhub/{file_type}_{window}.bw" 163 | conda: 164 | "envs/trackhub.yml" 165 | shell: 166 | "bedGraphToBigWig {input.bg} {input.chrom_sizes} {output}" 167 | 168 | rule non_callable_sites: 169 | """ 170 | Calculates complement of callable sites to make 'non-callable sites' this makes visualzing on browser easier. 
171 | """ 172 | input: 173 | callable_sites = expand("results/{{refGenome}}/{prefix}_callable_sites.bed", prefix=config['final_prefix']), 174 | chrom_sizes = "results/{refGenome}/trackhub/chrom.sizes", 175 | 176 | output: 177 | bed = temp("results/{refGenome}/trackhub/non_callable_sites.bed"), 178 | bb = "results/{refGenome}/trackhub/non_callable_sites.bb" 179 | conda: 180 | "envs/trackhub.yml" 181 | shadow: 182 | "minimal" 183 | shell: 184 | """ 185 | sort -k1,1V {input.chrom_sizes} > sorted.chrom.sizes 186 | sort -k1,1V -k2,2n {input.callable_sites} > sorted_callable_sites.bed 187 | bedtools complement -i sorted_callable_sites.bed -g sorted.chrom.sizes > {output.bed} 188 | 189 | bedSort {output.bed} bedsort_non_callable_sites.bed 190 | bedToBigBed bedsort_non_callable_sites.bed sorted.chrom.sizes {output.bb} 191 | """ -------------------------------------------------------------------------------- /workflow/rules/bam2vcf_gatk.smk: -------------------------------------------------------------------------------- 1 | localrules: create_db_mapfile 2 | 3 | rule bam2gvcf: 4 | """ 5 | TODO 6 | """ 7 | input: 8 | unpack(get_bams), 9 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 10 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 11 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 12 | 13 | output: 14 | gvcf = "results/{refGenome}/gvcfs/{sample}.g.vcf.gz", 15 | tbi = "results/{refGenome}/gvcfs/{sample}.g.vcf.gz.tbi" 16 | log: 17 | "logs/{refGenome}/gatk_hc/{sample}.txt" 18 | benchmark: 19 | "benchmarks/{refGenome}/gatk_hc/{sample}.txt" 20 | params: 21 | minPrun = config['minP'], 22 | minDang = config['minD'], 23 | ploidy = config['ploidy'] 24 | conda: 25 | "../envs/bam2vcf.yml" 26 | shell: 27 | "gatk HaplotypeCaller " 28 | "--java-options \"-Xmx{resources.mem_mb_reduced}m\" " 29 | "-R {input.ref} " 30 | "-I {input.bam} " 31 | "-O {output.gvcf} " 32 | "-ploidy {params.ploidy} " 33 | "--emit-ref-confidence GVCF --min-pruning {params.minPrun} --min-dangling-branch-length {params.minDang} &> {log}" 34 | 35 | rule create_db_mapfile: 36 | """ 37 | TODO 38 | """ 39 | input: 40 | get_input_for_mapfile 41 | output: 42 | db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt" 43 | run: 44 | with open(output.db_mapfile, "w") as f: 45 | for file_path in input: 46 | sample_name = os.path.basename(file_path).replace(".g.vcf.gz", "") 47 | print(sample_name, file_path, sep="\t", file=f) 48 | 49 | rule prepare_db_intervals: 50 | """GenomicsDBImport needs list of intervals to operate on so this rule writes that file""" 51 | input: 52 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 53 | output: 54 | intervals = "results/{refGenome}/genomics_db_import/db_intervals.list" 55 | run: 56 | with open(output.intervals, "w") as out: 57 | with open(input.fai, "r") as f: 58 | for line in f: 59 | line = line.strip().split() 60 | chrom, end = line[0], line[1] 61 | print(f"{chrom}:1-{end}", file=out) 62 | 63 | rule gvcf2DB: 64 | """ 65 | todo 66 | """ 67 | input: 68 | unpack(get_gvcfs_db), 69 | db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt", 70 | intervals = "results/{refGenome}/genomics_db_import/db_intervals.list" 71 | output: 72 | db = temp(directory("results/{refGenome}/genomics_db_import/DB")), 73 | tar = temp("results/{refGenome}/genomics_db_import/DB.tar"), 74 | log: 75 | "logs/{refGenome}/gatk_db_import.txt" 76 | benchmark: 77 | 
"benchmarks/{refGenome}/gatk_db_import.txt" 78 | conda: 79 | "../envs/bam2vcf.yml" 80 | shell: 81 | # NOTE: reader-threads > 1 useless if you specify multiple intervals 82 | # a forum suggested TILEDB_DISABLE_FILE_LOCKING=1 to remedy sluggish performance 83 | """ 84 | export TILEDB_DISABLE_FILE_LOCKING=1 85 | gatk GenomicsDBImport \ 86 | --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' \ 87 | --genomicsdb-shared-posixfs-optimizations true \ 88 | --batch-size 25 \ 89 | --genomicsdb-workspace-path {output.db} \ 90 | -L {input.intervals} \ 91 | --merge-input-intervals \ 92 | --tmp-dir {resources.tmpdir} \ 93 | --sample-name-map {input.db_mapfile} &> {log} 94 | 95 | tar -cf {output.tar} {output.db} 96 | """ 97 | 98 | rule DB2vcf: 99 | """ 100 | This rule uses the genomic databases from the previous step (gvcf2DB) to create VCF files, one per list file. Thus, lists 101 | are still scattered. 102 | """ 103 | input: 104 | db = "results/{refGenome}/genomics_db_import/DB.tar", 105 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 106 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 107 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 108 | output: 109 | vcf = temp("results/{refGenome}/vcfs/raw.vcf.gz"), 110 | vcfidx = temp("results/{refGenome}/vcfs/raw.vcf.gz.tbi"), 111 | params: 112 | het = config['het_prior'], 113 | db = lambda wc, input: input.db[:-4] 114 | log: 115 | "logs/{refGenome}/gatk_genotype_gvcfs.txt" 116 | benchmark: 117 | "benchmarks/{refGenome}/gatk_genotype_gvcfs.txt" 118 | conda: 119 | "../envs/bam2vcf.yml" 120 | shell: 121 | """ 122 | tar -xf {input.db} 123 | gatk GenotypeGVCFs \ 124 | --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' \ 125 | -R {input.ref} \ 126 | --heterozygosity {params.het} \ 127 | --genomicsdb-shared-posixfs-optimizations true \ 128 | -V gendb://{params.db} \ 129 | -O {output.vcf} \ 130 | --tmp-dir {resources.tmpdir} &> {log} 131 | """ 132 | 133 | rule filterVcfs: 134 | """ 135 | This rule filters all of the VCFs 136 | """ 137 | input: 138 | vcf = "results/{refGenome}/vcfs/raw.vcf.gz", 139 | vcfidx = "results/{refGenome}/vcfs/raw.vcf.gz.tbi", 140 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 141 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 142 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 143 | output: 144 | vcf = temp("results/{refGenome}/vcfs/filtered.vcf.gz"), 145 | vcfidx = temp("results/{refGenome}/vcfs/filtered.vcf.gz.tbi") 146 | conda: 147 | "../envs/bam2vcf.yml" 148 | log: 149 | "logs/{refGenome}/gatk_filter.txt" 150 | benchmark: 151 | "benchmarks/{refGenome}/gatk_filter.txt" 152 | shell: 153 | "gatk VariantFiltration " 154 | "-R {input.ref} " 155 | "-V {input.vcf} " 156 | "--output {output.vcf} " 157 | "--filter-name \"RPRS_filter\" " 158 | "--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" " 159 | "--filter-name \"FS_SOR_filter\" " 160 | "--filter-expression \"(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') && SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') && SOR > 10.0)))\" " 161 | "--filter-name \"MQ_filter\" " 162 | "--filter-expression \"vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || 
(vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))\" " 163 | "--filter-name \"QUAL_filter\" " 164 | "--filter-expression \"QUAL < 30.0\" " 165 | "--create-output-variant-index " 166 | "--invalidate-previous-filters true &> {log}" 167 | 168 | rule sort_gatherVcfs: 169 | input: 170 | vcf = "results/{refGenome}/vcfs/filtered.vcf.gz", 171 | vcfidx = "results/{refGenome}/vcfs/filtered.vcf.gz.tbi" 172 | output: 173 | vcfFinal = "results/{refGenome}/{prefix}_raw.vcf.gz", 174 | vcfFinalidx = "results/{refGenome}/{prefix}_raw.vcf.gz.tbi" 175 | conda: 176 | "../envs/bcftools.yml" 177 | log: 178 | "logs/{refGenome}/sort_gather_vcfs/{prefix}_log.txt" 179 | benchmark: 180 | "benchmarks/{refGenome}/sort_gather_vcfs/{prefix}_benchmark.txt" 181 | shell: 182 | """ 183 | bcftools sort -Oz -o {output.vcfFinal} {input.vcf} 2>> {log} 184 | tabix -p vcf {output.vcfFinal} 2>> {log} 185 | """ -------------------------------------------------------------------------------- /workflow/rules/sentieon.smk: -------------------------------------------------------------------------------- 1 | rule sentieon_map: 2 | input: 3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 4 | r1 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_1.fastq.gz", 5 | r2 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_2.fastq.gz", 6 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]) 7 | output: 8 | bam = temp("results/{refGenome}/bams/preMerge/{sample}/{run}.bam"), 9 | bai = temp("results/{refGenome}/bams/preMerge/{sample}/{run}.bam.bai"), 10 | params: 11 | rg = get_read_group, 12 | lic = config['sentieon_lic'] 13 | conda: 14 | "../envs/sentieon.yml" 15 | log: 16 | "logs/{refGenome}/sentieon_map/{sample}/{run}.txt" 17 | benchmark: 18 | "benchmarks/{refGenome}/sentieon_map/{sample}/{run}.txt" 19 | shell: 20 | """ 21 | export MALLOC_CONF=lg_dirty_mult:-1 22 | export SENTIEON_LICENSE={params.lic} 23 | sentieon bwa mem -M -R {params.rg} -t {threads} -K 10000000 {input.ref} {input.r1} {input.r2} | sentieon util sort --bam_compression 1 -r {input.ref} -o {output.bam} -t {threads} --sam2bam -i - 24 | samtools index {output.bam} {output.bai} 25 | """ 26 | rule merge_bams: 27 | input: 28 | merge_bams_input 29 | output: 30 | bam = temp("results/{refGenome}/bams/postMerge/{sample}.bam"), 31 | bai = temp("results/{refGenome}/bams/postMerge/{sample}.bam.bai") 32 | conda: 33 | "../envs/fastq2bam.yml" 34 | log: 35 | "logs/{refGenome}/merge_bams/{sample}.txt" 36 | benchmark: 37 | "benchmarks/{refGenome}/merge_bams/{sample}.txt" 38 | shell: 39 | "samtools merge {output.bam} {input} && samtools index {output.bam}" 40 | 41 | rule sentieon_dedup: 42 | input: 43 | unpack(dedup_input), 44 | output: 45 | dedupBam = "results/{refGenome}/bams/{sample}_final.bam", 46 | dedupBai = "results/{refGenome}/bams/{sample}_final.bam.bai", 47 | score = temp("results/{refGenome}/summary_stats/{sample}/sentieon_dedup_score.txt"), 48 | metrics = temp("results/{refGenome}/summary_stats/{sample}/sentieon_dedup_metrics.txt") 49 | params: 50 | lic = config['sentieon_lic'] 51 | conda: 52 | "../envs/sentieon.yml" 53 | log: 54 | "logs/{refGenome}/sentieon_dedup/{sample}.txt" 55 | benchmark: 56 | "benchmarks/{refGenome}/sentieon_dedup/{sample}.txt" 57 | shell: 58 | """ 59 | export SENTIEON_LICENSE={params.lic} 60 | sentieon driver -t {threads} -i {input.bam} --algo LocusCollector --fun score_info {output.score} 61 | sentieon driver -t {threads} -i {input.bam} --algo Dedup --score_info 
{output.score} --metrics {output.metrics} --bam_compression 1 {output.dedupBam} 62 | """ 63 | 64 | rule sentieon_haplotyper: 65 | input: 66 | unpack(get_bams), 67 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 68 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 69 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 70 | params: 71 | lic = config['sentieon_lic'], 72 | ploidy = config['ploidy'] 73 | output: 74 | gvcf = "results/{refGenome}/gvcfs/{sample}.g.vcf.gz", 75 | gvcf_idx = "results/{refGenome}/gvcfs/{sample}.g.vcf.gz.tbi", 76 | conda: 77 | "../envs/sentieon.yml" 78 | log: 79 | "logs/{refGenome}/sentieon_haplotyper/{sample}.txt" 80 | benchmark: 81 | "benchmarks/{refGenome}/sentieon_haplotyper/{sample}.txt" 82 | shell: 83 | """ 84 | export SENTIEON_LICENSE={params.lic} 85 | sentieon driver -r {input.ref} -t {threads} -i {input.bam} --algo Haplotyper --genotype_model multinomial --emit_mode gvcf --emit_conf 30 --call_conf 30 {output.gvcf} --ploidy {params.ploidy} 2> {log} 86 | """ 87 | 88 | rule sentieon_combine_gvcf: 89 | input: 90 | unpack(sentieon_combine_gvcf_input), 91 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 92 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 93 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict" 94 | output: 95 | vcf = temp("results/{refGenome}/vcfs/raw.vcf.gz"), 96 | tbi = temp("results/{refGenome}/vcfs/raw.vcf.gz.tbi") 97 | params: 98 | glist = lambda wc, input: " ".join(["-v " + gvcf for gvcf in input['gvcfs']]), 99 | lic = config['sentieon_lic'] 100 | conda: 101 | "../envs/sentieon.yml" 102 | log: 103 | "logs/{refGenome}/sentieon_combine_gvcf/log.txt" 104 | benchmark: 105 | "benchmarks/{refGenome}/sentieon_combine_gvcf/benchmark.txt" 106 | shell: 107 | """ 108 | export SENTIEON_LICENSE={params.lic} 109 | sentieon driver -r {input.ref} -t {threads} --algo GVCFtyper --emit_mode VARIANT {output.vcf} {params.glist} 2> {log} 110 | """ 111 | 112 | rule filter_vcf: 113 | """ 114 | This rule applies filters to the raw vcf using GNU Parallel. 
115 | """ 116 | input: 117 | vcf = "results/{refGenome}/vcfs/raw.vcf.gz", 118 | tbi = "results/{refGenome}/vcfs/raw.vcf.gz.tbi", 119 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 120 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 121 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict" 122 | output: 123 | vcf = "results/{refGenome}/{prefix}_raw.vcf.gz", 124 | tbi = "results/{refGenome}/{prefix}_raw.vcf.gz.tbi" 125 | conda: 126 | "../envs/bam2vcf.yml" 127 | log: 128 | "logs/{refGenome}/sentieon_filter_vcfs/{prefix}_log.txt" 129 | shadow: "minimal" 130 | benchmark: 131 | "benchmarks/{refGenome}/sentieon_filter_vcfs/{prefix}_benchmark.txt" 132 | shell: 133 | """ 134 | # get the contig names from the .fai index 135 | contigs=$(cut -f1 {input.indexes[5]}) 136 | 137 | # create a function that will be passed to gnu parallel 138 | filter_contig() {{ 139 | contig=$1 140 | echo $contig 141 | 142 | gatk --java-options "-Xmx4g" VariantFiltration \ 143 | -R {input.ref} \ 144 | -L ${{contig}} \ 145 | -V {input.vcf} \ 146 | --output {wildcards.refGenome}_{wildcards.prefix}_filter_${{contig}}.vcf.gz \ 147 | --filter-name "RPRS_filter" \ 148 | --filter-expression "(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)" \ 149 | --filter-name "FS_SOR_filter" \ 150 | --filter-expression "(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') && SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') && SOR > 10.0)))" \ 151 | --filter-name "MQ_filter" \ 152 | --filter-expression "vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))" \ 153 | --filter-name "QUAL_filter" \ 154 | --filter-expression "QUAL < 30.0" \ 155 | --invalidate-previous-filters true 156 | }} 157 | 158 | export -f filter_contig 159 | 160 | # pass each contig to gnu parallel 161 | parallel -j {threads} filter_contig ::: ${{contigs}} 2> {log} 162 | 163 | bcftools concat {wildcards.refGenome}_{wildcards.prefix}_filter_*.vcf.gz --threads {threads} -Oz -o {output.vcf} 2>> {log} 164 | tabix -p vcf {output.vcf} 2>> {log} 165 | """ 166 | -------------------------------------------------------------------------------- /workflow/rules/bam2vcf_gatk_intervals.smk: -------------------------------------------------------------------------------- 1 | localrules: create_db_mapfile 2 | 3 | rule bam2gvcf: 4 | """ 5 | TODO 6 | """ 7 | input: 8 | unpack(get_bams), 9 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 10 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]), 11 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 12 | l = "results/{refGenome}/intervals/gvcf_intervals/{l}-scattered.interval_list", 13 | 14 | output: 15 | gvcf = "results/{refGenome}/interval_gvcfs/{sample}/{l}.raw.g.vcf.gz", 16 | gvcf_idx = "results/{refGenome}/interval_gvcfs/{sample}/{l}.raw.g.vcf.gz.tbi" 17 | log: 18 | "logs/{refGenome}/gatk_hc/{sample}/{l}.txt" 19 | benchmark: 20 | "benchmarks/{refGenome}/gatk_hc/{sample}_{l}.txt" 21 | params: 22 | minPrun = config['minP'], 23 | minDang = config['minD'], 24 | ploidy = config['ploidy'], 25 | conda: 26 | "../envs/bam2vcf.yml" 27 | shell: 28 | """ 
29 | gatk HaplotypeCaller \ 30 | --java-options -Xmx{resources.mem_mb_reduced}m \ 31 | -R {input.ref} \ 32 | -I {input.bam} \ 33 | -O {output.gvcf} \ 34 | -L {input.l} \ 35 | -ploidy {params.ploidy} \ 36 | --emit-ref-confidence GVCF --min-pruning {params.minPrun} --min-dangling-branch-length {params.minDang} &> {log} 37 | """ 38 | 39 | rule concat_gvcfs: 40 | input: 41 | gvcfs = get_interval_gvcfs, 42 | tbis = get_interval_gvcfs_idx 43 | output: 44 | gvcf = temp("results/{refGenome}/gvcfs/{sample}.g.vcf.gz"), 45 | tbi = temp("results/{refGenome}/gvcfs/{sample}.g.vcf.gz.tbi") 46 | log: 47 | "logs/{refGenome}/concat_gvcfs/{sample}.txt" 48 | benchmark: 49 | "benchmarks/{refGenome}/concat_gvcfs/{sample}.txt" 50 | resources: 51 | tmpdir = get_big_temp 52 | conda: 53 | "../envs/bcftools.yml" 54 | shell: 55 | """ 56 | bcftools concat -D -a -Ou {input.gvcfs} 2> {log} | bcftools sort -T {resources.tmpdir} -Oz -o {output.gvcf} - 2>> {log} 57 | tabix -p vcf {output.gvcf} 2>> {log} 58 | """ 59 | 60 | rule bcftools_norm: 61 | input: 62 | gvcf = "results/{refGenome}/gvcfs/{sample}.g.vcf.gz", 63 | output: 64 | gvcf = "results/{refGenome}/gvcfs_norm/{sample}.g.vcf.gz", 65 | tbi = "results/{refGenome}/gvcfs_norm/{sample}.g.vcf.gz.tbi" 66 | log: 67 | "logs/{refGenome}/norm_gvcf/{sample}.txt" 68 | benchmark: 69 | "benchmarks/{refGenome}/norm_gvcf/{sample}.txt" 70 | resources: 71 | tmpdir = get_big_temp 72 | conda: 73 | "../envs/bcftools.yml" 74 | shell: 75 | """ 76 | bcftools norm -m +any -Oz -o {output.gvcf} {input.gvcf} 2> {log} 77 | tabix -p vcf {output.gvcf} 2>> {log} 78 | """ 79 | 80 | rule create_db_mapfile: 81 | """ 82 | TODO 83 | """ 84 | input: 85 | get_input_for_mapfile 86 | output: 87 | db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt" 88 | run: 89 | with open(output.db_mapfile, "w") as f: 90 | for file_path in input: 91 | sample_name = os.path.basename(file_path).replace(".g.vcf.gz", "") 92 | print(sample_name, file_path, sep="\t", file=f) 93 | 94 | rule gvcf2DB: 95 | """ 96 | Create GenomicsDB. 97 | """ 98 | input: 99 | unpack(get_gvcfs_db), 100 | l = "results/{refGenome}/intervals/db_intervals/{l}-scattered.interval_list", 101 | db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt" 102 | output: 103 | db = temp(directory("results/{refGenome}/genomics_db_import/DB_L{l}")), 104 | tar = temp("results/{refGenome}/genomics_db_import/DB_L{l}.tar"), 105 | log: 106 | "logs/{refGenome}/gatk_db_import/{l}.txt" 107 | benchmark: 108 | "benchmarks/{refGenome}/gatk_db_import/{l}.txt" 109 | resources: 110 | tmpdir = get_big_temp 111 | conda: 112 | "../envs/bam2vcf.yml" 113 | shell: 114 | # NOTE: reader-threads > 1 useless if you specify multiple intervals 115 | # a forum suggested TILEDB_DISABLE_FILE_LOCKING=1 to remedy sluggish performance 116 | """ 117 | export TILEDB_DISABLE_FILE_LOCKING=1 118 | gatk GenomicsDBImport \ 119 | --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' \ 120 | --genomicsdb-shared-posixfs-optimizations true \ 121 | --batch-size 25 \ 122 | --genomicsdb-workspace-path {output.db} \ 123 | --merge-input-intervals \ 124 | -L {input.l} \ 125 | --tmp-dir {resources.tmpdir} \ 126 | --sample-name-map {input.db_mapfile} &> {log} 127 | 128 | tar -cf {output.tar} {output.db} 129 | """ 130 | 131 | rule DB2vcf: 132 | """ 133 | This rule uses the genomic databases from the previous step (gvcf2DB) to create VCF files, one per list file. Thus, lists 134 | are still scattered. 
135 | """ 136 | input: 137 | db = "results/{refGenome}/genomics_db_import/DB_L{l}.tar", 138 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 139 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 140 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 141 | output: 142 | vcf = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz"), 143 | vcfidx = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz.tbi"), 144 | params: 145 | het = config['het_prior'], 146 | db = lambda wc, input: input.db[:-4] 147 | resources: 148 | tmpdir = get_big_temp 149 | log: 150 | "logs/{refGenome}/gatk_genotype_gvcfs/{l}.txt" 151 | benchmark: 152 | "benchmarks/{refGenome}/gatk_genotype_gvcfs/{l}.txt" 153 | conda: 154 | "../envs/bam2vcf.yml" 155 | shell: 156 | """ 157 | tar -xf {input.db} 158 | gatk GenotypeGVCFs \ 159 | --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' \ 160 | -R {input.ref} \ 161 | --heterozygosity {params.het} \ 162 | --genomicsdb-shared-posixfs-optimizations true \ 163 | -V gendb://{params.db} \ 164 | -O {output.vcf} \ 165 | --tmp-dir {resources.tmpdir} &> {log} 166 | """ 167 | 168 | rule filterVcfs: 169 | """ 170 | This rule filters all of the VCFs 171 | """ 172 | input: 173 | vcf = "results/{refGenome}/vcfs/intervals/L{l}.vcf.gz", 174 | vcfidx = "results/{refGenome}/vcfs/intervals/L{l}.vcf.gz.tbi", 175 | ref = "results/{refGenome}/data/genome/{refGenome}.fna", 176 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 177 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict", 178 | output: 179 | vcf = temp("results/{refGenome}/vcfs/intervals/filtered_L{l}.vcf.gz"), 180 | vcfidx = temp("results/{refGenome}/vcfs/intervals/filtered_L{l}.vcf.gz.tbi") 181 | conda: 182 | "../envs/bam2vcf.yml" 183 | log: 184 | "logs/{refGenome}/gatk_filter/{l}.txt" 185 | benchmark: 186 | "benchmarks/{refGenome}/gatk_filter/{l}.txt" 187 | shell: 188 | "gatk VariantFiltration " 189 | "-R {input.ref} " 190 | "-V {input.vcf} " 191 | "--output {output.vcf} " 192 | "--filter-name \"RPRS_filter\" " 193 | "--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" " 194 | "--filter-name \"FS_SOR_filter\" " 195 | "--filter-expression \"(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') && SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') && SOR > 10.0)))\" " 196 | "--filter-name \"MQ_filter\" " 197 | "--filter-expression \"vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))\" " 198 | "--filter-name \"QUAL_filter\" " 199 | "--filter-expression \"QUAL < 30.0\" " 200 | "--create-output-variant-index " 201 | "--invalidate-previous-filters true &> {log}" 202 | 203 | rule sort_gatherVcfs: 204 | input: 205 | vcfs = get_interval_vcfs, 206 | tbis = get_interval_vcfs_idx 207 | output: 208 | vcfFinal = "results/{refGenome}/{prefix}_raw.vcf.gz", 209 | vcfFinalidx = "results/{refGenome}/{prefix}_raw.vcf.gz.tbi" 210 | conda: 211 | "../envs/bcftools.yml" 212 | log: 213 | "logs/{refGenome}/sort_gather_vcfs/{prefix}_log.txt" 214 | benchmark: 215 | "benchmarks/{refGenome}/sort_gather_vcfs/{prefix}_benchmark.txt" 216 | resources: 217 | tmpdir = get_big_temp 218 | shell: 219 | """ 220 | bcftools concat -D -a -Ou {input.vcfs} 2> {log}| 
bcftools sort -T {resources.tmpdir} -Oz -o {output.vcfFinal} - 2>> {log} 221 | tabix -p vcf {output.vcfFinal} 2>> {log} 222 | """ 223 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | On this page you will find an example project scenario and how to setup and run it using snpArcher. 3 | 4 | In this example, we have 10 resequenced individuals we would like to generate variant calls for. We will cover creating the sample sheet, selecting config options, and running the workflow. 5 | ## Directory structure 6 | First, let's setup our directories as suggested in our [executing](./executing.md#optional-directory-setup) instructions. Let's assume we are working in a directory called `workdir/`, and the snpArcher repository has already been cloned there. We have also already created the `snparcher` conda env as instructed in the [setup docs](./setup.md#environment-setup). 7 | 8 | 1. Let's create a directory to organize this project and future ones, call it `projects`. Then, create a new directory for this project, we'll call it `secretarybird_reseq`. 9 | ``` 10 | . 11 | ├── projects 12 | │   └── secretarybird_reseq 13 | └── snpArcher 14 |    └── ... 15 | ``` 16 | ```{note} 17 | Not all files and directories are shown, only relevant ones. 18 | ``` 19 | 2. Copy the snpArcher config directory `snpArcher/config` to `projects/secretarybird_reseq`: 20 | ``` 21 | . 22 | ├── projects 23 | │   └── secretarybird_reseq 24 | │   └── config 25 | │   └── config.yaml 26 | └── snpArcher 27 |    └── ... 28 | ``` 29 | 30 | 3. Assume we already have all our sequence data and reference genome on our system, stored in a different location `/storage/data`. We do not need to move the raw data to our project directory. 31 | ```{note} 32 | We'll cover the cases using SRA data and refSeq genomes later on in this example. 33 | ``` 34 | ## Sample sheet setup 35 | Now we need to setup our sample sheet to inform snpArcher of our samples and their metadata. You can use any editor to create the sheet, as long as it is a CSV file. We will save the sample sheet in our project's config directory: `projects/secretarybird_reseq/samples.csv`. Below is the final sample sheet that we will use going forward, with explanations of each column following. 36 | 37 | For a more comprehensive explanation of the sample sheet, please refer to [here](./setup.md#creating-a-sample-sheet) for more details. 
38 | 39 | 40 | ### Final sample sheet 41 | ``` 42 | BioSample,LibraryName,Run,fq1,fq2,lat,long 43 | bird_1,bird_1_lib,1,/storage/data/bird_1_R1.fq.gz,/storage/data/bird_1_R2.fq.gz,-8.758119,-36.280061 44 | bird_2,bird_2_lib,2,/storage/data/bird_2_R1.fq.gz,/storage/data/bird_2_R2.fq.gz,-72.336165,35.751903 45 | bird_3,bird_3_lib,3,/storage/data/bird_3_R1.fq.gz,/storage/data/bird_3_R2.fq.gz,-11.874137,-5.382251 46 | bird_4,bird_4_lib,4,/storage/data/bird_4_R1.fq.gz,/storage/data/bird_4_R2.fq.gz,-73.235723,-145.261219 47 | bird_5,bird_5_lib,5,/storage/data/bird_5_R1.fq.gz,/storage/data/bird_5_R2.fq.gz,88.08701,-52.658705 48 | bird_6,bird_6_lib,6,/storage/data/bird_6_R1.fq.gz,/storage/data/bird_6_R2.fq.gz,69.640536,-12.971862 49 | bird_7,bird_7_lib,7,/storage/data/bird_7_R1.fq.gz,/storage/data/bird_7_R2.fq.gz,18.608941,-100.485774 50 | bird_8,bird_8_lib,8,/storage/data/bird_8_R1.fq.gz,/storage/data/bird_8_R2.fq.gz,-36.570632,-102.38721 51 | bird_9,bird_9_lib,9,/storage/data/bird_9_R1.fq.gz,/storage/data/bird_9_R2.fq.gz,-88.592265,157.406505 52 | bird_10,bird_10_lib,10,/storage/data/bird_10_R1.fq.gz,/storage/data/bird_10_R2.fq.gz,40.106437,-58.649016 53 | ``` 54 | ### Description of Columns 55 | 1. **BioSample**: This is the name for the sample. 56 | 2. **LibraryName**: Identifier for the sample's sequencing library. This is especially important if you have samples that were sequenced multiple times across multiple lanes, which is not the case in this example. See [here](./setup.md#handling-samples-with-more-than-one-pair-of-reads) for more details. 57 | 3. **Run**: If we were using reads from the SRA, this is where the sample's SRR accession would go. However, since we have local data, this just has to be a unique value. 58 | 4. **fq1**: Path to the first read file of the pair. Absolute paths are recommended. If we were using SRA data, this column should be omitted. 59 | 5. **fq2**: Path to the second read file of the pair. Same note as fq1. 60 | 6. **lat**: Decimal latitude for the sample, used to generate the map in the QC module output. 61 | 7. **long**: Decimal longitude for the sample, used to generate the map in the QC module output. 62 | 63 | ```{note} 64 | If your project has multiple genomes, you can add the refPath and refGenome columns. 65 | ``` 66 | 67 | ## Config file setup 68 | Now that we've created our sample sheet, we need to edit the config file we copied earlier: `projects/secretarybird_reseq/config.yaml`. This file controls the main options for snpArcher's outputs. Refer to the [setup section](./setup.md#configuring-snparcher) for more details. 69 | 70 | In our example we are using all of the default options. This will configure snpArcher to perform variant calling using GATK with the scatter-by-intervals approach. Also, we have set our reference genome name and path since we want to use the same genome for all samples in our sample sheet. 71 | 72 | ``` 73 | samples: "config/samples.csv" # path to the sample metadata CSV 74 | final_prefix: "" # prefix for final output files 75 | intervals: True #Set to True if you want to perform variant calling using interval approach. 76 | sentieon: False #set to True if you want to use sentieon, False if you want GATK 77 | sentieon_lic: "" #set to path of sentieon license 78 | remote_reads: False # Set True if reads are in a location separate from --default-remote-prefix.
79 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty 80 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only) 81 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module. 82 | trackhub_email: "hi@email.com" 83 | ############################## 84 | # Variables you *might* need to change 85 | ############################## 86 | 87 | # Set reference genome here if you would like to use the same reference genome for all samples in sample sheet. See docs for more info. 88 | refGenome: "bird_genome" # Name for reference genome 89 | refPath: "/storage/data/bird.fa.gz" 90 | ``` 91 | 92 | ## Profile setup 93 | Snakemake uses profile YAML files to specify commonly used command line arguments, so you don't have to remember all of the arguments you need. Read more about profiles [here](https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles). To specify a profile, you can use the `--workflow-profile` option when running Snakemake. 94 | First, copy the `workflow-profiles` directory from snpArcher into your project directory: 95 | ``` 96 | cp -r snpArcher/workflow-profiles projects/secretarybird_reseq 97 | ``` 98 | 99 | The profile also enables you to specify the compute resources any of snpArcher's rules can use. This is done via the YAML keys `default-resources`, `set-resources`, and `set-threads`. `default-resources` will apply to all rules, and `set-resources` can be applied to individual rules, overriding what the default was set to. There is no way to set a default thread value. 100 | 101 | First, we will specify how many threads each rule can use. This is the same whether you use the default or the SLURM profile. Both profiles come with reasonable default thread values, but you may need to adjust based on your system or cluster. 102 | 103 | Let's say we wanted the alignment step (bwa mem) to use more threads: 104 | ``` 105 | # ... 106 | set-threads: 107 | bwa_map: 16 # Changed from 8 to 16. 108 | # ... 109 | ``` 110 | Next, we will specify memory and other resources. This step only applies if you are running on a SLURM cluster. 111 | 112 | In our example cluster, we have two compute partitions, "short" and "long". So we want to put long-running jobs on the "long" partition, and the rest on "short". Additionally, the "short" partition has a time limit of 1 hour and "long" 10 hours, so we will specify that. 113 | 114 | First, let's specify the default resources: 115 | ``` 116 | default-resources: 117 | mem_mb: attempt * 2000 118 | mem_mb_reduced: (attempt * 2000) * 0.9 # Mem allocated to java for GATK rules (tries to prevent OOM errors) 119 | slurm_partition: "short" # This line was changed 120 | slurm_account: # Same as sbatch -A. Not all clusters use this. 121 | runtime: 60 # In minutes 122 | ``` 123 | Then, let's modify the specific resources for the GATK HaplotypeCaller step: 124 | ``` 125 | set-resources: 126 | # ... other rules 127 | bam2gvcf: # HaplotypeCaller <--- This line was uncommented 128 | # mem_mb: attempt * 2000 129 | # mem_mb_reduced: (attempt * 2000) * 0.9 # Mem allocated to java (tries to prevent OOM errors) 130 | slurm_partition: "long" # This line was changed 131 | runtime: 600 # This line was changed 132 | ``` 133 | 134 | ## Running the workflow 135 | We are now ready to run the workflow!
From our working directory we can run the command: 136 | ``` 137 | snakemake -s snpArcher/workflow/Snakefile -d projects/secretarybird_reseq --workflow-profile projects/secretarybird_reseq/workflow-profiles/default 138 | ``` 139 | This instructs Snakemake to use snpArcher's workflow file and to run in the project directory we set up, using the config and sample sheet we created there. 140 | 141 | If we were on a SLURM cluster, we would add `--executor slurm` to our command: 142 | ``` 143 | snakemake --executor slurm -s snpArcher/workflow/Snakefile -d projects/secretarybird_reseq --workflow-profile projects/secretarybird_reseq/workflow-profiles/default 144 | ``` -------------------------------------------------------------------------------- /workflow/modules/qc/Snakefile: -------------------------------------------------------------------------------- 1 | configfile: "config/config.yaml" 2 | include: "common.smk" 3 | 4 | 5 | samples = snparcher_utils.parse_sample_sheet(config) 6 | REFGENOME = samples['refGenome'].unique().tolist() 7 | 8 | rule all: 9 | input: 10 | expand("results/{refGenome}/QC/{prefix}_qc.html", refGenome=REFGENOME, prefix=config['final_prefix']) 11 | 12 | rule check_fai: 13 | """ 14 | Checks the fai file for a numeric first column; plink and the rest of the workflow are not run if all contig names are numeric. 15 | """ 16 | input: 17 | vcf = "results/{refGenome}/{prefix}_raw.vcf.gz", 18 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 19 | output: 20 | faiResult = "results/{refGenome}/QC/{prefix}_fai_tmp.txt" 21 | run: 22 | check_contig_names(input.fai, output.faiResult) 23 | 24 | rule vcftools_individuals: 25 | input: 26 | vcf = "results/{refGenome}/{prefix}_raw.vcf.gz" 27 | output: 28 | depth = "results/{refGenome}/QC/{prefix}.idepth", 29 | miss = "results/{refGenome}/QC/{prefix}.imiss", 30 | samps = "results/{refGenome}/QC/{prefix}.samps.txt", 31 | summ = "results/{refGenome}/QC/{prefix}.FILTER.summary", 32 | het = "results/{refGenome}/QC/{prefix}.het" 33 | conda: 34 | "envs/vcftools_individuals.yml" 35 | params: 36 | prefix = lambda wc, input: os.path.join(input.vcf.rsplit("/", 1)[0], "QC", wc.prefix), 37 | min_depth = config["min_depth"] 38 | log: 39 | "logs/{refGenome}/QC/vcftools_individuals/{prefix}.txt" 40 | shell: 41 | """ 42 | vcftools --gzvcf {input.vcf} --FILTER-summary --out {params.prefix} &> {log} 43 | vcftools --gzvcf {input.vcf} --out {params.prefix} --depth &>> {log} 44 | vcftools --gzvcf {input.vcf} --out {params.prefix} --het &>> {log} 45 | vcftools --gzvcf {input.vcf} --out {params.prefix} --missing-indv &>> {log} 46 | tail -n +2 {output.depth} | awk '$3>{params.min_depth} {{print $1}}'> {output.samps} 2>> {log} 47 | """ 48 | 49 | rule subsample_snps: 50 | input: 51 | vcf = "results/{refGenome}/{prefix}_raw.vcf.gz", 52 | samps = "results/{refGenome}/QC/{prefix}.samps.txt", 53 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 54 | sumstats = "results/{refGenome}/summary_stats/{prefix}_bam_sumstats.txt" 55 | output: 56 | filtered = temp("results/{refGenome}/QC/{prefix}_filtered.vcf.gz"), 57 | filtered_idx = temp("results/{refGenome}/QC/{prefix}_filtered.vcf.gz.csi"), 58 | pruned = "results/{refGenome}/QC/{prefix}.pruned.vcf.gz", 59 | snpqc = "results/{refGenome}/QC/{prefix}_snpqc.txt", 60 | fai = "results/{refGenome}/QC/{prefix}.fna.fai", 61 | sumstats = "results/{refGenome}/QC/{prefix}_bam_sumstats.txt" 62 | conda: 63 | "envs/subsample_snps.yml" 64 | params: 65 | chr_ex = config["scaffolds_to_exclude"] 66 | log: 67 |
"logs/{refGenome}/QC/subsample_snps/{prefix}.txt" 68 | shell: 69 | """ 70 | ##first remove filtered sites and retain only biallelic SNPs 71 | ##Also remove sites with MAF < 0.01 and those with > 75% missing data 72 | if [ -z "{params.chr_ex}" ] 73 | then 74 | bcftools view -S {input.samps} -v snps -m2 -M2 -f .,PASS -e 'AF==1 | AF==0 | AF<0.01 | ALT="*" | F_MISSING > 0.75 | TYPE~"indel" | ref="N"' {input.vcf} -O z -o {output.filtered} &> {log} 75 | else 76 | bcftools view -S {input.samps} -t ^{params.chr_ex} -v snps -m2 -M2 -f .,PASS -e 'AF==1 | AF==0 | AF<0.01 | ALT="*" | F_MISSING > 0.75 | TYPE~"indel" | ref="N"' {input.vcf} -O z -o {output.filtered} &> {log} 77 | fi 78 | bcftools index {output.filtered} &>> {log} 79 | 80 | #figure out how many SNPs are left, then identify how big of SNP window size to get down to between 100 and 150k snps 81 | ALLSITES=`bcftools query -f '%CHROM\t%POS\n' {output.filtered} | wc -l` 82 | SITES=`echo $(( ${{ALLSITES}} / 100000 ))` 83 | 84 | #if the top VCF has < 150k SNPs, then just take all the SNPs 85 | if [[ $SITES -gt 1 ]] 86 | then 87 | bcftools +prune -w $SITES -n 1 -N rand -O z -o {output.pruned} {output.filtered} &>> {log} 88 | else 89 | bcftools view -O z -o {output.pruned} {output.filtered} &>> {log} 90 | fi 91 | 92 | bcftools query -f '%CHROM\t%POS\t%ID\t%INFO/AF\t%QUAL\t%INFO/ReadPosRankSum\t%INFO/FS\t%INFO/SOR\t%INFO/MQ\t%INFO/MQRankSum\n' {output.pruned} > {output.snpqc} 2>> {log} 93 | 94 | ##copy the fai file into the QC folder for easy access 95 | cp {input.fai} {output.fai} &>> {log} 96 | cp {input.sumstats} {output.sumstats} &>> {log} 97 | """ 98 | 99 | rule plink: 100 | """ 101 | Call plink PCA. 102 | """ 103 | input: 104 | vcf = "results/{refGenome}/QC/{prefix}.pruned.vcf.gz", 105 | faiResult = "results/{refGenome}/QC/{prefix}_fai_tmp.txt" 106 | params: 107 | prefix = lambda wc, input: input.vcf.replace(".pruned.vcf.gz", "") 108 | output: 109 | bed = "results/{refGenome}/QC/{prefix}.bed", 110 | bim = "results/{refGenome}/QC/{prefix}.bim", 111 | fam = "results/{refGenome}/QC/{prefix}.fam", 112 | eigenvec = "results/{refGenome}/QC/{prefix}.eigenvec", 113 | eigenval = "results/{refGenome}/QC/{prefix}.eigenval", 114 | dist = "results/{refGenome}/QC/{prefix}.dist", 115 | distid = "results/{refGenome}/QC/{prefix}.dist.id", 116 | king = "results/{refGenome}/QC/{prefix}.king" 117 | conda: 118 | "envs/plink.yml" 119 | log: 120 | "logs/{refGenome}/QC/plink/{prefix}.txt" 121 | shell: 122 | #plink 2 for king relatedness matrix (robust to structure) and plink 1.9 for distance matrix 123 | """ 124 | plink2 --vcf {input.vcf} --pca 10 --out {params.prefix} --allow-extra-chr --autosome-num 95 --make-bed --make-king square --const-fid --bad-freqs &> {log} 125 | plink --vcf {input.vcf} --out {params.prefix} --allow-extra-chr --autosome-num 95 --distance square --const-fid &>> {log} 126 | """ 127 | 128 | rule setup_admixture: 129 | """ 130 | admixture requires all chromosome names to be integers, this sets them to be 1:n 131 | """ 132 | input: 133 | bim = "results/{refGenome}/QC/{prefix}.bim", 134 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai", 135 | output: 136 | bim = "results/{refGenome}/QC/{prefix}.bim_fixed", 137 | bim_back = "results/{refGenome}/QC/{prefix}.bim.orig" 138 | script: 139 | "scripts/contigs4admixture.py" 140 | 141 | rule admixture: 142 | """ 143 | Call Admixture. 
First, make a bim file that has no characters in the chromosome names 144 | """ 145 | input: 146 | bed = "results/{refGenome}/QC/{prefix}.bed", 147 | bim = "results/{refGenome}/QC/{prefix}.bim", 148 | fam = "results/{refGenome}/QC/{prefix}.fam", 149 | bim_fixed = "results/{refGenome}/QC/{prefix}.bim_fixed", 150 | bim_back = "results/{refGenome}/QC/{prefix}.bim.orig" 151 | output: 152 | admix = "results/{refGenome}/QC/{prefix}.3.Q", 153 | admix2 = "results/{refGenome}/QC/{prefix}.2.Q" 154 | params: 155 | outdir = lambda wc, input: input.bed.rsplit("/", 1)[0] 156 | log: 157 | "logs/{refGenome}/QC/admixture/{prefix}.txt" 158 | conda: 159 | "envs/admixture.yml" 160 | shell: 161 | """ 162 | mv {input.bim_fixed} {input.bim} 2> {log} 163 | 164 | admixture {input.bed} 2 &>> {log} 165 | admixture {input.bed} 3 &>> {log} 166 | 167 | mv "{wildcards.prefix}".2.* {params.outdir} &>> {log} 168 | mv "{wildcards.prefix}".3.* {params.outdir} &>> {log} 169 | """ 170 | 171 | rule generate_coords_file: 172 | output: 173 | "results/{refGenome}/QC/{prefix}.coords.txt" 174 | run: 175 | out_df = samples.loc[(samples['refGenome'] == wildcards.refGenome)][["BioSample", "long", "lat"]] 176 | out_df.drop_duplicates("BioSample", inplace=True) 177 | out_df.dropna(subset=["long", "lat"], thresh=1, inplace=True) 178 | out_df.to_csv(output[0], index=False, sep="\t", header=False) 179 | 180 | rule qc_plots: 181 | """ 182 | Call plotting script 183 | """ 184 | input: 185 | eigenvec = "results/{refGenome}/QC/{prefix}.eigenvec", 186 | eigenval = "results/{refGenome}/QC/{prefix}.eigenval", 187 | depth = "results/{refGenome}/QC/{prefix}.idepth", 188 | dist = "results/{refGenome}/QC/{prefix}.dist", 189 | distid = "results/{refGenome}/QC/{prefix}.dist.id", 190 | king = "results/{refGenome}/QC/{prefix}.king", 191 | miss = "results/{refGenome}/QC/{prefix}.imiss", 192 | admix3 = "results/{refGenome}/QC/{prefix}.3.Q", 193 | admix2 = "results/{refGenome}/QC/{prefix}.2.Q", 194 | snpqc = "results/{refGenome}/QC/{prefix}_snpqc.txt", 195 | faiResult = "results/{refGenome}/QC/{prefix}_fai_tmp.txt", 196 | bed = "results/{refGenome}/QC/{prefix}.bed", 197 | bim = "results/{refGenome}/QC/{prefix}.bim", 198 | fam = "results/{refGenome}/QC/{prefix}.fam", 199 | sumstats = "results/{refGenome}/QC/{prefix}_bam_sumstats.txt", 200 | summ = "results/{refGenome}/QC/{prefix}.FILTER.summary", 201 | het = "results/{refGenome}/QC/{prefix}.het", 202 | fai = "results/{refGenome}/QC/{prefix}.fna.fai", 203 | coords = get_coords_if_available 204 | params: 205 | prefix = lambda wc, input: input.het[:-4], 206 | nClusters = config['nClusters'], 207 | GMKey = config['GoogleAPIKey'] 208 | output: 209 | qcpdf = "results/{refGenome}/QC/{prefix}_qc.html" 210 | conda: 211 | "envs/qc.yml" 212 | script: 213 | "scripts/qc_dashboard_render.R" 214 | --------------------------------------------------------------------------------