├── .test
│ ├── qc
│ │ ├── results
│ │ │ └── genome1
│ │ │ │ ├── test_qc_raw.vcf.gz.tbi
│ │ │ │ ├── test_qc_raw.vcf.gz
│ │ │ │ ├── data
│ │ │ │ │ └── genome
│ │ │ │ │ │ └── genome1.fna.fai
│ │ │ │ └── summary_stats
│ │ │ │ │ └── test_qc_bam_sumstats.txt
│ │ └── config
│ │ │ ├── test.csv
│ │ │ ├── samples.csv
│ │ │ ├── resources.yaml
│ │ │ ├── config.yaml
│ │ │ └── test_qc_gls_config.yaml
│ ├── postprocess
│ │ ├── results
│ │ │ └── genome1
│ │ │ │ ├── test_postprocess_raw.vcf.gz.tbi
│ │ │ │ ├── test_postprocess_raw.vcf.gz
│ │ │ │ └── data
│ │ │ │ │ └── genome
│ │ │ │ │ │ └── genome1.fna.fai
│ │ └── config
│ │ │ ├── test.csv
│ │ │ ├── samples.csv
│ │ │ ├── resources.yaml
│ │ │ ├── test_qc_gls_config.yaml
│ │ │ └── config.yaml
│ ├── trackhub
│ │ ├── results
│ │ │ └── genome1
│ │ │ │ ├── data
│ │ │ │ │ └── genome
│ │ │ │ │ │ └── genome1.fna.fai
│ │ │ │ ├── test_postprocess_clean_snps.vcf.gz
│ │ │ │ └── test_postprocess_clean_snps.vcf.gz.tbi
│ │ └── config
│ │ │ ├── test.csv
│ │ │ ├── samples.csv
│ │ │ ├── resources.yaml
│ │ │ ├── test_qc_gls_config.yaml
│ │ │ └── config.yaml
│ ├── ecoli
│ │ ├── data
│ │ │ ├── local_genome
│ │ │ │ └── local_genome.fna.gz
│ │ │ └── local_fastq
│ │ │ │ ├── my_sample1_1.fastq.gz
│ │ │ │ ├── my_sample1_2.fastq.gz
│ │ │ │ ├── my_sample2_1.fastq.gz
│ │ │ │ └── my_sample2_2.fastq.gz
│ │ ├── config
│ │ │ ├── ecoli_config_genome.csv
│ │ │ ├── local_and_sra.csv
│ │ │ ├── ecoli_samples.csv
│ │ │ ├── resources.yaml
│ │ │ └── config.yaml
│ │ └── workflow
│ │ │ └── scripts
│ │ │ │ └── samples_to_keep.py
│ └── ci
│ │ └── config
│ │ │ ├── samples.csv
│ │ │ ├── resources.yaml
│ │ │ └── config.yaml
├── docs
│ ├── img
│ │ └── logo.png
│ ├── requirements.txt
│ ├── datasets.md
│ ├── conf.py
│ ├── index.md
│ ├── modules.md
│ ├── executing.md
│ └── examples.md
├── workflow
│ ├── envs
│ │ ├── bcftools.yml
│ │ ├── sambamba.yml
│ │ ├── sentieon.yml
│ │ ├── mappability.yml
│ │ ├── ucsc.yml
│ │ ├── angsd.yml
│ │ ├── cov_filter.yml
│ │ ├── bam2vcf.yml
│ │ └── fastq2bam.yml
│ ├── modules
│ │ ├── mk
│ │ │ ├── envs
│ │ │ │ ├── mk.yml
│ │ │ │ └── ncbi.yml
│ │ │ ├── config
│ │ │ │ └── config.yaml
│ │ │ ├── common.smk
│ │ │ └── Snakefile
│ │ ├── qc
│ │ │ ├── config
│ │ │ │ ├── test.csv
│ │ │ │ ├── config.yaml
│ │ │ │ └── test_qc_gls_config.yaml
│ │ │ ├── envs
│ │ │ │ ├── admixture.yml
│ │ │ │ ├── subsample_snps.yml
│ │ │ │ ├── vcftools_individuals.yml
│ │ │ │ ├── plink.yml
│ │ │ │ └── qc.yml
│ │ │ ├── common.smk
│ │ │ ├── scripts
│ │ │ │ ├── contigs4admixture.py
│ │ │ │ └── qc_dashboard_render.R
│ │ │ └── Snakefile
│ │ ├── postprocess
│ │ │ ├── envs
│ │ │ │ ├── bed.yml
│ │ │ │ └── filter.yml
│ │ │ ├── config
│ │ │ │ └── config.yaml
│ │ │ └── Snakefile
│ │ ├── template
│ │ │ ├── config
│ │ │ │ └── config.yaml
│ │ │ └── Snakefile
│ │ └── trackhub
│ │ │ ├── envs
│ │ │ │ └── trackhub.yml
│ │ │ ├── config
│ │ │ │ └── config.yaml
│ │ │ ├── scripts
│ │ │ │ ├── vcftools_out_to_bg.py
│ │ │ │ └── write_hub_files.py
│ │ │ ├── html
│ │ │ │ └── hub_description.html
│ │ │ └── Snakefile
│ ├── scripts
│ │ ├── samples_to_keep.py
│ │ ├── make_intervals.py
│ │ ├── create_coverage_thresholds.py
│ │ └── create_coverage_bed.py
│ ├── Snakefile
│ ├── rules
│ │ ├── fastq2bam.smk
│ │ ├── mappability.smk
│ │ ├── fastq.smk
│ │ ├── reference.smk
│ │ ├── sumstats.smk
│ │ ├── intervals.smk
│ │ ├── cov_filter.smk
│ │ ├── bam2vcf_gatk.smk
│ │ ├── sentieon.smk
│ │ └── bam2vcf_gatk_intervals.smk
│ └── snparcher_utils
│ │ ├── __init__.py
│ │ └── write_samples.py
├── .readthedocs.yaml
├── .github
│ └── workflows
│ │ ├── ci.yaml
│ │ └── main.yaml
├── .gitignore
├── LICENSE
├── README.md
├── workflow-profiles
│ └── default
│ │ └── config.yaml
└── config
│ └── config.yaml
/.test/qc/results/genome1/test_qc_raw.vcf.gz.tbi:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.test/postprocess/results/genome1/test_postprocess_raw.vcf.gz.tbi:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.test/trackhub/results/genome1/data/genome/genome1.fna.fai:
--------------------------------------------------------------------------------
1 | JAKDEW010000001.1 53793026 30 60 61
2 |
--------------------------------------------------------------------------------
/docs/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/docs/img/logo.png
--------------------------------------------------------------------------------
/.test/qc/results/genome1/data/genome/genome1.fna.fai:
--------------------------------------------------------------------------------
1 | JAKDEW010000001.1 1 3
2 | JAKDEW010000002.1 1 3
3 |
--------------------------------------------------------------------------------
/.test/qc/config/test.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,
2 | sample,sample1,genome1,1,test,x
3 |
4 |
--------------------------------------------------------------------------------
/.test/trackhub/config/test.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,
2 | sample,sample1,genome1,1,test,x
3 |
4 |
--------------------------------------------------------------------------------
/workflow/envs/bcftools.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bcftools==1.10
--------------------------------------------------------------------------------
/workflow/modules/mk/envs/mk.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - degenotate
--------------------------------------------------------------------------------
/.test/postprocess/config/test.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,
2 | sample,sample1,genome1,1,test,x
3 |
4 |
--------------------------------------------------------------------------------
/workflow/modules/qc/config/test.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,
2 | sample,sample1,genome1,1,test,x
3 |
4 |
--------------------------------------------------------------------------------
/workflow/modules/qc/envs/admixture.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - admixture==1.3.0
--------------------------------------------------------------------------------
/.test/qc/results/genome1/test_qc_raw.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/qc/results/genome1/test_qc_raw.vcf.gz
--------------------------------------------------------------------------------
/workflow/modules/postprocess/envs/bed.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bedtools==2.30
7 |
--------------------------------------------------------------------------------
/workflow/modules/postprocess/envs/filter.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bcftools==1.16
7 |
--------------------------------------------------------------------------------
/workflow/modules/qc/envs/subsample_snps.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bcftools==1.12
7 |
--------------------------------------------------------------------------------
/.test/ecoli/data/local_genome/local_genome.fna.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_genome/local_genome.fna.gz
--------------------------------------------------------------------------------
/.test/qc/config/samples.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject
2 | testENA,EK7.12,GCA_000008865.2,ERR699557,Escherichia coli,PRJNA563564
3 |
--------------------------------------------------------------------------------
/workflow/envs/sambamba.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - sambamba==0.8.0
7 | - python==3.11.4
8 |
--------------------------------------------------------------------------------
/workflow/modules/qc/envs/vcftools_individuals.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - vcftools==0.1.16
7 |
--------------------------------------------------------------------------------
/.test/ecoli/data/local_fastq/my_sample1_1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample1_1.fastq.gz
--------------------------------------------------------------------------------
/.test/ecoli/data/local_fastq/my_sample1_2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample1_2.fastq.gz
--------------------------------------------------------------------------------
/.test/ecoli/data/local_fastq/my_sample2_1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample2_1.fastq.gz
--------------------------------------------------------------------------------
/.test/ecoli/data/local_fastq/my_sample2_2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/ecoli/data/local_fastq/my_sample2_2.fastq.gz
--------------------------------------------------------------------------------
/.test/trackhub/config/samples.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject
2 | testENA,EK7.12,GCA_000008865.2,ERR699557,Escherichia coli,PRJNA563564
3 |
--------------------------------------------------------------------------------
/.test/postprocess/config/samples.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject
2 | testENA,EK7.12,GCA_000008865.2,ERR699557,Escherichia coli,PRJNA563564
3 |
--------------------------------------------------------------------------------
/workflow/modules/qc/envs/plink.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - plink2==2.00a2.3
7 | - plink==1.90b6.21
8 |
--------------------------------------------------------------------------------
/workflow/envs/sentieon.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - sentieon
7 | - python==3.11.4
8 | - samtools>=1.12
9 |
--------------------------------------------------------------------------------
/workflow/envs/mappability.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bedtools==2.30.0
7 | - genmap>=1.3.0
8 | - python==3.11.4
--------------------------------------------------------------------------------
/.test/postprocess/results/genome1/test_postprocess_raw.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/postprocess/results/genome1/test_postprocess_raw.vcf.gz
--------------------------------------------------------------------------------
/workflow/modules/mk/envs/ncbi.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - ncbi-datasets-cli==11.25.1
7 | - p7zip==16.02
8 | - pigz==2.6
--------------------------------------------------------------------------------
/.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Defining the exact version will make sure things don't break
2 | sphinx==5.3.0
3 | sphinx_rtd_theme==1.1.1
4 | readthedocs-sphinx-search==0.1.1
5 | myst-parser==1.0.0
6 |
--------------------------------------------------------------------------------
/.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harvardinformatics/snpArcher/HEAD/.test/trackhub/results/genome1/test_postprocess_clean_snps.vcf.gz.tbi
--------------------------------------------------------------------------------
/workflow/envs/ucsc.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - python==3.11.4
7 | - ucsc-fatotwobit==377
8 | - ucsc-twobitinfo==377
9 |
10 |
--------------------------------------------------------------------------------
/workflow/modules/template/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/samples.csv" # path to the sample metadata CSV
6 |
7 |
--------------------------------------------------------------------------------
/.test/postprocess/results/genome1/data/genome/genome1.fna.fai:
--------------------------------------------------------------------------------
1 | SCAF_1 122379970 8 60 61
2 | SCAF_2 108119840 124419653 60 61
3 | SCAF_3 107133695 234341499 60 61
4 | SCAF_4 104519870 343260764 60 61
5 | SCAF_5 94801293 449522640 60 61
6 |
--------------------------------------------------------------------------------
/workflow/envs/angsd.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - angsd==0.937
7 | - samtools>=1.12
8 | - python>=3.6
9 | - numpy
10 | - scipy
11 | - cython
12 | - gxx
13 |
--------------------------------------------------------------------------------
/workflow/envs/cov_filter.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bedtools==2.30.0
7 | - mosdepth==0.3.10
8 | - d4tools>=0.3.10
9 | - clam>=0.1.2
10 | - bedtk==0.0.r25.dirty
11 |
12 |
--------------------------------------------------------------------------------
/workflow/modules/mk/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/samples.csv" # path to the sample metadata CSV
6 | final_prefix: "" # prefix for final output files
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/workflow/modules/trackhub/envs/trackhub.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - bedtools==2.30
7 | - bcftools==1.12
8 | - vcftools==0.1.16
9 | - ucsc-bedgraphtobigwig==377
10 | - ucsc-bedtobigbed==377
11 | - ucsc-bedsort==466
--------------------------------------------------------------------------------
/workflow/modules/trackhub/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/samples.csv" # path to the sample metadata CSV
6 | final_prefix: "" # prefix for final output files
7 | trackhub_email: "email@website.com"
8 |
--------------------------------------------------------------------------------
/workflow/envs/bam2vcf.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - gatk4==4.1.8.0
7 | - freebayes==1.3.2
8 | - picard==2.22.8
9 | - samtools==1.11
10 | - vcftools==0.1.16
11 | - bedtools==2.29.2
12 | - pyyaml==5.3.1
13 | - htslib==1.11
14 | - bzip2==1.0.8
15 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: "ubuntu-22.04"
5 | tools:
6 | python: "3.11"
7 |
8 | # Build from the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/conf.py
11 |
12 | # Explicitly set the version of Python and its requirements
13 | python:
14 | install:
15 | - requirements: docs/requirements.txt
16 |
--------------------------------------------------------------------------------
/workflow/modules/qc/envs/qc.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - r-base==4.1.3
7 | - r-tidyverse==1.3.1
8 | - r-plotly==4.9.4.1
9 | - r-flexdashboard==0.5.2
10 | - r-ape==5.5
11 | - r-reshape2==1.4.4
12 | - bioconductor-ggtree==3.2.0
13 | - r-ggmap=3.0.0
14 | - r-ggplot2=3.3.5
15 |
--------------------------------------------------------------------------------
/.test/ecoli/config/ecoli_config_genome.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,Run,Organism,BioProject,fq1,fq2
2 | SAMN12676327,EK7.12,SRR10058855,Escherichia coli,PRJNA563564,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz
3 | SAMN12676342,EK7.30,SRR10058838,Escherichia coli,PRJNA563564,data/local_fastq/my_sample2_1.fastq.gz,data/local_fastq/my_sample2_2.fastq.gz
4 |
--------------------------------------------------------------------------------
/workflow/envs/fastq2bam.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - samtools==1.14
7 | - fastp==0.20.1
8 | - bwa==0.7.17
9 | - sra-tools==3.0.0
10 | - ncbi-datasets-cli>=17.1.0
11 | - p7zip==16.02
12 | - pigz==2.6
13 | - curl>7.73.0
14 | - pip==22.0.4
15 | - bbmap==38.96
16 | - pip:
17 | - ffq
18 |
--------------------------------------------------------------------------------
/.test/ecoli/config/local_and_sra.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,fq1,fq2,refPath
2 | SAMN12676327,EK7.12,GCA_000008865.2,SRR10058855,Escherichia coli,PRJNA563564,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz,data/local_genome/local_genome.fna.gz
3 | SAMN12676342,EK7.30,GCA_003018455.1,SRR10058838,Escherichia coli,PRJNA563564
4 |
--------------------------------------------------------------------------------
/workflow/scripts/samples_to_keep.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 |
4 | infile = sys.argv[1]
5 | DEPTH_CUTOFF = 2
6 | samps = []
7 | with open(infile, "r") as f:
8 | next(f)
9 | for line in f:
10 | line = line.strip().split()
11 | if float(line[2]) >= DEPTH_CUTOFF:
12 | samps.append(line[0])
13 |
14 | for s in samps:
15 | print(s)
16 |
--------------------------------------------------------------------------------
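
Note: a minimal sketch (not part of the repository) of the filter applied by samples_to_keep.py, assuming an .idepth-style input whose columns are sample name, number of sites, and mean depth; the script keeps the first column of every row whose third column is at least DEPTH_CUTOFF:

    # Toy rows standing in for the whitespace-delimited table the script reads.
    rows = [
        ("INDV", "N_SITES", "MEAN_DEPTH"),      # header row is skipped by the script
        ("SAMN12676327", "4641652", "35.2"),
        ("SAMN12676342", "4641652", "1.4"),     # below the 2x cutoff, so dropped
    ]
    DEPTH_CUTOFF = 2
    keep = [r[0] for r in rows[1:] if float(r[2]) >= DEPTH_CUTOFF]
    print(keep)  # ['SAMN12676327']
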
/.test/ci/config/samples.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,SampleType
2 | test1,test1lib,GCF_000146045.2,SRR22893406,yeast,project,exclude
3 | test2,test2lib,GCF_000146045.2,SRR22893439,yeast,project,exclude
4 | test3,test3lib,GCF_000146045.2,SRR22893395,yeast,project
5 | test4,test4lib,GCF_000146045.2,SRR22893419,yeast,project
6 | test5,test5lib,GCF_000146045.2,SRR22893436,yeast,project
--------------------------------------------------------------------------------
/.test/ecoli/workflow/scripts/samples_to_keep.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 |
4 | infile = sys.argv[1]
5 | DEPTH_CUTOFF = 2
6 | samps = []
7 | with open(infile, "r") as f:
8 | next(f)
9 | for line in f:
10 | line = line.strip().split()
11 | if float(line[2]) >= DEPTH_CUTOFF:
12 | samps.append(line[0])
13 |
14 | for s in samps:
15 | print(s)
16 |
--------------------------------------------------------------------------------
/.test/ecoli/config/ecoli_samples.csv:
--------------------------------------------------------------------------------
1 | BioSample,LibraryName,refGenome,Run,Organism,BioProject,fq1,fq2,refPath
2 | SAMN12676327,EK7.12,GCA_000008865.2,SRR10058855,Escherichia coli,PRJNA563564,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz,data/local_genome/local_genome.fna.gz
3 | SAMN12676342,EK7.30,GCA_003018455.1,SRR10058838,Escherichia coli,PRJNA563564,data/local_fastq/my_sample2_1.fastq.gz,data/local_fastq/my_sample2_2.fastq.gz
4 |
--------------------------------------------------------------------------------
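
Note: in ecoli_samples.csv and local_and_sra.csv, rows that fill in fq1/fq2 (and optionally refPath) point snpArcher at local files, while rows that leave them blank fall back to the Run accession. A hypothetical pandas sketch (not the workflow's own logic) that splits a sheet along those lines:

    import pandas as pd

    # Two rows mirroring the sheet above: one with local FASTQs, one SRA-only.
    df = pd.DataFrame([
        {"BioSample": "SAMN12676327", "Run": "SRR10058855",
         "fq1": "data/local_fastq/my_sample1_1.fastq.gz",
         "fq2": "data/local_fastq/my_sample1_2.fastq.gz"},
        {"BioSample": "SAMN12676342", "Run": "SRR10058838", "fq1": None, "fq2": None},
    ])
    has_local = df["fq1"].notna() & df["fq2"].notna()
    print("local FASTQ rows:", df.loc[has_local, "BioSample"].tolist())
    print("SRA download rows:", df.loc[~has_local, "Run"].tolist())
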
/workflow/modules/qc/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | final_prefix: "" # prefix for final output files
7 |
8 | ##############################
9 | # Variables you *might* need to change
10 | ##############################
11 |
12 | ## QC options ##
13 | nClusters: 3
14 | GoogleAPIKey:
15 | min_depth: 2
16 |
--------------------------------------------------------------------------------
/workflow/modules/template/Snakefile:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from pathlib import Path
4 |
5 | # Get utils. This is not great, but we can move to setup.py and install via pip later if want
6 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve()
7 | if str(utils_path) not in sys.path:
8 | sys.path.append(str(utils_path))
9 |
10 | import pandas as pd
11 | import snparcher_utils
12 |
13 | configfile: "config/config.yaml"
14 | wildcard_constraints:
15 | window="\d+"
16 |
17 | samples = snparcher_utils.parse_sample_sheet(config)
18 |
19 | # Define rules here
20 | rule all:
21 | pass
--------------------------------------------------------------------------------
/docs/datasets.md:
--------------------------------------------------------------------------------
1 | # Datasets Produced by snpArcher
2 | A number of resequencing datasets have been processed with snpArcher to generate consistent variant calls, which are available via [Globus](https://www.globus.org/) in the [Comparative Population Genomics Data collection](https://app.globus.org/file-manager?origin_id=d2b75419-85ad-4871-8f34-003d73bbae7d&origin_path=%2F). Details of data processing are described [here](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1). If you use any of these datasets in your projects, please cite both the snpArcher paper and the original data producers.
3 |
4 | If you would like to contribute datasets you have created using snpArcher, please get in touch!
5 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 | pull_request:
4 | paths-ignore:
5 | - "docs/**"
6 | - "**.md"
7 | branches:
8 | - main
9 |
10 |
11 | jobs:
12 | Testing:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Test whole workflow
17 | uses: snakemake/snakemake-github-action@v1.25.1
18 | with:
19 | directory: .test/ci/
20 | snakefile: workflow/Snakefile
21 | args: "--use-conda --show-failed-logs -j 1 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
22 | stagein: "conda config --set channel_priority strict"
23 |
--------------------------------------------------------------------------------
/.test/qc/results/genome1/summary_stats/test_qc_bam_sumstats.txt:
--------------------------------------------------------------------------------
1 | Sample Total_Reads Percent_mapped Num_duplicates Percent_properly_paired Fraction_reads_pass_filter Num_filtered_reads
2 | test_A01 74002570 90.94 0 84.18 0.02587990396094019 2240784
3 | test_B01 80220874 95.94 0 89.27 0.6775766480787638 60496912
4 | test_C01 110928249 68.51 0 62.83 0.05437316857757944 6623216
5 | test_D01 87593075 96.97 0 88.18 0.9819319302461245 97194908
6 | test_E01 102503748 97.45 0 88.18 0.9798935985581342 112458212
7 | test_F01 118251228 97.90 0 90.86 0.9858274422405399 136304548
8 | test_G01 63596400 97.38 0 90.66 0.043010646075266695 3166694
9 | test_H01 102382268 97.54 0 88.44 0.08283396693724794 9375764
10 | test_A02 49840099 91.05 0 84.07 0.019503543101684753 1130540
--------------------------------------------------------------------------------
/workflow/modules/postprocess/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/samples.csv" # name of the sample metadata CSV
6 | final_prefix: "" # prefix for final output files
7 |
8 | ##############################
9 | # Variables you *might* need to change
10 | ##############################
11 |
12 | ## Filtering options ##
13 |
14 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable.
15 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable.
16 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable.
17 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
--------------------------------------------------------------------------------
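
Note: a hedged sketch (not part of the repository) of how the maf and missingness thresholds above could be turned into a bcftools exclude expression; the postprocess module's actual filtering commands live in its Snakefile and may differ:

    # Assemble a hypothetical `bcftools view -e` expression from the config values.
    cfg = {"maf": 0.01, "missingness": 0.75}
    expr = f"MAF < {cfg['maf']} || F_MISSING > {cfg['missingness']}"
    cmd = f"bcftools view -e '{expr}' raw.vcf.gz -Oz -o clean_snps.vcf.gz"
    print(cmd)
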
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 |
3 | # -- Project information
4 |
5 | project = 'snpArcher'
6 | copyright = '2023, Cade Mirchandani'
7 | author = 'Cade Mirchandani'
8 |
9 | release = '0.1'
10 | version = '0.1.0'
11 |
12 | # -- General configuration
13 |
14 | extensions = [
15 | 'sphinx.ext.duration',
16 | 'sphinx.ext.doctest',
17 | 'sphinx.ext.autodoc',
18 | 'sphinx.ext.autosummary',
19 | 'sphinx.ext.intersphinx',
20 | 'myst_parser'
21 | ]
22 |
23 | intersphinx_mapping = {
24 | 'python': ('https://docs.python.org/3/', None),
25 | 'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
26 | }
27 | intersphinx_disabled_domains = ['std']
28 |
29 | templates_path = ['_templates']
30 | myst_enable_extensions = [
31 | "html_image"
32 | ]
33 | # -- Options for HTML output
34 |
35 | html_theme = 'sphinx_rtd_theme'
36 |
37 | # -- Options for EPUB output
38 | epub_show_urls = 'footnote'
39 |
--------------------------------------------------------------------------------
/workflow/modules/qc/common.smk:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 |
4 | # Get utils. This is not great, but we can move to setup.py and install later if want
5 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve()
6 | if str(utils_path) not in sys.path:
7 | sys.path.append(str(utils_path))
8 |
9 | import pandas as pd
10 | import snparcher_utils
11 |
12 | def get_coords_if_available(wildcards):
13 | if 'lat' in samples.columns and 'long' in samples.columns:
14 | return "results/{refGenome}/QC/{prefix}.coords.txt"
15 | return []
16 |
17 | def check_contig_names(fai, touch_file):
18 | dffai = pd.read_table(fai, sep='\t', header = None)
19 | fai_result=pd.to_numeric(dffai[0], errors='coerce').notnull().all()
20 | if fai_result==True:
21 | print("QC plots not generated because contig names are numeric and plink does not accept numeric contig names")
22 | elif fai_result==False:
23 | with open(touch_file, "w") as writer:
24 | writer.write("contigs are strings")
25 |
--------------------------------------------------------------------------------
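
Note: check_contig_names decides whether QC plots can be made by coercing the first .fai column to numbers; if every contig name is numeric, plink will not accept them and the touch file is not written. A minimal sketch (not part of the repository) of the same test on toy data:

    import pandas as pd

    # Same coercion trick: all-numeric names -> True, string names -> False.
    numeric_names = pd.Series(["1", "2", "3"])
    string_names = pd.Series(["JAKDEW010000001.1", "JAKDEW010000002.1"])
    print(pd.to_numeric(numeric_names, errors="coerce").notnull().all())  # True  -> skip QC plots
    print(pd.to_numeric(string_names, errors="coerce").notnull().all())   # False -> write touch file
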
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | pixi.lock
3 | pixi.toml
4 | data_preparation/
5 | .snakemake*
6 | template_slurm.sh
7 | slurm_logs/*
8 | # indexing and creating a sequence dictionary gets done within pipeline
9 | data/zebraFinch/genome/*.sa
10 | data/zebraFinch/genome/*.pac
11 | data/zebraFinch/genome/*.bwt
12 | data/zebraFinch/genome/*.ann
13 | data/zebraFinch/genome/*.amb
14 | data/zebraFinch/genome/*.fai
15 | data/zebraFinch/genome/*.dict
16 | rules/.snakemake
17 | data/BHduck/genome/*.sa
18 | data/BHduck/genome/*.pac
19 | data/BHduck/genome/*.bwt
20 | data/BHduck/genome/*.ann
21 | data/BHduck/genome/*.amb
22 | data/BHduck/genome/*.fai
23 | data/BHduck/genome/*.dict
24 | /data/
25 | fastp.*
26 | intervalFiles/
27 | out
28 | err
29 | __pycache__
30 | log/
31 | fastq2bam/
32 | intervalFiles/
33 | freebayes/
34 | gatk/
35 | logs/
36 | *_dryrun.txt
37 | results/
38 | tmp/
39 | .test/ecoli/benchmarks/
40 | .test/ecoli/logs/
41 | .test/ecoli/results/
42 | .test/ecoli/data/
43 | *.lic
44 | .test/ci/results
45 | .test/ci/benchmarks
46 | .test/ci/logs
47 | .vscode
48 | .test/trackhub/*.sizes
49 | .test/trackhub/out.log
50 | # pixi environments
51 | .pixi
52 | *.egg-info
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020-2021 Harvard Informatics
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/workflow/modules/qc/scripts/contigs4admixture.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import shutil
3 |
4 | def generate_mapping(input_file, bim_file, output_file):
5 |
6 | conversion_dict = {}
7 | with open(input_file, 'r') as f:
8 | for line in f:
9 | line = line.strip().split()
10 | conversion_dict[line[0]] = line[1]
11 |
12 | # Copy original bim file to a new file with ".orig" appended to its name
13 | orig_bim_file = bim_file + ".orig"
14 | shutil.copyfile(bim_file, orig_bim_file)
15 |
16 | # read bim file and replace the scaffold names with numbering 1:n (n = number of scaffolds)
17 | updated_lines = []
18 | with open(bim_file, 'r') as f:
19 | for line in f:
20 | elements = line.strip().split('\t')
21 | scaffold = elements[0]
22 | if scaffold in conversion_dict:
23 | elements[0] = conversion_dict[scaffold]
24 | updated_lines.append('\t'.join(elements))
25 |
26 | with open(output_file, 'w') as f:
27 | for line in updated_lines:
28 | f.write(line + '\n')
29 |
30 | input_file = snakemake.input.fai
31 | bim_file = snakemake.input.bim
32 | output_file = snakemake.output.bim
33 | generate_mapping(input_file, bim_file, output_file)
34 |
--------------------------------------------------------------------------------
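
Note: a toy sketch (not part of the repository) of the .bim rewrite that generate_mapping performs: the first column is swapped from scaffold names to the numeric ids in the mapping file, since ADMIXTURE expects integer contig codes:

    # In-memory stand-ins for the mapping file and the .bim lines.
    conversion = {"JAKDEW010000001.1": "1", "JAKDEW010000002.1": "2"}
    bim_lines = [
        "JAKDEW010000001.1\tsnp1\t0\t1500\tA\tG",
        "JAKDEW010000002.1\tsnp2\t0\t820\tC\tT",
    ]
    updated = []
    for line in bim_lines:
        fields = line.split("\t")
        fields[0] = conversion.get(fields[0], fields[0])
        updated.append("\t".join(fields))
    print("\n".join(updated))
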
/workflow/Snakefile:
--------------------------------------------------------------------------------
1 | from snakemake.utils import min_version
2 | min_version("7.0")
3 |
4 | configfile: "config/config.yaml"
5 | include: "rules/common.smk"
6 | include: "rules/sumstats.smk"
7 | include: "rules/fastq.smk"
8 | include: "rules/reference.smk"
9 | include: "rules/mappability.smk"
10 |
11 | setup_curlrc()
12 | onerror: cleanup_curlrc()
13 | onsuccess: cleanup_curlrc()
14 |
15 |
16 | if config['sentieon']:
17 | include: "rules/sentieon.smk"
18 | else:
19 | include: "rules/fastq2bam.smk"
20 | if config['intervals']:
21 | include: "rules/bam2vcf_gatk_intervals.smk"
22 | include: "rules/intervals.smk"
23 | else:
24 | include: "rules/bam2vcf_gatk.smk"
25 |
26 | if config['cov_filter']:
27 | include: "rules/cov_filter.smk"
28 |
29 | module qc:
30 | snakefile:
31 | "modules/qc/Snakefile"
32 | config:
33 | config
34 |
35 | use rule * from qc as qc_*
36 |
37 | module mk:
38 | snakefile:
39 | "modules/mk/Snakefile"
40 | config:
41 | config
42 |
43 | use rule * from mk as mk_*
44 |
45 | module postprocess:
46 | snakefile:
47 | "modules/postprocess/Snakefile"
48 | config:
49 | config
50 |
51 | use rule * from postprocess as postprocess_*
52 |
53 | module trackhub:
54 | snakefile:
55 | "modules/trackhub/Snakefile"
56 | config:
57 | config
58 |
59 | use rule * from trackhub as trackhub_*
60 |
61 | rule all:
62 | input:
63 | get_output()
64 | default_target: True
65 |
--------------------------------------------------------------------------------
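
Note: the Snakefile above wires in the qc, mk, postprocess and trackhub modules with Snakemake's module/use-rule syntax, handing each one the top-level config. An illustrative sketch (not part of the repository; the paths are assumptions) of reusing one of these modules from an external Snakefile in the same way:

    # Hypothetical consumer Snakefile pointing at a local clone of snpArcher.
    configfile: "config/config.yaml"

    module snparcher_qc:
        snakefile:
            "snpArcher/workflow/modules/qc/Snakefile"   # assumed checkout location
        config:
            config

    use rule * from snparcher_qc as qc_*
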
/workflow/modules/mk/common.smk:
--------------------------------------------------------------------------------
1 | import glob
2 | import re
3 | import sys
4 | import os
5 | from pathlib import Path
6 |
7 | # Get utils. This is not great, but we can move to setup.py and install via pip later if want
8 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve()
9 | if str(utils_path) not in sys.path:
10 | sys.path.append(str(utils_path))
11 |
12 | import pandas as pd
13 | import snparcher_utils
14 |
15 | samples = snparcher_utils.parse_sample_sheet(config)
16 |
17 | def get_ref(wildcards):
18 | if 'refPath' in samples.columns:
19 | _refs = samples.loc[(samples['refGenome'] == wildcards.refGenome)]['refPath'].dropna().unique().tolist()
20 | for ref in _refs:
21 | if not os.path.exists(ref):
22 | raise WorkflowError(f"Reference genome {ref} does not exist")
23 | elif ref.rsplit(".", 1)[1] == 'gz':
24 | raise WorkflowError(f"Reference genome {ref} must be unzipped first.")
25 | return _refs
26 | else:
27 | return []
28 |
29 | def get_gff(wildcards):
30 | if 'refGFF' in samples.columns:
31 | _refs = samples.loc[(samples['refGenome'] == wildcards.refGenome)]['refGFF'].dropna().unique().tolist()
32 | for ref in _refs:
33 | if not os.path.exists(ref):
34 | raise WorkflowError(f"Reference gff {ref} does not exist")
35 | elif ref.rsplit(".", 1)[1] == 'gz':
36 | raise WorkflowError(f"Reference gff {ref} must be unzipped first.")
37 | return _refs
38 | else:
39 | return []
40 |
--------------------------------------------------------------------------------
/workflow/modules/qc/scripts/qc_dashboard_render.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | render_qcplots <- function(prefix, nClusters, GMKey){
4 | #specify the snakemake pipeline working d to knit with
5 | workd <- getwd()
6 | output.path <- gsub(".idepth", "_qc.html", normalizePath(paste0(prefix, ".idepth"))) #generate full path of output - brute force because I had issues with relative paths
7 |
8 | script.in <- paste0(snakemake@scriptdir, "/qc_dashboard_interactive.Rmd") #get real path of dashboard script
9 | script.out <- gsub(".Rmd", ".html", paste0(snakemake@scriptdir, "/qc_dashboard_interactive.Rmd")) #get name of future html
10 |
11 | rmarkdown::render(script.in, #knit the markdown file to html
12 | params = list(prefix = prefix, nClusters = nClusters, GMKey = GMKey), #pass the path to the QC files that are plotted (via snakemake params)
13 | knit_root_dir = workd) #make sure to knit in the working directory of the snakemake run
14 |
15 | #move the default html output to the QC folder. This is an inconvenience of knitr, which writes
16 | #the html next to the Rmd rather than to the desired output file.
17 | copy_successful <- file.copy(script.out, output.path)
18 |
19 | # Check if the copy was successful
20 | if (copy_successful) {
21 | # If the copy succeeded, delete the original file
22 | file.remove(script.out)
23 | } else {
24 | # If the copy failed, print an error message
25 | cat("snpArcher: Failed to move the qc dashboard html.\n")
26 | }
27 | }
28 |
29 | render_qcplots(snakemake@params[[1]], snakemake@params[[2]], snakemake@params[[3]])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # snpArcher
2 |
3 |
4 |
5 |
6 | snpArcher is a reproducible workflow optimized for nonmodel organisms and comparisons across datasets, built on the [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html#) workflow management system. It provides a streamlined approach to dataset acquisition, variant calling, quality control, and downstream analysis.
7 |
8 | ### Usage
9 | For usage instructions and complete documentation, please visit our [docs](https://snparcher.readthedocs.io/en/latest/).
10 |
11 | ### Datasets generated by snpArcher
12 | A number of resequencing datasets have been processed with snpArcher to generate consistent variant calls, which are available via [Globus](https://www.globus.org/) in the [Comparative Population Genomics Data collection](https://app.globus.org/file-manager?origin_id=a6580c44-09fd-11ee-be16-195c41bc0be4&origin_path=%2F). Details of data processing are described [in our manuscript](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1). If you use any of these datasets in your projects, please cite both the [snpArcher paper](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1) and the original data producers.
13 |
14 | ### Citing snpArcher
15 | - Cade D Mirchandani, Allison J Shultz, Gregg W C Thomas, Sara J Smith, Mara Baylis, Brian Arnold, Russ Corbett-Detig, Erik Enbody, Timothy B Sackton, A fast, reproducible, high-throughput variant calling workflow for population genomics, Molecular Biology and Evolution, 2023, msad270, https://doi.org/10.1093/molbev/msad270
16 | - Also, make sure to cite the tools you used within snpArcher.
17 |
--------------------------------------------------------------------------------
/workflow/modules/trackhub/scripts/vcftools_out_to_bg.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | def chrom_dict(chrom_sizes_file):
4 | chroms = {}
5 | with open(chrom_sizes_file, "r") as f:
6 | for line in f:
7 | line = line.strip().split()
8 | chroms[line[0]] = int(line[1])
9 | return chroms
10 |
11 | def parse_stat_file(stat_file, out_file, chrom_sizes):
12 | stat_file = Path(stat_file)
13 | file_type = stat_file.suffix
14 | window = int(stat_file.stem)
15 |
16 | with open(out_file, "w") as out:
17 | results = []
18 | with open(stat_file, "r") as inp:
19 | next(inp)
20 | for line in inp:
21 |
22 | line = line.strip().split()
23 | chrom = line[0]
24 | if chrom not in chrom_sizes:
25 |
26 | continue
27 | else:
28 | start = int(line[1])
29 | end = start + (window-1)
30 | if end >= chrom_sizes[chrom]:
31 | end = chrom_sizes[chrom]-1
32 |
33 | if file_type == ".Tajima":
34 | value = line[3]
35 | elif file_type == ".SNP-Density":
36 | value = line[2]
37 | elif file_type == ".Pi":
38 | value = line[4]
39 | else:
40 | raise(ValueError(f"Unknown file type: {file_type}"))
41 |
42 | results.append((chrom,start,end,value))
43 |
44 | sorted_results = sorted(results, key=lambda x: (x[0], x[1]))
45 |
46 | for chrom, start, end, value in sorted_results:
47 | print(f"{chrom}\t{start}\t{end}\t{value}\n", file=out)
48 | def main():
49 | chrom_sizes = chrom_dict(snakemake.input["chrom_sizes"])
50 | parse_stat_file(stat_file=snakemake.input["stat_file"], out_file=snakemake.output[0], chrom_sizes=chrom_sizes)
51 |
52 | if __name__ == "__main__":
53 | main()
--------------------------------------------------------------------------------
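
Note: a minimal sketch (not part of the repository) of the interval arithmetic in parse_stat_file: each windowed statistic becomes a chrom/start/end/value record, with the window end clamped so it never runs past the contig length from the chrom.sizes file:

    # One window near the end of a contig, using the same clamping as the script.
    chrom_sizes = {"JAKDEW010000001.1": 53793026}
    window = 10000
    chrom, start, value = "JAKDEW010000001.1", 53790001, "0.0021"
    end = start + (window - 1)
    if end >= chrom_sizes[chrom]:
        end = chrom_sizes[chrom] - 1   # clamp the final, partial window
    print(f"{chrom}\t{start}\t{end}\t{value}")
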
/workflow/rules/fastq2bam.smk:
--------------------------------------------------------------------------------
1 | rule bwa_map:
2 | input:
3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna",
4 | r1 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_1.fastq.gz",
5 | r2 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_2.fastq.gz",
6 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]),
7 | output:
8 | bam = temp("results/{refGenome}/bams/preMerge/{sample}/{run}.bam"),
9 | bai = temp("results/{refGenome}/bams/preMerge/{sample}/{run}.bam.bai"),
10 | params:
11 | rg = get_read_group
12 | conda:
13 | "../envs/fastq2bam.yml"
14 | log:
15 | "logs/{refGenome}/bwa_mem/{sample}/{run}.txt"
16 | benchmark:
17 | "benchmarks/{refGenome}/bwa_mem/{sample}_{run}.txt"
18 | shell:
19 | "bwa mem -M -t {threads} -R {params.rg} {input.ref} {input.r1} {input.r2} 2> {log} | samtools sort -o {output.bam} - && samtools index {output.bam} {output.bai}"
20 |
21 | rule merge_bams:
22 | input:
23 | merge_bams_input
24 | output:
25 | bam = temp("results/{refGenome}/bams/postMerge/{sample}.bam"),
26 | bai = temp("results/{refGenome}/bams/postMerge/{sample}.bam.bai")
27 | conda:
28 | "../envs/fastq2bam.yml"
29 | log:
30 | "logs/{refGenome}/merge_bams/{sample}.txt"
31 | benchmark:
32 | "benchmarks/{refGenome}/merge_bams/{sample}.txt"
33 | shell:
34 | "samtools merge {output.bam} {input} && samtools index {output.bam} > {log}"
35 |
36 | rule dedup:
37 | input:
38 | unpack(dedup_input)
39 | output:
40 | dedupBam = "results/{refGenome}/bams/{sample}_final.bam",
41 | dedupBai = "results/{refGenome}/bams/{sample}_final.bam.bai",
42 | conda:
43 | "../envs/sambamba.yml"
44 | log:
45 | "logs/{refGenome}/sambamba_dedup/{sample}.txt"
46 | benchmark:
47 | "benchmarks/{refGenome}/sambamba_dedup/{sample}.txt"
48 | shell:
49 | "sambamba markdup -t {threads} {input.bam} {output.dedupBam} 2> {log}"
--------------------------------------------------------------------------------
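
Note: params.rg in bwa_map comes from a get_read_group helper defined in rules not included in this dump. A hedged sketch (not part of the repository; the field choices are assumptions) of the kind of quoted @RG string that `bwa mem -R` expects:

    def make_read_group(sample: str, library: str, run: str, platform: str = "ILLUMINA") -> str:
        # Hypothetical helper: build a single-quoted read-group string for `bwa mem -R`.
        return rf"'@RG\tID:{run}\tSM:{sample}\tLB:{library}\tPL:{platform}'"

    print(make_read_group("SAMN12676327", "EK7.12", "SRR10058855"))
    # '@RG\tID:SRR10058855\tSM:SAMN12676327\tLB:EK7.12\tPL:ILLUMINA'
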
/workflow/rules/mappability.smk:
--------------------------------------------------------------------------------
1 | rule genmap:
2 | input:
3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna",
4 | output:
5 | bg = temp("results/{refGenome}/genmap/{refGenome}.genmap.bedgraph"),
6 | sorted_bg = "results/{refGenome}/genmap/sorted_mappability.bg"
7 | params:
8 | indir = os.path.join(DEFAULT_STORAGE_PREFIX, "results/{refGenome}/genmap_index"),
9 | outdir = os.path.join(DEFAULT_STORAGE_PREFIX, "results/{refGenome}/genmap"),
10 | kmer = config['mappability_k']
11 | log:
12 | "logs/{refGenome}/genmap/log.txt"
13 | benchmark:
14 | "benchmarks/{refGenome}/genmap/benchmark.txt"
15 | conda:
16 | "../envs/mappability.yml"
17 | shell:
18 | # snakemake creates the output directory before the shell command, but genmap doesn't like this, so we remove the directory first.
19 | """
20 | rm -rf {params.indir} && genmap index -F {input.ref} -I {params.indir} &> {log}
21 | genmap map -K {params.kmer} -E 0 -I {params.indir} -O {params.outdir} -bg -T {threads} -v &> {log}
22 | sort -k1,1 -k2,2n {output.bg} > {output.sorted_bg} 2>> {log}
23 | """
24 |
25 | rule mappability_bed:
26 | input:
27 | map = "results/{refGenome}/genmap/sorted_mappability.bg"
28 | output:
29 | callable_sites = "results/{refGenome}/callable_sites/{prefix}_callable_sites_map.bed" if config['cov_filter'] else "results/{refGenome}/{prefix}_callable_sites.bed",
30 | tmp_map = temp("results/{refGenome}/callable_sites/{prefix}_temp_map.bed")
31 | conda:
32 | "../envs/mappability.yml"
33 | benchmark:
34 | "benchmarks/{refGenome}/mapbed/{prefix}_benchmark.txt"
35 | params:
36 | merge = config['mappability_merge'],
37 | mappability = config['mappability_min']
38 | shell:
39 | """
40 | awk 'BEGIN{{OFS="\\t";FS="\\t"}} {{ if($4>={params.mappability}) print $1,$2,$3 }}' {input.map} > {output.tmp_map}
41 | bedtools sort -i {output.tmp_map} | bedtools merge -d {params.merge} -i - > {output.callable_sites}
42 | """
--------------------------------------------------------------------------------
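
Note: the awk one-liner in mappability_bed keeps only bedgraph intervals whose fourth column (mappability) is at least config['mappability_min'], dropping the score before sorting and merging. A minimal Python sketch (not part of the repository) of that filter:

    # Keep intervals at or above the mappability threshold, dropping the score column.
    min_mappability = 1.0   # example value; the real threshold comes from the config
    bedgraph = [
        ("JAKDEW010000001.1", 0, 150, 1.0),
        ("JAKDEW010000001.1", 150, 300, 0.5),   # below threshold, excluded
    ]
    kept = [(c, s, e) for c, s, e, m in bedgraph if m >= min_mappability]
    print(kept)  # [('JAKDEW010000001.1', 0, 150)]
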
/workflow/scripts/make_intervals.py:
--------------------------------------------------------------------------------
1 | from re import I
2 | from snakemake.exceptions import WorkflowError
3 | import os
4 |
5 | """
6 | Reads the output file from ScatterIntervalsByNs and splits the intervals into the specified number of roughly equal groups.
7 | Writes each interval group to an individual file for use by HaplotypeCaller.
8 | """
9 |
10 |
11 | def make_intervals(
12 | in_file: str, num_intervals: int, output_dir: str, int_output_file: str
13 | ) -> None:
14 |
15 | intervals = []
16 |
17 | with open(in_file, "r") as f:
18 | for line in f:
19 | if not line.startswith("@"):
20 | line = line.strip().split()
21 | chrom, start, end, = (
22 | line[0],
23 | int(line[1]),
24 | int(line[2]),
25 | )
26 | size = end - start
27 | intervals.append((chrom, start, end, size))
28 |
29 | if num_intervals > len(intervals):
30 | num_intervals = len(intervals)
31 |
32 | groups = [[] for i in range(num_intervals)]
33 | sums = {i: 0 for i in range(num_intervals)}
34 | c = 0
35 | for chrom, start, end, size in sorted(intervals, key=lambda x: x[3]):
36 | for i in sums:
37 | if c == sums[i]:
38 | groups[i].append((chrom, start, end))
39 | break
40 | sums[i] += size
41 | c = min(sums.values())
42 |
43 | if not os.path.exists(output_dir):
44 | os.mkdir(output_dir)
45 |
46 | with open(int_output_file, "w") as out:
47 | for i, group in enumerate(groups):
48 | file = os.path.join(output_dir, f"{i}.list")
49 | with open(file, "w") as f:
50 | for chrom, start, end in group:
51 | print(f"{chrom}:{start}-{end}", file=f)
52 | print(f"{chrom}:{start}-{end}", file=out)
53 |
54 |
55 | def main():
56 | make_intervals(
57 | snakemake.input["in_file"],
58 | snakemake.params["max_intervals"],
59 | snakemake.output["out_dir"],
60 | snakemake.output["intervals"],
61 | )
62 |
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------
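
Note: the grouping loop in make_intervals works greedily: intervals are taken smallest-first and each one is appended to a group whose running total is currently the minimum, which keeps the per-group base counts roughly balanced. A compact sketch (not part of the repository) of the same idea on toy data:

    # Greedy balancing: always add the next interval to a minimally-loaded group.
    intervals = [("chr3", 1, 100), ("chr2", 1, 400), ("chr1", 1, 1000)]  # sorted by size
    num_groups = 2
    groups = [[] for _ in range(num_groups)]
    totals = [0] * num_groups
    for chrom, start, end in intervals:
        i = totals.index(min(totals))
        groups[i].append(f"{chrom}:{start}-{end}")
        totals[i] += end - start
    print(groups)   # [['chr3:1-100', 'chr1:1-1000'], ['chr2:1-400']]
    print(totals)   # [1098, 399]
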
/workflow-profiles/default/config.yaml:
--------------------------------------------------------------------------------
1 | use-conda: True
2 |
3 | # These resources will be applied to all rules. Can be overriden on a per-rule basis below.
4 | default-resources:
5 | mem_mb: attempt * 16000
6 | mem_mb_reduced: (attempt * 16000) * 0.9 # Mem allocated to java for GATK rules (tries to prevent OOM errors)
7 | # Uncomment and edit following options for slurm execution:
8 | # slurm_partition: ""
9 | # slurm_account: # Same as sbatch -A. Not all clusters use this.
10 | # runtime: 720 # In minutes
11 |
12 | # Control number of threads each rule will use.
13 | set-threads:
14 | # Mappability
15 | genmap: 1
16 |
17 | # Fastq Processing
18 | get_fastq_pe: 6
19 | fastp: 6
20 | # Alignment
21 | bwa_map: 16
22 | dedup: 16
23 |
24 | # GVCF
25 | bam2gvcf: 1 # Does not benefit from more than 2 threads
26 | gvcf2DB: 1 # Does not benefit from more than 2 threads
27 |
28 | # VCF
29 | DB2vcf: 1 # Does not benefit from more than 2 threads
30 | filterVcfs: 1 # Does not benefit from more than 2 threads
31 | sort_gatherVcfs: 1 # Does not benefit from more than 2 threads
32 |
33 | # Callable Bed
34 | compute_d4: 6
35 | clam_loci: 6
36 |
37 | # Sentieon Tools
38 | sentieon_map: 16
39 | sentieon_dedup: 16
40 | sentieon_haplotyper: 32
41 | sentieon_combine_gvcf: 32
42 | # Control/overwrite resources per rule.
43 | # To use this feature, uncomment "set-resources:" below and add rules you want to customize.
44 | # Examples:
45 | #
46 | # set-resources:
47 | # # Example 1: Increase memory for bam2gvcf rule
48 | # bam2gvcf:
49 | # mem_mb: attempt * 64000 # Customize memory allocation
50 | # mem_mb_reduced: (attempt * 64000) * 0.9 # Customize Java memory allocation
51 | #
52 | # # Example 2: Set slurm parameters for a resource-intensive rule
53 | # sentieon_haplotyper:
54 | # mem_mb: attempt * 8000
55 | # mem_mb_reduced: (attempt * 8000) * 0.9
56 | # slurm_partition: high-mem
57 | # runtime: "24:00:00"
58 | # cpus_per_task: 32
59 | #
60 | # To customize a rule, copy one of the example blocks above, paste it under "set-resources:",
61 | # replace the rule name with your target rule, and adjust the resource parameters as needed.
62 |
--------------------------------------------------------------------------------
/.test/qc/config/resources.yaml:
--------------------------------------------------------------------------------
1 | ###
2 | # fastq2bam workflow
3 | ##
4 |
5 | # fastq download
6 | get_fastq_pe:
7 | threads: 10
8 | mem: 5000
9 | # compress fastq
10 | gzip_fastq:
11 | mem: 4000
12 | # fastp program
13 | fastp:
14 | threads: 10
15 | mem: 5000
16 | # index reference
17 | index_ref:
18 | mem: 10000
19 | # genmap map
20 | genmap:
21 | threads: 10
22 | mem: 20000
23 | genmap_sort:
24 | mem: 4000
25 | # bwa mapping
26 | bwa_map:
27 | threads: 31
28 | mem: 15000
29 | # sort bam with picard's SortSam tool
30 | sort_bam:
31 | mem: 25000
32 | # mark duplicates with picard's MarkDuplicates
33 | dedup:
34 | threads: 31
35 | mem: 9000
36 | # calculate BAM summaries with samtools and picard
37 | bam_sumstats:
38 | mem: 9000
39 | merge_bams:
40 | mem: 9000
41 | bedtools:
42 | mem: 4000
43 |
44 | ###
45 | # Intervals workflow
46 | ###
47 |
48 | # preprocess genome, create intervals
49 | # picard's CreateSequenceDictionary, samtools faidx
50 | process_ref:
51 | mem: 15000
52 | # custom python algo to create intervals
53 | create_db_intervals:
54 | mem: 5000
55 |
56 | create_gvcf_intervals:
57 | mem: 5000
58 |
59 | ## Callable sites workflow
60 |
61 | # genmap map
62 | genmap:
63 | threads: 10
64 | mem: 20000
65 | genmap_sort:
66 | mem: 4000
67 | compute_d4:
68 | mem: 4000
69 | threads: 4
70 | merge_d4:
71 | mem: 10000
72 | callable_bed:
73 | mem: 10000
74 |
75 | ###
76 | # bam2vcf workflows
77 | ###
78 |
79 | # gatk HaplotypeCaller
80 | bam2gvcf:
81 | mem: 30000
82 | # gatk GenomicsDBImport
83 | gvcf2DB:
84 | mem: 30000
85 | # gatk GenotypeGVCFs
86 | DB2vcf:
87 | mem: 30000
88 | ## freebayes program only! ##
89 | bam2vcf:
90 | mem: 30000
91 | # gatk filterVcfs
92 | filterVcfs:
93 | mem: 30000
94 | # gatk GatherVcfs
95 | gatherVcfs:
96 | mem: 30000
97 | # picard SortVcf
98 | sortVcf:
99 | mem: 30000
100 | # vcftools program
101 | vcftools:
102 | mem: 30000
103 | # bedtools program
104 | bedtools:
105 | mem: 30000
106 | # plink
107 | plink:
108 | threads: 5
109 | admixture:
110 | mem: 4000
111 |
--------------------------------------------------------------------------------
/.test/postprocess/config/resources.yaml:
--------------------------------------------------------------------------------
1 | ###
2 | # fastq2bam workflow
3 | ##
4 |
5 | # fastq download
6 | get_fastq_pe:
7 | threads: 10
8 | mem: 5000
9 | # compress fastq
10 | gzip_fastq:
11 | mem: 4000
12 | # fastp program
13 | fastp:
14 | threads: 10
15 | mem: 5000
16 | # index reference
17 | index_ref:
18 | mem: 10000
19 | # genmap map
20 | genmap:
21 | threads: 10
22 | mem: 20000
23 | genmap_sort:
24 | mem: 4000
25 | # bwa mapping
26 | bwa_map:
27 | threads: 31
28 | mem: 15000
29 | # sort bam with picard's SortSam tool
30 | sort_bam:
31 | mem: 25000
32 | # mark duplicates with picard's MarkDuplicates
33 | dedup:
34 | threads: 31
35 | mem: 9000
36 | # calculate BAM summaries with samtools and picard
37 | bam_sumstats:
38 | mem: 9000
39 | merge_bams:
40 | mem: 9000
41 | bedtools:
42 | mem: 4000
43 |
44 | ###
45 | # Intervals workflow
46 | ###
47 |
48 | # preprocess genome, create intervals
49 | # picard's CreateSequenceDictionary, samtools faidx
50 | process_ref:
51 | mem: 15000
52 | # custom python algo to create intervals
53 | create_db_intervals:
54 | mem: 5000
55 |
56 | create_gvcf_intervals:
57 | mem: 5000
58 |
59 | ## Callable sites workflow
60 |
61 | # genmap map
62 | genmap:
63 | threads: 10
64 | mem: 20000
65 | genmap_sort:
66 | mem: 4000
67 | compute_d4:
68 | mem: 4000
69 | threads: 4
70 | merge_d4:
71 | mem: 10000
72 | callable_bed:
73 | mem: 10000
74 |
75 | ###
76 | # bam2vcf workflows
77 | ###
78 |
79 | # gatk HaplotypeCaller
80 | bam2gvcf:
81 | mem: 30000
82 | # gatk GenomicsDBImport
83 | gvcf2DB:
84 | mem: 30000
85 | # gatk GenotypeGVCFs
86 | DB2vcf:
87 | mem: 30000
88 | ## freebayes program only! ##
89 | bam2vcf:
90 | mem: 30000
91 | # gatk filterVcfs
92 | filterVcfs:
93 | mem: 30000
94 | # gatk GatherVcfs
95 | gatherVcfs:
96 | mem: 30000
97 | # picard SortVcf
98 | sortVcf:
99 | mem: 30000
100 | # vcftools program
101 | vcftools:
102 | mem: 30000
103 | # bedtools program
104 | bedtools:
105 | mem: 30000
106 | # plink
107 | plink:
108 | threads: 5
109 | admixture:
110 | mem: 4000
111 |
--------------------------------------------------------------------------------
/.test/trackhub/config/resources.yaml:
--------------------------------------------------------------------------------
1 | ###
2 | # fastq2bam workflow
3 | ##
4 |
5 | # fastq download
6 | get_fastq_pe:
7 | threads: 10
8 | mem: 5000
9 | # compress fastq
10 | gzip_fastq:
11 | mem: 4000
12 | # fastp program
13 | fastp:
14 | threads: 10
15 | mem: 5000
16 | # index reference
17 | index_ref:
18 | mem: 10000
19 | # genmap map
20 | genmap:
21 | threads: 10
22 | mem: 20000
23 | genmap_sort:
24 | mem: 4000
25 | # bwa mapping
26 | bwa_map:
27 | threads: 31
28 | mem: 15000
29 | # sort bam with picard's SortSam tool
30 | sort_bam:
31 | mem: 25000
32 | # mark duplicates with picard's MarkDuplicates
33 | dedup:
34 | threads: 31
35 | mem: 9000
36 | # calculate BAM summaries with samtools and picard
37 | bam_sumstats:
38 | mem: 9000
39 | merge_bams:
40 | mem: 9000
41 | bedtools:
42 | mem: 4000
43 |
44 | ###
45 | # Intervals workflow
46 | ###
47 |
48 | # preprocess genome, create intervals
49 | # picard's CreateSequenceDictionary, samtools faidx
50 | process_ref:
51 | mem: 15000
52 | # custom python algo to create intervals
53 | create_db_intervals:
54 | mem: 5000
55 |
56 | create_gvcf_intervals:
57 | mem: 5000
58 |
59 | ## Callable sites workflow
60 |
61 | # genmap map
62 | genmap:
63 | threads: 10
64 | mem: 20000
65 | genmap_sort:
66 | mem: 4000
67 | compute_d4:
68 | mem: 4000
69 | threads: 4
70 | merge_d4:
71 | mem: 10000
72 | callable_bed:
73 | mem: 10000
74 |
75 | ###
76 | # bam2vcf workflows
77 | ###
78 |
79 | # gatk HaplotypeCaller
80 | bam2gvcf:
81 | mem: 30000
82 | # gatk GenomicsDBImport
83 | gvcf2DB:
84 | mem: 30000
85 | # gatk GenotypeGVCFs
86 | DB2vcf:
87 | mem: 30000
88 | ## freebayes program only! ##
89 | bam2vcf:
90 | mem: 30000
91 | # gatk filterVcfs
92 | filterVcfs:
93 | mem: 30000
94 | # gatk GatherVcfs
95 | gatherVcfs:
96 | mem: 30000
97 | # picard SortVcf
98 | sortVcf:
99 | mem: 30000
100 | # vcftools program
101 | vcftools:
102 | mem: 30000
103 | # bedtools program
104 | bedtools:
105 | mem: 30000
106 | # plink
107 | plink:
108 | threads: 5
109 | admixture:
110 | mem: 4000
111 |
--------------------------------------------------------------------------------
/.test/ci/config/resources.yaml:
--------------------------------------------------------------------------------
1 | ###
2 | # fastq2bam rules
3 | ##
4 |
5 | # fastq download
6 | get_fastq_pe:
7 | threads: 8
8 | mem: 4000
9 |
10 | # index reference
11 | index_ref:
12 | mem: 10000
13 |
14 | # fastp program
15 | fastp:
16 | threads: 8
17 | mem: 4000
18 |
19 | # bwa mapping
20 | bwa_map:
21 | threads: 31
22 | mem: 15000
23 | # sort bam with picard's SortSam tool
24 | sort_bam:
25 | threads: 4
26 | mem_per_thread: 1000
27 |
28 | #merge bams
29 | merge_bams:
30 | mem: 9000
31 | threads: 2
32 |
33 | # mark duplicates with picard's MarkDuplicates
34 | dedup:
35 | threads: 31
36 | mem: 9000
37 | # calculate BAM summaries with samtools and picard
38 | bam_sumstats:
39 | mem: 9000
40 | merge_bams:
41 | mem: 9000
42 | bedtools:
43 | mem: 4000
44 |
45 | # Sentieon tools
46 | sentieon_map:
47 | machine_type: "n2d-standard-32"
48 | threads: 31
49 | mem: 15000
50 |
51 | sentieon_dedup:
52 | machine_type: "n2d-standard-32"
53 | threads: 31
54 | mem: 15000
55 |
56 | sentieon_haplotyper:
57 | machine_type: "n2d-standard-32"
58 | threads: 31
59 | mem: 15000
60 |
61 | sentieon_combine_gvcf:
62 | machine_type: "n2d-standard-32"
63 | threads: 31
64 | mem: 15000
65 | disk_mb: 2000000
66 |
67 | ###
68 | # Intervals workflow
69 | ###
70 |
71 | # preprocess genome, create intervals
72 | # picard's create CreateSequenceDictionary, samtools faidx
73 | process_ref:
74 | mem: 15000
75 | # custom python algo to create intervals
76 | create_db_intervals:
77 | mem: 5000
78 |
79 | create_gvcf_intervals:
80 | mem: 5000
81 |
82 | ## Callable sites workflow
83 |
84 | # genmap map
85 | genmap:
86 | threads: 10
87 | mem: 20000
88 | genmap_sort:
89 | mem: 4000
90 | compute_d4:
91 | mem: 4000
92 | threads: 4
93 | merge_d4:
94 | mem: 10000
95 | callable_bed:
96 | mem: 10000
97 |
98 | ###
99 | # bam2vcf workflows
100 | ###
101 |
102 | # gatk HaplotypeCaller
103 | bam2gvcf:
104 | mem: 4000
105 | # gatk GenomicsDBImport
106 | gvcf2DB:
107 | mem: 4000
108 | # gatk GenotypeGVCFs
109 | DB2vcf:
110 | mem: 4000
111 | # gatk filterVcfs
112 | filterVcfs:
113 | mem: 4000
114 | # gatk GatherVcfs
115 | gatherVcfs:
116 | mem: 4000
117 | # vcftools program
118 | vcftools:
119 | mem: 8000
120 | # plink
121 | plink:
122 | threads: 5
123 | admixture:
124 | mem: 4000
125 |
--------------------------------------------------------------------------------
/workflow/modules/trackhub/html/hub_description.html:
--------------------------------------------------------------------------------
To facilitate downstream data exploration and as an example of the module development components of this work, we developed a module to generate UCSC Genome Browser track files to explore population variation data (see preprint for details).

This track provides windowed estimates of Tajima’s D, a population genetic statistic that measures the departure from neutral evolution in a DNA sequence.

This track displays the density of single nucleotide polymorphisms (SNPs) across the genome, showing regions with high or low levels of genetic variation.

The Pi track represents the average number of nucleotide differences per site between any two sequences in a population, providing an estimate of genetic diversity.

This track shows the frequency of the less common allele at a SNP locus, providing insights into the genetic variation within a population.

The SNP Depth track displays the number of reads or sequencing depth at each SNP position, indicating the coverage and quality of the variant calls.

The Non Callable Sites track highlights regions in the genome that are considered non-callable, meaning that they have low sequencing coverage or other technical limitations that make it difficult to accurately determine genetic variation in those regions.
--------------------------------------------------------------------------------
/.test/ecoli/config/resources.yaml:
--------------------------------------------------------------------------------
1 | ########################################
2 | ## RESOURCES ##
3 | ########################################
4 |
5 | # fastq download
6 | get_fastq_pe:
7 |   threads: 1
8 |   mem: 4000
9 | # compress fastq
10 | gzip_fastq:
11 |   mem: 4000
12 | # fastp program
13 | fastp:
14 |   threads: 1
15 |   mem: 4000
16 | # index reference
17 | index_ref:
18 |   mem: 4000
19 | # genmap map
20 | genmap:
21 |   threads: 1
22 |   mem: 4000
23 | genmap_sort:
24 |   mem: 4000
25 | # bwa mapping
26 | bwa_map:
27 |   threads: 1
28 |   mem: 4000
29 | # sort bam with picard's SortSam tool
30 | sort_bam:
31 |   mem: 4000
32 | # mark duplicates with picard's MarkDuplicates
33 | dedup:
34 |   threads: 1
35 |   mem: 4000
36 | # calculate BAM summaries with samtools and picard
37 | bam_sumstats:
38 |   mem: 4000
39 | merge_bams:
40 |   mem: 4000
41 | bedtools:
42 |   mem: 4000
43 |
44 | ###
45 | # Intervals workflow
46 | ###
47 |
48 | # preprocess genome, create intervals
49 | # picard's create CreateSequenceDictionary, samtools faidx
50 | process_ref:
51 |   mem: 4000
52 | # custom python algo to create intervals
53 | create_db_intervals:
54 |   mem: 5000
55 |
56 | create_gvcf_intervals:
57 |   mem: 5000
58 |
59 | ## Callable sites workflow
60 |
61 | # genmap map
62 | genmap:
63 |   threads: 1
64 |   mem: 4000
65 | genmap_sort:
66 |   mem: 4000
67 | compute_d4:
68 |   mem: 4000
69 |   threads: 1
70 | merge_d4:
71 |   mem: 4000
72 | callable_bed:
73 |   mem: 4000
74 |
75 |
76 | ## Callable sites workflow
77 |
78 | # genmap map
79 | genmap:
80 |   threads: 10
81 |   mem: 10000
82 | genmap_sort:
83 |   mem: 4000
84 | compute_d4:
85 |   mem: 4000
86 |   threads: 4
87 | merge_d4:
88 |   mem: 10000
89 | callable_bed:
90 |   mem: 10000
91 |
92 | ###
93 | # bam2vcf workflows
94 | ###
95 |
96 | # gatk HaplotypeCaller
97 | bam2gvcf:
98 |   mem: 4000
99 | # gatk GenomicsDBImport
100 | gvcf2DB:
101 |   mem: 4000
102 | # gatk GenotypeGVCFs
103 | DB2vcf:
104 |   mem: 4000
105 | ## freebayes program only! ##
106 | bam2vcf:
107 |   mem: 4000
108 | # gatk filterVcfs
109 | filterVcfs:
110 |   mem: 4000
111 | # gatk GatherVcfs
112 | gatherVcfs:
113 |   mem: 4000
114 | # picard SortVcf
115 | sortVcf:
116 |   mem: 4000
117 | # vcftools program
118 | vcftools:
119 |   mem: 4000
120 | # bedtools program
121 | bedtools:
122 |   mem: 4000
123 | # plink
124 | plink:
125 |   threads: 1
126 | admixture:
127 |   mem: 4000
128 |
--------------------------------------------------------------------------------
/workflow/rules/fastq.smk:
--------------------------------------------------------------------------------
1 | rule get_fastq_pe:
2 |     output:
3 |         temp("results/data/fastq/{refGenome}/{sample}/{run}_1.fastq.gz"),
4 |         temp("results/data/fastq/{refGenome}/{sample}/{run}_2.fastq.gz")
5 |     params:
6 |         outdir = os.path.join(DEFAULT_STORAGE_PREFIX, "results/data/fastq/{refGenome}/{sample}/")
7 |     conda:
8 |         "../envs/fastq2bam.yml"
9 |     benchmark:
10 |         "benchmarks/{refGenome}/getfastq/{sample}_{run}.txt"
11 |     resources:
12 |         tmpdir = get_big_temp
13 |     shell:
14 |         """
15 |         set +e
16 |         #delete existing prefetch file in case of previous run failure
17 |         rm -rf {wildcards.run}
18 |         ##attempt to get SRA file from NCBI (prefetch) or ENA (wget)
19 |         prefetch --max-size 1T {wildcards.run}
20 |         prefetchExit=$?
21 |         if [[ $prefetchExit -ne 0 ]]
22 |         then
23 |             ffq --ftp {wildcards.run} | grep -Eo '"url": "[^"]*"' | grep -o '"[^"]*"$' | grep "fastq" | xargs curl --remote-name-all --output-dir {params.outdir}
24 |         else
25 |             fasterq-dump {wildcards.run} -O {params.outdir} -e {threads} -t {resources.tmpdir}
26 |             pigz -p {threads} {params.outdir}{wildcards.run}*.fastq
27 |         fi
28 |         rm -rf {wildcards.run}
29 |         """
30 |
31 | rule sort_reads:
32 |     input:
33 |         unpack(get_reads)
34 |     output:
35 |         r1 = temp("results/{refGenome}/sorted_reads/{sample}/{run}_1.fastq.gz"),
36 |         r2 = temp("results/{refGenome}/sorted_reads/{sample}/{run}_2.fastq.gz"),
37 |     conda:
38 |         "../envs/fastq2bam.yml"
39 |     log:
40 |         "logs/{refGenome}/sort_reads/{sample}/{run}.txt"
41 |     benchmark:
42 |         "benchmarks/{refGenome}/sort_reads/{sample}_{run}.txt"
43 |     shell:
44 |         """
45 |         sortbyname.sh in={input.r1} out={output.r1} &> {log}
46 |         sortbyname.sh in={input.r2} out={output.r2} &>> {log}
47 |         """
48 |
49 | rule fastp:
50 |     input:
51 |         unpack(get_reads_fastp)
52 |     output:
53 |         r1 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_1.fastq.gz",
54 |         r2 = "results/{refGenome}/filtered_fastqs/{sample}/{run}_2.fastq.gz",
55 |         summ = "results/{refGenome}/summary_stats/{sample}/{run}.fastp.out"
56 |     conda:
57 |         "../envs/fastq2bam.yml"
58 |     log:
59 |         "logs/{refGenome}/fastp/{sample}/{run}.txt"
60 |     benchmark:
61 |         "benchmarks/{refGenome}/fastp/{sample}_{run}.txt"
62 |     shell:
63 |         """
64 |         fastp --in1 {input.r1} --in2 {input.r2} \
65 |             --out1 {output.r1} --out2 {output.r2} \
66 |             --thread {threads} \
67 |             --detect_adapter_for_pe \
68 |             -j {output.summ} -h /dev/null \
69 |             &>{log}
70 |         """
--------------------------------------------------------------------------------
/workflow/rules/reference.smk:
--------------------------------------------------------------------------------
1 | ruleorder: download_reference > index_reference
2 | localrules: download_reference
3 |
4 | # This does not work with SLURM as of 4/3/24. See here for more info: https://github.com/snakemake/snakemake-executor-plugin-slurm/issues/60
5 | # rule copy_reference:
6 | #     """Copies user-specified reference genome path to results dir to maintain refGenome wildcard"""
7 | #     input:
8 | #         ref = get_ref
9 | #     output:
10 | #         ref = "results/{refGenome}/data/genome/{refGenome}.fna"
11 | #     log:
12 | #         "logs/{refGenome}/copy_ref/log.txt"
13 | #     shell:
14 | #         #probably don't need to unzip but might as well.
15 | #         """
16 | #         gunzip -c {input.ref} 2> {log} > {output.ref} || cp {input.ref} {output.ref} &> {log}
17 | #         """
18 |
19 | rule download_reference:
20 |     input:
21 |         ref = get_ref
22 |     output:
23 |         ref = "results/{refGenome}/data/genome/{refGenome}.fna"
24 |     params:
25 |         dataset = "results/{refGenome}/data/genome/{refGenome}_dataset.zip",
26 |         outdir = "results/{refGenome}/data/genome/{refGenome}"
27 |     conda:
28 |         "../envs/fastq2bam.yml"
29 |     log:
30 |         "logs/{refGenome}/download_ref/log.txt"
31 |     benchmark:
32 |         "benchmarks/{refGenome}/download_ref/benchmark.txt"
33 |     shell:
34 |         """
35 |         if [ -z "{input.ref}" ] # check if this is empty
36 |         then
37 |             mkdir -p {params.outdir}
38 |             datasets download genome accession {wildcards.refGenome} --include genome --filename {params.dataset} \
39 |             && (7z x {params.dataset} -aoa -o{params.outdir} || unzip -o {params.dataset} -d {params.outdir}) \
40 |             && cat {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/*.fna > {output.ref}
41 |         else
42 |             gunzip -c {input.ref} 2> {log} > {output.ref} || cp {input.ref} {output.ref} &> {log}
43 |         fi
44 |         """
45 |
46 | rule index_reference:
47 |     input:
48 |         ref = "results/{refGenome}/data/genome/{refGenome}.fna"
49 |     output:
50 |         indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb"]),
51 |         fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
52 |         dictf = "results/{refGenome}/data/genome/{refGenome}.dict"
53 |     conda:
54 |         "../envs/fastq2bam.yml"
55 |     log:
56 |         "logs/{refGenome}/index_ref/log.txt"
57 |     benchmark:
58 |         "benchmarks/{refGenome}/index_ref/benchmark.txt"
59 |     shell:
60 |         """
61 |         bwa index {input.ref} 2> {log}
62 |         samtools faidx {input.ref} --output {output.fai} >> {log}
63 |         samtools dict {input.ref} -o {output.dictf} >> {log} 2>&1
64 |         """
65 |
--------------------------------------------------------------------------------
/workflow/scripts/create_coverage_thresholds.py:
--------------------------------------------------------------------------------
1 | from snakemake.script import snakemake
2 | from snakemake.exceptions import WorkflowError
3 | import math
4 |
5 | # read chrom coverage values and compute min/max
6 | cov_thresh = {}
7 | stdv_scale = snakemake.params["cov_threshold_stdev"]
8 | rel_scale = snakemake.params["cov_threshold_rel"]
9 | mean_lower = snakemake.params["cov_threshold_lower"]
10 | mean_upper = snakemake.params["cov_threshold_upper"]
11 |
12 | # check that correct settings are set
13 |
14 | if stdv_scale:
15 |     if rel_scale:
16 |         raise WorkflowError(
17 |             "Both cov_threshold_stdev and cov_threshold_rel are set, please choose one and make sure the other variable is empty in the config file."
18 |         )
19 |     elif mean_lower or mean_upper:
20 |         raise WorkflowError(
21 |             "Both cov_threshold_stdev and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file."
22 |         )
23 | elif rel_scale:
24 |     if mean_lower or mean_upper:
25 |         raise WorkflowError(
26 |             "Both cov_threshold_rel and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file."
27 |         )
28 | elif mean_lower:
29 |     if not mean_upper:
30 |         mean_upper = 50000
31 | elif mean_upper:
32 |     if not mean_lower:
33 |         mean_lower = 1
34 | else:
35 |     raise WorkflowError(
36 |         "Use coverage filter is True, but you did not specify coverage filtering options in the config. Please check."
37 |     )
38 |
39 | with open(snakemake.input["stats"]) as stats:
40 |     for line in stats:
41 |         if "mean" in line:
42 |             continue
43 |
44 |         fields = line.split()
45 |         mean = float(fields[1])
46 |         stdev = math.sqrt(mean)
47 |         # 0 is chr, 1 is mean
48 |         if stdv_scale:
49 |             cov_thresh[fields[0]] = {
50 |                 "low": mean - (stdev * float(stdv_scale)),
51 |                 "high": mean + (stdev * float(stdv_scale)),
52 |             }
53 |         elif rel_scale:
54 |             cov_thresh[fields[0]] = {
55 |                 "low": mean / float(rel_scale),
56 |                 "high": mean * float(rel_scale),
57 |             }
58 |         else:
59 |             cov_thresh[fields[0]] = {
60 |                 "low": float(mean_lower),
61 |                 "high": float(mean_upper),
62 |             }
63 |
64 | # Write the thresholds to a TSV file
65 | with open(snakemake.output[0], "w") as output_file:
66 |     output_file.write("chrom\tmin\tmax\n")  # Header line, if needed
67 |     for chrom, thresholds in cov_thresh.items():
68 |         if chrom == "total":
69 |             continue
70 |         output_file.write(f"{chrom}\t{thresholds['low']}\t{thresholds['high']}\n")
71 |
--------------------------------------------------------------------------------
/workflow/snparcher_utils/__init__.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pathlib import Path
3 |
4 |
5 | try:
6 |     # Snakemake 8.x.x
7 |     from snakemake_interface_common.exceptions import WorkflowError
8 | except ImportError:
9 |     # Snakemake 7.x.x
10 |     from snakemake.exceptions import WorkflowError
11 |
12 | def parse_sample_sheet(config: dict) -> pd.DataFrame:
13 |     samples = (
14 |         pd.read_table(config["samples"], sep=",", dtype=str)
15 |         .replace(" ", "_", regex=True)
16 |         .infer_objects(
17 |             copy=False
18 |         )  # needed to maintain same behavior in future pandas versions
19 |     )
20 |     config_genomes = get_config_genomes(config, samples)
21 |     refGenome = 'refGenome' in samples.columns and samples['refGenome'].notna().any()
22 |     refPath = 'refPath' in samples.columns and samples['refPath'].notna().any()
23 |     if not any([config_genomes, refGenome, refPath]):
24 |         raise WorkflowError("No 'refGenome' or 'refPath' found in config or sample sheet.")
25 |     if config_genomes is not None:
26 |         config_refGenome, config_refPath = config_genomes
27 |         samples["refGenome"] = config_refGenome
28 |         samples["refPath"] = config_refPath
29 |     if 'refPath' in samples.columns and samples['refPath'].notna().any():
30 |         check_ref_paths(samples)
31 |     return samples
32 |
33 | def get_config_genomes(config: dict, samples: pd.DataFrame):
34 |     refGenome = config.get("refGenome", False)
35 |     refPath = config.get("refPath", False)
36 |
37 |     if refGenome and refPath:
38 |         if 'refGenome' in samples.columns and samples['refGenome'].notna().any():
39 |             raise WorkflowError("'refGenome' is set in sample sheet AND in config. These are mutually exclusive.")
40 |         return refGenome, refPath
41 |     elif refGenome and not refPath:
42 |         raise WorkflowError("'refGenome' is set in config, but 'refPath' is not. Both are required to use these settings.")
43 |     elif refPath and not refGenome:
44 |         raise WorkflowError("'refPath' is set in config, but 'refGenome' is not. Both are required to use these settings.")
45 |     return None
46 |
47 | def check_ref_paths(samples: pd.DataFrame) -> None:
48 |     """
49 |     Checks reference paths to make sure they exist, otherwise we might try to download them based on refGenome.
50 |     Also make sure only one refPath per refGenome.
51 |     """
52 |     for refname in samples["refGenome"].dropna().tolist():
53 |         refs = samples[samples["refGenome"] == refname]["refPath"].dropna().unique().tolist()
54 |         if len(refs) > 1:
55 |             raise WorkflowError(f"refGenome '{refname}' has more than one unique 'refPath' specified: {refs}")
56 |         for ref in refs:
57 |             if not Path(ref).exists():
58 |                 raise WorkflowError(f"refPath: '{ref}' was specified in sample sheet, but could not be found.")
59 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # 🚀 snpArcher
2 |
3 |
4 |
5 | Have resequencing data as fastq files and a reference genome? Want a VCF file of genotypes? Use snpArcher as your one-stop shop to quickly and efficiently produce an analysis-ready dataset. No need to hand-tailor a workflow cobbled together with tape and error-ridden chatGPT code: use snpArcher for all your variant calling needs on your laptop, on your server, or up in the clouds.
6 |
7 | snpArcher is a reproducible workflow optimized for nonmodel organisms and comparisons across datasets, built on the [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html#) workflow management system. It provides a streamlined approach to dataset acquisition, variant calling, quality control, and downstream analysis.
8 |
9 | Snakemake makes it easy to bundle together the many steps involved in running a bioinformatics pipeline. The workflow maps reads to a reference genome, calls SNPs per sample with GATK's HaplotypeCaller, and calls variants at the population level with GATK's CombineGVCFs. Each of these steps can be slow and tiresome to run on its own, so the workflow has been carefully designed and tested to maximize efficiency: we use intervals to break jobs into smaller chunks so that time- and memory-hungry steps like HaplotypeCaller run quickly. If you have access to a Sentieon license for accelerated variant calling, we include options for using it.
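For example, a typical local run from the repository root might look like the command below (a minimal sketch mirroring the flags used by this repository's CI tests; the core count and any extra options are placeholders you should adjust, and [executing](./executing.md) has the full details):

```bash
# run the core snpArcher workflow with conda-managed environments and the
# bundled default workflow profile; -j sets the number of cores to use
snakemake -s workflow/Snakefile --use-conda --workflow-profile workflow-profiles/default -j 32
```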
10 |
11 | Finally, the pipeline makes it easy to evaluate how the data looks. Review the HTML file in the QC folder at the end of a run to see how your samples relate to each other and also a number of metrics for evaluating variant-calling quality.
12 |
13 | Remember to examine the config.yaml file to edit options for each step. We have carefully chosen default options that should work for most users, but they can be tweaked there as needed.
14 |
15 | ## Requirements
16 | - Illumina paired-end fastq files for one or more individuals
17 | - A reference genome
18 | - A sample sheet with sample names matched to the read names (see the example below)
19 | - Snakemake and Mamba installed on your system
20 | - If using Google Cloud, you will need to have set up an account on the GCP console
21 |
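For illustration, a minimal paired-end sample sheet (columns as written by `workflow/snparcher_utils/write_samples.py`, here filled in with the bundled E. coli test data; your sample names, reference, and paths will differ) could look like:

```
BioSample,LibraryName,refGenome,refPath,Run,BioProject,fq1,fq2
my_sample1,lib_my_sample1,ecoli,data/local_genome/local_genome.fna.gz,0,NaN,data/local_fastq/my_sample1_1.fastq.gz,data/local_fastq/my_sample1_2.fastq.gz
my_sample2,lib_my_sample2,ecoli,data/local_genome/local_genome.fna.gz,1,NaN,data/local_fastq/my_sample2_1.fastq.gz,data/local_fastq/my_sample2_2.fastq.gz
```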
22 | ## Using snpArcher
23 | - To get started quickly, check out the quick start tutorial!
24 | - Otherwise start [here](./setup.md).
25 |
26 | ## Citing
27 | - Please cite our preprint [here](https://www.biorxiv.org/content/10.1101/2023.06.22.546168v1)
28 | - Also, make sure to cite the tools you used within snpArcher.
29 |
30 | ## Contributing to snpArcher
31 | - If you encounter a bug or want to request a feature, please open an issue on our [GitHub page](https://github.com/harvardinformatics/snpArcher).
32 | - If you'd like to contribute a module, check out our [module contribution guidelines](./modules.md#module-contribution-guidelines).
33 |
34 | ```{toctree}
35 | :hidden: True
36 | ./setup.md
37 | ./executing.md
38 | ./examples.md
39 | ./modules.md
40 | ./datasets.md
41 | ```
42 |
--------------------------------------------------------------------------------
/workflow/rules/sumstats.smk:
--------------------------------------------------------------------------------
1 | rule bam_sumstats:
2 | input:
3 | unpack(get_bams),
4 | ref = "results/{refGenome}/data/genome/{refGenome}.fna",
5 | output:
6 | cov = "results/{refGenome}/summary_stats/{sample}_coverage.txt",
7 | alnSum = "results/{refGenome}/summary_stats/{sample}_AlnSumMets.txt",
8 | conda:
9 | "../envs/fastq2bam.yml"
10 | shell:
11 | """
12 | samtools coverage --output {output.cov} {input.bam}
13 | samtools flagstat -O tsv {input.bam} > {output.alnSum}
14 | """
15 |
16 | rule sentieon_bam_stats:
17 | input:
18 | unpack(get_bams),
19 | indexes = expand("results/{{refGenome}}/data/genome/{{refGenome}}.fna.{ext}", ext=["sa", "pac", "bwt", "ann", "amb", "fai"]),
20 | ref = "results/{refGenome}/data/genome/{refGenome}.fna"
21 | params:
22 | lic = config['sentieon_lic']
23 | output:
24 | insert_file = "results/{refGenome}/summary_stats/{sample}_insert_metrics.txt",
25 | qd = "results/{refGenome}/summary_stats/{sample}_qd_metrics.txt",
26 | gc = "results/{refGenome}/summary_stats/{sample}_gc_metrics.txt",
27 | gc_summary = "results/{refGenome}/summary_stats/{sample}_gc_summary.txt",
28 | mq = "results/{refGenome}/summary_stats/{sample}_mq_metrics.txt"
29 | conda:
30 | "../envs/sentieon.yml"
31 | shell:
32 | """
33 | export SENTIEON_LICENSE={params.lic}
34 | sentieon driver -r {input.ref} \
35 | -t {threads} -i {input.bam} \
36 | --algo MeanQualityByCycle {output.mq} \
37 | --algo QualDistribution {output.qd} \
38 | --algo GCBias --summary {output.gc_summary} {output.gc} \
39 | --algo InsertSizeMetricAlgo {output.insert_file}
40 | """
41 |
42 | rule collect_fastp_stats:
43 | input:
44 | collect_fastp_stats_input
45 | output:
46 | "results/{refGenome}/summary_stats/{sample}_fastp.out"
47 | run:
48 | combine_fastp_files(input, output)
49 |
50 | rule collect_sumstats:
51 | input:
52 | unpack(get_input_sumstats)
53 | output:
54 | "results/{refGenome}/summary_stats/{prefix}_bam_sumstats.txt"
55 | run:
56 | if not config['sentieon']:
57 | FractionReadsPassFilter, NumReadsPassFilter = collectFastpOutput(input.fastpFiles)
58 | aln_metrics = collectAlnSumMets(input.alnSumMetsFiles)
59 | SeqDepths, CoveredBases = collectCoverageMetrics(input.coverageFiles)
60 | printBamSumStats(SeqDepths, CoveredBases, aln_metrics, FractionReadsPassFilter, NumReadsPassFilter, output[0])
61 | else:
62 | FractionReadsPassFilter, NumReadsPassFilter = collectFastpOutput(input.fastpFiles)
63 | aln_metrics = collectAlnSumMets(input.alnSumMetsFiles)
64 | SeqDepths, CoveredBases = collectCoverageMetrics(input.coverageFiles)
65 | median_inserts, median_insert_std = collect_inserts(input.insert_files)
66 | printBamSumStats(SeqDepths, CoveredBases, aln_metrics, FractionReadsPassFilter, NumReadsPassFilter, output[0], median_inserts, median_insert_std)
--------------------------------------------------------------------------------
/.test/qc/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | resource_config: "config/resources.yaml" # path to resources yaml config
7 | final_prefix: "test_qc" # prefix for final output files
8 | intervals: True #Set to True if you want to perform variant calling using interval approach.
9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
10 | sentieon_lic: "" #set to path of sentieon license
11 | remote_reads: False # Set True if reads are in a Google Bucket separate from --default-remote-prefix.
12 | remote_reads_prefix: "" # set to google bucket prefix where reads live
13 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module.
14 | trackhub_email: "hi@website.com"
15 | ##############################
16 | # Variables you *might* need to change
17 | ##############################
18 |
19 | # Interval approach options, only applicable if intervals is True
20 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
21 | num_gvcf_intervals: 1 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps.
22 | db_scatter_factor: 0.15 # Scatter factor for calculating the number of intervals to create for GenomicsDB generation. (scatter_factor * num_samples * num_gvcf_intervals) gives the number of db intervals to create. Recommend < 1.
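# Worked example of the formula above (illustrative numbers, not settings used by this test config):
# with 20 samples, num_gvcf_intervals: 1, and db_scatter_factor: 0.15, the workflow would target
# roughly 0.15 * 20 * 1 = 3 GenomicsDB intervals for GenomicsDBImport.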
23 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper
24 |
25 | ## Coverage options ##
26 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
27 | min_depth: 2
28 | # low coverage options (< 10x)
29 | minP: 1
30 | minD: 1
31 |
32 | # high coverage options (> 10x)
33 | #minP: 2
34 | #minD: 4
35 |
36 | het_prior: 0.005 #prior probability of a heterozygous site; changes the likelihood of a site being called non-ref, but not genotype likelihoods
37 |
38 | ########################################
39 | ## callable sites bed file thresholds ##
40 | ########################################
41 |
42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file
43 | cov_threshold: 2 #regions of the genome with coverage more than cov_threshold standard deviations above or below the chromosome mean will be filtered
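#worked example (illustrative numbers, assuming the mean +/- N*sqrt(mean) rule used in
#workflow/scripts/create_coverage_thresholds.py): with cov_threshold: 2 and a chromosome averaging
#16x, sites are kept roughly between 16 - 2*sqrt(16) = 8x and 16 + 2*sqrt(16) = 24x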
44 |
45 | #this ignores small regions of aberrant coverage/mappability, as these are often just below the threshold
46 | #to do strict filtering, set to 0
47 |
48 | callable_merge: 100 #merge callable regions separated by this or fewer bp into a single region
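#for example, with callable_merge: 100, two callable blocks separated by an 80 bp gap are merged
#into one region, while blocks separated by 150 bp remain separate (illustrative numbers)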
49 |
50 |
51 | ## QC options ##
52 | nClusters: 3
53 | GoogleAPIKey:
54 |
55 | ## Filtering options ##
56 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
57 |
--------------------------------------------------------------------------------
/workflow/modules/mk/Snakefile:
--------------------------------------------------------------------------------
1 | configfile: "config/config.yaml"
2 | include: "common.smk"
3 |
4 |
5 | REFGENOME = samples['refGenome'].unique().tolist()
6 |
7 | rule all:
8 | input:
9 | expand("results/{refGenome}/mk/{prefix}_mk_table.tsv", refGenome=REFGENOME, prefix=config['final_prefix'])
10 |
11 | rule prep_genome:
12 | """
13 | Gets the needed information (fasta and gff) from NCBI
14 | """
15 | input:
16 | ref = get_ref,
17 | gff = get_gff
18 | output:
19 | ref = "results/{refGenome}/mk/{refGenome}.fna",
20 | gff = "results/{refGenome}/mk/{refGenome}.gff"
21 | params:
22 | dataset = "results/{refGenome}/mk/{refGenome}_dataset.zip",
23 | outdir = "results/{refGenome}/mk/{refGenome}"
24 | conda:
25 | "envs/ncbi.yml"
26 | shell:
27 | """
28 | set +e
29 | #if genome is local, datasets will fail, we will just continue
30 | mkdir -p {params.outdir}
31 | datasets download genome accession --exclude-protein --exclude-rna --filename {params.dataset} {wildcards.refGenome} \
32 | && 7z x {params.dataset} -aoa -o{params.outdir}
33 |
34 | if [ -z "{input.ref}" ]
35 | then
36 | cat {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/*.fna > {output.ref}
37 | else
38 | cp {input.ref} {output.ref}
39 | fi
40 |
41 | if [ -z "{input.gff}" ]
42 | then
43 | cp {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/genomic.gff {output.gff}
44 | else
45 | cp {input.gff} {output.gff}
46 | fi
47 | """
48 |
49 | rule split_samples:
50 | """
51 | Splits sample sheet to make ingroup and outgroup files
52 | """
53 | output:
54 | exclude = "results/{refGenome}/mk/{prefix}_exclude.txt",
55 | outgroups = "results/{refGenome}/mk/{prefix}_ougroups.txt"
56 | run:
57 | out_df = samples[["BioSample", "SampleType"]]
58 | out_df.drop_duplicates("BioSample", inplace=True)
59 | exclude =out_df[~out_df.SampleType.isin(["ingroup", "outgroup"])].BioSample
60 | outgroups = out_df[out_df.SampleType.isin(["outgroup"])].BioSample
61 | exclude.to_csv(output[0], index=False, sep="\t", header=False)
62 | outgroups.to_csv(output[1], index=False, sep="\t", header=False)
63 |
64 | rule degenotate:
65 | """
66 | Runs degenotate to compute MK tables
67 | """
68 | input:
69 | vcf = "results/{refGenome}/{prefix}_clean_snps.vcf.gz",
70 | genome = "results/{refGenome}/mk/{refGenome}.fna",
71 | gff = "results/{refGenome}/mk/{refGenome}.gff",
72 | exclude = "results/{refGenome}/mk/{prefix}_exclude.txt",
73 | outgroups = "results/{refGenome}/mk/{prefix}_ougroups.txt"
74 | output:
75 | "results/{refGenome}/mk/{prefix}_mk_table.tsv"
76 | params:
77 | delim = "space"
78 | log:
79 | "logs/{refGenome}/mk/{prefix}_degenotate.txt"
80 | conda:
81 | "envs/mk.yml"
82 | shell:
83 | """
84 | if [ -s {input.exclude} ]
85 | then
86 | degenotate.py --overwrite -a {input.gff} -g {input.genome} -u {input.outgroups} -e {input.exclude} -d {params.delim} -o "results/{wildcards.refGenome}/mk/{wildcards.prefix}_degen_raw" -v {input.vcf}
87 | else
88 | degenotate.py --overwrite -a {input.gff} -g {input.genome} -u {input.outgroups} -d {params.delim} -o "results/{wildcards.refGenome}/mk/{wildcards.prefix}_degen_raw" -v {input.vcf}
89 | fi
90 | cp results/{wildcards.refGenome}/mk/{wildcards.prefix}_degen_raw/mk.tsv {output}
91 | """
92 |
--------------------------------------------------------------------------------
/workflow/rules/intervals.smk:
--------------------------------------------------------------------------------
1 | rule picard_intervals:
2 | input:
3 | ref = "results/{refGenome}/data/genome/{refGenome}.fna",
4 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
5 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict"
6 | output:
7 | intervals = temp("results/{refGenome}/intervals/picard_interval_list.list")
8 | params:
9 | minNmer = int(config['minNmer'])
10 | conda:
11 | '../envs/bam2vcf.yml'
12 | log:
13 | "logs/{refGenome}/picard_intervals/log.txt"
14 | benchmark:
15 | "benchmarks/{refGenome}/picard_intervals/benchmark.txt"
16 | shell:
17 | "picard ScatterIntervalsByNs -Xmx{resources.mem_mb_reduced}m REFERENCE={input.ref} OUTPUT={output.intervals} MAX_TO_MERGE={params.minNmer} OUTPUT_TYPE=ACGT &> {log}\n"
18 |
19 | rule format_interval_list:
20 | input:
21 | intervals = "results/{refGenome}/intervals/picard_interval_list.list"
22 | output:
23 | intervals = "results/{refGenome}/intervals/master_interval_list.list"
24 | run:
25 | with open(output.intervals, "w") as out:
26 | with open(input.intervals, "r") as inp:
27 | for line in inp:
28 | if not line.startswith("@"):
29 | line = line.strip().split("\t")
30 | chrom, start, end = line[0], line[1], line[2]
31 | print(f"{chrom}:{start}-{end}", file=out)
32 |
33 |
34 | checkpoint create_db_intervals:
35 | input:
36 | ref = "results/{refGenome}/data/genome/{refGenome}.fna",
37 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
38 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict",
39 | intervals = "results/{refGenome}/intervals/master_interval_list.list"
40 | output:
41 | fof = "results/{refGenome}/intervals/db_intervals/intervals.txt",
42 | out_dir = directory("results/{refGenome}/intervals/db_intervals"),
43 | params:
44 | max_intervals = get_db_interval_count
45 | log:
46 | "logs/{refGenome}/db_intervals/log.txt"
47 | benchmark:
48 | "benchmarks/{refGenome}/db_intervals/benchmark.txt"
49 | conda:
50 | '../envs/bam2vcf.yml'
51 | shell:
52 | """
53 | gatk SplitIntervals --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' -L {input.intervals} \
54 | -O {output.out_dir} -R {input.ref} -scatter {params} \
55 | -mode INTERVAL_SUBDIVISION \
56 | --interval-merging-rule OVERLAPPING_ONLY &> {log}
57 | ls -l {output.out_dir}/*scattered.interval_list > {output.fof}
58 | """
59 |
60 | checkpoint create_gvcf_intervals:
61 | input:
62 | ref = "results/{refGenome}/data/genome/{refGenome}.fna",
63 | fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
64 | dictf = "results/{refGenome}/data/genome/{refGenome}.dict",
65 | intervals = "results/{refGenome}/intervals/master_interval_list.list"
66 | output:
67 | fof = "results/{refGenome}/intervals/gvcf_intervals/intervals.txt",
68 | out_dir = directory("results/{refGenome}/intervals/gvcf_intervals"),
69 | params:
70 | max_intervals = config["num_gvcf_intervals"]
71 | log:
72 | "logs/{refGenome}/gvcf_intervals/log.txt"
73 | benchmark:
74 | "benchmarks/{refGenome}/gvcf_intervals/benchmark.txt"
75 | conda:
76 | '../envs/bam2vcf.yml'
77 | shell:
78 | """
79 | gatk SplitIntervals --java-options '-Xmx{resources.mem_mb_reduced}m -Xms{resources.mem_mb_reduced}m' -L {input.intervals} \
80 | -O {output.out_dir} -R {input.ref} -scatter {params} \
81 | -mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION \
82 | --interval-merging-rule OVERLAPPING_ONLY &> {log}
83 | ls -l {output.out_dir}/*scattered.interval_list > {output.fof}
84 | """
--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | push:
5 | paths:
6 | - "**/Snakefile"
7 | - "**/*.smk"
8 | - "**/*.py"
9 | - ".github/workflows/*"
10 |
11 |
12 | jobs:
13 |
14 | Testing_local-fastq:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v2
18 | - name: Test workflow (Local Fastq > VCF)
19 | uses: snakemake/snakemake-github-action@v1.25.1
20 | with:
21 | directory: .test/ecoli/
22 | snakefile: workflow/Snakefile
23 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
24 | stagein: "conda config --set channel_priority strict"
25 | Testing_config-genomes:
26 | runs-on: ubuntu-latest
27 | steps:
28 | - uses: actions/checkout@v2
29 | - name: Test workflow (Local Fastq > VCF)
30 | uses: snakemake/snakemake-github-action@v1.25.1
31 | with:
32 | directory: .test/ecoli/
33 | snakefile: workflow/Snakefile
34 | args: "--config samples='config/ecoli_config_genome.csv' refGenome='ecoli' refPath='data/local_genome/local_genome.fna.gz' --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
35 | Testing_local-fastq-sentieon-dryrun:
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v2
39 | - name: Test workflow (Local Fastq > VCF)
40 | uses: snakemake/snakemake-github-action@v1.25.1
41 | with:
42 | directory: .test/ecoli/
43 | snakefile: workflow/Snakefile
44 | args: "--config sentieon=True --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default --dry-run"
45 | stagein: "conda config --set channel_priority strict"
46 | Testing_local-fastq_and_sra:
47 | runs-on: ubuntu-latest
48 | steps:
49 | - uses: actions/checkout@v2
50 | - name: Test workflow (Local Fastq > VCF)
51 | uses: snakemake/snakemake-github-action@v1.25.1
52 | with:
53 | directory: .test/ecoli/
54 | snakefile: workflow/Snakefile
55 | args: "--config samples='config/local_and_sra.csv' --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
56 | Testing_QC:
57 | runs-on: ubuntu-latest
58 | steps:
59 | - uses: actions/checkout@v2
60 | - name: Test workflow (QC rules)
61 | uses: snakemake/snakemake-github-action@v1.25.1
62 | with:
63 | snakefile: workflow/modules/qc/Snakefile
64 | directory: .test/qc/
65 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
66 | # stagein: "mamba install -y -c conda-forge 'python==3.11.4'"
67 | Testing_Postprocess:
68 | runs-on: ubuntu-latest
69 | steps:
70 | - uses: actions/checkout@v2
71 | - name: Test workflow (Postprocess)
72 | uses: snakemake/snakemake-github-action@v1.25.1
73 | with:
74 | snakefile: workflow/modules/postprocess/Snakefile
75 | directory: .test/postprocess/
76 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
77 | # stagein: "mamba install -y -c conda-forge 'python==3.11.4'"
78 | Testing_Trackhub:
79 | runs-on: ubuntu-latest
80 | steps:
81 | - uses: actions/checkout@v2
82 | - name: Test workflow (Trackhubs)
83 | uses: snakemake/snakemake-github-action@v1.25.1
84 | with:
85 | snakefile: workflow/modules/trackhub/Snakefile
86 | directory: .test/trackhub/
87 | args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba --workflow-profile workflow-profiles/default"
88 | # stagein: "mamba install -y -c conda-forge 'python==3.11.4'"
89 |
--------------------------------------------------------------------------------
/workflow/rules/cov_filter.smk:
--------------------------------------------------------------------------------
1 | rule compute_d4:
2 | input:
3 | unpack(get_bams)
4 | output:
5 | dist = "results/{refGenome}/callable_sites/{sample}.mosdepth.global.dist.txt",
6 | d4="results/{refGenome}/callable_sites/{sample}.per-base.d4.gz",
7 | d4gzi ="results/{refGenome}/callable_sites/{sample}.per-base.d4.gz.gzi",
8 | summary="results/{refGenome}/callable_sites/{sample}.mosdepth.summary.txt"
9 | conda:
10 | "../envs/cov_filter.yml"
11 | log:
12 | "logs/{refGenome}/compute_d4/{sample}.txt"
13 | benchmark:
14 | "benchmarks/{refGenome}/compute_d4/{sample}.txt"
15 | params:
16 | prefix = subpath(output.summary, strip_suffix=".mosdepth.summary.txt"),
17 | d4 = subpath(output.d4, strip_suffix=".gz")
18 | shell:
19 | """
20 | mosdepth --d4 -t {threads} {params.prefix} {input.bam} &> {log}
21 | bgzip --index {params.d4}
22 | """
23 |
24 |
25 |
26 | rule collect_covstats:
27 | input:
28 | unpack(get_input_covstats)
29 | output:
30 | "results/{refGenome}/summary_stats/all_cov_sumstats.txt"
31 | run:
32 | covStats = collectCovStats(input.covStatFiles)
33 | with open(output[0], "w") as f:
34 | print("chrom\tmean_cov\tstdev_cov", file=f)
35 | for chrom in covStats:
36 | print(chrom, covStats[chrom]['mean'], covStats[chrom]['stdev'], sep="\t", file=f)
37 |
38 | rule create_cov_thresholds:
39 | input:
40 | stats = "results/{refGenome}/summary_stats/all_cov_sumstats.txt",
41 | output:
42 | thresholds = "results/{refGenome}/callable_sites/{prefix}_callable_sites_thresholds.tsv"
43 |
44 | params:
45 | cov_threshold_stdev = config["cov_threshold_stdev"],
46 | cov_threshold_lower = config["cov_threshold_lower"],
47 | cov_threshold_upper = config["cov_threshold_upper"],
48 | cov_threshold_rel = config["cov_threshold_rel"]
49 | conda:
50 | "../envs/cov_filter.yml"
51 | script:
52 | "../scripts/create_coverage_thresholds.py"
53 |
54 | rule clam_loci:
55 | input:
56 | unpack(get_input_for_coverage),
57 | thresholds = "results/{refGenome}/callable_sites/{prefix}_callable_sites_thresholds.tsv"
58 | output:
59 | cov = "results/{refGenome}/callable_sites/{prefix}/callable_sites.d4",
60 | bed = "results/{refGenome}/callable_sites/{prefix}/callable_sites.bed",
61 | tmp_bed = temp("results/{refGenome}/callable_sites/{prefix}/callable_sites_temp.bed") # temp fix until clam produces better bed files cm
62 | params:
63 | outdir = subpath(output.cov, parent=True)
64 | conda:
65 | "../envs/cov_filter.yml"
66 | log:
67 | "logs/{refGenome}/covbed/{prefix}.txt"
68 | benchmark:
69 | "benchmarks/{refGenome}/covbed/{prefix}_benchmark.txt"
70 | shell:
71 | """
72 | clam loci -t {threads} --bed --thresholds-file {input.thresholds} -o {params.outdir} {input.d4} 2> {log}
73 | bedtk merge {output.bed} > {output.tmp_bed} 2>> {log}
74 | cp {output.tmp_bed} {output.bed} 2>> {log}
75 | """
76 |
77 | rule callable_bed:
78 | input:
79 | cov = "results/{refGenome}/callable_sites/{prefix}/callable_sites.bed",
80 | map = "results/{refGenome}/callable_sites/{prefix}_callable_sites_map.bed"
81 | output:
82 | callable_sites = "results/{refGenome}/{prefix}_callable_sites.bed",
83 | tmp_cov = temp("results/{refGenome}/callable_sites/{prefix}_temp_cov.bed")
84 | conda:
85 | "../envs/cov_filter.yml"
86 | benchmark:
87 | "benchmarks/{refGenome}/callable_bed/{prefix}_benchmark.txt"
88 | params:
89 | merge = config['cov_merge']
90 | shell:
91 | """
92 | bedtools merge -d {params.merge} -i {input.cov} > {output.tmp_cov}
93 | bedtools intersect -a {output.tmp_cov} -b {input.map} | bedtools sort -i - | bedtools merge -i - > {output.callable_sites}
94 | """
95 |
--------------------------------------------------------------------------------
/workflow/scripts/create_coverage_bed.py:
--------------------------------------------------------------------------------
1 | from pyd4 import D4File,D4Builder
2 | from snakemake.exceptions import WorkflowError  # raised by the config checks below
3 | import math
4 | #read chrom coverage values and compute min/max
5 | cov_thresh = {}
6 | stdv_scale = snakemake.params["cov_threshold_stdev"]
7 | rel_scale = snakemake.params["cov_threshold_rel"]
8 | mean_lower = snakemake.params["cov_threshold_lower"]
9 | mean_upper = snakemake.params["cov_threshold_upper"]
10 |
11 | #check that correct settings are set
12 |
13 | if stdv_scale:
14 | if rel_scale:
15 | raise WorkflowError(f"Both cov_threshold_stdev and cov_threshold_rel are set, please choose one and make sure the other variable is empty in the config file.")
16 | elif mean_lower or mean_upper:
17 | raise WorkflowError(f"Both cov_threshold_stdev and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file.")
18 | elif rel_scale:
19 | if mean_lower or mean_upper:
20 | raise WorkflowError(f"Both cov_threshold_rel and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file.")
21 | elif mean_lower:
22 | if not mean_upper:
23 | mean_upper = 50000
24 | elif mean_upper:
25 | if not mean_lower:
26 | mean_lower = 1
27 | else:
28 | raise WorkflowError(f"Use coverage filter is True, but you did not specify coverage filtering options in the config. Please check.")
29 |
30 | with open(snakemake.input["stats"]) as stats:
31 | for line in stats:
32 | if "mean" in line:
33 | continue
34 |
35 | fields=line.split()
36 | mean = float(fields[1])
37 | stdev = math.sqrt(mean)
38 | #0 is chr, 1 is mean
39 | if stdv_scale:
40 | cov_thresh[fields[0]] = {
41 | 'low' : mean - (stdev * float(stdv_scale)),
42 | 'high' : mean + (stdev * float(stdv_scale))
43 | }
44 | elif rel_scale:
45 | cov_thresh[fields[0]] = {
46 | 'low' : mean / float(rel_scale),
47 | 'high' : mean * float(rel_scale)
48 | }
49 | else:
50 | cov_thresh[fields[0]] = {
51 | 'low' : float(mean_lower),
52 | 'high' : float(mean_upper)
53 | }
54 |
55 | #read d4 file into python and open all of its tracks
56 | covfile = D4File(snakemake.input["d4"])
57 | covmat = covfile.open_all_tracks()
58 |
59 | with open(snakemake.output["covbed"], mode='w') as covbed:
60 | good_interval = False
61 | for chrom in covfile.chroms():
62 |
63 | try:
64 | thresh_high = cov_thresh[chrom[0]]['high']
65 | except KeyError:
66 | thresh_high = cov_thresh['total']['high']
67 |
68 | try:
69 | thresh_low = cov_thresh[chrom[0]]['low']
70 | except KeyError:
71 | thresh_low = cov_thresh['total']['low']
72 |
73 | for values in covmat.enumerate_values(chrom[0],0,chrom[1]):
74 | covs=values[2]
75 | pos=values[1]
76 | chr=values[0]
77 | #get mean coverage for window
78 | res1=math.fsum(covs)/len(covs)
79 |
80 | if res1 <= thresh_high and res1 >= thresh_low and good_interval == False:
81 | # we are starting a new interval, print chr and pos
82 | print(chr, pos, file=covbed, sep="\t", end="")
83 | # set good interval to True
84 | good_interval = True
85 | elif (res1 > thresh_high or res1 < thresh_low) and good_interval:
86 | # we are ending a good interval
87 | print("\t", pos, file=covbed, sep="")
88 | good_interval = False
89 | else:
90 | # we are either in a good interval, or in a bad interval, so just keep going
91 | continue
92 | # if at this stage we are in a good interval, that means the good interval goes to the end of the chromosome
93 | if good_interval:
94 | endpos = chrom[1]+1
95 | print("\t", endpos, file=covbed, sep="")
96 | good_interval = False
97 |
98 |
--------------------------------------------------------------------------------
/workflow/snparcher_utils/write_samples.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import shutil
3 | import argparse
4 | from pathlib import Path
5 | from typing import TextIO
6 | # User provides list of sample names, 1 per line
7 | # User provides path to where fastq files are. Assume paired end and that file name has sample name in it
8 | # User provides path to reference genome. Need to copy this to proper path
9 |
10 | def read_sample_list(sample_fh: TextIO) -> list:
11 | return sample_fh.read().splitlines()
12 |
13 | def find_sample_fastqs(samples: list, fastq_dir: Path) -> dict:
14 | """Searches fastq_dir for sample names and associates in a dict"""
15 | sample_fastq_paths = {}
16 | cant_find = []
17 | for samp_name in samples:
18 | fqs = sorted(list(fastq_dir.glob(f"*{samp_name}*"))) # Hoping that sorting will make fq1 first.
19 | if len(fqs) != 2:
20 | cant_find.append(samp_name)
21 | else:
22 | sample_fastq_paths[samp_name] = fqs
23 | return sample_fastq_paths, cant_find
24 |
25 | def copy_reference(ref: Path) -> str:
26 | exts = ['.fna', '.fa', '.fasta']
27 | for ext in exts:
28 | if ext in ref.name:
29 | ref_name = ref.name.split(ext)[0]
30 | if Path('..', 'data', 'genome', ref_name + ".fna").exists():
31 | return ref_name
32 | if not Path("../data/genome").exists():
33 | Path("../data/genome").mkdir(parents=True)
34 | if ref.suffix == ".gz":
35 | with gzip.open(ref, 'rb') as f_in:
36 | with open(Path('..', 'data', 'genome', ref_name + ".fna"), 'wb') as f_out:
37 | shutil.copyfileobj(f_in, f_out)
38 | else:
39 | shutil.copyfile(ref, Path('..', 'data', 'genome', ref_name + ".fna"))
40 | return ref_name
41 |
42 | def write_sample_sheet(sample_dict: dict, ref_name: str, ref_path: str, ncbi_ref: bool) -> None:
43 | """Writes the sample sheet"""
44 | with open(Path("../config", "samples.csv"), "w") as out:
45 | if (ncbi_ref):
46 | out.write("BioSample,LibraryName,refGenome,Run,BioProject,fq1,fq2\n")
47 | for i, (k, v) in enumerate(sample_dict.items()):
48 | out.write(f"{k},lib_{k},{ref_name},{i},NaN,{v[0]},{v[1]}\n")
49 | else:
50 | out.write("BioSample,LibraryName,refGenome,refPath,Run,BioProject,fq1,fq2\n")
51 | for i, (k, v) in enumerate(sample_dict.items()):
52 | out.write(f"{k},lib_{k},{ref_name},{ref_path},{i},NaN,{v[0]},{v[1]}\n")
53 |
54 |
55 | def main() -> None:
56 |
57 | parser = argparse.ArgumentParser(description='Write sample files.')
58 | parser.add_argument('-s', '--sample_list', dest='samp', required=True, help="Specify path to sample list")
59 | parser.add_argument('-f', '--fastq_dir', dest='fastq', required=True, help="Specify path to fastq dir")
60 | parser.add_argument('-c', '--copy', dest='copyref', required=False, default=False, help="Copy reference genome to data/genome dir and unzip.")
61 |
62 | group = parser.add_mutually_exclusive_group(required=True)
63 | group.add_argument('-r', '--ref', dest='ref', help="Specify path to reference genome. Mutually exclusive with -a/--acc.")
64 | group.add_argument('-a', '--acc', dest='acc', help="Specify reference genome accession. Mutually exclusive with -r/--ref")
65 | args = parser.parse_args()
66 |
67 | sample_list = args.samp
68 | fastq_dir = Path(args.fastq)
69 |
70 |
71 | with open(sample_list, "r") as f:
72 | samples = read_sample_list(f)
73 |
74 | sample_dict, cant_find = find_sample_fastqs(samples, fastq_dir)
75 | ncbi_ref = True
76 |
77 | if (args.ref):
78 | ref = Path(args.ref)
79 | ncbi_ref = False
80 | if args.copyref:
81 | ref_name = copy_reference(ref)
82 | ref_path = "../data/genome/" + ref_name + ".fna"
83 | else:
84 | ref_name = ref.stem
85 | ref_path = args.ref
86 | else:
87 | ref_name = args.acc
88 | ref_path = ""
89 |
90 |
91 | write_sample_sheet(sample_dict, ref_name, ref_path, ncbi_ref)
92 |
93 | if cant_find:
94 | print("Couldnt' find fastqs for these files:")
95 | for name in cant_find:
96 | print(name)
97 | if __name__ == "__main__":
98 | main()
99 |
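# Example usage (hypothetical paths, for illustration only):
#   python write_samples.py -s sample_list.txt -f /path/to/fastq_dir -r /path/to/reference.fna.gz -c True
# or, to use an NCBI assembly accession instead of a local reference:
#   python write_samples.py -s sample_list.txt -f /path/to/fastq_dir -a <accession>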
--------------------------------------------------------------------------------
/.test/qc/config/test_qc_gls_config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | tmp_dir: "tmp/" # directory path for a temp dir
7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds.
8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
9 | sentieon_lic: "" #set to path of sentieon license
10 | remote_reads: False # set if you want reads to be on google cloud storage remote
11 | remote_reads_prefix: "" # set to google bucket name where reads live
12 | ##############################
13 | # Variables you *might* need to change
14 | ##############################
15 |
16 | # if using the BAM -> VCF workflows
17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. If you use the fastq->BAM workflow above, simply keep the default value
20 |
21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome
22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds.
23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point
25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here
26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good.
27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes; sites with depth greater than this are ignored, so change it according to expected depth
28 |
29 | ## Coverage options ##
30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
31 |
32 | # low coverage options (< 10x)
33 | minP: 1
34 | minD: 1
35 |
36 | # high coverage options (> 10x)
37 | #minP: 2
38 | #minD: 4
39 |
40 | ## QC options ##
41 | nClusters: 3
42 |
43 | ##############################
44 | # Variables you DO NOT need to change
45 | # if you do, ensure all path/directory names are followed by a "/"
46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name!
47 | ##############################
48 |
49 | output: "results/"
50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded
51 | refGenomeDir: "data/genome/" # where reference genomes go
52 | fastq2bamDir: "fastq2bam/"
53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads
54 | bamDir: "01_mappedReads/" # new directory created for BAM files
55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics
56 |
57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!)
58 | gatkDir: "gatk/"
59 | gvcfDir: "03_gvcfs/"
60 | dbDir: "04_genomicsDB/"
61 | vcfDir_gatk: "05_vcfs/"
62 | qcDir: "06_QC/"
63 | intDir: "intervalFiles/"
64 |
65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!)
66 | fbDir: "freebayes/"
67 | intervalDir: "00_intervals/"
68 | vcfDir_fb: "01_vcfs_byinterval/"
69 |
70 | #information for read groups
71 | flowcell: "FLOWCELL"
72 | platform: "ILLUMINA"
73 |
74 | cluster_config: "profiles/slurm/cluster_config.yml"
75 | test_qc: True
76 |
--------------------------------------------------------------------------------
/.test/postprocess/config/test_qc_gls_config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | tmp_dir: "tmp/" # directory path for a temp dir
7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds.
8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
9 | sentieon_lic: "" #set to path of sentieon license
10 | remote_reads: False # set if you want reads to be on google cloud storage remote
11 | remote_reads_prefix: "" # set to google bucket name where reads live
12 | ##############################
13 | # Variables you *might* need to change
14 | ##############################
15 |
16 | # if using the BAM -> VCF workflows
17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. If you use the fastq->BAM workflow above, simply keep the default value
20 |
21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome
22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds.
23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point
25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here
26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good.
27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes; sites with depth greater than this are ignored, so change it according to expected depth
28 |
29 | ## Coverage options ##
30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
31 |
32 | # low coverage options (< 10x)
33 | minP: 1
34 | minD: 1
35 |
36 | # high coverage options (> 10x)
37 | #minP: 2
38 | #minD: 4
39 |
40 | ## QC options ##
41 | nClusters: 3
42 |
43 | ##############################
44 | # Variables you DO NOT need to change
45 | # if you do, ensure all path/directory names are followed by a "/"
46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name!
47 | ##############################
48 |
49 | output: "results/"
50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded
51 | refGenomeDir: "data/genome/" # where reference genomes go
52 | fastq2bamDir: "fastq2bam/"
53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads
54 | bamDir: "01_mappedReads/" # new directory created for BAM files
55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics
56 |
57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!)
58 | gatkDir: "gatk/"
59 | gvcfDir: "03_gvcfs/"
60 | dbDir: "04_genomicsDB/"
61 | vcfDir_gatk: "05_vcfs/"
62 | qcDir: "06_QC/"
63 | intDir: "intervalFiles/"
64 |
65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!)
66 | fbDir: "freebayes/"
67 | intervalDir: "00_intervals/"
68 | vcfDir_fb: "01_vcfs_byinterval/"
69 |
70 | #information for read groups
71 | flowcell: "FLOWCELL"
72 | platform: "ILLUMINA"
73 |
74 | cluster_config: "profiles/slurm/cluster_config.yml"
75 | test_qc: True
76 |
--------------------------------------------------------------------------------
/.test/trackhub/config/test_qc_gls_config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | tmp_dir: "tmp/" # directory path for a temp dir
7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds.
8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
9 | sentieon_lic: "" #set to path of sentieon license
10 | remote_reads: False # set if you want reads to be on google cloud storage remote
11 | remote_reads_prefix: "" # set to google bucket name where reads live
12 | ##############################
13 | # Variables you *might* need to change
14 | ##############################
15 |
16 | # if using the BAM -> VCF workflows
17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. If you use the fastq->BAM workflow above, simply keep the default value
20 |
21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome
22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds.
23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point
25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here
26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good.
27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes; sites with depth greater than this are ignored, change according to expected depth
28 |
29 | ## Coverage options ##
30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
31 |
32 | # low coverage options (< 10x)
33 | minP: 1
34 | minD: 1
35 |
36 | # high coverage options (> 10x)
37 | #minP: 2
38 | #minD: 4
39 |
40 | ## QC options ##
41 | nClusters: 3
42 |
43 | ##############################
44 | # Variables you DO NOT need to change
45 | # if you do, ensure all path/directory names are followed by a "/"
46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name!
47 | ##############################
48 |
49 | output: "results/"
50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded
51 | refGenomeDir: "data/genome/" # where reference genomes go
52 | fastq2bamDir: "fastq2bam/"
53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads
54 | bamDir: "01_mappedReads/" # new directory created for BAM files
55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics
56 |
57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!)
58 | gatkDir: "gatk/"
59 | gvcfDir: "03_gvcfs/"
60 | dbDir: "04_genomicsDB/"
61 | vcfDir_gatk: "05_vcfs/"
62 | qcDir: "06_QC/"
63 | intDir: "intervalFiles/"
64 |
65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!)
66 | fbDir: "freebayes/"
67 | intervalDir: "00_intervals/"
68 | vcfDir_fb: "01_vcfs_byinterval/"
69 |
70 | #information for read groups
71 | flowcell: "FLOWCELL"
72 | platform: "ILLUMINA"
73 |
74 | cluster_config: "profiles/slurm/cluster_config.yml"
75 | test_qc: True
76 |
--------------------------------------------------------------------------------
/workflow/modules/qc/config/test_qc_gls_config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | tmp_dir: "tmp/" # directory path for a temp dir
7 | split_by_n: True #set to False to split by chromosome/scaffold; set to True to split on runs of Ns within chromosomes/scaffolds.
8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
9 | sentieon_lic: "" #set to path of sentieon license
10 | remote_reads: False # set if you want reads to be on google cloud storage remote
11 | remote_reads_prefix: "" # set to google bucket name where reads live
12 | ##############################
13 | # Variables you *might* need to change
14 | ##############################
15 |
16 | # if using the BAM -> VCF workflows
17 | bamsForGatk: "fastq2bam/01_mappedReads/" # directory containing BAMs for GATK. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
18 | bamsForFB: "fastq2bam/01_mappedReads/" # directory containing BAMs for Freebayes. If you used the fastq -> bam workflow above, simply keep the default value; must be followed by a "/"
19 | bam_suffix: "_final.bam" # the suffix for your BAM files that follow all the sample names. If you use the fastq->BAM workflow above, simply keep the default value
20 |
21 | # These parameters control how the genome gets split into intervals according to Nmers in the reference genome
22 | maxNumIntervals: 10 # the maximum number of intervals when split_by_n is False. If your reference genome has hundreds of scaffolds, it can be helpful to set this to less than number of scaffolds.
23 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
24 | maxIntervalLen: 15000000 # the desired maximum size of an interval for calling variants; more than 2Mb is a good starting point
25 | maxBpPerList: 15000000 # the desired maximum number of bp per list file for GATK4; list files potentially contain many small intervals, and we cap the fraction of the genome contained in each list file here
26 | maxIntervalsPerList: 200 # the desired maximum number of intervals per list file for GATK4; this prevents list files from containing thousands of small intervals, which can slow parts of GATK4. Default is good.
27 | maxDP_fb: 200 # this is the maximum depth parameter used for freebayes; sites with depth greater than this are ignored, change according to expected depth
28 |
29 | ## Coverage options ##
30 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
31 |
32 | # low coverage options (< 10x)
33 | minP: 1
34 | minD: 1
35 |
36 | # high coverage options (> 10x)
37 | #minP: 2
38 | #minD: 4
39 |
40 | ## QC options ##
41 | nClusters: 3
42 |
43 | ##############################
44 | # Variables you DO NOT need to change
45 | # if you do, ensure all path/directory names are followed by a "/"
46 | # These variables control the output of the fastq2bam workflow. You don't need to change these, but if you do please have a forward slash follow name!
47 | ##############################
48 |
49 | output: "results/"
50 | fastqDir: "data/fastq/" # this is where raw fastqs are downloaded
51 | refGenomeDir: "data/genome/" # where reference genomes go
52 | fastq2bamDir: "fastq2bam/"
53 | fastqFilterDir: "00_fastqFiltered/" # new directory created for filtered fastq reads
54 | bamDir: "01_mappedReads/" # new directory created for BAM files
55 | sumstatDir: "02_bamSumstats/" # new directory created for BAM summary statistics
56 |
57 | # These variables control the output of the GATK4 workflow (please have forward slash follow name!)
58 | gatkDir: "gatk/"
59 | gvcfDir: "03_gvcfs/"
60 | dbDir: "04_genomicsDB/"
61 | vcfDir_gatk: "05_vcfs/"
62 | qcDir: "06_QC/"
63 | intDir: "intervalFiles/"
64 |
65 | # These variables control the output of the FreeBayes workflow (please have forward slash follow name!)
66 | fbDir: "freebayes/"
67 | intervalDir: "00_intervals/"
68 | vcfDir_fb: "01_vcfs_byinterval/"
69 |
70 | #information for read groups
71 | flowcell: "FLOWCELL"
72 | platform: "ILLUMINA"
73 |
74 | cluster_config: "profiles/slurm/cluster_config.yml"
75 | test_qc: True
76 |
--------------------------------------------------------------------------------
/docs/modules.md:
--------------------------------------------------------------------------------
1 | # Modules
2 | A key goal in the design of the snpArcher pipeline is to allow seamless extensibility with downstream processing. We implement this using Snakemake modules, which allow additional rules to easily extend the main pipeline. We present several modular extensions of snpArcher here, but we also hope that user-developed modules will grow the set of tools linked to snpArcher and facilitate diverse analyses.
3 | ## Module Contribution Guidelines
4 | We developed a set of criteria for including additional user-developed modules into snpArcher. This project is designed to be modular and easily extensible as we and workflow users develop additional features and downstream analysis tools. To ensure that contributed modules are reproducible and easily implemented, we propose the following evaluation criteria:
5 |
6 | 1. Each module must include a Snakemake workflow that defines its necessary environments using Conda.
7 | 2. The module must be freely distributed via GitHub with sufficient documentation that users can adapt it to their needs.
8 | 3. The module must provide a unit test based either on existing test datasets available from the main snpArcher workflow or on a module-specific minimal test dataset.
9 | 4. Each module should be registered within the main project page to enhance discoverability and ensure the above criteria are met.
10 |
11 | If you are interested in developing a module, please reach out via email or GitHub; we'd love to hear about it and chat.
12 | ## Quality Control
13 | The quality control module aggregates various statistics from the workflow and produces preliminary analyses and plots in an interactive HTML file, offering visualizations of summary statistics related to population structure, batch effects, sequencing depth, genetic relatedness, geography, and admixture. Most summaries are based on a random sample of 100,000 SNPs, while others provide high-level summaries of the full variant dataset. These visualizations help identify outliers, potential biases, and sequencing artifacts, such as cryptic genetic variation, batch effects, and reference bias. Additionally, an interactive heatmap aids in quickly identifying close relatives within the dataset, and spatial maps provide a visualization of PCA clusters in space.
14 | ### Config Options
15 | | Option | Description | Type |
16 | | ---- | -------------| ------ |
17 | |`nClusters`| Number of clusters for PCA| `int`|
18 | |`GoogleAPIKey`| Google Maps API key (optional).| `str`|
19 | |`min_depth`| Samples with average depth below this will be excluded from QC analysis| `int`|
20 |
21 | ```{note}
22 | To generate the QC dashboard, you must have at least 3 samples specified in your sample sheet.
23 | ```
24 | ```{note}
25 | The output of the QC module should not be considered a final analysis and is solely intended to direct quality control of the dataset.
26 | ```
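For intuition about what `nClusters` controls, here is a rough, illustrative sketch (not the module's implementation, which renders an R dashboard): subsample sites from a genotype matrix, run a PCA, and partition samples into `nClusters` groups. The genotype matrix and all variable names below are hypothetical.
```
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

rng = np.random.default_rng(42)

# Hypothetical genotype matrix: rows = samples, columns = SNP sites coded 0/1/2.
genotypes = rng.integers(0, 3, size=(12, 200_000)).astype(float)

# Most dashboard summaries use a random subsample of sites (100,000 in the workflow).
sites = rng.choice(genotypes.shape[1], size=50_000, replace=False)
subset = genotypes[:, sites]

# PCA of the subsampled genotypes, then k-means into nClusters groups (config default: 3).
pcs = PCA(n_components=2).fit_transform(subset)
clusters = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(pcs)
print(clusters)
```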
27 | ## Postprocessing
28 | The postprocessing module is designed to be run after snpArcher has initially been run and you have determined whether there are samples you would like to exclude from downstream analyses. To trigger this module, add the `SampleType` column to your sample sheet and mark samples for inclusion with the value `include` and for exclusion with the value `exclude`.
29 |
30 | This module produces a filtered VCF by removing excluded samples as well as sites that do not pass the default and user-defined filters.
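A minimal sketch of how the `SampleType` column drives sample selection, mirroring the pandas logic in the module's `filter_individuals` rule; the sample sheet below is hypothetical:
```
import io

import pandas as pd

# Hypothetical sample sheet with the SampleType column that triggers the module.
sheet = io.StringIO(
    "BioSample,Run,SampleType\n"
    "samp1,SRR001,include\n"
    "samp2,SRR002,include\n"
    "samp3,SRR003,exclude\n"
)
samples = pd.read_csv(sheet)

# Keep one row per BioSample and drop anything marked "exclude"; this is the list the
# module writes to a *_samps.txt file and passes to bcftools view -S.
keep = (
    samples[["BioSample", "SampleType"]]
    .drop_duplicates("BioSample")
    .loc[lambda df: ~df.SampleType.isin(["exclude"]), "BioSample"]
    .dropna()
)
print(keep.tolist())  # ['samp1', 'samp2']
```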
31 | ### Config Options
32 | | Option | Description | Type |
33 | | ---- | -------------| ------ |
34 | |`contig_size`| SNPs on contigs this size or smaller will be excluded from 'clean' VCF | `int`|
35 | |`maf`| SNPs with MAF below this will be excluded from clean VCF| `float`|
36 | |`missingness`| SNPs with missingness below this will be excluded from clean VCF| `float`|
37 | |`scaffolds_to_exclude`| Comma-separated (no spaces) list of scaffolds/contigs to exclude from clean VCF| `str`|
38 |
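For intuition, `maf` and `missingness` end up in a bcftools exclusion expression in the module's `strict_filter` rule, with the upper allele-frequency bound derived as 1 - maf. A small sketch of that arithmetic, using the defaults from the example configs:
```
# Defaults from the example configs; both values are user-configurable.
maf = 0.01
missingness = 0.75

# Sites matching this bcftools -e expression are dropped from the clean VCFs.
upper_bound = 1 - float(maf)
expression = f"F_MISSING > {missingness} | AF<{maf} | AF>{upper_bound}"
print(expression)  # F_MISSING > 0.75 | AF<0.01 | AF>0.99
```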
39 | ```{hint}
40 | If you'd like to run the postprocessing module by default, you can add the `SampleType` column in your sample sheet, and mark all samples as `include`.
41 | ```
42 | ## Trackhubs
43 | The trackhub module generates UCSC Genome Browser track files to explore population variation data from the VCF produced by snpArcher. It computes genome browser tracks for traditional population genomic summary statistics such as windowed estimates of Tajima’s D, SNP density, Pi, Minor Allele Frequency, and SNP depth. To trigger this module, you must set the `generate_trackhub` [config](./setup.md#core-configuration) option to `True` and supply an email address (a requirement for tracks displayed on the UCSC Genome Browser).
44 |
45 | ```{warning}
46 | The Trackhubs module is dependent on the postprocessing module.
47 | ```
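The windowed statistics are emitted as one bigWig per statistic and window size. The sketch below mimics the track naming and default visibility used by the module's `write_hub_files.py` script, with window sizes taken from the example config's `trackhub_windows` option; only the 1,000 bp window is shown by default.
```
def human_format(num):
    """Compact label for a window size, e.g. 1000 -> '1K' (as in write_hub_files.py)."""
    num = float(f"{num:.3g}")
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return f"{num:f}".rstrip("0").rstrip(".") + ["", "K", "M", "B", "T"][magnitude]

# Statistics and window sizes from the module; each combination becomes a subtrack.
for stat in ["Tajima", "SNP-Density", "Pi"]:
    for window in [1000, 10000, 100000]:
        track_name = f"{stat}_{human_format(window)}_bp_bins"  # track name in the hub file
        data_url = f"{stat}_{window}.bw"                       # bigWig the track points at
        visibility = "full" if window == 1000 else "hide"      # only 1 kb windows shown by default
        print(track_name, data_url, visibility)
```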
48 |
--------------------------------------------------------------------------------
/.test/postprocess/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | resource_config: "config/resources.yaml" # path to resources yaml config
7 | final_prefix: "test_postprocess" # prefix for final output files
8 | intervals: True #Set to True if you want to perform variant calling using interval approach.
9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
10 | sentieon_lic: "" #set to path of sentieon license
11 | remote_reads: False # Set True if reads are in a Google Bucket separate from --default-remote-prefix.
12 | remote_reads_prefix: "" # set to google bucket prefix where reads live
13 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module.
14 | trackhub_email: "hi@website.com"
15 | ##############################
16 | # Variables you *might* need to change
17 | ##############################
18 |
19 | # Interval approach options, only applicable if intervals is True
20 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
21 | num_gvcf_intervals: 1 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps.
22 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Recommend <1
23 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper
24 |
25 | ## Coverage options ##
26 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
27 | min_depth: 2
28 | # low coverage options (< 10x)
29 | minP: 1
30 | minD: 1
31 |
32 | # high coverage options (> 10x)
33 | #minP: 2
34 | #minD: 4
35 |
36 | het_prior: 0.005 #prior probability of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods
37 |
38 | ########################################
39 | ## callable sites bed file thresholds ##
40 | ########################################
41 |
42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file
43 | cov_threshold: 2 #regions of the genome with coverage above/below cov_thresh standard deviations will be filtered
44 |
45 | #this ignores small regions of aberrant coverage/mappability as often these are just below the threshold
46 | #to do strict filtering, set to 0
47 |
48 | callable_merge: 100 #merge callable regions separated by this or fewer bp into a single region
49 |
50 |
51 | ## QC options ##
52 | nClusters: 3
53 | GoogleAPIKey:
54 | min_depth: 2
55 |
56 | ## Filtering options ##
57 |
58 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable.
59 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable.
60 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable.
61 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
62 |
63 | ########################################
64 | ## coverage thresholds ##
65 | ########################################
66 |
67 | ## If cov_filter is True, use these parameters to control how coverage filtering is done
68 | ## Three options are provided for coverage-based filtering. The default option is to just filter
69 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit
70 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit,
71 | ## and make sure all other coverage variables are empty
72 |
73 | cov_threshold_lower: 1
74 | cov_threshold_upper: 10000
75 |
76 | ## Alternatively, filtering can be done based on standard deviations
77 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage),
78 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed.
79 | ## To use this option, set the variables below to the desired X
80 | ## and make sure all other coverage variables are empty
81 |
82 | cov_threshold_stdev:
83 |
84 | ## Finally, filtering can be done based on absolute scaling of the mean,
85 | ## where regions of the genome with mean coverage < (global mean coverage / X) or > (global mean coverage * X) are removed.
86 | ## To use this option, set the variable below to the desired X
87 | ## and make sure all other coverage variables are empty
88 |
89 | cov_threshold_rel:
90 |
91 |
--------------------------------------------------------------------------------
/.test/trackhub/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/test_coords.csv" # name of the sample metadata CSV
6 | resource_config: "config/resources.yaml" # path to resources yaml config
7 | final_prefix: "test_postprocess" # prefix for final output files
8 | intervals: True #Set to True if you want to perform variant calling using interval approach.
9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
10 | sentieon_lic: "" #set to path of sentieon license
11 | remote_reads: False # Set True if reads are in a Google Bucket separate from --default-remote-prefix.
12 | remote_reads_prefix: "" # set to google bucket prefix where reads live
13 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module.
14 | trackhub_email: "hi@website.com"
15 | ##############################
16 | # Variables you *might* need to change
17 | ##############################
18 |
19 | # Interval approach options, only applicable if intervals is True
20 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
21 | num_gvcf_intervals: 1 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps.
22 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Recommend <1
23 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper
24 |
25 | ## Coverage options ##
26 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
27 | min_depth: 2
28 | # low coverage options (< 10x)
29 | minP: 1
30 | minD: 1
31 |
32 | # high coverage options (> 10x)
33 | #minP: 2
34 | #minD: 4
35 |
36 | het_prior: 0.005 #prior probability of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods
37 |
38 | ########################################
39 | ## callable sites bed file thresholds ##
40 | ########################################
41 |
42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file
43 | cov_threshold: 2 #regions of the genome with coverage above/below cov_thresh standard deviations will be filtered
44 |
45 | #this ignores small regions of aberrant coverage/mappability as often these are just below the threshold
46 | #to do strict filtering, set to 0
47 |
48 | callable_merge: 100 #merge callable regions separated by this or fewer bp into a single region
49 |
50 |
51 | ## QC options ##
52 | nClusters: 3
53 | GoogleAPIKey:
54 | min_depth: 2
55 |
56 | ## Filtering options ##
57 |
58 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable.
59 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable.
60 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable.
61 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
62 |
63 | ########################################
64 | ## coverage thresholds ##
65 | ########################################
66 |
67 | ## If cov_filter is True, use these parameters to control how coverage filtering is done
68 | ## Three options are provided for coverage-based filtering. The default option is to just filter
69 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit
70 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit,
71 | ## and make sure all other coverage variables are empty
72 |
73 | cov_threshold_lower: 1
74 | cov_threshold_upper: 10000
75 |
76 | ## Alternatively, filtering can be done based on standard deviations
77 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage),
78 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed.
79 | ## To use this option, set the variables below to the desired X
80 | ## and make sure all other coverage variables are empty
81 |
82 | cov_threshold_stdev:
83 |
84 | ## Finally, filtering can be done based on absolute scaling of the mean,
85 | ## where regions of the genome with mean coverage < (global mean coverage / X) or > (global mean coverage * X) are removed.
86 | ## To use this option, set the variable below to the desired X
87 | ## and make sure all other coverage variables are empty
88 |
89 | cov_threshold_rel:
90 |
91 |
92 |
93 | ###TRACKHUB###
94 | trackhub_windows: [1000, 10000, 100000]
--------------------------------------------------------------------------------
/.test/ecoli/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/ecoli_samples.csv" # name of the sample metadata CSV
6 | resource_config: "config/resources.yaml" # path to resources yaml config
7 | final_prefix: "ecoli_test" # prefix for final output files
8 | intervals: True #Set to True if you want to perform variant calling using interval (split by ns) approach.
9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
10 | sentieon_lic: ".lic" #set to path of sentieon license
11 | remote_reads: False # set if you want reads to be on google cloud storage remote
12 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty
13 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only)
14 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module.
15 | trackhub_email: "hi@website.com"
16 | mark_duplicates: True
17 | sort_reads: False
18 | ##############################
19 | # Variables you *might* need to change
20 | ##############################
21 |
22 | #refGenome:
23 | #refPath:
24 |
25 | # Interval approach options, only applicable if intervals is True
26 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
27 | num_gvcf_intervals: 3 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps.
28 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Recommend <1
29 | ploidy: 1 # Ploidy for HaplotypeCaller and Sentieon Haplotyper
30 | ## Coverage options ##
31 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
32 |
33 | # low coverage options (< 10x)
34 | minP: 1
35 | minD: 1
36 |
37 | # high coverage options (> 10x)
38 | #minP: 2
39 | #minD: 4
40 |
41 | het_prior: 0.005 #prior probability of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods
42 |
43 | ## callable sites bed file options ##
44 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file
45 | mappability_k: 150 #the kmer used to compute mappability with genmap; you should not need to change this except in special cases
46 |
47 | #this ignores small regions of aberrant coverage/mappability as often these are just below the threshold
48 | #to do strict filtering, set to 0
49 |
50 | mappability_merge: 100 #merge passing mappability regions separated by this or fewer bp into a single region
51 | cov_merge: 100 #merge passing coverage regions separated by this or fewer bp into a single region
52 |
53 | ## QC options ##
54 | nClusters: 3
55 | GoogleAPIKey:
56 | min_depth: 2
57 |
58 | ## Filtering options ##
59 |
60 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable.
61 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable.
62 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable.
63 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
64 |
65 | ########################################
66 | ## coverage thresholds ##
67 | ########################################
68 |
69 | ## If cov_filter is True, use these parameters to control how coverage filtering is done
70 | ## Three options are provided for coverage-based filtering.
71 |
72 | ## The default option is to filter
73 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit.
74 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit,
75 | ## and make sure all other coverage variables are empty.
76 |
77 | cov_threshold_lower: 1
78 | cov_threshold_upper: 50000
79 |
80 | ## Alternatively, filtering can be done based on standard deviations
81 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage),
82 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed.
83 | ## To use this option, set the variables below to the desired X
84 | ## and make sure all other coverage variables are empty
85 |
86 | cov_threshold_stdev:
87 |
88 | ## Finally, filtering can be done based on absolute scaling of the mean,
89 | ## where regions of the genome with mean coverage < (global mean coverage / X) or > (global mean coverage * X) are removed.
90 | ## To use this option, set the variable below to the desired X
91 | ## and make sure all other coverage variables are empty
92 |
93 | cov_threshold_rel:
94 |
95 |
96 |
--------------------------------------------------------------------------------
/.test/ci/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/samples.csv" # path to the sample metadata CSV
6 | resource_config: "config/resources.yaml" # path to resources yaml config
7 | final_prefix: "test_ci" # prefix for final output files
8 | intervals: True #Set to True if you want to perform variant calling using interval approach.
9 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
10 | sentieon_lic: "" #set to path of sentieon license
11 | remote_reads: False # Set True if reads are in a Google Bucket separate from --default-remote-prefix.
12 | remote_reads_prefix: "" # set to google bucket prefix where reads live
13 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty
14 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only)
15 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module.
16 | trackhub_email: "hi@website.com"
17 | mark_duplicates: True
18 | sort_reads: False
19 | ##############################
20 | # Variables you *might* need to change
21 | ##############################
22 |
23 | # Interval approach options, only applicable if intervals is True
24 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
25 | num_gvcf_intervals: 50 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps.
26 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Recommend <1
27 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper
28 | ## Coverage options ##
29 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
30 |
31 | # low coverage options (< 10x)
32 | minP: 1
33 | minD: 1
34 |
35 | # high coverage options (> 10x)
36 | #minP: 2
37 | #minD: 4
38 |
39 | het_prior: 0.005 #prior probability of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods
40 |
41 | ## callable sites bed file options ##
42 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file
43 | mappability_k: 150 #the kmer used to compute mappability with genmap; you should not need to change this except in special cases
44 |
45 | #this ignores small regions of aberrant coverage/mappability as often these are just below the threshold
46 | #to do strict filtering, set to 0
47 |
48 | mappability_merge: 100 #merge passing mappability regions separated by this or fewer bp into a single region
49 | cov_merge: 100 #merge passing coverage regions separated by this or fewer bp into a single region
50 |
51 | ## QC options ##
52 | nClusters: 3
53 | GoogleAPIKey:
54 | min_depth: 2
55 |
56 | ## Filtering options ##
57 |
58 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable.
59 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable.
60 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable.
61 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
62 |
63 | ########################################
64 | ## coverage thresholds ##
65 | ########################################
66 |
67 | ## If cov_filter is True, use these parameters to control how coverage filtering is done
68 | ## Three options are provided for coverage-based filtering. The default option is to just filter
69 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit
70 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit,
71 | ## and make sure all other coverage variables are empty
72 |
73 | cov_threshold_lower: 1
74 | cov_threshold_upper: 10000
75 |
76 | ## Alternatively, filtering can be done based on standard deviations
77 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage),
78 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed.
79 | ## To use this option, set the variables below to the desired X
80 | ## and make sure all other coverage variables are empty
81 |
82 | cov_threshold_stdev:
83 |
84 | ## Finally, filtering can be done based on absolute scaling of the mean,
85 | ## where regions of the genome with mean coverage < (global mean coverage / X) or > (global mean coverage * X) are removed.
86 | ## To use this option, set the variable below to the desired X
87 | ## and make sure all other coverage variables are empty
88 |
89 | cov_threshold_rel:
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/config/config.yaml:
--------------------------------------------------------------------------------
1 | ##############################
2 | # Variables you need to change
3 | ##############################
4 |
5 | samples: "config/samples.csv" # path to the sample metadata CSV
6 | final_prefix: "" # prefix for final output files
7 | intervals: True #Set to True if you want to perform variant calling using interval approach.
8 | sentieon: False #set to True if you want to use sentieon, False if you want GATK
9 | sentieon_lic: "" #set to path of sentieon license
10 | remote_reads: False # Set True if reads are in a location separate from --default-remote-prefix.
11 | remote_reads_prefix: "" # set to google bucket prefix where reads live. FOR SNAKEMAKE 7.X.X ONLY.
12 | bigtmp: "" #Set to a path with lots of free space to use for commands that require large amounts of temp space; defaults to system tmpdir if empty
13 | cov_filter: True #set to True if you want to include coverage thresholds in the callable sites bed file (default uses mappability only)
14 | generate_trackhub: True #Set to true if you want to generate a Genome Browser Trackhub. Dependent on postprocessing module.
15 | trackhub_email: ""
16 | mark_duplicates: True
17 | sort_reads: False
18 | ##############################
19 | # Variables you *might* need to change
20 | ##############################
21 |
22 | # Set reference genome here if you would like to you use the same reference genome for all samples in sample sheet. See docs for more info.
23 | # refGenome:
24 | # refPath:
25 |
26 | # Interval approach options, only applicable if intervals is True
27 | minNmer: 500 # the minimum Nmer used to split up the genome; e.g. a value of 200 means only Nmers 200 or greater are used to define the boundaries of intervals. The minimum is 50.
28 | num_gvcf_intervals: 50 # The maximum number of intervals to create for GVCF generation. Note: the actual number of intervals may be less than the specified value if the reference genome has very few gaps.
29 | db_scatter_factor: 0.15 # Scatter factor for calculating number of intervals to create for genomics db generation. (scatter_factor * num_samples * num_gvcf_intervals) gives us number of db intervals to create. Recommend <1
30 | ploidy: 2 # Ploidy for HaplotypeCaller and Sentieon Haplotyper
31 | ## Coverage options ##
32 | ## default pipeline is optimized for low coverage data - if using high coverage data (> 10x), uncomment high coverage options and comment out low coverage options
33 |
34 | # low coverage options (< 10x)
35 | minP: 1
36 | minD: 1
37 |
38 | # high coverage options (> 10x)
39 | #minP: 2
40 | #minD: 4
41 |
42 | het_prior: 0.005 #prior probability of heterozygous site; changes likelihood of a site being called non-ref, but not genotype likelihoods
43 |
44 | ## callable sites bed file options ##
45 | mappability_min: 1 #regions of the genome with mappability less than this will be removed from callable sites bed file
46 | mappability_k: 150 #the kmer used to compute mappability with genmap; you should not need to change this except in special cases
47 |
48 | #this ignores small regions of aberrant coverage/mappability as often these are just below the threshold
49 | #to do strict filtering, set to 0
50 |
51 | mappability_merge: 100 #merge passing mappability regions separated by this or fewer bp into a single region
52 | cov_merge: 100 #merge passing coverage regions separated by this or fewer bp into a single region
53 |
54 | ## QC options ##
55 | nClusters: 3
56 | GoogleAPIKey:
57 | min_depth: 2
58 |
59 | ## Filtering options ##
60 |
61 | contig_size: 10000 # snps on contigs this size or smaller will be filtered from the final clean vcfs. Set to 0 to disable.
62 | maf: 0.01 #snps with MAF below this value will be filtered from the final clean vcfs. Set to 0 to disable.
63 | missingness: 0.75 #snps with missingness greater than this value will be filtered from the final clean vcfs. Set to 1 to disable.
64 | scaffolds_to_exclude: "mtDNA,Y" #comma separated, no spaces list of scaffolds to exclude from final clean vcfs. Set to blank to disable.
65 |
66 | ########################################
67 | ## coverage thresholds ##
68 | ########################################
69 |
70 | ## If cov_filter is True, use these parameters to control how coverage filtering is done
71 | ## Three options are provided for coverage-based filtering. The default option is to just filter
72 | ## regions of the genome with mean coverage below a minimal threshold (default = 1), with a very large upper limit
73 | ## To use this option, set the variables below to the lower absolute mean coverage limit and upper absolute mean coverage limit,
74 | ## and make sure all other coverage variables are empty
75 |
76 | cov_threshold_lower: 1
77 | cov_threshold_upper: 10000
78 |
79 | ## Alternatively, filtering can be done based on standard deviations
80 | ## (assumes a Poisson distribution, so stdev_cov equals the square root of the mean coverage),
81 | ## where regions of the genome with mean coverage < X standard deviations or > X standard deviations are removed.
82 | ## To use this option, set the variables below to the desired X
83 | ## and make sure all other coverage variables are empty
84 |
85 | cov_threshold_stdev:
86 |
87 | ## Finally, filtering can be done based on absolute scaling of the mean,
88 | ## where regions of the genome with mean coverage < (global mean coverage / X) or > (global mean coverage * X) are removed.
89 | ## To use this option, set the variable below to the desired X
90 | ## and make sure all other coverage variables are empty
91 |
92 | cov_threshold_rel:
93 |
94 |
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
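The coverage-threshold comments in the config above describe three mutually exclusive filtering modes. A small worked sketch of the bounds each mode implies, assuming a hypothetical genome-wide mean coverage of 9x and one natural reading of the standard-deviation mode (mean +/- X * sqrt(mean)):
```
import math

mean_cov = 9.0  # hypothetical genome-wide mean coverage; the workflow computes the real value

# Mode 1: absolute bounds (cov_threshold_lower / cov_threshold_upper).
abs_lower, abs_upper = 1, 10000

# Mode 2: standard-deviation bounds; the comments assume Poisson, so stdev = sqrt(mean).
x_stdev = 2
sd_lower = mean_cov - x_stdev * math.sqrt(mean_cov)
sd_upper = mean_cov + x_stdev * math.sqrt(mean_cov)

# Mode 3: relative scaling of the mean (cov_threshold_rel = X keeps mean/X .. mean*X).
x_rel = 3
rel_lower, rel_upper = mean_cov / x_rel, mean_cov * x_rel

print((abs_lower, abs_upper))  # (1, 10000)
print((sd_lower, sd_upper))    # (3.0, 15.0)
print((rel_lower, rel_upper))  # (3.0, 27.0)
```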
/workflow/modules/trackhub/scripts/write_hub_files.py:
--------------------------------------------------------------------------------
1 | from os.path import basename
2 | import shutil
3 | import os
4 |
5 | # https://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#UseOneFile
6 | hub_text = """hub {genome}
7 | shortLabel {genome} snpArcher Track Hub
8 | longLabel {genome} snpArcher Track Hub
9 | useOneFile on
10 | descriptionUrl index.html
11 | email {email}\n
12 | genome {genome}\n"""
13 |
14 | vcf_track_txt = """track VCF
15 | bigDataUrl {vcf_file}
16 | shortLabel VCF
17 | longLabel VCF
18 | visibility squish
19 | html index.html
20 | type vcfTabix\n"""
21 |
22 | window_parent_txt = """track {track_type}
23 | compositeTrack on
24 | shortLabel {track_type}
25 | longLabel {track_type}
26 | color {color}
27 | altColor 0,102,204
28 | autoScale on
29 | type bigWig
30 | allButtonPair on
31 | html index.html
32 | visibility full\n"""
33 |
34 | window_track_txt = """track {track_name}
35 | parent {parent} on
36 | bigDataUrl {data_url}
37 | type bigWig
38 | visibility {vis}
39 | shortLabel {label}
40 | longLabel {label}\n"""
41 |
42 | allele_freq_txt = """track MinorAlleleFrequency
43 | bigDataUrl {data_url}
44 | type bigWig
45 | color 88,85,120
46 | altColor 0,102,204
47 | autoScale on
48 | visibility full
49 | shortLabel Minor Allele Frequency
50 | html index.html
51 | longLabel Minor Allele Frequency\n"""
52 |
53 | snp_depth_txt = """track SNPDepth
54 | bigDataUrl {data_url}
55 | type bigWig
56 | color 120,172,145
57 | altColor 0,102,204
58 | autoScale on
59 | visibility full
60 | shortLabel SNP Depth
61 | html index.html
62 | longLabel SNP Depth\n"""
63 |
64 | coverage_track_txt = """track NonCallableSites
65 | bigDataUrl {cov_file}
66 | shortLabel Non Callable Sites
67 | type bigBed
68 | longLabel Non Callable Sites
69 | color 0,0,0
70 | html index.html
71 | visibility dense\n"""
72 |
73 | COLORS = {
74 | "Tajima": "(70,130,180)",
75 | "SNP-Density": "(186,85,211)",
76 | "Pi": "(248,174,51)",
77 | }
78 |
79 | html = """
80 |
81 |
82 | To facilitate downstream data exploration and as an example of the module development components of this work, we
114 | developed a module to generate UCSC Genome Browser track files to explore population variation data (see paper for details).
116 |
117 | This track provides windowed estimates of Tajima’s D, a population genetic statistic that measures the departure from neutral evolution in a DNA sequence.
121 |
122 | This track displays the density of single nucleotide polymorphisms (SNPs) across the genome, showing regions with high or low levels of genetic variation.
124 |
125 | The Pi track represents the average number of nucleotide differences per site between any two sequences in a population, providing an estimate of genetic diversity.
127 |
128 | This track shows the frequency of the less common allele at a SNP locus, providing insights into the genetic variation within a population.
130 |
131 | The SNP Depth track displays the number of reads or sequencing depth at each SNP position, indicating the coverage and quality of the variant calls.
133 |
134 | The Non Callable Sites track highlights regions in the genome that are considered non-callable, meaning that they have low sequencing coverage or other technical limitations that make it difficult to accurately determine genetic variation in those regions.
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 | """
144 |
145 |
146 | def human_format(num):
147 |     num = float("{:.3g}".format(num))
148 |     magnitude = 0
149 |     while abs(num) >= 1000:
150 |         magnitude += 1
151 |         num /= 1000.0
152 |     return "{}{}".format(
153 |         "{:f}".format(num).rstrip("0").rstrip("."), ["", "K", "M", "B", "T"][magnitude]
154 |     )
155 |
156 |
157 | def main():
158 |     file_types = snakemake.params["file_types"]  # noqa: F821
159 |     email = snakemake.params["email"]  # noqa: F821
160 |     trackhub_windows = snakemake.params["windows"]  # noqa: F821
161 |     vcf_file = basename(snakemake.input["vcf"][0])  # noqa: F821
162 |     cov_file = basename(snakemake.input["callable_sites"][0])  # noqa: F821
163 |     freq_file = basename(snakemake.input["allele_freq"][0])  # noqa: F821
164 |     depth_file = basename(snakemake.input["depth"][0])  # noqa: F821
165 |     genome = snakemake.params["refGenome"]  # noqa: F821
166 |     trackhub_file = snakemake.output["trackhub_file"]  # noqa: F821
167 |     html_file = snakemake.output["html"]  # noqa: F821
168 |
169 |     with open(html_file, "w") as f:
170 |         f.write(html)
171 |
172 |     with open(trackhub_file, "w") as out:
173 |         print(hub_text.format(genome=genome, email=email), file=out)
174 |         print(vcf_track_txt.format(vcf_file=vcf_file), file=out)
175 |         print(coverage_track_txt.format(cov_file=cov_file), file=out)
176 |         print(allele_freq_txt.format(data_url=freq_file), file=out)
177 |         print(snp_depth_txt.format(data_url=depth_file), file=out)
178 |
179 |         for file in file_types:
180 |             print(
181 |                 window_parent_txt.format(track_type=file, color=COLORS[file]), file=out
182 |             )
183 |             for window in trackhub_windows:
184 |                 track_name = f"{file}_{human_format(window)}_bp_bins"
185 |                 label = f"{file}_{human_format(window)}_bp bins"
186 |                 url = f"{file}_{window}.bw"
187 |                 if window == 1000:
188 |                     vis = "full"
189 |                 else:
190 |                     vis = "hide"
191 |                 print(
192 |                     window_track_txt.format(
193 |                         track_name=track_name,
194 |                         label=label,
195 |                         parent=file,
196 |                         data_url=url,
197 |                         vis=vis,
198 |                     ),
199 |                     file=out,
200 |                 )
201 |
202 |
203 | if __name__ == "__main__":
204 |     main()
205 |
--------------------------------------------------------------------------------
/workflow/modules/postprocess/Snakefile:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from pathlib import Path
4 |
5 | # Get utils. This is not great, but we can move to setup.py and install via pip later if want
6 | utils_path = (Path(workflow.main_snakefile).parent.parent.parent).resolve()
7 | if str(utils_path) not in sys.path:
8 |     sys.path.append(str(utils_path))
9 |
10 | import pandas as pd
11 | import snparcher_utils
12 | configfile: "config/config.yaml"
13 |
14 | samples = snparcher_utils.parse_sample_sheet(config)
15 | REFGENOME = samples['refGenome'].unique().tolist()
16 |
17 | rule all:
18 |     input:
19 |         expand("results/{refGenome}/{prefix}_filtered.vcf.gz", refGenome=REFGENOME, prefix=config['final_prefix']),
20 |         expand("results/{refGenome}/{prefix}_clean_snps.vcf.gz", refGenome=REFGENOME, prefix=config['final_prefix']),
21 |         expand("results/{refGenome}/{prefix}_clean_indels.vcf.gz", refGenome=REFGENOME, prefix=config['final_prefix'])
22 |
23 | rule filter_individuals:
24 |     """
25 |     make list of individuals to exclude based on sampleType column
26 |     """
27 |     output:
28 |         include = "results/{refGenome}/postprocess/{prefix}_samps.txt",
29 |     run:
30 |         out_df = samples[["BioSample", "SampleType"]]
31 |         out_df.drop_duplicates("BioSample", inplace=True)
32 |         include = out_df[~out_df.SampleType.isin(["exclude"])].BioSample
33 |         include_clean = include.dropna()
34 |         include_clean.to_csv(output[0], index=False, sep="\t", header=False)
35 |
36 | rule basic_filter:
37 |     """
38 |     Filters a vcf file to remove samples marked exclude, sites that don't pass filters,
39 |     sites with reference equal to N or alt equal to ., and sites with AF == 0.
40 |     """
41 |     input:
42 |         vcf = "results/{refGenome}/{prefix}_raw.vcf.gz",
43 |         include = "results/{refGenome}/postprocess/{prefix}_samps.txt"
44 |     output:
45 |         filtered = "results/{refGenome}/{prefix}_filtered.vcf.gz",
46 |         filtered_idx = "results/{refGenome}/{prefix}_filtered.vcf.gz.csi"
47 |     conda:
48 |         "envs/filter.yml"
49 |     shell:
50 |         """
51 |         bcftools view -S {input.include} -f .,PASS {input.vcf} -a -U -O u | bcftools +fill-tags -Ou |
52 |         bcftools view -m2 -e 'AF==0 | ref="N" | ALT="."' -O z -o {output.filtered}
53 |         bcftools index {output.filtered}
54 |         """
55 |
56 | rule update_bed:
57 |     """
58 |     Updates callable sites bed file to add contigs less than threshold to regions to exclude
59 |     """
60 |     input:
61 |         bed = "results/{refGenome}/{prefix}_callable_sites.bed",
62 |         fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai"
63 |     output:
64 |         bed = "results/{refGenome}/postprocess/{prefix}_exclude_sites.bed",
65 |         tmp_bed = temp("results/{refGenome}/postprocess/{prefix}_tmp.bed")
66 |     conda:
67 |         "envs/bed.yml"
68 |     params:
69 |         size_filter = config["contig_size"],
70 |     shell:
71 |         """
72 |         awk 'BEGIN{{OFS="\\t"}}{{if ($2<{params.size_filter}) {{print $1,0,$2}}}}' {input.fai} > {output.tmp_bed}
73 |         cat {output.tmp_bed} {input.bed} | bedtools sort -i - | bedtools merge -i - > {output.bed}
74 |         """
75 |
76 | rule strict_filter:
77 |     input:
78 |         bed = "results/{refGenome}/postprocess/{prefix}_exclude_sites.bed",
79 |         vcf = "results/{refGenome}/{prefix}_filtered.vcf.gz",
80 |         filtered_idx = "results/{refGenome}/{prefix}_filtered.vcf.gz.csi"
81 |     output:
82 |         vcf = temp("results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz"),
83 |         idx = temp("results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz.csi")
84 |     conda:
85 |         "envs/filter.yml"
86 |     params:
87 |         miss = config["missingness"],
88 |         maf = config["maf"],
89 |         upper_bound = lambda wildcards: 1 - float(config["maf"]),
90 |         chr_ex = config["scaffolds_to_exclude"],
91 |     shell:
92 |         """
93 |         if [ -z "{params.chr_ex}" ]
94 |         then
95 |             bcftools view -R {input.bed} -m2 -M2 \
96 |             -e 'F_MISSING > {params.miss} | AF<{params.maf} | AF>{params.upper_bound}' \
97 |             {input.vcf} -O u -o {output.vcf}
98 |         else
99 |             bcftools view -t ^{params.chr_ex} -R {input.bed} -m2 -M2 \
100 |             -e 'F_MISSING > {params.miss} | AF<{params.maf} | AF>{params.upper_bound}' \
101 |             {input.vcf} -O u -o {output.vcf}
102 |         fi
103 |         bcftools index {output.vcf}
104 |         """
105 |
106 | rule subset_indels:
107 |     """
108 |     Produce a clean vcf with only indel variants.
109 |     """
110 |     input:
111 |         vcf = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz",
112 |         idx = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz.csi",
113 |     output:
114 |         vcf = "results/{refGenome}/{prefix}_clean_indels.vcf.gz",
115 |         idx = "results/{refGenome}/{prefix}_clean_indels.vcf.gz.tbi"
116 |     conda:
117 |         "envs/filter.yml"
118 |     log:
119 |         "logs/{refGenome}/postprocess/{prefix}_subset_indels.txt"
120 |     shell:
121 |         """
122 |         bcftools view -v indels -O z -o {output.vcf} {input.vcf}
123 |         bcftools index -t {output.vcf}
124 |         """
125 |
126 | rule subset_snps:
127 |     """
128 |     Produce a clean vcf with only simple snps.
129 |     """
130 |     input:
131 |         vcf = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz",
132 |         idx = "results/{refGenome}/postprocess/{prefix}_filtered.TEMP.vcf.gz.csi"
133 |     output:
134 |         vcf = temp("results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz"),
135 |         idx = temp("results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz.tbi")
136 |     conda:
137 |         "envs/filter.yml"
138 |     log:
139 |         "logs/{refGenome}/postprocess/{prefix}_subset_snps.txt"
140 |     shell:
141 |         """
142 |         bcftools view -v snps -e 'TYPE ~ "indel"' -O z -o {output.vcf} {input.vcf}
143 |         bcftools index -t {output.vcf}
144 |         """
145 |
146 | rule drop_indel_SNPs:
147 |     """
148 |     identify and remove SNPs that overlapped with indels and are coded as genotype length > 1
149 |     """
150 |     input:
151 |         vcf = "results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz",
152 |         idx = "results/{refGenome}/postprocess/{prefix}_clean_snps_1.vcf.gz.tbi"
153 |     output:
154 |         keep_snps = temp("results/{refGenome}/postprocess/{prefix}_keep_snp_positions.txt"),
155 |         vcf = "results/{refGenome}/{prefix}_clean_snps.vcf.gz",
156 |         idx = "results/{refGenome}/{prefix}_clean_snps.vcf.gz.tbi"
157 |     conda:
158 |         "envs/filter.yml"
159 |     log:
160 |         "logs/{refGenome}/postprocess/{prefix}_drop_indel_snps.txt"
161 |     shell:
162 |         """
163 |         bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\n' {input.vcf} | awk 'length($3) == 1 {{print $1"\t"$2}}' | bgzip -c > {output.keep_snps}
164 |         tabix -s1 -b2 -e2 {output.keep_snps}
165 |         bcftools view -T {output.keep_snps} {input.vcf} -Oz -o {output.vcf}
166 |         bcftools index -t {output.vcf}
167 |         """
--------------------------------------------------------------------------------
/docs/executing.md:
--------------------------------------------------------------------------------
1 | # Running snpArcher
2 | ## Setup
3 | Please refer to our [setup instructions](./setup.md) to prepare the snpArcher environment and requisite files.
4 | ## Test datasets
5 | To test that your environment is properly set up, you can run a quick test with the following command:
6 | ```
7 | snakemake -d .test/ecoli --cores 1 --use-conda --workflow-profile workflow-profiles/default
8 | ```
9 | If this runs without errors, you are ready to go!
10 | ## Using the Dry-run option
11 | Snakemake offers the `--dry-run (-n)` CLI option to perform a dry-run of the workflow to show what jobs would be run. We recommend doing this before executing snpArcher to ensure that the sample sheet was set up correctly and that Snakemake has correctly built the workflow DAG.
12 | ## Local Execution
13 | Once you have set up the requisite configuration files and sample sheet, executing snpArcher on your local machine is as simple as running the Snakemake command with the number of cores you would like to use. For example, to use 8 cores you would run:
14 | ```
15 | snakemake --cores 8 --use-conda --workflow-profile workflow-profiles/default
16 | ```
17 |
18 | ### Optional directory setup
19 | To maintain organization across many different projects, you may consider creating a new directory for each project you run using snpArcher. This way, each of your project directories will contain the configuration files used for that run. Below is an example directory structure:
20 |
21 | ```
22 | .
23 | ├── snpArcher
24 | ├── project_1/
25 | │   ├── config/
26 | │   │   ├── config.yaml
27 | │   │   └── samples.csv
28 | │   ├── data
29 | │   └── results
30 | └── project_2/
31 |     ├── config/
32 |     │   ├── config.yaml
33 |     │   └── samples.csv
34 |     └── data
35 | ```
36 |
37 | When creating a new directory for an analysis, ensure that you copy the `config` directory from the snpArcher directory to your new directory.
38 |
39 | Then, to run snpArcher on `project_2` from our example, we would execute the command:
40 | ```
41 | snakemake -s ./snpArcher/workflow/Snakefile -d ./project_2