├── workflow ├── envs │ ├── create_annotation_db.yaml │ ├── get_1001_genome_snps.yaml │ ├── add_surprise_bonus.yaml │ ├── download_ncbi_dataset.yaml │ ├── simulate_reads.yaml │ ├── get_genome.yaml │ ├── validate_genome.yaml │ ├── add_gpn_score.yaml │ ├── add_sequence_window.yaml │ └── notebook.yaml ├── scripts │ ├── __pycache__ │ │ └── helpers.cpython-314.pyc │ ├── create_annotation_db.py │ ├── add_surprise_bonus.py │ ├── add_gpn_score.py │ ├── helpers.py │ ├── validate_fasta.py │ ├── get_1001_genome_snps.py │ └── add_sequence_window.py ├── rules │ ├── common.smk │ ├── compute_gpn_sb_for_snps.smk │ ├── ncbi.smk │ └── process_reads.smk ├── Snakefile └── notebooks │ └── explore.py.ipynb ├── .test └── config │ ├── samples.tsv │ └── config.yaml ├── .gitignore ├── .github └── workflows │ ├── conventional-prs.yaml │ ├── release-please.yaml │ └── main.yaml ├── config ├── config.yaml ├── schemas │ ├── samples.schema.yaml │ └── config.schema.yaml └── README.md ├── .snakemake-workflow-catalog.yml ├── LICENSE ├── README.md └── CHANGELOG.md /workflow/envs/create_annotation_db.yaml: -------------------------------------------------------------------------------- 1 | name: create_annotation_db 2 | channels: 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - gffutils 7 | -------------------------------------------------------------------------------- /workflow/envs/get_1001_genome_snps.yaml: -------------------------------------------------------------------------------- 1 | name: get_1001_genome_snps 2 | channels: 3 | - defaults 4 | dependencies: 5 | - requests 6 | - pandas 7 | -------------------------------------------------------------------------------- /workflow/envs/add_surprise_bonus.yaml: -------------------------------------------------------------------------------- 1 | name: add_surprise_bonus 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - numpy 7 | - pandas 8 | -------------------------------------------------------------------------------- 
/workflow/envs/download_ncbi_dataset.yaml: -------------------------------------------------------------------------------- 1 | name: download_ncbi_dataset 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - ncbi-datasets-cli 7 | -------------------------------------------------------------------------------- /workflow/envs/simulate_reads.yaml: -------------------------------------------------------------------------------- 1 | name: simulate_reads 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - dwgsim=1.1.14 8 | -------------------------------------------------------------------------------- /workflow/scripts/__pycache__/helpers.cpython-314.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SilvanCodes/1001_genomes_analysis/main/workflow/scripts/__pycache__/helpers.cpython-314.pyc -------------------------------------------------------------------------------- /workflow/envs/get_genome.yaml: -------------------------------------------------------------------------------- 1 | name: get_genome 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - gzip=1.14 8 | - wget=1.21.4 9 | -------------------------------------------------------------------------------- /workflow/envs/validate_genome.yaml: -------------------------------------------------------------------------------- 1 | name: gpn 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.85 9 | -------------------------------------------------------------------------------- /workflow/envs/add_gpn_score.yaml: -------------------------------------------------------------------------------- 1 | name: add_gpn_score 2 | channels: 3 | - defaults 4 | dependencies: 5 | - pip 6 | - pip: 7 | - git+https://github.com/SilvanCodes/gpn.git@main 8 | 
# Configuration used by the CI test run (`--directory .test`).
# It must provide every key the Snakefile looks up via `lookup(within=config, ...)`,
# otherwise the test run fails before any rule is scheduled.
samplesheet: "config/samples.tsv"

# NCBI assembly accession, looked up by the add_sequence_window rule
ncbi:
  tair10.1: "GCF_000001735.4"

get_genome:
  ncbi_ftp: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz

simulate_reads:
  read_length: 100
  read_number: 10000

# base URL of the SNP effects endpoint; the gene id is appended by the script
get_1001_genome_snps:
  base_url: https://tools.1001genomes.org/api/v1.1/effects.json?type=snps;accs=all

# targets requested by the default `all` rule
all:
  model:
    - "gonzalobenegas/gpn-brassicales"
  gene_ids:
    - "AT2G19110.1" # HMA4
import gffutils
from snakemake.script import snakemake


def create_annotation_db(gff_file, db_file):
    """Create a gffutils database from a GFF file and add intron features to it.

    Args:
        gff_file: Path of the GFF annotation to import.
        db_file: Path of the database file to create.
    """
    annotation_db = gffutils.create_db(
        gff_file,
        db_file,
        merge_strategy="create_unique",
    )

    # Collect the introns derived by gffutils into a list, then store them in
    # the same database.
    introns = list(annotation_db.create_introns())
    annotation_db.update(introns)


gff_path = snakemake.input[0]
db_path = snakemake.output[0]
create_annotation_db(gff_path, db_path)
-------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | permissions: 7 | contents: write 8 | pull-requests: write 9 | issues: write 10 | 11 | name: release-please 12 | 13 | jobs: 14 | release-please: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: googleapis/release-please-action@v4 18 | with: 19 | token: ${{ secrets.GITHUB_TOKEN }} 20 | release-type: go # just keep a changelog, no version anywhere outside of git tags 21 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | samplesheet: ".test/config/samples.tsv" 2 | 3 | ncbi: 4 | tair10.1: "GCF_000001735.4" 5 | 6 | get_genome: 7 | ncbi_ftp: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz 8 | 9 | simulate_reads: 10 | read_length: 100 11 | read_number: 10000 12 | 13 | get_1001_genome_snps: 14 | base_url: https://tools.1001genomes.org/api/v1.1/effects.json?type=snps;accs=all 15 | 16 | all: 17 | model: 18 | - "gonzalobenegas/gpn-brassicales" 19 | gene_ids: 20 | - "AT2G19110.1" # HMA4 21 | -------------------------------------------------------------------------------- /.snakemake-workflow-catalog.yml: -------------------------------------------------------------------------------- 1 | # configuration of display in snakemake workflow catalog: https://snakemake.github.io/snakemake-workflow-catalog 2 | 3 | usage: 4 | mandatory-flags: 5 | desc: # describe your flags here in a few sentences 6 | flags: # put your flags here 7 | software-stack-deployment: 8 | conda: true # whether pipeline works with '--sdm conda' 9 | apptainer: true # whether pipeline works with '--sdm apptainer/singularity' 10 | apptainer+conda: true # whether pipeline works with '--sdm conda apptainer/singularity' 11 | report: true # whether creation of reports using 'snakemake 
import numpy as np
import pandas as pd
from snakemake.script import snakemake


def add_surprise_bonus(input_path, output_path):
    """Add entropy, surprisal and surprise-bonus columns to a CSV table.

    Reads ``input_path``, which must contain the columns ``p_a``, ``p_c``,
    ``p_g``, ``p_t`` and ``p_alt``, and writes the table with the additional
    columns ``H``, ``I`` and ``surprise bonus`` to ``output_path``.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to write (without the index column).
    """
    table = pd.read_csv(input_path)

    # Entropy H in base 4: log2(p) / 2 equals log4(p).
    # DataFrame.sum skips NaN by default, so terms that evaluate to NaN
    # (e.g. 0 * log2(0)) do not contribute to the row sum.
    probabilities = table[["p_a", "p_c", "p_g", "p_t"]]
    weighted_logs = probabilities * (np.log2(probabilities) / 2)
    table["H"] = -(weighted_logs.sum(axis=1))

    # Surprisal I of the alternative allele, also in base 4.
    table["I"] = -(np.log2(table["p_alt"]) / 2)

    table["surprise bonus"] = table["H"] - table["I"]

    table.to_csv(output_path, index=False)


add_surprise_bonus(snakemake.input[0], snakemake.output[0])
parse_dna_substitution(variant["amino acid change"]) 11 | 12 | return variant[f"gpn_{alt.lower()}"] 13 | 14 | 15 | def add_gpn_score(df): 16 | gpn_pipeline = pipeline("gpn") 17 | 18 | gpn_scores = pd.concat( 19 | gpn_pipeline(df["sequence_window"], start=256, end=257, batch_size=8), 20 | ignore_index=True, 21 | ) 22 | 23 | df = pd.concat([df, gpn_scores], axis=1) 24 | 25 | df["gpn_score"] = df.apply(lambda x: get_gpn_score(x), axis=1) 26 | return df 27 | -------------------------------------------------------------------------------- /config/schemas/samples.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "https://json-schema.org/draft/2020-12/schema" 2 | description: entries for the sample sheet 3 | properties: 4 | sample: 5 | type: string 6 | description: sample name/identifier 7 | condition: 8 | type: string 9 | description: sample condition that will be compared during differential analysis 10 | replicate: 11 | type: integer 12 | default: 1 13 | description: consecutive numbers representing multiple replicates of one condition 14 | read1: 15 | type: string 16 | description: names of fastq.gz files, read 1 17 | read2: 18 | type: string 19 | description: names of fastq.gz files, read 2 (optional) 20 | 21 | required: 22 | - sample 23 | - condition 24 | - replicate 25 | - read1 26 | -------------------------------------------------------------------------------- /config/schemas/config.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: "https://json-schema.org/draft/2020-12/schema" 2 | description: main configuration schema for the workflow 3 | properties: 4 | samplesheet: 5 | type: string 6 | description: path to sample-sheet TSV file 7 | 8 | get_genome: 9 | type: object 10 | properties: 11 | ncbi_ftp: 12 | type: string 13 | description: URL for genome retrieval 14 | required: ["ncbi_ftp"] 15 | 16 | simulate_reads: 17 | type: object 18 | properties: 19 
try:
    from Bio.SeqUtils import seq1
except ImportError:
    # Biopython is only needed by parse_aa_substitution. Keeping the module
    # importable without it lets scripts that only parse DNA substitutions
    # run in environments that do not ship Biopython.
    seq1 = None
import re


def parse_aa_substitution(HGVS_string):
    """Extract the reference and alternative amino acid of a protein substitution.

    Args:
        HGVS_string: String starting with a protein substitution in
            three-letter notation, e.g. ``"p.Leu16Val/c.46T>G"``.

    Returns:
        Tuple ``(reference, alternative)`` in one-letter code.

    Raises:
        ValueError: If the string does not start with a protein substitution.
        ImportError: If Biopython is not installed.
    """
    prot_match = re.match(r"p\.([A-Z]{1}[a-z]{2})\d+([A-Z]{1}[a-z]{2})", HGVS_string)
    if prot_match is None:
        raise ValueError(f"No protein substitution found in {HGVS_string!r}")
    if seq1 is None:
        raise ImportError("Biopython is required to convert amino acid codes")
    reference, alternative = prot_match.groups()
    return seq1(reference), seq1(alternative)


def parse_dna_substitution(HGVS_string):
    """Extract the reference and alternative base of a coding DNA substitution.

    Args:
        HGVS_string: String ending with a single-base substitution,
            e.g. ``"p.Leu16Val/c.46T>G"``.

    Returns:
        Tuple ``(reference, alternative)`` of upper-case bases.

    Raises:
        ValueError: If the string does not end with a ``c.`` substitution of
            one of the bases A, T, C, G.
    """
    dna_match = re.search(r"c\..*([ATCG])>([ATCG])$", HGVS_string)
    if dna_match is None:
        raise ValueError(f"No DNA substitution found in {HGVS_string!r}")
    reference, alternative = dna_match.groups()
    return reference, alternative


def get_start_end_from_seq(seq):
    """Read the start and end coordinates encoded in a sequence identifier.

    The third ``|``-separated field of ``seq.id`` is expected to look like
    ``<chrom>:<start>-<end>``.

    Args:
        seq: Object with an ``id`` attribute of the form described above.

    Returns:
        Tuple ``(start, end)`` as integers.
    """
    location = seq.id.split("|")[2]
    # Split on the last colon only, so names that contain ":" still parse.
    _chrom, coordinates = location.rsplit(":", 1)
    start, end = coordinates.split("-")
    return int(start), int(end)


# parse_dna_substitution("p.Leu16Val/c.46T>G")

# parse_aa_substitution("p.Leu16Val/c.46T>G")
with open(output_fasta, "w") as validated_file: 18 | SeqIO.write(records, validated_file, "fasta") 19 | sys.stderr.write("\n".join(summary)) 20 | except Exception as e: 21 | sys.stderr.write(f"Validation failed: {e}\n") 22 | raise 23 | 24 | 25 | validate_fasta(snakemake.input["fasta"], snakemake.output["fasta"]) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021, AUTHORS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
localrules:
    download_ncbi_dataset,
    unpack_ncbi_dataset,


# Download the genome FASTA and GFF3 annotation of one assembly accession
# as a zip archive using the NCBI datasets command line tool.
rule download_ncbi_dataset:
    output:
        "resources/download_ncbi_dataset/{accession}.zip",
    conda:
        "../envs/download_ncbi_dataset.yaml"
    message:
        """--- Downloading NCBI dataset for {wildcards.accession}."""
    shell:
        "datasets download genome accession {wildcards.accession} --include genome,gff3 --filename {output}"


# Extract genome FASTA and GFF3 annotation from the downloaded archive.
# The archive is unpacked into a private temporary directory, so parallel
# jobs and re-runs cannot collide or trigger unzip's interactive overwrite
# prompt. The files are located by their file type in dataset_catalog.json
# rather than by a fixed position in the catalog.
# NOTE: this rule declares no conda environment; `unzip` and `jq` have to be
# available on the PATH.
rule unpack_ncbi_dataset:
    input:
        "resources/download_ncbi_dataset/{accession}.zip",
    output:
        "resources/unpack_ncbi_dataset/{accession}/genome.fna",
        "resources/unpack_ncbi_dataset/{accession}/annotation.gff",
    params:
        # location of the data folder inside the archive
        data_path="ncbi_dataset/data",
    shell:
        "workdir=$(mktemp -d)"
        " && trap 'rm -rf \"$workdir\"' EXIT"
        " && unzip -o -q {input} -d \"$workdir\""
        " && data_dir=\"$workdir/{params.data_path}\""
        " && genome_path=$(jq -r 'first(.assemblies[].files[]? | select(.fileType==\"GENOMIC_NUCLEOTIDE_FASTA\") | .filePath)' \"$data_dir/dataset_catalog.json\")"
        " && annotation_path=$(jq -r 'first(.assemblies[].files[]? | select(.fileType==\"GFF3\") | .filePath)' \"$data_dir/dataset_catalog.json\")"
        " && test -f \"$data_dir/$genome_path\""
        " && test -f \"$data_dir/$annotation_path\""
        " && mv \"$data_dir/$genome_path\" {output[0]}"
        " && mv \"$data_dir/$annotation_path\" {output[1]}"
from helpers import parse_dna_substitution
from Bio import SeqIO
import pandas as pd
from gffutils import FeatureDB
from snakemake.script import snakemake


def get_substitution_dna_window(
    variant, gene_sequence, gene_annotation, chromosome_sequence=None, flank=256
):
    """Return the sequence window of ``2 * flank`` bases around a substitution.

    The variant base ends up at index ``flank`` of the returned window.

    Args:
        variant: Row with a 1-based ``"position"`` and an
            ``"amino acid change"`` string containing the DNA substitution.
        gene_sequence: Sequence record of the gene, i.e. the chromosome slice
            from ``gene_annotation.start`` to ``gene_annotation.end``.
        gene_annotation: Annotation feature providing ``start``.
        chromosome_sequence: Optional sequence record of the whole chromosome.
            When given, the window is cut from it, so variants close to the
            gene boundaries still receive their complete flanking sequence.
        flank: Number of bases taken on each side of the variant index.

    Returns:
        The window as a string.

    Raises:
        ValueError: If the position lies outside the sequence, the reference
            base does not match the sequence, or the window would extend
            beyond the sequence.
    """
    if chromosome_sequence is not None:
        sequence, first_position = chromosome_sequence, 1
    else:
        sequence, first_position = gene_sequence, gene_annotation.start

    # convert the 1-based position into a 0-based index into `sequence`
    index = int(variant["position"]) - first_position
    if not 0 <= index < len(sequence):
        raise ValueError(
            f"Position {variant['position']} lies outside of the sequence"
        )

    ref, _alt = parse_dna_substitution(variant["amino acid change"])

    # NOTE(review): the reference base is compared as given; presumably this
    # only holds for genes annotated on the forward strand - confirm.
    observed = sequence[index].upper()
    if ref != observed:
        raise ValueError(
            f"Reference base {ref} of {variant['amino acid change']} does not "
            f"match base {observed} at position {variant['position']}"
        )

    window_start, window_end = index - flank, index + flank
    # A negative start would be interpreted relative to the sequence end and
    # an end beyond the sequence would shorten the window; both would move the
    # variant away from the expected index, so refuse instead.
    if window_start < 0 or window_end > len(sequence):
        raise ValueError(
            f"Window around position {variant['position']} extends beyond the sequence"
        )

    return str(sequence[window_start:window_end].seq)


def add_sequence_window(
    snp_effect_path,
    reference_fasta_path,
    annotation_db_path,
    output_path,
    gene_id=None,
):
    """Add a ``sequence_window`` column to a table of SNP effects.

    Args:
        snp_effect_path: CSV file with the SNP effects of one gene.
        reference_fasta_path: FASTA file of the reference genome.
        annotation_db_path: gffutils database of the genome annotation.
        output_path: CSV file to write.
        gene_id: Gene or transcript id such as ``"AT2G19110.1"``; defaults to
            the first wildcard of the running Snakemake job.
    """
    if gene_id is None:
        gene_id = snakemake.wildcards[0]

    annotation = FeatureDB(annotation_db_path)
    genome = SeqIO.to_dict(SeqIO.parse(reference_fasta_path, "fasta"))

    # the part after the first "." is dropped to obtain the gene feature id
    gene_annotation_id = f"gene-{gene_id.split('.')[0]}"
    gene_annotation = annotation[gene_annotation_id]
    chromosome = genome[gene_annotation.seqid]

    # compensate for 1-based indexing in gff
    gene_sequence = chromosome[gene_annotation.start - 1 : gene_annotation.end]

    df = pd.read_csv(snp_effect_path)

    # move data cleaning into seperate rule
    df = df.dropna(subset=["amino acid change"])
    df = df[~df["amino acid change"].str.contains("ins|del", na=False)]

    # A list comprehension also works for an empty table, where
    # DataFrame.apply(axis=1) would return a frame instead of a column.
    df["sequence_window"] = [
        get_substitution_dna_window(
            variant, gene_sequence, gene_annotation, chromosome_sequence=chromosome
        )
        for _, variant in df.iterrows()
    ]

    df.to_csv(output_path, index=False)


add_sequence_window(
    snakemake.input[2],
    snakemake.input[0],
    snakemake.input[1],
    snakemake.output[0],
    gene_id=snakemake.wildcards[0],
)
# Main entrypoint of the workflow.
# Please follow the best practices:
# https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html,
# in particular regarding the standardized folder structure mentioned there.


# load configuration
# -----------------------------------------------------
configfile: "config/config.yaml"


# load rules
# -----------------------------------------------------
include: "rules/ncbi.smk"


# local rules
# -----------------------------------------------------
localrules:
    all,
    get_1001_genome_snps,
    create_annotation_db,
    add_sequence_window,


# optional messages, log and error handling
# -----------------------------------------------------
onstart:
    print("\n--- Analysis started ---\n")


onsuccess:
    print("\n--- Workflow finished! ---\n")


onerror:
    print("\n--- An error occurred! ---\n")


# target rules
# -----------------------------------------------------
rule all:
    input:
        # expand(
        #     "results/scores/{model}/{gene_id}/scores.csv",
        #     model=lookup(within=config, dpath="all/model"),
        #     gene_id=lookup(within=config, dpath="all/gene_ids"),
        # ),
        expand(
            "results/add_sequence_window/{gene_id}/snp_effects_with_sequence_window.csv",
            gene_id=lookup(within=config, dpath="all/gene_ids"),
        ),
    default_target: True


# The outputs are named for readability; their order is unchanged, so the
# script can keep addressing them by index.
rule get_1001_genome_snps:
    output:
        snp_effects="resources/get_1001_genome_snps/{gene_id}/snp_effects.csv",
        deduplicated="resources/get_1001_genome_snps/{gene_id}/snp_effects_deduplicated.csv",
    conda:
        "envs/get_1001_genome_snps.yaml"
    message:
        """--- Downloading 1001 genome SNP effects."""
    params:
        base_url=lookup(within=config, dpath="get_1001_genome_snps/base_url"),
    script:
        "scripts/get_1001_genome_snps.py"


rule create_annotation_db:
    input:
        "resources/unpack_ncbi_dataset/{accession}/annotation.gff",
    output:
        "resources/create_annotation_db/{accession}/annotation.db",
    conda:
        "envs/create_annotation_db.yaml"
    script:
        "scripts/create_annotation_db.py"


# The inputs are named for readability; their order (genome, annotation
# database, SNP effects) is unchanged, so the script can keep addressing
# them by index.
rule add_sequence_window:
    input:
        genome=expand(
            "resources/unpack_ncbi_dataset/{accession}/genome.fna",
            accession=lookup(within=config, dpath="ncbi/tair10.1"),
        ),
        annotation_db=expand(
            "resources/create_annotation_db/{accession}/annotation.db",
            accession=lookup(within=config, dpath="ncbi/tair10.1"),
        ),
        snp_effects="resources/get_1001_genome_snps/{gene_id}/snp_effects_deduplicated.csv",
    output:
        "results/add_sequence_window/{gene_id}/snp_effects_with_sequence_window.csv",
    conda:
        "envs/add_sequence_window.yaml"
    message:
        """--- Adding sequence windows to SNP effects."""
    script:
        "scripts/add_sequence_window.py"


# conda:
#     "envs/notebook.yaml"
# notebook:
#     "notebooks/explore.py.ipynb"
# snakemake --sdm conda --cores 1 --edit-notebook results/add_sequence_window/AT2G19110.1/snp_effects_with_sequence_window.csv
is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows//). 20 | 21 | Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md). 22 | 23 | If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository or its DOI. 24 | 25 | ## Deployment options 26 | 27 | To run the workflow from command line, change the working directory. 28 | 29 | ```bash 30 | cd path/to/snakemake-workflow-name 31 | ``` 32 | 33 | Adjust options in the default config file `config/config.yaml`. 34 | Before running the complete workflow, you can perform a dry run using: 35 | 36 | ```bash 37 | snakemake --dry-run 38 | ``` 39 | 40 | To run the workflow with test files using **conda**: 41 | 42 | ```bash 43 | snakemake --cores 2 --sdm conda --directory .test 44 | ``` 45 | 46 | To run the workflow with **apptainer** / **singularity**, add a link to a container registry in the `Snakefile`, for example `container: "oras://ghcr.io//:"` for Github's container registry. 47 | Run the workflow with: 48 | 49 | ```bash 50 | snakemake --cores 2 --sdm conda apptainer --directory .test 51 | ``` 52 | 53 | ## Authors 54 | 55 | - Firstname Lastname 56 | - Affiliation 57 | - ORCID profile 58 | - home page 59 | 60 | ## References 61 | 62 | > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2. 63 | 64 | ## TODO 65 | 66 | - Replace `` and `` everywhere in the template with the correct user name/organization, and the repository name. 
# ----------------------------------------------------- #
# EXAMPLE WORKFLOW                                      #
# ----------------------------------------------------- #


# fetch genome sequence from NCBI
# -----------------------------------------------------
# The download target is derived from the declared output, so the rule keeps
# working when the output path is changed.
rule get_genome:
    output:
        fasta="results/get_genome/genome.fna",
    conda:
        "../envs/get_genome.yaml"
    message:
        """--- Downloading genome sequence."""
    params:
        ncbi_ftp=lookup(within=config, dpath="get_genome/ncbi_ftp"),
    log:
        "results/get_genome/genome.log",
    shell:
        "wget -O {output.fasta}.gz {params.ncbi_ftp} > {log} 2>&1 && "
        "gunzip -f {output.fasta}.gz >> {log} 2>&1"


# validate genome sequence file
# -----------------------------------------------------
rule validate_genome:
    input:
        fasta=rules.get_genome.output.fasta,
    output:
        fasta="results/validate_genome/genome.fna",
    conda:
        "../envs/validate_genome.yaml"
    message:
        """--- Validating genome sequence file."""
    log:
        "results/validate_genome/genome.log",
    script:
        "../scripts/validate_fasta.py"


# simulate read data using DWGSIM
# -----------------------------------------------------
# The output prefix is obtained by stripping the known read1 suffix from the
# output path. Cutting at the first "." would truncate the prefix whenever
# the sample name or a directory in the path contains a dot.
rule simulate_reads:
    input:
        fasta=rules.validate_genome.output.fasta,
    output:
        multiext(
            "results/simulate_reads/{sample}",
            read1=".bwa.read1.fastq.gz",
            read2=".bwa.read2.fastq.gz",
        ),
    conda:
        "../envs/simulate_reads.yaml"
    message:
        """--- Simulating read data with DWGSIM."""
    params:
        output_type=1,
        read_length=lookup(within=config, dpath="simulate_reads/read_length"),
        read_number=lookup(within=config, dpath="simulate_reads/read_number"),
    log:
        "results/simulate_reads/{sample}.log",
    shell:
        "read1={output.read1};"
        "output_prefix=${{read1%.bwa.read1.fastq.gz}};"
        "dwgsim "
        " -1 {params.read_length}"
        " -2 {params.read_length}"
        " -N {params.read_number}"
        " -o {params.output_type}"
        " {input.fasta}"
        " ${{output_prefix}}"
        " > {log} 2>&1"


# make QC report
# -----------------------------------------------------
rule fastqc:
    input:
        fastq="results/simulate_reads/{sample}.bwa.{read}.fastq.gz",
    output:
        html="results/fastqc/{sample}.bwa.{read}_fastqc.html",
        zip="results/fastqc/{sample}.bwa.{read}_fastqc.zip",
    params:
        extra="--quiet",
    message:
        """--- Checking fastq files with FastQC."""
    log:
        "results/fastqc/{sample}.bwa.{read}.log",
    threads: 1
    wrapper:
        "v6.0.0/bio/fastqc"


# run multiQC on tool output
# -----------------------------------------------------
# `samples` is the sample sheet table loaded in rules/common.smk.
rule multiqc:
    input:
        expand(
            "results/fastqc/{sample}.bwa.{read}_fastqc.{ext}",
            sample=samples.index,
            read=["read1", "read2"],
            ext=["html", "zip"],
        ),
    output:
        report="results/multiqc/multiqc_report.html",
    params:
        extra="--verbose --dirs",
    message:
        """--- Generating MultiQC report for seq data."""
    log:
        "results/multiqc/multiqc.log",
    wrapper:
        "v6.0.0/bio/multiqc"
check formatting of yaml files using prettier ([9f5131b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9f5131bf0eeaf1eb7fb0937b2840f73db2a02724)) 16 | * updated all GH actions to latest versions ([4d7b3a2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/4d7b3a2b143c304b6dcf487664c392c4a5e98f74)) 17 | * updated github actions workflow ([fd36648](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fd3664841b830ae670549aabb214eb6004aa696d)) 18 | * updated github actions workflow ([7a3a40e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/7a3a40e62df01b37a802a085e7210014eb3fba82)) 19 | 20 | 21 | ### Bug Fixes 22 | 23 | * 2nd attempt to fix release please wf ([f81847f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/f81847fdfd39d99e795006da4f84701ee6ba8ddc)) 24 | * added usage docs ([776b97e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/776b97e3d0e928d98f4c48e619090b47f702dcab)) 25 | * all-temp needs explicit input of multiqc zip dirs ([026c35a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/026c35aebfb140746bc823ce06327e25c9a40cf1)) 26 | * change release type to 'go', fixes release please wf ([658c784](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/658c784ab5d70b117ce9dd386f5b07f8e4ff782d)) 27 | * change release type to 'go', fixes release please wf ([a81ab9d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/a81ab9def05667e23c5e59ac881c7a57b9f1b767)) 28 | * code review issues ([97faf1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/97faf1ae8bde189094e6b46568f3911f01b625fd)) 29 | * dont remove temp files for test runs ([0c2c8d1](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0c2c8d19c51648872d09a8f697826b9445bafc81)) 30 | * formatting, logging 
([d6c819e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/d6c819efcadde1ad4af342152d3aef2a982983d0)) 31 | * lint error and docs update ([cf59f11](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/cf59f11acc11c01866ad56971fd132661f4f32be)) 32 | * recommended `.yaml` file extension, latest schema version ([e649e12](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/e649e12be9c447e8c366847ddf3531e216306c97)) 33 | * release please workflow requires additional permission ([0993271](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0993271f0077e5a548755679b2b8952d18795580)) 34 | * release please workflow requires additional permission ([3651295](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/36512953f851611f18676a4f18e6e5684932ef61)) 35 | * removed unused templates, update catalog yml ([b5c292f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b5c292ff4b476441d8068ca8013e3b931d30fc04)) 36 | * revert to GitHub Actions status badge requiring `owner` and `repo` set by user ([dd163f3](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/dd163f33a5299ecbeb10eb019ef5e8c727f0422a)) 37 | * snakefmt error ([70d670a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/70d670a91c79c0a9d89c59fff6add3f1036753a3)) 38 | * update release-please GH workflow ([1dad25d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dad25da5de222982b0cdf35a91be6ecc5a81a42)) 39 | * update release-please GH workflow ([0ea4df2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0ea4df2f746e0fc760c06a3b902e2ee8bdf2ff42)) 40 | * update snakemake action ([fac8662](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fac8662193fa501fdfc2f3bb94e7549b96dec500)) 41 | * updated schemas and params docs 
([facf377](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/facf377a7cc107b3e8db0793b21027a9f3df0eeb)) 42 | * updates to enable release-please action again ([8d9552b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/8d9552b8369ca6b115ee00777f45cf641312dde3)) 43 | * use recommended `.yaml` file extension (https://www.yaml.info/learn/bestpractices.html#file) ([dc3dc1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/dc3dc1aa798a009644f938ef41df02f370e09466)) 44 | * various changes to formatting and example rules ([b9b2366](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b9b236645ad961cd7a8886c1697b27f3694ee047)) 45 | 46 | ## 1.0.0 (2025-05-07) 47 | 48 | 49 | ### Features 50 | 51 | * complete minimal workflow as template ([2348055](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/234805535a6353a3db59d5bba0a4b38fe8194d97)) 52 | * complete, reproducible example workflow ([1dfa7ad](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dfa7adb0120880ae5e85c57551d5e698a057497)) 53 | * larger update to feature fully-functional example and github actions ([93c08fc](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/93c08fc9db2f8619af7b90784db83d18ed656f25)) 54 | * major simplification of rules, replacement of others by wrappers ([3811ef7](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/3811ef796df4fe38fb7161f9a1b06fac9db86d5b)) 55 | * major simplification of template and update docs ([81ee089](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/81ee08989857366893593a333615523f05295f87)) 56 | * replaced get genome script with simple shell command ([9208995](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9208995b78433ce3680a0b0e453ddcf5915abcef)) 57 | * update github actions workflow in linting part 
([27d53ee](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/27d53eecfad935f50bc62a30248141891a4329ee)) 58 | * update github actions workflow. check formatting of yaml files using prettier ([9f5131b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9f5131bf0eeaf1eb7fb0937b2840f73db2a02724)) 59 | * updated all GH actions to latest versions ([4d7b3a2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/4d7b3a2b143c304b6dcf487664c392c4a5e98f74)) 60 | * updated github actions workflow ([fd36648](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fd3664841b830ae670549aabb214eb6004aa696d)) 61 | * updated github actions workflow ([7a3a40e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/7a3a40e62df01b37a802a085e7210014eb3fba82)) 62 | 63 | 64 | ### Bug Fixes 65 | 66 | * 2nd attempt to fix release please wf ([f81847f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/f81847fdfd39d99e795006da4f84701ee6ba8ddc)) 67 | * added usage docs ([776b97e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/776b97e3d0e928d98f4c48e619090b47f702dcab)) 68 | * all-temp needs explicit input of multiqc zip dirs ([026c35a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/026c35aebfb140746bc823ce06327e25c9a40cf1)) 69 | * change release type to 'go', fixes release please wf ([658c784](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/658c784ab5d70b117ce9dd386f5b07f8e4ff782d)) 70 | * change release type to 'go', fixes release please wf ([a81ab9d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/a81ab9def05667e23c5e59ac881c7a57b9f1b767)) 71 | * code review issues ([97faf1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/97faf1ae8bde189094e6b46568f3911f01b625fd)) 72 | * dont remove temp files for test runs 
([0c2c8d1](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0c2c8d19c51648872d09a8f697826b9445bafc81)) 73 | * formatting, logging ([d6c819e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/d6c819efcadde1ad4af342152d3aef2a982983d0)) 74 | * lint error and docs update ([cf59f11](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/cf59f11acc11c01866ad56971fd132661f4f32be)) 75 | * removed unused templates, update catalog yml ([b5c292f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b5c292ff4b476441d8068ca8013e3b931d30fc04)) 76 | * snakefmt error ([70d670a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/70d670a91c79c0a9d89c59fff6add3f1036753a3)) 77 | * update release-please GH workflow ([1dad25d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dad25da5de222982b0cdf35a91be6ecc5a81a42)) 78 | * update release-please GH workflow ([0ea4df2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0ea4df2f746e0fc760c06a3b902e2ee8bdf2ff42)) 79 | * update snakemake action ([fac8662](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fac8662193fa501fdfc2f3bb94e7549b96dec500)) 80 | * updated schemas and params docs ([facf377](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/facf377a7cc107b3e8db0793b21027a9f3df0eeb)) 81 | * updates to enable release-please action again ([8d9552b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/8d9552b8369ca6b115ee00777f45cf641312dde3)) 82 | * various changes to formatting and example rules ([b9b2366](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b9b236645ad961cd7a8886c1697b27f3694ee047)) 83 | -------------------------------------------------------------------------------- /workflow/notebooks/explore.py.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "1c3850be-a729-4c33-ba10-c796d209e4dd", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from Bio import SeqIO\n", 11 | "import pandas as pd\n", 12 | "import gffutils" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "e3ca43d5-7ed1-4be3-afd5-722afb0630a7", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Genome\n", 23 | "for record in SeqIO.parse(snakemake.input[0], \"fasta\"):\n", 24 | " print(record.id)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "b24fa229-9290-4a71-8674-9239a720ed7c", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "genome = SeqIO.to_dict(SeqIO.parse(snakemake.input[0], \"fasta\"))" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "51ccbf9a-bece-40a4-948b-1a8660764bee", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Annotation\n", 45 | "\n", 46 | "db = gffutils.FeatureDB(snakemake.input[1])\n", 47 | "\n", 48 | "db" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "84e54089-109b-419c-8741-608e31153b3b", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "list(db.featuretypes())" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "49f07583-6af1-40a4-8d5c-d1efbda6fb67", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "list(db.features_of_type(\"gene\"))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "659c9eca-46bd-4a3c-8a5b-a6ece7f38fcf", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "for row in db.execute(\"SELECT name FROM sqlite_master WHERE type='table';\"):\n", 79 | " print(row['name'])" 80 | ] 81 | }, 82 | { 83 | "cell_type": 
"code", 84 | "execution_count": null, 85 | "id": "f63c6672-2a23-4c85-9e83-5f58744a13d2", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "for row in db.execute(\"pragma table_info(features);\"):\n", 90 | " print(row['name'])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "ba9861d1-d4b6-4d7c-ab0b-37955f76b3e2", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "for row in db.execute(\"select id from features where id like '%AT2G19110%';\"):\n", 101 | " print(row['id'])" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "17afaa2b-fbdc-4a34-be67-c7b244a5c3b3", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "list(db['gene-AT2G19110'].attributes)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "1c021e1d-7a5c-4d65-8747-05ea05cf4d7f", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "hma4_annotation = db['gene-AT2G19110']" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "c294d0a6-58f0-4221-bf4b-e799dc4de4fc", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "hma4_annotation.seqid" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "0d9ef868-a8d2-4e81-818e-6618f3e6239c", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "genome[hma4_annotation.seqid][hma4_annotation.start-1:hma4_annotation.end].seq" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "0264e3c3", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# start coding here\n", 152 | "\n", 153 | "annotation = gffutils.FeatureDB(snakemake.input[1])\n", 154 | "\n", 155 | "genome = SeqIO.to_dict(SeqIO.parse(snakemake.input[0], \"fasta\"))\n", 156 | "\n", 157 | "gene_annotation_id = f\"gene-{snakemake.wildcards[0].split('.')[0]}\"\n", 
158 | "\n", 159 | "gene_annotation = annotation[gene_annotation_id]\n", 160 | "\n", 161 | "gene_sequence = genome[gene_annotation.seqid][gene_annotation.start-1:gene_annotation.end]\n", 162 | "\n", 163 | "df = pd.read_csv(snakemake.input[2])\n", 164 | "\n", 165 | "df" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "314d82b6-469c-49bb-89c5-7c0f6815d8fd", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "def get_substitution_dna_window(variant, gene_sequence, gene_annotation):\n", 176 | " in_gene_position = variant[\"position\"] - gene_annotation.start\n", 177 | "\n", 178 | " ref, alt = parse_dna_substitution(variant[\"amino acid change\"])\n", 179 | "\n", 180 | " assert ref == gene_sequence[in_gene_position].upper()\n", 181 | " \n", 182 | " return str(gene_sequence[in_gene_position - 256 : in_gene_position + 256].seq)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "77d73ede-07a6-4534-aff0-58f05d4b056c", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "from Bio.SeqUtils import seq1\n", 193 | "import re\n", 194 | "\n", 195 | "\n", 196 | "def parse_aa_substitution(HGVS_string):\n", 197 | " prot_match = re.match(r\"p\\.([A-Z]{1}[a-z]{2})\\d+([A-Z]{1}[a-z]{2})\", HGVS_string)\n", 198 | " reference, alternative = prot_match.groups()\n", 199 | " return seq1(reference), seq1(alternative)\n", 200 | "\n", 201 | "\n", 202 | "def parse_dna_substitution(HGVS_string):\n", 203 | " dna_match = re.search(r\"c\\..*([ATCG])>([ATCG])$\", HGVS_string)\n", 204 | " reference, alternative = dna_match.groups()\n", 205 | " return reference, alternative\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "2580de4e-db10-4e3b-a7be-0e10a61a5c62", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "def add_sequence_window(snp_effect_path, reference_fasta_path, annotation_db_path):\n", 216 | " annotation 
= gffutils.FeatureDB(annotation_db_path)\n", 217 | " genome = SeqIO.to_dict(SeqIO.parse(reference_fasta_path, \"fasta\"))\n", 218 | " \n", 219 | " gene_annotation_id = f\"gene-{snakemake.wildcards[0].split('.')[0]}\"\n", 220 | " gene_annotation = annotation[gene_annotation_id]\n", 221 | " \n", 222 | " gene_sequence = genome[gene_annotation.seqid][gene_annotation.start-1:gene_annotation.end]\n", 223 | " \n", 224 | " df = pd.read_csv(snp_effect_path)\n", 225 | "\n", 226 | " df = df.dropna(subset=[\"amino acid change\"])\n", 227 | " df = df[ ~ df[\"amino acid change\"].str.contains(\"ins\", na=False) ]\n", 228 | " df = df[ ~ df[\"amino acid change\"].str.contains(\"del\", na=False) ]\n", 229 | "\n", 230 | "\n", 231 | " df[\"sequence_window\"] = df.apply(\n", 232 | " lambda x: get_substitution_dna_window(x, gene_sequence, gene_annotation), axis=1\n", 233 | " )\n", 234 | " return df\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "478271ca-b2ea-4953-9604-db81542f6715", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "df = add_sequence_window(snakemake.input[2], snakemake.input[0], snakemake.input[1])\n", 245 | "\n", 246 | "df" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "7b06e25a-45eb-4a1f-a705-8529eac39a1f", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "df[df[\"sequence_window\"] == \"-\"]" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "71db427a-445e-4e52-b99a-4a31dc67588b", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "gene_sequence" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "8185f845-e0ac-44ab-bf58-5559e03f5ebd", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "pos = 8279002 - gene_annotation.start\n", 277 | "pos" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | 
"execution_count": null, 283 | "id": "e47bb086-c4f6-4f86-8167-48f4a85763e8", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "gene_sequence[pos]" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "806fb145-6245-41bb-91ad-bb7b72ab93a5", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "import numpy as np\n", 298 | "\n", 299 | "\n", 300 | "def add_surprise_bonus(df):\n", 301 | " # Entropy H in base-4\n", 302 | " df[\"H\"] = -(\n", 303 | " df[[\"p_a\", \"p_c\", \"p_g\", \"p_t\"]]\n", 304 | " .pipe(lambda p: p * (np.log2(p) / 2))\n", 305 | " .sum(axis=1)\n", 306 | " )\n", 307 | "\n", 308 | " # Surprisal I in base-4\n", 309 | " df[\"I\"] = -(np.log2(df[\"p_obs\"]) / 2)\n", 310 | " df[\"surprise bonus\"] = df[\"H\"] - df[\"I\"]\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "170d379e-f112-47a5-9a5a-3d27420a49e1", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "test = pd.DataFrame({\n", 321 | " \"p_a\": [0.3],\n", 322 | " \"p_c\": [0.3],\n", 323 | " \"p_g\": [0.3],\n", 324 | " \"p_t\": [0.1],\n", 325 | " \"p_obs\": [0.1], \n", 326 | "})\n", 327 | "test" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "61adc227-1bbf-4ff3-bc9e-cb01c5e6a756", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "add_surprise_bonus(test)\n", 338 | "\n", 339 | "test" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "9d96a43b-547e-441d-9d87-c6bc9ea94d4d", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "test = pd.DataFrame({\n", 350 | " \"p_a\": [0.4],\n", 351 | " \"p_c\": [0.4],\n", 352 | " \"p_g\": [0.1],\n", 353 | " \"p_t\": [0.1],\n", 354 | " \"p_obs\": [0.1], \n", 355 | " \"p_ref\": [0.1], \n", 356 | "})\n", 357 | "test" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "id": 
"ce788bb8-b385-4d67-9ccb-36bc77252299", 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "add_surprise_bonus(test)\n", 368 | "\n", 369 | "test" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "id": "390bbb57-d656-4cd1-97e3-ffffde5435d6", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "gpn = np.log2(test[[\"p_a\", \"p_c\", \"p_g\", \"p_t\"]] / test[\"p_ref\"].to_numpy()[:, None])\n", 380 | "\n", 381 | "gpn\n", 382 | " " 383 | ] 384 | } 385 | ], 386 | "metadata": { 387 | "kernelspec": { 388 | "display_name": "Python 3 (ipykernel)", 389 | "language": "python", 390 | "name": "python3" 391 | }, 392 | "language_info": { 393 | "codemirror_mode": { 394 | "name": "ipython", 395 | "version": 3 396 | }, 397 | "file_extension": ".py", 398 | "mimetype": "text/x-python", 399 | "name": "python", 400 | "nbconvert_exporter": "python", 401 | "pygments_lexer": "ipython3", 402 | "version": "3.14.1" 403 | } 404 | }, 405 | "nbformat": 4, 406 | "nbformat_minor": 5 407 | } 408 | --------------------------------------------------------------------------------