├── workflow
    ├── envs
    │   ├── create_annotation_db.yaml
    │   ├── get_1001_genome_snps.yaml
    │   ├── add_surprise_bonus.yaml
    │   ├── download_ncbi_dataset.yaml
    │   ├── simulate_reads.yaml
    │   ├── get_genome.yaml
    │   ├── validate_genome.yaml
    │   ├── add_gpn_score.yaml
    │   ├── add_sequence_window.yaml
    │   └── notebook.yaml
    ├── scripts
    │   ├── __pycache__
    │   │   └── helpers.cpython-314.pyc
    │   ├── create_annotation_db.py
    │   ├── add_surprise_bonus.py
    │   ├── add_gpn_score.py
    │   ├── helpers.py
    │   ├── validate_fasta.py
    │   ├── get_1001_genome_snps.py
    │   └── add_sequence_window.py
    ├── rules
    │   ├── common.smk
    │   ├── compute_gpn_sb_for_snps.smk
    │   ├── ncbi.smk
    │   └── process_reads.smk
    ├── Snakefile
    └── notebooks
    │   └── explore.py.ipynb
├── .test
    └── config
    │   ├── samples.tsv
    │   └── config.yaml
├── .gitignore
├── .github
    └── workflows
    │   ├── conventional-prs.yaml
    │   ├── release-please.yaml
    │   └── main.yaml
├── config
    ├── config.yaml
    ├── schemas
    │   ├── samples.schema.yaml
    │   └── config.schema.yaml
    └── README.md
├── .snakemake-workflow-catalog.yml
├── LICENSE
├── README.md
└── CHANGELOG.md


/workflow/envs/create_annotation_db.yaml:
--------------------------------------------------------------------------------
1 | name: create_annotation_db
2 | channels:
3 |   - bioconda
4 |   - defaults
5 | dependencies:
6 |   - gffutils
7 | 


--------------------------------------------------------------------------------
/workflow/envs/get_1001_genome_snps.yaml:
--------------------------------------------------------------------------------
1 | name: get_1001_genome_snps
2 | channels:
3 |   - defaults
4 | dependencies:
5 |   - requests
6 |   - pandas
7 | 


--------------------------------------------------------------------------------
/workflow/envs/add_surprise_bonus.yaml:
--------------------------------------------------------------------------------
1 | name: add_surprise_bonus
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - numpy
7 |   - pandas
8 | 


--------------------------------------------------------------------------------
/workflow/envs/download_ncbi_dataset.yaml:
--------------------------------------------------------------------------------
1 | name: download_ncbi_dataset
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - ncbi-datasets-cli
7 | 


--------------------------------------------------------------------------------
/workflow/envs/simulate_reads.yaml:
--------------------------------------------------------------------------------
1 | name: simulate_reads
2 | channels:
3 |   - conda-forge
4 |   - bioconda
5 |   - nodefaults
6 | dependencies:
7 |   - dwgsim=1.1.14
8 | 


--------------------------------------------------------------------------------
/workflow/scripts/__pycache__/helpers.cpython-314.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SilvanCodes/1001_genomes_analysis/main/workflow/scripts/__pycache__/helpers.cpython-314.pyc


--------------------------------------------------------------------------------
/workflow/envs/get_genome.yaml:
--------------------------------------------------------------------------------
1 | name: get_genome
2 | channels:
3 |   - conda-forge
4 |   - bioconda
5 |   - nodefaults
6 | dependencies:
7 |   - gzip=1.14
8 |   - wget=1.21.4
9 | 


--------------------------------------------------------------------------------
/workflow/envs/validate_genome.yaml:
--------------------------------------------------------------------------------
1 | name: gpn
2 | channels:
3 |   - conda-forge
4 |   - bioconda
5 |   - nodefaults
6 | dependencies:
7 |   - python=3.12
8 |   - biopython=1.85
9 | 


--------------------------------------------------------------------------------
/workflow/envs/add_gpn_score.yaml:
--------------------------------------------------------------------------------
1 | name: add_gpn_score
2 | channels:
3 |   - defaults
4 | dependencies:
5 |   - pip
6 |   - pip:
7 |       - git+https://github.com/SilvanCodes/gpn.git@main
8 | 


--------------------------------------------------------------------------------
/workflow/envs/add_sequence_window.yaml:
--------------------------------------------------------------------------------
 1 | name: add_sequence_window
 2 | channels:
 3 |   - bioconda
 4 |   - conda-forge
 5 |   - defaults
 6 | dependencies:
 7 |   - pandas
 8 |   - biopython
 9 |   - gffutils
10 | 


--------------------------------------------------------------------------------
/.test/config/samples.tsv:
--------------------------------------------------------------------------------
1 | sample	condition	replicate	read1	read2
2 | sample1	wild_type	1	sample1.bwa.read1.fastq.gz	sample1.bwa.read2.fastq.gz
3 | sample2	wild_type	2	sample2.bwa.read1.fastq.gz	sample2.bwa.read2.fastq.gz
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | results/**
 2 | resources/**
 3 | logs/**
 4 | .snakemake
 5 | .snakemake/**
 6 | .test/results/*
 7 | workflow/notebooks/.ipynb_checkpoints/**
 8 | **/.Rhistory
 9 | **/*.Rproj
10 | **/.Rproj.user/**
11 | **/.RData
12 | **/Rplots.pdf
13 | 


--------------------------------------------------------------------------------
/.test/config/config.yaml:
--------------------------------------------------------------------------------
1 | samplesheet: "config/samples.tsv"
2 | 
3 | get_genome:
4 |   ncbi_ftp: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz
5 | 
6 | simulate_reads:
7 |   read_length: 100
8 |   read_number: 10000
9 | 


--------------------------------------------------------------------------------
/workflow/envs/notebook.yaml:
--------------------------------------------------------------------------------
 1 | name: notebook
 2 | channels:
 3 |   - bioconda
 4 |   - conda-forge
 5 |   - nodefaults
 6 | dependencies:
 7 |   - biopython
 8 |   - gffutils
 9 |   - pandas
10 |   - numpy
11 |   - matplotlib
12 |   - seaborn
13 |   - python
14 |   - notebook
15 |   - ipykernel
16 | 


--------------------------------------------------------------------------------
/workflow/scripts/create_annotation_db.py:
--------------------------------------------------------------------------------
 1 | import gffutils
 2 | from snakemake.script import snakemake
 3 | 
 4 | 
 5 | def create_annotation_db(gff_file, db_file):
 6 |     db = gffutils.create_db(gff_file, db_file, merge_strategy="create_unique")
 7 |     db.update(list(db.create_introns()))
 8 | 
 9 | 
10 | create_annotation_db(snakemake.input[0], snakemake.output[0])
11 | 


--------------------------------------------------------------------------------
/.github/workflows/conventional-prs.yaml:
--------------------------------------------------------------------------------
 1 | name: Lint PR
 2 | on:
 3 |   pull_request_target:
 4 |     types:
 5 |       - opened
 6 |       - reopened
 7 |       - edited
 8 |       - synchronize
 9 | 
10 | permissions:
11 |   pull-requests: read
12 | 
13 | jobs:
14 |   main:
15 |     name: Validate PR title
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - uses: amannn/action-semantic-pull-request@v5
19 |         env:
20 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21 | 


--------------------------------------------------------------------------------
/workflow/rules/common.smk:
--------------------------------------------------------------------------------
 1 | # import basic packages
 2 | import pandas as pd
 3 | from snakemake.utils import validate
 4 | 
 5 | 
 6 | # read sample sheet
 7 | samples = (
 8 |     pd.read_csv(config["samplesheet"], sep="\t", dtype={"sample": str})
 9 |     .set_index("sample", drop=False)
10 |     .sort_index()
11 | )
12 | 
13 | 
14 | # validate sample sheet and config file
15 | validate(samples, schema="../../config/schemas/samples.schema.yaml")
16 | validate(config, schema="../../config/schemas/config.schema.yaml")
17 | 


--------------------------------------------------------------------------------
/.github/workflows/release-please.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 | 
 6 | permissions:
 7 |   contents: write
 8 |   pull-requests: write
 9 |   issues: write
10 | 
11 | name: release-please
12 | 
13 | jobs:
14 |   release-please:
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: googleapis/release-please-action@v4
18 |         with:
19 |           token: ${{ secrets.GITHUB_TOKEN }}
20 |           release-type: go # just keep a changelog, no version anywhere outside of git tags
21 | 


--------------------------------------------------------------------------------
/config/config.yaml:
--------------------------------------------------------------------------------
 1 | samplesheet: ".test/config/samples.tsv"
 2 | 
 3 | ncbi:
 4 |   tair10.1: "GCF_000001735.4"
 5 | 
 6 | get_genome:
 7 |   ncbi_ftp: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz
 8 | 
 9 | simulate_reads:
10 |   read_length: 100
11 |   read_number: 10000
12 | 
13 | get_1001_genome_snps:
14 |   base_url: https://tools.1001genomes.org/api/v1.1/effects.json?type=snps;accs=all
15 | 
16 | all:
17 |   model:
18 |     - "gonzalobenegas/gpn-brassicales"
19 |   gene_ids:
20 |     - "AT2G19110.1" # HMA4
21 | 


--------------------------------------------------------------------------------
/.snakemake-workflow-catalog.yml:
--------------------------------------------------------------------------------
 1 | # configuration of display in snakemake workflow catalog: https://snakemake.github.io/snakemake-workflow-catalog
 2 | 
 3 | usage:
 4 |   mandatory-flags:
 5 |     desc: # describe your flags here in a few sentences
 6 |     flags: # put your flags here
 7 |   software-stack-deployment:
 8 |     conda: true # whether pipeline works with '--sdm conda'
 9 |     apptainer: true # whether pipeline works with '--sdm apptainer/singularity'
10 |     apptainer+conda: true # whether pipeline works with '--sdm conda apptainer/singularity'
11 |     report: true # whether creation of reports using 'snakemake --report report.zip' is supported
12 | 


--------------------------------------------------------------------------------
/workflow/rules/compute_gpn_sb_for_snps.smk:
--------------------------------------------------------------------------------
 1 | # get snp data
 2 | # get reference genome
 3 | # compute gpn and sb for snps
 4 | 
 5 | 
 6 | rule get_genome:
 7 |     output:
 8 |         fasta="results/get_genome/genome.fna",
 9 |     conda:
10 |         "../envs/get_genome.yaml"
11 |     message:
12 |         """--- Downloading genome sequence."""
13 |     params:
14 |         ncbi_ftp=lookup(within=config, dpath="get_genome/ncbi_ftp"),
15 |     log:
16 |         "results/get_genome/genome.log",
17 |     shell:
18 |         "wget -O results/get_genome/genome.fna.gz {params.ncbi_ftp} > {log} 2>&1 && "
19 |         "gunzip results/get_genome/genome.fna.gz >> {log} 2>&1"
20 | 


--------------------------------------------------------------------------------
/workflow/scripts/add_surprise_bonus.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from snakemake.script import snakemake
 4 | 
 5 | 
 6 | def add_surprise_bonus(input_path, output_path):
 7 |     df = pd.read_csv(input_path)
 8 |     # Entropy H in base-4
 9 |     df["H"] = -(
10 |         df[["p_a", "p_c", "p_g", "p_t"]]
11 |         .pipe(lambda p: p * (np.log2(p) / 2))
12 |         .sum(axis=1)
13 |     )
14 | 
15 |     # Surprisal I in base-4
16 |     df["I"] = -(np.log2(df["p_alt"]) / 2)
17 |     df["surprise bonus"] = df["H"] - df["I"]
18 | 
19 |     df.to_csv(output_path, index=False)
20 | 
21 | 
22 | add_surprise_bonus(snakemake.input[0], snakemake.output[0])
23 | 


--------------------------------------------------------------------------------
/workflow/scripts/add_gpn_score.py:
--------------------------------------------------------------------------------
 1 | from helpers import parse_dna_substitution
 2 | import pandas as pd
 3 | 
 4 | import gpn.model
 5 | import gpn.pipelines
 6 | from transformers import pipeline
 7 | 
 8 | 
 9 | def get_gpn_score(variant):
10 |     ref, alt = parse_dna_substitution(variant["amino acid change"])
11 | 
12 |     return variant[f"gpn_{alt.lower()}"]
13 | 
14 | 
def add_gpn_score(df):
    """Run the GPN pipeline on every sequence window and add ``gpn_score``.

    Args:
        df: Table with a ``sequence_window`` column and an
            ``amino acid change`` column (read by ``get_gpn_score``).

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index holding the input
        columns, the columns produced by the pipeline and ``gpn_score``.
    """
    gpn_pipeline = pipeline("gpn")

    # NOTE(review): start=256/end=257 presumably restricts scoring to the
    # variant position in the middle of the window - confirm against gpn.
    gpn_scores = pd.concat(
        gpn_pipeline(df["sequence_window"], start=256, end=257, batch_size=8),
        ignore_index=True,
    )

    # gpn_scores carries a fresh 0..n-1 index. A column-wise concat aligns on
    # index labels, so df must use the same index; otherwise a filtered df
    # (non-contiguous labels) would be paired with the wrong rows or NaNs.
    df = pd.concat([df.reset_index(drop=True), gpn_scores], axis=1)

    df["gpn_score"] = df.apply(get_gpn_score, axis=1)
    return df
27 | 


--------------------------------------------------------------------------------
/config/schemas/samples.schema.yaml:
--------------------------------------------------------------------------------
 1 | $schema: "https://json-schema.org/draft/2020-12/schema"
 2 | description: entries for the sample sheet
 3 | properties:
 4 |   sample:
 5 |     type: string
 6 |     description: sample name/identifier
 7 |   condition:
 8 |     type: string
 9 |     description: sample condition that will be compared during differential analysis
10 |   replicate:
11 |     type: integer
12 |     default: 1
13 |     description: consecutive numbers representing multiple replicates of one condition
14 |   read1:
15 |     type: string
16 |     description: names of fastq.gz files, read 1
17 |   read2:
18 |     type: string
19 |     description: names of fastq.gz files, read 2 (optional)
20 | 
21 | required:
22 |   - sample
23 |   - condition
24 |   - replicate
25 |   - read1
26 | 


--------------------------------------------------------------------------------
/config/schemas/config.schema.yaml:
--------------------------------------------------------------------------------
 1 | $schema: "https://json-schema.org/draft/2020-12/schema"
 2 | description: main configuration schema for the workflow
 3 | properties:
 4 |   samplesheet:
 5 |     type: string
 6 |     description: path to sample-sheet TSV file
 7 | 
 8 |   get_genome:
 9 |     type: object
10 |     properties:
11 |       ncbi_ftp:
12 |         type: string
13 |         description: URL for genome retrieval
14 |     required: ["ncbi_ftp"]
15 | 
16 |   simulate_reads:
17 |     type: object
18 |     properties:
19 |       read_length:
20 |         type: number
21 |         description: length of target reads in bp
22 |       read_number:
23 |         type: number
24 |         description: number of total reads to be simulated
25 | 
26 | required:
27 |   - samplesheet
28 |   - get_genome
29 |   - simulate_reads
30 | 


--------------------------------------------------------------------------------
/workflow/scripts/helpers.py:
--------------------------------------------------------------------------------
 1 | from Bio.SeqUtils import seq1
 2 | import re
 3 | 
 4 | 
 5 | def parse_aa_substitution(HGVS_string):
 6 |     prot_match = re.match(r"p\.([A-Z]{1}[a-z]{2})\d+([A-Z]{1}[a-z]{2})", HGVS_string)
 7 |     reference, alternative = prot_match.groups()
 8 |     return seq1(reference), seq1(alternative)
 9 | 
10 | 
def parse_dna_substitution(HGVS_string):
    """Extract reference and alternative base from an HGVS coding DNA change.

    Args:
        HGVS_string: Text ending in a single-base substitution such as
            ``c.46T>G``; it may be preceded by other content
            (e.g. ``p.Leu16Val/c.46T>G``).

    Returns:
        Tuple ``(reference, alternative)`` of single upper-case bases.

    Raises:
        ValueError: If the string does not end in a ``c.<...><ref>><alt>``
            substitution over the alphabet ``A``, ``T``, ``C``, ``G``.
    """
    dna_match = re.search(r"c\..*([ATCG])>([ATCG])$", HGVS_string)
    if dna_match is None:
        # Report the offending value instead of failing on None.groups().
        raise ValueError(
            f"Not a DNA substitution of the form 'c.<pos><ref>><alt>': {HGVS_string!r}"
        )
    reference, alternative = dna_match.groups()
    return reference, alternative
15 | 
16 | 
def get_start_end_from_seq(seq):
    """Read the start and end coordinates encoded in a sequence id.

    The third ``|``-separated field of ``seq.id`` is expected to have the
    form ``<chrom>:<start>-<end>``.

    Returns:
        Tuple ``(start, end)`` as integers.
    """
    location = seq.id.split("|")[2]
    _chrom, span = location.split(":")
    first, last = span.split("-")
    return int(first), int(last)
22 | 
23 | 
24 | # parse_dna_substitution("p.Leu16Val/c.46T>G")
25 | 
26 | # parse_aa_substitution("p.Leu16Val/c.46T>G")
27 | 


--------------------------------------------------------------------------------
/workflow/scripts/validate_fasta.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from Bio import SeqIO
 3 | from snakemake.script import snakemake
 4 | 
 5 | sys.stderr = open(snakemake.log[0], "w", buffering=1)
 6 | 
 7 | 
 8 | def validate_fasta(input_fasta, output_fasta):
 9 |     try:
10 |         with open(input_fasta, "r") as fasta_file:
11 |             records = list(SeqIO.parse(fasta_file, "fasta"))
12 |             if not records:
13 |                 raise ValueError("FASTA file is empty or improperly formatted.")
14 |             else:
15 |                 summary = [f"Validated sequence records for {output_fasta}:"]
16 |                 summary += [f"{i.name}: {i.description}" for i in records]
17 |         with open(output_fasta, "w") as validated_file:
18 |             SeqIO.write(records, validated_file, "fasta")
19 |         sys.stderr.write("\n".join(summary))
20 |     except Exception as e:
21 |         sys.stderr.write(f"Validation failed: {e}\n")
22 |         raise
23 | 
24 | 
25 | validate_fasta(snakemake.input["fasta"], snakemake.output["fasta"])
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021, AUTHORS
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/workflow/scripts/get_1001_genome_snps.py:
--------------------------------------------------------------------------------
 1 | from snakemake.script import snakemake
 2 | import requests
 3 | import pandas as pd
 4 | 
 5 | 
 6 | def get_1001_genome_snps(base_url, gene_id, output_path):
 7 |     url = f"{base_url};gid={gene_id}"
 8 |     snp_effects_response = requests.get(url)
 9 | 
10 |     df = pd.DataFrame(
11 |         snp_effects_response.json()["data"],
12 |         columns=[
13 |             "chromosome",
14 |             "position",
15 |             "accession id",
16 |             "type",
17 |             "effect impact",
18 |             "functional class",
19 |             "codon change",
20 |             "amino acid change",
21 |             "amino acid length",
22 |             "gene name",
23 |             "transcript biotype",
24 |             "gene coding",
25 |             "transcript id",
26 |             "exon rank",
27 |         ],
28 |     )
29 | 
30 |     df.to_csv(output_path[0], index=False)
31 | 
32 |     df = df.drop_duplicates(subset=["amino acid change"])
33 | 
34 |     df.to_csv(output_path[1], index=False)
35 | 
36 | 
37 | get_1001_genome_snps(
38 |     snakemake.params.base_url, snakemake.wildcards.gene_id, snakemake.output
39 | )
40 | 


--------------------------------------------------------------------------------
/workflow/rules/ncbi.smk:
--------------------------------------------------------------------------------
 1 | localrules:
 2 |     download_ncbi_dataset,
 3 |     unpack_ncbi_dataset,
 4 | 
 5 | 
# Download the NCBI dataset archive (genome FASTA and GFF3 annotation) for the
# accession given by the `accession` wildcard, using the `datasets` CLI.
rule download_ncbi_dataset:
    output:
        "resources/download_ncbi_dataset/{accession}.zip",
    conda:
        "../envs/download_ncbi_dataset.yaml"
    message:
        """--- Downloading NCBI dataset for {wildcards.accession}."""
    shell:
        "datasets download genome accession {wildcards.accession} --include genome,gff3 --filename {output}"
15 | 
16 | 
# Unpack a downloaded NCBI dataset archive and move the genome FASTA and the
# GFF3 annotation to stable, accession-specific paths.
rule unpack_ncbi_dataset:
    input:
        "resources/download_ncbi_dataset/{accession}.zip",
    output:
        "resources/unpack_ncbi_dataset/{accession}/genome.fna",
        "resources/unpack_ncbi_dataset/{accession}/annotation.gff",
    params:
        # location of the data folder inside the archive
        data_subdir="ncbi_dataset/data",
    # The archive is unpacked into a private temporary directory (removed on
    # exit) rather than the shared, fixed path /tmp/ncbi_dataset, so that
    # concurrent or repeated jobs cannot overwrite each other's files and
    # unzip never has to ask about replacing existing files.
    # The file paths are looked up in dataset_catalog.json.
    # NOTE(review): `.assemblies[1]` assumes the second catalog entry is the
    # requested assembly - confirm for multi-assembly archives.
    shell:
        "unpack_dir=$(mktemp -d)"
        " && trap 'rm -rf \"$unpack_dir\"' EXIT"
        " && unzip {input} -d \"$unpack_dir\""
        " && data_path=\"$unpack_dir/{params.data_subdir}\""
        " && genome_path=$(jq -r '.assemblies[1].files | .[] | select(.fileType==\"GENOMIC_NUCLEOTIDE_FASTA\").filePath' \"$data_path/dataset_catalog.json\")"
        " && annotation_path=$(jq -r '.assemblies[1].files | .[] | select(.fileType==\"GFF3\").filePath' \"$data_path/dataset_catalog.json\")"
        " && mv \"$data_path/$genome_path\" {output[0]}"
        " && mv \"$data_path/$annotation_path\" {output[1]}"
31 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 |   pull_request:
 7 |     branches: [main]
 8 | 
 9 | jobs:
10 |   Formatting:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |         with:
15 |           fetch-depth: 0
16 |       - name: Formatting
17 |         uses: super-linter/super-linter@v7
18 |         env:
19 |           VALIDATE_ALL_CODEBASE: false
20 |           DEFAULT_BRANCH: main
21 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
22 |           VALIDATE_SNAKEMAKE_SNAKEFMT: true
23 |           VALIDATE_YAML_PRETTIER: true
24 | 
25 |   Linting:
26 |     runs-on: ubuntu-latest
27 |     steps:
28 |       - uses: actions/checkout@v4
29 |       - name: Lint workflow
30 |         uses: snakemake/snakemake-github-action@v2
31 |         with:
32 |           directory: .
33 |           snakefile: workflow/Snakefile
34 |           args: "--lint"
35 | 
36 |   Testing:
37 |     runs-on: ubuntu-latest
38 |     needs:
39 |       - Linting
40 |       - Formatting
41 |     steps:
42 |       - uses: actions/checkout@v4
43 | 
44 |       - name: Test workflow
45 |         uses: snakemake/snakemake-github-action@v2
46 |         with:
47 |           directory: .test
48 |           snakefile: workflow/Snakefile
49 |           args: "--sdm conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --all-temp"
50 | 
51 |       - name: Test report
52 |         uses: snakemake/snakemake-github-action@v2
53 |         with:
54 |           directory: .test
55 |           snakefile: workflow/Snakefile
56 |           args: "--report report.zip"
57 | 


--------------------------------------------------------------------------------
/workflow/scripts/add_sequence_window.py:
--------------------------------------------------------------------------------
 1 | from helpers import parse_dna_substitution
 2 | from Bio import SeqIO
 3 | import pandas as pd
 4 | from gffutils import FeatureDB
 5 | from snakemake.script import snakemake
 6 | 
 7 | 
 8 | def get_substitution_dna_window(variant, gene_sequence, gene_annotation):
 9 |     in_gene_position = variant["position"] - gene_annotation.start
10 | 
11 |     ref, _alt = parse_dna_substitution(variant["amino acid change"])
12 | 
13 |     assert ref == gene_sequence[in_gene_position].upper()
14 | 
15 |     # fails for positions close to start or end of gene
16 |     return str(gene_sequence[in_gene_position - 256 : in_gene_position + 256].seq)
17 | 
18 | 
19 | def add_sequence_window(
20 |     snp_effect_path, reference_fasta_path, annotation_db_path, output_path
21 | ):
22 |     annotation = FeatureDB(annotation_db_path)
23 |     genome = SeqIO.to_dict(SeqIO.parse(reference_fasta_path, "fasta"))
24 | 
25 |     gene_annotation_id = f"gene-{snakemake.wildcards[0].split('.')[0]}"
26 |     gene_annotation = annotation[gene_annotation_id]
27 | 
28 |     # compensate for 1-based indexing in gff
29 |     gene_sequence = genome[gene_annotation.seqid][
30 |         gene_annotation.start - 1 : gene_annotation.end
31 |     ]
32 | 
33 |     df = pd.read_csv(snp_effect_path)
34 | 
35 |     # move data cleaning into seperate rule
36 |     df = df.dropna(subset=["amino acid change"])
37 |     df = df[~df["amino acid change"].str.contains("ins", na=False)]
38 |     df = df[~df["amino acid change"].str.contains("del", na=False)]
39 | 
40 |     df["sequence_window"] = df.apply(
41 |         lambda x: get_substitution_dna_window(x, gene_sequence, gene_annotation), axis=1
42 |     )
43 | 
44 |     df.to_csv(output_path, index=False)
45 | 
46 | 
47 | add_sequence_window(
48 |     snakemake.input[2], snakemake.input[0], snakemake.input[1], snakemake.output[0]
49 | )
50 | 


--------------------------------------------------------------------------------
/config/README.md:
--------------------------------------------------------------------------------
 1 | ## Workflow overview
 2 | 
 3 | This workflow is a best-practice workflow for `<detailed description>`.
 4 | The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps:
 5 | 
 6 | 1. Download genome reference from NCBI
 7 | 2. Validate downloaded genome (`python` script)
 8 | 3. Simulate short read sequencing data on the fly (`dwgsim`)
 9 | 4. Check quality of input read data (`FastQC`)
10 | 5. Collect statistics from tool output (`MultiQC`)
11 | 
12 | ## Running the workflow
13 | 
14 | ### Input data
15 | 
16 | This template workflow creates artificial sequencing data in `*.fastq.gz` format.
17 | It does not contain actual input data.
18 | The simulated input files are nevertheless created based on a mandatory table linked in the `config.yaml` file (default: `.test/config/samples.tsv`).
19 | The sample sheet has the following layout:
20 | 
21 | | sample  | condition | replicate | read1                      | read2                      |
22 | | ------- | --------- | --------- | -------------------------- | -------------------------- |
23 | | sample1 | wild_type | 1         | sample1.bwa.read1.fastq.gz | sample1.bwa.read2.fastq.gz |
24 | | sample2 | wild_type | 2         | sample2.bwa.read1.fastq.gz | sample2.bwa.read2.fastq.gz |
25 | 
26 | ### Parameters
27 | 
28 | This table lists all parameters that can be used to run the workflow.
29 | 
30 | | parameter          | type | details                               | default                        |
31 | | ------------------ | ---- | ------------------------------------- | ------------------------------ |
32 | | **samplesheet**    |      |                                       |                                |
33 | | path               | str  | path to samplesheet, mandatory        | "config/samples.tsv"           |
34 | | **get_genome**     |      |                                       |                                |
35 | | ncbi_ftp           | str  | link to a genome on NCBI's FTP server | link to _S. cerevisiae_ genome |
36 | | **simulate_reads** |      |                                       |                                |
37 | | read_length        | num  | length of target reads in bp          | 100                            |
38 | | read_number        | num  | number of total reads to be simulated | 10000                          |
39 | 


--------------------------------------------------------------------------------
/workflow/Snakefile:
--------------------------------------------------------------------------------
  1 | # Main entrypoint of the workflow.
  2 | # Please follow the best practices:
  3 | # https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html,
  4 | # in particular regarding the standardized folder structure mentioned there.
  5 | 
  6 | 
  7 | # load configuration
  8 | # -----------------------------------------------------
  9 | configfile: "config/config.yaml"
 10 | 
 11 | 
 12 | # load rules
 13 | # -----------------------------------------------------
 14 | include: "rules/ncbi.smk"
 15 | 
 16 | 
 17 | # local rules
 18 | # -----------------------------------------------------
 19 | localrules:
 20 |     all,
 21 |     get_1001_genome_snps,
 22 |     create_annotation_db,
 23 |     add_sequence_window,
 24 | 
 25 | 
 26 | # optional messages, log and error handling
 27 | # -----------------------------------------------------
 28 | onstart:
 29 |     print("\n--- Analysis started ---\n")
 30 | 
 31 | 
 32 | onsuccess:
 33 |     print("\n--- Workflow finished! ---\n")
 34 | 
 35 | 
 36 | onerror:
 37 |     print("\n--- An error occurred! ---\n")
 38 | 
 39 | 
 40 | # target rules
 41 | # -----------------------------------------------------
# Default target: requests the sequence-window table of every gene id listed
# under `all/gene_ids` in the config.
rule all:
    input:
        # (currently disabled) score tables per model and gene id
        # expand(
        #     "results/scores/{model}/{gene_id}/scores.csv",
        #     model=lookup(within=config, dpath="all/model"),
        #     gene_id=lookup(within=config, dpath="all/gene_ids"),
        # ),
        expand(
            "results/add_sequence_window/{gene_id}/snp_effects_with_sequence_window.csv",
            gene_id=lookup(within=config, dpath="all/gene_ids"),
        ),
    default_target: True
 54 | 
 55 | 
# Fetch the SNP effects of one gene from the URL configured under
# `get_1001_genome_snps/base_url`; the script writes the full table and a
# version de-duplicated on the amino acid change (outputs in that order).
rule get_1001_genome_snps:
    output:
        "resources/get_1001_genome_snps/{gene_id}/snp_effects.csv",
        "resources/get_1001_genome_snps/{gene_id}/snp_effects_deduplicated.csv",
    conda:
        "envs/get_1001_genome_snps.yaml"
    message:
        """--- Downloading 1001 genome SNP effects."""
    params:
        base_url=lookup(within=config, dpath="get_1001_genome_snps/base_url"),
    script:
        "scripts/get_1001_genome_snps.py"
 68 | 
 69 | 
# Turn the unpacked GFF annotation of an accession into a gffutils database.
rule create_annotation_db:
    input:
        "resources/unpack_ncbi_dataset/{accession}/annotation.gff",
    output:
        "resources/create_annotation_db/{accession}/annotation.db",
    conda:
        "envs/create_annotation_db.yaml"
    script:
        "scripts/create_annotation_db.py"
 79 | 
 80 | 
# Attach a sequence window to every SNP effect of a gene.
# The script reads the inputs by position, so their order matters:
# [0] genome FASTA, [1] annotation database, [2] de-duplicated SNP effects.
rule add_sequence_window:
    input:
        # genome and annotation of the accession configured as `ncbi/tair10.1`
        expand(
            [
                "resources/unpack_ncbi_dataset/{accession}/genome.fna",
                "resources/create_annotation_db/{accession}/annotation.db",
            ],
            accession=lookup(within=config, dpath="ncbi/tair10.1"),
        ),
        "resources/get_1001_genome_snps/{gene_id}/snp_effects_deduplicated.csv",
    output:
        "results/add_sequence_window/{gene_id}/snp_effects_with_sequence_window.csv",
    conda:
        "envs/add_sequence_window.yaml"
    message:
        """--- Adding sequence windows to SNP effects."""
    script:
        "scripts/add_sequence_window.py"
 99 | 
100 | 
101 | # conda:
102 | #     "envs/notebook.yaml"
103 | # notebook:
104 | #     "notebooks/explore.py.ipynb"
105 | # snakemake --sdm conda --cores 1 --edit-notebook results/add_sequence_window/AT2G19110.1/snp_effects_with_sequence_window.csv
106 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Snakemake workflow: `<name>`
 2 | 
 3 | [![Snakemake](https://img.shields.io/badge/snakemake-≥8.0.0-brightgreen.svg)](https://snakemake.github.io)
 4 | [![GitHub actions status](https://github.com/<owner>/<repo>/workflows/Tests/badge.svg?branch=main)](https://github.com/<owner>/<repo>/actions?query=branch%3Amain+workflow%3ATests)
 5 | [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 6 | [![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/<owner>/<repo>)
 7 | 
 8 | A Snakemake workflow for `<description>`
 9 | 
10 | - [Snakemake workflow: `<name>`](#snakemake-workflow-name)
11 |   - [Usage](#usage)
12 |   - [Deployment options](#deployment-options)
13 |   - [Authors](#authors)
14 |   - [References](#references)
15 |   - [TODO](#todo)
16 | 
17 | ## Usage
18 | 
19 | The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/<owner>/<repo>).
20 | 
21 | Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md).
22 | 
23 | If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository or its DOI.
24 | 
25 | ## Deployment options
26 | 
27 | To run the workflow from the command line, change the working directory to the root of the workflow repository:
28 | 
29 | ```bash
30 | cd path/to/snakemake-workflow-name
31 | ```
32 | 
33 | Adjust options in the default config file `config/config.yaml`.
34 | Before running the complete workflow, you can perform a dry run using:
35 | 
36 | ```bash
37 | snakemake --dry-run
38 | ```
39 | 
40 | To run the workflow with test files using **conda**:
41 | 
42 | ```bash
43 | snakemake --cores 2 --sdm conda --directory .test
44 | ```
45 | 
46 | To run the workflow with **apptainer** / **singularity**, add a link to a container registry in the `Snakefile`, for example `container: "oras://ghcr.io/<user>/<repository>:<version>"` for GitHub's container registry.
47 | Run the workflow with:
48 | 
49 | ```bash
50 | snakemake --cores 2 --sdm conda apptainer --directory .test
51 | ```
52 | 
53 | ## Authors
54 | 
55 | - Firstname Lastname
56 |   - Affiliation
57 |   - ORCID profile
58 |   - home page
59 | 
60 | ## References
61 | 
62 | > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2.
63 | 
64 | ## TODO
65 | 
66 | - Replace `<owner>` and `<repo>` everywhere in the template with the correct user name/organization, and the repository name. The workflow will be automatically added to the [snakemake workflow catalog](https://snakemake.github.io/snakemake-workflow-catalog/index.html) once it is publicly available on GitHub.
67 | - Replace `<name>` with the workflow name (can be the same as `<repo>`).
68 | - Replace `<description>` with a description of what the workflow does.
69 | - Update the [deployment](#deployment-options), [authors](#authors) and [references](#references) sections.
70 | - Update the `README.md` badges. Add or remove badges for `conda`/`singularity`/`apptainer` usage depending on the workflow's [deployment](#deployment-options) options.
71 | - Do not forget to also adjust the configuration-specific `config/README.md` file.
72 | 


--------------------------------------------------------------------------------
/workflow/rules/process_reads.smk:
--------------------------------------------------------------------------------
  1 | # ----------------------------------------------------- #
  2 | # EXAMPLE WORKFLOW                                      #
  3 | # ----------------------------------------------------- #
  4 | 
  5 | 
  6 | # fetch genome sequence from NCBI: download the gzipped FASTA from the URL in config (get_genome/ncbi_ftp) to "<output.fasta>.gz" and gunzip it in place
  7 | # -----------------------------------------------------
  8 | rule get_genome:
  9 |     output:
 10 |         fasta="results/get_genome/genome.fna",  # the shell command derives every path from this entry
 11 |     conda:
 12 |         "../envs/get_genome.yaml"
 13 |     message:
 14 |         """--- Downloading genome sequence."""
 15 |     params:
 16 |         ncbi_ftp=lookup(within=config, dpath="get_genome/ncbi_ftp"),  # inserted shell-quoted (:q) so URL metacharacters such as & or ? are safe
 17 |     log:
 18 |         "results/get_genome/genome.log",  # receives stdout/stderr of wget, then of gunzip (appended)
 19 |     shell:
 20 |         "wget -O {output.fasta}.gz {params.ncbi_ftp:q} > {log} 2>&1 && "
 21 |         "gunzip {output.fasta}.gz >> {log} 2>&1"
 22 | 
 23 | 
 24 | # validate genome sequence file: runs scripts/validate_fasta.py on the FASTA produced by rule get_genome
 25 | # -----------------------------------------------------
 26 | rule validate_genome:
 27 |     input:
 28 |         fasta=rules.get_genome.output.fasta,  # output of rule get_genome
 29 |     output:
 30 |         fasta="results/validate_genome/genome.fna",  # input of rule simulate_reads
 31 |     conda:
 32 |         "../envs/validate_genome.yaml"
 33 |     message:
 34 |         """--- Validating genome sequence file."""
 35 |     log:
 36 |         "results/validate_genome/genome.log",
 37 |     script:
 38 |         "../scripts/validate_fasta.py"
 39 | 
 40 | 
 41 | # simulate read data using DWGSIM; the DWGSIM output prefix is output.read1 minus its ".bwa.read1.fastq.gz" suffix, so dots in {sample} or in the directory path are preserved
 42 | # -----------------------------------------------------
 43 | rule simulate_reads:
 44 |     input:
 45 |         fasta=rules.validate_genome.output.fasta,  # validated genome FASTA
 46 |     output:
 47 |         multiext(
 48 |             "results/simulate_reads/{sample}",
 49 |             read1=".bwa.read1.fastq.gz",
 50 |             read2=".bwa.read2.fastq.gz",
 51 |         ),
 52 |     conda:
 53 |         "../envs/simulate_reads.yaml"
 54 |     message:
 55 |         """--- Simulating read data with DWGSIM."""
 56 |     params:
 57 |         output_type=1,  # passed to dwgsim -o
 58 |         read_length=lookup(within=config, dpath="simulate_reads/read_length"),  # used for both -1 and -2
 59 |         read_number=lookup(within=config, dpath="simulate_reads/read_number"),  # passed to dwgsim -N
 60 |     log:
 61 |         "results/simulate_reads/{sample}.log",
 62 |     shell:
 63 |         "output_prefix={output.read1}; output_prefix=${{output_prefix%.bwa.read1.fastq.gz}};"
 64 |         "dwgsim "
 65 |         " -1 {params.read_length}"
 66 |         " -2 {params.read_length}"
 67 |         " -N {params.read_number}"
 68 |         " -o {params.output_type}"
 69 |         " {input.fasta}"
 70 |         " ${{output_prefix}}"
 71 |         " > {log} 2>&1"
 72 | 
 73 | 
 74 | # make QC report: one FastQC run per {sample}/{read} FASTQ file, executed through the v6.0.0 fastqc wrapper
 75 | # -----------------------------------------------------
 76 | rule fastqc:
 77 |     input:
 78 |         fastq="results/simulate_reads/{sample}.bwa.{read}.fastq.gz",  # same naming pattern as the simulate_reads outputs
 79 |     output:
 80 |         html="results/fastqc/{sample}.bwa.{read}_fastqc.html",
 81 |         zip="results/fastqc/{sample}.bwa.{read}_fastqc.zip",  # rule multiqc requests both the html and the zip
 82 |     params:
 83 |         extra="--quiet",  # presumably forwarded to fastqc by the wrapper — confirm in the wrapper docs
 84 |     message:
 85 |         """--- Checking fastq files with FastQC."""
 86 |     log:
 87 |         "results/fastqc/{sample}.bwa.{read}.log",
 88 |     threads: 1
 89 |     wrapper:
 90 |         "v6.0.0/bio/fastqc"
 91 | 
 92 | 
 93 | # run multiQC on tool output: collects the FastQC html/zip files of every sample and read into one report, through the v6.0.0 multiqc wrapper
 94 | # -----------------------------------------------------
 95 | rule multiqc:
 96 |     input:
 97 |         expand(
 98 |             "results/fastqc/{sample}.bwa.{read}_fastqc.{ext}",
 99 |             sample=samples.index,  # `samples` is defined outside this file
100 |             read=["read1", "read2"],
101 |             ext=["html", "zip"],
102 |         ),
103 |     output:
104 |         report="results/multiqc/multiqc_report.html",
105 |     params:
106 |         extra="--verbose --dirs",  # presumably forwarded to multiqc by the wrapper — confirm in the wrapper docs
107 |     message:
108 |         """--- Generating MultiQC report for seq data."""
109 |     log:
110 |         "results/multiqc/multiqc.log",
111 |     wrapper:
112 |         "v6.0.0/bio/multiqc"
113 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## [1.1.0](https://github.com/snakemake-workflows/snakemake-workflow-template/compare/v1.0.0...v1.1.0) (2025-07-29)
 4 | 
 5 | 
 6 | ### Features
 7 | 
 8 | * complete minimal workflow as template ([2348055](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/234805535a6353a3db59d5bba0a4b38fe8194d97))
 9 | * complete, reproducible example workflow ([1dfa7ad](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dfa7adb0120880ae5e85c57551d5e698a057497))
10 | * larger update to feature fully-functional example and github actions ([93c08fc](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/93c08fc9db2f8619af7b90784db83d18ed656f25))
11 | * major simplification of rules, replacement of others by wrappers ([3811ef7](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/3811ef796df4fe38fb7161f9a1b06fac9db86d5b))
12 | * major simplification of template and update docs ([81ee089](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/81ee08989857366893593a333615523f05295f87))
13 | * replaced get genome script with simple shell command ([9208995](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9208995b78433ce3680a0b0e453ddcf5915abcef))
14 | * update github actions workflow in linting part ([27d53ee](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/27d53eecfad935f50bc62a30248141891a4329ee))
15 | * update github actions workflow. check formatting of yaml files using prettier ([9f5131b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9f5131bf0eeaf1eb7fb0937b2840f73db2a02724))
16 | * updated all GH actions to latest versions ([4d7b3a2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/4d7b3a2b143c304b6dcf487664c392c4a5e98f74))
17 | * updated github actions workflow ([fd36648](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fd3664841b830ae670549aabb214eb6004aa696d))
18 | * updated github actions workflow ([7a3a40e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/7a3a40e62df01b37a802a085e7210014eb3fba82))
19 | 
20 | 
21 | ### Bug Fixes
22 | 
23 | * 2nd attempt to fix release please wf ([f81847f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/f81847fdfd39d99e795006da4f84701ee6ba8ddc))
24 | * added usage docs ([776b97e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/776b97e3d0e928d98f4c48e619090b47f702dcab))
25 | * all-temp needs explicit input of multiqc zip dirs ([026c35a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/026c35aebfb140746bc823ce06327e25c9a40cf1))
26 | * change release type to 'go', fixes release please wf ([658c784](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/658c784ab5d70b117ce9dd386f5b07f8e4ff782d))
27 | * change release type to 'go', fixes release please wf ([a81ab9d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/a81ab9def05667e23c5e59ac881c7a57b9f1b767))
28 | * code review issues ([97faf1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/97faf1ae8bde189094e6b46568f3911f01b625fd))
29 | * dont remove temp files for test runs ([0c2c8d1](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0c2c8d19c51648872d09a8f697826b9445bafc81))
30 | * formatting, logging ([d6c819e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/d6c819efcadde1ad4af342152d3aef2a982983d0))
31 | * lint error and docs update ([cf59f11](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/cf59f11acc11c01866ad56971fd132661f4f32be))
32 | * recommended `.yaml` file extension, latest schema version ([e649e12](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/e649e12be9c447e8c366847ddf3531e216306c97))
33 | * release please workflow requires additional permission ([0993271](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0993271f0077e5a548755679b2b8952d18795580))
34 | * release please workflow requires additional permission ([3651295](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/36512953f851611f18676a4f18e6e5684932ef61))
35 | * removed unused templates, update catalog yml ([b5c292f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b5c292ff4b476441d8068ca8013e3b931d30fc04))
36 | * revert to GitHub Actions status badge requiring `owner` and `repo` set by user ([dd163f3](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/dd163f33a5299ecbeb10eb019ef5e8c727f0422a))
37 | * snakefmt error ([70d670a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/70d670a91c79c0a9d89c59fff6add3f1036753a3))
38 | * update release-please GH workflow ([1dad25d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dad25da5de222982b0cdf35a91be6ecc5a81a42))
39 | * update release-please GH workflow ([0ea4df2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0ea4df2f746e0fc760c06a3b902e2ee8bdf2ff42))
40 | * update snakemake action ([fac8662](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fac8662193fa501fdfc2f3bb94e7549b96dec500))
41 | * updated schemas and params docs ([facf377](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/facf377a7cc107b3e8db0793b21027a9f3df0eeb))
42 | * updates to enable release-please action again ([8d9552b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/8d9552b8369ca6b115ee00777f45cf641312dde3))
43 | * use recommended `.yaml` file extension (https://www.yaml.info/learn/bestpractices.html#file) ([dc3dc1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/dc3dc1aa798a009644f938ef41df02f370e09466))
44 | * various changes to formatting and example rules ([b9b2366](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b9b236645ad961cd7a8886c1697b27f3694ee047))
45 | 
46 | ## 1.0.0 (2025-05-07)
47 | 
48 | 
49 | ### Features
50 | 
51 | * complete minimal workflow as template ([2348055](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/234805535a6353a3db59d5bba0a4b38fe8194d97))
52 | * complete, reproducible example workflow ([1dfa7ad](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dfa7adb0120880ae5e85c57551d5e698a057497))
53 | * larger update to feature fully-functional example and github actions ([93c08fc](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/93c08fc9db2f8619af7b90784db83d18ed656f25))
54 | * major simplification of rules, replacement of others by wrappers ([3811ef7](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/3811ef796df4fe38fb7161f9a1b06fac9db86d5b))
55 | * major simplification of template and update docs ([81ee089](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/81ee08989857366893593a333615523f05295f87))
56 | * replaced get genome script with simple shell command ([9208995](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9208995b78433ce3680a0b0e453ddcf5915abcef))
57 | * update github actions workflow in linting part ([27d53ee](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/27d53eecfad935f50bc62a30248141891a4329ee))
58 | * update github actions workflow. check formatting of yaml files using prettier ([9f5131b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/9f5131bf0eeaf1eb7fb0937b2840f73db2a02724))
59 | * updated all GH actions to latest versions ([4d7b3a2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/4d7b3a2b143c304b6dcf487664c392c4a5e98f74))
60 | * updated github actions workflow ([fd36648](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fd3664841b830ae670549aabb214eb6004aa696d))
61 | * updated github actions workflow ([7a3a40e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/7a3a40e62df01b37a802a085e7210014eb3fba82))
62 | 
63 | 
64 | ### Bug Fixes
65 | 
66 | * 2nd attempt to fix release please wf ([f81847f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/f81847fdfd39d99e795006da4f84701ee6ba8ddc))
67 | * added usage docs ([776b97e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/776b97e3d0e928d98f4c48e619090b47f702dcab))
68 | * all-temp needs explicit input of multiqc zip dirs ([026c35a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/026c35aebfb140746bc823ce06327e25c9a40cf1))
69 | * change release type to 'go', fixes release please wf ([658c784](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/658c784ab5d70b117ce9dd386f5b07f8e4ff782d))
70 | * change release type to 'go', fixes release please wf ([a81ab9d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/a81ab9def05667e23c5e59ac881c7a57b9f1b767))
71 | * code review issues ([97faf1a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/97faf1ae8bde189094e6b46568f3911f01b625fd))
72 | * dont remove temp files for test runs ([0c2c8d1](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0c2c8d19c51648872d09a8f697826b9445bafc81))
73 | * formatting, logging ([d6c819e](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/d6c819efcadde1ad4af342152d3aef2a982983d0))
74 | * lint error and docs update ([cf59f11](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/cf59f11acc11c01866ad56971fd132661f4f32be))
75 | * removed unused templates, update catalog yml ([b5c292f](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b5c292ff4b476441d8068ca8013e3b931d30fc04))
76 | * snakefmt error ([70d670a](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/70d670a91c79c0a9d89c59fff6add3f1036753a3))
77 | * update release-please GH workflow ([1dad25d](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/1dad25da5de222982b0cdf35a91be6ecc5a81a42))
78 | * update release-please GH workflow ([0ea4df2](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/0ea4df2f746e0fc760c06a3b902e2ee8bdf2ff42))
79 | * update snakemake action ([fac8662](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/fac8662193fa501fdfc2f3bb94e7549b96dec500))
80 | * updated schemas and params docs ([facf377](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/facf377a7cc107b3e8db0793b21027a9f3df0eeb))
81 | * updates to enable release-please action again ([8d9552b](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/8d9552b8369ca6b115ee00777f45cf641312dde3))
82 | * various changes to formatting and example rules ([b9b2366](https://github.com/snakemake-workflows/snakemake-workflow-template/commit/b9b236645ad961cd7a8886c1697b27f3694ee047))
83 | 


--------------------------------------------------------------------------------
/workflow/notebooks/explore.py.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "1c3850be-a729-4c33-ba10-c796d209e4dd",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from Bio import SeqIO\n",
 11 |     "import pandas as pd\n",
 12 |     "import gffutils"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": null,
 18 |    "id": "e3ca43d5-7ed1-4be3-afd5-722afb0630a7",
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "# Genome\n",
 23 |     "for record in SeqIO.parse(snakemake.input[0], \"fasta\"):\n",
 24 |     "    print(record.id)"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "b24fa229-9290-4a71-8674-9239a720ed7c",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "genome = SeqIO.to_dict(SeqIO.parse(snakemake.input[0], \"fasta\"))"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "id": "51ccbf9a-bece-40a4-948b-1a8660764bee",
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# Annotation\n",
 45 |     "\n",
 46 |     "db = gffutils.FeatureDB(snakemake.input[1])\n",
 47 |     "\n",
 48 |     "db"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "id": "84e54089-109b-419c-8741-608e31153b3b",
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "list(db.featuretypes())"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "id": "49f07583-6af1-40a4-8d5c-d1efbda6fb67",
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "list(db.features_of_type(\"gene\"))"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "id": "659c9eca-46bd-4a3c-8a5b-a6ece7f38fcf",
 75 |    "metadata": {},
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "for row in db.execute(\"SELECT name FROM sqlite_master WHERE type='table';\"):\n",
 79 |     "    print(row['name'])"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "id": "f63c6672-2a23-4c85-9e83-5f58744a13d2",
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "for row in db.execute(\"pragma table_info(features);\"):\n",
 90 |     "    print(row['name'])"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "id": "ba9861d1-d4b6-4d7c-ab0b-37955f76b3e2",
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "for row in db.execute(\"select id from features where id like '%AT2G19110%';\"):\n",
101 |     "    print(row['id'])"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "id": "17afaa2b-fbdc-4a34-be67-c7b244a5c3b3",
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "list(db['gene-AT2G19110'].attributes)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "id": "1c021e1d-7a5c-4d65-8747-05ea05cf4d7f",
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "hma4_annotation = db['gene-AT2G19110']"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "id": "c294d0a6-58f0-4221-bf4b-e799dc4de4fc",
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "hma4_annotation.seqid"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "id": "0d9ef868-a8d2-4e81-818e-6618f3e6239c",
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "genome[hma4_annotation.seqid][hma4_annotation.start-1:hma4_annotation.end].seq"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "id": "0264e3c3",
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "# start coding here\n",
152 |     "\n",
153 |     "annotation = gffutils.FeatureDB(snakemake.input[1])\n",
154 |     "\n",
155 |     "genome = SeqIO.to_dict(SeqIO.parse(snakemake.input[0], \"fasta\"))\n",
156 |     "\n",
157 |     "gene_annotation_id = f\"gene-{snakemake.wildcards[0].split('.')[0]}\"\n",
158 |     "\n",
159 |     "gene_annotation = annotation[gene_annotation_id]\n",
160 |     "\n",
161 |     "gene_sequence = genome[gene_annotation.seqid][gene_annotation.start-1:gene_annotation.end]\n",
162 |     "\n",
163 |     "df = pd.read_csv(snakemake.input[2])\n",
164 |     "\n",
165 |     "df"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "id": "314d82b6-469c-49bb-89c5-7c0f6815d8fd",
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "def get_substitution_dna_window(variant, gene_sequence, gene_annotation):\n",
176 |     "    in_gene_position = variant[\"position\"] - gene_annotation.start\n",
177 |     "\n",
178 |     "    ref, alt = parse_dna_substitution(variant[\"amino acid change\"])\n",
179 |     "\n",
180 |     "    assert ref == gene_sequence[in_gene_position].upper()\n",
181 |     "    \n",
182 |     "    return str(gene_sequence[in_gene_position - 256 : in_gene_position + 256].seq)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "id": "77d73ede-07a6-4534-aff0-58f05d4b056c",
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "from Bio.SeqUtils import seq1\n",
193 |     "import re\n",
194 |     "\n",
195 |     "\n",
196 |     "def parse_aa_substitution(HGVS_string):\n",
197 |     "    prot_match = re.match(r\"p\\.([A-Z]{1}[a-z]{2})\\d+([A-Z]{1}[a-z]{2})\", HGVS_string)\n",
198 |     "    reference, alternative = prot_match.groups()\n",
199 |     "    return seq1(reference), seq1(alternative)\n",
200 |     "\n",
201 |     "\n",
202 |     "def parse_dna_substitution(HGVS_string):\n",
203 |     "    dna_match = re.search(r\"c\\..*([ATCG])>([ATCG])$\", HGVS_string)\n",
204 |     "    reference, alternative = dna_match.groups()\n",
205 |     "    return reference, alternative\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "id": "2580de4e-db10-4e3b-a7be-0e10a61a5c62",
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "def add_sequence_window(snp_effect_path, reference_fasta_path, annotation_db_path):\n",
216 |     "    annotation = gffutils.FeatureDB(annotation_db_path)\n",
217 |     "    genome = SeqIO.to_dict(SeqIO.parse(reference_fasta_path, \"fasta\"))\n",
218 |     "    \n",
219 |     "    gene_annotation_id = f\"gene-{snakemake.wildcards[0].split('.')[0]}\"\n",
220 |     "    gene_annotation = annotation[gene_annotation_id]\n",
221 |     "    \n",
222 |     "    gene_sequence = genome[gene_annotation.seqid][gene_annotation.start-1:gene_annotation.end]\n",
223 |     "    \n",
224 |     "    df = pd.read_csv(snp_effect_path)\n",
225 |     "\n",
226 |     "    df = df.dropna(subset=[\"amino acid change\"])\n",
227 |     "    df = df[ ~ df[\"amino acid change\"].str.contains(\"ins\", na=False) ]\n",
228 |     "    df = df[ ~ df[\"amino acid change\"].str.contains(\"del\", na=False) ]\n",
229 |     "\n",
230 |     "\n",
231 |     "    df[\"sequence_window\"] = df.apply(\n",
232 |     "        lambda x: get_substitution_dna_window(x, gene_sequence, gene_annotation), axis=1\n",
233 |     "    )\n",
234 |     "    return df\n"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "id": "478271ca-b2ea-4953-9604-db81542f6715",
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "df = add_sequence_window(snakemake.input[2], snakemake.input[0], snakemake.input[1])\n",
245 |     "\n",
246 |     "df"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "id": "7b06e25a-45eb-4a1f-a705-8529eac39a1f",
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "df[df[\"sequence_window\"] == \"-\"]"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "id": "71db427a-445e-4e52-b99a-4a31dc67588b",
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "gene_sequence"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "id": "8185f845-e0ac-44ab-bf58-5559e03f5ebd",
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "pos = 8279002 - gene_annotation.start\n",
277 |     "pos"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "id": "e47bb086-c4f6-4f86-8167-48f4a85763e8",
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "gene_sequence[pos]"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "id": "806fb145-6245-41bb-91ad-bb7b72ab93a5",
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "import numpy as np\n",
298 |     "\n",
299 |     "\n",
300 |     "def add_surprise_bonus(df):\n",
301 |     "    # Entropy H in base-4\n",
302 |     "    df[\"H\"] = -(\n",
303 |     "        df[[\"p_a\", \"p_c\", \"p_g\", \"p_t\"]]\n",
304 |     "        .pipe(lambda p: p * (np.log2(p) / 2))\n",
305 |     "        .sum(axis=1)\n",
306 |     "    )\n",
307 |     "\n",
308 |     "    # Surprisal I in base-4\n",
309 |     "    df[\"I\"] = -(np.log2(df[\"p_obs\"]) / 2)\n",
310 |     "    df[\"surprise bonus\"] = df[\"H\"] - df[\"I\"]\n"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": null,
316 |    "id": "170d379e-f112-47a5-9a5a-3d27420a49e1",
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "test = pd.DataFrame({\n",
321 |     "    \"p_a\": [0.3],\n",
322 |     "    \"p_c\": [0.3],\n",
323 |     "    \"p_g\": [0.3],\n",
324 |     "    \"p_t\": [0.1],\n",
325 |     "    \"p_obs\": [0.1], \n",
326 |     "})\n",
327 |     "test"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": null,
333 |    "id": "61adc227-1bbf-4ff3-bc9e-cb01c5e6a756",
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "add_surprise_bonus(test)\n",
338 |     "\n",
339 |     "test"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "id": "9d96a43b-547e-441d-9d87-c6bc9ea94d4d",
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": [
349 |     "test = pd.DataFrame({\n",
350 |     "    \"p_a\": [0.4],\n",
351 |     "    \"p_c\": [0.4],\n",
352 |     "    \"p_g\": [0.1],\n",
353 |     "    \"p_t\": [0.1],\n",
354 |     "    \"p_obs\": [0.1], \n",
355 |     "    \"p_ref\": [0.1], \n",
356 |     "})\n",
357 |     "test"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": null,
363 |    "id": "ce788bb8-b385-4d67-9ccb-36bc77252299",
364 |    "metadata": {},
365 |    "outputs": [],
366 |    "source": [
367 |     "add_surprise_bonus(test)\n",
368 |     "\n",
369 |     "test"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "id": "390bbb57-d656-4cd1-97e3-ffffde5435d6",
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": [
379 |     "gpn = np.log2(test[[\"p_a\", \"p_c\", \"p_g\", \"p_t\"]] / test[\"p_ref\"].to_numpy()[:, None])\n",
380 |     "\n",
381 |     "gpn\n",
382 |     "        "
383 |    ]
384 |   }
385 |  ],
386 |  "metadata": {
387 |   "kernelspec": {
388 |    "display_name": "Python 3 (ipykernel)",
389 |    "language": "python",
390 |    "name": "python3"
391 |   },
392 |   "language_info": {
393 |    "codemirror_mode": {
394 |     "name": "ipython",
395 |     "version": 3
396 |    },
397 |    "file_extension": ".py",
398 |    "mimetype": "text/x-python",
399 |    "name": "python",
400 |    "nbconvert_exporter": "python",
401 |    "pygments_lexer": "ipython3",
402 |    "version": "3.14.1"
403 |   }
404 |  },
405 |  "nbformat": 4,
406 |  "nbformat_minor": 5
407 | }
408 | 


--------------------------------------------------------------------------------