├── ._wb ├── launcher │ └── docker │ │ ├── config.json │ │ └── run.sh └── tool │ └── phip-flow │ ├── config.json │ └── run.sh ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── fit-predict-zscore.py ├── generate-fasta.py ├── merge-counts-stats.py ├── replicate-counts.py ├── run_BEER.Rscript ├── run_edgeR.Rscript ├── validate-peptide-table.py └── validate-sample-table.py ├── data ├── misc │ ├── dag.png │ └── simulated-example │ │ ├── NGS │ │ └── expa │ │ │ ├── sample_0.fastq │ │ │ ├── sample_1.fastq │ │ │ ├── sample_10.fastq │ │ │ ├── sample_11.fastq │ │ │ ├── sample_2.fastq │ │ │ ├── sample_3.fastq │ │ │ ├── sample_4.fastq │ │ │ ├── sample_5.fastq │ │ │ ├── sample_6.fastq │ │ │ ├── sample_7.fastq │ │ │ ├── sample_8.fastq │ │ │ └── sample_9.fastq │ │ ├── nextflow.config │ │ ├── peptide_table.csv │ │ ├── peptide_table_replicates.csv │ │ ├── run_phip_flow.sh │ │ └── sample_table.csv └── pan-cov-example │ ├── NGS │ ├── 4A-rep1-27-library_S27_L001_R1_001.fastq.gz.test.gz │ ├── 4A-rep2-22_S49_L001_R1_001.fastq.gz.test.gz │ ├── 4B-rep1-22_S22_L001_R1_001.fastq.gz.test.gz │ ├── 4B-rep1-27-library_S26_L001_R1_001.fastq.gz.test.gz │ ├── ex11a-beads-35_S87_L001_R1_001.fastq.gz.test.gz │ ├── ex8-rep2-42_S87_L001_R1_001.fastq.gz.test.gz │ ├── expt10B-MEGSUB-4_S4_L001_R1_001.fastq.gz.test.gz │ └── rep1-42_S42_L001_R1_001.fastq.gz.test.gz │ ├── peptide_table.csv │ ├── sample_table_with_beads_and_lib.csv │ ├── sample_table_with_beads_no_lib.csv │ └── sample_table_with_beads_one_emp.csv ├── main.nf ├── nextflow.config ├── templates ├── aggregate_organisms.py ├── generate_index.sh ├── join_organisms.py ├── public_epitope_template.csv ├── sam_to_counts.sh ├── sam_to_stats.sh ├── short_read_alignment.sh └── split_samples.py └── workflows ├── aggregate.nf ├── alignment.nf ├── edgeR_BEER.nf ├── output.nf └── statistics.nf /._wb/launcher/docker/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Run phip-flow 
#!/bin/bash
#
# Launcher: write a local nextflow.config with Docker enabled, then hand off
# to the workbench helper that runs the selected tool.
#
# Reads from the environment: WORK_DIR, CACHE_DIR, NXF_VER.

# Note: the -m (job control) flag is essential for the fg command
set -eumo pipefail

echo "Setting up nextflow.config"

# The blank first and last lines inside the heredoc are intentional: they
# keep the generated file's leading and trailing empty lines.
cat > nextflow.config <<EOF

workDir = '${WORK_DIR}'
singularity.cacheDir = '${CACHE_DIR}'
docker.enabled = true
report.enabled = true
trace.enabled = true

EOF

# Show the generated configuration in the log
cat nextflow.config
echo

# Turn off ANSI logging
export NXF_ANSI_LOG=false

# Report the Nextflow version being used, followed by an empty line
printf '%s\n\n' "Nextflow Version: ${NXF_VER}"

# Run the tool in the local environment
printf '%s\n\n' "Starting tool"

# Start the tool
/bin/bash ._wb/helpers/run_tool
'fastq_filepath' with the name of each file to be analyzed. Control samples are indicated with a value of 'beads_only' in the column 'control_status'.", 8 | "wb_type": "file", 9 | "required": true 10 | }, 11 | "reads_prefix": { 12 | "help": "Folder which contains the files listed in the sample table", 13 | "wb_type": "folder", 14 | "required": true 15 | }, 16 | "read_length": { 17 | "help": "Read length for alignment", 18 | "wb_type": "integer", 19 | "default": "125" 20 | }, 21 | "fastq_stream_func": { 22 | "help": "Set this to 'cat' if the fastq files are not gzipped", 23 | "wb_type": "string", 24 | "default": "zcat" 25 | }, 26 | "peptide_table": { 27 | "help": "Table describing each peptide in the library, minimally containing the column 'oligo' with the sequence used for each peptide", 28 | "wb_type": "file", 29 | "required": true 30 | }, 31 | "peptide_tile_length": { 32 | "help": "Peptide length for alignment", 33 | "wb_type": "integer", 34 | "default": "117" 35 | }, 36 | "dataset_prefix": { 37 | "help": "String which is prepended to all output files", 38 | "wb_type": "string", 39 | "default": "data" 40 | }, 41 | "output_pickle_xarray": { 42 | "help": "Generate output files in xarray pickle format", 43 | "wb_type": "bool", 44 | "default": true 45 | }, 46 | "output_tall_csv": { 47 | "help": "Generate output files in tall CSV format", 48 | "wb_type": "bool", 49 | "default": true 50 | }, 51 | "output_wide_csv": { 52 | "help": "Generate output files in wide CSV format", 53 | "wb_type": "bool", 54 | "default": true 55 | }, 56 | "n_mismatches": { 57 | "help": "Number of mismatches allowed", 58 | "wb_type": "integer", 59 | "default": "2" 60 | }, 61 | "bowtie_optional_args": { 62 | "help": "Other bowtie options", 63 | "wb_type": "string", 64 | "default": "--tryhard --nomaqround --norc --best --sam --quiet" 65 | }, 66 | "replicate_sequence_counts": { 67 | "help": "Flag for replicating counts for replicate sequences", 68 | "wb_type": "bool", 69 | "default": true 70 | }, 71 | 
"run_cpm_enr_workflow": { 72 | "help": "Flag for running cpm enrichment workflow", 73 | "wb_type": "bool", 74 | "default": false 75 | }, 76 | "run_zscore_fit_predict": { 77 | "help": "Flag for running Z-score enrichment analysis", 78 | "wb_type": "bool", 79 | "default": false 80 | }, 81 | "summarize_by_organism": { 82 | "help": "Flag used to control the summary of results by organism", 83 | "wb_type": "bool", 84 | "default": false 85 | }, 86 | "peptide_org_col": { 87 | "help": "Column in the peptide table indicating the organism for each peptide", 88 | "wb_type": "string", 89 | "default": "organism" 90 | }, 91 | "peptide_seq_col": { 92 | "help": "Column in the peptide table containing the peptide sequence (used to match against public epitopes)", 93 | "wb_type": "string", 94 | "default": "peptide" 95 | }, 96 | "max_overlap": { 97 | "help": "Maximum allowed overlap between detected peptides", 98 | "wb_type": "integer", 99 | "default": "7" 100 | }, 101 | "zscore_threshold": { 102 | "help": "Minimum z-score threshold", 103 | "wb_type": "float", 104 | "default": "2.5" 105 | }, 106 | "sample_grouping_col": { 107 | "help": "Column in the sample table used for mapping replicates to samples", 108 | "wb_type": "string", 109 | "default": "" 110 | }, 111 | "public_epitopes_csv": { 112 | "help": "Optional, a CSV containing public epitopes", 113 | "wb_type": "file" 114 | }, 115 | "public_epitopes_col": { 116 | "help": "In the public epitopes CSV, the column containing the translated amino acid sequence", 117 | "wb_type": "string", 118 | "default": "peptide_translate" 119 | }, 120 | "nxf_profile": { 121 | "help": "Profile used for resource allocation (options: standard / docker / cluster)", 122 | "wb_env": "PROFILE", 123 | "wb_type": "string", 124 | "default": "standard" 125 | } 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /._wb/tool/phip-flow/run.sh: 
#!/bin/bash
#
# Run the phip-flow Nextflow workflow in the background, write a helper
# script (._wb/bin/stop) that can kill it, then bring the workflow back to
# the foreground so this script's exit status follows the workflow's.
#
# Reads from the environment: TOOL_REPO (workflow to run), PROFILE (Nextflow
# profile). Parameters are taken from ._wb/tool/params.json.

# -m enables job control, which the `fg` call below needs in a script
set -meuo pipefail

date
echo
echo "Running workflow from ${PWD}"
echo

# Run the workflow
echo Starting workflow
nextflow \
    run \
    "${TOOL_REPO}" \
    --results "${PWD}" \
    -params-file ._wb/tool/params.json \
    -resume \
    -profile "${PROFILE}" &

# Get the process ID of the backgrounded workflow
PID="$!"

# Make a task which can kill this process.
# mkdir -p is a no-op when the folder already exists.
mkdir -p ._wb/bin

# The shebang must be the very first line of the generated file to take
# effect, so the file is written with a heredoc instead of `echo """ ..."""`
# (which emitted an empty first line). ${PID} is expanded now; \$(date) is
# escaped so that it is evaluated when the stop script runs.
cat > ._wb/bin/stop <<EOF
#!/bin/bash

echo "\$(date) Sending a kill signal to the workflow"

kill ${PID}
EOF
chmod +x ._wb/bin/stop

# Bring the command back to the foreground
fg %1

echo
date
echo Done
# install needed tools
# ca-certificates is installed explicitly so that curl and git can verify TLS
# certificates; unzip is installed explicitly because the Bowtie step below
# calls it. The apt package lists are removed afterwards to keep the layer small.
RUN apt-get update --fix-missing -qq && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get install -y -q \
    ca-certificates \
    git \
    curl \
    locales \
    libncurses5-dev \
    libncursesw5-dev \
    build-essential \
    pkg-config \
    zlib1g-dev \
    python3 \
    python3-pip \
    python3-venv \
    zip \
    unzip \
    wget && \
    rm -rf /var/lib/apt/lists/*

# Create a virtual environment and put it first on PATH so that `pip` and
# `python` below resolve to the venv.
ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install phippery
RUN pip install git+https://github.com/matsengrp/phippery@1.3.1

# install pre-built binary Bowtie 1.3.1
# (no -k: the download is executed later, so the TLS certificate is verified)
RUN curl -fsSL https://sourceforge.net/projects/bowtie-bio/files/bowtie/1.3.1/bowtie-1.3.1-linux-x86_64.zip \
    --output bowtie-1.3.1-linux-x86_64.zip \
    && unzip bowtie-1.3.1-linux-x86_64.zip \
    && rm bowtie-1.3.1-linux-x86_64.zip \
    && (cd /usr/bin/ && ln -s /bowtie-1.3.1-linux-x86_64/* ./)


# install SAMtools (built from the 1.3.1 release tarball)
RUN curl -fsSL https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 | tar xj && \
    cd samtools-1.3.1 && \
    make all all-htslib && make install install-htslib
portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PHIP-FLOW 2 | A Nextflow pipeline for Common Phage Immuno-Precipitation Sequencing experiments. 3 | See the [Documentation](https://matsengrp.github.io/phippery/introduction.html) 4 | for more details and usage examples. 5 | 6 | ## Quickstart 7 | 8 | Install `Nextflow` by using the following command: 9 | 10 | curl -s https://get.nextflow.io | bash 11 | 12 | Install `Docker`; several distributions are packaged for 13 | various Linux flavors: 14 | 15 | curl -fsSL https://get.docker.com -o get-docker.sh && sudo sh get-docker.sh 16 | 17 | Launch the pipeline execution with the following command: 18 | 19 | nextflow run matsengrp/phip-flow -r V1.12 -profile docker 20 | 21 | Note: the ``-r VX.XX`` option runs the specified stable release version of the pipeline. 22 | For running the bleeding edge (not generally recommended) you may also specify ``-r main``. 23 | You may also specify any of the 24 | [parameters](https://matsengrp.github.io/phippery/alignments-pipeline.html#parameters) 25 | for changing the input data and workflow behavior. 
#!/usr/bin/env python
"""
Fit the z-score model and write the resulting dataset.

Inputs expected by this script:
    1. The sample table carries an annotation column "control_status".
       Samples labelled "beads_only" are the ones the model is fit to;
       samples labelled "empirical" are the ones predicted on once the
       model has been fit to each peptide.

    2. The xarray phip dataset passed in contains the "cpm" layer among
       its enrichment tables, with both sample groups normalized together
       using counts per million (the default when computing stats).

For a more complete description,
please see the overview by Kevin Sung found at
https://matsengrp.github.io/phippery/
"""

from phippery.utils import *
from phippery.modeling import zscore

import argparse
import warnings

# Command line: -ds <input dataset path>  -o <output dataset path>
cli = argparse.ArgumentParser()
for flag in ("-ds", "-o"):
    cli.add_argument(flag, type=str)
options = cli.parse_args()

# Load the full dataset and select the beads-only controls used for fitting
dataset = load(options.ds)
controls = ds_query(dataset, "control_status == 'beads_only'")

# Settings handed to phippery's zscore(); the result is stored in a new
# table named 'zscore' on a copy of the dataset (inplace=False).
zscore_settings = dict(
    data_table='cpm',
    min_Npeptides_per_bin=300,
    lower_quantile_limit=0.05,
    upper_quantile_limit=0.95,
    inplace=False,
    new_table_name='zscore',
)
scored = zscore(dataset, controls, **zscore_settings)

dump(scored, options.o)
import pandas as pd
import sys
import argparse


def trim_index(sequence):
    """Return `sequence` with every non-upper-case character removed.

    Only characters for which ``str.isupper()`` is true are kept, in their
    original order. (Presumably the lower-case characters are flanking
    adapter/index bases of the oligo - confirm against the library design.)
    """
    return "".join(nt for nt in sequence if nt.isupper())


def write_fasta(peptide_table_path, fasta_path):
    """Write one FASTA record per row of the peptide table.

    Parameters
    ----------
    peptide_table_path : str
        CSV file whose first column is used as the record identifier and
        which contains an 'oligo' column holding the sequence.
    fasta_path : str
        Destination FASTA file (overwritten if it exists).

    Raises
    ------
    KeyError
        If the peptide table has no 'oligo' column.
    """
    # Read the table before opening the output, so a bad input does not
    # leave an empty FASTA file behind.
    peptide_table = pd.read_csv(peptide_table_path, index_col=0, header=0)
    oligos = peptide_table["oligo"]

    with open(fasta_path, "w") as fasta_fp:
        for index, oligo in oligos.items():
            fasta_fp.write(f">{index}\n{trim_index(oligo)}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-pt", type=str)
    parser.add_argument("-o", type=str)
    args = parser.parse_args()

    # NOTE: the previous version also executed
    #   open(f"{sys.argv[1]}.fasta", "w")
    # which leaked a file handle and created a stray file named after the
    # first command-line token (e.g. "-pt.fasta"). Only args.o is written now.
    write_fasta(args.pt, args.o)
def load_from_counts_tsv(
    sample_table,
    peptide_table,
    counts_file_pattern,
    stats_file_pattern,
):
    """Build a dataset from per-sample counts/stats files and metadata tables.

    Parameters
    ----------
    sample_table : str
        Path to the sample table CSV (read by `_collect_sample_table`).
    peptide_table : str
        Path to the peptide table CSV (read by `_collect_peptide_table`).
    counts_file_pattern : str
        Glob pattern matching the per-sample counts files, which are passed
        to `collect_counts`.
    stats_file_pattern : str
        Glob pattern matching the per-sample alignment stats files. The file
        name up to the first "." must be the integer sample id. Each line is
        "<name>\\t<value>"; the name is lower-cased, whitespace is replaced
        by "_" and ":" is removed to form the annotation column name.

    Returns
    -------
    The object returned by `stitch_dataset` for the merged counts, the
    peptide table and the sample table augmented with the alignment stats
    and three summary columns (percent_mapped, percent_peptides_detected,
    percent_peptides_between_10_and_100).

    Raises
    ------
    KeyError
        If the sample table ends up without the 'reads_mapped' or
        'raw_total_sequences' columns (e.g. no stats file matched).
        NOTE(review): these columns presumably come from the stats files -
        confirm against the script that writes them.
    """

    counts = glob.glob(counts_file_pattern)
    stats_files = glob.glob(stats_file_pattern)

    merged_counts = collect_counts(counts)
    peptide_table = _collect_peptide_table(peptide_table)
    sample_table = _collect_sample_table(sample_table)

    def num(s):
        # Stats values are integers where possible, floats otherwise
        try:
            return int(s)
        except ValueError:
            return float(s)

    # glob.glob always returns a list, so test for "any file matched"
    # rather than "is not None" (which was always true).
    if stats_files:
        alignment_stats = defaultdict(list)
        for sample_alignment_stats in stats_files:
            fp = os.path.basename(sample_alignment_stats)
            sample_id = int(fp.strip().split(".")[0])
            alignment_stats["sample_id"].append(sample_id)

            # Context manager so each stats file is closed deterministically
            with open(sample_alignment_stats, "r") as stats_fh:
                for line in stats_fh:
                    fields = line.strip().split("\t")
                    anno_name = "_".join(fields[0].lower().split()).replace(":", "")
                    alignment_stats[anno_name].append(num(fields[1]))

        stats_df = pd.DataFrame(alignment_stats).set_index("sample_id")

        sample_table = sample_table.merge(
            stats_df,
            how="outer",
            left_index=True,
            right_index=True
        )

    # Vectorized equivalent of the former element-wise applymap(lambda ...)
    between_10_and_100 = (merged_counts >= 10) & (merged_counts <= 100)

    # Add more summary metrics per sample. The column-wise means are indexed
    # by the counts columns and are aligned to the sample table index by assign.
    sample_table = sample_table.assign(
        percent_mapped=sample_table["reads_mapped"] / sample_table["raw_total_sequences"] * 100.,
        percent_peptides_detected=(merged_counts > 0).mean() * 100.,
        percent_peptides_between_10_and_100=between_10_and_100.mean() * 100.,
    )

    ds = stitch_dataset(
        counts=merged_counts,
        peptide_table=peptide_table,
        sample_table=sample_table,
    )

    return ds
161 | # """ 162 | # 163 | # counts = [f for f in glob.glob(counts_file_pattern)] 164 | # stats_files = [f for f in glob.glob(stats_file_pattern)] 165 | # 166 | # merged_counts = collect_counts(counts) 167 | # peptide_table = collect_peptide_table(peptide_table) 168 | # sample_table = collect_sample_table(sample_table) 169 | # 170 | # def num(s): 171 | # try: 172 | # return int(s) 173 | # except ValueError: 174 | # return float(s) 175 | # 176 | # if stats_files is not None: 177 | # alignment_stats = defaultdict(list) 178 | # for sample_alignment_stats in stats_files: 179 | # fp = os.path.basename(sample_alignment_stats) 180 | # sample_id = int(fp.strip().split(".")[0]) 181 | # alignment_stats["sample_id"].append(sample_id) 182 | # for line in open(sample_alignment_stats, "r"): 183 | # line = line.strip().split("\t") 184 | # x = line[0] 185 | # anno_name = "_".join(x.lower().split()).replace(":", "") 186 | # alignment_stats[f"{anno_name}"].append(num(line[1])) 187 | # 188 | # stats_df = pd.DataFrame(alignment_stats).set_index("sample_id") 189 | # 190 | # sample_table = sample_table.merge( 191 | # stats_df, 192 | # "outer", 193 | # left_index=True, 194 | # right_index=True 195 | # ) 196 | # 197 | # ds = stitch_dataset( 198 | # counts=merged_counts, 199 | # peptide_table=peptide_table, 200 | # sample_table=sample_table, 201 | # ) 202 | # 203 | # dump(ds, output) 204 | 205 | -------------------------------------------------------------------------------- /bin/replicate-counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script should take in a dataset, sum the raw counts 4 | from all replicate sequences in the library, then proceed to 5 | set the value for each replicate to that sum 6 | 7 | Currently, this function only sets the raw counts, in place. 
8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | import phippery 13 | from phippery.utils import load, dump, get_annotation_table 14 | import sys 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-ds", type=str) 19 | parser.add_argument("-o", type=str) 20 | args = parser.parse_args() 21 | 22 | 23 | def replicate_oligo_counts(ds, peptide_oligo_feature="Oligo"): 24 | """This function should take in a dataset, sum the raw counts 25 | from all replicate sequences in the library, then proceed to 26 | set the value for each replicate to that sum 27 | 28 | Currently, this function only sets the raw counts, in place. 29 | """ 30 | 31 | # TODO remove this code 32 | # find all value counts greater than 1, 33 | #pep_anno_table = get_annotation_table(ds, "peptide") 34 | #oligo_vc = pep_anno_table["Oligo"].value_counts() 35 | 36 | ## for each oligo that is not unique in a library 37 | #for oligo, count in oligo_vc[oligo_vc > 1].items(): 38 | # replicate_idxs = pep_anno_table[ 39 | # pep_anno_table["Oligo"]==oligo 40 | # ].index.values 41 | 42 | # # sum the replicate values 43 | # rep_pep_sums = ds.counts.loc[replicate_idxs, :].sum(axis=0).values 44 | 45 | # # set the replicate counts equal to the sum of all 46 | # ds.counts.loc[replicate_idxs, :] = np.tile(rep_pep_sums, (count, 1)) 47 | 48 | # find all value counts greater than 1, 49 | pep_anno_table = get_annotation_table(ds, "peptide") 50 | 51 | # Iterate over every group of peptides which share the same oligo sequence 52 | for oligo_seq, pep_anno_table_oligo in pep_anno_table.groupby(peptide_oligo_feature): 53 | 54 | # Check to see if there are multiple peptides with the same oligo sequence 55 | if pep_anno_table_oligo.shape[0] == 1: 56 | 57 | # Don't make any changes for unique oligos 58 | continue 59 | 60 | # Otherwise, get the sum of the counts across all oligos 61 | idxs = pep_anno_table_oligo.index.values 62 | # rep_pep_sums = ds.counts.loc[idxs, 
#!/usr/bin/env Rscript

# Run BEER on the PhIPData object produced by the edgeR step.
#
# Reads:  PhIPData.rds   (output of run_edgeR.Rscript)
# Writes: beer_prob.csv, beer_hits.csv, and PhIPData.rds (now including the
#         BEER assays).

library(beer)
library(PhIPData)
library(dplyr)

# input from the edgeR process
edgeR_out <- readRDS("PhIPData.rds")

# For more on running BEER
# see http://www.bioconductor.org/packages/release/bioc/html/beer.html

### Named vector specifying where we want to store the summarized MCMC output
### NULL indicates that the output should not be stored.
print("Setting up BEER")
assay_locations <- c(
    phi = "beer_fc_marg",
    phi_Z = "beer_fc_cond",
    Z = "beer_prob",
    c = "sampleInfo",
    pi = "sampleInfo"
)

print("Running BEER::brew")
beer_out <- brew(edgeR_out, assay.names = assay_locations)

## Define matrix of peptides that were run in BEER:
## TRUE for every peptide of each sample whose group is not "beads"
print("Getting matrix of peptides that were run")
was_run <- matrix(rep(beer_out$group != "beads", each = nrow(beer_out)),
                  nrow = nrow(beer_out))

## Identify super-enriched peptides
## These peptides were in samples that were run, but have missing posterior
## probabilities
print("Identifying super-enriched peptides")
are_se <- was_run & is.na(assay(beer_out, "beer_prob"))

## Enriched peptides are peptides with:
## - posterior probability > 0.5, OR
## - super-enriched peptides
# (log message corrected: this step flags hits, it does not rerun BEER)
print("Identifying enriched peptides")
assay(beer_out, "beer_hits") <- assay(beer_out, "beer_prob") > 0.5 | are_se

write.csv(assays(beer_out)$beer_prob, file="beer_prob.csv")
write.csv(assays(beer_out)$beer_hits, file="beer_hits.csv")

# Save the BEER result. The previous version saved `edgeR_out`, i.e. the
# unchanged input, so the BEER assays never reached the .rds file.
# NOTE(review): this overwrites the input path "PhIPData.rds" - confirm that
# the calling workflow expects the BEER-augmented object under this name.
saveRDS(beer_out, "PhIPData.rds")
# ---- Build the input object -------------------------------------------
print("Building PhIPData object")
phip_obj <- make_phipdata()

# Following https://bioconductor.org/packages/release/bioc/vignettes/beer/inst/doc/beer.html

# ---- edgeR ------------------------------------------------------------
# Names of the assays in which runEdgeR stores its two outputs
edgeR_assay_names <- c(logfc = "edgeR_logfc", prob = "edgeR_logpval")

print("Running edgeR")
edgeR_out <- runEdgeR(phip_obj, assay.names = edgeR_assay_names)

# Per-sample hit calling: turn the stored values back into p-values with
# 10^(-x), adjust them with Benjamini-Hochberg, and compare against the
# threshold given on the command line.
call_hits <- function(neg_log10_pvals){
  adjusted <- p.adjust(10^(-neg_log10_pvals), method = "BH")
  adjusted < edgeR_threshold
}

print("Adding edgeR hits")
logpval_matrix <- assay(edgeR_out, "edgeR_logpval")
assay(edgeR_out, "edgeR_hits") <- apply(logpval_matrix, 2, call_hits)

# ---- Outputs ----------------------------------------------------------
saveRDS(edgeR_out, "PhIPData.rds")

# TODO Do these assay objects save the sample (column) order?
edgeR_assays <- assays(edgeR_out)
write.csv(edgeR_assays$edgeR_logpval, file="edgeR_logpval.csv")
write.csv(edgeR_assays$edgeR_logfc, file="edgeR_logfc.csv")
write.csv(edgeR_assays$edgeR_hits, file="edgeR_hits.csv")
#!/usr/bin/env python
"""
Validate a peptide table and write it back out indexed by 'peptide_id'.

The table must contain an 'oligo' column. When no 'peptide_id' column is
present, one is generated as 0..N-1 in row order.
"""

import argparse
import pandas as pd
import sys

# Command line: -p <input peptide table CSV>  -o <output CSV>
cli = argparse.ArgumentParser()
for flag in ("-p", "-o"):
    cli.add_argument(flag, type=str)
options = cli.parse_args()

table = pd.read_csv(options.p, header=0)

# The oligo sequence column is mandatory
has_oligo = "oligo" in table.columns
if not has_oligo:
    raise KeyError("Must provide 'oligo' column in the table")

# Generate sequential identifiers only when the user did not supply them;
# in both cases the identifier column becomes the index.
if "peptide_id" not in table.columns:
    table["peptide_id"] = list(range(len(table)))
table = table.set_index("peptide_id")

table.to_csv(options.o, index=True, na_rep="NA")
#!/usr/bin/env python
"""Validate a sample table CSV and write a normalized copy.

Usage:
    validate-sample-table.py -s <input.csv> -o <output.csv>
                             [--run_zscore_fit_predict true]

The input must contain a 'fastq_filepath' column and must NOT contain a
'sample_id' column; 'sample_id' is generated here by numbering the rows
0..N-1 in file order and is used as the index of the output. Missing
values are written as "NA".

When --run_zscore_fit_predict is exactly the string "true", the table
must also have a 'control_status' column with more than one row labeled
'beads_only'.
"""

import argparse
import pandas as pd


def _validate_sample_table(sample_table, run_zscore_fit_predict):
    """Check the sample table and index it by a generated 'sample_id'.

    Args:
        sample_table: DataFrame as read from the sample CSV. It is
            modified in place.
        run_zscore_fit_predict: Raw value of the command-line flag; the
            Z-score checks run only when it equals the string "true".

    Returns:
        The same DataFrame, indexed by the generated 'sample_id'.

    Raises:
        KeyError: if the table has no 'fastq_filepath' column.
        AssertionError: if the table already has a 'sample_id' column, or
            if Z-score fitting is requested without a 'control_status'
            column containing more than one 'beads_only' row.
    """
    if "fastq_filepath" not in sample_table.columns:
        raise KeyError("Must provide 'fastq_filepath' column in the table")

    # These checks are raised explicitly rather than written as `assert`
    # statements so they still run under `python -O`. AssertionError is
    # kept as the exception type so the failure mode is unchanged.
    if "sample_id" in sample_table.columns:
        raise AssertionError(
            "Cannot include a column named 'sample_id' in the sample table"
        )

    sample_table["sample_id"] = list(range(len(sample_table)))
    sample_table.set_index("sample_id", inplace=True)

    # If Z-score fitting is turned on
    if run_zscore_fit_predict == "true":
        # There must be a column: control_status
        if "control_status" not in sample_table.columns.values:
            raise AssertionError("Must provide a column 'control_status'")

        # Raise an error if there are <2 beads_only samples
        n_beads_only = (sample_table["control_status"] == "beads_only").sum()
        if not n_beads_only > 1:
            raise AssertionError(
                "Must provide >1 samples labeled 'beads_only' under 'control_status'"
            )

    return sample_table


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:] when None.

    Returns:
        0 on success. Validation failures propagate as exceptions, and a
        missing -s/-o makes argparse exit with a usage message.
    """
    parser = argparse.ArgumentParser(
        description="Validate a sample table and write it indexed by sample_id."
    )
    # Both paths are mandatory: without them pandas would fail later with
    # an unhelpful "invalid file path" error instead of a usage message.
    parser.add_argument("-s", type=str, required=True,
                        help="Path to the input sample table (CSV with header row)")
    parser.add_argument("-o", type=str, required=True,
                        help="Path for the validated output CSV")
    parser.add_argument("--run_zscore_fit_predict", type=str,
                        help="Pass 'true' to enforce the control_status/beads_only checks")
    args = parser.parse_args(argv)

    sample_table = _validate_sample_table(
        pd.read_csv(args.s, header=0), args.run_zscore_fit_predict
    )
    sample_table.to_csv(args.o, index=True, na_rep="NA")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATCACCGGAA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAACCCCAGTT 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCATGGTTCT 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGACTATGAAT 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTCGCGTAGT 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACATCCCTAT 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGTCTTGCTT 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | 
TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTGAGTCTCA 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTCACATGGC 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTCTACTGCG 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_1.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATGACTGTGA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAGTCCCCTG 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCATGGATAC 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGATGTGTTAC 15 | 
+ 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTAGTCGGGT 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACTTGAGAGC 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGGTAGTGCT 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTTAGCGATG 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTAGTAGCCA 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTGATGTCAC 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_10.fastq: 
-------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATATTTTCCT 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAAGGTGGAG 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCTCTACAAC 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAAGAAACAT 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTATTTGCGC 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACGTGTTGCA 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGGTAGGATT 27 | + 28 | 
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTTGGCTCTG 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTTTTTCCCT 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTGAGTTAGT 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_11.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATGACGGCGG 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAAGTACGCC 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCACTGAACT 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | 
@ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGATCTGCAGG 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTCTAATGTC 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACACACCCCG 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGATGTATAT 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTCCTCAATG 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTTATTAGCC 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTCGCAGAAA 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | 
-------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_2.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATAAGACGGC 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAGTCGTAAT 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCTTTAATGT 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAAGCAGTGC 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTGGAGTTAC 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACAAACGCAT 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | 
GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGCTAAGTGA 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTTGCGCACA 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTGGACATAG 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTGCTATTGG 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_3.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATCTGCTACA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAAACTGGAT 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCAGCCTGTC 11 | 
+ 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGATAGGAGCA 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTGACAGGAG 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACTTATATCC 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGAGGGTCCA 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTTACATGGA 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTTAGCAAAT 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTAACAGTAG 39 | + 40 | 
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_4.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATAGAATCAA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAACCGATAGA 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCGCACATAC 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAGGCAAGCT 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTCTGACGAG 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACTTATCACT 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | 
@ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGCCAAGCGC 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTGCTTGATA 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTGTTTTGAA 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTTTCAAACC 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_5.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATCAAGAGCA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAACTAACCCT 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | 
CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCTAGTTCAG 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAAATTGCTA 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTCTTGGATA 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACCGCTGTCT 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGGTGAGTTT 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTTAAGTCCT 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTTTAAAACC 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | 
CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTTAGCACAC 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_6.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATTTGTCTGC 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAATTTAGCCC 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCCCCTATGT 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAGTGTCACG 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTTAGGACAG 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACTTGTGTAA 23 | 
+ 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGCAGACAAT 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTCATGGCCG 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTTTCGCCAT 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTTGTGGGTT 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_7.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATACGAGTTA 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAAGAGGGTG 7 | + 8 | 
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCGCCAATGT 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAGCGAAGCC 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTGATGAGGA 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACTATCTAAC 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGAGAGCTTG 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTCTGCGTTT 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTGCATAAAC 35 | + 36 | 
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTGAAGCTGT 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_8.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATCCTTAATT 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAGAACACCC 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCGATTCTTC 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAAGACCTGC 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTAATTATTG 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | 
@ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACTGCTATTC 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGATTGGTTT 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTCCTGGCCT 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTGTGACGGT 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTGCTGCCTC 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/NGS/expa/sample_9.fastq: -------------------------------------------------------------------------------- 1 | @ 2 | CAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATGGTCAACG 3 | + 4 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ 6 | 
GGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAACCTATTC 7 | + 8 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 9 | @ 10 | CCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCGAGAATCG 11 | + 12 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 13 | @ 14 | GAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAATGCATAG 15 | + 16 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 17 | @ 18 | AACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTACGCCCGG 19 | + 20 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 21 | @ 22 | CCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACCTTGGCCA 23 | + 24 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 25 | @ 26 | GTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGTTTACTCC 27 | + 28 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 29 | @ 30 | TTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTTCTGTTTA 31 | + 32 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 33 | @ 34 | 
AGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTAGACATTT 35 | + 36 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 37 | @ 38 | CGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTCAAAAGAC 39 | + 40 | FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 41 | -------------------------------------------------------------------------------- /data/misc/simulated-example/nextflow.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | // PIPELINE KNOBS 4 | params{ 5 | 6 | 7 | // ##### Sample Table ##### 8 | sample_table = 'sample_table.csv' 9 | 10 | // ##### Peptide Table ##### 11 | peptide_table = 'peptide_table_replicates.csv' 12 | 13 | // ##### Output ##### 14 | phip_data_dir = './sim-example-output' 15 | 16 | // ##### Dataset Name Prefix ##### 17 | dataset_prefix = 'sim-example-replicates-4-bt2-a' 18 | 19 | // ##### Stream Function ##### 20 | fastq_stream_func = 'cat' 21 | 22 | // ##### Enrichment Workflow ##### 23 | compute_enrichment = false 24 | 25 | // ##### Output tall format ds ##### 26 | output_tall = false 27 | 28 | // ##### Output wide format ds ##### 29 | output_wide = false 30 | 31 | // ##### read length for align ##### 32 | read_length = 125 33 | 34 | // ##### peptide length for align ##### 35 | peptide_tile_length = 117 36 | 37 | // ##### mismatches allowed (end-to-end) ##### 38 | n_mismatches = 2 39 | 40 | } 41 | 42 | 43 | // COMPUTATIONAL PLATFORM SETUP 44 | profiles { 45 | standard { 46 | process { 47 | 48 | executor = 'slurm' 49 | 50 | 51 | withLabel: phippery { 52 | container = 'quay.io/matsengrp/phippery:135_config_file' 53 | queue = 'campus-new' 54 | cpus = 1 55 | memory = 8.GB 56 | time = '1h' 57 | clusterOptions = '-A 
overbaugh_j' 58 | } 59 | 60 | withLabel: alignment_tool { 61 | // container = 'quay.io/jgallowa/bowtie1.3:latest' 62 | container = 'quay.io/jgallowa/bowtie2:latest' 63 | queue = 'campus-new' 64 | cpus = 28 65 | memory = 30.GB 66 | time = '4h' 67 | clusterOptions = '-A overbaugh_j' 68 | } 69 | 70 | withLabel: samtools { 71 | container = 'quay.io/biocontainers/samtools:1.3--h0592bc0_3' 72 | queue = 'campus-new' 73 | cpus = 28 74 | memory = 30.GB 75 | time = '4h' 76 | clusterOptions = '-A overbaugh_j' 77 | } 78 | } 79 | singularity { 80 | enabled = true 81 | autoMounts = true 82 | cacheDir = 'temp/containers/' 83 | runOptions = '--contain -W /tmp/' 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /data/misc/simulated-example/peptide_table.csv: -------------------------------------------------------------------------------- 1 | peptide_id,Oligo 2 | 0,gtcagcaggtggtttgacCAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATcatgctggtcttcggg 3 | 1,gtcagcaggtggtttgacGGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAcatgctggtcttcggg 4 | 2,gtcagcaggtggtttgacCCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCcatgctggtcttcggg 5 | 3,gtcagcaggtggtttgacGAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAcatgctggtcttcggg 6 | 4,gtcagcaggtggtttgacAACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTcatgctggtcttcggg 7 | 5,gtcagcaggtggtttgacCCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACcatgctggtcttcggg 8 | 
6,gtcagcaggtggtttgacGTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGcatgctggtcttcggg 9 | 7,gtcagcaggtggtttgacTTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTcatgctggtcttcggg 10 | 8,gtcagcaggtggtttgacAGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTcatgctggtcttcggg 11 | 9,gtcagcaggtggtttgacCGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTcatgctggtcttcggg 12 | -------------------------------------------------------------------------------- /data/misc/simulated-example/peptide_table_replicates.csv: -------------------------------------------------------------------------------- 1 | peptide_id,Oligo 2 | 0,gtcagcaggtggtttgacCAGCTTCTATTTATATAGCGAACGATATCAGTATGGCAAGGTGGCCCCGCGAGTAGTGCTTCAGAATTGGTCCTCCACGAATGCGAAGAGGCACTGTATCACGGTAGCTAAAAGGATcatgctggtcttcggg 3 | 1,gtcagcaggtggtttgacGGGCCCCTGGATTCGATCCATCGAAATGTAATAGATTAGCAACGTTATCCGAGAAAAGGTATATAGTGTGATCCACTGTTTTTACTTACCCCGTTTACTATATAGGGACTACCCTAAcatgctggtcttcggg 4 | 2,gtcagcaggtggtttgacCCCGCTAATAAAGCGTGGGCACCATCTCCGTAGGATGAAGTTATTAAAGCACGTGTTGTCACAAAGACGAGAATATGGTGTAAGATCATCGGATTCCGCCACGACTCAGAATGTCTCcatgctggtcttcggg 5 | 3,gtcagcaggtggtttgacGAGCGCTCACTATAATCTAATCGAATCGCATTCCGTGAAAGAATTACTGTTAAAGAGCGCAGGCCGATTCTTGCCTGCATATATACATCATGATAGTTACGCTTGCGTATAAGGGGAcatgctggtcttcggg 6 | 4,gtcagcaggtggtttgacAACAAACGGCGCAACGATACGGTTTGGCCTCAGTTATTGGTTGTAGCCGGAGTATAGCTGGGTCATTAGTTCTCAAAGGATTCCATAGACGGGATCGTCTGCTGTGAAAATTAGGCTcatgctggtcttcggg 7 | 5,gtcagcaggtggtttgacCCCTTGGCCTATTAGCAATAGCCGGTGGCGCTTGTTAAATGCATGCAGCAAGATGCGAGTCTGATTACGCATAGCTAGGTCGGCGGGTATCCACAAGCCCGATCCGGCGACAGCTACcatgctggtcttcggg 8 | 6,gtcagcaggtggtttgacGTGTCAACCCCGCGGTTTGGAACCCAACTATACCACCCGTTACGGCAGGCTTCAGTGGTTCCCCCGACGCAAATGCTGATCCATTTTGACGATGTAAGCAGCTTGGTATTACACGAGcatgctggtcttcggg 9 | 
7,gtcagcaggtggtttgacTTCGATGCCATACTCCGCACGTGCGCGTCTACATACTACCGGATTCGAGCTGTACGGAAGGGCCTCGCTTCACATTCATCAGCAGTAAGTGGTTGGGAGGGCTTTTACCTATACAGTcatgctggtcttcggg 10 | 8,gtcagcaggtggtttgacAGTTGCTACATAACGCCAGCCCAGCAACGCTGCGATTGAGATGATTGGATTTTAGATCATAGACTCAGAAGGCTATTTCTACATGATTTACCTATAGCAGTTTTCCTCTTCCAAATTcatgctggtcttcggg 11 | 9,gtcagcaggtggtttgacCGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTcatgctggtcttcggg 12 | 10,gtcagcaggtggtttgacCGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTcatgctggtcttcggg 13 | 11,gtcagcaggtggtttgacCGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTcatgctggtcttcggg 14 | 12,gtcagcaggtggtttgacCGTGTAGTGAATCCGACCGTTCCGTAACGCACCTATTATTAAAACCCCAAATCCGTTCGGAGTCCAGCATTGGTGACGCAAACATTATAACAATTTCTCCGGATTGAGGTCTCCATTcatgctggtcttcggg 15 | -------------------------------------------------------------------------------- /data/misc/simulated-example/run_phip_flow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | source /app/lmod/lmod/init/profile 5 | 6 | module load nextflow 7 | module load Singularity 8 | export PATH=$SINGULARITYROOT/bin/:$PATH 9 | 10 | /usr/bin/time nextflow \ 11 | -C ./nextflow.config \ 12 | run ../main.nf \ 13 | -with-report ./output/nextflow_report.html \ 14 | -work-dir ./output/work/ \ 15 | -resume 16 | -------------------------------------------------------------------------------- /data/misc/simulated-example/sample_table.csv: -------------------------------------------------------------------------------- 1 | fastq_filepath 2 | NGS/expa/sample_0.fastq 3 | NGS/expa/sample_1.fastq 4 | NGS/expa/sample_2.fastq 5 | NGS/expa/sample_3.fastq 6 | NGS/expa/sample_4.fastq 7 | NGS/expa/sample_5.fastq 8 | NGS/expa/sample_6.fastq 9 | NGS/expa/sample_7.fastq 10 | 
NGS/expa/sample_8.fastq 11 | NGS/expa/sample_9.fastq 12 | NGS/expa/sample_10.fastq 13 | NGS/expa/sample_11.fastq 14 | -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/4A-rep1-27-library_S27_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/4A-rep1-27-library_S27_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/4A-rep2-22_S49_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/4A-rep2-22_S49_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/4B-rep1-22_S22_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/4B-rep1-22_S22_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/4B-rep1-27-library_S26_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/4B-rep1-27-library_S26_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/ex11a-beads-35_S87_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/ex11a-beads-35_S87_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/ex8-rep2-42_S87_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/ex8-rep2-42_S87_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/expt10B-MEGSUB-4_S4_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/expt10B-MEGSUB-4_S4_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/NGS/rep1-42_S42_L001_R1_001.fastq.gz.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matsengrp/phip-flow/66c13635dc494e008acfb67f7f5e260e22a13855/data/pan-cov-example/NGS/rep1-42_S42_L001_R1_001.fastq.gz.test.gz -------------------------------------------------------------------------------- /data/pan-cov-example/sample_table_with_beads_and_lib.csv: -------------------------------------------------------------------------------- 1 | technical_replicate_id,submitted_by,library_batch,control_status,sample_ID,sample_type,species,participant_ID,age,sex,race,days_from_symptom_onset,patient_status,source,pandemic_status,fastq_filepath 2 | 273,hannah,MEGSUB,library,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/4B-rep1-27-library_S26_L001_R1_001.fastq.gz.test.gz 3 | 
572,mackenzie,MEGSUB,beads_only,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/expt10B-MEGSUB-4_S4_L001_R1_001.fastq.gz.test.gz 4 | 247,caitlin,SUB2,library,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/4A-rep1-27-library_S27_L001_R1_001.fastq.gz.test.gz 5 | 725,caitlin,SUB2,beads_only,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/ex11a-beads-35_S87_L001_R1_001.fastq.gz.test.gz 6 | 90,caitlin,MEGSUB,empirical,80,ACD plasma,human,32C,36.0,Male,White,30.0,conv outpatient 30d,Helen Chu,pandemic,data/pan-cov-example/NGS/rep1-42_S42_L001_R1_001.fastq.gz.test.gz 7 | 382,caitlin,SUB2,empirical,80,ACD plasma,human,32C,36.0,Male,White,30.0,conv outpatient 30d,Helen Chu,pandemic,data/pan-cov-example/NGS/ex8-rep2-42_S87_L001_R1_001.fastq.gz.test.gz 8 | 269,hannah,MEGSUB,empirical,45,serum,human,13a,NA,NA,NA,NA,healthy adult,Chu lab,pre,data/pan-cov-example/NGS/4B-rep1-22_S22_L001_R1_001.fastq.gz.test.gz 9 | 242,caitlin,SUB2,empirical,45,serum,human,13a,NA,NA,NA,NA,healthy adult,Chu lab,pre,data/pan-cov-example/NGS/4A-rep2-22_S49_L001_R1_001.fastq.gz.test.gz 10 | -------------------------------------------------------------------------------- /data/pan-cov-example/sample_table_with_beads_no_lib.csv: -------------------------------------------------------------------------------- 1 | technical_replicate_id,submitted_by,library_batch,control_status,sample_ID,sample_type,species,participant_ID,age,sex,race,days_from_symptom_onset,patient_status,source,pandemic_status,fastq_filepath 2 | 572,mackenzie,MEGSUB,beads_only,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/expt10B-MEGSUB-4_S4_L001_R1_001.fastq.gz.test.gz 3 | 725,caitlin,SUB2,beads_only,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/ex11a-beads-35_S87_L001_R1_001.fastq.gz.test.gz 4 | 90,caitlin,MEGSUB,empirical,80,ACD plasma,human,32C,36.0,Male,White,30.0,conv outpatient 30d,Helen Chu,pandemic,data/pan-cov-example/NGS/rep1-42_S42_L001_R1_001.fastq.gz.test.gz 5 | 
382,caitlin,SUB2,empirical,80,ACD plasma,human,32C,36.0,Male,White,30.0,conv outpatient 30d,Helen Chu,pandemic,data/pan-cov-example/NGS/ex8-rep2-42_S87_L001_R1_001.fastq.gz.test.gz 6 | 269,hannah,MEGSUB,empirical,45,serum,human,13a,NA,NA,NA,NA,healthy adult,Chu lab,pre,data/pan-cov-example/NGS/4B-rep1-22_S22_L001_R1_001.fastq.gz.test.gz 7 | 242,caitlin,SUB2,empirical,45,serum,human,13a,NA,NA,NA,NA,healthy adult,Chu lab,pre,data/pan-cov-example/NGS/4A-rep2-22_S49_L001_R1_001.fastq.gz.test.gz 8 | -------------------------------------------------------------------------------- /data/pan-cov-example/sample_table_with_beads_one_emp.csv: -------------------------------------------------------------------------------- 1 | technical_replicate_id,submitted_by,library_batch,control_status,sample_ID,sample_type,species,participant_ID,age,sex,race,days_from_symptom_onset,patient_status,source,pandemic_status,fastq_filepath 2 | 572,mackenzie,MEGSUB,beads_only,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/expt10B-MEGSUB-4_S4_L001_R1_001.fastq.gz.test.gz 3 | 725,caitlin,SUB2,beads_only,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,data/pan-cov-example/NGS/ex11a-beads-35_S87_L001_R1_001.fastq.gz.test.gz 4 | 90,caitlin,MEGSUB,empirical,80,ACD plasma,human,32C,36.0,Male,White,30.0,conv outpatient 30d,Helen Chu,pandemic,data/pan-cov-example/NGS/rep1-42_S42_L001_R1_001.fastq.gz.test.gz 5 | -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code Form is subject to the terms of the GNU GENERAL PUBLIC LICENCE 3 | * License, v. 3.0. 4 | */ 5 | 6 | 7 | /* 8 | * 'PhIP-Flow' - A Nextflow pipeline for running common phip-seq analysis workflows 9 | * 10 | * Fred Hutchinson Cancer Research Center, Seattle WA. 
11 | * 12 | * Jared Galloway 13 | * Kevin Sung 14 | * Sam Minot 15 | * Erick Matsen 16 | */ 17 | 18 | /* 19 | * Enable DSL 2 syntax 20 | */ 21 | nextflow.enable.dsl = 2 22 | 23 | /* 24 | * Define the default parameters - example data gets run by default 25 | */ 26 | params.sample_table = "$baseDir/data/pan-cov-example/sample_table_with_beads_and_lib.csv" 27 | if (params.sample_table != "$baseDir/data/pan-cov-example/sample_table_with_beads_and_lib.csv") 28 | params.reads_prefix = "$launchDir" 29 | else 30 | params.reads_prefix = "$baseDir" 31 | params.peptide_table = "$baseDir/data/pan-cov-example/peptide_table.csv" 32 | params.results = "$PWD/results/" 33 | 34 | 35 | log.info """\ 36 | P H I P - F L O W! 37 | Matsen, Overbaugh, and Minot Labs 38 | Fred Hutchinson CRC, Seattle WA 39 | ================================ 40 | sample_table : $params.sample_table 41 | peptide_table : $params.peptide_table 42 | results : $params.results 43 | 44 | """ 45 | 46 | /* 47 | * Import modules 48 | */ 49 | nextflow.enable.dsl=2 50 | 51 | include { ALIGN } from './workflows/alignment.nf' 52 | include { STATS } from './workflows/statistics.nf' 53 | include { DSOUT } from './workflows/output.nf' 54 | include { AGG } from './workflows/aggregate.nf' 55 | 56 | workflow { 57 | ALIGN | STATS | DSOUT | AGG 58 | } 59 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | manifest { 2 | description = 'PhIP-Seq common analysis workflows' 3 | nextflowVersion = '>= 20.07.0' 4 | } 5 | 6 | // PIPELINE KNOBS 7 | params{ 8 | 9 | 10 | /* 11 | sample reads options 12 | */ 13 | 14 | // Stream Function 15 | // Set this as 'cat' if the fastq files are not gzipped 16 | fastq_stream_func = 'zcat' 17 | 18 | 19 | /* 20 | output options 21 | */ 22 | 23 | // Dataset Name Prefix 24 | dataset_prefix = 'data' 25 | 26 | // Output Xarray pickle 27 | output_pickle_xarray = true 28 | 29 | //
Output tall format ds 30 | output_tall_csv = false 31 | 32 | // Output wide format ds 33 | output_wide_csv = true 34 | 35 | 36 | /* 37 | alignment options 38 | */ 39 | 40 | // read length for align 41 | read_length = 125 42 | 43 | // peptide oligo encoding length for alignment 44 | oligo_tile_length = 117 45 | 46 | // mismatches allowed (end-to-end) 47 | n_mismatches = 2 48 | 49 | // other bowtie options 50 | bowtie_optional_args = '--tryhard --nomaqround --norc --best --sam --quiet' 51 | 52 | // Flag for replicating counts for replicate sequences 53 | replicate_sequence_counts = true 54 | 55 | 56 | 57 | /* 58 | Optional workflows 59 | */ 60 | 61 | run_cpm_enr_workflow = false 62 | run_zscore_fit_predict = false 63 | run_edgeR = true 64 | 65 | // WARNING: This functionality has not been fully tested 66 | run_BEER = false 67 | 68 | 69 | /* 70 | Options for grouping results by organism (e.g. VirScan) 71 | */ 72 | 73 | // Flag used to control the summary of results by organism 74 | summarize_by_organism = false 75 | 76 | // Column in the peptide table indicating the organism for each peptide 77 | peptide_org_col = "organism" 78 | 79 | // Column in the peptide table containing the peptide sequence 80 | // (used to match against public epitopes, and to filter overlapping peptides) 81 | peptide_seq_col = "seq" 82 | 83 | // Maximum allowed overlap between detected peptides 84 | max_overlap = 7 85 | 86 | // Minimum z-score threshold 87 | zscore_threshold = 2.5 88 | 89 | // Maximum edgeR threshold (BH-adjusted p-value) 90 | edgeR_threshold = 0.05 91 | 92 | // Column in the sample table used for mapping replicates to samples 93 | sample_grouping_col = "" 94 | 95 | // Optional, a CSV containing public epitopes 96 | public_epitopes_csv = "$projectDir/templates/public_epitope_template.csv" 97 | 98 | // In the public epitopes CSV, the column containing the translated amino acid sequence 99 | public_epitopes_col = "peptide_translate" 100 | 101 | } 102 | 103 | // Set the container 
which can be used for all processes 104 | process { 105 | 106 | // Default for any processes which do not match the selectors below 107 | container = 'quay.io/matsengrp/phip-flow:latest' 108 | 109 | withName: 'run_edgeR|run_BEER' { 110 | container = 'quay.io/biocontainers/bioconductor-beer:1.2.0--r42hdfd78af_0' 111 | } 112 | 113 | } 114 | 115 | profiles { 116 | 117 | // Run locally assuming all deps available in current environment 118 | standard { 119 | } 120 | 121 | // Run locally assuming docker is installed with the latest image 122 | docker { 123 | docker.enabled = true 124 | } 125 | 126 | 127 | singularity { 128 | singularity.enabled = true 129 | singularity.autoMounts = true 130 | //singularity.cacheDir = 'singularity_cache' 131 | singularity.runOptions = '--contain -W /tmp/ -B ${HOME},${PWD}' 132 | } 133 | 134 | apptainer { 135 | apptainer.enabled = true 136 | apptainer.autoMounts = true 137 | //apptainer.cacheDir = 'apptainer_cache' 138 | apptainer.runOptions = '--contain -W /tmp/ -B ${HOME},${PWD}' 139 | } 140 | 141 | // Run batch submission via the SLURM executor, using Singularity containers 142 | cluster { 143 | 144 | singularity { 145 | enabled = true 146 | autoMounts = true 147 | cacheDir = 'singularity_cache' 148 | runOptions = '--contain -W /tmp/ -B ${HOME},${PWD}' 149 | } 150 | 151 | process { 152 | executor = 'slurm' 153 | queue = 'campus-new' 154 | cpus = 1 155 | memory = 16.GB 156 | time = '1h' 157 | clusterOptions = '-A overbaugh_j' 158 | withLabel: mem_large { memory = 48.GB } 159 | withLabel: mem_xlarge { memory = 64.GB } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /templates/aggregate_organisms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from typing import List 5 | import pandas as pd 6 | import logging 7 | from scipy.stats import gmean 8 | 9 | # APPROACH 10 | 11 | # 1.
Start with the Z-scores calculated per epitope across every sample replicate 12 | 13 | # 2. Combine the epitope-level data collected for multiple 14 | # replicates of the same sample, saving a table with the 15 | # following information per epitope, per sample: 16 | # Mean Z-score, also referred to as the Epitope Binding Score (EBS) 17 | # Hit: 18 | # marked as TRUE if both replicates are above the threshold Z-score 19 | # marked as FALSE if both replicates are below the threshold Z-score 20 | # marked as DISCORDANT if some but not all replicates are above the threshold Z-score 21 | # Public: marked as TRUE if the epitope was included in the input list of public epitopes 22 | 23 | # 3. To combine the virus-level data for each sample, only keep the highest-scoring 24 | # set of epitopes which do not overlap with any other epitope by more than 7aa. 25 | # To identify overlaps, use an exact alignment approach using k-mers. Note that this 26 | # will count two peptides as overlapping if they share any 7aa sequence, without 27 | # performing global alignment. 28 | 29 | # 4. 
Finally, save a table with the following information per virus, per sample: 30 | # Number of all epitope hits 31 | # Number of public epitope hits 32 | # Number of all discordant epitopes 33 | # Number of public discordant epitopes 34 | # Max EBS across all epitopes 35 | # Max EBS across public epitopes 36 | # Mean EBS across all epitopes 37 | # Mean EBS across public epitopes 38 | 39 | 40 | # INPUTS 41 | 42 | # Read in the input data from an expected location 43 | # The placement of the appropriate files in these locations 44 | # is expected to be performed by the Nextflow wrapper code 45 | # using the configuration of the appropriate module within 46 | # the phip-flow workflow 47 | 48 | class AggregatePhIP: 49 | 50 | def __init__(self): 51 | 52 | # Set up logging 53 | self.logger = self.setup_logging() 54 | 55 | # Mapping replicates to samples 56 | self.sample_mapping = self.read_sample_mapping() 57 | 58 | # List of public epitopes 59 | self.public_epitopes = self.read_public_epitopes() 60 | 61 | # Mapping peptides to organisms 62 | self.peptide_mapping = self.read_peptide_mapping() 63 | 64 | # The user must specify the maximum overlap 65 | self.max_overlap = int("!{params.max_overlap}") 66 | self.logger.info(f"Maximum overlap: {self.max_overlap}") 67 | 68 | # The user must specify the minimum z-score threshold 69 | self.zscore_threshold = float("!{params.zscore_threshold}") 70 | self.logger.info(f"Z-score threshold: {self.zscore_threshold}") 71 | 72 | # Read in the z-scores 73 | zscores_fp = "!{params.dataset_prefix}_zscore.csv.gz" 74 | self.logger.info(f"Reading in z-scores from: {zscores_fp}") 75 | assert os.path.exists(zscores_fp) 76 | self.zscores = pd.read_csv(zscores_fp, index_col=0) 77 | 78 | # Read in the edgeR hits (if present) 79 | edgeR_hits_fp = "!{params.dataset_prefix}_edgeR_hits.csv.gz" 80 | if os.path.exists(edgeR_hits_fp): 81 | self.logger.info(f"Reading in edgeR hits from: {edgeR_hits_fp}") 82 | self.edgeR_hits = pd.read_csv( 83 | 
edgeR_hits_fp, 84 | index_col=0, 85 | true_values=["TRUE", "True", "true"], 86 | false_values=["FALSE", "False", "false"], 87 | na_values=["NA", "N/A", "Na", "na", "n/a"] 88 | ) 89 | self.has_edgeR_hits = True 90 | else: 91 | self.has_edgeR_hits = False 92 | 93 | # Group the replicates by sample 94 | self.logger.info("Grouping replicates by sample") 95 | self.sample_table = self.group_replicates() 96 | 97 | # Apply the max_overlap filter 98 | # (setting the column 'passes_filter' to True if the peptide passes) 99 | self.sample_table = self.apply_max_overlap_filter() 100 | 101 | # Save to CSV 102 | self.sample_table.to_csv("!{sample_id}.peptide.ebs.csv.gz", index=None) 103 | 104 | # Group the peptides by organism 105 | self.logger.info("Grouping peptides by organism") 106 | self.organism_table = self.group_organisms() 107 | 108 | # Save to CSV 109 | self.logger.info("Writing organism-level outputs to CSV") 110 | self.organism_table.to_csv("!{sample_id}.organism.summary.csv.gz", index=None) 111 | 112 | self.logger.info("Done") 113 | 114 | def setup_logging(self) -> logging.Logger: 115 | """Set up logging.""" 116 | 117 | # Set up logging 118 | logFormatter = logging.Formatter( 119 | '%(asctime)s %(levelname)-8s [aggregate_organisms] %(message)s' 120 | ) 121 | logger = logging.getLogger() 122 | logger.setLevel(logging.INFO) 123 | 124 | # Also write to STDOUT 125 | consoleHandler = logging.StreamHandler() 126 | consoleHandler.setFormatter(logFormatter) 127 | logger.addHandler(consoleHandler) 128 | 129 | return logger 130 | 131 | def read_sample_mapping(self) -> pd.Series: 132 | """Read a mapping of replicates to samples.""" 133 | 134 | # The user must specify a CSV containing the sample mapping 135 | sample_mapping_fp = "!{params.dataset_prefix}_sample_annotation_table.csv.gz" 136 | self.logger.info(f"Reading in sample mapping from: {sample_mapping_fp}") 137 | assert os.path.exists(sample_mapping_fp) 138 | 139 | # Read in the table 140 | df = 
pd.read_csv(sample_mapping_fp, index_col=0) 141 | self.logger.info(f"Sample mapping table has {df.shape[0]:,} rows and {df.shape[1]:,} columns") 142 | 143 | # If the user specified a column used to group replicates 144 | # from the same sample 145 | sample_grouping_col = "!{params.sample_grouping_col}" 146 | if len(sample_grouping_col) > 0: 147 | 148 | # Make sure that the column is present in the table 149 | msg = f"Column '{sample_grouping_col}' not found ({', '.join(df.columns.values)})" 150 | assert sample_grouping_col in df.columns.values, msg 151 | 152 | # Return the column mapping of replicates to samples 153 | return df[sample_grouping_col] 154 | 155 | # Otherwise, if no grouping was specified 156 | else: 157 | 158 | # Just treat each sample the same 159 | return { 160 | int(replicate_id): str(replicate_id) 161 | for replicate_id in df.index.values 162 | } 163 | 164 | def read_peptide_mapping(self) -> pd.DataFrame: 165 | """Read the table mapping peptides (by ID) to organism, protein, and start position ('pos').""" 166 | 167 | peptide_mapping_fp = "!{params.dataset_prefix}_peptide_annotation_table.csv.gz" 168 | self.logger.info(f"Reading in peptide mappings from: {peptide_mapping_fp}") 169 | assert os.path.exists(peptide_mapping_fp) 170 | 171 | # Read in the table 172 | df = pd.read_csv(peptide_mapping_fp, index_col=0) 173 | self.logger.info(f"Peptide mapping table has {df.shape[0]:,} rows and {df.shape[1]:,} columns") 174 | 175 | # Map the user-provided names to controlled values 176 | mapping = { 177 | # The user must specify the column used to group peptides by organism 178 | "!{params.peptide_org_col}": "organism", 179 | # And by the protein sequence (which corresponds to the public epitope sequences) 180 | "!{params.peptide_seq_col}": "seq" 181 | } 182 | 183 | # For each of the user-provided columns 184 | for cname in mapping.keys(): 185 | 186 | # Make sure that it is in the table 187 | msg = f"Column '{cname}' not found ({', 
def read_public_epitopes(self) -> List[str]:
    """Load the public epitope sequences from the supplied CSV.

    The sequences are taken from the ``peptide_translate`` column, and each
    one is truncated at its first ``*`` character.

    Returns:
        The truncated sequences, one per row of the table, in file order.

    Raises:
        AssertionError: if the ``peptide_translate`` column is missing.
    """

    # Table of public epitopes
    epitope_table = pd.read_csv("!{public_epitopes_csv}")
    self.logger.info(f"Public epitope table has {epitope_table.shape[0]:,} rows")

    # Name of the column holding the epitope sequences (fixed in this template)
    public_epitopes_col = "peptide_translate"

    available_columns = ', '.join(epitope_table.columns.values)
    msg = f"Column not found: {public_epitopes_col} in ({available_columns})"
    assert public_epitopes_col in epitope_table.columns.values, msg

    # Keep only the part of each sequence before the first "*"
    return [
        epitope.split("*")[0]
        for epitope in epitope_table[public_epitopes_col]
    ]
def classify_hit(self, r):
    """Classify one peptide from the z-scores of its replicates.

    Args:
        r: the z-scores of a single peptide, one value per replicate.

    Returns:
        ``"TRUE"`` when every replicate is strictly above
        ``self.zscore_threshold``, ``"FALSE"`` when none is, and
        ``"DISCORDANT"`` when the replicates disagree.
    """

    # Per-replicate flag: is the z-score strictly above the threshold?
    exceeds = r > self.zscore_threshold

    if exceeds.all():
        return "TRUE"

    # Not unanimous: either some replicates pass (discordant) or none do
    return "DISCORDANT" if exceeds.any() else "FALSE"
def apply_max_overlap_filter_sub(
    self,
    df: pd.DataFrame
) -> pd.DataFrame:
    """Flag which peptides of one sample/organism group pass the overlap filter.

    Peptides are visited from highest to lowest ``EBS``. A peptide passes
    when none of its k-mers (k = ``self.max_overlap``) was contributed by a
    previously passing peptide; only passing peptides add their k-mers to
    the set of k-mers seen so far.

    Args:
        df: rows for a single group, with at least the columns ``peptide``
            (looked up in ``self.peptide_mapping["seq"]``) and ``EBS``.

    Returns:
        The same rows, restored to index order, with an added boolean
        ``passes_filter`` column.
    """

    # NOTE(review): the windows start at 0 .. len(seq) - k - 1, so the final
    # k-mer of each sequence (starting at len(seq) - k) is never generated,
    # and sequences no longer than k contribute no k-mers at all (they always
    # pass). Confirm whether this is the intended meaning of `max_overlap`.
    k = self.max_overlap
    seq_lookup = self.peptide_mapping["seq"].get

    # Attach each peptide's sequence (without trailing "*") and rank by EBS
    ranked = df.assign(
        seq=df["peptide"].apply(seq_lookup).apply(lambda s: s.rstrip("*"))
    ).sort_values(by="EBS", ascending=False)

    # k-mers already claimed by a passing peptide, and the per-row verdicts
    claimed_kmers = set()
    verdicts = []

    # Walk down the ranking, strongest binders first
    for seq in ranked["seq"]:
        candidate_kmers = {seq[i:i + k] for i in range(len(seq) - k)}

        is_novel = claimed_kmers.isdisjoint(candidate_kmers)
        verdicts.append(is_novel)

        if is_novel:
            claimed_kmers.update(candidate_kmers)

    # Record the verdicts, drop the helper column, restore the input order
    return (
        ranked
        .assign(passes_filter=verdicts)
        .drop(columns=["seq"])
        .sort_index()
    )
#!/usr/bin/env python3
"""Concatenate the per-sample tables found in `input/` into one table per kind.

Every file in `input/` whose name ends with a given suffix is read with the
matching column types, the tables are stacked, and the result is written to a
file named after the suffix in the working directory.
"""
import os
import pandas as pd

# Column types enforced when reading the peptide-level tables
peptide_dtypes = {
    "peptide": str,
    "n_replicates": int,
    "EBS": float,
    "hit": str,
    "sample": str,
    "public": bool,
}

# Column types enforced when reading the organism-level summary tables
org_dtypes = {
    "sample": str,
    "organism": str,
    "n_hits_all": int,
    "n_discordant_all": int,
    "max_ebs_all": float,
    "mean_ebs_all": float,
    "n_hits_public": int,
    "n_discordant_public": int,
    "max_ebs_public": float,
    "mean_ebs_public": float,
}

# (file suffix, column types) for each kind of table to be joined;
# the suffix doubles as the name of the joined output file
tables_to_join = (
    ("peptide.ebs.csv.gz", peptide_dtypes),
    ("organism.summary.csv.gz", org_dtypes),
)

for suffix, dtype_dict in tables_to_join:

    # Read every shard of this kind, in directory-listing order
    shards = []
    for fp in os.listdir("input"):
        if not fp.endswith(suffix):
            continue
        shards.append(
            pd.read_csv(os.path.join("input", fp), dtype=dtype_dict)
        )

    # Stack the shards and write the joined table
    pd.concat(shards).to_csv(suffix, index=None)
#!/bin/bash

# Align the reads of one replicate to the peptide index with bowtie.
#
# When the reads are longer than the tiles in the library, the surplus
# (READ_LENGTH - PEPTIDE_LENGTH) bases are trimmed from the 3' end of each
# read; otherwise nothing is trimmed.
#
# No reporting options are set here, so bowtie's defaults apply unless they
# are overridden through params.bowtie_optional_args (see the bowtie manual).
# The number of allowed mismatches comes from params.n_mismatches; it, or any
# other option, may also be hard-coded below.

set -euo pipefail

# NOTE: the assignments below are deliberately left exactly as rendered by
# Nextflow (no added quoting), so values that already work keep working.
STREAM_FILE_CMD=!{params.fastq_stream_func}
FASTQ=!{respective_replicate_path}
INDEX=!{index}/peptide
ALIGN_OUT_FN=!{sample_id}.sam
READ_LENGTH=!{params.read_length}
PEPTIDE_LENGTH=!{params.oligo_tile_length}
CPUS=!{task.cpus}
MM=!{params.n_mismatches}
OP_ARGS="!{params.bowtie_optional_args}"

# Number of bases to trim from the 3' end of each read.
# Arithmetic expansion is used instead of `let`, whose exit status is
# non-zero whenever the result is 0 (fatal under `set -e`).
if [ "${PEPTIDE_LENGTH}" -lt "${READ_LENGTH}" ]; then
    TRIM3=$(( READ_LENGTH - PEPTIDE_LENGTH ))
else
    TRIM3=0
fi

echo "$OP_ARGS"

# STREAM_FILE_CMD and OP_ARGS are intentionally unquoted: each may hold
# several space-separated words that must become separate arguments.
# Everything else is quoted to prevent word splitting and globbing.
$STREAM_FILE_CMD "$FASTQ" | bowtie \
  --trim3 "$TRIM3" \
  --threads "$CPUS" \
  -n "$MM" \
  -l "$PEPTIDE_LENGTH" \
  $OP_ARGS \
  -x "$INDEX" - > "$ALIGN_OUT_FN"
logger = setup_logging()

# Location of the sample annotation table (required input)
sample_mapping_fp = "!{params.dataset_prefix}_sample_annotation_table.csv.gz"
logger.info(f"Reading in sample mapping from: {sample_mapping_fp}")
assert os.path.exists(sample_mapping_fp)

# Load the table, using its first column as the index
df = pd.read_csv(sample_mapping_fp, index_col=0)
logger.info(f"Sample mapping table has {df.shape[0]:,} rows and {df.shape[1]:,} columns")

# Optional column used to group replicates from the same sample
# (an empty string means that no grouping was requested)
sample_grouping_col = "!{params.sample_grouping_col}"

if sample_grouping_col:

    # The grouping column must exist in the table
    msg = f"Column '{sample_grouping_col}' not found ({', '.join(df.columns.values)})"
    assert sample_grouping_col in df.columns.values, msg

    # One line per distinct sample name, without header or index
    unique_samples = df.reindex(columns=[sample_grouping_col]).drop_duplicates()
    unique_samples.to_csv(
        "sample_list",
        header=None,
        index=None
    )

else:

    # No grouping: every replicate (index value) is listed as its own sample
    replicate_ids = [str(replicate_id) for replicate_id in df.index.values]
    with open("sample_list", "w") as handle:
        handle.write("\n".join(replicate_ids))
#!/usr/bin/env nextflow

// Using DSL-2
nextflow.enable.dsl=2

// Validate the user-supplied sample table and write the validated copy
process validate_sample_table {
    input:
        path samples
    output:
        path "validated_sample_table.csv"
    script:
    """
    validate-sample-table.py \
        -s $samples \
        -o validated_sample_table.csv \
        --run_zscore_fit_predict ${params.run_zscore_fit_predict}
    """
}

// Validate the user-supplied peptide table and write the validated copy
// TODO fix script for no peptide id provided
process validate_peptide_table {
    input:
        path peptides
    output:
        path "validated_peptide_table.csv"
    script:
    """
    validate-peptide-table.py \
        -p $peptides \
        -o validated_peptide_table.csv
    """
}

// Convert the peptide metadata to a FASTA reference
process generate_fasta_reference {
    input:
        path peptide_table
    output:
        path "peptides.fasta"
    script:
    """
    generate-fasta.py \
        -pt $peptide_table \
        -o peptides.fasta
    """
}

// Build the alignment index from the FASTA reference.
// The output is keyed with "peptide_ref" so it can be crossed with the samples.
process generate_index {
    input:
        path "oligo_fasta"
    output:
        tuple val("peptide_ref"), path("peptide_index")
    shell:
        template "generate_index.sh"
}

// Align each sample to the reference
process short_read_alignment {
    label 'alignment_tool'
    input:
        tuple val(sample_id), path(index), path(respective_replicate_path)
    output:
        tuple val(sample_id), path("${sample_id}.sam")
    shell:
        template "short_read_alignment.sh"
}

// Compute alignment stats for each sample
process sam_to_stats {
    input:
        tuple val(sample_id), path(sam_file)
    output:
        path "${sample_id}.stats"
    shell:
        template "sam_to_stats.sh"
}

// Compute per-peptide counts for each sample
process sam_to_counts {
    input:
        tuple val(sample_id), path(sam_file)
    output:
        path "${sample_id}.counts"
    shell:
        template "sam_to_counts.sh"
}

// Collect all counts and stats files and merge them with the annotation tables
// TODO move to bin script remove from phippery
process collect_phip_data {
    input:
        path all_counts_files
        path all_alignment_stats
        path sample_table
        path peptide_table
    output:
        path "data.phip"
    shell:
    """
    merge-counts-stats.py \
        -st ${sample_table} \
        -pt ${peptide_table} \
        -cfp "*.counts" \
        -sfp "*.stats" \
        -o data.phip
    """
}

// Optional post-processing of the merged dataset (see replicate-counts.py)
process replicate_counts {
    input:
        path ds
    output:
        path "replicated_counts.phip"
    script:
    """
    replicate-counts.py \
        -ds ${ds} \
        -o replicated_counts.phip
    """
}

// Validate the inputs, build the index, align every sample, and merge the
// per-sample counts and stats into a single dataset.
workflow ALIGN {

    main:
        sample_ch = Channel.fromPath(params.sample_table)
        peptide_ch = Channel.fromPath(params.peptide_table)

        // Validate both annotation tables
        validate_sample_table(sample_ch)
        validate_peptide_table(peptide_ch)

        // Peptide table -> FASTA -> index
        generate_fasta_reference(validate_peptide_table.out)
        generate_index(generate_fasta_reference.out)

        // One tuple per row of the validated sample table, keyed with
        // "peptide_ref" so that it matches the key emitted by generate_index
        samples_ch = validate_sample_table.out
            .splitCsv(header: true)
            .map { row ->
                tuple(
                    "peptide_ref",
                    row.sample_id,
                    file(
                        "$params.reads_prefix/$row.fastq_filepath",
                        checkIfExists: true
                    )
                )
            }

        // Pair the index with every sample and align
        short_read_alignment(
            generate_index.out
                .cross(samples_ch)
                .map { ref, sample ->
                    tuple(
                        sample[1],          // sample_id
                        file(ref[1]),       // index files
                        file(sample[2])     // sample path
                    )
                }
        )

        // Counts and stats are both derived from the same alignments
        sam_to_counts(short_read_alignment.out)
        sam_to_stats(short_read_alignment.out)

        ds = collect_phip_data(
            sam_to_counts.out.toSortedList(),
            sam_to_stats.out.toSortedList(),
            validate_sample_table.out,
            validate_peptide_table.out
        )

        if ( params.replicate_sequence_counts ) {
            final_output = replicate_counts(ds)
        } else {
            final_output = ds
        }

    emit:
        final_output
}
// Run edgeR (always) and BEER (only when params.run_BEER is set) on the
// dataset, then append the resulting CSV tables to the dataset and publish
// the RDS file produced by the last tool that ran.
workflow edgeR_BEER_workflows {
    take:
        ds
    main:
        // Export the dataset to wide CSV and run edgeR on it
        to_csv(ds)
        run_edgeR(to_csv.out)

        // BEER is optional and consumes the edgeR output
        if ( params.run_BEER ) {
            run_BEER(run_edgeR.out)
            assay_results = run_BEER.out
        } else {
            assay_results = run_edgeR.out
        }

        // Both downstream steps receive the same results
        append_assay_csvs_to_xarray(assay_results)
        publish_rds(assay_results)

    emit:
        append_assay_csvs_to_xarray.out
}
#!/usr/bin/env nextflow

// Using DSL-2
nextflow.enable.dsl=2

// Write the dataset as a gzipped tall-format CSV.
// Runs only when params.output_tall_csv is set.
process dump_tall_csv {
    publishDir "$params.results/tall_data/", mode: 'copy', overwrite: true
    input:
        path phip_data
    output:
        path "*.csv.gz"
    when:
        params.output_tall_csv
    shell:
    """
    phippery to-tall-csv -o ${params.dataset_prefix}-tall.csv "$phip_data"
    gzip *.csv
    """
}

// Write the dataset as gzipped wide-format CSVs.
// Also runs when params.summarize_by_organism is set, not only when
// params.output_wide_csv is.
process dump_wide_csv {
    publishDir "$params.results/wide_data/", mode: 'copy', overwrite: true
    input:
        path phip_data
    output:
        path "*.csv.gz"
    when:
        params.output_wide_csv || params.summarize_by_organism
    shell:
    """
    phippery to-wide-csv -o $params.dataset_prefix "$phip_data"
    gzip *.csv
    """
}

// Publish a copy of the binary dataset named after the dataset prefix.
// Runs only when params.output_pickle_xarray is set.
process dump_binary {
    publishDir "$params.results/pickle_data/", mode: 'copy', overwrite: true
    input:
        path phip_data
    output:
        path "${params.dataset_prefix}.phip"
    when:
        params.output_pickle_xarray
    shell:
    """
    cp "${phip_data}" ${params.dataset_prefix}.phip
    """
}

// Emit the dataset in every requested output format.
// All three processes use the `path` qualifier (the legacy `file` qualifier
// was previously used by two of them).
workflow DSOUT {
    take:
        dataset
    main:
        dump_binary(dataset)
        dump_wide_csv(dataset)
        dump_tall_csv(dataset)

    emit:
        dump_binary.out
        dump_wide_csv.out
        dump_tall_csv.out
}
// Load the dataset, apply phippery's counts_per_million to it, and save the
// result as "cpm.phip". Needs no sample or peptide annotations.
process counts_per_million {
    input:
        path phip_data
    output:
        path "cpm.phip"
    shell:
    """
    #!/usr/bin/env python3

    from phippery.normalize import counts_per_million
    from phippery.utils import *

    # The return value is not used; presumably the table is added to the
    # dataset in place (confirm against the phippery documentation)
    dataset = load("$phip_data")
    counts_per_million(dataset)
    dump(dataset, "cpm.phip")
    """
}
// Compute every requested statistic on the dataset and merge the results
// into a single dataset.
workflow STATS {

    take:
        dataset
    main:

        // we automatically compute some stats
        // which are independent of any annotations
        dataset | \
            (counts_per_million & size_factors) | \
            mix | set { auto_stats_ch }

        // edgeR / BEER are run only on request.
        // Logical `||` (rather than bitwise `|`) follows Groovy truth, so the
        // test also works when a parameter is unset or not a strict boolean.
        if ( params.run_edgeR || params.run_BEER ) {
            dataset | edgeR_BEER_workflows | set { edgeR_BEER_ch }
        } else {
            Channel.empty() | set { edgeR_BEER_ch }
        }

        // run some optional statistics which
        // depend on certain annotations
        // (each process carries its own `when:` guard)
        cpm_fold_enrichment(counts_per_million.out) | set { cpm_fold_enr_ch }
        fit_predict_zscore(counts_per_million.out) | set { fit_pred_zscore_ch }

        // collect all the datasets statistics and merge
        auto_stats_ch.concat(
            cpm_fold_enr_ch,
            fit_pred_zscore_ch,
            edgeR_BEER_ch
        ) | collect | merge_binary_datasets

    emit:
        merge_binary_datasets.out
}