├── .DS_Store ├── .gitignore ├── .gitmodules ├── .travis.yml ├── AUTHORS.rst ├── EMX1_visualization.png ├── LICENSE ├── MANIFEST ├── MANIFEST.in ├── Makefile ├── README.md ├── conda-build ├── meta.yaml ├── py2_all │ ├── linux-32 │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ ├── linux-aarch64 │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ ├── linux-armv6l │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ ├── linux-armv7l │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ ├── linux-ppc64le │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ ├── osx-64 │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ ├── win-32 │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 │ └── win-64 │ │ └── guide_seq-1.0.2-py27_0.tar.bz2 ├── py35 │ ├── linux-32 │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ ├── linux-aarch64 │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ ├── linux-armv6l │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ ├── linux-armv7l │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ ├── linux-ppc64le │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ ├── osx-64 │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ ├── win-32 │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 │ └── win-64 │ │ └── guide_seq-1.0.2-py35_0.tar.bz2 └── py3_all │ ├── linux-32 │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ ├── linux-aarch64 │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ ├── linux-armv6l │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ ├── linux-armv7l │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ ├── linux-ppc64le │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ ├── osx-64 │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ ├── win-32 │ └── guide_seq-1.0.2-py37_0.tar.bz2 │ └── win-64 │ └── guide_seq-1.0.2-py37_0.tar.bz2 ├── guideseq ├── #guideseq_visualize_only.py# ├── NUC_SIMPLE ├── __init__.py ├── alignReads.py ├── filterBackgroundSites.py ├── guideseq.py ├── guideseq_visualize_only.py ├── guideseq_visualize_only.py~ ├── identifyOfftargetSites.py ├── log.py ├── validation.py ├── visualization.py ├── visualization2.py └── visualization_bk.py ├── guideseq_flowchart.png ├── requirements.txt ├── setup.cfg ├── setup.py ├── test ├── __init__.py ├── data │ ├── 
aligned │ │ ├── EMX1.sam │ │ └── control.sam │ ├── consolidated │ │ ├── EMX1.r1.consolidated.fastq │ │ ├── EMX1.r2.consolidated.fastq │ │ ├── control.r1.consolidated.fastq │ │ └── control.r2.consolidated.fastq │ ├── demultiplexed │ │ ├── EMX1.i1.fastq │ │ ├── EMX1.i2.fastq │ │ ├── EMX1.r1.fastq │ │ ├── EMX1.r2.fastq │ │ ├── control.i1.fastq │ │ ├── control.i2.fastq │ │ ├── control.r1.fastq │ │ ├── control.r2.fastq │ │ ├── undetermined.i1.fastq │ │ ├── undetermined.i2.fastq │ │ ├── undetermined.r1.fastq │ │ └── undetermined.r2.fastq │ ├── filtered │ │ └── EMX1_backgroundFiltered.txt │ ├── identified │ │ ├── EMX1_identifiedOfftargets.txt │ │ └── control_identifiedOfftargets.txt │ ├── umitagged │ │ ├── EMX1.r1.umitagged.fastq │ │ ├── EMX1.r2.umitagged.fastq │ │ ├── control.r1.umitagged.fastq │ │ └── control.r2.umitagged.fastq │ ├── undemultiplexed │ │ ├── undemux.i1.fastq.gz │ │ ├── undemux.i2.fastq.gz │ │ ├── undemux.r1.fastq.gz │ │ └── undemux.r2.fastq.gz │ └── visualization │ │ └── EMX1_identifiedOfftargets_offtargets.svg ├── demultiplex_manifest.yaml ├── large_test.sh ├── large_test │ └── reference_output │ │ ├── EMX1_backgroundFiltered.txt │ │ ├── VEGFA_site1_backgroundFiltered.txt │ │ ├── VEGFA_site2_backgroundFiltered.txt │ │ ├── VEGFA_site3_backgroundFiltered.txt │ │ └── md5.txt ├── scripts │ ├── bwa_index_alignment.sh │ ├── compile_dependencies.sh │ ├── prepare_test_data.sh │ ├── prepare_test_genome.sh │ ├── prepare_test_genome_index.sh │ ├── samplekey.txt │ └── test_regions.bed ├── test_genome.fa ├── test_guideseq.py ├── test_manifest.yaml └── utils.py └── tox.ini /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | 
guideseq.egg-info/ 4 | 5 | *.py[cod] 6 | test/output 7 | .genome 8 | 9 | # Genome indices 10 | *.amb 11 | *.ann 12 | *.bwt 13 | *.fai 14 | *.pac 15 | *.sa 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Idea folder 21 | .idea 22 | 23 | # Packages 24 | *.egg 25 | *.egg-info 26 | dist 27 | build 28 | eggs 29 | parts 30 | bin 31 | var 32 | sdist 33 | develop-eggs 34 | .installed.cfg 35 | lib64 36 | 37 | # Installer logs 38 | pip-log.txt 39 | 40 | # Unit test / coverage reports 41 | .coverage 42 | .tox 43 | nosetests.xml 44 | htmlcov 45 | 46 | # Translations 47 | *.mo 48 | 49 | # Mr Developer 50 | .mr.developer.cfg 51 | .project 52 | .pydevproject 53 | 54 | # Complexity 55 | output/*.html 56 | output/*/index.html 57 | 58 | # Sphinx 59 | docs/_build 60 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "guideseq/umi"] 2 | path = guideseq/umi 3 | url = https://github.com/aryeelab/umi.git 4 | branch = master 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "2.7" 7 | 8 | before_install: 9 | - cd test 10 | - git clone https://github.com/lh3/bwa.git 11 | - cd bwa 12 | - git checkout tags/0.7.9a 13 | - make 14 | - cd .. 15 | - PATH=`pwd`/bwa:$PATH 16 | - git clone https://github.com/arq5x/bedtools2.git 17 | - cd bedtools2 18 | - git checkout tags/v2.25.0 19 | - make 20 | - cd .. 21 | - PATH=`pwd`/bedtools2/bin:$PATH 22 | - cd .. 23 | 24 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 25 | install: 26 | - pip install -r requirements.txt 27 | 28 | # command to run tests, e.g. 
python setup.py test 29 | script: 30 | cd test && nosetests --exe -v 31 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Leads 6 | ----------------- 7 | 8 | * Shengdar Q Tsai 9 | * Martin Aryee 10 | * Ved V Topkar 11 | 12 | Contributors 13 | ------------ 14 | 15 | None yet. Why not be the first? 16 | -------------------------------------------------------------------------------- /EMX1_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/EMX1_visualization.png -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | AUTHORS.rst 3 | LICENSE 4 | requirements.txt 5 | setup.cfg 6 | setup.py 7 | guideseq/__init__.py 8 | guideseq/alignReads.py 9 | guideseq/filterBackgroundSites.py 10 | guideseq/guideseq.py 11 | guideseq/identifyOfftargetSites.py 12 | guideseq/log.py 13 | guideseq/validation.py 14 | guideseq/visualization.py 15 | test/test_guideseq.py 16 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include LICENSE 3 | include README.rst 4 | include requirements.txt 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc 
clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | @echo "test - run tests quickly with the default Python" 9 | @echo "docs - generate Sphinx HTML documentation, including API docs" 10 | @echo "install - install the package to the active Python's site-packages" 11 | 12 | clean: clean-build clean-pyc clean-test 13 | 14 | clean-build: 15 | rm -fr build/ 16 | rm -fr dist/ 17 | rm -fr .eggs/ 18 | find . -name '*.egg-info' -exec rm -fr {} + 19 | find . -name '*.egg' -exec rm -f {} + 20 | 21 | clean-pyc: 22 | find . -name '*.pyc' -exec rm -f {} + 23 | find . -name '*.pyo' -exec rm -f {} + 24 | find . -name '*~' -exec rm -f {} + 25 | find . -name '__pycache__' -exec rm -fr {} + 26 | 27 | clean-test: 28 | rm -fr .tox/ 29 | rm -f .coverage 30 | rm -fr htmlcov/ 31 | 32 | test: 33 | python setup.py test 34 | 35 | docs: 36 | rm -f docs/guideseq.rst 37 | rm -f docs/modules.rst 38 | sphinx-apidoc -o docs/ guideseq 39 | $(MAKE) -C docs clean 40 | $(MAKE) -C docs html 41 | open docs/_build/html/index.html 42 | 43 | install: clean 44 | python setup.py install 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Version][version-shield]][version-url] 3 | [![Python versions][python-shield]][python-url] 4 | [![Platforms][platform-shield]][python-url] 5 | 6 | # guideseq: The GUIDE-Seq Analysis Package 7 | 8 | **Note that an updated version of this package, including Python 3 support, is maintained by the Tsai Lab: https://github.com/tsailabSJ/guideseq** 9 | 10 | This repo (aryeelab/guideseq) contains experimental features. 
11 | 12 | ------ 13 | 14 | The guideseq package implements our data preprocessing and analysis pipeline for GUIDE-Seq data. It takes raw sequencing reads (FASTQ) and a parameter manifest file (.yaml) as input and produces a table of annotated off-target sites as output. 15 | 16 | ### References 17 | 18 | ##### The original paper describing the GUIDE-Seq method: 19 | 20 | Tsai SQ, Zheng Z, Nguyen NT, Liebers M, Topkar VV, Thapar V, Wyvekens N, Khayter C, Iafrate AJ, Le LP, Aryee MJ, Joung JK. [GUIDE-seq enables genome-wide profiling of off-target cleavage by CRISPR-Cas nucleases](https://www.ncbi.nlm.nih.gov/pubmed/25513782). Nat Biotechnol. 2015 Feb;33(2):187-197 21 | 22 | ##### A description of this analysis package: 23 | Tsai SQ, Topkar VV, Joung JK, Aryee MJ. [Open-source guideseq software for analysis of GUIDE-seq data](https://www.ncbi.nlm.nih.gov/pubmed/27153277). Nat Biotechnol. 2016 May 6;34(5):483 24 | 25 | ## Table of Contents 26 | - [Features](#features) 27 | - [Dependencies](#dependencies) 28 | - [Getting Set Up](#setup) 29 | - [Installation](#Installation) 30 | - [Quickstart](#Quickstart) 31 | - [Running the Full Analysis Pipeline](#full_pipeline) 32 | - [Quickstart](#quickstart) 33 | - [Writing A Manifest File](#write_manifest) 34 | - [A Full Manifest File Example](manifest_example) 35 | - [Pipeline Outputs](#pipeline_output) 36 | - [Running Analysis Steps Individually](#) 37 | - [Demultiplex](#demultiplex) 38 | - [UMItag](#umitag) 39 | - [Consolidate](#consolidate) 40 | - [Align](#align) 41 | - [Identify](#identify) 42 | - [Filter](#filter) 43 | - [Visualize](#visualize) 44 | - [Frequently Asked Questions](#FAQ) 45 | - [How do I Run the Pipeline with Demultiplexed Data?](#demultiplexed_run) 46 | - [Can I analyze data without UMIs?](#no_umis) 47 | 48 | 49 | ## Features 50 | 51 | 52 | The package implements a pipeline consisting of a read preprocessing module followed by an off-target identification module. 
The preprocessing module takes raw reads (FASTQ) from a pooled multi-sample sequencing run as input. Reads are demultiplexed into sample-specific FASTQs and PCR duplicates are removed using unique molecular index (UMI) barcode information. 53 | 54 | ![guideseq_flowchart](guideseq_flowchart.png) 55 | 56 | The individual pipeline steps are: 57 | 58 | 1. **Sample demultiplexing**: A pooled multi-sample sequencing run is demultiplexed into sample-specific read files based on sample-specific dual-indexed barcodes 59 | 2. **PCR Duplicate Consolidation**: Reads that share the same UMI and the same first six bases of genomic sequence are presumed to originate from the same pre-PCR molecule and are thus consolidated into a single consensus read to improve quantitative interpretation of GUIDE-Seq read counts. 60 | 3. **Read Alignment**: The demultiplexed, consolidated paired end reads are aligned to a reference genome using the BWA-MEM algorithm with default parameters (Li. H, 2009). 61 | 4. **Candidate Site Identification**: The start mapping positions of the read amplified with the tag-specific primer (second of pair) are tabulated on a genome-wide basis. Start mapping positions are consolidated using a 10-bp sliding window. Windows with reads mapping to both + and - strands, or to the same strand but amplified with both forward and reverse tag-specific primers, are flagged as sites of potential DSBs. 25 bp of reference sequence is retrieved on either side of the most frequently occurring start-mapping position in each flagged window. The retrieved sequence is aligned to the intended target sequence using a Smith-Waterman local-alignment algorithm. 62 | 5. **False positive filtering**: Off-target cleavage sites with more than six mismatches to the intended target sequence, or that are present in background controls, are filtered out. 63 | 6. **Reporting**: Identified off-targets, sorted by GUIDE-Seq read count, are annotated in a final output table. 
The GUIDE-Seq read count is expected to scale approximately linearly with cleavage rates (Tsai et al., *Nat Biotechnol.* 2015). 64 | 7. **Visualization**: Alignment of detected off-target sites is visualized via a color-coded sequence grid, as seen below: 65 | 66 | ![guideseq_flowchart](EMX1_visualization.png) 67 | 68 | ## Dependencies 69 | * Python 2 or 3 70 | * Reference genome fasta file ([Example](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)) 71 | * [`bwa`](http://bio-bwa.sourceforge.net/) alignment tool 72 | * [`bedtools`](http://bedtools.readthedocs.org/en/latest/content/installation.html) genome arithmetic utility 73 | 74 | 75 | ## Getting Set Up 76 | 77 | ### Installation 78 | 79 | ``` 80 | # It's recommended (but not essential) to set up a conda environment to manage dependencies 81 | conda create -n guideseq python=3.8 82 | conda activate guideseq 83 | 84 | git clone --recursive https://github.com/aryeelab/guideseq 85 | cd guideseq 86 | 87 | pip install -r requirements.txt 88 | python setup.py install 89 | 90 | guideseq.py -h 91 | 92 | ## Please install BWA and bedtools if you choose this option 93 | 94 | ``` 95 | 96 | - **Burrows-Wheeler Aligner (bwa)**: You can either install bwa with a package manager (e.g. `brew` on OSX or `apt-get` on Ubuntu/Debian), or you can download it from the [project page](http://bio-bwa.sourceforge.net/) and compile it from source. 97 | - **Bedtools**: You can either install bedtools with a package manager (e.g. `brew` or `apt-get`), or you can download it from the [project page](http://bedtools.readthedocs.org/en/latest/content/installation.html) and compile it from source. 98 | 99 | For both bwa and bedtools, make sure you know the path to the respective executables, as they need to be specified in the pipeline manifest file. 
100 | 101 | 102 | 103 | ## Quickstart 104 | 105 | ``` 106 | guideseq.py all -m test_manifest.yaml 107 | ``` 108 | 109 | ## Running the Full Analysis Pipeline 110 | 111 | 112 | To run the full guideseq analysis pipeline, you must first create a manifest YAML file that describes all pipeline inputs. Once you have done so, you can simply run 113 | 114 | ``` 115 | guideseq.py all -m /path/to/manifest.yaml 116 | ``` 117 | 118 | to run the entire pipeline. Below are specific instructions detailing how to write the manifest file. 119 | 120 | If you wish to run an example on our abridged test data, you can simply run 121 | 122 | ``` 123 | 124 | cd guideseq/test 125 | 126 | 127 | guideseq.py all -m test_manifest.yaml 128 | ``` 129 | from the guideseq root directory. The `test_manifest` assumes that both the `bwa` and `bedtools` executables are in your system PATH. You will see the pipeline results outputted to the `test/output` folder. 130 | 131 | ### Writing A Manifest File 132 | When running the end-to-end analysis functionality of the guideseq package, a number of inputs are required. To simplify the formatting of these inputs and to encourage reproducibility, these parameters are inputted into the pipeline via a manifest formatted as a YAML file. YAML files allow easy-to-read specification of key-value pairs. This allows us to easily specify our parameters. The following fields are required in the manifest: 133 | 134 | - `reference_genome`: The absolute path to the reference genome FASTA file. 135 | - `output_folder`: The absolute path to the folder in which all pipeline outputs will be saved. 136 | - `bwa`: The absolute path to the `bwa` executable 137 | - `bedtools`: The absolute path to the `bedtools` executable 138 | - `PAM`: PAM sequence (optional), default is NGG. 139 | - `search_radius`: Search radius for search. Set to 10 for Cas9 and 75 for Cpf1. 
140 | - `max_mismatches`: The maximum number of mismatches allowed to report a sequence-matched off-target 141 | - `undemultiplexed`: The absolute paths to the undemultiplexed paired end sequencing files. The required parameters are: 142 | - `forward`: The absolute path to the FASTQ file containing the forward reads. 143 | - `reverse`: The absolute path to the FASTQ file containing the reverse reads. 144 | - `index1`: The absolute path to the FASTQ file containing the forward index reads. 145 | - `index2`: The absolute path to the FASTQ file containing the reverse index reads. 146 | 147 | An example `undemultiplexed` field: 148 | 149 | ``` 150 | undemultiplexed: 151 | forward: ../test/data/undemux.r1.fastq.gz 152 | reverse: ../test/data/undemux.r2.fastq.gz 153 | index1: ../test/data/undemux.i1.fastq.gz 154 | index2: ../test/data/undemux.i2.fastq.gz 155 | ``` 156 | 157 | - `samples`: A nested field containing the details of each sample. At least two samples must be specified: a "control" sample (to be used to filter out background off-target sites) and at least one treatment sample. The required parameters are: 158 | - `target`: The sample targetsites 159 | - `barcode1`: The forward barcode 160 | - `barcode2`: The reverse barcode 161 | - `description`: A description of the sample 162 | 163 | An example `samples` field: 164 | 165 | ``` 166 | samples: 167 | control: 168 | target: 169 | barcode1: CTCTCTAC 170 | barcode2: CTCTCTAT 171 | description: Control 172 | 173 | [SAMPLENAME]: 174 | target: GAGTCCGAGCAGAAGAAGAANGG 175 | barcode1: TAGGCATG 176 | barcode2: TAGATCGC 177 | description: EMX1 178 | ``` 179 | 180 | ### A Full Manifest File Example 181 | 182 | Below is an example of a full manifest file. Feel free to copy it and replace the parameters with your own experiment data. Remember that you can input more than just one treatment sample (e.g. the "EMX1" data below). 
183 | 184 | ``` 185 | reference_genome: test/test_genome.fa 186 | output_folder: test/output 187 | 188 | bwa: bwa 189 | bedtools: bedtools 190 | PAM: NGG 191 | demultiplex_min_reads: 1000 192 | window_size: 75 193 | max_mismatches: 7 194 | 195 | undemultiplexed: 196 | forward: test/data/undemultiplexed/undemux.r1.fastq.gz 197 | reverse: test/data/undemultiplexed/undemux.r2.fastq.gz 198 | index1: test/data/undemultiplexed/undemux.i1.fastq.gz 199 | index2: test/data/undemultiplexed/undemux.i2.fastq.gz 200 | 201 | samples: 202 | control: 203 | target: 204 | barcode1: CTCTCTAC 205 | barcode2: CTCTCTAT 206 | description: Control 207 | 208 | EMX1: 209 | target: GAGTCCGAGCAGAAGAAGAANGG 210 | barcode1: TAGGCATG 211 | barcode2: TAGATCGC 212 | description: EMX_site1 213 | 214 | ``` 215 | 216 | ### Pipeline Output 217 | 218 | When running the full pipeline, the results of each step are outputted to the `output_folder` in a separate folder for each step. The output folders and their respective contents are as follows: 219 | 220 | 221 | #### Output Folders 222 | - `output_folder/demultiplexed`: Contains the four demultiplexed reads files (forward, reverse, index1, index2) for each sample. 223 | - `output_folder/umitagged`: Contains the two umitagged reads files (forward, reverse) for each sample. 224 | - `output_folder/consolidated`: Contains the two consolidated reads files (forward, reverse) for each sample. 225 | - `output_folder/aligned`: Contains an alignment `.sam` file for each sample. 226 | - `output_folder/identified`: Contains a tab-delimited `.txt` file for each sample with an identified off-target in each row. 227 | - `output_folder/filtered`: Contains a tab-delimited `.txt` file for each sample containing the identified DSBs that are background sites (not off-targets) 228 | - `output_folder/visualization`: Contains a `.svg` vector image representing an alignment of all detected off-targets to the targetsite for each sample. 
229 | 230 | 231 | The final detected off-target sites are placed in the `output_folder/identified` folder, with one `.txt` file for each sample specified in the manifest. The fields that are populated in each row of these off-target files are specified below: 232 | 233 | #### Output Off-Targets `.txt` Fields: 234 | 235 | - `BED Chromosome`: Window chromosome 236 | - `BED Min.Position`: Window 0-based start position 237 | - `BED Max.Position`: Window 0-based end position 238 | - `BED Name`: Name of window 239 | - `Filename`: The name of the current `.SAM` file used in analysis. 240 | - `WindowIndex`: Index number of window 241 | - `Chromosome`: Chromosome corresponding to position with maximum reads in window (matches `BED Chromosome`) 242 | - `Position`: Position with maximum number of reads in window 243 | - `Sequence`: The window sequence, starting 25 bp upstream and ending 25 bp downstream of `Chromosome:Position` 244 | - `+.mi`: Number of forward reads with distinct molecular indices 245 | - `-.mi`: Number of reverse reads with distinct molecular indices 246 | - `bi.sum.mi`: Sum of the `+.mi` and `-.mi` fields (GUIDE-seq Read Count) 247 | - `bi.geometric_mean.mi`: Geometric mean of the `+.mi` and `-.mi` fields 248 | - `+.total`: Total number of forward mapping reads 249 | - `-.total`: Total number of reverse mapping reads 250 | - `total.sum`: Sum of `+.total` and `-.total` fields 251 | - `total.geometric_mean`: Geometric mean of the `+.total` and `-.total` fields 252 | - `primer1.mi`: Number of reads amplified by forward primer with distinct molecular indices 253 | - `primer2.mi`: Number of reads amplified by reverse primer with distinct molecular indices 254 | - `primer.geometric_mean`: Geometric mean of the `primer1.mi` and `primer2.mi` fields 255 | - `position.stdev`: Standard deviation of positions within genomic window 256 | - `Off-Target Sequence`: Off-target sequence derived from genome reference 257 | - `Mismatches`: Number of mismatches between the 
intended target sequence and the off-target sequence 258 | - `Length`: Length of the target sequence 259 | - `BED off-target Chromosome`: Off-target chromosome 260 | - `BED off-target start`: Off-target 0-based start position 261 | - `BED off-target end`: Off-target 0-based end position 262 | - `BED off-target name`: Off-target name 263 | - `BED Score`: Field to conform to standard BED format 264 | - `Strand`: Indicates the strand of detected off-target site. `+` for forward strand and `-` for reverse strand 265 | - `Cells`: Cell type 266 | - `Target site`: Targetsite name 267 | - `Target Sequence`: Intended target site sequence (including PAM) 268 | 269 | The key fields for interpreting this output and identifying off-target sites are: `BED off-target Chromosome`, `BED off-target start`, `BED off-target end`, `BED off-target name`, `Strand`, `Off-Target Sequence`, `bi.sum.mi` 270 | 271 | #### Output Visualizations 272 | 273 | The outputted visualizations are in the `.svg` vector format, which is an open image standard that can be viewed in any modern web browser (e.g. Google Chrome, Apple Safari, Mozilla Firefox), and can be viewed and edited in any vector editing application (e.g. Adobe Illustrator). Because the output visualizations are vector images, they can be scaled up or down infinitely without a loss in quality, and can also be edited as shapes with ease. This makes the images produced by the guideseq package ideal for posters, presentations, and papers. 274 | 275 | ## Running Analysis Steps Individually 276 | 277 | In addition to end-to-end pipeline analysis functionality, the guideseq package also allows for every step of the analysis to be run individually. Here we have detailed the required inputs and expected outputs of each step. For each step, we have included a "runnable example" command that can be executed from the guideseq root directory to run that step on the included sample data. 
These "runnable example" snippets put their output in the `test/output` folder. 278 | 279 | ### `demultiplex` Pooled Multi-Sample Sequencing (Manifest Required) 280 | 281 | - **Functionality**: Given undemultiplexed sequence files and sample barcodes specified in the manifest, output the demultiplexed sample-specific reads in FASTQ format. The forward, reverse, and two index files for each sample in the manifest are outputted to the `output_folder/demultiplexed` folder. 282 | - **Required Parameters**: 283 | - `-m or --manifest`: Specify the path to the manifest YAML file 284 | - **Runnable Example**: 285 | - `python guideseq/guideseq.py demultiplex -m test/test_manifest.yaml` 286 | 287 | ### `umitag` Reads 288 | 289 | - **Functionality**: Given the demultiplexed files in the folder `output_folder/demultiplexed` (where `output_folder` is specified in the manifest), 'tag' the reads by adding the UMI barcode sequence to the FASTQ read name header in preparation for the subsequent PCR duplicate read consolidation step. The forward and reverse files for each sample in the manifest are outputted to the `output_folder/umitagged` folder. 
290 | - **Required Parameters**: 291 | - `--read1`: Path to the forward demultiplexed reads file (FASTQ) 292 | - `--read2`: Path to the reverse demultiplexed reads file (FASTQ) 293 | - `--index1`: Path to the index1 demultiplexed reads file (FASTQ) 294 | - `--index2`: Path to the index2 demultiplexed reads file (FASTQ) 295 | - `--outfolder`: Path to the folder in which the output files will be saved 296 | - **Runnable Example**: 297 | 298 | ``` 299 | python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq \ 300 | --read2 test/data/demultiplexed/EMX1.r2.fastq \ 301 | --index1 test/data/demultiplexed/EMX1.i1.fastq \ 302 | --index2 test/data/demultiplexed/EMX1.i2.fastq \ 303 | --outfolder test/output/ 304 | ``` 305 | 306 | ### `consolidate` PCR Duplicates 307 | 308 | - **Functionality**: Given the umitagged forward and reverse reads files, consolidate PCR duplicates and output the consolidated forward and reverse reads to the `outfolder`. 309 | - **Required Parameters**: 310 | - `--read1`: Path to the forward umitagged reads file (FASTQ) 311 | - `--read2`: Path to the reverse umitagged reads file (FASTQ) 312 | - `--outfolder`: Path to the folder in which the output files will be saved 313 | - **Optional Parameters**: 314 | - `--min_quality`: The minimum quality of a read for it to be considered in the consolidation 315 | - `--min_frequency`: The minimum frequency of a read for the position to be consolidated 316 | - **Runnable Example**: 317 | 318 | ``` 319 | python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq \ 320 | --read2 test/data/umitagged/EMX1.r2.umitagged.fastq \ 321 | --outfolder test/output/ 322 | ``` 323 | 324 | ### `align` Sites to Genome 325 | 326 | - **Functionality**: Given the consolidated forward and reverse reads, execute a paired-end mapping of the sequences to the reference genome using the `bwa` package. Outputs an alignment `.sam` file to the `outfolder`. 
327 | - **Required Parameters**: 328 | - `--bwa`: Path to the `bwa` executable 329 | - `--genome`: Path to the reference genome FASTA file 330 | - `--read1`: Path to the consolidated forward read FASTQ file 331 | - `--read2`: Path to the consolidated reverse read FASTQ file 332 | - `--outfolder`: Path to the folder in which the output files will be saved 333 | - **Runnable Example**: 334 | 335 | ``` 336 | python guideseq/guideseq.py align --bwa bwa --genome test/test_genome.fa\ 337 | --read1 test/data/consolidated/EMX1.r1.consolidated.fastq\ 338 | --read2 test/data/consolidated/EMX1.r2.consolidated.fastq\ 339 | --outfolder test/output/ 340 | ``` 341 | 342 | ### `identify` Off-target Site Candidates 343 | 344 | - **Functionality**: Given the alignment samfile for a given site, a reference genome, and a target sequence, output a tab-delimited `.txt` file containing the identified off-target sites. 345 | - **Required Parameters**: 346 | - `--aligned`: Path to the site-specific alignment `.sam` file. 347 | - `--genome`: Path to the reference genome FASTA file. 348 | - `--outfolder`: Path to the folder in which the output files will be saved. 349 | - `--target_sequence`: The sequence targeted in the sample (blank for control sample) 350 | - **Optional Parameters**: 351 | - `--description`: Specify additional information about the sample. 352 | - **Runnable Example**: 353 | 354 | ``` 355 | python guideseq/guideseq.py identify --aligned test/data/aligned/EMX1.sam\ 356 | --genome test/test_genome.fa --outfolder test/output/\ 357 | --target_sequence GAGTCCGAGCAGAAGAAGAANGG --description EMX1 358 | ``` 359 | 360 | ### `filter` Background DSB Sites 361 | 362 | - **Functionality**: Given the identified site `.txt` files for a treatment and control samples, output a `.txt` file in the same format as outputted by the `identify` step containing the sites filtered out as false-positives. 
363 | - **Required Parameters**: 364 | - `--bedtools`: Path to the `bedtools` executable 365 | - `--identified`: Path to the `.txt` file outputted by the `identify` step for a treatment sample. 366 | - `--background`: Path to the `.txt` file outputted by the `identify` step for a control sample. 367 | - `--outfolder`: Path to the folder in which the output files will be saved. 368 | - **Runnable Example**: 369 | 370 | ``` 371 | python guideseq/guideseq.py filter --bedtools bedtools\ 372 | --identified test/data/identified/EMX1_identifiedOfftargets.txt\ 373 | --background test/data/identified/control_identifiedOfftargets.txt\ 374 | --outfolder test/output/ 375 | ``` 376 | 377 | ### `visualize` Detected Off-Target Sites 378 | 379 | - **Functionality**: Given an identified off-target sites `.txt` file, output an alignment visualization of the off-target sites. 380 | - **Required Parameters**: 381 | - `--infile`: Path to the input `.txt` off-targets file 382 | - `--outfolder`: Path to the folder in which the outputted `.svg` graphic will be saved 383 | - **Optional Parameters**: 384 | - `--title`: Specify the title of the visualization, to be printed at the top of the graphic. Useful for posters and presentations. 385 | - **Runnable Example**: 386 | 387 | ``` 388 | python guideseq/guideseq.py visualize --infile test/data/identified/EMX1_identifiedOfftargets.txt\ 389 | --outfolder test/output/ --title EMX1 390 | ``` 391 | 392 | ## Frequently Asked Questions 393 | 394 | ### How do I Run the Pipeline with Demultiplexed Data? 395 | 396 | If you already have demultiplexed data, you can run the pipeline on the data by running each step after demultiplexing individually, as described in the "Running Analysis Steps Individually" section above. Be sure to run the individual steps in the following order: 397 | 398 | - `umitag` 399 | - `consolidate` 400 | - `align` 401 | - `identify` 402 | - `filter` 403 | - `visualize` 404 | 405 | ### Can I analyze data without UMIs? 
406 | 407 | Yes. If your reads do not have UMIs, you can run the pipeline on previously demultiplexed data as described in the "Running Analysis Steps Individually" section above, starting with the `align` step. **Note that we have not analyzed such data ourselves!** We suspect that PCR duplication bias may affect the quantitative interpretation of GUIDE-Seq read counts, but have not explored this. 408 | 409 | ### Download Reference Genome 410 | 411 | The guideseq package requires a reference genome for read mapping. You can use any genome of your choosing, but for all of our testing and original GUIDE-seq analyses (Tsai et al. *Nature Biotechnol* 2015) we use hg19 ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)). Be sure to (g)unzip the FASTA file before use if it is compressed. 412 | 413 | 414 | ### Configuring a MiSeq to Output Index Reads 415 | 416 | The guideseq package requires index reads from the MiSeq sequencing run for read consolidation. The default MiSeq Reporter settings do not generate index (I1, I2) reads. This feature can be enabled by adding the line 417 | 418 | ```xml 419 | <add key="CreateFastqForIndexReads" value="1"/> 420 | ``` 421 | 422 | to the ``Miseq Reporter.exe.config`` file located in the MiSeq Reporter installation folder. The default installation folder is ``C:\Illumina\MiSeqReporter``. After modifying the config file it should look like this: 423 | 424 | 425 | ```xml 426 | <appSettings> 427 | ... [LEAVE EXISTING LINES UNCHANGED] ... 428 | <add key="CreateFastqForIndexReads" value="1"/> 429 | </appSettings> 430 | ``` 431 | 432 | The MiSeq Reporter service needs to be restarted for the change to take effect. Future runs of the GenerateFASTQ workflow (and probably other workflows) will generate I1 and I2 reads in addition to R1 and R2. All four of these read files will be needed for guideseq analysis. 433 | 434 | See page 29 of the MiSeq Reporter User Guide for further instructions. 
435 | 436 | [version-shield]: https://img.shields.io/conda/v/tsailabsj/guide_seq.svg 437 | [version-url]: https://anaconda.org/tsailabSJ/guide_seq 438 | [python-shield]: https://img.shields.io/pypi/pyversions/guide_seq.svg 439 | [python-url]: https://pypi.python.org/pypi/guide_seq 440 | [platform-shield]: https://img.shields.io/badge/Platforms-linux--64,osx--64,linux--32-orange.svg?style=flat-square 441 | 442 | -------------------------------------------------------------------------------- /conda-build/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "guide_seq" %} 2 | {% set version = "1.0.2" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | source: 9 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz 10 | sha256: 733c04ad671727aeb9559ada2eb464d5e270ab41fa572827fb5f281e4c303f40 11 | 12 | build: 13 | number: 0 14 | script: python setup.py install --single-version-externally-managed --record=record.txt 15 | 16 | requirements: 17 | host: 18 | - pip 19 | - python 20 | run: 21 | - python 22 | - biopython 23 | - bwa=0.7.17 24 | - htseq 25 | - matplotlib 26 | - numpy 27 | - pandas 28 | - pyfaidx 29 | - pygments 30 | - pysam 31 | - pyyaml 32 | - regex 33 | - scipy 34 | - setuptools 35 | - sqlite 36 | - statsmodels 37 | - svgwrite 38 | - yaml 39 | - zlib 40 | - htslib=1.9 41 | - samtools=1.9 42 | - bedtools 43 | 44 | test: 45 | imports: 46 | - guideseq 47 | - umi 48 | 49 | about: 50 | home: https://github.com/tsailabSJ/guideseq 51 | license: GNU General Public License v2 (GPLv2) 52 | license_family: GPL2 53 | license_file: '' 54 | summary: An easy to use bioinformatic pipeline for the GUIDE-seq assay. 
55 | description: "guide-seq\n\n\n" 56 | doc_url: '' 57 | dev_url: '' 58 | 59 | extra: 60 | recipe-maintainers: 61 | - YichaoOU 62 | -------------------------------------------------------------------------------- /conda-build/py2_all/linux-32/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-32/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/linux-aarch64/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-aarch64/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/linux-armv6l/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-armv6l/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/linux-armv7l/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-armv7l/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/linux-ppc64le/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-ppc64le/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/osx-64/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/osx-64/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/win-32/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/win-32/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py2_all/win-64/guide_seq-1.0.2-py27_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/win-64/guide_seq-1.0.2-py27_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/linux-32/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-32/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/linux-aarch64/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-aarch64/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/linux-armv6l/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-armv6l/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/linux-armv7l/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-armv7l/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/linux-ppc64le/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-ppc64le/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/osx-64/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/osx-64/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/win-32/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/win-32/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py35/win-64/guide_seq-1.0.2-py35_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/win-64/guide_seq-1.0.2-py35_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/linux-32/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-32/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/linux-aarch64/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-aarch64/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/linux-armv6l/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-armv6l/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/linux-armv7l/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-armv7l/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/linux-ppc64le/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-ppc64le/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/osx-64/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/osx-64/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/win-32/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/win-32/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /conda-build/py3_all/win-64/guide_seq-1.0.2-py37_0.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/win-64/guide_seq-1.0.2-py37_0.tar.bz2 -------------------------------------------------------------------------------- /guideseq/#guideseq_visualize_only.py#: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | guideseq.py 5 | =========== 6 | serves as the wrapper for all guideseq pipeline 7 | 8 | 
""" 9 | 10 | import os 11 | import sys 12 | import yaml 13 | import argparse 14 | import traceback 15 | 16 | # Set up logger 17 | import log 18 | logger = log.createCustomLogger('root') 19 | 20 | from alignReads import alignReads 21 | from filterBackgroundSites import filterBackgroundSites 22 | from umi import demultiplex, umitag, consolidate 23 | from visualization import visualizeOfftargets 24 | import identifyOfftargetSites 25 | import validation 26 | 27 | DEFAULT_DEMULTIPLEX_MIN_READS = 10000 28 | DEFAULT_WINDOW_SIZE = 25 29 | DEFAULT_MAX_SCORE = 7 30 | 31 | CONSOLIDATE_MIN_QUAL = 15 32 | CONSOLIDATE_MIN_FREQ = 0.9 33 | 34 | 35 | class GuideSeq: 36 | 37 | def __init__(self): 38 | pass 39 | 40 | def parseManifest(self, manifest_path): 41 | logger.info('Loading manifest...') 42 | 43 | with open(manifest_path, 'r') as f: 44 | manifest_data = yaml.load(f) 45 | 46 | try: 47 | # Validate manifest data 48 | validation.validateManifest(manifest_data) 49 | 50 | self.BWA_path = manifest_data['bwa'] 51 | self.bedtools = manifest_data['bedtools'] 52 | self.reference_genome = manifest_data['reference_genome'] 53 | self.output_folder = manifest_data['output_folder'] 54 | self.undemultiplexed = manifest_data['undemultiplexed'] 55 | self.samples = manifest_data['samples'] 56 | 57 | except Exception as e: 58 | logger.error('Incorrect or malformed manifest file. 
Please ensure your manifest contains all required fields.') 59 | sys.exit() 60 | 61 | # Allow the user to specify min reads for demultiplex if they want 62 | if 'demultiplex_min_reads' in manifest_data: 63 | self.demultiplex_min_reads = manifest_data['demultiplex_min_reads'] 64 | else: 65 | self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS 66 | # Allow the user to specify window size for off-target search 67 | if 'window_size' in manifest_data: 68 | self.window_size = manifest_data['window_size'] 69 | else: 70 | self.window_size = DEFAULT_WINDOW_SIZE 71 | # Allow the user to specify window size for off-target search 72 | if 'max_score' in manifest_data: 73 | self.max_score = manifest_data['max_score'] 74 | else: 75 | self.max_score = DEFAULT_MAX_SCORE 76 | 77 | # Make sure the user has specified a control barcode 78 | if 'control' not in self.samples.keys(): 79 | raise AssertionError('Your manifest must have a control sample specified.') 80 | 81 | # Make sure the user has both a sample and a control 82 | if len(self.samples) < 2: 83 | raise AssertionError('Your manifest must have at least one control and one treatment sample.') 84 | 85 | logger.info('Successfully loaded manifest.') 86 | 87 | def parseManifestDemultiplex(self, manifest_path): 88 | logger.info('Loading manifest for demultiplexing...') 89 | 90 | with open(manifest_path, 'r') as f: 91 | manifest_data = yaml.load(f) 92 | 93 | try: 94 | self.output_folder = manifest_data['output_folder'] 95 | self.undemultiplexed = manifest_data['undemultiplexed'] 96 | self.samples = manifest_data['samples'] 97 | 98 | except Exception as e: 99 | logger.error('Incomplete or incorrect manifest file. 
Please ensure your manifest contains all required fields.') 100 | quit() 101 | 102 | # Allow the user to specify min reads for demultiplex if they want 103 | if 'demultiplex_min_reads' in manifest_data: 104 | self.demultiplex_min_reads = manifest_data['demultiplex_min_reads'] 105 | else: 106 | self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS 107 | 108 | logger.info('Successfully loaded manifest for single-step demultiplexing.') 109 | 110 | def demultiplex(self): 111 | 112 | logger.info('Demultiplexing undemultiplexed files...') 113 | 114 | # Take our two barcodes and concatenate them 115 | swapped_sample_barcodes = {} 116 | for sample in self.samples: 117 | barcode1 = self.samples[sample]['barcode1'] 118 | barcode2 = self.samples[sample]['barcode2'] 119 | barcode = barcode1[1:8] + barcode2[1:8] 120 | swapped_sample_barcodes[barcode] = sample 121 | 122 | try: 123 | demultiplex.demultiplex(self.undemultiplexed['forward'], 124 | self.undemultiplexed['reverse'], 125 | self.undemultiplexed['index1'], 126 | self.undemultiplexed['index2'], 127 | swapped_sample_barcodes, 128 | os.path.join(self.output_folder, 'demultiplexed'), 129 | min_reads=self.demultiplex_min_reads) 130 | 131 | self.demultiplexed = {} 132 | for sample in self.samples: 133 | self.demultiplexed[sample] = {} 134 | self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq') 135 | self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq') 136 | self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq') 137 | self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq') 138 | 139 | logger.info('Successfully demultiplexed reads.') 140 | except Exception as e: 141 | logger.error('Error demultiplexing reads.') 142 | logger.error(traceback.format_exc()) 143 | quit() 144 | 145 | def umitag(self): 146 | 
logger.info('umitagging reads...') 147 | 148 | try: 149 | self.umitagged = {} 150 | for sample in self.samples: 151 | self.umitagged[sample] = {} 152 | self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq') 153 | self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq') 154 | 155 | umitag.umitag(self.demultiplexed[sample]['read1'], 156 | self.demultiplexed[sample]['read2'], 157 | self.demultiplexed[sample]['index1'], 158 | self.demultiplexed[sample]['index2'], 159 | self.umitagged[sample]['read1'], 160 | self.umitagged[sample]['read2'], 161 | os.path.join(self.output_folder, 'umitagged')) 162 | 163 | logger.info('Successfully umitagged reads.') 164 | except Exception as e: 165 | logger.error('Error umitagging') 166 | logger.error(traceback.format_exc()) 167 | quit() 168 | 169 | def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL): 170 | logger.info('Consolidating reads...') 171 | 172 | try: 173 | self.consolidated = {} 174 | 175 | for sample in self.samples: 176 | self.consolidated[sample] = {} 177 | self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq') 178 | self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq') 179 | 180 | consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq) 181 | consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq) 182 | 183 | logger.info('Successfully consolidated reads.') 184 | except Exception as e: 185 | logger.error('Error umitagging') 186 | logger.error(traceback.format_exc()) 187 | quit() 188 | 189 | def alignReads(self): 190 | logger.info('Aligning reads...') 191 | 192 | try: 193 | self.aligned = {} 194 | for sample in self.samples: 195 | 
sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam') 196 | alignReads(self.BWA_path, 197 | self.reference_genome, 198 | self.consolidated[sample]['read1'], 199 | self.consolidated[sample]['read2'], 200 | sample_alignment_path) 201 | self.aligned[sample] = sample_alignment_path 202 | logger.info('Finished aligning reads to genome.') 203 | 204 | except Exception as e: 205 | logger.error('Error aligning') 206 | logger.error(traceback.format_exc()) 207 | quit() 208 | 209 | def identifyOfftargetSites(self): 210 | logger.info('Identifying offtarget sites...') 211 | 212 | try: 213 | self.identified = {} 214 | 215 | # Identify offtarget sites for each sample 216 | for sample in self.samples: 217 | 218 | # Prepare sample annotations 219 | sample_data = self.samples[sample] 220 | annotations = {} 221 | annotations['Description'] = sample_data['description'] 222 | annotations['Targetsite'] = sample 223 | 224 | if sample is 'control': 225 | annotations['Sequence'] = '' 226 | else: 227 | annotations['Sequence'] = sample_data['target'] 228 | 229 | samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam') 230 | 231 | self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt') 232 | 233 | identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations, 234 | self.window_size, self.max_score) 235 | 236 | logger.info('Finished identifying offtarget sites.') 237 | 238 | except Exception as e: 239 | logger.error('Error identifying offtarget sites.') 240 | logger.error(traceback.format_exc()) 241 | quit() 242 | 243 | def filterBackgroundSites(self): 244 | logger.info('Filtering background sites') 245 | 246 | try: 247 | self.filtered = {} 248 | 249 | # Filter background in each sample 250 | for sample in self.samples: 251 | if sample != 'control': 252 | self.filtered[sample] = os.path.join(self.output_folder, 'filtered', sample + '_backgroundFiltered.txt') 
253 | filterBackgroundSites(self.bedtools, self.identified[sample], self.identified['control'], self.filtered[sample]) 254 | logger.info('Finished background filtering for {0} sample'.format(sample)) 255 | 256 | logger.info('Finished filtering background sites.') 257 | 258 | except Exception as e: 259 | logger.error('Error filtering background sites.') 260 | logger.error(traceback.format_exc()) 261 | 262 | def visualize(self): 263 | logger.info('Visualizing off-target sites') 264 | 265 | try: 266 | for sample in self.samples: 267 | if sample != 'control': 268 | infile = self.identified[sample] 269 | outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets') 270 | visualizeOfftargets(infile, outfile, title=sample) 271 | 272 | logger.info('Finished visualizing off-target sites') 273 | 274 | except Exception as e: 275 | logger.error('Error visualizing off-target sites.') 276 | logger.error(traceback.format_exc()) 277 | 278 | 279 | def parse_args(): 280 | parser = argparse.ArgumentParser() 281 | 282 | subparsers = parser.add_subparsers(description='Individual Step Commands', 283 | help='Use this to run individual steps of the pipeline', 284 | dest='command') 285 | 286 | all_parser = subparsers.add_parser('all', help='Run all steps of the pipeline') 287 | all_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True) 288 | all_parser.add_argument('--identifyAndFilter', action='store_true', default=False) 289 | 290 | demultiplex_parser = subparsers.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files') 291 | demultiplex_parser.add_argument('--manifest', '-m', help='Specify the manifest path', required=True) 292 | 293 | umitag_parser = subparsers.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation') 294 | umitag_parser.add_argument('--read1', required=True) 295 | umitag_parser.add_argument('--read2', required=True) 296 | umitag_parser.add_argument('--index1', 
required=True) 297 | umitag_parser.add_argument('--index2', required=True) 298 | umitag_parser.add_argument('--outfolder', required=True) 299 | 300 | consolidate_parser = subparsers.add_parser('consolidate', help='Consolidate UMI tagged FASTQs') 301 | consolidate_parser.add_argument('--read1', required=True) 302 | consolidate_parser.add_argument('--read2', required=True) 303 | consolidate_parser.add_argument('--outfolder', required=True) 304 | consolidate_parser.add_argument('--min_quality', required=False, type=float) 305 | consolidate_parser.add_argument('--min_frequency', required=False, type=float) 306 | 307 | align_parser = subparsers.add_parser('align', help='Paired end read mapping to genome') 308 | align_parser.add_argument('--bwa', required=True) 309 | align_parser.add_argument('--genome', required=True) 310 | align_parser.add_argument('--read1', required=True) 311 | align_parser.add_argument('--read2', required=True) 312 | align_parser.add_argument('--outfolder', required=True) 313 | 314 | identify_parser = subparsers.add_parser('identify', help='Identify GUIDE-seq offtargets') 315 | identify_parser.add_argument('--aligned', required=True) 316 | identify_parser.add_argument('--genome', required=True) 317 | identify_parser.add_argument('--outfolder', required=True) 318 | identify_parser.add_argument('--target_sequence', required=True) 319 | identify_parser.add_argument('--description', required=False) 320 | identify_parser.add_argument('--max_score', required=False, type=int, default=7) 321 | identify_parser.add_argument('--window_size', required=False, type=int, default=25) 322 | 323 | filter_parser = subparsers.add_parser('filter', help='Filter identified sites from control sites') 324 | filter_parser.add_argument('--bedtools', required=True) 325 | filter_parser.add_argument('--identified', required=True) 326 | filter_parser.add_argument('--background', required=True) 327 | filter_parser.add_argument('--outfolder', required=True) 328 | 329 | 
visualize_parser = subparsers.add_parser('visualize', help='Visualize off-target sites') 330 | visualize_parser.add_argument('--infile', required=True) 331 | visualize_parser.add_argument('--outfolder', required=True) 332 | visualize_parser.add_argument('--title', required=False) 333 | 334 | return parser.parse_args() 335 | 336 | 337 | def main(): 338 | args = parse_args() 339 | 340 | if args.command == 'all': 341 | 342 | if args.identifyAndFilter: 343 | try: 344 | g = GuideSeq() 345 | g.parseManifest(args.manifest) 346 | 347 | # Bootstrap the aligned samfile paths 348 | g.aligned = {} 349 | for sample in g.samples: 350 | g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam') 351 | 352 | g.identifyOfftargetSites() 353 | g.filterBackgroundSites() 354 | g.visualize() 355 | 356 | except Exception as e: 357 | print 'Error running only identify and filter.' 358 | print traceback.format_exc() 359 | quit() 360 | else: 361 | g = GuideSeq() 362 | g.parseManifest(args.manifest) 363 | g.demultiplex() 364 | g.umitag() 365 | g.consolidate() 366 | g.alignReads() 367 | g.identifyOfftargetSites() 368 | g.filterBackgroundSites() 369 | g.visualize() 370 | 371 | elif args.command == 'demultiplex': 372 | """ 373 | Run just the demultiplex step given the manifest 374 | """ 375 | g = GuideSeq() 376 | g.parseManifestDemultiplex(args.manifest) 377 | g.demultiplex() 378 | 379 | elif args.command == 'umitag': 380 | """ 381 | Run just the umitag step 382 | python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/ 383 | """ 384 | g = GuideSeq() 385 | g.output_folder = args.outfolder 386 | sample = os.path.basename(args.read1).split('.')[0] 387 | g.samples = [sample] 388 | g.demultiplexed = {sample: {}} 389 | g.demultiplexed[sample]['read1'] = args.read1 390 | 
g.demultiplexed[sample]['read2'] = args.read2 391 | g.demultiplexed[sample]['index1'] = args.index1 392 | g.demultiplexed[sample]['index2'] = args.index2 393 | g.umitag() 394 | 395 | elif args.command == 'consolidate': 396 | """ 397 | Run just the consolidate step 398 | python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/ --min_frequency 0.8 --min_quality 14 399 | """ 400 | sample = os.path.basename(args.read1).split('.')[0] 401 | g = GuideSeq() 402 | g.output_folder = args.outfolder 403 | g.samples = [sample] 404 | g.umitagged = {sample: {}} 405 | g.umitagged[sample]['read1'] = args.read1 406 | g.umitagged[sample]['read2'] = args.read2 407 | 408 | if 'min_quality' in args: 409 | min_qual = args.min_quality 410 | else: 411 | min_qual = CONSOLIDATE_MIN_QUAL 412 | 413 | if 'min_frequency' in args: 414 | min_freq = args.min_frequency 415 | else: 416 | min_freq = CONSOLIDATE_MIN_FREQ 417 | 418 | g.consolidate(min_freq=min_freq, min_qual=min_qual) 419 | 420 | elif args.command == 'align': 421 | """ 422 | Run just the alignment step 423 | python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa --outfolder test/output/ 424 | """ 425 | sample = os.path.basename(args.read1).split('.')[0] 426 | g = GuideSeq() 427 | g.BWA_path = args.bwa 428 | g.reference_genome = args.genome 429 | g.output_folder = args.outfolder 430 | g.samples = [sample] 431 | g.consolidated = {sample: {}} 432 | g.consolidated[sample]['read1'] = args.read1 433 | g.consolidated[sample]['read2'] = args.read2 434 | g.alignReads() 435 | 436 | elif args.command == 'identify': 437 | """ 438 | Run just the identify step 439 | python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa --aligned test/output/aligned/EMX1.sam --outfolder test/output/ 
--target_sequence GAGTCCGAGCAGAAGAAGAANGG 440 | """ 441 | if 'description' in args: 442 | description = args.description 443 | else: 444 | description = '' 445 | 446 | if 'max_score' in args: 447 | max_score = args.max_score 448 | else: 449 | max_score = 7 450 | 451 | if 'window_size' in args: 452 | window_size = args.window_size 453 | else: 454 | window_size = 25 455 | 456 | g = GuideSeq() 457 | g.output_folder = args.outfolder 458 | g.reference_genome = args.genome 459 | sample = os.path.basename(args.aligned).split('.')[0] 460 | g.samples = {sample: {'description': description, 'target': args.target_sequence}} 461 | g.aligned = {sample: args.aligned} 462 | g.max_score = max_score 463 | g.window_size = window_size 464 | g.identifyOfftargetSites() 465 | 466 | elif args.command == 'filter': 467 | """ 468 | Run just the filter step 469 | 470 | """ 471 | sample = os.path.basename(args.identified).split('.')[0] 472 | g = GuideSeq() 473 | g.output_folder = args.outfolder 474 | g.bedtools = args.bedtools 475 | g.samples = {sample: {}, 'control': {}} 476 | g.identified = {} 477 | g.identified[sample] = args.identified 478 | g.identified['control'] = args.background 479 | g.filterBackgroundSites() 480 | 481 | elif args.command == 'visualize': 482 | """ 483 | Run just the visualize step 484 | """ 485 | g = GuideSeq() 486 | g.output_folder = os.path.dirname(args.outfolder) 487 | sample = os.path.basename(args.infile).split('.')[0] 488 | g.samples = {sample: {}} 489 | g.identified = {} 490 | g.identified[sample] = args.infile 491 | g.visualize() 492 | 493 | 494 | if __name__ == '__main__': 495 | main() 496 | -------------------------------------------------------------------------------- /guideseq/NUC_SIMPLE: -------------------------------------------------------------------------------- 1 | # 2 | # This matrix was created by Todd Lowe 12/10/92 3 | # 4 | # Uses ambiguous nucleotide codes, probabilities rounded to 5 | # nearest integer 6 | # 7 | # Lowest score = -4, Highest 
score = 5 8 | # 9 | # Modified by Shengdar Tsai 1/23/16 10 | A T G C N 11 | A 10 -5 -5 -5 10 12 | T -5 10 -5 -5 10 13 | G -5 -5 10 -5 10 14 | C -5 -5 -5 10 10 15 | N 10 10 10 10 10 -------------------------------------------------------------------------------- /guideseq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __version__ = "1.0.2" 4 | -------------------------------------------------------------------------------- /guideseq/alignReads.py: -------------------------------------------------------------------------------- 1 | """ 2 | alignReads 3 | """ 4 | 5 | import subprocess 6 | import os 7 | import logging 8 | 9 | logger = logging.getLogger('root') 10 | logger.propagate = False 11 | 12 | 13 | def alignReads(cores, BWA_path, genome_path, read1, read2, outfile): 14 | 15 | sample_name = os.path.basename(outfile).split('.')[0] 16 | output_folder = os.path.dirname(outfile) 17 | if not os.path.exists(output_folder): 18 | os.makedirs(output_folder) 19 | 20 | sample_alignment_paths = {} 21 | 22 | # Check if genome is already indexed by bwa 23 | index_files_extensions = ['.pac', '.amb', '.ann', '.bwt', '.sa'] 24 | 25 | genome_indexed = True 26 | for extension in index_files_extensions: 27 | if not os.path.isfile(genome_path + extension): 28 | genome_indexed = False 29 | break 30 | 31 | # If the genome is not already indexed, index it 32 | if not genome_indexed: 33 | logger.info('Genome index files not detected. 
def filterBackgroundSites(bedtools_path, sample_path, control_path, outfile):
    """Intersect a sample's identified sites with the control's and save the result.

    The first line (the header) of ``sample_path`` and ``control_path`` is
    dropped, ``bedtools intersect -a <sample> -b <control>`` is run on the
    header-less copies, and its standard output is written to ``outfile``.

    Args:
        bedtools_path: bedtools executable (a path, or a command line such as
            ``"bedtools"``).
        sample_path: identified-sites table of the treated sample.
        control_path: identified-sites table of the control sample.
        outfile: destination file; its parent directory is created if needed.

    Raises:
        subprocess.CalledProcessError: if bedtools exits with a non-zero status.
        IOError / OSError: if an input cannot be read or the output written.
    """
    # Function-scope imports: this module only imports subprocess and os.
    import shlex
    import shutil
    import tempfile

    output_folder = os.path.dirname(outfile)
    # dirname() is '' for a bare file name; os.makedirs('') would raise.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Accept either a single executable path (possibly containing spaces) or a
    # command line; no shell is involved, so paths need no quoting.
    if os.path.isfile(bedtools_path):
        bedtools_command = [bedtools_path]
    else:
        bedtools_command = shlex.split(bedtools_path)

    scratch_paths = []
    try:
        # Header-less copies go to unique temporary files so that concurrent
        # runs cannot clobber each other and the input folders stay untouched.
        for source_path in (sample_path, control_path):
            descriptor, scratch_path = tempfile.mkstemp(suffix='_noHeader.txt')
            scratch_paths.append(scratch_path)
            with os.fdopen(descriptor, 'wb') as scratch, open(source_path, 'rb') as source:
                source.readline()  # discard the header line
                shutil.copyfileobj(source, scratch)

        sample_noHeader, control_noHeader = scratch_paths
        with open(outfile, 'w') as output_file:
            subprocess.check_call(
                bedtools_command + ['intersect', '-a', sample_noHeader, '-b', control_noHeader],
                stdout=output_file)
    finally:
        # Clean up even when bedtools (or the copy) fails.
        for scratch_path in scratch_paths:
            if os.path.exists(scratch_path):
                os.remove(scratch_path)
manifest_data['cores'] 61 | self.BWA_path = manifest_data['bwa'] 62 | self.bedtools = manifest_data['bedtools'] 63 | self.reference_genome = manifest_data['reference_genome'] 64 | self.output_folder = manifest_data['output_folder'] 65 | self.undemultiplexed = manifest_data['undemultiplexed'] 66 | self.samples = manifest_data['samples'] 67 | self.primer1 = manifest_data['primer1'] 68 | self.primer2 = manifest_data['primer2'] 69 | 70 | except Exception as e: 71 | logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.') 72 | sys.exit() 73 | 74 | # Allow the user to specify min reads for demultiplex if they want 75 | if 'demultiplex_min_reads' in manifest_data: 76 | self.demultiplex_min_reads = manifest_data['demultiplex_min_reads'] 77 | else: 78 | self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS 79 | # Allow the user to specify window size for off-target search 80 | if 'search_radius' in manifest_data: 81 | self.search_radius = manifest_data['search_radius'] 82 | else: 83 | self.search_radius = DEFAULT_WINDOW_SIZE 84 | # Allow the user to specify window size for off-target search 85 | if 'max_score' in manifest_data: 86 | self.max_score = manifest_data['max_score'] 87 | else: 88 | self.max_score = DEFAULT_MAX_SCORE 89 | # Allow the user to specify PAM seq. 
Yichao 3/6/2020 90 | if 'PAM' in manifest_data: 91 | self.PAM = manifest_data['PAM'] 92 | else: 93 | self.PAM = "NGG" 94 | 95 | # Make sure the user has specified a control barcode 96 | if 'control' not in self.samples.keys(): 97 | raise AssertionError('Your manifest must have a control sample specified.') 98 | 99 | # Make sure the user has both a sample and a control 100 | if len(self.samples) < 2: 101 | raise AssertionError('Your manifest must have at least one control and one treatment sample.') 102 | 103 | logger.info('Successfully loaded manifest.') 104 | 105 | def parseManifestDemultiplex(self, manifest_path): 106 | logger.info('Loading manifest for demultiplexing...') 107 | 108 | with open(manifest_path, 'r') as f: 109 | manifest_data = yaml.load(f) 110 | 111 | try: 112 | self.output_folder = manifest_data['output_folder'] 113 | self.undemultiplexed = manifest_data['undemultiplexed'] 114 | self.samples = manifest_data['samples'] 115 | 116 | except Exception as e: 117 | logger.error('Incomplete or incorrect manifest file. 
Please ensure your manifest contains all required fields.') 118 | quit() 119 | 120 | # Allow the user to specify min reads for demultiplex if they want 121 | if 'demultiplex_min_reads' in manifest_data: 122 | self.demultiplex_min_reads = manifest_data['demultiplex_min_reads'] 123 | else: 124 | self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS 125 | 126 | logger.info('Successfully loaded manifest for single-step demultiplexing.') 127 | 128 | def demultiplex(self): 129 | 130 | logger.info('Demultiplexing undemultiplexed files...') 131 | 132 | # Take our two barcodes and concatenate them 133 | swapped_sample_barcodes = {} 134 | for sample in self.samples: 135 | barcode1 = self.samples[sample]['barcode1'] 136 | barcode2 = self.samples[sample]['barcode2'] 137 | barcode = barcode1[1:8] + barcode2[1:8] 138 | swapped_sample_barcodes[barcode] = sample 139 | 140 | try: 141 | demultiplex.demultiplex(self.undemultiplexed['forward'], 142 | self.undemultiplexed['reverse'], 143 | self.undemultiplexed['index1'], 144 | self.undemultiplexed['index2'], 145 | swapped_sample_barcodes, 146 | os.path.join(self.output_folder, 'demultiplexed'), 147 | min_reads=self.demultiplex_min_reads) 148 | 149 | self.demultiplexed = {} 150 | for sample in self.samples: 151 | self.demultiplexed[sample] = {} 152 | self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq') 153 | self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq') 154 | self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq') 155 | self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq') 156 | 157 | logger.info('Successfully demultiplexed reads.') 158 | except Exception as e: 159 | logger.error('Error demultiplexing reads.') 160 | logger.error(traceback.format_exc()) 161 | quit() 162 | 163 | def umitag(self): 164 | 
logger.info('umitagging reads...') 165 | 166 | try: 167 | self.umitagged = {} 168 | for sample in self.samples: 169 | self.umitagged[sample] = {} 170 | self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq') 171 | self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq') 172 | 173 | umitag.umitag(self.demultiplexed[sample]['read1'], 174 | self.demultiplexed[sample]['read2'], 175 | self.demultiplexed[sample]['index1'], 176 | self.demultiplexed[sample]['index2'], 177 | self.umitagged[sample]['read1'], 178 | self.umitagged[sample]['read2'], 179 | os.path.join(self.output_folder, 'umitagged')) 180 | 181 | logger.info('Successfully umitagged reads.') 182 | except Exception as e: 183 | logger.error('Error umitagging') 184 | logger.error(traceback.format_exc()) 185 | quit() 186 | 187 | def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL): 188 | logger.info('Consolidating reads...') 189 | 190 | try: 191 | self.consolidated = {} 192 | 193 | for sample in self.samples: 194 | self.consolidated[sample] = {} 195 | self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq') 196 | self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq') 197 | 198 | consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq) 199 | consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq) 200 | 201 | logger.info('Successfully consolidated reads.') 202 | except Exception as e: 203 | logger.error('Error umitagging') 204 | logger.error(traceback.format_exc()) 205 | quit() 206 | 207 | def alignReads(self): 208 | logger.info('Aligning reads...') 209 | 210 | try: 211 | self.aligned = {} 212 | for sample in self.samples: 213 | 
sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam') 214 | alignReads(self.cores, 215 | self.BWA_path, 216 | self.reference_genome, 217 | self.consolidated[sample]['read1'], 218 | self.consolidated[sample]['read2'], 219 | sample_alignment_path) 220 | self.aligned[sample] = sample_alignment_path 221 | logger.info('Finished aligning reads to genome.') 222 | 223 | except Exception as e: 224 | logger.error('Error aligning') 225 | logger.error(traceback.format_exc()) 226 | quit() 227 | 228 | def identifyOfftargetSites(self): 229 | logger.info('Identifying offtarget sites...') 230 | 231 | try: 232 | self.identified = {} 233 | 234 | # Identify offtarget sites for each sample 235 | for sample in self.samples: 236 | 237 | # Prepare sample annotations 238 | sample_data = self.samples[sample] 239 | annotations = {} 240 | annotations['Description'] = sample_data['description'] 241 | annotations['Targetsite'] = sample 242 | 243 | if sample == 'control': 244 | annotations['Sequence'] = '' 245 | else: 246 | annotations['Sequence'] = sample_data['target'] 247 | 248 | samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam') 249 | 250 | self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt') 251 | 252 | identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations, 253 | self.search_radius, self.max_score, self.primer1, self.primer2) 254 | 255 | logger.info('Finished identifying offtarget sites.') 256 | 257 | except Exception as e: 258 | logger.error('Error identifying offtarget sites.') 259 | logger.error(traceback.format_exc()) 260 | quit() 261 | 262 | def filterBackgroundSites(self): 263 | logger.info('Filtering background sites') 264 | 265 | try: 266 | self.filtered = {} 267 | 268 | # Filter background in each sample 269 | for sample in self.samples: 270 | if sample != 'control': 271 | self.filtered[sample] = os.path.join(self.output_folder, 
'filtered', sample + '_backgroundFiltered.txt') 272 | filterBackgroundSites(self.bedtools, self.identified[sample], self.identified['control'], self.filtered[sample]) 273 | logger.info('Finished background filtering for {0} sample'.format(sample)) 274 | 275 | logger.info('Finished filtering background sites.') 276 | 277 | except Exception as e: 278 | logger.error('Error filtering background sites.') 279 | logger.error(traceback.format_exc()) 280 | 281 | def visualize(self): 282 | logger.info('Visualizing off-target sites') 283 | 284 | # try: 285 | # for sample in self.samples: 286 | # if sample != 'control': 287 | # infile = self.identified[sample] 288 | # outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets') 289 | # visualizeOfftargets(infile, outfile, title=sample) 290 | 291 | # logger.info('Finished visualizing off-target sites') 292 | 293 | # except Exception as e: 294 | # logger.error('Error visualizing off-target sites.') 295 | # logger.error(traceback.format_exc()) 296 | 297 | for sample in self.samples: ## 3/6/2020 Yichao solved: visualization stopped when one sample failed 298 | if sample != 'control': 299 | try: 300 | infile = self.identified[sample] 301 | outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets') 302 | try: 303 | self.PAM 304 | visualizeOfftargets(infile, outfile, title=sample,PAM=self.PAM) 305 | except: 306 | visualizeOfftargets(infile, outfile, title=sample,PAM="NGG") 307 | except Exception as e: 308 | logger.error('Error visualizing off-target sites: %s'%(sample)) 309 | logger.error(traceback.format_exc()) 310 | logger.info('Finished visualizing off-target sites') 311 | 312 | def parse_args(): 313 | parser = argparse.ArgumentParser() 314 | 315 | subparsers = parser.add_subparsers(description='Individual Step Commands', 316 | help='Use this to run individual steps of the pipeline', 317 | dest='command') 318 | 319 | all_parser = subparsers.add_parser('all', help='Run all steps of 
the pipeline') 320 | all_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True) 321 | all_parser.add_argument('--identifyAndFilter', action='store_true', default=False) 322 | all_parser.add_argument('--skip_demultiplex', action='store_true', default=False) 323 | 324 | demultiplex_parser = subparsers.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files') 325 | demultiplex_parser.add_argument('--manifest', '-m', help='Specify the manifest path', required=True) 326 | 327 | umitag_parser = subparsers.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation') 328 | umitag_parser.add_argument('--read1', required=True) 329 | umitag_parser.add_argument('--read2', required=True) 330 | umitag_parser.add_argument('--index1', required=True) 331 | umitag_parser.add_argument('--index2', required=True) 332 | umitag_parser.add_argument('--outfolder', required=True) 333 | 334 | consolidate_parser = subparsers.add_parser('consolidate', help='Consolidate UMI tagged FASTQs') 335 | consolidate_parser.add_argument('--read1', required=True) 336 | consolidate_parser.add_argument('--read2', required=True) 337 | consolidate_parser.add_argument('--outfolder', required=True) 338 | consolidate_parser.add_argument('--min_quality', required=False, type=float) 339 | consolidate_parser.add_argument('--min_frequency', required=False, type=float) 340 | 341 | align_parser = subparsers.add_parser('align', help='Paired end read mapping to genome') 342 | align_parser.add_argument('--bwa', required=True) 343 | align_parser.add_argument('--genome', required=True) 344 | align_parser.add_argument('--read1', required=True) 345 | align_parser.add_argument('--read2', required=True) 346 | align_parser.add_argument('--outfolder', required=True) 347 | 348 | identify_parser = subparsers.add_parser('identify', help='Identify GUIDE-seq offtargets') 349 | identify_parser.add_argument('--aligned', required=True) 350 | 
def main():
    """Command-line entry point: run the whole pipeline or a single step.

    Dispatches on ``args.command`` as produced by ``parse_args()``. For the
    single-step commands a ``GuideSeq`` object is assembled by hand with just
    the attributes that step reads. Exits with status 1 when a run fails.
    """
    args = parse_args()

    if args.command == 'all':

        if args.identifyAndFilter:
            # Start from existing alignments: identify, filter, visualize.
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                # Bootstrap the aligned samfile paths
                g.aligned = {}
                for sample in g.samples:
                    g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')

                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                print('Error running only identify and filter.')
                print(traceback.format_exc())
                sys.exit(1)

        elif args.skip_demultiplex:
            # Start from already demultiplexed FASTQ files.
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                demultiplexed_folder = os.path.join(g.output_folder, 'demultiplexed')
                g.demultiplexed = {}
                for sample in g.samples:
                    g.demultiplexed[sample] = {
                        'read1': os.path.join(demultiplexed_folder, sample + '.r1.fastq'),
                        'read2': os.path.join(demultiplexed_folder, sample + '.r2.fastq'),
                        'index1': os.path.join(demultiplexed_folder, sample + '.i1.fastq'),
                        'index2': os.path.join(demultiplexed_folder, sample + '.i2.fastq'),
                    }
                    # Every input of the umitag step must already exist.
                    for key in ('read1', 'read2', 'index1', 'index2'):
                        path = g.demultiplexed[sample][key]
                        if not os.path.isfile(path):
                            print("Can't find {0}".format(path))
                            sys.exit(1)

                g.umitag()
                g.consolidate()
                g.alignReads()
                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                # This message used to be a copy of the identify-and-filter one.
                print('Error running pipeline without demultiplexing.')
                print(traceback.format_exc())
                sys.exit(1)

        else:
            g = GuideSeq()
            g.parseManifest(args.manifest)
            g.demultiplex()
            g.umitag()
            g.consolidate()
            g.alignReads()
            g.identifyOfftargetSites()
            g.filterBackgroundSites()
            g.visualize()

    elif args.command == 'demultiplex':
        # Run just the demultiplex step given the manifest
        g = GuideSeq()
        g.parseManifestDemultiplex(args.manifest)
        g.demultiplex()

    elif args.command == 'umitag':
        # Run just the umitag step, e.g.
        # python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
        g = GuideSeq()
        g.output_folder = args.outfolder
        sample = os.path.basename(args.read1).split('.')[0]
        g.samples = [sample]
        g.demultiplexed = {sample: {'read1': args.read1,
                                    'read2': args.read2,
                                    'index1': args.index1,
                                    'index2': args.index2}}
        g.umitag()

    elif args.command == 'consolidate':
        # Run just the consolidate step, e.g.
        # python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/ --min_frequency 0.8 --min_quality 14
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.umitagged = {sample: {'read1': args.read1, 'read2': args.read2}}

        # argparse always defines the attribute (None when the option is
        # omitted), so "'min_quality' in args" never selected the default.
        min_qual = args.min_quality if args.min_quality is not None else CONSOLIDATE_MIN_QUAL
        min_freq = args.min_frequency if args.min_frequency is not None else CONSOLIDATE_MIN_FREQ

        g.consolidate(min_freq=min_freq, min_qual=min_qual)

    elif args.command == 'align':
        # Run just the alignment step, e.g.
        # python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa --outfolder test/output/
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.BWA_path = args.bwa
        g.reference_genome = args.genome
        g.output_folder = args.outfolder
        # GuideSeq.alignReads reads self.cores; use parseManifest's default.
        g.cores = 4
        g.samples = [sample]
        g.consolidated = {sample: {'read1': args.read1, 'read2': args.read2}}
        g.alignReads()

    elif args.command == 'identify':
        # Run just the identify step, e.g.
        # python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa --aligned test/output/aligned/EMX1.sam --outfolder test/output/ --target_sequence GAGTCCGAGCAGAAGAAGAANGG
        description = args.description if args.description is not None else ''

        g = GuideSeq()
        g.output_folder = args.outfolder
        g.reference_genome = args.genome
        sample = os.path.basename(args.aligned).split('.')[0]
        g.samples = {sample: {'description': description, 'target': args.target_sequence}}
        g.aligned = {sample: args.aligned}
        # --max_score / --search_radius carry parser defaults (7 and 25).
        g.max_score = args.max_score
        g.search_radius = args.search_radius
        # GuideSeq.identifyOfftargetSites reads the primers as well; use the
        # same defaults parseManifest applies.
        g.primer1 = 'TTGAGTTGTCATATGTTAAT'
        g.primer2 = 'ACATATGACAACTCAATTAA'
        g.identifyOfftargetSites()

    elif args.command == 'filter':
        # Run just the filter step
        sample = os.path.basename(args.identified).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.bedtools = args.bedtools
        g.samples = {sample: {}, 'control': {}}
        g.identified = {sample: args.identified}
        g.identified['control'] = args.background
        g.filterBackgroundSites()

    elif args.command == 'visualize':
        # Run just the visualize step
        g = GuideSeq()
        # NOTE(review): unlike the other steps this takes dirname() of
        # --outfolder, so a value without a trailing slash resolves to its
        # parent; --title is parsed but unused. Kept as-is - confirm intent.
        g.output_folder = os.path.dirname(args.outfolder)
        sample = os.path.basename(args.infile).split('.')[0]
        g.samples = {sample: {}}
        g.identified = {sample: args.infile}
        g.visualize()
Please ensure your manifest contains all required fields.') 59 | sys.exit() 60 | 61 | # Allow the user to specify min reads for demultiplex if they want 62 | if 'demultiplex_min_reads' in manifest_data: 63 | self.demultiplex_min_reads = manifest_data['demultiplex_min_reads'] 64 | else: 65 | self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS 66 | # Allow the user to specify window size for off-target search 67 | if 'window_size' in manifest_data: 68 | self.window_size = manifest_data['window_size'] 69 | else: 70 | self.window_size = DEFAULT_WINDOW_SIZE 71 | # Allow the user to specify window size for off-target search 72 | if 'max_score' in manifest_data: 73 | self.max_score = manifest_data['max_score'] 74 | else: 75 | self.max_score = DEFAULT_MAX_SCORE 76 | # Allow the user to specify PAM seq. Yichao 3/6/2020 77 | if 'PAM' in manifest_data: 78 | self.PAM = manifest_data['PAM'] 79 | else: 80 | self.PAM = "NGG" 81 | 82 | # Make sure the user has specified a control barcode 83 | if 'control' not in self.samples.keys(): 84 | raise AssertionError('Your manifest must have a control sample specified.') 85 | 86 | # Make sure the user has both a sample and a control 87 | if len(self.samples) < 2: 88 | raise AssertionError('Your manifest must have at least one control and one treatment sample.') 89 | 90 | logger.info('Successfully loaded manifest.') 91 | 92 | def parseManifestDemultiplex(self, manifest_path): 93 | logger.info('Loading manifest for demultiplexing...') 94 | 95 | with open(manifest_path, 'r') as f: 96 | manifest_data = yaml.load(f) 97 | 98 | try: 99 | self.output_folder = manifest_data['output_folder'] 100 | self.undemultiplexed = manifest_data['undemultiplexed'] 101 | self.samples = manifest_data['samples'] 102 | 103 | except Exception as e: 104 | logger.error('Incomplete or incorrect manifest file. 
Please ensure your manifest contains all required fields.') 105 | quit() 106 | 107 | # Allow the user to specify min reads for demultiplex if they want 108 | if 'demultiplex_min_reads' in manifest_data: 109 | self.demultiplex_min_reads = manifest_data['demultiplex_min_reads'] 110 | else: 111 | self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS 112 | 113 | logger.info('Successfully loaded manifest for single-step demultiplexing.') 114 | 115 | def demultiplex(self): 116 | 117 | logger.info('Demultiplexing undemultiplexed files...') 118 | 119 | # Take our two barcodes and concatenate them 120 | swapped_sample_barcodes = {} 121 | for sample in self.samples: 122 | barcode1 = self.samples[sample]['barcode1'] 123 | barcode2 = self.samples[sample]['barcode2'] 124 | barcode = barcode1[1:8] + barcode2[1:8] 125 | swapped_sample_barcodes[barcode] = sample 126 | 127 | try: 128 | demultiplex.demultiplex(self.undemultiplexed['forward'], 129 | self.undemultiplexed['reverse'], 130 | self.undemultiplexed['index1'], 131 | self.undemultiplexed['index2'], 132 | swapped_sample_barcodes, 133 | os.path.join(self.output_folder, 'demultiplexed'), 134 | min_reads=self.demultiplex_min_reads) 135 | 136 | self.demultiplexed = {} 137 | for sample in self.samples: 138 | self.demultiplexed[sample] = {} 139 | self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq') 140 | self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq') 141 | self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq') 142 | self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq') 143 | 144 | logger.info('Successfully demultiplexed reads.') 145 | except Exception as e: 146 | logger.error('Error demultiplexing reads.') 147 | logger.error(traceback.format_exc()) 148 | quit() 149 | 150 | def umitag(self): 151 | 
logger.info('umitagging reads...') 152 | 153 | try: 154 | self.umitagged = {} 155 | for sample in self.samples: 156 | self.umitagged[sample] = {} 157 | self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq') 158 | self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq') 159 | 160 | umitag.umitag(self.demultiplexed[sample]['read1'], 161 | self.demultiplexed[sample]['read2'], 162 | self.demultiplexed[sample]['index1'], 163 | self.demultiplexed[sample]['index2'], 164 | self.umitagged[sample]['read1'], 165 | self.umitagged[sample]['read2'], 166 | os.path.join(self.output_folder, 'umitagged')) 167 | 168 | logger.info('Successfully umitagged reads.') 169 | except Exception as e: 170 | logger.error('Error umitagging') 171 | logger.error(traceback.format_exc()) 172 | quit() 173 | 174 | def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL): 175 | logger.info('Consolidating reads...') 176 | 177 | try: 178 | self.consolidated = {} 179 | 180 | for sample in self.samples: 181 | self.consolidated[sample] = {} 182 | self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq') 183 | self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq') 184 | 185 | consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq) 186 | consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq) 187 | 188 | logger.info('Successfully consolidated reads.') 189 | except Exception as e: 190 | logger.error('Error umitagging') 191 | logger.error(traceback.format_exc()) 192 | quit() 193 | 194 | def alignReads(self): 195 | logger.info('Aligning reads...') 196 | 197 | try: 198 | self.aligned = {} 199 | for sample in self.samples: 200 | 
def alignReads(self):
    """Align every sample's consolidated read pair to the reference genome.

    Reads ``self.samples``, ``self.output_folder``, ``self.BWA_path``,
    ``self.reference_genome`` and ``self.consolidated``.  On success
    ``self.aligned[sample]`` is ``<output_folder>/aligned/<sample>.sam``.

    Raises:
        SystemExit: with status 1 when alignment fails (the traceback is
            logged first).
    """
    logger.info('Aligning reads...')

    try:
        self.aligned = {}
        for sample in self.samples:
            sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
            alignReads(self.BWA_path,
                       self.reference_genome,
                       self.consolidated[sample]['read1'],
                       self.consolidated[sample]['read2'],
                       sample_alignment_path)
            self.aligned[sample] = sample_alignment_path
            logger.info('Finished aligning reads to genome.')

    except Exception:
        logger.error('Error aligning')
        logger.error(traceback.format_exc())
        # Exit with a failure status; quit() would report success (0).
        raise SystemExit(1)


def identifyOfftargetSites(self):
    """Identify off-target sites for every sample from its alignment.

    Reads ``self.samples`` (each entry needs ``description`` and, except
    for the control, ``target``), ``self.output_folder``,
    ``self.reference_genome``, ``self.window_size`` and ``self.max_score``.
    On success ``self.identified[sample]`` is
    ``<output_folder>/identified/<sample>_identifiedOfftargets.txt``.

    Raises:
        SystemExit: with status 1 when identification fails (the traceback
            is logged first).
    """
    logger.info('Identifying offtarget sites...')

    try:
        self.identified = {}
        # Alignment paths registered by alignReads() or by the caller.
        known_alignments = getattr(self, 'aligned', None) or {}

        # Identify offtarget sites for each sample
        for sample in self.samples:

            # Prepare sample annotations
            sample_data = self.samples[sample]
            annotations = {
                'Description': sample_data['description'],
                'Targetsite': sample,
                # Compare by value: ``is 'control'`` tested object identity
                # and could miss an equal string that is a different object.
                'Sequence': '' if sample == 'control' else sample_data['target'],
            }

            # Prefer an explicitly registered SAM file (the ``identify``
            # command passes one); otherwise use the pipeline's default path.
            default_samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam')
            samfile = known_alignments.get(sample, default_samfile)

            self.identified[sample] = os.path.join(self.output_folder, 'identified',
                                                   sample + '_identifiedOfftargets.txt')

            identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample],
                                           annotations, self.window_size, self.max_score)

        logger.info('Finished identifying offtarget sites.')

    except Exception:
        logger.error('Error identifying offtarget sites.')
        logger.error(traceback.format_exc())
        raise SystemExit(1)
def filterBackgroundSites(self):
    """Filter each treatment sample's identified sites against the control.

    Reads ``self.samples``, ``self.output_folder``, ``self.bedtools`` and
    ``self.identified`` (which must contain a ``control`` entry).  On
    success ``self.filtered[sample]`` is
    ``<output_folder>/filtered/<sample>_backgroundFiltered.txt`` for every
    non-control sample.

    Failures are logged with their traceback; unlike the other steps this
    one does not terminate the process.
    """
    logger.info('Filtering background sites')

    try:
        self.filtered = {}

        # Filter background in each sample
        for sample in self.samples:
            if sample == 'control':
                continue
            self.filtered[sample] = os.path.join(self.output_folder, 'filtered',
                                                 sample + '_backgroundFiltered.txt')
            filterBackgroundSites(self.bedtools, self.identified[sample],
                                  self.identified['control'], self.filtered[sample])
            logger.info('Finished background filtering for {0} sample'.format(sample))

        logger.info('Finished filtering background sites.')

    except Exception:
        logger.error('Error filtering background sites.')
        logger.error(traceback.format_exc())


def visualize(self):
    """Draw the off-target visualization of every non-control sample.

    Reads ``self.samples``, ``self.identified``, ``self.output_folder`` and,
    when present, ``self.PAM``.  Output goes to
    ``<output_folder>/visualization/<sample>_offtargets``.

    Each sample is handled in its own ``try`` block so that one failing
    sample is logged and skipped instead of stopping the remaining ones.
    """
    logger.info('Visualizing off-target sites')

    for sample in self.samples:
        if sample == 'control':
            continue
        try:
            infile = self.identified[sample]
            outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')

            options = {'title': sample}
            # The standalone ``visualize`` command never sets a PAM; only
            # forward it when one was configured instead of failing with
            # AttributeError for every sample.
            if hasattr(self, 'PAM'):
                options['PAM'] = self.PAM
            visualizeOfftargets(infile, outfile, **options)
        except Exception:
            logger.error('Error visualizing off-target sites: %s' % (sample))
            logger.error(traceback.format_exc())
    logger.info('Finished visualizing off-target sites')
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    One sub-command exists per pipeline step (``demultiplex``, ``umitag``,
    ``consolidate``, ``align``, ``identify``, ``filter``, ``visualize``)
    plus ``all`` for the manifest-driven run.

    Returns:
        argparse.Namespace: the parsed options; ``command`` holds the name
        of the selected sub-command.

    Raises:
        SystemExit: raised by argparse (status 2) for invalid arguments or
            when no sub-command is given.
    """
    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(description='Individual Step Commands',
                                       help='Use this to run individual steps of the pipeline',
                                       dest='command')
    # Python 3 treats sub-commands as optional by default; without this a
    # bare invocation parsed successfully with command=None and the program
    # silently did nothing.
    subparsers.required = True

    def add_required(subparser, *flags):
        # Register each flag as a mandatory option, in the given order.
        for flag in flags:
            subparser.add_argument(flag, required=True)

    all_parser = subparsers.add_parser('all', help='Run all steps of the pipeline')
    all_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
    all_parser.add_argument('--identifyAndFilter', action='store_true', default=False)

    demultiplex_parser = subparsers.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files')
    demultiplex_parser.add_argument('--manifest', '-m', help='Specify the manifest path', required=True)

    umitag_parser = subparsers.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation')
    add_required(umitag_parser, '--read1', '--read2', '--index1', '--index2', '--outfolder')

    consolidate_parser = subparsers.add_parser('consolidate', help='Consolidate UMI tagged FASTQs')
    add_required(consolidate_parser, '--read1', '--read2', '--outfolder')
    consolidate_parser.add_argument('--min_quality', required=False, type=float)
    consolidate_parser.add_argument('--min_frequency', required=False, type=float)

    align_parser = subparsers.add_parser('align', help='Paired end read mapping to genome')
    add_required(align_parser, '--bwa', '--genome', '--read1', '--read2', '--outfolder')

    identify_parser = subparsers.add_parser('identify', help='Identify GUIDE-seq offtargets')
    add_required(identify_parser, '--aligned', '--genome', '--outfolder', '--target_sequence')
    identify_parser.add_argument('--description', required=False)
    identify_parser.add_argument('--max_score', required=False, type=int, default=7)
    identify_parser.add_argument('--window_size', required=False, type=int, default=25)

    filter_parser = subparsers.add_parser('filter', help='Filter identified sites from control sites')
    add_required(filter_parser, '--bedtools', '--identified', '--background', '--outfolder')

    visualize_parser = subparsers.add_parser('visualize', help='Visualize off-target sites')
    add_required(visualize_parser, '--infile', '--outfolder')
    visualize_parser.add_argument('--title', required=False)

    return parser.parse_args()
def main():
    """Run the pipeline step selected on the command line.

    ``all`` loads the manifest and, in this visualize-only script, either
    re-runs identify/filter/visualize (``--identifyAndFilter``) or only
    redraws the visualizations.  Every other sub-command configures a bare
    ``GuideSeq`` object from the command-line options and runs that single
    step.
    """
    args = parse_args()

    if args.command == 'all':

        if args.identifyAndFilter:
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                # Bootstrap the aligned samfile paths
                g.aligned = {}
                for sample in g.samples:
                    g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')

                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                print('Error running only identify and filter.')
                print(traceback.format_exc())
                # Exit with a failure status; quit() would report success (0).
                raise SystemExit(1)
        else:
            g = GuideSeq()
            g.parseManifest(args.manifest)
            # The demultiplex/umitag/consolidate/align/identify/filter steps
            # are intentionally not run here: only visualize() is called.
            # visualize() reads g.identified, which normally comes from
            # identifyOfftargetSites(); bootstrap it with that step's output
            # path convention when the manifest loader did not provide it.
            # NOTE(review): confirm these are the files this script is
            # expected to redraw.
            if not hasattr(g, 'identified'):
                g.identified = dict(
                    (sample, os.path.join(g.output_folder, 'identified',
                                          sample + '_identifiedOfftargets.txt'))
                    for sample in g.samples)
            g.visualize()

    elif args.command == 'demultiplex':
        # Run just the demultiplex step given the manifest
        g = GuideSeq()
        g.parseManifestDemultiplex(args.manifest)
        g.demultiplex()

    elif args.command == 'umitag':
        # Run just the umitag step, e.g.
        #   python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq
        #     --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq
        #     --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
        g = GuideSeq()
        g.output_folder = args.outfolder
        # The sample name is the file name up to its first dot.
        sample = os.path.basename(args.read1).split('.')[0]
        g.samples = [sample]
        g.demultiplexed = {sample: {'read1': args.read1,
                                    'read2': args.read2,
                                    'index1': args.index1,
                                    'index2': args.index2}}
        g.umitag()

    elif args.command == 'consolidate':
        # Run just the consolidate step, e.g.
        #   python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq
        #     --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/
        #     --min_frequency 0.8 --min_quality 14
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.umitagged = {sample: {'read1': args.read1, 'read2': args.read2}}

        # argparse always defines these attributes (None when omitted), so a
        # membership test on ``args`` never selected the defaults.
        min_qual = args.min_quality if args.min_quality is not None else CONSOLIDATE_MIN_QUAL
        min_freq = args.min_frequency if args.min_frequency is not None else CONSOLIDATE_MIN_FREQ

        g.consolidate(min_freq=min_freq, min_qual=min_qual)

    elif args.command == 'align':
        # Run just the alignment step, e.g.
        #   python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq
        #     --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa
        #     --outfolder test/output/
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.BWA_path = args.bwa
        g.reference_genome = args.genome
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.consolidated = {sample: {'read1': args.read1, 'read2': args.read2}}
        g.alignReads()

    elif args.command == 'identify':
        # Run just the identify step, e.g.
        #   python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa
        #     --aligned test/output/aligned/EMX1.sam --outfolder test/output/
        #     --target_sequence GAGTCCGAGCAGAAGAAGAANGG
        # --description has no parser default and is None when omitted.
        description = args.description if args.description is not None else ''

        g = GuideSeq()
        g.output_folder = args.outfolder
        g.reference_genome = args.genome
        sample = os.path.basename(args.aligned).split('.')[0]
        g.samples = {sample: {'description': description, 'target': args.target_sequence}}
        g.aligned = {sample: args.aligned}
        # The parser supplies the defaults (7 and 25) for these two options.
        g.max_score = args.max_score
        g.window_size = args.window_size
        g.identifyOfftargetSites()

    elif args.command == 'filter':
        # Run just the filter step
        sample = os.path.basename(args.identified).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.bedtools = args.bedtools
        g.samples = {sample: {}, 'control': {}}
        g.identified = {sample: args.identified}
        g.identified['control'] = args.background
        g.filterBackgroundSites()

    elif args.command == 'visualize':
        # Run just the visualize step
        g = GuideSeq()
        # NOTE(review): unlike the other steps this uses the *parent* of
        # --outfolder unless the path ends with a separator - confirm intent.
        g.output_folder = os.path.dirname(args.outfolder)
        sample = os.path.basename(args.infile).split('.')[0]
        g.samples = {sample: {}}
        # NOTE(review): no PAM is configured for this standalone step and
        # --title is not forwarded; confirm visualize() copes with both.
        g.identified = {sample: args.infile}
        g.visualize()
class GuideSeq:
    """Driver object for the GUIDE-seq pipeline.

    Each step records the paths it produces on the instance
    (``demultiplexed``, ``umitagged``, ``consolidated``, ``aligned``,
    ``identified``, ``filtered``) so that the following step can read them.
    """

    def __init__(self):
        pass

    def _abort(self, message):
        # Log *message* plus the active traceback, then exit with a failure
        # status (quit() is an interactive helper and reports status 0).
        logger.error(message)
        logger.error(traceback.format_exc())
        raise SystemExit(1)

    def parseManifest(self, manifest_path):
        """Load and validate the YAML manifest for a full pipeline run.

        Sets ``BWA_path``, ``bedtools``, ``reference_genome``,
        ``output_folder``, ``undemultiplexed``, ``samples`` and the optional
        ``demultiplex_min_reads``, ``window_size`` and ``max_score`` (each
        falling back to its module-level default).

        Raises:
            SystemExit: with status 1 if the manifest is malformed.
            AssertionError: if there is no ``control`` sample or fewer than
                two samples.
        """
        logger.info('Loading manifest...')

        with open(manifest_path, 'r') as f:
            # safe_load builds plain Python objects only; yaml.load() without
            # an explicit Loader can construct arbitrary objects and is
            # rejected outright by recent PyYAML releases.
            manifest_data = yaml.safe_load(f)

        try:
            # Validate manifest data
            validation.validateManifest(manifest_data)

            self.BWA_path = manifest_data['bwa']
            self.bedtools = manifest_data['bedtools']
            self.reference_genome = manifest_data['reference_genome']
            self.output_folder = manifest_data['output_folder']
            self.undemultiplexed = manifest_data['undemultiplexed']
            self.samples = manifest_data['samples']

        except Exception:
            logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
            raise SystemExit(1)

        # Allow the user to specify min reads for demultiplex if they want
        self.demultiplex_min_reads = manifest_data.get('demultiplex_min_reads', DEFAULT_DEMULTIPLEX_MIN_READS)
        # Allow the user to specify window size for off-target search
        self.window_size = manifest_data.get('window_size', DEFAULT_WINDOW_SIZE)
        # Allow the user to specify the maximum score for off-target search
        self.max_score = manifest_data.get('max_score', DEFAULT_MAX_SCORE)

        # Make sure the user has specified a control barcode
        if 'control' not in self.samples:
            raise AssertionError('Your manifest must have a control sample specified.')

        # Make sure the user has both a sample and a control
        if len(self.samples) < 2:
            raise AssertionError('Your manifest must have at least one control and one treatment sample.')

        logger.info('Successfully loaded manifest.')

    def parseManifestDemultiplex(self, manifest_path):
        """Load only the manifest fields needed by the demultiplex step.

        Sets ``output_folder``, ``undemultiplexed``, ``samples`` and
        ``demultiplex_min_reads``.

        Raises:
            SystemExit: with status 1 if a required field is missing.
        """
        logger.info('Loading manifest for demultiplexing...')

        with open(manifest_path, 'r') as f:
            manifest_data = yaml.safe_load(f)

        try:
            self.output_folder = manifest_data['output_folder']
            self.undemultiplexed = manifest_data['undemultiplexed']
            self.samples = manifest_data['samples']

        except Exception:
            logger.error('Incomplete or incorrect manifest file. Please ensure your manifest contains all required fields.')
            raise SystemExit(1)

        # Allow the user to specify min reads for demultiplex if they want
        self.demultiplex_min_reads = manifest_data.get('demultiplex_min_reads', DEFAULT_DEMULTIPLEX_MIN_READS)

        logger.info('Successfully loaded manifest for single-step demultiplexing.')

    def demultiplex(self):
        """Split the pooled FASTQ files into per-sample FASTQ files.

        On success ``self.demultiplexed[sample]`` holds the ``read1``,
        ``read2``, ``index1`` and ``index2`` paths under
        ``<output_folder>/demultiplexed``.  Exits with status 1 on failure.
        """
        logger.info('Demultiplexing undemultiplexed files...')

        # Map the combined barcode (characters 1-7 of each barcode,
        # concatenated) back to the sample it identifies.
        swapped_sample_barcodes = {}
        for sample in self.samples:
            barcode1 = self.samples[sample]['barcode1']
            barcode2 = self.samples[sample]['barcode2']
            swapped_sample_barcodes[barcode1[1:8] + barcode2[1:8]] = sample

        try:
            folder = os.path.join(self.output_folder, 'demultiplexed')
            demultiplex.demultiplex(self.undemultiplexed['forward'],
                                    self.undemultiplexed['reverse'],
                                    self.undemultiplexed['index1'],
                                    self.undemultiplexed['index2'],
                                    swapped_sample_barcodes,
                                    folder,
                                    min_reads=self.demultiplex_min_reads)

            self.demultiplexed = {}
            for sample in self.samples:
                self.demultiplexed[sample] = {
                    'read1': os.path.join(folder, sample + '.r1.fastq'),
                    'read2': os.path.join(folder, sample + '.r2.fastq'),
                    'index1': os.path.join(folder, sample + '.i1.fastq'),
                    'index2': os.path.join(folder, sample + '.i2.fastq'),
                }

            logger.info('Successfully demultiplexed reads.')
        except Exception:
            self._abort('Error demultiplexing reads.')

    def umitag(self):
        """UMI-tag the demultiplexed reads of every sample.

        On success ``self.umitagged[sample]`` holds the ``read1`` and
        ``read2`` paths under ``<output_folder>/umitagged``.  Exits with
        status 1 on failure.
        """
        logger.info('umitagging reads...')

        try:
            self.umitagged = {}
            folder = os.path.join(self.output_folder, 'umitagged')
            for sample in self.samples:
                self.umitagged[sample] = {
                    'read1': os.path.join(folder, sample + '.r1.umitagged.fastq'),
                    'read2': os.path.join(folder, sample + '.r2.umitagged.fastq'),
                }

                umitag.umitag(self.demultiplexed[sample]['read1'],
                              self.demultiplexed[sample]['read2'],
                              self.demultiplexed[sample]['index1'],
                              self.demultiplexed[sample]['index2'],
                              self.umitagged[sample]['read1'],
                              self.umitagged[sample]['read2'],
                              folder)

            logger.info('Successfully umitagged reads.')
        except Exception:
            self._abort('Error umitagging')

    def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL):
        """Consolidate the UMI-tagged reads of every sample.

        Args:
            min_freq: minimum frequency forwarded to ``consolidate.consolidate``.
            min_qual: minimum quality forwarded to ``consolidate.consolidate``.

        On success ``self.consolidated[sample]`` holds the ``read1`` and
        ``read2`` paths under ``<output_folder>/consolidated``.  Exits with
        status 1 on failure.
        """
        logger.info('Consolidating reads...')

        try:
            self.consolidated = {}
            folder = os.path.join(self.output_folder, 'consolidated')

            for sample in self.samples:
                self.consolidated[sample] = {
                    'read1': os.path.join(folder, sample + '.r1.consolidated.fastq'),
                    'read2': os.path.join(folder, sample + '.r2.consolidated.fastq'),
                }

                # Each mate file is consolidated independently.
                for mate in ('read1', 'read2'):
                    consolidate.consolidate(self.umitagged[sample][mate],
                                            self.consolidated[sample][mate],
                                            min_qual, min_freq)

            logger.info('Successfully consolidated reads.')
        except Exception:
            # The message used to say 'Error umitagging' (copy-paste slip).
            self._abort('Error consolidating')

    def alignReads(self):
        """Align every sample's consolidated read pair to the reference genome.

        On success ``self.aligned[sample]`` is
        ``<output_folder>/aligned/<sample>.sam``.  Exits with status 1 on
        failure.
        """
        logger.info('Aligning reads...')

        try:
            self.aligned = {}
            for sample in self.samples:
                sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
                alignReads(self.BWA_path,
                           self.reference_genome,
                           self.consolidated[sample]['read1'],
                           self.consolidated[sample]['read2'],
                           sample_alignment_path)
                self.aligned[sample] = sample_alignment_path
                logger.info('Finished aligning reads to genome.')

        except Exception:
            self._abort('Error aligning')

    def identifyOfftargetSites(self):
        """Identify off-target sites for every sample from its alignment.

        On success ``self.identified[sample]`` is
        ``<output_folder>/identified/<sample>_identifiedOfftargets.txt``.
        Exits with status 1 on failure.
        """
        logger.info('Identifying offtarget sites...')

        try:
            self.identified = {}
            # Alignment paths registered by alignReads() or by the caller.
            known_alignments = getattr(self, 'aligned', None) or {}

            # Identify offtarget sites for each sample
            for sample in self.samples:

                # Prepare sample annotations
                sample_data = self.samples[sample]
                annotations = {
                    'Description': sample_data['description'],
                    'Targetsite': sample,
                    # Compare by value: ``is 'control'`` tested identity and
                    # could miss an equal string that is a different object.
                    'Sequence': '' if sample == 'control' else sample_data['target'],
                }

                # Prefer an explicitly registered SAM file (the ``identify``
                # command passes one); otherwise use the default path.
                default_samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam')
                samfile = known_alignments.get(sample, default_samfile)

                self.identified[sample] = os.path.join(self.output_folder, 'identified',
                                                       sample + '_identifiedOfftargets.txt')

                identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample],
                                               annotations, self.window_size, self.max_score)

            logger.info('Finished identifying offtarget sites.')

        except Exception:
            self._abort('Error identifying offtarget sites.')

    def filterBackgroundSites(self):
        """Filter each treatment sample's identified sites against the control.

        On success ``self.filtered[sample]`` is
        ``<output_folder>/filtered/<sample>_backgroundFiltered.txt`` for
        every non-control sample.  Failures are logged but do not terminate
        the process.
        """
        logger.info('Filtering background sites')

        try:
            self.filtered = {}

            # Filter background in each sample
            for sample in self.samples:
                if sample == 'control':
                    continue
                self.filtered[sample] = os.path.join(self.output_folder, 'filtered',
                                                     sample + '_backgroundFiltered.txt')
                filterBackgroundSites(self.bedtools, self.identified[sample],
                                      self.identified['control'], self.filtered[sample])
                logger.info('Finished background filtering for {0} sample'.format(sample))

            logger.info('Finished filtering background sites.')

        except Exception:
            logger.error('Error filtering background sites.')
            logger.error(traceback.format_exc())

    def visualize(self):
        """Draw the off-target visualization of every non-control sample.

        Output goes to ``<output_folder>/visualization/<sample>_offtargets``.
        A failure is logged and stops the remaining samples, but does not
        terminate the process.
        """
        logger.info('Visualizing off-target sites')

        try:
            for sample in self.samples:
                if sample == 'control':
                    continue
                infile = self.identified[sample]
                outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
                visualizeOfftargets(infile, outfile, title=sample)

            logger.info('Finished visualizing off-target sites')

        except Exception:
            logger.error('Error visualizing off-target sites.')
            logger.error(traceback.format_exc())
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    One sub-command exists per pipeline step (``demultiplex``, ``umitag``,
    ``consolidate``, ``align``, ``identify``, ``filter``, ``visualize``)
    plus ``all`` for the manifest-driven run.

    Returns:
        argparse.Namespace: the parsed options; ``command`` holds the name
        of the selected sub-command.

    Raises:
        SystemExit: raised by argparse (status 2) for invalid arguments or
            when no sub-command is given.
    """
    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(description='Individual Step Commands',
                                       help='Use this to run individual steps of the pipeline',
                                       dest='command')
    # Python 3 treats sub-commands as optional by default; without this a
    # bare invocation parsed successfully with command=None and the program
    # silently did nothing.
    subparsers.required = True

    def add_required(subparser, *flags):
        # Register each flag as a mandatory option, in the given order.
        for flag in flags:
            subparser.add_argument(flag, required=True)

    all_parser = subparsers.add_parser('all', help='Run all steps of the pipeline')
    all_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
    all_parser.add_argument('--identifyAndFilter', action='store_true', default=False)

    demultiplex_parser = subparsers.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files')
    demultiplex_parser.add_argument('--manifest', '-m', help='Specify the manifest path', required=True)

    umitag_parser = subparsers.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation')
    add_required(umitag_parser, '--read1', '--read2', '--index1', '--index2', '--outfolder')

    consolidate_parser = subparsers.add_parser('consolidate', help='Consolidate UMI tagged FASTQs')
    add_required(consolidate_parser, '--read1', '--read2', '--outfolder')
    consolidate_parser.add_argument('--min_quality', required=False, type=float)
    consolidate_parser.add_argument('--min_frequency', required=False, type=float)

    align_parser = subparsers.add_parser('align', help='Paired end read mapping to genome')
    add_required(align_parser, '--bwa', '--genome', '--read1', '--read2', '--outfolder')

    identify_parser = subparsers.add_parser('identify', help='Identify GUIDE-seq offtargets')
    add_required(identify_parser, '--aligned', '--genome', '--outfolder', '--target_sequence')
    identify_parser.add_argument('--description', required=False)
    identify_parser.add_argument('--max_score', required=False, type=int, default=7)
    identify_parser.add_argument('--window_size', required=False, type=int, default=25)

    filter_parser = subparsers.add_parser('filter', help='Filter identified sites from control sites')
    add_required(filter_parser, '--bedtools', '--identified', '--background', '--outfolder')

    visualize_parser = subparsers.add_parser('visualize', help='Visualize off-target sites')
    add_required(visualize_parser, '--infile', '--outfolder')
    visualize_parser.add_argument('--title', required=False)

    return parser.parse_args()
def main():
    """Run the pipeline step selected on the command line.

    ``all`` loads the manifest and runs either the whole pipeline or, with
    ``--identifyAndFilter``, only identify/filter/visualize on existing
    alignments.  Every other sub-command configures a bare ``GuideSeq``
    object from the command-line options and runs that single step.
    """
    args = parse_args()

    if args.command == 'all':

        if args.identifyAndFilter:
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                # Bootstrap the aligned samfile paths
                g.aligned = {}
                for sample in g.samples:
                    g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')

                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                # print() with one parenthesised argument works on both
                # Python 2 and 3; the bare print statement is a SyntaxError
                # on Python 3.
                print('Error running only identify and filter.')
                print(traceback.format_exc())
                # Exit with a failure status; quit() would report success (0).
                raise SystemExit(1)
        else:
            g = GuideSeq()
            g.parseManifest(args.manifest)
            g.demultiplex()
            g.umitag()
            g.consolidate()
            g.alignReads()
            g.identifyOfftargetSites()
            g.filterBackgroundSites()
            g.visualize()

    elif args.command == 'demultiplex':
        # Run just the demultiplex step given the manifest
        g = GuideSeq()
        g.parseManifestDemultiplex(args.manifest)
        g.demultiplex()

    elif args.command == 'umitag':
        # Run just the umitag step, e.g.
        #   python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq
        #     --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq
        #     --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
        g = GuideSeq()
        g.output_folder = args.outfolder
        # The sample name is the file name up to its first dot.
        sample = os.path.basename(args.read1).split('.')[0]
        g.samples = [sample]
        g.demultiplexed = {sample: {'read1': args.read1,
                                    'read2': args.read2,
                                    'index1': args.index1,
                                    'index2': args.index2}}
        g.umitag()

    elif args.command == 'consolidate':
        # Run just the consolidate step, e.g.
        #   python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq
        #     --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/
        #     --min_frequency 0.8 --min_quality 14
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.umitagged = {sample: {'read1': args.read1, 'read2': args.read2}}

        # argparse always defines these attributes (None when omitted), so a
        # membership test on ``args`` never selected the defaults.
        min_qual = args.min_quality if args.min_quality is not None else CONSOLIDATE_MIN_QUAL
        min_freq = args.min_frequency if args.min_frequency is not None else CONSOLIDATE_MIN_FREQ

        g.consolidate(min_freq=min_freq, min_qual=min_qual)

    elif args.command == 'align':
        # Run just the alignment step, e.g.
        #   python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq
        #     --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa
        #     --outfolder test/output/
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.BWA_path = args.bwa
        g.reference_genome = args.genome
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.consolidated = {sample: {'read1': args.read1, 'read2': args.read2}}
        g.alignReads()

    elif args.command == 'identify':
        # Run just the identify step, e.g.
        #   python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa
        #     --aligned test/output/aligned/EMX1.sam --outfolder test/output/
        #     --target_sequence GAGTCCGAGCAGAAGAAGAANGG
        # --description has no parser default and is None when omitted.
        description = args.description if args.description is not None else ''

        g = GuideSeq()
        g.output_folder = args.outfolder
        g.reference_genome = args.genome
        sample = os.path.basename(args.aligned).split('.')[0]
        g.samples = {sample: {'description': description, 'target': args.target_sequence}}
        g.aligned = {sample: args.aligned}
        # The parser supplies the defaults (7 and 25) for these two options.
        g.max_score = args.max_score
        g.window_size = args.window_size
        g.identifyOfftargetSites()

    elif args.command == 'filter':
        # Run just the filter step
        sample = os.path.basename(args.identified).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.bedtools = args.bedtools
        g.samples = {sample: {}, 'control': {}}
        g.identified = {sample: args.identified}
        g.identified['control'] = args.background
        g.filterBackgroundSites()

    elif args.command == 'visualize':
        # Run just the visualize step
        g = GuideSeq()
        # NOTE(review): unlike the other steps this uses the *parent* of
        # --outfolder unless the path ends with a separator - confirm intent.
        g.output_folder = os.path.dirname(args.outfolder)
        sample = os.path.basename(args.infile).split('.')[0]
        g.samples = {sample: {}}
        g.identified = {sample: args.infile}
        g.visualize()
# --- guideseq/log.py ---

def createCustomLogger(name):
    """Return the logger called *name*, configured for pipeline output.

    The logger is set to DEBUG and gets one stream handler that formats
    records as ``[time][level][module] message``.  Calling this again for
    the same name returns the same logger without stacking a second
    handler, which would print every message twice.

    Args:
        name: logger name passed to ``logging.getLogger``.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Only attach our handler once per logger.
    if not any(getattr(h, '_guideseq_handler', False) for h in logger.handlers):
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            fmt='[%(asctime)s][%(levelname)s][%(module)s] %(message)s',
            datefmt='%m/%d %I:%M:%S%p'))
        handler._guideseq_handler = True
        logger.addHandler(handler)
    return logger


# --- guideseq/validation.py ---

logger = logging.getLogger('root')


def exists(filepath):
    """Exit with status 1 (after logging an error) unless *filepath* is a file."""
    if not os.path.isfile(filepath):
        logger.error('{0} does not exist'.format(filepath))
        sys.exit(1)


def checkIfBinary(filepath):
    """Exit with status 1 unless *filepath* resolves to a binary executable.

    *filepath* may be a path or a command name looked up on ``PATH``.  The
    first 1024 bytes of the resolved file must contain at least one byte
    outside the set of typical text characters.
    """
    # distutils was removed in Python 3.12; prefer shutil.which and keep
    # the distutils helper only as the Python 2 fallback.
    try:
        from shutil import which as find_executable
    except ImportError:
        from distutils.spawn import find_executable

    executable = find_executable(filepath)

    if executable is None:
        logger.error('Executable binary not found at {0}'.format(filepath))
        sys.exit(1)

    # First check if file exists
    exists(executable)

    # Check if file is a valid binary
    # Adapted from http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
    textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})

    # Close the file promptly instead of leaking the handle.
    with open(executable, 'rb') as handle:
        head = handle.read(1024)

    # Whatever survives deleting the text characters is binary content.
    if not bool(head.translate(None, textchars)):
        logger.error('{0} is not a valid binary'.format(executable))
        sys.exit(1)


def checkIfFasta(filepath):
    """Exit with status 1 unless *filepath* exists; the content is not inspected."""
    # First check if file exists
    exists(os.path.abspath(filepath))
# ---- guideseq/validation.py (second half) ----

def checkIfFolder(folderpath):
    """Terminate the program unless *folderpath* is an existing directory.

    Logs an error and raises ``SystemExit`` with status 1 on failure.
    """
    # Check if the folder exists
    if not os.path.isdir(os.path.abspath(folderpath)):
        logger.error('{0} is not a valid folder path'.format(folderpath))
        sys.exit(1)


def checkIfValidUndemultiplexed(undemultiplexed):
    """Validate the ``undemultiplexed`` section of the manifest.

    *undemultiplexed* must be a mapping with exactly the keys ``forward``,
    ``reverse``, ``index1`` and ``index2``, each naming an existing file.
    Every invalid entry is reported, by its own field name, before the
    program is terminated with ``SystemExit`` (status 1).
    """
    # Check if read1, read2, index1, and index2 exist
    fields = ['forward', 'reverse', 'index1', 'index2']

    if set(fields) != set(undemultiplexed.keys()):
        logger.error('Undemultiplexed field must contain references to "forward", "reverse", "index1", "index2"')
        sys.exit(1)

    # Collect all missing files first so the user sees every problem at once.
    invalid_file = False
    for field in fields:
        if not os.path.isfile(undemultiplexed[field]):
            logger.error('"{0}" undemultiplexed field does not reference a valid file'.format(field))
            invalid_file = True

    if invalid_file:
        sys.exit(1)


def checkIfValidSamples(samples):
    """Validate the ``samples`` section of the manifest.

    *samples* must be a non-empty mapping that contains a ``control`` entry,
    and every sample must define ``barcode1``, ``barcode2`` and ``target``.
    The first violation is logged and raises ``SystemExit`` (status 1).
    """
    # The emptiness test has to come first: an empty mapping can never
    # contain 'control', so the other order made this message unreachable.
    if not samples:
        logger.error('No samples defined')
        sys.exit(1)

    # Check if control is one of the samples
    if 'control' not in samples:
        logger.error('A control sample must be specified')
        sys.exit(1)

    for sample in samples:
        if 'barcode1' not in samples[sample] or 'barcode2' not in samples[sample]:
            logger.error('barcode1 and barcode2 must be specified for {0} sample'.format(sample))
            sys.exit(1)
        if 'target' not in samples[sample]:
            logger.error('target sequence must be specified for {0} sample'.format(sample))
            sys.exit(1)


def validateManifest(manifest_data):
    """Validate a parsed manifest, terminating the program if it is unusable.

    All required top-level fields are checked first (every missing one is
    reported), then each field's value is validated by the dedicated helper.
    Failures raise ``SystemExit`` with status 1.
    """
    # Check if manifest contains the required fields
    fields = ['bwa', 'bedtools', 'reference_genome', 'output_folder', 'samples', 'undemultiplexed']
    missing_fields = False

    for field in fields:
        if field not in manifest_data:
            logger.error('"{0}" field must be specified in manifest'.format(field))
            missing_fields = True

    if missing_fields:
        sys.exit(1)

    # Now validate each field
    # NOTE(review): 'output_folder' is required above but its value is not
    # validated here -- presumably because it is created later; confirm.
    checkIfBinary(manifest_data['bwa'])
    checkIfBinary(manifest_data['bedtools'])
    checkIfFasta(manifest_data['reference_genome'])
    checkIfValidUndemultiplexed(manifest_data['undemultiplexed'])
    checkIfValidSamples(manifest_data['samples'])
# ---- guideseq/visualization.py ----

logger = logging.getLogger('root')
logger.propagate = False

boxWidth = 10
box_size = 15
v_spacing = 3

colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', '-': '#FFFFFF'}


def parseSitesFile(infile):
    """Read an identified-offtargets table and return the sites to plot.

    The first line of *infile* is treated as a header and skipped.  From each
    tab-separated data row the following 0-based columns are read: 9 (read
    count), 19 (site sequence without bulges), 24 (site sequence with gaps
    allowed), 35 (target sequence) and 36 (realigned target sequence).
    Rows where both site sequences are empty are not plotted.

    Returns a tuple ``(offtargets, target_seq, total_seq)``:

    * ``offtargets`` -- list of dicts with keys ``seq``, ``bulged_seq``,
      ``reads``, ``target_seq`` and ``realigned_target_seq``, sorted by
      ``reads`` in descending order;
    * ``target_seq`` -- target sequence of the last data row, or ``''`` when
      the file has no data rows;
    * ``total_seq`` -- number of sequences (rows of boxes) that will be drawn.
    """
    offtargets = []
    total_seq = 0
    target_seq = ''  # stays empty for a header-only file instead of being unbound
    with open(infile, 'r') as f:
        f.readline()  # skip the header row
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank (e.g. trailing) lines carry no columns to index.
                continue
            line_items = line.split('\t')
            offtarget_reads = line_items[9].strip()
            no_bulge_offtarget_sequence = line_items[19].strip()
            bulge_offtarget_sequence = line_items[24].strip()
            target_seq = line_items[35].strip()
            realigned_target_seq = line_items[36].strip()

            if no_bulge_offtarget_sequence or bulge_offtarget_sequence:
                total_seq += bool(no_bulge_offtarget_sequence) + bool(bulge_offtarget_sequence)
                offtargets.append({'seq': no_bulge_offtarget_sequence,
                                   'bulged_seq': bulge_offtarget_sequence,
                                   'reads': int(offtarget_reads),
                                   'target_seq': target_seq,
                                   'realigned_target_seq': realigned_target_seq
                                   })
    offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
    return offtargets, target_seq, total_seq


def _tickMarks(target_seq):
    """Return ``(tick_locations, tick_legend)`` for the ruler above the reference row.

    Locations are 1-based positions in *target_seq*; the legend holds the
    label drawn at each location.  The first ``N`` is taken as the start of
    the PAM: if it lies in the right half the protospacer is numbered towards
    the PAM, otherwise the PAM is assumed to be at the left end.  Targets
    without ``N`` get plain position numbers.
    """
    n = len(target_seq)
    pam_start = target_seq.find('N')

    if pam_start < 0:
        tick_locations = [1, n] + list(range(n + 1)[::10][1:])
        return tick_locations, [str(x) for x in tick_locations]

    tick_locations = [1, n]  # limits
    if pam_start > n / 2:  # PAM on the right end
        tick_locations += range(n + 1)[::10][1:]  # intermediate values
        tick_locations += range(n + 1)[n - 2:n]  # complementing PAM
        tick_locations.sort()
        tick_legend = [str(x) for x in tick_locations[:-3][::-1]] + ['P', 'A', 'M']
    else:
        tick_locations += [range(4, n + 1)[::10][1]]
        # Written as a list: ``range(2, 4) + [5]`` is a TypeError on Python 3.
        tick_locations += [2, 3, 5]
        tick_locations.sort()
        tick_legend = ['P', 'A', 'M'] + [str(x - 4) for x in tick_locations[3:]]
    return tick_locations, tick_legend


def _baseFill(base):
    """Return the box colour for *base*; symbols missing from ``colors`` use the 'N' grey."""
    return colors.get(base, colors['N'])


def _drawAlignedRow(dwg, offtarget_seq, reference_seq, x_offset, y):
    """Draw one off-target sequence, compared base by base against *reference_seq*.

    Matches are drawn as a dot, bases opposite an ``N`` as plain text, and
    mismatches as a coloured box.  A ``-`` in the reference marks an inserted
    base in the off-target, drawn as a small box between two columns.
    """
    k = 0  # column counter; not advanced for bases inserted relative to the reference
    for c, r in zip(offtarget_seq, reference_seq):
        x = x_offset + k * box_size
        if r == '-':
            # NOTE(review): the source's indentation is lost in this dump; the
            # marker is assumed to be drawn only inside this range check.
            if 0 < k < len(reference_seq):
                x = x_offset + (k - 0.25) * box_size
                dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=_baseFill(c)))
                dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
        elif c == r:
            dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
            k += 1
        elif r == 'N':
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1
        else:
            dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=_baseFill(c)))
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1


def visualizeOfftargets(infile, outfile, title=None, PAM="NGG"):
    """Render the off-target alignment plot for *infile* to ``outfile + '.svg'``.

    *infile* is an identified-offtargets table (see ``parseSitesFile``).  The
    output directory is created when it does not exist.  *title*, when given,
    is drawn above the plot.
    """
    # Note: PAM is not currently used
    output_folder = os.path.dirname(outfile)
    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get offtargets array from file
    offtargets, target_seq, total_seq = parseSitesFile(infile)

    # Initiate canvas
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))

    # Define top and left margins
    x_offset = 20
    if title is not None:
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
    else:
        y_offset = 20

    # Draw ticks (once; both lists are defined for every kind of target)
    tick_locations, tick_legend = _tickMarks(target_seq)
    for x, y in zip(tick_locations, tick_legend):
        dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=_baseFill(c)))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))

    dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))

    # Draw aligned sequence rows
    y_offset += 1  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences
    reads_x = box_size * (len(target_seq) + 1) + 20
    for site in offtargets:
        no_bulge_offtarget_sequence = site['seq']
        bulge_offtarget_sequence = site['bulged_seq']

        if no_bulge_offtarget_sequence != '':
            line_number += 1
            _drawAlignedRow(dwg, no_bulge_offtarget_sequence, target_seq,
                            x_offset, y_offset + line_number * box_size)
        if bulge_offtarget_sequence != '':
            line_number += 1
            _drawAlignedRow(dwg, bulge_offtarget_sequence, site['realigned_target_seq'],
                            x_offset, y_offset + line_number * box_size)

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            # Single row: read count sits on that row's baseline.
            dwg.add(dwg.text(str(site['reads']), insert=(reads_x, y_offset + box_size * (line_number + 2) - 2),
                             fill='black', style="font-size:15px; font-family:Courier"))
        else:
            # Two rows for one site: one read count between them, joined by a brace.
            pair_y = y_offset + box_size * (line_number + 1) + 5
            dwg.add(dwg.text(str(site['reads']), insert=(reads_x, pair_y),
                             fill='black', style="font-size:15px; font-family:Courier"))
            dwg.add(dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, pair_y),
                             fill='black', style="font-size:23px; font-family:Courier"))
    dwg.save()


def main():
    """Command-line entry point: parse the arguments and draw the plot."""
    parser = argparse.ArgumentParser(description='Plot visualization plots for aligned reads.')
    parser.add_argument("--identified", help="Full path to sample identified output", required=True)
    parser.add_argument("--outfile", help="Full path to output file", required=True)
    parser.add_argument("--title", help="Plot title", required=True)
    args = parser.parse_args()

    print(args)

    visualizeOfftargets(args.identified, args.outfile, title=args.title)


if __name__ == "__main__":

    main()
# ---- guideseq/visualization2.py ----

logger = logging.getLogger('root')
logger.propagate = False

boxWidth = 10
box_size = 15
v_spacing = 3

colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#FFFFFF'}


def parseSitesFile(infile):
    """Read an identified-offtargets table and return the sites to plot.

    The first line of *infile* is treated as a header and skipped.  From each
    tab-separated data row the following 0-based columns are read: 11 (read
    count), 24 (site sequence without bulges), 29 (site sequence with
    bulges), 40 (target sequence) and 41 (realigned target sequence).
    Rows where both site sequences are empty are not plotted.

    Returns ``(offtargets, target_seq, total_seq)``: the list of site dicts
    sorted by ``reads`` descending, the target sequence of the last data row
    (``''`` when there is none) and the number of sequences to draw.
    """
    offtargets = []
    total_seq = 0
    target_seq = ''  # stays empty for a header-only file instead of being unbound
    with open(infile, 'r') as f:
        f.readline()  # skip the header row
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank (e.g. trailing) lines carry no columns to index.
                continue
            line_items = line.split('\t')
            offtarget_reads = line_items[11].strip()
            no_bulge_offtarget_sequence = line_items[24].strip()
            bulge_offtarget_sequence = line_items[29].strip()
            target_seq = line_items[40].strip()
            realigned_target_seq = line_items[41].strip()

            if no_bulge_offtarget_sequence or bulge_offtarget_sequence:
                total_seq += bool(no_bulge_offtarget_sequence) + bool(bulge_offtarget_sequence)
                offtargets.append({'seq': no_bulge_offtarget_sequence,
                                   'bulged_seq': bulge_offtarget_sequence,
                                   'reads': int(offtarget_reads),
                                   'target_seq': target_seq,
                                   'realigned_target_seq': realigned_target_seq
                                   })
    offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
    return offtargets, target_seq, total_seq


def _pamTicks(target_seq, PAM):
    """Return ``(PAM_index, tick_locations, tick_legend)`` for the ruler.

    The PAM is assumed to be on the right end.  Counting leftwards from the
    base next to the PAM, every tenth protospacer position gets a numeric
    tick; the PAM positions are labelled ``P``, ``A``, ``M`` (then ``-``).

    Raises ``ValueError`` when *PAM* does not occur in *target_seq*.
    """
    PAM_index = target_seq.find(PAM)
    if PAM_index < 0:
        raise ValueError('PAM "{0}" not found in target sequence "{1}"'.format(PAM, target_seq))

    tick_locations = []
    tick_legend = []
    for count, position in enumerate(range(PAM_index, 0, -1), 1):
        if count % 10 == 0:
            tick_legend.append(str(count))
            tick_locations.append(position)
    tick_legend += ['P', 'A', 'M'] + ['-'] * (len(PAM) - 3)
    tick_locations += range(PAM_index + 1, len(target_seq) + 1)
    return PAM_index, tick_locations, tick_legend


def _baseFill(base):
    """Return the box colour for *base*; symbols missing from ``colors`` use the 'N' grey."""
    return colors.get(base, colors['N'])


def _drawAlignedRow(dwg, offtarget_seq, reference_seq, x_offset, y, PAM_index):
    """Draw one off-target sequence, compared base by base against *reference_seq*.

    Matches are drawn as a dot, bases opposite an ``N`` as plain text, and
    mismatches as a box that is coloured in the protospacer and white from
    *PAM_index* onwards.  A ``-`` in the reference marks an inserted base,
    drawn as a small box between two columns.
    """
    k = 0  # column counter; not advanced for bases inserted relative to the reference
    for i, (c, r) in enumerate(zip(offtarget_seq, reference_seq)):
        x = x_offset + k * box_size
        fill = _baseFill(c) if i < PAM_index else "#FFFFFF"
        if r == '-':
            # NOTE(review): the source's indentation is lost in this dump; the
            # marker is assumed to be drawn only inside this range check.
            if 0 < k < len(reference_seq):
                x = x_offset + (k - 0.25) * box_size
                dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=fill))
                dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
        elif c == r:
            dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
            k += 1
        elif r == 'N':
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1
        else:
            dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=fill))
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1


def visualizeOfftargets(infile, outfile, title, PAM):
    """Render the off-target alignment plot for *infile* to ``outfile + '.svg'``.

    *PAM* must occur literally in the target sequence (``ValueError``
    otherwise); everything from its position onwards is drawn in grey/white.
    The output directory is created when it does not exist.  *title* is drawn
    above the plot unless it is ``None``.
    """
    output_folder = os.path.dirname(outfile)
    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get offtargets array from file
    offtargets, target_seq, total_seq = parseSitesFile(infile)

    # Initiate canvas
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))

    # Define top and left margins
    x_offset = 20
    if title is not None:
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
    else:
        y_offset = 20

    # Draw ticks
    ## Assume PAM is on the right end
    PAM_index, tick_locations, tick_legend = _pamTicks(target_seq, PAM)
    for x, y in zip(tick_locations, tick_legend):
        dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        fill = _baseFill(c) if i < PAM_index else "#B3B3B3"
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=fill))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))

    dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))

    # Draw aligned sequence rows
    y_offset += 1  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences
    reads_x = box_size * (len(target_seq) + 1) + 20
    for site in offtargets:
        no_bulge_offtarget_sequence = site['seq']
        bulge_offtarget_sequence = site['bulged_seq']

        if no_bulge_offtarget_sequence != '':
            line_number += 1
            _drawAlignedRow(dwg, no_bulge_offtarget_sequence, target_seq,
                            x_offset, y_offset + line_number * box_size, PAM_index)
        if bulge_offtarget_sequence != '':
            line_number += 1
            _drawAlignedRow(dwg, bulge_offtarget_sequence, site['realigned_target_seq'],
                            x_offset, y_offset + line_number * box_size, PAM_index)

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            # Single row: read count sits on that row's baseline.
            dwg.add(dwg.text(str(site['reads']), insert=(reads_x, y_offset + box_size * (line_number + 2) - 2),
                             fill='black', style="font-size:15px; font-family:Courier"))
        else:
            # Two rows for one site: one read count between them, joined by a brace.
            pair_y = y_offset + box_size * (line_number + 1) + 5
            dwg.add(dwg.text(str(site['reads']), insert=(reads_x, pair_y),
                             fill='black', style="font-size:15px; font-family:Courier"))
            dwg.add(dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, pair_y),
                             fill='black', style="font-size:23px; font-family:Courier"))
    dwg.save()


def main():
    """Command-line entry point: ``INFILE OUTFILE TITLE PAM``.

    The usage line is printed only when arguments are missing; errors raised
    while plotting propagate instead of being disguised as a usage problem.
    """
    if len(sys.argv) < 5:
        print('Usage: python visualization.py INFILE OUTFILE TITLE PAM')
        return
    visualizeOfftargets(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])


if __name__ == '__main__':
    main()
# ---- guideseq/visualization_bk.py ----

logger = logging.getLogger('root')
logger.propagate = False

boxWidth = 10
box_size = 15
v_spacing = 3

colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#FFFFFF'}


def parseSitesFile(infile):
    """Read an identified-offtargets table and return the sites to plot.

    The first line of *infile* is treated as a header and skipped.  From each
    tab-separated data row the following 0-based columns are read: 11 (read
    count), 24 (site sequence without bulges), 29 (site sequence with
    bulges), 40 (target sequence) and 41 (realigned target sequence).
    Rows where both site sequences are empty are not plotted.

    Returns ``(offtargets, target_seq, total_seq)``: the list of site dicts
    sorted by ``reads`` descending, the target sequence of the last data row
    (``''`` when there is none) and the number of sequences to draw.
    """
    offtargets = []
    total_seq = 0
    target_seq = ''  # stays empty for a header-only file instead of being unbound
    with open(infile, 'r') as f:
        f.readline()  # skip the header row
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank (e.g. trailing) lines carry no columns to index.
                continue
            line_items = line.split('\t')
            offtarget_reads = line_items[11].strip()
            no_bulge_offtarget_sequence = line_items[24].strip()
            bulge_offtarget_sequence = line_items[29].strip()
            target_seq = line_items[40].strip()
            realigned_target_seq = line_items[41].strip()

            if no_bulge_offtarget_sequence or bulge_offtarget_sequence:
                total_seq += bool(no_bulge_offtarget_sequence) + bool(bulge_offtarget_sequence)
                offtargets.append({'seq': no_bulge_offtarget_sequence,
                                   'bulged_seq': bulge_offtarget_sequence,
                                   'reads': int(offtarget_reads),
                                   'target_seq': target_seq,
                                   'realigned_target_seq': realigned_target_seq
                                   })
    offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
    return offtargets, target_seq, total_seq


def _tickMarks(target_seq):
    """Return ``(tick_locations, tick_legend)`` for the ruler above the reference row.

    Locations are 1-based positions in *target_seq*; the legend holds the
    label drawn at each location.  The first ``N`` is taken as the start of
    the PAM: if it lies in the right half the protospacer is numbered towards
    the PAM, otherwise the PAM is assumed to be at the left end.  Targets
    without ``N`` get plain position numbers instead of raising ``ValueError``.
    """
    n = len(target_seq)
    pam_start = target_seq.find('N')

    if pam_start < 0:
        tick_locations = [1, n] + list(range(n + 1)[::10][1:])
        return tick_locations, [str(x) for x in tick_locations]

    tick_locations = [1, n]  # limits
    if pam_start > n / 2:  # PAM on the right end
        tick_locations += range(n + 1)[::10][1:]  # intermediate values
        tick_locations += range(n + 1)[n - 2:n]  # complementing PAM
        tick_locations.sort()
        tick_legend = [str(x) for x in tick_locations[:-3][::-1]] + ['P', 'A', 'M']
    else:
        tick_locations += [range(3, n + 1)[::10][1]]
        tick_locations += range(2, 5)
        tick_locations.sort()
        tick_legend = ['P', 'A', 'M'] + [str(x - 3) for x in tick_locations[3:]]
    return tick_locations, tick_legend


def _baseFill(base):
    """Return the box colour for *base*; symbols missing from ``colors`` use the 'N' grey."""
    return colors.get(base, colors['N'])


def _drawAlignedRow(dwg, offtarget_seq, reference_seq, x_offset, y):
    """Draw one off-target sequence, compared base by base against *reference_seq*.

    Matches are drawn as a dot, bases opposite an ``N`` as plain text, and
    mismatches as a coloured box.  A ``-`` in the reference marks an inserted
    base in the off-target, drawn as a small box between two columns.
    """
    k = 0  # column counter; not advanced for bases inserted relative to the reference
    for c, r in zip(offtarget_seq, reference_seq):
        x = x_offset + k * box_size
        if r == '-':
            # NOTE(review): the source's indentation is lost in this dump; the
            # marker is assumed to be drawn only inside this range check.
            if 0 < k < len(reference_seq):
                x = x_offset + (k - 0.25) * box_size
                dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=_baseFill(c)))
                dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
        elif c == r:
            dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
            k += 1
        elif r == 'N':
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1
        else:
            dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=_baseFill(c)))
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1


def visualizeOfftargets(infile, outfile, title=None):
    """Render the off-target alignment plot for *infile* to ``outfile + '.svg'``.

    *infile* is an identified-offtargets table (see ``parseSitesFile``).  The
    output directory is created when it does not exist.  *title*, when given,
    is drawn above the plot.
    """
    output_folder = os.path.dirname(outfile)
    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get offtargets array from file
    offtargets, target_seq, total_seq = parseSitesFile(infile)

    # Initiate canvas
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))

    # Define top and left margins
    x_offset = 20
    if title is not None:
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
    else:
        y_offset = 20

    # Draw ticks
    tick_locations, tick_legend = _tickMarks(target_seq)
    for x, y in zip(tick_locations, tick_legend):
        dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=_baseFill(c)))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))

    dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))

    # Draw aligned sequence rows
    y_offset += 1  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences
    reads_x = box_size * (len(target_seq) + 1) + 20
    for site in offtargets:
        no_bulge_offtarget_sequence = site['seq']
        bulge_offtarget_sequence = site['bulged_seq']

        if no_bulge_offtarget_sequence != '':
            line_number += 1
            _drawAlignedRow(dwg, no_bulge_offtarget_sequence, target_seq,
                            x_offset, y_offset + line_number * box_size)
        if bulge_offtarget_sequence != '':
            line_number += 1
            _drawAlignedRow(dwg, bulge_offtarget_sequence, site['realigned_target_seq'],
                            x_offset, y_offset + line_number * box_size)

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            # Single row: read count sits on that row's baseline.
            dwg.add(dwg.text(str(site['reads']), insert=(reads_x, y_offset + box_size * (line_number + 2) - 2),
                             fill='black', style="font-size:15px; font-family:Courier"))
        else:
            # Two rows for one site: one read count between them, joined by a brace.
            pair_y = y_offset + box_size * (line_number + 1) + 5
            dwg.add(dwg.text(str(site['reads']), insert=(reads_x, pair_y),
                             fill='black', style="font-size:15px; font-family:Courier"))
            dwg.add(dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, pair_y),
                             fill='black', style="font-size:23px; font-family:Courier"))
    dwg.save()


def main():
    """Command-line entry point: ``INFILE OUTFILE [TITLE]``.

    The title is used only when exactly three arguments are given; with fewer
    than two arguments the usage line is printed.
    """
    if len(sys.argv) < 3:
        print('Usage: python visualization.py INFILE OUTFILE [TITLE]')
        return
    title = sys.argv[3] if len(sys.argv) == 4 else None
    visualizeOfftargets(sys.argv[1], sys.argv[2], title=title)


if __name__ == '__main__':
    main()
Aryee.Martin@mgh.harvard.edu, vedtopkar@gmail.com', 21 | url='https://github.com/tsailabSJ/guideseq', 22 | # packages=find_packages(), 23 | packages=[ 24 | 'guideseq', 25 | 'umi', 26 | ], 27 | package_dir={'guideseq': 28 | 'guideseq','umi':'guideseq/umi'}, 29 | 30 | scripts=['guideseq/guideseq.py','guideseq/alignReads.py','guideseq/visualization.py', 31 | 'guideseq/filterBackgroundSites.py','guideseq/identifyOfftargetSites.py','guideseq/log.py', 32 | 'guideseq/validation.py'], 33 | package_data={'test': ["test/*"]}, 34 | license="AGPL", 35 | include_package_data=True, 36 | long_description=long_description, 37 | long_description_content_type='text/markdown', 38 | keywords='guideseq', 39 | classifiers=[ 40 | 'Development Status :: 4 - Beta', 41 | 'Intended Audience :: Science/Research', 42 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 43 | 'Topic :: Scientific/Engineering :: Visualization', 44 | 'Topic :: Scientific/Engineering :: Information Analysis', 45 | 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 46 | 'Operating System :: Unix', 47 | 'Natural Language :: English', 48 | "Programming Language :: Python :: 2", 49 | 'Programming Language :: Python :: 3' 50 | ] 51 | ) 52 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /test/data/demultiplexed/undetermined.i1.fastq: -------------------------------------------------------------------------------- 1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 1:N:0:0 2 | CAGGCATG 3 | + 4 | CCCCCCFF 5 | -------------------------------------------------------------------------------- /test/data/demultiplexed/undetermined.i2.fastq: -------------------------------------------------------------------------------- 1 | 
@M01326:74:000000000-A6B33:1:2112:16279:24208 2:N:0:0 2 | TGGATTGTTTTATGTC 3 | + 4 | CBCCCFFFFFFFGGGG 5 | -------------------------------------------------------------------------------- /test/data/demultiplexed/undetermined.r1.fastq: -------------------------------------------------------------------------------- 1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 1:N:0:0 2 | TTCAATAAACTGGGGACAGACTGAGGCAATTACATCATAAACTCTTATTTTTAAAATGAATTAAAAAGAAACCTTTTTGACGGTTTAATTGAGTTGTCATATGTATCACCGACTGCCCATAGAGAGGACTCCAGTCACCAGGCATGATCTC 3 | + 4 | CCCDCFFFFFFFGG2FEGGGGGHHHFGFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHIHHHHHHHHHGGHHHHHHHHHGGHGGGGGHHHHHGHHHHHHHHHHEGDDDFHHFGGGGHHHHHHHHHHGEHHGHHHHHHHHHHHGHHHGHD3F3 5 | -------------------------------------------------------------------------------- /test/data/demultiplexed/undetermined.r2.fastq: -------------------------------------------------------------------------------- 1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 2:N:0:0 2 | ACATATGACAACTCAATTAAACCGTCAAAAAGGTTTCTTTTTAATTCATTTTAAAAATAAGAGTTTATGATGTAATTGCCTCAGTCTGTCCCCAGTTTATTGAAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTGACATAAAAAAAAA 3 | + 4 | BCBBCFFFFFFCGGGGGGGGGGHGGGGGGHHGGHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHGHHHHHHHHHHHHHEHHHHHHFHAGFHHHGHGFFHHHHHHHGGGFHHHGEEGEEFGHHGHHHHHH?FFHHHHGHHHH2BF// 5 | -------------------------------------------------------------------------------- /test/data/filtered/EMX1_backgroundFiltered.txt: -------------------------------------------------------------------------------- 1 | 1:236259170-236261754 1473 1486 chr1:236259170-236261754_1486_7 EMX1.sam 1486 ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT 7 0 7 0.0 33 0 33 0.0 2 5 3.16227766017 7.116178749862878 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 2 | -------------------------------------------------------------------------------- /test/data/identified/EMX1_identifiedOfftargets.txt: -------------------------------------------------------------------------------- 1 | Chromosome Min.Position Max.Position Name 
Filename Position WindowSequence +.mi -.mi bi.sum.mi bi.geometric_mean.mi +.total -.total total.sum total.geometric_mean primer1.mi primer2.mi primer.geometric_mean position.stdev Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End Cell Targetsite TargetSequence RealignedTargetSequence 2 | 15:44108746-44110769 1007 1025 chr15:44108746-44110769_1017_189 EMX1.sam 1017 GTAGACAAGAGTCTAAGCAGAAGAAGAAGAGAGCCACTACCCAACCATCT 116 73 189 92.0217365626 258 148 406 195.407267009 96 80 87.6356092008 4.931631338038255 GAGTCTAAGCAGAAGAAGAAGAG 3 + 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 3 | 1:236259170-236261754 1465 1486 chr1:236259170-236261754_1486_7 EMX1.sam 1486 ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT 7 0 7 0.0 33 0 33 0.0 2 5 3.16227766017 7.116178749862878 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 4 | 1:236259170-236261754 1531 1539 chr1:236259170-236261754_1531_5 EMX1.sam 1531 GGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTACT 0 5 5 0.0 0 5 5 0.0 1 2 1.41421356237 2.947456530637899 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 5 | 2:73159981-73162004 1008 1024 chr2:73159981-73162004_1017_489 EMX1.sam 1017 AAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGG 243 246 489 244.49539873 619 541 1160 578.68730762 236 231 233.486616319 4.710360920354193 GAGTCCGAGCAGAAGAAGAAGGG 0 + 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 6 | 3:197899267-197901348 1075 1081 chr3:197899267-197901348_1080_10 EMX1.sam 1080 TTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGG 0 10 10 0.0 0 32 32 0.0 9 1 3.0 2.5495097567963922 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 7 | 6:9117792-9119815 1007 1007 chr6:9117792-9119815_1007_4 EMX1.sam 1007 
ATGTCCTCAGAGTTCTGTCCATTCTTCTTCTGCTCAGACGTTTTGTCTGA 1 3 4 1.73205080757 1 9 10 3.0 2 2 2.0 0.0 ACGTCTGAGCAGAAGAAGAATGG 3 - 1000 1023 EMX_site1 EMX1 GAGTCCGAGCAGAAGAAGAANGG none 8 | -------------------------------------------------------------------------------- /test/data/identified/control_identifiedOfftargets.txt: -------------------------------------------------------------------------------- 1 | Chromosome Min.Position Max.Position Name Filename Position WindowSequence +.mi -.mi bi.sum.mi bi.geometric_mean.mi +.total -.total total.sum total.geometric_mean primer1.mi primer2.mi primer.geometric_mean position.stdev Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End Cell Targetsite TargetSequence RealignedTargetSequence 2 | 1:236259170-236261754 1473 1490 chr1:236259170-236261754_1481_7 control.sam 1481 TCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCA 1 6 7 2.44948974278 1 9 10 3.0 2 5 3.16227766017 5.535341001239219 Control control None none 3 | 1:236259170-236261754 1521 1531 chr1:236259170-236261754_1523_14 control.sam 1523 GGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGT 0 14 14 0.0 0 18 18 0.0 7 7 7.0 3.7094473981982814 Control control None none 4 | 3:197899267-197901348 1035 1040 chr3:197899267-197901348_1040_3 control.sam 1040 TAGGGTTGGGTTAGGGTTAGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAG 3 0 3 0.0 5 0 5 0.0 1 1 1.0 2.0548046676563256 Control control None none 5 | -------------------------------------------------------------------------------- /test/data/undemultiplexed/undemux.i1.fastq.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.i1.fastq.gz -------------------------------------------------------------------------------- /test/data/undemultiplexed/undemux.i2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.i2.fastq.gz -------------------------------------------------------------------------------- /test/data/undemultiplexed/undemux.r1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.r1.fastq.gz -------------------------------------------------------------------------------- /test/data/undemultiplexed/undemux.r2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.r2.fastq.gz -------------------------------------------------------------------------------- /test/data/visualization/EMX1_identifiedOfftargets_offtargets.svg: -------------------------------------------------------------------------------- 1 | 2 | EMX1_identifiedOfftargets1231020GAGTCCGAGCAGAAGAAGAANGGReads489TAA189ACT4 -------------------------------------------------------------------------------- /test/demultiplex_manifest.yaml: -------------------------------------------------------------------------------- 1 | output_folder: test/output 2 | 3 | demultiplex_min_reads: 1000 4 | 5 | undemultiplexed: 6 | forward: test/data/undemultiplexed/undemux.r1.fastq.gz 7 | reverse: test/data/undemultiplexed/undemux.r2.fastq.gz 8 | index1: test/data/undemultiplexed/undemux.i1.fastq.gz 9 | index2: 
#!/bin/bash
# This script downloads a full GUIDE-Seq dataset and runs the analysis pipeline.
# It should be run from the test directory.

# Stop at the first failing step so that a broken clone, build or download
# is reported immediately instead of surfacing later as an md5 mismatch.
set -euo pipefail

cd large_test

# Create an output directory with a commit id hash suffix
OUTDIR="output.$(git log --pretty=format:'%h' -n 1)"
mkdir -p "$OUTDIR"
# -n replaces an existing "output" symlink; without it a re-run would follow
# the old link and create the new link inside the previous output directory.
ln -sfn "$OUTDIR" output

# Install bwa (reuse an existing checkout when the script is re-run)
[ -d bwa ] || git clone https://github.com/lh3/bwa.git
cd bwa
git checkout tags/0.7.9a
make
cd ..
PATH="$(pwd)/bwa:$PATH"

# Install bedtools (reuse an existing checkout when the script is re-run)
[ -d bedtools2 ] || git clone https://github.com/arq5x/bedtools2.git
cd bedtools2
git checkout tags/v2.25.0
make
cd ..
PATH="$(pwd)/bedtools2/bin:$PATH"

# Download test data FASTQs and manifest
wget http://aryee.mgh.harvard.edu/guideseq/data/guideseq_test_fastq.zip
unzip guideseq_test_fastq.zip

# Download the reference genome
wget http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta

# Run analysis pipeline
python ../../guideseq/guideseq.py all -m test_manifest.yaml

# Check that output tables match the reference output
# (md5sum's exit status becomes the exit status of this script)
cd output/filtered
md5sum -c ../../reference_output/md5.txt
chr1_236260693_19 VEGFA_site1.sam 843 1 236260693 GGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGT 0 19 19 0.0 0 25 25 0.0 11 8 9.38083151965 8.38152730712 VEGFA_site1_U2OS VEGFA_site1 GGGTGGGGGGAGTTTGCTCCNGG 3 | chr2 9877830 9877831 chr2_9877830_119 VEGFA_site1.sam 4347 2 9877830 CAGTACCTCCCACTCCCCCAGTGCCCCCCACTCCTCCTAGTACCCCCATT 119 0 119 0.0 617 0 617 0.0 33 81 51.7010638188 1.0 VEGFA_site1_U2OS VEGFA_site1 GGGTGGGGGGAGTTTGCTCCNGG 4 | -------------------------------------------------------------------------------- /test/large_test/reference_output/VEGFA_site2_backgroundFiltered.txt: -------------------------------------------------------------------------------- 1 | chr1 121485100 121485106 chr1_121485107_44 VEGFA_site2.sam 1168 1 121485107 ACAGATGAATTCTCAGTAACTTCCTTGTGTTGTGTGTATTCAACTCACAG 30 14 44 20.4939015319 76 36 112 52.3067873225 20 23 21.4476105895 9.2269064409 VEGFA_site2_U2OS VEGFA_site2 GACCCCCTCCACCCCGCCTCNGG 2 | chr2 9877830 9877831 chr2_9877830_133 VEGFA_site2.sam 11661 2 9877830 CAGTACCTCCCACTCCCCCAGTGCCCCCCACTCCTCCTAGTACCCCCATT 133 0 133 0.0 297 0 297 0.0 33 96 56.2849891179 0.816496580928 VEGFA_site2_U2OS VEGFA_site2 GACCCCCTCCACCCCGCCTCNGG 3 | -------------------------------------------------------------------------------- /test/large_test/reference_output/VEGFA_site3_backgroundFiltered.txt: -------------------------------------------------------------------------------- 1 | chr1 236260648 236260654 chr1_236260643_273 VEGFA_site3.sam 455 1 236260643 GGGGTGACTCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCA 35 238 273 91.2688336728 35 239 274 91.4603739332 6 260 39.4968353163 11.292504645 VEGFA_site3_U2OS VEGFA_site3 GGTGAGTGAGTGTGTGCGTGNGG 2 | chr1 236260691 236260709 chr1_236260699_88 VEGFA_site3.sam 456 1 236260699 CAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTA 16 72 88 33.941125497 102 171 273 132.068164218 27 58 39.5727178748 9.71494545992 VEGFA_site3_U2OS VEGFA_site3 GGTGAGTGAGTGTGTGCGTGNGG 3 | chr2 242838693 242838704 chr2_242838710_5 
# Index the GRCh37 reference with bwa, then align each paired-end sample
# against it, writing one SAM file per sample into ../output.

REFERENCE=~/GRCh37/Homo_sapiens_assembly19.fasta

# Run bwa to generate index
bwa index "$REFERENCE"

# Run paired end mapping to generate SAM files
for SAMPLE in emx1 control
do
    bwa mem "$REFERENCE" "$SAMPLE.r1.fastq.gz" "$SAMPLE.r2.fastq.gz" > "../output/$SAMPLE.sam"
done
## This script generates two test FASTQ datasets:
## 1. An undemultiplexed dataset representing several barcoded samples with molecular indexing.
##    This dataset represents raw data from a MiSeq run following the GUIDE-Seq protocol
##    described in Tsai et al., 2014 (PMID XXXX)
## 2. A two sample dataset containing reads from a control and an EMX guide experiment
##    that overlap with a small set of test regions:
##    the on-target location, 3 off-target locations and two DSB hotspots.
##    The reads representing the same template molecule (i.e. those with the same
##    molecular barcode) have been consolidated.
##
## Raw input dataset:
## /data/joung/sequencing_fastq/131007_M01326_0075_000000000-A6B33/fastq_with_indexes
## -rw-rw----. 1 ma695 aryee 2.7G Oct 31 12:44 guideseq_test_fastq.zip
## -rw-rw----. 1 st680 joung 120M Oct 14 14:52 Undetermined_S0_L001_I1_001.fastq.gz
## -rw-rw----. 1 st680 joung 221M Oct 14 14:53 Undetermined_S0_L001_I2_001.fastq.gz
## -rw-rw----. 1 st680 joung 1.1G Oct 14 14:58 Undetermined_S0_L001_R1_001.fastq.gz
## -rw-rw----. 1 st680 joung 1.3G Oct 14 14:59 Undetermined_S0_L001_R2_001.fastq.gz
##
## EMX1 has barcode P706 (TAGGCATG), A01 (TAGATCGC).
## Ignoring the first base and concatenating gives AGGCATGAGATCGC.
## EMX1 target sequence: GAGTCCGAGCAGAAGAAGAANGG
##
## Oligo control has barcode P707 (CTCTCTAC), A02 (CTCTCTAT)
## Ignoring the first base and concatenating gives TCTCTACTCTCTAT.
##
## NOTE: the script relies on an environment-modules setup (`module load ...`)
## and on absolute /data/... paths, so it only runs as-is on the machine
## where those exist.

# Genomic test regions (chromosome:start-end), passed to `samtools view` below
ON_TARGET="2:73160981-73161004"
OFF_TARGET="15:44109746-44109769 6:9118792-9118815 2:218378101-218378124"
DSB_HOTSPOTS="1:236260170-236260754 3:197900267-197900348"

# Location of the demultiplex.py / umitag.py / consolidate.py helper scripts
UMI_PKG_DIR="../../guideseq/umi"

# Align reads
INPUT_DIR="/data/joung/sequencing_fastq/131007_M01326_0075_000000000-A6B33/fastq_with_indexes"
BWA_INDEX="/data/aryee/pub/genomes/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/genome.fa"
module load aryee/bwa-0.7.9a
time bwa mem $BWA_INDEX $INPUT_DIR/Undetermined_S0_L001_R1_001.fastq.gz $INPUT_DIR/Undetermined_S0_L001_R2_001.fastq.gz > undemux.sam

# Generate BAM:
module load samtools/0.1.19
samtools view -bS undemux.sam > undemux.bam

# Sort BAM
# (second argument is an output prefix: this writes undemux.sorted.bam,
#  which is the file indexed and queried below)
samtools sort undemux.bam undemux.sorted

# Index BAMs
samtools index undemux.sorted.bam

# Get the names of reads that overlap with the selected test regions:
# (column 1 of the SAM output is the read name; sort | uniq de-duplicates)
samtools view undemux.sorted.bam $ON_TARGET $OFF_TARGET $DSB_HOTSPOTS | cut -f1 | sort | uniq > read_names.txt

# Subset FASTQs to extract _all_ read pairs where at least one of the reads falls in a specified test region
# (grep -F matches the read name on the header line; -A3 keeps the three lines
#  that follow it, and --no-group-separator suppresses the "--" lines grep
#  would otherwise insert between matches)
zcat $INPUT_DIR/Undetermined_S0_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.r1.fastq.gz
zcat $INPUT_DIR/Undetermined_S0_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.r2.fastq.gz
zcat $INPUT_DIR/Undetermined_S0_L001_I1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.i1.fastq.gz
zcat $INPUT_DIR/Undetermined_S0_L001_I2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.i2.fastq.gz

# Demultiplex full target region FASTQs
# (the emx1.* / control.* FASTQs read in the next step are presumably the
#  per-sample outputs of this call, named after samplekey.txt - TODO confirm)
python $UMI_PKG_DIR/demultiplex.py --min_reads 1000 --read1 undemux_all.r1.fastq.gz --read2 undemux_all.r2.fastq.gz --index1 undemux_all.i1.fastq.gz --index2 undemux_all.i2.fastq.gz --sample_barcodes samplekey.txt

# Choose a subset of EMX1 and control read names:
# (shuf draws its random bytes from the FASTQ file itself, so the selection is
#  repeatable for identical input; the second command appends with >>)
cat emx1.r1.fastq | grep ^@M01326 | cut -f1 -d ' ' | sort | uniq | shuf --random-source emx1.r1.fastq -n 6000 > read_names_sample.txt
cat control.r1.fastq | grep ^@M01326 | cut -f1 -d ' ' | sort | uniq | shuf --random-source control.r1.fastq -n 2000 >> read_names_sample.txt

# Subset FASTQs to extract _a sample of_ read pairs where at least one of the reads falls in a specified test region
zcat $INPUT_DIR/Undetermined_S0_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.r1.fastq.gz
zcat $INPUT_DIR/Undetermined_S0_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.r2.fastq.gz
zcat $INPUT_DIR/Undetermined_S0_L001_I1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.i1.fastq.gz
zcat $INPUT_DIR/Undetermined_S0_L001_I2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.i2.fastq.gz

# Demultiplex sub-sampled target region FASTQs
python $UMI_PKG_DIR/demultiplex.py --min_reads 1000 --read1 undemux.r1.fastq.gz --read2 undemux.r2.fastq.gz --index1 undemux.i1.fastq.gz --index2 undemux.i2.fastq.gz --sample_barcodes samplekey.txt

# Consolidate reads with the same molecular index
# (umitag.py combines the read and index FASTQs into *.umitagged.fastq, which
#  consolidate.py then reduces to *.consolidated.fastq; the meaning of the
#  trailing "15 0.9" arguments is defined by consolidate.py, not here)
for SAMPLE in emx1 control
do
    echo "Consolidating reads for $SAMPLE"
    python $UMI_PKG_DIR/umitag.py --read1_in $SAMPLE.r1.fastq --read2_in $SAMPLE.r2.fastq --read1_out $SAMPLE.r1.umitagged.fastq --read2_out $SAMPLE.r2.umitagged.fastq --index1 $SAMPLE.i1.fastq --index2 $SAMPLE.i2.fastq
    python $UMI_PKG_DIR/consolidate.py $SAMPLE.r1.umitagged.fastq $SAMPLE.r1.consolidated.fastq 15 0.9
    python $UMI_PKG_DIR/consolidate.py $SAMPLE.r2.umitagged.fastq $SAMPLE.r2.consolidated.fastq 15 0.9
done

# Copy test datasets to data dir
# (the sub-sampled undemultiplexed FASTQs are copied as-is; the per-sample
#  consolidated FASTQs are gzipped into data/ under the plain
#  SAMPLE.r1/r2.fastq.gz names)
cp undemux.*.fastq.gz data
for SAMPLE in emx1 control
do
    gzip -c $SAMPLE.r1.consolidated.fastq > data/$SAMPLE.r1.fastq.gz
    gzip -c $SAMPLE.r2.consolidated.fastq > data/$SAMPLE.r2.fastq.gz
done
# Builds a bwa index over a GRCh38 chromosome subset in which everything
# outside the (padded) test regions has been masked with Ns, for unit tests.
# The index is hosted at: http://aryee.mgh.harvard.edu/guideseq/data/Homo_sapiens.GRCh38.dna.subset.masked.fa.index.zip
# Requirements: samtools, bedtools

ENSEMBL_DNA="ftp://ftp.ensembl.org/pub/release-82/fasta/homo_sapiens/dna"
SUBSET="Homo_sapiens.GRCh38.dna.subset"

mkdir -p genome_prep
cd genome_prep

# Fetch the chromosomes that contain the test regions
for CHR in 1 2 3 6 15
do
    wget "$ENSEMBL_DNA/Homo_sapiens.GRCh38.dna.chromosome.$CHR.fa.gz"
done

# Concatenate the gzipped chromosomes, decompress, and build the FASTA index
cat *.fa.gz > "$SUBSET.fa.gz"
gunzip "$SUBSET.fa.gz"
samtools faidx "$SUBSET.fa"

# Pad test regions with 1kb on either side
bedtools slop -i ../test_regions.bed -g "$SUBSET.fa.fai" -b 1000 > test_regions_padded.bed

# Everything that is NOT a padded test region
bedtools complement -i test_regions_padded.bed -g "$SUBSET.fa.fai" > test_regions_complement.bed

# Replace the non-test regions with Ns
bedtools maskfasta -fi "$SUBSET.fa" -fo "$SUBSET.masked.fa" -bed test_regions_complement.bed

# Move the masked FASTA up one level, out of genome_prep, and follow it
mv "$SUBSET.masked.fa" ..
cd ..

# Fetch and build bwa 0.7.12
git clone https://github.com/lh3/bwa.git
cd bwa
git checkout tags/0.7.12
make
cd ..

# Index the masked genome with the freshly built bwa
bwa/bwa index "$SUBSET.masked.fa"
ACTAACCCTGACTAGCCTGCTTATATTGCATCATCTATTTCTTCCCATGAAAACCATGATAAAGGCTCCTGCCCCCAGGTCCCATCACCCCAGCCTGCTGTCTTACCTGAGTACTTCTCCCTGTGGCCCTGCATGGAGTGCCATGCCTCCTGTCTCTAGGGACTGAGTATAACAAAAACCTTTTCCTTTATACCAATTATTTTCATATCTGCATGTCTTACCATACCCAATTAAAACAAATCTCAAATACAATTAAAACTCACTTCTGAGAAATCTTTGCCTAACCCAACCTCGCAAAGATTTTCTACATGTTTTCTTCTAGAACAGAGGTCACCAAGCTACGGCCTGTGGGCCAGTGGCTTGTTTTTCTAAATAAGGTTTTACTGGATTATGGCCATACTCTTCATTTTTGTAATGTTTATGGCTGCTTTTGTGCTATGACAGAGTTAAGTAGTCATGACAGAGACCACATGGCCCATAAGCCTAAACATCTGCTACCTCGCCCTTTAAGAAAAAGTAAGATGACCTCTGTTTTAAAAATATTATAGTTTTAGTTCTTACATTTAGGTCTGAGACCCTTTCTGAGTTAAACATTTTTATGATGTGAAGTAAAAGTTAAGGATATTTTTGTGTATGAACATCCACTTCATCCACAAAGCAGCAATGTTTCTTTTTCAAAAAAGACCATCTTTCCATGTTGAATTATGTTGGTCGTCATCAAAAATTAATTGACCATGTAGCTGTTTCTCAACTTTTTATTCTGTTCCATTGATCTTCGTGTCTATTCTTCTACCAATACCACCCTGTCACGTCTATTGTATCTTTCTAGTAAATCTTAAAATTGGGTAACATAAGCTCTCCATTTCCAGTTTCTGGAAAAATTTGTGTAGAATTTGTTGTAAATAAATTTTTGGTGCTGCAAAAGAAATAGCACTCAAACATAAGTTTAATTTTCTCAGCAAGGCAATTTTACTTCTCTAGAAGGGTGCGACTCGCAGATGGAGCAATGGCCAGAGCACACCTGAACAAGGGAGGGGAAGGGGTTCTGATTCCTGACACAGGTAGCCCCTACTGATGCGTCGTTCCCGTATTGGCTAGGGTTGGACTGCACAGTCTAAGCTAATTCCGATTGGCTACTTTAAAGAGAGCAGGGGTATGAGCCAGAGTGGCGGGGTGAGTAGTTTGGTGGGAAGGGTGGTTACAGAACAGGTGACTCAGGATGATTCAGGTCAGAGCAGGTGACCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTACTGATTAGAACTGTTGGAAAAGGTTGTTTAGTGAAACTAGGGCTGAGGAGAACGAGGAAGTTCAACTTTAAAATGGAGAACAAAGAACTGAACATACTGACATACTGATTCTTTGAAGAGAAATTTAGAACTCACTGTATTCAACAAATTATTATTTTTGCTTTTAAGTGTCTGTGGAATTCACCGGTGATCCACCTGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCATTGTGCCCGGCCAAGAACAGCCTTCCATTTCTACCTTGCCTGTAGCCTCCCAAACCAAAATACGGGTCTCAACACCTGGTCCTTGATCTCCTTGACTCCCGTTGGACCCAAGAAGTATGGCTTCTCTCCCCTTTCCTCTGGTGGATGGCCCTGGACACAACCCACCTCACACTAACAGAGACAGAGGGGTGGTCCAAGGGAAGTGAGT
GAGAAGTTACCTGCAGAAGAGGGAACTTTTGAGAGAAGGAACCAGGTAGCTGTGAGGCCCTTGAAAACAGAAGGCTGTCAAGGCAGATTTGAAGGGAGAATATATGTGAATTAGCAAGTGCAGGAGATAGAGAGAGCACTAACTATTAGAAAGAAATGAAAATAGGGCCAGGCGCGGTGGCTCACACTTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGTGTATCACTTGAGGTCAGGAGTTCGAGTCCAGCCTGGCCAACATGGCGAAACCTCCCCACCATGTCTACTAAAAATACAAAAATTAGCTGGGTGTGATGGTGTGCACATGTAATCCCAACTACTTGGGAGGCTGAGGCAGGAGAATCTCTTGAACCTGGGAGGCGGAGGCTGCAGTGAGTCAAGATTGTGCCACTGCACTCCAGCCTGGGCGAAAGAATGAGACTCCATCAGAAAGGAAGAAAGGAAGGGAAGGGGAAAAGAAGGGAAGGGAAGGGAAGGGAAGGGAAGGGAAGGAAGGAAAGGAAGGGAAGGAAGGAAGGGAAAGGAAGAAAGGAAGGGAAGGGAAAAGAAGGGAAGGGAAG 3 | >2:73159981-73162004 4 | TCTCCTGACTGTTCCTTGTGTGACCTGTTCCCACATCTGGATGGGCTGCAGGAGCCAGTGCTGTGGGGACAGAAGGTCTGGAGCTGCCCGTGAAGGGCAGAATGCTGCCCTCAGACCCGCTTCCTCCCTGTCCTTGTCTGTCCAAGGAGAATGAGGTCTCACTGGTGGATTTCGGACTACCCTGAGGAGCTGGCACCTGAGGGACAAGGCCCCCCACCTGCCCAGCTCCAGCCTCTGATGAGGGGTGGGAGAGAGCTACATGAGGTTGCTAAGAAAGCCTCCCCTGAAGGAGACCACACAGTGTGTGAGGTTGGAGTCTCTAGCAGCGGGTTCTGTGCCCCCAGGGATAGTCTGGCTGTCCAGGCACTGCTCTTGATATAAACACCACCTCCTAGTTATGAAACCATGCCCATTCTGCCTCTCTGTATGGAAAAGAGCATGGGGCTGGCCCGTGGGGTGGTGTCCACTTTAGGCCCTGTGGGAGATCATGGGAACCCACGCAGTGGGTCATAGGCTCTCTCATTTACTACTCACATCCACTCTGTGAAGAAGCGATTATGATCTCTCCTCTAGAAACTCGTAGAGTCCCATGTCTGCCGGCTTCCAGAGCCTGCACTCCTCCACCTTGGCTTGGCTTTGCTGGGGCTAGAGGAGCTAGGATGCACAGCAGCTCTGTGACCCTTTGTTTGAGAGGAACAGGAAAACCACCCTTCTCTCTGGCCCACTGTGTCCTCTTCCTGCCCTGCCATCCCCTTCTGTGAATGTTAGACCCATGGGAGCAGCTGGTCAGAGGGGACCCCGGCCTGGGGCCCCTAACCCTATGTAGCCTCAGTCTTCCCATCAGGCTCTCAGCTCAGCCTGAGTGTTGAGGCCCCAGTGGCTGCTCTGGGGGCCTCCTGAGTTTCTCATCTGTGCCCCTCCCTCCCTGGCCCAGGTGAAGGTGTGGTTCCAGAACCGGAGGACAAAGTACAAACGGCAGAAGCTGGAGGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCGCATTGCCACGAAGCAGGCCAATGGGGAGGACATCGATGTCACCTCCAATGACTAGGGTGGGCAACCACAAACCCACGAGGGCAGAGTGCTGCTTGCTGCTGGCCAGGCCCCTGCGTGGGCCCAAGCTGGACTCTGGCCACTCCCTGGCCAGGCTTTGGGGAGGCCTGGAGTCATGGCCCCACAGGGCTTGAAGCCCGGGGCCGCCATTGACAGAGGGACAAGCAATGGGCTGGCTGAGGCCTGGGACCACTTGGCCTTCTCCTCGGAGAGCCTGCCTGCCTGGGCGGGCCCGCCCGCCACCGCAGCCTCCCAGCTGCTCTCCGTGTCTCCAATCTCCCT
TTTGTTTTGATGCATTTCTGTTTTAATTTATTTTCCAGGCACCACTGTAGTTTAGTGATCCCCAGTGTCCCCCTTCCCTATGGGAATAATAAAAGTCTCTCTCTTAATGACACGGGCATCCAGCTCCAGCCCCAGAGCCTGGGGTGGTAGATTCCGGCTCTGAGGGCCAGTGGGGGCTGGTAGAGCAAACGCGTTCAGGGCCTGGGAGCCTGGGGTGGGGTACTGGTGGAGGGGGTCAAGGGTAATTCATTAACTCCTCTCTTTTGTTGGGGGACCCTGGTCTCTACCTCCAGCTCCACAGCAGGAGAAACAGGCTAGACATAGGGAAGGGCCATCCTGTATCTTGAGGGAGGACAGGCCCAGGTCTTTCTTAACGTATTGAGAGGTGGGAATCAGGCCCAGGTAGTTCAATGGGAGAGGGAGAGTGCTTCCCTCTGCCTAGAGACTCTGGTGGCTTCTCCAGTTGAGGAGAAACCAGAGGAAAGGGGAGGATTGGGGTCTGGGGGAGGGAACACCATTCACAAAGGCTGACGGTTCCAGTCCGAAGTCGTGGGCCCACCAGGATGCTCACCTGTCCTTGGAGAACCGCTGGGCAGGTTGAGACTGCAGAGACAGGGCTTAAGGCTGAGCCTGCAAC 5 | >2:218377101-218379124 6 | GCAATATCACAGATGATTCATCCCAAAAGGGCCCTCCAAAGGTCATCCTGGCAATGCCCTTGCCTTGCTTTGAAATGACTCTTGCCTGTTATAAATAGATGCAAATATTGAGTGGGTAGGGAGATTCTCCTATTCTTAAAGCTGTCAAGGTCAGGAGATGAATCTGCCCCCTCAGTCACCTGCTCACAGGTTCCCACGACTCTGACATTCAGGAAGCCAAGAAGAGATAAACTTCCTTTATCAGGATCCAATCTTCAACACTAGTCATTTGAAAACTATATCCAACTGTCACTTGTAGATCCTCTATCTCTCTGCATCTGCATCCGGAATACTGGCAGCAATGACGACAACAATAGCAGATCAGCATAAAACCAAAAACACATTCATAAACACACACACATACACACTACACACACACATACATACATACACACATATACACACAAACACACACATGCACACATGCATACATACATGTACACACACATGCATACACACATACACATGCATACGCCATACAAATGCATACACATACACATGCATACACACACATATACAAACATCCATGTATACACACACATACACACATGCATATACATATACACATACACACTACACACATACACACATACGTGCATACACACATACATGCATGCATACACATACACGCATACACACATCCATACATACACACAAACACACATGCATATACACACGCACACACACACAATTAGGCAGAGTCTCAGAGGAAAGAGGAAGATCTGTTGAACTGAAAAGTATCCTTAGAGAAAGAATTGGGAAACCAGAGGCAGAAAAGAATAGACTTTCATAGCCCCTGAGCACTCCAAAATATTGCAGAGTTTCCTCATCTACTTTCATTTTCTCCACCAGCCCAGCTTCACCCCTAGAGGTCAAGAAGGCTCTTTTTGATTCTGTCAAAGTCTCCCTGGCCAACCTCTTTGTATAAGAAAACAGAAAAACATTCAGACACGAACCTGACCCTTTATTCTCCTGCTTAGACTCCTTCACTGGCATCCCCTATCTAAGCCCAGCTAAGCTAAGTCACATAGATAAATAGCTACTTTACCAAGTGGAATGTATCAAGTGTGATAAGAAATGCAGGGAGGAACTCAGTGAAAGAAGAAATTAATTCCAACTGGAGAGATCCAGTTCCACTGAGAAGGTGGTATTATACAGAGCCCCAGAGCAGGCATAGAGGTAGAAAGACTCCAAAGTGCAATGTTCTAAGAAACAGTGAGTGTGCTGGGCTGTAATGAAGAATGCATATCGTGAGGTGGCAGGATATATTGTCATCAGAGAGTAGGCTGGGG
CCATGTCAGGGAACACCTGAAATGTAAAATTTCCTGTATCCTTGAATCACTTTTTTGGCTTCACTTTTCTTTTTCTTTTTTTTTTTTTTTTTGAGACGGAGTCTTGCTTTGTCGCCCAGGCTGGAGTACAGTGGCACGATCTCAGCTCACTGCAACCTCCGCCTCTCAGGTTCAAGCAATTCTCCTGTCTCAGCCTCCTGAGTAGCTGGGGATTACAGGCGCACACCACCACGCCTGGCTAATTTTTGTATTTTTTTAGTAGAGACAGGGTTTCACCAGGTTGGCCAGGCTGGTTCCGAACTCATGACCTCGTGATCTGCCTGCCTCGGCCTCCCAAAGTGCTGAGATTACAGGCGTGAGCCACCATGCTCAGCCTTGGCTTCACTTTTCTAATGTCTCCACATTTGGGATCCAGAATGGTTATCATATACCACAAGCCTGGTCATGTTTGATGCATCAAGATTTATAGCTTTTAAAGTGCTTTGAATTATAGACTAATAGATTAATAGATATTAATTAATAGATAAAAATAACAAAAATAGTCCACCTTTGGAAGTAATATGCTTAATGTGCTCTATGTAACCTTGGAGTGAACTGTTTCTGATAACTTCTTGTGGCCCAAAATGTTATCCTGCCCTAAAGAAATACTAACTGGCTCAGTTGAAGATGGGCATGGTAGTTTCCATGTGAGG 7 | >3:197899267-197901348 8 | ACGGATTGCTTTGTGTACTTTGGGAAACTTAACAATGTGGTCTACAAATCCACAAATAAGATACATTTTTACATTTATTGGAAGTTTAATTTCCTTAAGTAATGTCTTATAATTTCCCTCATCTAAGTCTTGTCGTTTCATTCCATTTATTCCTAAGTATAATATTGCTATTGGTATTATTTAAGGTAGAATTTTCATAATTTGGTTTAGAGATTATTCATTCCTAGCATATACATATAAAATGGAATGTTTGGCCAGGCACCCGGGCTCATACCTGTAACCCAAGCAGGTTGAGAGGCTGAGGAAGGGTTAGGGTTAGGGTTGGGGTTGGGGTTGGGGTTAGGCTTAGGGCTTAGGGCTAGGGCTAGGGCTAGGGCTAGAGTTAGGGTTGGGTTAGGGTTGGGTTAGGGTAGGGTTAGGGTTAGGGGTTAGGGGTTAGGGTTCGGGTTCGGGTTTGGGTTATGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGGTTCGGGTTAGGGTTCAGGTTAGGGTTCTGGTTGGGTTTAGTGTTAGGGTTTAGGGTTCGGGTTTGGGTTAGGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAAGGATTAAGGGTTAAGGGTCAGGGTCAGGGGTTTGGGTCAAGGGTTAGGGTTAGGGGTTAAGAGTTAGGGGTTAGGGATTATGGTTTGGGTGAGGGGTGAGGGGTGAGGGTGAGGGTTAGGGTTAGCGTTTTAGGGTTATGGTTAGGGTTAAGGGTTAGGGTTAGGGGTTAGGGGTTAAGGGTTAGGGGTAGGATAAGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTAAGGATTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGGTTCGGGTTAGGGTTCAGGTTAGGGTTCTGGTTGGGTTTAGTGTTAGGGTTTAGGGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGGTTCGGGTTAGGGTTCAGGTTAGGTTTCTGGTTGGGTTTAGTGTTAGGGTTTAGGGTTCGGGTTTGGGTTAGGGGTTAGGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTT
AGGGTTAGGGGTTAGGGTTAAGGGTTAAGGGTCAGGGTCAGGGTCAGGGGTTTGGGTCAAGGGTTAGGGTCAAGTGTTAGGGTTAGGGGTTAAGAGTTAAGGCTTAGGGATTATGGTTTGGGTGAGGGGTGAGGGGTGAGGGTGAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGATGTGGGTGAGGGTGAGGATGAAGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGTTTAGGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAAGGGTTAAGGGTCAGGGTCAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGTTTAGGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAAGGGTTAAGGGTCAGGGTCAGGGGTTTGGGTCAAGGGTTAGGGTCAAGGGTTAGGGTTAGGGGTTAAGAGTTAGGGGTTAGGGATTATGGTTTGGGTGAGGGGTGAGGGGTGAGGGTGAGGGTTAGGGTTAGCGTTTTAGGGTTATGGTTTGGGTTAAGGGTTAGGGTTAGGGGTTAGGGGTTAAGGGTTAGGGGTAGGATAAGGGTAAGGATTAGGGTTAGGGTCAGGGTAAGGGTAAGGGTAAGGATTAGGGTTAGGATTAGGGTTAGGGTTAGGGTTAGGGTTTTAGGGTTAGGGT 9 | >6:9117792-9119815 10 | AATTCAAACCCTGATTCTGTCATTCAGTTATGTAAACACAAGCAAGTAAATTCACCTCTCTGAAATTTAGTGTTTATCAGTAAAGTGGAAAAAACACTTAACTACAGTATTAAAAGAGAATTCCATAAGGTAACTAGCAGACAGTGCTCAGTATCACATTTGGCATTATTTCATATTCATTCAATTCAATCCTATGCTAATATAGCTGAAATGTCCTCATGCCGGACTGTTTGTAATGAGTAGAAGAATAATCCTACATAAAGCTCCCTACGTCTCTGTTCAATTTAATTCAATTTGACCAAGATGTATTGAGCAGCTACTACATGCAAGGCTCACCGACATAGTGATGCCAACATGAGCTACATTTTATTGAGTACTTACTCTGCTTCAGGAACTCGCTAAGAGTTTCATTACCTTATATACTTTGATTCTTACAACAACATGAGAGAGGGGTGTTGTTCAAGATGTTTCACAAGTAGCATAGTTCTGCCCGTAGTACATTGCCTGTATGGGAAGCTCAAAGGGCCTATTTTACAGAAGAGAAAATTAAGGATTTGAGATATTAAAGGAGCTCATCTTTGCTATAGCAAGTGGCCAAACTTTTAGGGCTGTTTCATGAGACTGTAATTCATGGGCTACCATTGTGAATCAAGGTAAGTCCTTCCCATCGCTGTGGCTCTCAGTTCTAATTCTCTTGTTTGGGGTTCTGTTATATAGCAAGTGAATACAGTTTTCATAAGTCAGTGCATCCAGAAAGCTTTGTTTCAGATGTTCCAAGCCACTGGTTTCTCAGTCAGAAATATTTATTGCCTCTTTTCATTGCCATTAAAGGTTCAAGCTCAGAATGCTTTCTAACACCTGTTAATATTAATACAATTTCAGTAGTAGCATTAAGGAATGCCAGTTCTGGGTTGTGGGGGTCCCTAGGCCCACACCAGCAATGCTCTCGTCTTCCTGCAGAGGTTCTGCCAGTGCCTCAAGAATGTCCTCAGAGTTCTGTCCATTCTTCTTCTGCTCAGACGTTTTGTCTGAAAGTATTTTCCCAGGCCAGAGCTGGACTCTGTATAGAGTACAGAGGAAGGGCCAAGGTATGACTTTACTCCCACCACTTATATCTGAGAGCACAGGCACAGAGGGCAGTTTGTCACAAATCTGAGTGTGCCCGA
GCTACATTCATGAATAATAAATAGGCTTCATATTGCCTCACATTCCCATTCCCATCCCAAGTATGAGGCTGGCTTTGTTGACTGCAAATGAAAACTGCTTATTTATAGGGAGGAAGATGGGAGTGGAAGGGCAGATTTTAGCGGGTTGGAAATCATATTAATAAATAAGCTGAGAAATATTTGACTTGGGTAAATGCTCAGGTAGAATTTGAACATTGAGAATCATGCATTCATCTGTGTGTAAGAAATAAGAGTCAAGGATAATCCTATAGCTTAATCCCCCAAAATATGATCCACAGAGCATCCTGTGTATTTCAGAATGCAACAGGAATGTGTGCGTATGCTCTTGGATTTCGACGAATGACTTCGAGCCCTGGGATCTGTTGCCTTGTGAACTTTTGCAGCCTGTCAAACCACAGTCTAGTAGTAAGAGAAACCGTCCTCTGCCTGGTCCAAAGATGCCAATGTGGATAGTTTGTCGTCCTTGAAATGTGGTGTGACCCTCACAGCTACCACGTTTCTGAAGGCATCAGAAAAATCAACCATACTTAAGAGTTAAAATACCGCACGTATCATACATTTTTAAAAGTCAGAGTAACAAGAGTATATGGAATAAGAAGATGAAGCAATCCACTAATCACCATTTTAATCTCACTGTCAAGTCAGCTTTATATATATTTTGCCTACATAGTTTCTTAGGAGTAGTGTCTCCTTGTAAATGTGCTGATTGCCTCTTAATTAAGTGATCTTATTTATAATACTAAAGAAACTCAGTAGAGACAAAAATGAGAATCACATTTTAAACTATTAGACATAAGGTCAAAAATAAAGGCTTATTGCAAGGCAGCATTATCATC 11 | >15:44108746-44110769 12 | GCACTGTTAATCTTCAAGTGGCAAAGTTATTAGGACATAAATACTTATTAAGTTACTTGGCTCTAACTACTTAATAACTAAAATAAACAGAACTGTTTTGGCCCAGGAGCTCTGTGAAAAAACTCCTGGGACAATTATGAACGGTGAAAGTTCAGGACTCCTGGCCTACTCTATAAAGTAATGTCTCCATCAATATATAACATACAAAATCCCTCACTGGGAGGCTCTCTTTCTACCATGAGACCTTTATAAGGTCTTTTCTCCTACTTCAGTGTCATTTCCCCTTATTAGCATTTAGAATAAGTAGATGTATAAAATCAATTTACCAAAGTTTATAAAACATCTTTGTGCTAAAAAGACTCCTCTCAACAGCCAACAAATGGTCTGTGTTTTCTATTCCCATAAGGAGTTGAGTAGGTCCTTTAGGCCAGAAGTAACCTGCTGTTTCCTGAAGCTGCCACTTTGTAGCTTTGGTCATGTAACTTTCTTAGCCCCTTTGTGCTCTTAGTTTCCTCCTTTTGTGAAATGGGGATAATAGCACCTATTTCATAAAGTTATTGTATGGATTACAAGAATTAGCAAATGTAATCTACTTGGTAGTACTATTCAAGTATTAGCTATTATTATTATTATTAAGCTGTAGAGGTTCATGTTCAAGTAAACACAATCACAGACATTACCTCTCTTCCACATCTTCACTAATACGGTTCTGTAAACGCCGTAGCCGGGGGTCACTGGATGAATCCTCCTCCTGTTCCTCAGGCTCTGCTTCTTGTTCTTTGGCTTTCTTAATGAACTGAAATTCTTCATCCTCCTCATCTGAGGACTCCATAGGGGCATAGTCTGGCCTTTTTCCGGACACATAACGCTTTACCTTCACTTTTTCCATTGAAATCTCACCTGGGCGAGAAAGGTAACTTATGTTTCAGTAGCCTCTTTCTCAATGTGCTTCAACCCATCACGGCCTTTGCAAATAGAGCCCTTTATTCATAGTAGACAAGAGTCTAAGCAGAAGAAGAAGAGAGCCACTACCCAACCATCTACTCTTCTAATGGTGTTTTCCTACAAAGGCCAAGTCATGAGACTGCATCCTTGTGAAAGCCAACACTG
ATGATAATGAGGCTTACCTTGAGTACAATGAAGTAGAGGAAGGTAGGCAGTGAAACAGTAGAAAAAAGTCCCCCCCCAAAAAGGCAGACTGCATCCATCACAAATTCATGGTATCCCACCTCAACTATACCCTTAAACAAATTATTTGTAACAGTGCCCAGCACATAGTAAGGGTAATTTCTGCAGGAACATAAAACTGCTCAGGCATTCTTGTAGTTCCCTCTGGAATTTCCATGGCAGCCTTTATAACATACTGCCACATGACTCAATATTCTAATCTTACCTATTTCTACCTTCTTTTCCTTTACCTTATGTGTTATCAAACATGCAAGAGTATGTAATCTCTTACAAATACAATTCTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGTGGTCTCGGCTCACTGCAACCTCCGCCTCCCGGGTTCAAGTGATTCTTCTGCCTCAGCTTCCTGAGTAGCTGGGACTACAGACGCATGCCACCATGCCCAGCTAATTTTTTGTATTTTTAGTAGAGATGGGGTTTCACCGTATTAGCCATGATGGTCTCGATCTCCTGACCTCATGATCCGCCCACCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACAGCACCCAGCCTACAAATACAAACAATTCTAGTGAACAAAGACAGTGAAAACTAACCAACTTATTTTATGAGTATAACCTTGATAACCAAATTGGACAGTTCAAAGGAAAACTTTAGGTGGATTTCACTTACAAACACAAATACAAAAATACTAAATAACGTAAGAGCAAATTTATCCCAGCAATATAAAAACTACCAAAAAAAAAAAAAGAAACAAAAAAAAAAAACCCCTAGACCAACACAACTTATTAGAATTTAGCCCAA 13 | -------------------------------------------------------------------------------- /test/test_guideseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_guideseq 6 | ---------------------------------- 7 | 8 | Tests for `guideseq` module. 
9 | """ 10 | 11 | import yaml 12 | import unittest 13 | import os 14 | import shutil 15 | import utils 16 | from guideseq import guideseq 17 | 18 | TEST_SAMPLE_BARCODES = {'AGGCATGAGATCGC': 'mysample', 'GACTCCTGCGATAT': 'sample2'} 19 | TEST_UNDEMULTIPLEXED_FILES = {'forward': 'data/undemultiplexed/undemux.r1.fastq.gz', 20 | 'reverse': 'data/undemultiplexed/undemux.r2.fastq.gz', 21 | 'index1': 'data/undemultiplexed/undemux.i1.fastq.gz', 22 | 'index2': 'data/undemultiplexed/undemux.i2.fastq.gz'} 23 | TEST_DEMULTIPLEXED_FILES = {'read1': 'data/demultiplexed/EMX1.r1.fastq', 24 | 'read2': 'data/demultiplexed/EMX1.r2.fastq', 25 | 'index1': 'data/demultiplexed/EMX1.i1.fastq', 26 | 'index2': 'data/demultiplexed/EMX1.i2.fastq'} 27 | TEST_SAMPLES = { 28 | 'control':{ 29 | 'barcode1':'CTCTCTAC', 30 | 'description':'Control', 31 | 'barcode2':'CTCTCTAT', 32 | 'target':None 33 | }, 34 | 'EMX1':{ 35 | 'barcode1':'TAGGCATG', 36 | 'description':'EMX_site1', 37 | 'barcode2':'TAGATCGC', 38 | 'target':'GAGTCCGAGCAGAAGAAGAANGG' 39 | } 40 | } 41 | 42 | TEST_SAMPLE_NAME = 'EMX1' 43 | TEST_OUTPUT_PATH = 'test_output' 44 | TEST_MIN_READS = 1000 45 | TEST_DEMULTIPLEX_MANIFEST_PATH = os.path.join(TEST_OUTPUT_PATH, 'demultiplex_manifest.yaml') 46 | TEST_MANIFEST_PATH = os.path.join(TEST_OUTPUT_PATH, 'test_manifest.yaml') 47 | 48 | TEST_BWA_PATH = 'bwa' 49 | TEST_BEDTOOLS_PATH = 'bedtools' 50 | 51 | TEST_REFERENCE_GENOME = 'test_genome.fa' 52 | 53 | CORRECT_DEMULTIPLEXED_OUTPUT = 'data/demultiplexed' 54 | CORRECT_UMITAGGED_OUTPUT = 'data/umitagged' 55 | CORRECT_CONSOLDIATED_OUTPUT = 'data/consolidated' 56 | CORRECT_ALIGNED_OUTPUT = 'data/aligned' 57 | CORRECT_IDENTIFIED_OUTPUT = 'data/identified' 58 | CORRECT_FILTERED_OUTPUT = 'data/filtered' 59 | 60 | CORRECT_ALL_OUTPUT = 'data' 61 | 62 | class FullPipelineTestCase(unittest.TestCase): 63 | 64 | def setUp(self): 65 | # Create the test output folder 66 | os.makedirs(TEST_OUTPUT_PATH) 67 | 68 | # Create the test demultiplexing YAML 69 | 
test_manifest_data = {} 70 | test_manifest_data['undemultiplexed'] = TEST_UNDEMULTIPLEXED_FILES 71 | test_manifest_data['demultiplex_min_reads'] = TEST_MIN_READS 72 | test_manifest_data['samples'] = TEST_SAMPLES 73 | test_manifest_data['output_folder'] = TEST_OUTPUT_PATH 74 | test_manifest_data['bwa'] = TEST_BWA_PATH 75 | test_manifest_data['bedtools'] = TEST_BEDTOOLS_PATH 76 | test_manifest_data['reference_genome'] = TEST_REFERENCE_GENOME 77 | 78 | with open(TEST_MANIFEST_PATH, 'w') as f: 79 | f.write(yaml.dump(test_manifest_data, default_flow_style=False)) 80 | 81 | 82 | def testFullPipeline(self): 83 | g = guideseq.GuideSeq() 84 | g.parseManifest(TEST_MANIFEST_PATH) 85 | 86 | # Demultiplex and test the demultiplex output 87 | g.demultiplex() 88 | self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'demultiplexed'), CORRECT_DEMULTIPLEXED_OUTPUT)) 89 | 90 | # UMITag and test the umitagging output 91 | g.umitag() 92 | self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'umitagged'), CORRECT_UMITAGGED_OUTPUT)) 93 | 94 | # Consolidate and test the consolidation output 95 | g.consolidate() 96 | self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'consolidated'), CORRECT_CONSOLDIATED_OUTPUT)) 97 | 98 | # Align and test the alignment output 99 | g.alignReads() 100 | self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'aligned'), CORRECT_ALIGNED_OUTPUT)) 101 | 102 | # Identify offtargets and test the output 103 | g.identifyOfftargetSites() 104 | self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'identified'), CORRECT_IDENTIFIED_OUTPUT)) 105 | 106 | # Filter background sites and test if correct 107 | g.filterBackgroundSites() 108 | self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'filtered'), CORRECT_FILTERED_OUTPUT)) 109 | 110 | 111 | def tearDown(self): 112 | # Delete temp output 113 | #shutil.rmtree(TEST_OUTPUT_PATH) 114 | pass 115 | 116 | 
if __name__ == '__main__': 117 | unittest.main() -------------------------------------------------------------------------------- /test/test_manifest.yaml: -------------------------------------------------------------------------------- 1 | reference_genome: test_genome.fa 2 | output_folder: output 3 | 4 | bwa: bwa 5 | bedtools: bedtools 6 | 7 | demultiplex_min_reads: 1000 8 | window_size: 25 9 | max_mismatches: 7 10 | 11 | undemultiplexed: 12 | forward: data/undemultiplexed/undemux.r1.fastq.gz 13 | reverse: data/undemultiplexed/undemux.r2.fastq.gz 14 | index1: data/undemultiplexed/undemux.i1.fastq.gz 15 | index2: data/undemultiplexed/undemux.i2.fastq.gz 16 | 17 | samples: 18 | control: 19 | target: 20 | barcode1: CTCTCTAC 21 | barcode2: CTCTCTAT 22 | description: Control 23 | 24 | EMX1: 25 | target: GAGTCCGAGCAGAAGAAGAANGG 26 | barcode1: TAGGCATG 27 | barcode2: TAGATCGC 28 | description: EMX_site1 29 | -------------------------------------------------------------------------------- /test/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import inspect 4 | import filecmp 5 | from itertools import islice 6 | 7 | def checkFolderEquality(folder1, folder2): 8 | """ 9 | Given two folders, check if there are the same number of files, 10 | that the names of files are the same, and that the files with the same 11 | names are the same. 12 | """ 13 | 14 | folder1_files = [x for x in os.listdir(folder1) if not x.startswith('.')] 15 | folder2_files = [x for x in os.listdir(folder2) if not x.startswith('.')] 16 | 17 | if set(folder1_files) != set(folder2_files): 18 | print 'Folders do not have the same filenames.' 
19 | return False 20 | 21 | for f in folder1_files: 22 | file1 = os.path.join(folder1, f) 23 | file2 = os.path.join(folder2, f) 24 | 25 | if f.split('.')[-1] == 'sam': 26 | with open(file1, 'r') as a, open(file2, 'r') as b: 27 | for line1, line2 in zip(a,b): 28 | if line1.startswith('@'): 29 | continue 30 | elif line1 != line2: 31 | return False 32 | else: 33 | if not filecmp.cmp(file1, file2): 34 | print '{0} does not match between folders.'.format(f) 35 | return False 36 | 37 | return True 38 | 39 | 40 | def head(filepath, n=10): 41 | with open(filepath) as f: 42 | for line in islice(f, n): 43 | print line 44 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py27 8 | skipsdist = false 9 | 10 | [testenv] 11 | deps = -rrequirements.txt 12 | commands = nosetests -w test/ --exe -v 13 | whitelist_externals=* --------------------------------------------------------------------------------