├── .DS_Store
├── .gitignore
├── .gitmodules
├── .travis.yml
├── AUTHORS.rst
├── EMX1_visualization.png
├── LICENSE
├── MANIFEST
├── MANIFEST.in
├── Makefile
├── README.md
├── conda-build
    ├── meta.yaml
    ├── py2_all
    │   ├── linux-32
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   ├── linux-aarch64
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   ├── linux-armv6l
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   ├── linux-armv7l
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   ├── linux-ppc64le
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   ├── osx-64
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   ├── win-32
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    │   └── win-64
    │   │   └── guide_seq-1.0.2-py27_0.tar.bz2
    ├── py35
    │   ├── linux-32
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   ├── linux-aarch64
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   ├── linux-armv6l
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   ├── linux-armv7l
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   ├── linux-ppc64le
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   ├── osx-64
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   ├── win-32
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    │   └── win-64
    │   │   └── guide_seq-1.0.2-py35_0.tar.bz2
    └── py3_all
    │   ├── linux-32
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   ├── linux-aarch64
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   ├── linux-armv6l
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   ├── linux-armv7l
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   ├── linux-ppc64le
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   ├── osx-64
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   ├── win-32
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
    │   └── win-64
    │       └── guide_seq-1.0.2-py37_0.tar.bz2
├── guideseq
    ├── #guideseq_visualize_only.py#
    ├── NUC_SIMPLE
    ├── __init__.py
    ├── alignReads.py
    ├── filterBackgroundSites.py
    ├── guideseq.py
    ├── guideseq_visualize_only.py
    ├── guideseq_visualize_only.py~
    ├── identifyOfftargetSites.py
    ├── log.py
    ├── validation.py
    ├── visualization.py
    ├── visualization2.py
    └── visualization_bk.py
├── guideseq_flowchart.png
├── requirements.txt
├── setup.cfg
├── setup.py
├── test
    ├── __init__.py
    ├── data
    │   ├── aligned
    │   │   ├── EMX1.sam
    │   │   └── control.sam
    │   ├── consolidated
    │   │   ├── EMX1.r1.consolidated.fastq
    │   │   ├── EMX1.r2.consolidated.fastq
    │   │   ├── control.r1.consolidated.fastq
    │   │   └── control.r2.consolidated.fastq
    │   ├── demultiplexed
    │   │   ├── EMX1.i1.fastq
    │   │   ├── EMX1.i2.fastq
    │   │   ├── EMX1.r1.fastq
    │   │   ├── EMX1.r2.fastq
    │   │   ├── control.i1.fastq
    │   │   ├── control.i2.fastq
    │   │   ├── control.r1.fastq
    │   │   ├── control.r2.fastq
    │   │   ├── undetermined.i1.fastq
    │   │   ├── undetermined.i2.fastq
    │   │   ├── undetermined.r1.fastq
    │   │   └── undetermined.r2.fastq
    │   ├── filtered
    │   │   └── EMX1_backgroundFiltered.txt
    │   ├── identified
    │   │   ├── EMX1_identifiedOfftargets.txt
    │   │   └── control_identifiedOfftargets.txt
    │   ├── umitagged
    │   │   ├── EMX1.r1.umitagged.fastq
    │   │   ├── EMX1.r2.umitagged.fastq
    │   │   ├── control.r1.umitagged.fastq
    │   │   └── control.r2.umitagged.fastq
    │   ├── undemultiplexed
    │   │   ├── undemux.i1.fastq.gz
    │   │   ├── undemux.i2.fastq.gz
    │   │   ├── undemux.r1.fastq.gz
    │   │   └── undemux.r2.fastq.gz
    │   └── visualization
    │   │   └── EMX1_identifiedOfftargets_offtargets.svg
    ├── demultiplex_manifest.yaml
    ├── large_test.sh
    ├── large_test
    │   └── reference_output
    │   │   ├── EMX1_backgroundFiltered.txt
    │   │   ├── VEGFA_site1_backgroundFiltered.txt
    │   │   ├── VEGFA_site2_backgroundFiltered.txt
    │   │   ├── VEGFA_site3_backgroundFiltered.txt
    │   │   └── md5.txt
    ├── scripts
    │   ├── bwa_index_alignment.sh
    │   ├── compile_dependencies.sh
    │   ├── prepare_test_data.sh
    │   ├── prepare_test_genome.sh
    │   ├── prepare_test_genome_index.sh
    │   ├── samplekey.txt
    │   └── test_regions.bed
    ├── test_genome.fa
    ├── test_guideseq.py
    ├── test_manifest.yaml
    └── utils.py
└── tox.ini


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build/
 2 | dist/
 3 | guideseq.egg-info/
 4 | 
 5 | *.py[cod]
 6 | test/output
 7 | .genome
 8 | 
 9 | # Genome indices
10 | *.amb
11 | *.ann
12 | *.bwt
13 | *.fai
14 | *.pac
15 | *.sa
16 | 
17 | # C extensions
18 | *.so
19 | 
20 | # Idea folder
21 | .idea
22 | 
23 | # Packages
24 | *.egg
25 | *.egg-info
26 | dist
27 | build
28 | eggs
29 | parts
30 | bin
31 | var
32 | sdist
33 | develop-eggs
34 | .installed.cfg
35 | lib64
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | 
40 | # Unit test / coverage reports
41 | .coverage
42 | .tox
43 | nosetests.xml
44 | htmlcov
45 | 
46 | # Translations
47 | *.mo
48 | 
49 | # Mr Developer
50 | .mr.developer.cfg
51 | .project
52 | .pydevproject
53 | 
54 | # Complexity
55 | output/*.html
56 | output/*/index.html
57 | 
58 | # Sphinx
59 | docs/_build
60 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "guideseq/umi"]
2 | 	path = guideseq/umi
3 | 	url = https://github.com/aryeelab/umi.git
4 |     branch = master
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Config file for automatic testing at travis-ci.org
 2 | 
 3 | language: python
 4 | 
 5 | python:
 6 |   - "2.7"
 7 | 
 8 | before_install:
 9 |   - cd test
10 |   - git clone https://github.com/lh3/bwa.git
11 |   - cd bwa
12 |   - git checkout tags/0.7.9a
13 |   - make
14 |   - cd ..
15 |   - PATH=`pwd`/bwa:$PATH
16 |   - git clone https://github.com/arq5x/bedtools2.git
17 |   - cd bedtools2
18 |   - git checkout tags/v2.25.0
19 |   - make
20 |   - cd ..
21 |   - PATH=`pwd`/bedtools2/bin:$PATH
22 |   - cd ..
23 | 
24 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
25 | install: 
26 |   - pip install -r requirements.txt
27 | 
28 | # command to run tests, e.g. python setup.py test
29 | script:
30 |     cd test && nosetests --exe -v
31 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | Development Leads
 6 | -----------------
 7 | 
 8 | * Shengdar Q Tsai <STSAI4@mgh.harvard.edu>
 9 | * Martin Aryee <Aryee.Martin@mgh.harvard.edu>
10 | * Ved V Topkar <vedtopkar@gmail.com>
11 | 
12 | Contributors
13 | ------------
14 | 
15 | None yet. Why not be the first?
16 | 


--------------------------------------------------------------------------------
/EMX1_visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/EMX1_visualization.png


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
 1 | # file GENERATED by distutils, do NOT edit
 2 | AUTHORS.rst
 3 | LICENSE
 4 | requirements.txt
 5 | setup.cfg
 6 | setup.py
 7 | guideseq/__init__.py
 8 | guideseq/alignReads.py
 9 | guideseq/filterBackgroundSites.py
10 | guideseq/guideseq.py
11 | guideseq/identifyOfftargetSites.py
12 | guideseq/log.py
13 | guideseq/validation.py
14 | guideseq/visualization.py
15 | test/test_guideseq.py
16 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.rst
 2 | include LICENSE
 3 | include README.rst
 4 | include requirements.txt
 5 | 
 6 | recursive-include tests *
 7 | recursive-exclude * __pycache__
 8 | recursive-exclude * *.py[co]
 9 | 
10 | recursive-include docs *.rst conf.py Makefile make.bat
11 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean-pyc clean-build docs clean
 2 | 
 3 | help:
 4 | 	@echo "clean - remove all build, test, coverage and Python artifacts"
 5 | 	@echo "clean-build - remove build artifacts"
 6 | 	@echo "clean-pyc - remove Python file artifacts"
 7 | 	@echo "clean-test - remove test and coverage artifacts"
 8 | 	@echo "test - run tests quickly with the default Python"
 9 | 	@echo "docs - generate Sphinx HTML documentation, including API docs"
10 | 	@echo "install - install the package to the active Python's site-packages"
11 | 
12 | clean: clean-build clean-pyc clean-test
13 | 
14 | clean-build:
15 | 	rm -fr build/
16 | 	rm -fr dist/
17 | 	rm -fr .eggs/
18 | 	find . -name '*.egg-info' -exec rm -fr {} +
19 | 	find . -name '*.egg' -exec rm -f {} +
20 | 
21 | clean-pyc:
22 | 	find . -name '*.pyc' -exec rm -f {} +
23 | 	find . -name '*.pyo' -exec rm -f {} +
24 | 	find . -name '*~' -exec rm -f {} +
25 | 	find . -name '__pycache__' -exec rm -fr {} +
26 | 
27 | clean-test:
28 | 	rm -fr .tox/
29 | 	rm -f .coverage
30 | 	rm -fr htmlcov/
31 | 
32 | test:
33 | 	python setup.py test
34 | 
35 | docs:
36 | 	rm -f docs/guideseq.rst
37 | 	rm -f docs/modules.rst
38 | 	sphinx-apidoc -o docs/ guideseq
39 | 	$(MAKE) -C docs clean
40 | 	$(MAKE) -C docs html
41 | 	open docs/_build/html/index.html
42 | 
43 | install: clean
44 | 	python setup.py install
45 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | [![Version][version-shield]][version-url]
  3 | [![Python versions][python-shield]][python-url]
  4 | [![Platforms][platform-shield]][python-url]
  5 | 
  6 | # guideseq: The GUIDE-Seq Analysis Package
  7 | 
  8 | **Note that an updated version of this package, including Python 3 support, is maintained by the Tsai Lab: https://github.com/tsailabSJ/guideseq**
  9 | 
 10 | This repo (aryeelab/guideseq) contains experimental features.
 11 | 
 12 | ------
 13 | 
 14 | The guideseq package implements our data preprocessing and analysis pipeline for GUIDE-Seq data. It takes raw sequencing reads (FASTQ) and a parameter manifest file (.yaml) as input and produces a table of annotated off-target sites as output.
 15 | 
 16 | ### References
 17 | 
 18 | ##### The original paper describing the GUIDE-Seq method:
 19 | 
 20 | Tsai SQ, Zheng Z, Nguyen NT, Liebers M, Topkar VV, Thapar V, Wyvekens N, Khayter C, Iafrate AJ, Le LP, Aryee MJ, Joung JK. [GUIDE-seq enables genome-wide profiling of off-target cleavage by CRISPR-Cas nucleases](https://www.ncbi.nlm.nih.gov/pubmed/25513782). Nat Biotechnol. 2015 Feb;33(2):187-197
 21 | 
 22 | ##### A description of this analysis package:
 23 | Tsai SQ, Topkar VV, Joung JK, Aryee MJ. [Open-source guideseq software for analysis of GUIDE-seq data](https://www.ncbi.nlm.nih.gov/pubmed/27153277). Nat Biotechnol. 2016 May 6;34(5):483 
 24 | 
 25 | ## Table of Contents
 26 | - [Features](#features)
 27 | - [Dependencies](#dependencies)
 28 | - [Getting Set Up](#setup)
 29 | 	- [Installation](#Installation)
 30 | 	- [Quickstart](#Quickstart)
 31 | - [Running the Full Analysis Pipeline](#full_pipeline)
 32 | 	- [Quickstart](#quickstart)
 33 | 	- [Writing A Manifest File](#write_manifest)
 34 | 	- [A Full Manifest File Example](#manifest_example)
 35 | 	- [Pipeline Outputs](#pipeline_output)
 36 | - [Running Analysis Steps Individually](#individual_steps)
 37 | 	- [Demultiplex](#demultiplex)
 38 | 	- [UMItag](#umitag)
 39 | 	- [Consolidate](#consolidate)
 40 | 	- [Align](#align)
 41 | 	- [Identify](#identify)
 42 | 	- [Filter](#filter)
 43 | 	- [Visualize](#visualize)
 44 | - [Frequently Asked Questions](#FAQ)
 45 | 	- [How do I Run the Pipeline with Demultiplexed Data?](#demultiplexed_run)
 46 | 	- [Can I analyze data without UMIs?](#no_umis)
 47 | 
 48 | 
 49 | ## Features<a name="features"></a>
 50 | 
 51 | 
 52 | The package implements a pipeline consisting of a read preprocessing module followed by an off-target identification module. The preprocessing module takes raw reads (FASTQ) from a pooled multi-sample sequencing run as input. Reads are demultiplexed into sample-specific FASTQs and PCR duplicates are removed using unique molecular index (UMI) barcode information.
 53 | 
 54 | ![guideseq_flowchart](guideseq_flowchart.png)
 55 | 
 56 | The individual pipeline steps are:
 57 | 
 58 | 1. **Sample demultiplexing**: A pooled multi-sample sequencing run is demultiplexed into sample-specific read files based on sample-specific dual-indexed barcodes
 59 | 2. **PCR Duplicate Consolidation**: Reads that share the same UMI and the same first six bases of genomic sequence are presumed to originate from the same pre-PCR molecule and are thus consolidated into a single consensus read to improve quantitative interpretation of GUIDE-Seq read counts.
 60 | 3. **Read Alignment**: The demultiplexed, consolidated paired end reads are aligned to a reference genome using the BWA-MEM algorithm with default parameters (Li. H, 2009).
 61 | 4. **Candidate Site Identification**: The start mapping positions of the read amplified with the tag-specific primer (second of pair) are tabulated on a genome-wide basis. Start mapping positions are consolidated using a 10-bp sliding window. Windows with reads mapping to both + and - strands, or to the same strand but amplified with both forward and reverse tag-specific primers, are flagged as sites of potential DSBs. 25 bp of reference sequence is retrieved on either side of the most frequently occurring start-mapping position in each flagged window. The retrieved sequence is aligned to the intended target sequence using a Smith-Waterman local-alignment algorithm. 
 62 | 5. **False positive filtering**: Off-target cleavage sites with more than six mismatches to the intended target sequence, or that are present in background controls, are filtered out.
 63 | 6. **Reporting**: Identified off-targets, sorted by GUIDE-Seq read count, are annotated in a final output table. The GUIDE-Seq read count is expected to scale approximately linearly with cleavage rates (Tsai et al., *Nat Biotechnol.* 2015).
 64 | 7. **Visualization**: Alignment of detected off-target sites is visualized via a color-coded sequence grid, as seen below:
 65 | 
 66 | ![guideseq_flowchart](EMX1_visualization.png)
 67 | 
 68 | ## Dependencies<a name="dependencies"></a>
 69 | * Python 2 or 3
 70 | * Reference genome fasta file ([Example](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta))
 71 | * [`bwa`](<http://bio-bwa.sourceforge.net/>) alignment tool
 72 | * [`bedtools`](<http://bedtools.readthedocs.org/en/latest/>) genome arithmetic utility
 73 | 
 74 | 
 75 | ## Getting Set Up<a name="setup"></a>
 76 | 
 77 | ### Installation<a name="Installation"></a>
 78 | 
 79 | ```
 80 | # It's recommended (but not essential) to set up a conda environment to manage dependencies
 81 | conda create -n guideseq python=3.8
 82 | conda activate guideseq
 83 | 
 84 | git clone --recursive https://github.com/aryeelab/guideseq
 85 | cd guideseq
 86 | 
 87 | pip install -r requirements.txt
 88 | python setup.py install
 89 | 
 90 | guideseq.py -h
 91 | 
 92 | ## Please install BWA and bedtools if you choose this option
 93 | 
 94 | ```
 95 | 
 96 | - **Burrows-Wheeler Aligner (bwa)**: You can either install bwa with a package manager (e.g. `brew` on OSX or `apt-get` on Ubuntu/Debian), or you can download it from the [project page](http://bio-bwa.sourceforge.net/) and compile it from source.
 97 | - **Bedtools**: You can either install bedtools with a package manager (e.g. `brew` or `apt-get`), or you can download it from the [project page](http://bedtools.readthedocs.org/en/latest/content/installation.html) and compile it from source.
 98 | 
 99 | For both bwa and bedtools, make sure you know the path to the respective executables, as they need to be specified in the pipeline manifest file.
100 | 
101 | 
102 | 
103 | ## Quickstart <a name="Quickstart"></a>
104 | 
105 | ```
106 | guideseq.py all -m test_manifest.yaml
107 | ```
108 | 
109 | ## Running the Full Analysis Pipeline<a name="full_pipeline"></a>
110 | 
111 | 
112 | To run the full guideseq analysis pipeline, you must first create a manifest YAML file that describes all pipeline inputs. Once you have done so, you can simply run
113 | 
114 | ```
115 | guideseq.py all -m /path/to/manifest.yaml
116 | ```
117 | 
118 | to run the entire pipeline. Below are specific instructions detailing how to write the manifest file.
119 | 
120 | If you wish to run an example on our abridged test data, you can simply run
121 | 
122 | ```
123 | 
124 | cd guideseq/test
125 | 
126 | 
127 | guideseq.py all -m test_manifest.yaml
128 | ```
129 | from the guideseq root directory. The `test_manifest` assumes that both the `bwa` and `bedtools` executables are in your system PATH. You will see the pipeline results outputted to the `test/output` folder.
130 | 
131 | ### Writing A Manifest File<a name="write_manifest"></a>
132 | When running the end-to-end analysis functionality of the guideseq package, a number of inputs are required. To simplify the formatting of these inputs and to encourage reproducibility, these parameters are inputted into the pipeline via a manifest formatted as a YAML file. YAML files allow easy-to-read specification of key-value pairs. This allows us to easily specify our parameters. The following fields are required in the manifest:
133 | 
134 | - `reference_genome`: The absolute path to the reference genome FASTA file.
135 | - `output_folder`: The absolute path to the folder in which all pipeline outputs will be saved.
136 | - `bwa`: The absolute path to the `bwa` executable
137 | - `bedtools`: The absolute path to the `bedtools` executable
138 | - `PAM`: PAM sequence (optional), default is NGG.
139 | - `search_radius`: Search radius for search. Set to 10 for Cas9 and 75 for Cpf1.
140 | - `max_mismatches`: The maximum number of mismatches allowed to report a sequence-matched off-target
141 | - `undemultiplexed`: The absolute paths to the undemultiplexed paired end sequencing files. The required parameters are:
142 | 	- `forward`: The absolute path to the FASTQ file containing the forward reads.
143 | 	- `reverse`: The absolute path to the FASTQ file containing the reverse reads.
144 | 	- `index1`: The absolute path to the FASTQ file containing the forward index reads.
145 | 	- `index2`: The absolute path to the FASTQ file containing the reverse index reads.
146 | 
147 | An example `undemultiplexed` field:
148 | 
149 | ```
150 | undemultiplexed:
151 |     forward: ../test/data/undemux.r1.fastq.gz
152 |     reverse: ../test/data/undemux.r2.fastq.gz
153 |     index1: ../test/data/undemux.i1.fastq.gz
154 |     index2: ../test/data/undemux.i2.fastq.gz
155 | ```
156 | 
157 | - `samples`: A nested field containing the details of each sample. At least two samples must be specified: a "control" sample (to be used to filter out background off-target sites) and at least one treatment sample. The required parameters are:
158 | 	- `target`: The sample targetsites
159 | 	- `barcode1`: The forward barcode
160 | 	- `barcode2`: The reverse barcode
161 | 	- `description`: A description of the sample
162 | 
163 | An example `samples` field:
164 | 
165 | ```
166 | samples:
167 |     control:
168 |         target:
169 |         barcode1: CTCTCTAC
170 |         barcode2: CTCTCTAT
171 |         description: Control
172 | 
173 |     [SAMPLENAME]:
174 |         target: GAGTCCGAGCAGAAGAAGAANGG
175 |         barcode1: TAGGCATG
176 |         barcode2: TAGATCGC
177 |         description: EMX1
178 | ```
179 | 
180 | ### A Full Manifest File Example<a name="manifest_example"></a>
181 | 
182 | Below is an example of a full manifest file. Feel free to copy it and replace the parameters with your own experiment data. Remember that you can input more than just one treatment sample (e.g. the "EMX1" data below).
183 | 
184 | ```
185 | reference_genome: test/test_genome.fa
186 | output_folder: test/output
187 | 
188 | bwa: bwa
189 | bedtools: bedtools
190 | PAM: NGG
191 | demultiplex_min_reads: 1000
192 | window_size: 75
193 | max_mismatches: 7
194 | 
195 | undemultiplexed:
196 |     forward: test/data/undemultiplexed/undemux.r1.fastq.gz
197 |     reverse: test/data/undemultiplexed/undemux.r2.fastq.gz
198 |     index1: test/data/undemultiplexed/undemux.i1.fastq.gz
199 |     index2: test/data/undemultiplexed/undemux.i2.fastq.gz
200 | 
201 | samples:
202 |     control:
203 |         target:  
204 |         barcode1: CTCTCTAC
205 |         barcode2: CTCTCTAT
206 |         description: Control
207 | 
208 |     EMX1:
209 |         target: GAGTCCGAGCAGAAGAAGAANGG
210 |         barcode1: TAGGCATG
211 |         barcode2: TAGATCGC
212 |         description: EMX_site1
213 | 
214 | ```
215 | 
216 | ### Pipeline Output<a name="pipeline_output"></a>
217 | 
218 | When running the full pipeline, the results of each step are outputted to the `output_folder` in a separate folder for each step. The output folders and their respective contents are as follows:
219 | 
220 | 
221 | #### Output Folders
222 | - `output_folder/demultiplexed`: Contains the four demultiplexed reads files (forward, reverse, index1, index2) for each sample.
223 | - `output_folder/umitagged`: Contains the two umitagged reads files (forward, reverse) for each sample.
224 | - `output_folder/consolidated`: Contains the two consolidated reads files (forward, reverse) for each sample.
225 | - `output_folder/aligned`: Contains an alignment `.sam` file for each sample.
226 | - `output_folder/identified`: Contains a tab-delimited `.txt` file for each sample with an identified off-target in each row.
227 | - `output_folder/filtered`: Contains a tab-delimited `.txt` file for each sample containing the identified DSBs that are background sites (not off-targets)
228 | - `output_folder/visualization`: Contains a `.svg` vector image representing an alignment of all detected off-targets to the targetsite for each sample.
229 | 
230 | 
231 | The final detected off-target sites are placed in the `output_folder/identified` folder, with one `.txt` file for each sample specified in the manifest. The fields that are populated in each row of these off-target files are specified below:
232 | 
233 | #### Output Off-Targets `.txt` Fields:
234 | 
235 | - `BED Chromosome`: Window chromosome
236 | - `BED Min.Position`: Window 0-based start position
237 | - `BED Max.Position`: Window 0-based end position
238 | - `BED Name`: Name of window 
239 | - `Filename`: The name of the current `.SAM` file used in analysis.
240 | - `WindowIndex`: Index number of window
241 | - `Chromosome`: Chromosome corresponding to position with maximum reads in window (matches `BED Chromosome`)
242 | - `Position`: Position with maximum number of reads in window
243 | - `Sequence`: The window sequence, starting 25 bp upstream and ending 25 bp downstream of `Chromosome:Position`
244 | - `+.mi`: Number of forward reads with distinct molecular indices
245 | - `-.mi`: Number of reverse reads with distinct molecular indices
246 | - `bi.sum.mi`: Sum of the `+.mi` and `-.mi` fields (GUIDE-seq Read Count)
247 | - `bi.geometric_mean.mi`: Geometric mean of the `+.mi` and `-.mi` fields
248 | - `+.total`: Total number of forward mapping reads 
249 | - `-.total`: Total number of reverse mapping reads 
250 | - `total.sum`: Sum of `+.total` and `-.total` fields
251 | - `total.geometric_mean`: Geometric mean of the `+.total` and `-.total` fields
252 | - `primer1.mi`: Number of reads amplified by forward primer with distinct molecular indices
253 | - `primer2.mi`: Number of reads amplified by reverse primer with distinct molecular indices
254 | - `primer.geometric_mean`: Geometric mean of the `primer1.mi` and `primer2.mi` fields
255 | - `position.stdev`: Standard deviation of positions within genomic window
256 | - `Off-Target Sequence`: Off-target sequence derived from genome reference
257 | - `Mismatches`: Number of mismatches between the intended target sequence and the off-target sequence
258 | - `Length`: Length of the target sequence
259 | - `BED off-target Chromosome`: Off-target chromosome
260 | - `BED off-target start`: Off-target 0-based start position
261 | - `BED off-target end`: Off-target 0-based end position
262 | - `BED off-target name`: Off-target name
263 | - `BED Score`: Field to conform to standard BED format
264 | - `Strand`: Indicates the strand of detected off-target site. `+` for forward strand and `-` for reverse strand
265 | - `Cells`: Cell type
266 | - `Target site`: Targetsite name
267 | - `Target Sequence`: Intended target site sequence (including PAM)
268 | 
269 | The key fields for interpreting this output and identifying off-target sites are: `BED off-target Chromosome`, `BED off-target start`, `BED off-target end`, `BED off-target name`, `Strand`, `Off-Target Sequence`, `bi.sum.mi`
270 | 
271 | #### Output Visualizations
272 | 
273 | The outputted visualizations are in the `.svg` vector format, which is an open image standard that can be viewed in any modern web browser (e.g. Google Chrome, Apple Safari, Mozilla Firefox), and can be viewed and edited in any vector editing application (e.g. Adobe Illustrator). Because the output visualizations are vector images, they can be scaled up or down infinitely without a loss in quality, and can also be edited as shapes with ease. This makes the images produced by the guideseq package ideal for posters, presentations, and papers.
274 | 
275 | ## Running Analysis Steps Individually<a name="individual_steps"></a>
276 | 
277 | In addition to end-to-end pipeline analysis functionality, the guideseq package also allows for every step of the analysis to be run individually. Here we have detailed the required inputs and expected outputs of each step. For each step, we have included a "runnable example" command that can be executed from the guideseq root directory to run that step on the included sample data. These "runnable example" snippets put their output in the `test/output` folder.
278 | 
279 | ### `demultiplex` Pooled Multi-Sample Sequencing (Manifest Required)<a name="demultiplex"></a>
280 | 
281 | - **Functionality**: Given undemultiplexed sequence files and sample barcodes specified in the manifest, output the demultiplexed sample-specific reads in FASTQ format. The forward, reverse, and two index files for each sample in the manifest are outputted to the `output_folder/demultiplexed` folder.
282 | - **Required Parameters**:
283 | 	- `-m or --manifest`: Specify the path to the manifest YAML file
284 | - **Runnable Example**:
285 | 	- `python guideseq/guideseq.py demultiplex -m test/test_manifest.yaml`
286 | 
287 | ### `umitag` Reads<a name="umitag"></a>
288 | 
289 | - **Functionality**: Given the demultiplexed files in the folder `output_folder/demultiplexed` (where `output_folder` is specified in the manifest), 'tag' the reads by adding the UMI barcode sequence to the FASTQ read name header in preparation for the subsequent PCR duplicate read consolidation step. The forward and reverse files for each sample in the manifest are outputted to the `output_folder/umitagged` folder.
290 | - **Required Parameters**:
291 | 	- `--read1`: Path to the forward demultiplexed reads file (FASTQ)
292 | 	- `--read2`: Path to the reverse demultiplexed reads file (FASTQ)
293 | 	- `--index1`: Path to the index1 demultiplexed reads file (FASTQ)
294 | 	- `--index2`: Path to the index2 demultiplexed reads file (FASTQ)
295 | 	- `--outfolder`: Path to the folder in which the output files will be saved
296 | - **Runnable Example**:
297 | 
298 | 	```
299 | 	python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq \
300 | 	--read2 test/data/demultiplexed/EMX1.r2.fastq \
301 | 	--index1 test/data/demultiplexed/EMX1.i1.fastq \
302 | 	--index2 test/data/demultiplexed/EMX1.i2.fastq \
303 | 	--outfolder test/output/
304 | 	```
305 | 
306 | ### `consolidate` PCR Duplicates<a name="consolidate"></a>
307 | 
308 | - **Functionality**: Given the umitagged forward and reverse reads files, consolidate PCR duplicate reads and output the consolidated forward and reverse reads to the `outfolder`.
309 | - **Required Parameters**:
310 | 	- `--read1`: Path to the forward umitagged reads file (FASTQ)
311 | 	- `--read2`: Path to the reverse umitagged reads file (FASTQ)
312 | 	- `--outfolder`: Path to the folder in which the output files will be saved
313 | - **Optional Parameters**:
314 | 	- `--min_quality`: The minimum quality of a read for it to be considered in the consolidation
315 | 	- `--min_frequency`: The minimum frequency of a read for the position to be consolidated
316 | - **Runnable Example**:
317 | 
318 | 	```
319 | 	python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq \
320 | 	 --read2 test/data/umitagged/EMX1.r2.umitagged.fastq \
321 | 	 --outfolder test/output/
322 | 	```
323 | 
324 | ### `align` Sites to Genome<a name="align"></a>
325 | 
326 | - **Functionality**: Given the consolidated forward and reverse reads, execute a paired-end mapping of the sequences to the reference genome using the `bwa` package. Outputs an alignment `.sam` file to the `outfolder`.
327 | - **Required Parameters**:
328 | 	- `--bwa`: Path to the `bwa` executable
329 | 	- `--genome`: Path to the reference genome FASTA file
330 | 	- `--read1`: Path to the consolidated forward read FASTQ file
331 | 	- `--read2`: Path to the consolidated reverse read FASTQ file
332 | 	- `--outfolder`: Path to the folder in which the output files will be saved
333 | - **Runnable Example**:
334 | 
335 | 	```
336 | 	python guideseq/guideseq.py align --bwa bwa --genome test/test_genome.fa\
337 | 	 --read1 test/data/consolidated/EMX1.r1.consolidated.fastq\
338 | 	 --read2 test/data/consolidated/EMX1.r2.consolidated.fastq\
339 | 	 --outfolder test/output/
340 | 	```
341 | 
342 | ### `identify` Off-target Site Candidates<a name="identify"></a>
343 | 
344 | - **Functionality**: Given the alignment samfile for a given site, a reference genome, and a target sequence, output a tab-delimited `.txt` file containing the identified off-target sites.
345 | - **Required Parameters**:
346 | 	- `--aligned`: Path to the site-specific alignment `.sam` file.
347 | 	- `--genome`: Path to the reference genome FASTA file.
348 | 	- `--outfolder`: Path to the folder in which the output files will be saved.
349 | 	- `--target_sequence`: The sequence targeted in the sample (blank for control sample)
350 | - **Optional Parameters**:
351 | 	- `--description`: Specify additional information about the sample.
352 | - **Runnable Example**:
353 | 
354 | 	```
355 | 	python guideseq/guideseq.py identify --aligned test/data/aligned/EMX1.sam\
356 | 	 --genome test/test_genome.fa --outfolder test/output/\
357 | 	 --target_sequence GAGTCCGAGCAGAAGAAGAANGG --description EMX1
358 | 	```
359 | 
360 | ### `filter` Background DSB Sites<a name="filter"></a>
361 | 
362 | - **Functionality**: Given the identified site `.txt` files for a treatment and control samples, output a `.txt` file in the same format as outputted by the `identify` step containing the sites filtered out as false-positives.
363 | - **Required Parameters**:
364 | 	- `--bedtools`: Path to the `bedtools` executable
365 | 	- `--identified`: Path to the `.txt` file outputted by the `identify` step for a treatment sample.
366 | 	- `--background`: Path to the `.txt` file outputted by the `identify` step for a control sample.
367 | 	- `--outfolder`: Path to the folder in which the output files will be saved.
368 | - **Runnable Example**:
369 | 
370 | 	```
371 | 	python guideseq/guideseq.py filter --bedtools bedtools\
372 | 	 --identified test/data/identified/EMX1_identifiedOfftargets.txt\
373 | 	 --background test/data/identified/control_identifiedOfftargets.txt\
374 | 	 --outfolder test/output/
375 | 	```
376 | 
377 | ### `visualize` Detected Off-Target Sites<a name="visualize"></a>
378 | 
379 | - **Functionality**: Given an identified off-target sites `.txt` file, output an alignment visualization of the off-target sites.
380 | - **Required Parameters**:
381 | 	- `--infile`: Path to the input `.txt` off-targets file
382 | 	- `--outfolder`: Path to the folder in which the outputted `.svg` graphic will be saved
383 | - **Optional Parameters**:
384 | 	- `--title`: Specify the title of the visualization, to be printed at the top of the graphic. Useful for posters and presentations.
385 | - **Runnable Example**:
386 | 
387 | 	```
388 | 	python guideseq/guideseq.py visualize --infile test/data/identified/EMX1_identifiedOfftargets.txt\
389 | 	 --outfolder test/output/ --title EMX1
390 | 	```
391 | 
392 | ## Frequently Asked Questions<a name="FAQ"></a>
393 | 
394 | ### How do I Run the Pipeline with Demultiplexed Data?<a name="demultiplexed_run"></a>
395 | 
396 | If you already have demultiplexed data, you can run the pipeline on the data by running each step after demultiplexing individually, as described in the "Running Analysis Steps Individually" section above. Be sure to run the individual steps in the following order:
397 | 
398 | - `umitag`
399 | - `consolidate`
400 | - `align`
401 | - `identify`
402 | - `filter`
403 | - `visualize`
404 | 
405 | ### Can I analyze data without UMIs?<a name="no_umis"></a>
406 | 
407 | Yes. If your reads do not have UMIs, you can run the pipeline on previously demultiplexed data as described in the "Running Analysis Steps Individually" section above, starting with the `align` step. **Note that we have not analyzed such data ourselves!** We suspect that PCR duplication bias may affect the quantitative interpretation of GUIDE-Seq read counts, but have not explored this.
408 | 
409 | ### Download Reference Genome<a name="reference_genome"></a>
410 | 
411 | The guideseq package requires a reference genome for read mapping. You can use any genome of your choosing, but for all of our testing and original GUIDE-seq analyses (Tsai et al. *Nature Biotechnol* 2015) we use hg19 ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)). Be sure to (g)unzip the FASTA file before use if it is compressed.
412 | 
413 | 
414 | ### Configuring a MiSeq to Output Index Reads<a name="miseq"></a>
415 | 
416 | The guideseq package requires index reads from the MiSeq sequencing run for read consolidation. The default MiSeq Reporter settings do not generate index (I1, I2) reads. This feature can be enabled by adding the line 
417 | 
418 | ```xml
419 | <add key="CreateFastqForIndexReads" value="1" />
420 | ```
421 | 
422 | to the ``Miseq Reporter.exe.config`` file located in the Miseq Reporter installation folder. The default installation folder is  ``C:\Illumina\MiSeqReporter``. After modifying the config file it should look like this:
423 | 
424 | 
425 | ```xml
426 | <appSettings>
427 |     ... [LEAVE EXISTING LINES UNCHANGED] ...
428 |     <add key="CreateFastqForIndexReads" value="1" />
429 | </appSettings>
430 | ```
431 | 
432 | The MiSeq Reporter service needs to be restarted for the change to take effect. Future runs of the GenerateFASTQ workflow (and probably other workflows) will generate I1 and I2 reads in addition to R1 and R2. All four of these reads files will be needed for guideseq analysis.
433 | 
434 | See page 29 of the Miseq Reporter User Guide for further instructions.
435 | 
436 | [version-shield]: https://img.shields.io/conda/v/tsailabsj/guide_seq.svg
437 | [version-url]: https://anaconda.org/tsailabSJ/guide_seq
438 | [python-shield]: https://img.shields.io/pypi/pyversions/guide_seq.svg
439 | [python-url]: https://pypi.python.org/pypi/guide_seq
440 | [platform-shield]: https://img.shields.io/badge/Platforms-linux--64,osx--64,linux--32-orange.svg?style=flat-square
441 | 
442 | 


--------------------------------------------------------------------------------
/conda-build/meta.yaml:
--------------------------------------------------------------------------------
# conda-build recipe for the guide_seq package.
# The two Jinja2 variables below are reused in the package and source sections.
{% set name = "guide_seq" %}
{% set version = "1.0.2" %}

package:
  name: "{{ name|lower }}"
  version: "{{ version }}"

# The source tarball URL is built from the name/version above; sha256 pins the archive.
source:
  url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz
  sha256: 733c04ad671727aeb9559ada2eb464d5e270ab41fa572827fb5f281e4c303f40

build:
  number: 0
  script: python setup.py install --single-version-externally-managed --record=record.txt

requirements:
  host:
    - pip
    - python
  # Run-time dependencies; bwa, htslib and samtools are pinned to specific versions.
  run:
    - python
    - biopython
    - bwa=0.7.17
    - htseq
    - matplotlib
    - numpy
    - pandas
    - pyfaidx
    - pygments
    - pysam
    - pyyaml
    - regex
    - scipy
    - setuptools
    - sqlite
    - statsmodels
    - svgwrite
    - yaml
    - zlib
    - htslib=1.9
    - samtools=1.9
    - bedtools

# Smoke test: the built package must expose these importable modules.
test:
  imports:
    - guideseq
    - umi

about:
  home: https://github.com/tsailabSJ/guideseq
  license: GNU General Public License v2 (GPLv2)
  license_family: GPL2
  license_file: ''
  summary: An easy to use bioinformatic pipeline for the GUIDE-seq assay.
  description: "guide-seq\n\n\n"
  doc_url: ''
  dev_url: ''

extra:
  recipe-maintainers:
    - YichaoOU
62 | 


--------------------------------------------------------------------------------
/conda-build/py2_all/linux-32/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-32/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/linux-aarch64/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-aarch64/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/linux-armv6l/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-armv6l/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/linux-armv7l/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-armv7l/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/linux-ppc64le/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/linux-ppc64le/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/osx-64/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/osx-64/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/win-32/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/win-32/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py2_all/win-64/guide_seq-1.0.2-py27_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py2_all/win-64/guide_seq-1.0.2-py27_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/linux-32/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-32/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/linux-aarch64/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-aarch64/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/linux-armv6l/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-armv6l/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/linux-armv7l/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-armv7l/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/linux-ppc64le/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/linux-ppc64le/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/osx-64/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/osx-64/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/win-32/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/win-32/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py35/win-64/guide_seq-1.0.2-py35_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py35/win-64/guide_seq-1.0.2-py35_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/linux-32/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-32/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/linux-aarch64/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-aarch64/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/linux-armv6l/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-armv6l/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/linux-armv7l/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-armv7l/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/linux-ppc64le/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/linux-ppc64le/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/osx-64/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/osx-64/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/win-32/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/win-32/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/conda-build/py3_all/win-64/guide_seq-1.0.2-py37_0.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/conda-build/py3_all/win-64/guide_seq-1.0.2-py37_0.tar.bz2


--------------------------------------------------------------------------------
/guideseq/#guideseq_visualize_only.py#:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | 
  4 | guideseq.py
  5 | ===========
  6 | serves as the wrapper for all guideseq pipeline
  7 | 
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import yaml
 13 | import argparse
 14 | import traceback
 15 | 
 16 | # Set up logger
 17 | import log
 18 | logger = log.createCustomLogger('root')
 19 | 
 20 | from alignReads import alignReads
 21 | from filterBackgroundSites import filterBackgroundSites
 22 | from umi import demultiplex, umitag, consolidate
 23 | from visualization import visualizeOfftargets
 24 | import identifyOfftargetSites
 25 | import validation
 26 | 
# Fallbacks used by GuideSeq.parseManifest / parseManifestDemultiplex when the
# manifest omits the 'demultiplex_min_reads', 'window_size' or 'max_score' keys.
DEFAULT_DEMULTIPLEX_MIN_READS = 10000
DEFAULT_WINDOW_SIZE = 25
DEFAULT_MAX_SCORE = 7

# Default min_qual / min_freq arguments of GuideSeq.consolidate.
CONSOLIDATE_MIN_QUAL = 15
CONSOLIDATE_MIN_FREQ = 0.9
 33 | 
 34 | 
 35 | class GuideSeq:
 36 | 
 37 |     def __init__(self):
 38 |         pass
 39 | 
 40 |     def parseManifest(self, manifest_path):
 41 |         logger.info('Loading manifest...')
 42 | 
 43 |         with open(manifest_path, 'r') as f:
 44 |             manifest_data = yaml.load(f)
 45 | 
 46 |         try:
 47 |             # Validate manifest data
 48 |             validation.validateManifest(manifest_data)
 49 | 
 50 |             self.BWA_path = manifest_data['bwa']
 51 |             self.bedtools = manifest_data['bedtools']
 52 |             self.reference_genome = manifest_data['reference_genome']
 53 |             self.output_folder = manifest_data['output_folder']
 54 |             self.undemultiplexed = manifest_data['undemultiplexed']
 55 |             self.samples = manifest_data['samples']
 56 | 
 57 |         except Exception as e:
 58 |             logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
 59 |             sys.exit()
 60 | 
 61 |         # Allow the user to specify min reads for demultiplex if they want
 62 |         if 'demultiplex_min_reads' in manifest_data:
 63 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
 64 |         else:
 65 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
 66 |         # Allow the user to specify window size for off-target search
 67 |         if 'window_size' in manifest_data:
 68 |             self.window_size = manifest_data['window_size']
 69 |         else:
 70 |             self.window_size = DEFAULT_WINDOW_SIZE
 71 |         # Allow the user to specify window size for off-target search
 72 |         if 'max_score' in manifest_data:
 73 |             self.max_score = manifest_data['max_score']
 74 |         else:
 75 |             self.max_score = DEFAULT_MAX_SCORE
 76 | 
 77 |         # Make sure the user has specified a control barcode
 78 |         if 'control' not in self.samples.keys():
 79 |             raise AssertionError('Your manifest must have a control sample specified.')
 80 | 
 81 |         # Make sure the user has both a sample and a control
 82 |         if len(self.samples) < 2:
 83 |             raise AssertionError('Your manifest must have at least one control and one treatment sample.')
 84 | 
 85 |         logger.info('Successfully loaded manifest.')
 86 | 
 87 |     def parseManifestDemultiplex(self, manifest_path):
 88 |         logger.info('Loading manifest for demultiplexing...')
 89 | 
 90 |         with open(manifest_path, 'r') as f:
 91 |             manifest_data = yaml.load(f)
 92 | 
 93 |             try:
 94 |                 self.output_folder = manifest_data['output_folder']
 95 |                 self.undemultiplexed = manifest_data['undemultiplexed']
 96 |                 self.samples = manifest_data['samples']
 97 | 
 98 |             except Exception as e:
 99 |                 logger.error('Incomplete or incorrect manifest file. Please ensure your manifest contains all required fields.')
100 |                 quit()
101 | 
102 |         # Allow the user to specify min reads for demultiplex if they want
103 |         if 'demultiplex_min_reads' in manifest_data:
104 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
105 |         else:
106 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
107 | 
108 |         logger.info('Successfully loaded manifest for single-step demultiplexing.')
109 | 
110 |     def demultiplex(self):
111 | 
112 |         logger.info('Demultiplexing undemultiplexed files...')
113 | 
114 |         # Take our two barcodes and concatenate them
115 |         swapped_sample_barcodes = {}
116 |         for sample in self.samples:
117 |             barcode1 = self.samples[sample]['barcode1']
118 |             barcode2 = self.samples[sample]['barcode2']
119 |             barcode = barcode1[1:8] + barcode2[1:8]
120 |             swapped_sample_barcodes[barcode] = sample
121 | 
122 |         try:
123 |             demultiplex.demultiplex(self.undemultiplexed['forward'],
124 |                                     self.undemultiplexed['reverse'],
125 |                                     self.undemultiplexed['index1'],
126 |                                     self.undemultiplexed['index2'],
127 |                                     swapped_sample_barcodes,
128 |                                     os.path.join(self.output_folder, 'demultiplexed'),
129 |                                     min_reads=self.demultiplex_min_reads)
130 | 
131 |             self.demultiplexed = {}
132 |             for sample in self.samples:
133 |                 self.demultiplexed[sample] = {}
134 |                 self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq')
135 |                 self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq')
136 |                 self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq')
137 |                 self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq')
138 | 
139 |             logger.info('Successfully demultiplexed reads.')
140 |         except Exception as e:
141 |             logger.error('Error demultiplexing reads.')
142 |             logger.error(traceback.format_exc())
143 |             quit()
144 | 
145 |     def umitag(self):
146 |         logger.info('umitagging reads...')
147 | 
148 |         try:
149 |             self.umitagged = {}
150 |             for sample in self.samples:
151 |                 self.umitagged[sample] = {}
152 |                 self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq')
153 |                 self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq')
154 | 
155 |                 umitag.umitag(self.demultiplexed[sample]['read1'],
156 |                               self.demultiplexed[sample]['read2'],
157 |                               self.demultiplexed[sample]['index1'],
158 |                               self.demultiplexed[sample]['index2'],
159 |                               self.umitagged[sample]['read1'],
160 |                               self.umitagged[sample]['read2'],
161 |                               os.path.join(self.output_folder, 'umitagged'))
162 | 
163 |             logger.info('Successfully umitagged reads.')
164 |         except Exception as e:
165 |             logger.error('Error umitagging')
166 |             logger.error(traceback.format_exc())
167 |             quit()
168 | 
169 |     def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL):
170 |         logger.info('Consolidating reads...')
171 | 
172 |         try:
173 |             self.consolidated = {}
174 | 
175 |             for sample in self.samples:
176 |                 self.consolidated[sample] = {}
177 |                 self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq')
178 |                 self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq')
179 | 
180 |                 consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq)
181 |                 consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq)
182 | 
183 |             logger.info('Successfully consolidated reads.')
184 |         except Exception as e:
185 |             logger.error('Error umitagging')
186 |             logger.error(traceback.format_exc())
187 |             quit()
188 | 
189 |     def alignReads(self):
190 |         logger.info('Aligning reads...')
191 | 
192 |         try:
193 |             self.aligned = {}
194 |             for sample in self.samples:
195 |                 sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
196 |                 alignReads(self.BWA_path,
197 |                            self.reference_genome,
198 |                            self.consolidated[sample]['read1'],
199 |                            self.consolidated[sample]['read2'],
200 |                            sample_alignment_path)
201 |                 self.aligned[sample] = sample_alignment_path
202 |                 logger.info('Finished aligning reads to genome.')
203 | 
204 |         except Exception as e:
205 |             logger.error('Error aligning')
206 |             logger.error(traceback.format_exc())
207 |             quit()
208 | 
209 |     def identifyOfftargetSites(self):
210 |         logger.info('Identifying offtarget sites...')
211 | 
212 |         try:
213 |             self.identified = {}
214 | 
215 |             # Identify offtarget sites for each sample
216 |             for sample in self.samples:
217 | 
218 |                 # Prepare sample annotations
219 |                 sample_data = self.samples[sample]
220 |                 annotations = {}
221 |                 annotations['Description'] = sample_data['description']
222 |                 annotations['Targetsite'] = sample
223 | 
224 |                 if sample is 'control':
225 |                     annotations['Sequence'] = ''
226 |                 else:
227 |                     annotations['Sequence'] = sample_data['target']
228 | 
229 |                 samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam')
230 | 
231 |                 self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')
232 | 
233 |                 identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
234 |                                                self.window_size, self.max_score)
235 | 
236 |             logger.info('Finished identifying offtarget sites.')
237 | 
238 |         except Exception as e:
239 |             logger.error('Error identifying offtarget sites.')
240 |             logger.error(traceback.format_exc())
241 |             quit()
242 | 
243 |     def filterBackgroundSites(self):
244 |         logger.info('Filtering background sites')
245 | 
246 |         try:
247 |             self.filtered = {}
248 | 
249 |             # Filter background in each sample
250 |             for sample in self.samples:
251 |                 if sample != 'control':
252 |                     self.filtered[sample] = os.path.join(self.output_folder, 'filtered', sample + '_backgroundFiltered.txt')
253 |                     filterBackgroundSites(self.bedtools, self.identified[sample], self.identified['control'], self.filtered[sample])
254 |                     logger.info('Finished background filtering for {0} sample'.format(sample))
255 | 
256 |             logger.info('Finished filtering background sites.')
257 | 
258 |         except Exception as e:
259 |             logger.error('Error filtering background sites.')
260 |             logger.error(traceback.format_exc())
261 | 
262 |     def visualize(self):
263 |         logger.info('Visualizing off-target sites')
264 | 
265 |         try:
266 |             for sample in self.samples:
267 |                 if sample != 'control':
268 |                     infile = self.identified[sample]
269 |                     outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
270 |                     visualizeOfftargets(infile, outfile, title=sample)
271 | 
272 |             logger.info('Finished visualizing off-target sites')
273 | 
274 |         except Exception as e:
275 |             logger.error('Error visualizing off-target sites.')
276 |             logger.error(traceback.format_exc())
277 | 
278 | 
def parse_args():
    """Build the command-line interface and parse the process arguments.

    The chosen sub-command name is stored in the ``command`` attribute of
    the returned namespace; the remaining attributes are the options of
    that sub-command.

    :return: the ``argparse.Namespace`` produced by ``parse_args()``.
    """
    root = argparse.ArgumentParser()
    steps = root.add_subparsers(dest='command',
                                description='Individual Step Commands',
                                help='Use this to run individual steps of the pipeline')

    def add_step(name, summary, required_flags=()):
        # Register one sub-command together with its mandatory, help-less options.
        step = steps.add_parser(name, help=summary)
        for flag in required_flags:
            step.add_argument(flag, required=True)
        return step

    # Whole pipeline, driven by a manifest.
    step = add_step('all', 'Run all steps of the pipeline')
    step.add_argument('--manifest', '-m', required=True, help='Specify the manifest Path')
    step.add_argument('--identifyAndFilter', default=False, action='store_true')

    # Manifest-driven demultiplexing only.
    step = add_step('demultiplex', 'Demultiplex undemultiplexed FASTQ files')
    step.add_argument('--manifest', '-m', required=True, help='Specify the manifest path')

    add_step('umitag', 'UMI tag demultiplexed FASTQ files for consolidation',
             ('--read1', '--read2', '--index1', '--index2', '--outfolder'))

    step = add_step('consolidate', 'Consolidate UMI tagged FASTQs',
                    ('--read1', '--read2', '--outfolder'))
    for flag in ('--min_quality', '--min_frequency'):
        step.add_argument(flag, required=False, type=float)

    add_step('align', 'Paired end read mapping to genome',
             ('--bwa', '--genome', '--read1', '--read2', '--outfolder'))

    step = add_step('identify', 'Identify GUIDE-seq offtargets',
                    ('--aligned', '--genome', '--outfolder', '--target_sequence'))
    step.add_argument('--description', required=False)
    step.add_argument('--max_score', type=int, default=7, required=False)
    step.add_argument('--window_size', type=int, default=25, required=False)

    add_step('filter', 'Filter identified sites from control sites',
             ('--bedtools', '--identified', '--background', '--outfolder'))

    step = add_step('visualize', 'Visualize off-target sites',
                    ('--infile', '--outfolder'))
    step.add_argument('--title', required=False)

    return root.parse_args()
335 | 
336 | 
def main():
    """Command-line entry point: parse the arguments and run the requested step(s).

    ``all`` runs the pipeline from a manifest (optionally only the
    identify/filter/visualize tail when ``--identifyAndFilter`` is given).
    Every other sub-command runs a single step, configuring a bare
    ``GuideSeq`` object directly from the command-line options.
    """
    args = parse_args()

    if args.command == 'all':

        if args.identifyAndFilter:
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                # Bootstrap the aligned samfile paths
                g.aligned = {}
                for sample in g.samples:
                    g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')

                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                # print() with a single argument behaves the same on Python 2 and 3.
                print('Error running only identify and filter.')
                print(traceback.format_exc())
                # Report the failure to the calling shell with a non-zero status.
                raise SystemExit(1)
        else:
            g = GuideSeq()
            g.parseManifest(args.manifest)
            g.demultiplex()
            g.umitag()
            g.consolidate()
            g.alignReads()
            g.identifyOfftargetSites()
            g.filterBackgroundSites()
            g.visualize()

    elif args.command == 'demultiplex':
        # Run just the demultiplex step given the manifest
        g = GuideSeq()
        g.parseManifestDemultiplex(args.manifest)
        g.demultiplex()

    elif args.command == 'umitag':
        # Run just the umitag step
        # python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
        g = GuideSeq()
        g.output_folder = args.outfolder
        # The sample name is everything before the first '.' of the read1 file name.
        sample = os.path.basename(args.read1).split('.')[0]
        g.samples = [sample]
        g.demultiplexed = {sample: {'read1': args.read1,
                                    'read2': args.read2,
                                    'index1': args.index1,
                                    'index2': args.index2}}
        g.umitag()

    elif args.command == 'consolidate':
        # Run just the consolidate step
        # python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/ --min_frequency 0.8 --min_quality 14
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.umitagged = {sample: {'read1': args.read1, 'read2': args.read2}}

        # argparse always creates these attributes (value None when the option is
        # omitted), so test the value rather than attribute membership; otherwise
        # None would be handed to consolidate() instead of the defaults.
        min_qual = args.min_quality if args.min_quality is not None else CONSOLIDATE_MIN_QUAL
        min_freq = args.min_frequency if args.min_frequency is not None else CONSOLIDATE_MIN_FREQ

        g.consolidate(min_freq=min_freq, min_qual=min_qual)

    elif args.command == 'align':
        # Run just the alignment step
        # python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa --outfolder test/output/
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.BWA_path = args.bwa
        g.reference_genome = args.genome
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.consolidated = {sample: {'read1': args.read1, 'read2': args.read2}}
        g.alignReads()

    elif args.command == 'identify':
        # Run just the identify step
        # python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa --aligned test/output/aligned/EMX1.sam --outfolder test/output/ --target_sequence GAGTCCGAGCAGAAGAAGAANGG

        # --description has no parser default, so it is None when omitted.
        description = args.description if args.description is not None else ''

        g = GuideSeq()
        g.output_folder = args.outfolder
        g.reference_genome = args.genome
        sample = os.path.basename(args.aligned).split('.')[0]
        g.samples = {sample: {'description': description, 'target': args.target_sequence}}
        g.aligned = {sample: args.aligned}
        # --max_score / --window_size carry their defaults (7 / 25) in the parser.
        g.max_score = args.max_score
        g.window_size = args.window_size
        g.identifyOfftargetSites()

    elif args.command == 'filter':
        # Run just the filter step
        sample = os.path.basename(args.identified).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.bedtools = args.bedtools
        g.samples = {sample: {}, 'control': {}}
        g.identified = {sample: args.identified, 'control': args.background}
        g.filterBackgroundSites()

    elif args.command == 'visualize':
        # Run just the visualize step
        g = GuideSeq()
        # Use the folder exactly as given, like every other sub-command.
        # os.path.dirname() silently redirected the output to the parent
        # directory whenever the path was given without a trailing slash.
        g.output_folder = args.outfolder
        sample = os.path.basename(args.infile).split('.')[0]
        g.samples = {sample: {}}
        g.identified = {sample: args.infile}
        g.visualize()
492 | 
493 | 
# Script entry point: run the command-line interface when this file is executed directly.
if __name__ == '__main__':
    main()
496 | 


--------------------------------------------------------------------------------
/guideseq/NUC_SIMPLE:
--------------------------------------------------------------------------------
 1 | #
 2 | # This matrix was created by Todd Lowe   12/10/92
 3 | #
 4 | # Uses ambiguous nucleotide codes, probabilities rounded to
 5 | #  nearest integer
 6 | #
 7 | # Lowest score = -5, Highest score = 10
 8 | #
 9 | # Modified by Shengdar Tsai 1/23/16
10 |     A   T   G   C   N
11 | A   10  -5  -5  -5   10
12 | T   -5  10  -5  -5   10
13 | G   -5  -5  10  -5   10
14 | C   -5  -5  -5  10   10
15 | N   10  10  10  10   10


--------------------------------------------------------------------------------
/guideseq/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
# Version string of this package.
__version__ = "1.0.2"
4 | 


--------------------------------------------------------------------------------
/guideseq/alignReads.py:
--------------------------------------------------------------------------------
 1 | """
 2 | alignReads
 3 | """
 4 | 
 5 | import subprocess
 6 | import os
 7 | import logging
 8 | 
 9 | logger = logging.getLogger('root')
10 | logger.propagate = False
11 | 
12 | 
13 | def alignReads(cores, BWA_path, genome_path, read1, read2, outfile):
14 | 
15 |     sample_name = os.path.basename(outfile).split('.')[0]
16 |     output_folder = os.path.dirname(outfile)
17 |     if not os.path.exists(output_folder):
18 |         os.makedirs(output_folder)
19 | 
20 |     sample_alignment_paths = {}
21 | 
22 |     # Check if genome is already indexed by bwa
23 |     index_files_extensions = ['.pac', '.amb', '.ann', '.bwt', '.sa']
24 | 
25 |     genome_indexed = True
26 |     for extension in index_files_extensions:
27 |         if not os.path.isfile(genome_path + extension):
28 |             genome_indexed = False
29 |             break
30 | 
31 |     # If the genome is not already indexed, index it
32 |     if not genome_indexed:
33 |         logger.info('Genome index files not detected. Running BWA to generate indices.')
34 |         bwa_index_command = '{0} index {1}'.format(BWA_path, genome_path)
35 |         logger.info('Running bwa command: %s', bwa_index_command)
36 |         subprocess.call(bwa_index_command.split())
37 |         logger.info('BWA genome index generated')
38 |     else:
39 |         logger.info('BWA genome index found.')
40 | 
41 |     # Run paired end alignment against the genome
42 |     logger.info('Running paired end mapping for {0}'.format(sample_name))
43 |     bwa_alignment_command = '{0} mem -t {1} {2} {3} {4}'.format(BWA_path,
44 |                                                          cores,
45 |                                                          genome_path,
46 |                                                          read1,
47 |                                                          read2)
48 | 
49 |     logger.info(bwa_alignment_command)
50 | 
51 |     # Open the outfile and redirect the output of the alignment to it.
52 |     with open(outfile, 'w') as f:
53 |         subprocess.call(bwa_alignment_command.split(), stdout=f)
54 | 
55 |     logger.info('Paired end mapping for {0} completed.'.format(sample_name))
56 | 


--------------------------------------------------------------------------------
/guideseq/filterBackgroundSites.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import os
 3 | 
 4 | def filterBackgroundSites(bedtools_path, sample_path, control_path, outfile):
 5 |     output_folder = os.path.dirname(outfile)
 6 |     if not os.path.exists(output_folder):
 7 |         os.makedirs(output_folder)
 8 | 
 9 |     sample_noHeader = os.path.join(os.path.dirname(sample_path), 'sample_noHeader.txt')
10 |     control_noHeader = os.path.join(os.path.dirname(control_path), 'control_noHeader.txt')
11 | 
12 |     sample_noHeader_command = "sed '1d' {0} > {1}".format(sample_path, sample_noHeader)
13 |     control_noHeader_command = "sed '1d' {0} > {1}".format(control_path, control_noHeader)
14 |     clean_command = "rm {0} {1}".format(control_noHeader, sample_noHeader)
15 |     bedtools_filter_command = '{0} intersect -a {1} -b {2}'.format(bedtools_path, sample_noHeader, control_noHeader)
16 | 
17 |     subprocess.check_call(sample_noHeader_command, shell=True, env=os.environ.copy())
18 |     subprocess.check_call(control_noHeader_command, shell=True, env=os.environ.copy())
19 | 
20 |     with open(outfile, 'w') as output_file:
21 |         subprocess.check_call(bedtools_filter_command, shell=True, env=os.environ.copy(), stdout=output_file)
22 |     subprocess.check_call(clean_command, shell=True, env=os.environ.copy())
23 | 


--------------------------------------------------------------------------------
/guideseq/guideseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 
  5 | guideseq.py
  6 | ===========
  7 | serves as the wrapper for all guideseq pipeline
  8 | 
  9 | """
 10 | 
 11 | import os
 12 | import sys
 13 | import yaml
 14 | import argparse
 15 | import traceback
 16 | 
 17 | # Set up logger
 18 | import log
 19 | logger = log.createCustomLogger('root')
 20 | 
 21 | from alignReads import alignReads
 22 | from filterBackgroundSites import filterBackgroundSites
 23 | from umi import demultiplex, umitag, consolidate
 24 | from visualization import visualizeOfftargets
 25 | import identifyOfftargetSites
 26 | import validation
 27 | 
# Fallback values used by GuideSeq when the manifest omits the matching key
# ('demultiplex_min_reads', 'search_radius' and 'max_score' respectively).
DEFAULT_DEMULTIPLEX_MIN_READS = 10000
DEFAULT_WINDOW_SIZE = 25
DEFAULT_MAX_SCORE = 7

# Default min_qual / min_freq arguments of GuideSeq.consolidate().
CONSOLIDATE_MIN_QUAL = 15
CONSOLIDATE_MIN_FREQ = 0.9
 34 | 
 35 | 
 36 | class GuideSeq:
 37 | 
 38 |     def __init__(self):
 39 |         pass
 40 | 
 41 |     def parseManifest(self, manifest_path):
 42 |         logger.info('Loading manifest...')
 43 | 
 44 |         with open(manifest_path, 'r') as f:
 45 |             manifest_data = yaml.safe_load(f)
 46 |         
 47 |         if not "cores" in manifest_data:
 48 |             manifest_data['cores'] = 4
 49 |         
 50 |         # Set default tag/primer sequences if not specified
 51 |         if not "primer1" in manifest_data:
 52 |             manifest_data['primer1'] = 'TTGAGTTGTCATATGTTAAT'
 53 |         if not "primer2" in manifest_data:
 54 |             manifest_data['primer2'] = 'ACATATGACAACTCAATTAA'
 55 | 
 56 |         try:
 57 |             # Validate manifest data
 58 |             validation.validateManifest(manifest_data)
 59 | 
 60 |             self.cores = manifest_data['cores']
 61 |             self.BWA_path = manifest_data['bwa']
 62 |             self.bedtools = manifest_data['bedtools']
 63 |             self.reference_genome = manifest_data['reference_genome']
 64 |             self.output_folder = manifest_data['output_folder']
 65 |             self.undemultiplexed = manifest_data['undemultiplexed']
 66 |             self.samples = manifest_data['samples']
 67 |             self.primer1 = manifest_data['primer1']
 68 |             self.primer2 = manifest_data['primer2']
 69 | 
 70 |         except Exception as e:
 71 |             logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
 72 |             sys.exit()
 73 | 
 74 |         # Allow the user to specify min reads for demultiplex if they want
 75 |         if 'demultiplex_min_reads' in manifest_data:
 76 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
 77 |         else:
 78 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
 79 |         # Allow the user to specify window size for off-target search
 80 |         if 'search_radius' in manifest_data:
 81 |             self.search_radius = manifest_data['search_radius']
 82 |         else:
 83 |             self.search_radius = DEFAULT_WINDOW_SIZE
 84 |         # Allow the user to specify window size for off-target search
 85 |         if 'max_score' in manifest_data:
 86 |             self.max_score = manifest_data['max_score']
 87 |         else:
 88 |             self.max_score = DEFAULT_MAX_SCORE
 89 |         # Allow the user to specify PAM seq. Yichao 3/6/2020
 90 |         if 'PAM' in manifest_data:
 91 |             self.PAM = manifest_data['PAM']
 92 |         else:
 93 |             self.PAM = "NGG"
 94 | 
 95 |         # Make sure the user has specified a control barcode
 96 |         if 'control' not in self.samples.keys():
 97 |             raise AssertionError('Your manifest must have a control sample specified.')
 98 | 
 99 |         # Make sure the user has both a sample and a control
100 |         if len(self.samples) < 2:
101 |             raise AssertionError('Your manifest must have at least one control and one treatment sample.')
102 | 
103 |         logger.info('Successfully loaded manifest.')
104 | 
105 |     def parseManifestDemultiplex(self, manifest_path):
106 |         logger.info('Loading manifest for demultiplexing...')
107 | 
108 |         with open(manifest_path, 'r') as f:
109 |             manifest_data = yaml.load(f)
110 | 
111 |             try:
112 |                 self.output_folder = manifest_data['output_folder']
113 |                 self.undemultiplexed = manifest_data['undemultiplexed']
114 |                 self.samples = manifest_data['samples']
115 | 
116 |             except Exception as e:
117 |                 logger.error('Incomplete or incorrect manifest file. Please ensure your manifest contains all required fields.')
118 |                 quit()
119 | 
120 |         # Allow the user to specify min reads for demultiplex if they want
121 |         if 'demultiplex_min_reads' in manifest_data:
122 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
123 |         else:
124 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
125 | 
126 |         logger.info('Successfully loaded manifest for single-step demultiplexing.')
127 | 
128 |     def demultiplex(self):
129 | 
130 |         logger.info('Demultiplexing undemultiplexed files...')
131 | 
132 |         # Take our two barcodes and concatenate them
133 |         swapped_sample_barcodes = {}
134 |         for sample in self.samples:
135 |             barcode1 = self.samples[sample]['barcode1']
136 |             barcode2 = self.samples[sample]['barcode2']
137 |             barcode = barcode1[1:8] + barcode2[1:8]
138 |             swapped_sample_barcodes[barcode] = sample
139 | 
140 |         try:
141 |             demultiplex.demultiplex(self.undemultiplexed['forward'],
142 |                                     self.undemultiplexed['reverse'],
143 |                                     self.undemultiplexed['index1'],
144 |                                     self.undemultiplexed['index2'],
145 |                                     swapped_sample_barcodes,
146 |                                     os.path.join(self.output_folder, 'demultiplexed'),
147 |                                     min_reads=self.demultiplex_min_reads)
148 | 
149 |             self.demultiplexed = {}
150 |             for sample in self.samples:
151 |                 self.demultiplexed[sample] = {}
152 |                 self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq')
153 |                 self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq')
154 |                 self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq')
155 |                 self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq')
156 | 
157 |             logger.info('Successfully demultiplexed reads.')
158 |         except Exception as e:
159 |             logger.error('Error demultiplexing reads.')
160 |             logger.error(traceback.format_exc())
161 |             quit()
162 | 
163 |     def umitag(self):
164 |         logger.info('umitagging reads...')
165 | 
166 |         try:
167 |             self.umitagged = {}
168 |             for sample in self.samples:
169 |                 self.umitagged[sample] = {}
170 |                 self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq')
171 |                 self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq')
172 | 
173 |                 umitag.umitag(self.demultiplexed[sample]['read1'],
174 |                               self.demultiplexed[sample]['read2'],
175 |                               self.demultiplexed[sample]['index1'],
176 |                               self.demultiplexed[sample]['index2'],
177 |                               self.umitagged[sample]['read1'],
178 |                               self.umitagged[sample]['read2'],
179 |                               os.path.join(self.output_folder, 'umitagged'))
180 | 
181 |             logger.info('Successfully umitagged reads.')
182 |         except Exception as e:
183 |             logger.error('Error umitagging')
184 |             logger.error(traceback.format_exc())
185 |             quit()
186 | 
187 |     def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL):
188 |         logger.info('Consolidating reads...')
189 | 
190 |         try:
191 |             self.consolidated = {}
192 | 
193 |             for sample in self.samples:
194 |                 self.consolidated[sample] = {}
195 |                 self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq')
196 |                 self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq')
197 | 
198 |                 consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq)
199 |                 consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq)
200 | 
201 |             logger.info('Successfully consolidated reads.')
202 |         except Exception as e:
203 |             logger.error('Error umitagging')
204 |             logger.error(traceback.format_exc())
205 |             quit()
206 | 
207 |     def alignReads(self):
208 |         logger.info('Aligning reads...')
209 | 
210 |         try:
211 |             self.aligned = {}
212 |             for sample in self.samples:
213 |                 sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
214 |                 alignReads(self.cores,
215 |                            self.BWA_path,
216 |                            self.reference_genome,
217 |                            self.consolidated[sample]['read1'],
218 |                            self.consolidated[sample]['read2'],
219 |                            sample_alignment_path)
220 |                 self.aligned[sample] = sample_alignment_path
221 |                 logger.info('Finished aligning reads to genome.')
222 | 
223 |         except Exception as e:
224 |             logger.error('Error aligning')
225 |             logger.error(traceback.format_exc())
226 |             quit()
227 | 
228 |     def identifyOfftargetSites(self):
229 |         logger.info('Identifying offtarget sites...')
230 | 
231 |         try:
232 |             self.identified = {}
233 | 
234 |             # Identify offtarget sites for each sample
235 |             for sample in self.samples:
236 | 
237 |                 # Prepare sample annotations
238 |                 sample_data = self.samples[sample]
239 |                 annotations = {}
240 |                 annotations['Description'] = sample_data['description']
241 |                 annotations['Targetsite'] = sample
242 | 
243 |                 if sample == 'control':
244 |                     annotations['Sequence'] = ''
245 |                 else:
246 |                     annotations['Sequence'] = sample_data['target']
247 | 
248 |                 samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam')
249 | 
250 |                 self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')
251 | 
252 |                 identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
253 |                                                self.search_radius, self.max_score, self.primer1, self.primer2)
254 | 
255 |             logger.info('Finished identifying offtarget sites.')
256 | 
257 |         except Exception as e:
258 |             logger.error('Error identifying offtarget sites.')
259 |             logger.error(traceback.format_exc())
260 |             quit()
261 | 
262 |     def filterBackgroundSites(self):
263 |         logger.info('Filtering background sites')
264 | 
265 |         try:
266 |             self.filtered = {}
267 | 
268 |             # Filter background in each sample
269 |             for sample in self.samples:
270 |                 if sample != 'control':
271 |                     self.filtered[sample] = os.path.join(self.output_folder, 'filtered', sample + '_backgroundFiltered.txt')
272 |                     filterBackgroundSites(self.bedtools, self.identified[sample], self.identified['control'], self.filtered[sample])
273 |                     logger.info('Finished background filtering for {0} sample'.format(sample))
274 | 
275 |             logger.info('Finished filtering background sites.')
276 | 
277 |         except Exception as e:
278 |             logger.error('Error filtering background sites.')
279 |             logger.error(traceback.format_exc())
280 | 
281 |     def visualize(self):
282 |         logger.info('Visualizing off-target sites')
283 | 
284 |         # try:
285 |             # for sample in self.samples:
286 |                 # if sample != 'control':
287 |                     # infile = self.identified[sample]
288 |                     # outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
289 |                     # visualizeOfftargets(infile, outfile, title=sample)
290 | 
291 |             # logger.info('Finished visualizing off-target sites')
292 | 
293 |         # except Exception as e:
294 |             # logger.error('Error visualizing off-target sites.')
295 |             # logger.error(traceback.format_exc())
296 | 
297 |         for sample in self.samples: ## 3/6/2020 Yichao solved: visualization stopped when one sample failed
298 |             if sample != 'control':
299 |                 try:
300 |                     infile = self.identified[sample]
301 |                     outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
302 |                     try:
303 |                         self.PAM
304 |                         visualizeOfftargets(infile, outfile, title=sample,PAM=self.PAM)
305 |                     except:
306 |                         visualizeOfftargets(infile, outfile, title=sample,PAM="NGG")
307 |                 except Exception as e:
308 |                     logger.error('Error visualizing off-target sites: %s'%(sample))
309 |                     logger.error(traceback.format_exc())
310 |         logger.info('Finished visualizing off-target sites')
311 | 
def parse_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings; when None (the default),
            ``sys.argv[1:]`` is parsed, exactly as before.

    Returns:
        argparse.Namespace whose ``command`` attribute names the chosen
        sub-command, plus that sub-command's options.
    """
    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(description='Individual Step Commands',
                                       help='Use this to run individual steps of the pipeline',
                                       dest='command')
    # On Python 3 sub-commands are optional by default, so running without one
    # silently did nothing. Setting the attribute (rather than passing
    # required=True to add_subparsers) also works on older Python versions.
    subparsers.required = True

    all_parser = subparsers.add_parser('all', help='Run all steps of the pipeline')
    all_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
    all_parser.add_argument('--identifyAndFilter', action='store_true', default=False)
    all_parser.add_argument('--skip_demultiplex', action='store_true', default=False)

    demultiplex_parser = subparsers.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files')
    demultiplex_parser.add_argument('--manifest', '-m', help='Specify the manifest path', required=True)

    umitag_parser = subparsers.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation')
    for option in ('--read1', '--read2', '--index1', '--index2', '--outfolder'):
        umitag_parser.add_argument(option, required=True)

    consolidate_parser = subparsers.add_parser('consolidate', help='Consolidate UMI tagged FASTQs')
    for option in ('--read1', '--read2', '--outfolder'):
        consolidate_parser.add_argument(option, required=True)
    # No default here: the value is None when the option is omitted.
    consolidate_parser.add_argument('--min_quality', required=False, type=float)
    consolidate_parser.add_argument('--min_frequency', required=False, type=float)

    align_parser = subparsers.add_parser('align', help='Paired end read mapping to genome')
    for option in ('--bwa', '--genome', '--read1', '--read2', '--outfolder'):
        align_parser.add_argument(option, required=True)

    identify_parser = subparsers.add_parser('identify', help='Identify GUIDE-seq offtargets')
    for option in ('--aligned', '--genome', '--outfolder', '--target_sequence'):
        identify_parser.add_argument(option, required=True)
    identify_parser.add_argument('--description', required=False)
    identify_parser.add_argument('--max_score', required=False, type=int, default=7)
    identify_parser.add_argument('--search_radius', required=False, type=int, default=25)

    filter_parser = subparsers.add_parser('filter', help='Filter identified sites from control sites')
    for option in ('--bedtools', '--identified', '--background', '--outfolder'):
        filter_parser.add_argument(option, required=True)

    visualize_parser = subparsers.add_parser('visualize', help='Visualize off-target sites')
    visualize_parser.add_argument('--infile', required=True)
    visualize_parser.add_argument('--outfolder', required=True)
    visualize_parser.add_argument('--title', required=False)

    return parser.parse_args(argv)
369 | 
370 | 
371 | def main():
372 |     args = parse_args()
373 | 
374 |     if args.command == 'all':
375 | 
376 |         if args.identifyAndFilter:
377 |             try:
378 |                 g = GuideSeq()
379 |                 g.parseManifest(args.manifest)
380 | 
381 |                 # Bootstrap the aligned samfile paths
382 |                 g.aligned = {}
383 |                 for sample in g.samples:
384 |                     g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')
385 | 
386 |                 g.identifyOfftargetSites()
387 |                 g.filterBackgroundSites()
388 |                 g.visualize()
389 | 
390 |             except Exception as e:
391 |                 print ('Error running only identify and filter.')
392 |                 print (traceback.format_exc())
393 |                 quit()
394 |         elif args.skip_demultiplex:
395 |             try:
396 |                 g = GuideSeq()
397 |                 g.parseManifest(args.manifest)
398 |                 g.demultiplexed = {}
399 |                 for sample in g.samples:
400 |                     g.demultiplexed[sample] = {}
401 |                     g.demultiplexed[sample]['read1'] = os.path.join(g.output_folder, 'demultiplexed', sample + '.r1.fastq')
402 |                     g.demultiplexed[sample]['read2'] = os.path.join(g.output_folder, 'demultiplexed', sample + '.r2.fastq')
403 |                     g.demultiplexed[sample]['index1'] = os.path.join(g.output_folder, 'demultiplexed', sample + '.i1.fastq')
404 |                     g.demultiplexed[sample]['index2'] = os.path.join(g.output_folder, 'demultiplexed', sample + '.i2.fastq')
405 |                     if not os.path.isfile(g.demultiplexed[sample]['read1']):
406 |                         print ("Can't find ",g.demultiplexed[sample]['read1'])
407 |                         exit()
408 |                     if not os.path.isfile(g.demultiplexed[sample]['read2']):
409 |                         print ("Can't find ",g.demultiplexed[sample]['read2'])
410 |                         exit()
411 |                     if not os.path.isfile(g.demultiplexed[sample]['index1']):
412 |                         print ("Can't find ",g.demultiplexed[sample]['index1'])
413 |                         exit()
414 |                     if not os.path.isfile(g.demultiplexed[sample]['index2']):
415 |                         print ("Can't find ",g.demultiplexed[sample]['index2'])
416 |                         exit()
417 | 
418 |                 # Bootstrap the aligned samfile paths
419 |                 # g.aligned = {}
420 |                 # for sample in g.samples:
421 |                     # g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')
422 | 
423 | 
424 |                 g.umitag()
425 |                 g.consolidate()
426 |                 g.alignReads()
427 |                 g.identifyOfftargetSites()
428 |                 g.filterBackgroundSites()
429 |                 g.visualize()
430 | 
431 |             except Exception as e:
432 |                 print ('Error running only identify and filter.')
433 |                 print (traceback.format_exc())
434 |                 quit()
435 |         else:
436 |             g = GuideSeq()
437 |             g.parseManifest(args.manifest)
438 |             g.demultiplex()
439 |             g.umitag()
440 |             g.consolidate()
441 |             g.alignReads()
442 |             g.identifyOfftargetSites()
443 |             g.filterBackgroundSites()
444 |             g.visualize()
445 | 
446 |     elif args.command == 'demultiplex':
447 |         """
448 |         Run just the demultiplex step given the manifest
449 |         """
450 |         g = GuideSeq()
451 |         g.parseManifestDemultiplex(args.manifest)
452 |         g.demultiplex()
453 | 
454 |     elif args.command == 'umitag':
455 |         """
456 |         Run just the umitag step
457 |         python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
458 |         """
459 |         g = GuideSeq()
460 |         g.output_folder = args.outfolder
461 |         sample = os.path.basename(args.read1).split('.')[0]
462 |         g.samples = [sample]
463 |         g.demultiplexed = {sample: {}}
464 |         g.demultiplexed[sample]['read1'] = args.read1
465 |         g.demultiplexed[sample]['read2'] = args.read2
466 |         g.demultiplexed[sample]['index1'] = args.index1
467 |         g.demultiplexed[sample]['index2'] = args.index2
468 |         g.umitag()
469 | 
470 |     elif args.command == 'consolidate':
471 |         """
472 |         Run just the consolidate step
473 |         python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/ --min_frequency 0.8 --min_quality 14
474 |         """
475 |         sample = os.path.basename(args.read1).split('.')[0]
476 |         g = GuideSeq()
477 |         g.output_folder = args.outfolder
478 |         g.samples = [sample]
479 |         g.umitagged = {sample: {}}
480 |         g.umitagged[sample]['read1'] = args.read1
481 |         g.umitagged[sample]['read2'] = args.read2
482 | 
483 |         if 'min_quality' in args:
484 |             min_qual = args.min_quality
485 |         else:
486 |             min_qual = CONSOLIDATE_MIN_QUAL
487 | 
488 |         if 'min_frequency' in args:
489 |             min_freq = args.min_frequency
490 |         else:
491 |             min_freq = CONSOLIDATE_MIN_FREQ
492 | 
493 |         g.consolidate(min_freq=min_freq, min_qual=min_qual)
494 | 
495 |     elif args.command == 'align':
496 |         """
497 |         Run just the alignment step
498 |         python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa --outfolder test/output/
499 |         """
500 |         sample = os.path.basename(args.read1).split('.')[0]
501 |         g = GuideSeq()
502 |         g.BWA_path = args.bwa
503 |         g.reference_genome = args.genome
504 |         g.output_folder = args.outfolder
505 |         g.samples = [sample]
506 |         g.consolidated = {sample: {}}
507 |         g.consolidated[sample]['read1'] = args.read1
508 |         g.consolidated[sample]['read2'] = args.read2
509 |         g.alignReads()
510 | 
511 |     elif args.command == 'identify':
512 |         """
513 |         Run just the identify step
514 |         python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa --aligned test/output/aligned/EMX1.sam --outfolder test/output/ --target_sequence GAGTCCGAGCAGAAGAAGAANGG
515 |         """
516 |         if 'description' in args:
517 |             description = args.description
518 |         else:
519 |             description = ''
520 | 
521 |         if 'max_score' in args:
522 |             max_score = args.max_score
523 |         else:
524 |             max_score = 7
525 | 
526 |         if 'search_radius' in args:
527 |             search_radius = args.search_radius
528 |         else:
529 |             search_radius = 25
530 | 
531 |         g = GuideSeq()
532 |         g.output_folder = args.outfolder
533 |         g.reference_genome = args.genome
534 |         sample = os.path.basename(args.aligned).split('.')[0]
535 |         g.samples = {sample: {'description': description, 'target': args.target_sequence}}
536 |         g.aligned = {sample: args.aligned}
537 |         g.max_score = max_score
538 |         g.search_radius = search_radius
539 |         g.identifyOfftargetSites()
540 | 
541 |     elif args.command == 'filter':
542 |         """
543 |         Run just the filter step
544 | 
545 |         """
546 |         sample = os.path.basename(args.identified).split('.')[0]
547 |         g = GuideSeq()
548 |         g.output_folder = args.outfolder
549 |         g.bedtools = args.bedtools
550 |         g.samples = {sample: {}, 'control': {}}
551 |         g.identified = {}
552 |         g.identified[sample] = args.identified
553 |         g.identified['control'] = args.background
554 |         g.filterBackgroundSites()
555 | 
556 |     elif args.command == 'visualize':
557 |         """
558 |         Run just the visualize step
559 |         """
560 |         g = GuideSeq()
561 |         g.output_folder = os.path.dirname(args.outfolder)
562 |         sample = os.path.basename(args.infile).split('.')[0]
563 |         g.samples = {sample: {}}
564 |         g.identified = {}
565 |         g.identified[sample] = args.infile
566 |         g.visualize()
567 | 
568 | 
569 | if __name__ == '__main__':
570 |     main()
571 | 


--------------------------------------------------------------------------------
/guideseq/guideseq_visualize_only.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | 
  4 | guideseq.py
  5 | ===========
  6 | serves as the wrapper for all guideseq pipeline
  7 | 
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import yaml
 13 | import argparse
 14 | import traceback
 15 | 
 16 | # Set up logger
 17 | import log
 18 | logger = log.createCustomLogger('root')
 19 | 
 20 | from alignReads import alignReads
 21 | from filterBackgroundSites import filterBackgroundSites
 22 | from umi import demultiplex, umitag, consolidate
 23 | from visualization import visualizeOfftargets
 24 | import identifyOfftargetSites
 25 | import validation
 26 | 
 27 | DEFAULT_DEMULTIPLEX_MIN_READS = 10000
 28 | DEFAULT_WINDOW_SIZE = 25
 29 | DEFAULT_MAX_SCORE = 7
 30 | 
 31 | CONSOLIDATE_MIN_QUAL = 15
 32 | CONSOLIDATE_MIN_FREQ = 0.9
 33 | 
 34 | 
 35 | class GuideSeq:
 36 | 
 37 |     def __init__(self):
 38 |         pass
 39 | 
 40 |     def parseManifest(self, manifest_path):
 41 |         logger.info('Loading manifest...')
 42 | 
 43 |         with open(manifest_path, 'r') as f:
 44 |             manifest_data = yaml.load(f)
 45 | 
 46 |         try:
 47 |             # Validate manifest data
 48 |             validation.validateManifest(manifest_data)
 49 | 
 50 |             self.BWA_path = manifest_data['bwa']
 51 |             self.bedtools = manifest_data['bedtools']
 52 |             self.reference_genome = manifest_data['reference_genome']
 53 |             self.output_folder = manifest_data['output_folder']
 54 |             self.undemultiplexed = manifest_data['undemultiplexed']
 55 |             self.samples = manifest_data['samples']
 56 | 
 57 |         except Exception as e:
 58 |             logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
 59 |             sys.exit()
 60 | 
 61 |         # Allow the user to specify min reads for demultiplex if they want
 62 |         if 'demultiplex_min_reads' in manifest_data:
 63 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
 64 |         else:
 65 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
 66 |         # Allow the user to specify window size for off-target search
 67 |         if 'window_size' in manifest_data:
 68 |             self.window_size = manifest_data['window_size']
 69 |         else:
 70 |             self.window_size = DEFAULT_WINDOW_SIZE
 71 |         # Allow the user to specify window size for off-target search
 72 |         if 'max_score' in manifest_data:
 73 |             self.max_score = manifest_data['max_score']
 74 |         else:
 75 |             self.max_score = DEFAULT_MAX_SCORE
 76 |         # Allow the user to specify PAM seq. Yichao 3/6/2020
 77 |         if 'PAM' in manifest_data:
 78 |             self.PAM = manifest_data['PAM']
 79 |         else:
 80 |             self.PAM = "NGG"
 81 | 
 82 |         # Make sure the user has specified a control barcode
 83 |         if 'control' not in self.samples.keys():
 84 |             raise AssertionError('Your manifest must have a control sample specified.')
 85 | 
 86 |         # Make sure the user has both a sample and a control
 87 |         if len(self.samples) < 2:
 88 |             raise AssertionError('Your manifest must have at least one control and one treatment sample.')
 89 | 
 90 |         logger.info('Successfully loaded manifest.')
 91 | 
 92 |     def parseManifestDemultiplex(self, manifest_path):
 93 |         logger.info('Loading manifest for demultiplexing...')
 94 | 
 95 |         with open(manifest_path, 'r') as f:
 96 |             manifest_data = yaml.load(f)
 97 | 
 98 |             try:
 99 |                 self.output_folder = manifest_data['output_folder']
100 |                 self.undemultiplexed = manifest_data['undemultiplexed']
101 |                 self.samples = manifest_data['samples']
102 | 
103 |             except Exception as e:
104 |                 logger.error('Incomplete or incorrect manifest file. Please ensure your manifest contains all required fields.')
105 |                 quit()
106 | 
107 |         # Allow the user to specify min reads for demultiplex if they want
108 |         if 'demultiplex_min_reads' in manifest_data:
109 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
110 |         else:
111 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
112 | 
113 |         logger.info('Successfully loaded manifest for single-step demultiplexing.')
114 | 
115 |     def demultiplex(self):
116 | 
117 |         logger.info('Demultiplexing undemultiplexed files...')
118 | 
119 |         # Take our two barcodes and concatenate them
120 |         swapped_sample_barcodes = {}
121 |         for sample in self.samples:
122 |             barcode1 = self.samples[sample]['barcode1']
123 |             barcode2 = self.samples[sample]['barcode2']
124 |             barcode = barcode1[1:8] + barcode2[1:8]
125 |             swapped_sample_barcodes[barcode] = sample
126 | 
127 |         try:
128 |             demultiplex.demultiplex(self.undemultiplexed['forward'],
129 |                                     self.undemultiplexed['reverse'],
130 |                                     self.undemultiplexed['index1'],
131 |                                     self.undemultiplexed['index2'],
132 |                                     swapped_sample_barcodes,
133 |                                     os.path.join(self.output_folder, 'demultiplexed'),
134 |                                     min_reads=self.demultiplex_min_reads)
135 | 
136 |             self.demultiplexed = {}
137 |             for sample in self.samples:
138 |                 self.demultiplexed[sample] = {}
139 |                 self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq')
140 |                 self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq')
141 |                 self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq')
142 |                 self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq')
143 | 
144 |             logger.info('Successfully demultiplexed reads.')
145 |         except Exception as e:
146 |             logger.error('Error demultiplexing reads.')
147 |             logger.error(traceback.format_exc())
148 |             quit()
149 | 
150 |     def umitag(self):
151 |         logger.info('umitagging reads...')
152 | 
153 |         try:
154 |             self.umitagged = {}
155 |             for sample in self.samples:
156 |                 self.umitagged[sample] = {}
157 |                 self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq')
158 |                 self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq')
159 | 
160 |                 umitag.umitag(self.demultiplexed[sample]['read1'],
161 |                               self.demultiplexed[sample]['read2'],
162 |                               self.demultiplexed[sample]['index1'],
163 |                               self.demultiplexed[sample]['index2'],
164 |                               self.umitagged[sample]['read1'],
165 |                               self.umitagged[sample]['read2'],
166 |                               os.path.join(self.output_folder, 'umitagged'))
167 | 
168 |             logger.info('Successfully umitagged reads.')
169 |         except Exception as e:
170 |             logger.error('Error umitagging')
171 |             logger.error(traceback.format_exc())
172 |             quit()
173 | 
174 |     def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL):
175 |         logger.info('Consolidating reads...')
176 | 
177 |         try:
178 |             self.consolidated = {}
179 | 
180 |             for sample in self.samples:
181 |                 self.consolidated[sample] = {}
182 |                 self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq')
183 |                 self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq')
184 | 
185 |                 consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq)
186 |                 consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq)
187 | 
188 |             logger.info('Successfully consolidated reads.')
189 |         except Exception as e:
190 |             logger.error('Error umitagging')
191 |             logger.error(traceback.format_exc())
192 |             quit()
193 | 
194 |     def alignReads(self):
195 |         logger.info('Aligning reads...')
196 | 
197 |         try:
198 |             self.aligned = {}
199 |             for sample in self.samples:
200 |                 sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
201 |                 alignReads(self.BWA_path,
202 |                            self.reference_genome,
203 |                            self.consolidated[sample]['read1'],
204 |                            self.consolidated[sample]['read2'],
205 |                            sample_alignment_path)
206 |                 self.aligned[sample] = sample_alignment_path
207 |                 logger.info('Finished aligning reads to genome.')
208 | 
209 |         except Exception as e:
210 |             logger.error('Error aligning')
211 |             logger.error(traceback.format_exc())
212 |             quit()
213 | 
214 |     def identifyOfftargetSites(self):
215 |         logger.info('Identifying offtarget sites...')
216 | 
217 |         try:
218 |             self.identified = {}
219 | 
220 |             # Identify offtarget sites for each sample
221 |             for sample in self.samples:
222 | 
223 |                 # Prepare sample annotations
224 |                 sample_data = self.samples[sample]
225 |                 annotations = {}
226 |                 annotations['Description'] = sample_data['description']
227 |                 annotations['Targetsite'] = sample
228 | 
229 |                 if sample is 'control':
230 |                     annotations['Sequence'] = ''
231 |                 else:
232 |                     annotations['Sequence'] = sample_data['target']
233 | 
234 |                 samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam')
235 | 
236 |                 self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')
237 | 
238 |                 identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
239 |                                                self.window_size, self.max_score)
240 | 
241 |             logger.info('Finished identifying offtarget sites.')
242 | 
243 |         except Exception as e:
244 |             logger.error('Error identifying offtarget sites.')
245 |             logger.error(traceback.format_exc())
246 |             quit()
247 | 
248 |     def filterBackgroundSites(self):
249 |         logger.info('Filtering background sites')
250 | 
251 |         try:
252 |             self.filtered = {}
253 | 
254 |             # Filter background in each sample
255 |             for sample in self.samples:
256 |                 if sample != 'control':
257 |                     self.filtered[sample] = os.path.join(self.output_folder, 'filtered', sample + '_backgroundFiltered.txt')
258 |                     filterBackgroundSites(self.bedtools, self.identified[sample], self.identified['control'], self.filtered[sample])
259 |                     logger.info('Finished background filtering for {0} sample'.format(sample))
260 | 
261 |             logger.info('Finished filtering background sites.')
262 | 
263 |         except Exception as e:
264 |             logger.error('Error filtering background sites.')
265 |             logger.error(traceback.format_exc())
266 | 
267 |     # def visualize(self):
268 |         # logger.info('Visualizing off-target sites')
269 | 
270 |         # try:
271 |             # for sample in self.samples:
272 |                 # if sample != 'control':
273 |                     # infile = self.identified[sample]
274 |                     # outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
275 |                     # visualizeOfftargets(infile, outfile, title=sample)
276 | 
277 |             # logger.info('Finished visualizing off-target sites')
278 | 
279 |         # except Exception as e:
280 |             # logger.error('Error visualizing off-target sites.')
281 |             # logger.error(traceback.format_exc())
282 |     def visualize(self):
283 |         logger.info('Visualizing off-target sites')
284 | 
285 |         for sample in self.samples: ## 3/6/2020 Yichao solved: visualization stopped when one sample failed
286 |             if sample != 'control':
287 |                 try:
288 |                     infile = self.identified[sample]
289 |                     outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
290 |                     visualizeOfftargets(infile, outfile, title=sample,PAM=self.PAM)
291 |                 except Exception as e:
292 |                     logger.error('Error visualizing off-target sites: %s'%(sample))
293 |                     logger.error(traceback.format_exc())
294 |         logger.info('Finished visualizing off-target sites')
295 | 
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    One sub-command exists per pipeline step ('all', 'demultiplex', 'umitag',
    'consolidate', 'align', 'identify', 'filter', 'visualize'); the chosen
    name is stored in ``args.command``.

    Returns:
        argparse.Namespace holding the selected command and its options.
        Optional arguments that were not supplied and have no default are
        present on the namespace with value None.
    """
    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(description='Individual Step Commands',
                                       help='Use this to run individual steps of the pipeline',
                                       dest='command')
    # On Python 3 sub-commands are optional unless flagged; without this an
    # invocation with no command parses successfully with command=None and the
    # program silently does nothing. Setting the attribute works on Python 2 too.
    subparsers.required = True

    all_parser = subparsers.add_parser('all', help='Run all steps of the pipeline')
    all_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
    all_parser.add_argument('--identifyAndFilter', action='store_true', default=False)

    demultiplex_parser = subparsers.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files')
    demultiplex_parser.add_argument('--manifest', '-m', help='Specify the manifest path', required=True)

    umitag_parser = subparsers.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation')
    for flag in ('--read1', '--read2', '--index1', '--index2', '--outfolder'):
        umitag_parser.add_argument(flag, required=True)

    consolidate_parser = subparsers.add_parser('consolidate', help='Consolidate UMI tagged FASTQs')
    for flag in ('--read1', '--read2', '--outfolder'):
        consolidate_parser.add_argument(flag, required=True)
    consolidate_parser.add_argument('--min_quality', required=False, type=float)
    consolidate_parser.add_argument('--min_frequency', required=False, type=float)

    align_parser = subparsers.add_parser('align', help='Paired end read mapping to genome')
    for flag in ('--bwa', '--genome', '--read1', '--read2', '--outfolder'):
        align_parser.add_argument(flag, required=True)

    identify_parser = subparsers.add_parser('identify', help='Identify GUIDE-seq offtargets')
    for flag in ('--aligned', '--genome', '--outfolder', '--target_sequence'):
        identify_parser.add_argument(flag, required=True)
    identify_parser.add_argument('--description', required=False)
    identify_parser.add_argument('--max_score', required=False, type=int, default=7)
    identify_parser.add_argument('--window_size', required=False, type=int, default=25)

    filter_parser = subparsers.add_parser('filter', help='Filter identified sites from control sites')
    for flag in ('--bedtools', '--identified', '--background', '--outfolder'):
        filter_parser.add_argument(flag, required=True)

    visualize_parser = subparsers.add_parser('visualize', help='Visualize off-target sites')
    visualize_parser.add_argument('--infile', required=True)
    visualize_parser.add_argument('--outfolder', required=True)
    visualize_parser.add_argument('--title', required=False)

    return parser.parse_args()
352 | 
353 | 
def main():
    """Command-line entry point: dispatch ``args.command`` to the matching steps.

    In this visualize-only variant the 'all' command (without
    ``--identifyAndFilter``) loads the manifest and runs only the
    visualization step against previously identified off-target files.
    """
    args = parse_args()

    if args.command == 'all':

        if args.identifyAndFilter:
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                # Bootstrap the aligned samfile paths
                g.aligned = {}
                for sample in g.samples:
                    g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')

                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                print ('Error running only identify and filter.')
                print (traceback.format_exc())
                sys.exit(1)
        else:
            g = GuideSeq()
            g.parseManifest(args.manifest)

            # The upstream steps (demultiplex, umitag, consolidate, align,
            # identify, filter) are intentionally not run here. visualize()
            # reads g.identified, so point it at the files a previous
            # identify run writes under <output_folder>/identified.
            g.identified = {}
            for sample in g.samples:
                g.identified[sample] = os.path.join(g.output_folder, 'identified', sample + '_identifiedOfftargets.txt')

            g.visualize()

    elif args.command == 'demultiplex':
        # Run just the demultiplex step given the manifest
        g = GuideSeq()
        g.parseManifestDemultiplex(args.manifest)
        g.demultiplex()

    elif args.command == 'umitag':
        # Run just the umitag step
        # python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
        g = GuideSeq()
        g.output_folder = args.outfolder
        # Sample name is the input file name up to its first '.'
        sample = os.path.basename(args.read1).split('.')[0]
        g.samples = [sample]
        g.demultiplexed = {sample: {'read1': args.read1,
                                    'read2': args.read2,
                                    'index1': args.index1,
                                    'index2': args.index2}}
        g.umitag()

    elif args.command == 'consolidate':
        # Run just the consolidate step
        # python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/ --min_frequency 0.8 --min_quality 14
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.umitagged = {sample: {'read1': args.read1, 'read2': args.read2}}

        # argparse always defines these attributes (None when the option is
        # omitted), so test the value rather than membership in the namespace.
        min_qual = args.min_quality if args.min_quality is not None else CONSOLIDATE_MIN_QUAL
        min_freq = args.min_frequency if args.min_frequency is not None else CONSOLIDATE_MIN_FREQ

        g.consolidate(min_freq=min_freq, min_qual=min_qual)

    elif args.command == 'align':
        # Run just the alignment step
        # python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa --outfolder test/output/
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.BWA_path = args.bwa
        g.reference_genome = args.genome
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.consolidated = {sample: {'read1': args.read1, 'read2': args.read2}}
        g.alignReads()

    elif args.command == 'identify':
        # Run just the identify step
        # python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa --aligned test/output/aligned/EMX1.sam --outfolder test/output/ --target_sequence GAGTCCGAGCAGAAGAAGAANGG

        # --description is None when omitted; fall back to an empty string.
        description = args.description if args.description is not None else ''
        # --max_score / --window_size carry parser defaults; the extra guard
        # only matters if a caller passes a namespace without them.
        max_score = args.max_score if getattr(args, 'max_score', None) is not None else 7
        window_size = args.window_size if getattr(args, 'window_size', None) is not None else 25

        g = GuideSeq()
        g.output_folder = args.outfolder
        g.reference_genome = args.genome
        sample = os.path.basename(args.aligned).split('.')[0]
        g.samples = {sample: {'description': description, 'target': args.target_sequence}}
        g.aligned = {sample: args.aligned}
        g.max_score = max_score
        g.window_size = window_size
        g.identifyOfftargetSites()

    elif args.command == 'filter':
        # Run just the filter step
        sample = os.path.basename(args.identified).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.bedtools = args.bedtools
        g.samples = {sample: {}, 'control': {}}
        g.identified = {sample: args.identified, 'control': args.background}
        g.filterBackgroundSites()

    elif args.command == 'visualize':
        # Run just the visualize step
        g = GuideSeq()
        # Use the folder as given, like every other command. The previous
        # os.path.dirname() call dropped the last path component whenever the
        # argument had no trailing slash ('test/output' -> 'test').
        g.output_folder = args.outfolder
        # visualize() reads g.PAM; no manifest is loaded on this path, so use
        # the same "NGG" default that parseManifest applies.
        g.PAM = "NGG"
        sample = os.path.basename(args.infile).split('.')[0]
        g.samples = {sample: {}}
        g.identified = {sample: args.infile}
        # NOTE(review): --title is accepted by the parser but GuideSeq.visualize()
        # takes no title argument, so the sample name is used as the title.
        g.visualize()
509 | 
510 | 
511 | if __name__ == '__main__':
512 |     main()
513 | 


--------------------------------------------------------------------------------
/guideseq/guideseq_visualize_only.py~:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | 
  4 | guideseq.py
  5 | ===========
  6 | serves as the wrapper for all guideseq pipeline
  7 | 
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import yaml
 13 | import argparse
 14 | import traceback
 15 | 
 16 | # Set up logger
 17 | import log
 18 | logger = log.createCustomLogger('root')
 19 | 
 20 | from alignReads import alignReads
 21 | from filterBackgroundSites import filterBackgroundSites
 22 | from umi import demultiplex, umitag, consolidate
 23 | from visualization import visualizeOfftargets
 24 | import identifyOfftargetSites
 25 | import validation
 26 | 
 27 | DEFAULT_DEMULTIPLEX_MIN_READS = 10000
 28 | DEFAULT_WINDOW_SIZE = 25
 29 | DEFAULT_MAX_SCORE = 7
 30 | 
 31 | CONSOLIDATE_MIN_QUAL = 15
 32 | CONSOLIDATE_MIN_FREQ = 0.9
 33 | 
 34 | 
 35 | class GuideSeq:
 36 | 
 37 |     def __init__(self):
 38 |         pass
 39 | 
 40 |     def parseManifest(self, manifest_path):
 41 |         logger.info('Loading manifest...')
 42 | 
 43 |         with open(manifest_path, 'r') as f:
 44 |             manifest_data = yaml.load(f)
 45 | 
 46 |         try:
 47 |             # Validate manifest data
 48 |             validation.validateManifest(manifest_data)
 49 | 
 50 |             self.BWA_path = manifest_data['bwa']
 51 |             self.bedtools = manifest_data['bedtools']
 52 |             self.reference_genome = manifest_data['reference_genome']
 53 |             self.output_folder = manifest_data['output_folder']
 54 |             self.undemultiplexed = manifest_data['undemultiplexed']
 55 |             self.samples = manifest_data['samples']
 56 | 
 57 |         except Exception as e:
 58 |             logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
 59 |             sys.exit()
 60 | 
 61 |         # Allow the user to specify min reads for demultiplex if they want
 62 |         if 'demultiplex_min_reads' in manifest_data:
 63 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
 64 |         else:
 65 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
 66 |         # Allow the user to specify window size for off-target search
 67 |         if 'window_size' in manifest_data:
 68 |             self.window_size = manifest_data['window_size']
 69 |         else:
 70 |             self.window_size = DEFAULT_WINDOW_SIZE
 71 |         # Allow the user to specify window size for off-target search
 72 |         if 'max_score' in manifest_data:
 73 |             self.max_score = manifest_data['max_score']
 74 |         else:
 75 |             self.max_score = DEFAULT_MAX_SCORE
 76 | 
 77 |         # Make sure the user has specified a control barcode
 78 |         if 'control' not in self.samples.keys():
 79 |             raise AssertionError('Your manifest must have a control sample specified.')
 80 | 
 81 |         # Make sure the user has both a sample and a control
 82 |         if len(self.samples) < 2:
 83 |             raise AssertionError('Your manifest must have at least one control and one treatment sample.')
 84 | 
 85 |         logger.info('Successfully loaded manifest.')
 86 | 
 87 |     def parseManifestDemultiplex(self, manifest_path):
 88 |         logger.info('Loading manifest for demultiplexing...')
 89 | 
 90 |         with open(manifest_path, 'r') as f:
 91 |             manifest_data = yaml.load(f)
 92 | 
 93 |             try:
 94 |                 self.output_folder = manifest_data['output_folder']
 95 |                 self.undemultiplexed = manifest_data['undemultiplexed']
 96 |                 self.samples = manifest_data['samples']
 97 | 
 98 |             except Exception as e:
 99 |                 logger.error('Incomplete or incorrect manifest file. Please ensure your manifest contains all required fields.')
100 |                 quit()
101 | 
102 |         # Allow the user to specify min reads for demultiplex if they want
103 |         if 'demultiplex_min_reads' in manifest_data:
104 |             self.demultiplex_min_reads = manifest_data['demultiplex_min_reads']
105 |         else:
106 |             self.demultiplex_min_reads = DEFAULT_DEMULTIPLEX_MIN_READS
107 | 
108 |         logger.info('Successfully loaded manifest for single-step demultiplexing.')
109 | 
110 |     def demultiplex(self):
111 | 
112 |         logger.info('Demultiplexing undemultiplexed files...')
113 | 
114 |         # Take our two barcodes and concatenate them
115 |         swapped_sample_barcodes = {}
116 |         for sample in self.samples:
117 |             barcode1 = self.samples[sample]['barcode1']
118 |             barcode2 = self.samples[sample]['barcode2']
119 |             barcode = barcode1[1:8] + barcode2[1:8]
120 |             swapped_sample_barcodes[barcode] = sample
121 | 
122 |         try:
123 |             demultiplex.demultiplex(self.undemultiplexed['forward'],
124 |                                     self.undemultiplexed['reverse'],
125 |                                     self.undemultiplexed['index1'],
126 |                                     self.undemultiplexed['index2'],
127 |                                     swapped_sample_barcodes,
128 |                                     os.path.join(self.output_folder, 'demultiplexed'),
129 |                                     min_reads=self.demultiplex_min_reads)
130 | 
131 |             self.demultiplexed = {}
132 |             for sample in self.samples:
133 |                 self.demultiplexed[sample] = {}
134 |                 self.demultiplexed[sample]['read1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r1.fastq')
135 |                 self.demultiplexed[sample]['read2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.r2.fastq')
136 |                 self.demultiplexed[sample]['index1'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i1.fastq')
137 |                 self.demultiplexed[sample]['index2'] = os.path.join(self.output_folder, 'demultiplexed', sample + '.i2.fastq')
138 | 
139 |             logger.info('Successfully demultiplexed reads.')
140 |         except Exception as e:
141 |             logger.error('Error demultiplexing reads.')
142 |             logger.error(traceback.format_exc())
143 |             quit()
144 | 
145 |     def umitag(self):
146 |         logger.info('umitagging reads...')
147 | 
148 |         try:
149 |             self.umitagged = {}
150 |             for sample in self.samples:
151 |                 self.umitagged[sample] = {}
152 |                 self.umitagged[sample]['read1'] = os.path.join(self.output_folder, 'umitagged', sample + '.r1.umitagged.fastq')
153 |                 self.umitagged[sample]['read2'] = os.path.join(self.output_folder, 'umitagged', sample + '.r2.umitagged.fastq')
154 | 
155 |                 umitag.umitag(self.demultiplexed[sample]['read1'],
156 |                               self.demultiplexed[sample]['read2'],
157 |                               self.demultiplexed[sample]['index1'],
158 |                               self.demultiplexed[sample]['index2'],
159 |                               self.umitagged[sample]['read1'],
160 |                               self.umitagged[sample]['read2'],
161 |                               os.path.join(self.output_folder, 'umitagged'))
162 | 
163 |             logger.info('Successfully umitagged reads.')
164 |         except Exception as e:
165 |             logger.error('Error umitagging')
166 |             logger.error(traceback.format_exc())
167 |             quit()
168 | 
169 |     def consolidate(self, min_freq=CONSOLIDATE_MIN_FREQ, min_qual=CONSOLIDATE_MIN_QUAL):
170 |         logger.info('Consolidating reads...')
171 | 
172 |         try:
173 |             self.consolidated = {}
174 | 
175 |             for sample in self.samples:
176 |                 self.consolidated[sample] = {}
177 |                 self.consolidated[sample]['read1'] = os.path.join(self.output_folder, 'consolidated', sample + '.r1.consolidated.fastq')
178 |                 self.consolidated[sample]['read2'] = os.path.join(self.output_folder, 'consolidated', sample + '.r2.consolidated.fastq')
179 | 
180 |                 consolidate.consolidate(self.umitagged[sample]['read1'], self.consolidated[sample]['read1'], min_qual, min_freq)
181 |                 consolidate.consolidate(self.umitagged[sample]['read2'], self.consolidated[sample]['read2'], min_qual, min_freq)
182 | 
183 |             logger.info('Successfully consolidated reads.')
184 |         except Exception as e:
185 |             logger.error('Error umitagging')
186 |             logger.error(traceback.format_exc())
187 |             quit()
188 | 
189 |     def alignReads(self):
190 |         logger.info('Aligning reads...')
191 | 
192 |         try:
193 |             self.aligned = {}
194 |             for sample in self.samples:
195 |                 sample_alignment_path = os.path.join(self.output_folder, 'aligned', sample + '.sam')
196 |                 alignReads(self.BWA_path,
197 |                            self.reference_genome,
198 |                            self.consolidated[sample]['read1'],
199 |                            self.consolidated[sample]['read2'],
200 |                            sample_alignment_path)
201 |                 self.aligned[sample] = sample_alignment_path
202 |                 logger.info('Finished aligning reads to genome.')
203 | 
204 |         except Exception as e:
205 |             logger.error('Error aligning')
206 |             logger.error(traceback.format_exc())
207 |             quit()
208 | 
209 |     def identifyOfftargetSites(self):
210 |         logger.info('Identifying offtarget sites...')
211 | 
212 |         try:
213 |             self.identified = {}
214 | 
215 |             # Identify offtarget sites for each sample
216 |             for sample in self.samples:
217 | 
218 |                 # Prepare sample annotations
219 |                 sample_data = self.samples[sample]
220 |                 annotations = {}
221 |                 annotations['Description'] = sample_data['description']
222 |                 annotations['Targetsite'] = sample
223 | 
224 |                 if sample is 'control':
225 |                     annotations['Sequence'] = ''
226 |                 else:
227 |                     annotations['Sequence'] = sample_data['target']
228 | 
229 |                 samfile = os.path.join(self.output_folder, 'aligned', sample + '.sam')
230 | 
231 |                 self.identified[sample] = os.path.join(self.output_folder, 'identified', sample + '_identifiedOfftargets.txt')
232 | 
233 |                 identifyOfftargetSites.analyze(samfile, self.reference_genome, self.identified[sample], annotations,
234 |                                                self.window_size, self.max_score)
235 | 
236 |             logger.info('Finished identifying offtarget sites.')
237 | 
238 |         except Exception as e:
239 |             logger.error('Error identifying offtarget sites.')
240 |             logger.error(traceback.format_exc())
241 |             quit()
242 | 
243 |     def filterBackgroundSites(self):
244 |         logger.info('Filtering background sites')
245 | 
246 |         try:
247 |             self.filtered = {}
248 | 
249 |             # Filter background in each sample
250 |             for sample in self.samples:
251 |                 if sample != 'control':
252 |                     self.filtered[sample] = os.path.join(self.output_folder, 'filtered', sample + '_backgroundFiltered.txt')
253 |                     filterBackgroundSites(self.bedtools, self.identified[sample], self.identified['control'], self.filtered[sample])
254 |                     logger.info('Finished background filtering for {0} sample'.format(sample))
255 | 
256 |             logger.info('Finished filtering background sites.')
257 | 
258 |         except Exception as e:
259 |             logger.error('Error filtering background sites.')
260 |             logger.error(traceback.format_exc())
261 | 
262 |     def visualize(self):
263 |         logger.info('Visualizing off-target sites')
264 | 
265 |         try:
266 |             for sample in self.samples:
267 |                 if sample != 'control':
268 |                     infile = self.identified[sample]
269 |                     outfile = os.path.join(self.output_folder, 'visualization', sample + '_offtargets')
270 |                     visualizeOfftargets(infile, outfile, title=sample)
271 | 
272 |             logger.info('Finished visualizing off-target sites')
273 | 
274 |         except Exception as e:
275 |             logger.error('Error visualizing off-target sites.')
276 |             logger.error(traceback.format_exc())
277 | 
278 | 
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    One sub-command exists per pipeline step plus ``all``; the chosen
    sub-command name is stored in the ``command`` attribute of the result.
    """
    top = argparse.ArgumentParser()
    commands = top.add_subparsers(dest='command',
                                  description='Individual Step Commands',
                                  help='Use this to run individual steps of the pipeline')

    def require(sub, *flags):
        # Register each flag as a mandatory option, in the order given.
        for flag in flags:
            sub.add_argument(flag, required=True)

    # all
    cmd = commands.add_parser('all', help='Run all steps of the pipeline')
    cmd.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
    cmd.add_argument('--identifyAndFilter', action='store_true', default=False)

    # demultiplex
    cmd = commands.add_parser('demultiplex', help='Demultiplex undemultiplexed FASTQ files')
    cmd.add_argument('--manifest', '-m', help='Specify the manifest path', required=True)

    # umitag
    cmd = commands.add_parser('umitag', help='UMI tag demultiplexed FASTQ files for consolidation')
    require(cmd, '--read1', '--read2', '--index1', '--index2', '--outfolder')

    # consolidate
    cmd = commands.add_parser('consolidate', help='Consolidate UMI tagged FASTQs')
    require(cmd, '--read1', '--read2', '--outfolder')
    for optional_float in ('--min_quality', '--min_frequency'):
        cmd.add_argument(optional_float, required=False, type=float)

    # align
    cmd = commands.add_parser('align', help='Paired end read mapping to genome')
    require(cmd, '--bwa', '--genome', '--read1', '--read2', '--outfolder')

    # identify
    cmd = commands.add_parser('identify', help='Identify GUIDE-seq offtargets')
    require(cmd, '--aligned', '--genome', '--outfolder', '--target_sequence')
    cmd.add_argument('--description', required=False)
    cmd.add_argument('--max_score', required=False, type=int, default=7)
    cmd.add_argument('--window_size', required=False, type=int, default=25)

    # filter
    cmd = commands.add_parser('filter', help='Filter identified sites from control sites')
    require(cmd, '--bedtools', '--identified', '--background', '--outfolder')

    # visualize
    cmd = commands.add_parser('visualize', help='Visualize off-target sites')
    require(cmd, '--infile', '--outfolder')
    cmd.add_argument('--title', required=False)

    return top.parse_args()
335 | 
336 | 
def main():
    """Command-line entry point: run the sub-command chosen on the command line.

    ``all`` drives the pipeline from a manifest; every other sub-command
    builds a bare ``GuideSeq`` object, assigns just the attributes that one
    stage reads, and runs that stage.
    """
    args = parse_args()

    if args.command == 'all':

        if args.identifyAndFilter:
            # Reuse existing alignments: skip demultiplex, umitag, consolidate
            # and align, and start at the identify step.
            try:
                g = GuideSeq()
                g.parseManifest(args.manifest)

                # Bootstrap the aligned samfile paths
                g.aligned = {}
                for sample in g.samples:
                    g.aligned[sample] = os.path.join(g.output_folder, 'aligned', sample + '.sam')

                g.identifyOfftargetSites()
                g.filterBackgroundSites()
                g.visualize()

            except Exception:
                # print(...) with a single argument behaves the same on
                # Python 2 and Python 3; the statement form is Python 2 only.
                print('Error running only identify and filter.')
                print(traceback.format_exc())
                sys.exit(1)
        else:
            g = GuideSeq()
            g.parseManifest(args.manifest)
            g.demultiplex()
            g.umitag()
            g.consolidate()
            g.alignReads()
            g.identifyOfftargetSites()
            g.filterBackgroundSites()
            g.visualize()

    elif args.command == 'demultiplex':
        # Run just the demultiplex step given the manifest
        g = GuideSeq()
        g.parseManifestDemultiplex(args.manifest)
        g.demultiplex()

    elif args.command == 'umitag':
        # Run just the umitag step
        # python guideseq/guideseq.py umitag --read1 test/data/demultiplexed/EMX1.r1.fastq --read2 test/data/demultiplexed/EMX1.r2.fastq --index1 test/data/demultiplexed/EMX1.i1.fastq --index2 test/data/demultiplexed/EMX1.i2.fastq --outfolder test/output/
        g = GuideSeq()
        g.output_folder = args.outfolder
        # The sample name is the file name up to its first dot.
        sample = os.path.basename(args.read1).split('.')[0]
        g.samples = [sample]
        g.demultiplexed = {sample: {}}
        g.demultiplexed[sample]['read1'] = args.read1
        g.demultiplexed[sample]['read2'] = args.read2
        g.demultiplexed[sample]['index1'] = args.index1
        g.demultiplexed[sample]['index2'] = args.index2
        g.umitag()

    elif args.command == 'consolidate':
        # Run just the consolidate step
        # python guideseq/guideseq.py consolidate --read1 test/data/umitagged/EMX1.r1.umitagged.fastq --read2 test/data/umitagged/EMX1.r2.umitagged.fastq --outfolder test/output/ --min_frequency 0.8 --min_quality 14
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.umitagged = {sample: {}}
        g.umitagged[sample]['read1'] = args.read1
        g.umitagged[sample]['read2'] = args.read2

        # argparse always defines these attributes (None when the flag is
        # omitted), so test the value rather than attribute presence;
        # otherwise None would be forwarded instead of the default.
        min_qual = args.min_quality if args.min_quality is not None else CONSOLIDATE_MIN_QUAL
        min_freq = args.min_frequency if args.min_frequency is not None else CONSOLIDATE_MIN_FREQ

        g.consolidate(min_freq=min_freq, min_qual=min_qual)

    elif args.command == 'align':
        # Run just the alignment step
        # python guideseq/guideseq.py align --bwa bwa --read1 test/data/consolidated/EMX1.r1.consolidated.fastq --read2 test/data/consolidated/EMX1.r2.consolidated.fastq --genome /Volumes/Media/hg38/hg38.fa --outfolder test/output/
        sample = os.path.basename(args.read1).split('.')[0]
        g = GuideSeq()
        g.BWA_path = args.bwa
        g.reference_genome = args.genome
        g.output_folder = args.outfolder
        g.samples = [sample]
        g.consolidated = {sample: {}}
        g.consolidated[sample]['read1'] = args.read1
        g.consolidated[sample]['read2'] = args.read2
        g.alignReads()

    elif args.command == 'identify':
        # Run just the identify step
        # python guideseq/guideseq.py identify --genome /Volumes/Media/hg38/hg38.fa --aligned test/output/aligned/EMX1.sam --outfolder test/output/ --target_sequence GAGTCCGAGCAGAAGAAGAANGG

        # --description has no parser default, so it is None when omitted.
        description = args.description if args.description is not None else ''

        g = GuideSeq()
        g.output_folder = args.outfolder
        g.reference_genome = args.genome
        sample = os.path.basename(args.aligned).split('.')[0]
        g.samples = {sample: {'description': description, 'target': args.target_sequence}}
        g.aligned = {sample: args.aligned}
        # --max_score / --window_size carry their defaults (7 / 25) in the parser.
        g.max_score = args.max_score
        g.window_size = args.window_size
        g.identifyOfftargetSites()

    elif args.command == 'filter':
        # Run just the filter step
        sample = os.path.basename(args.identified).split('.')[0]
        g = GuideSeq()
        g.output_folder = args.outfolder
        g.bedtools = args.bedtools
        g.samples = {sample: {}, 'control': {}}
        g.identified = {}
        g.identified[sample] = args.identified
        g.identified['control'] = args.background
        g.filterBackgroundSites()

    elif args.command == 'visualize':
        # Run just the visualize step
        g = GuideSeq()
        # NOTE(review): unlike the other commands this takes the dirname of
        # --outfolder, so 'out' and 'out/' resolve to different folders, and
        # --title is parsed but never used. Left unchanged; confirm intent.
        g.output_folder = os.path.dirname(args.outfolder)
        sample = os.path.basename(args.infile).split('.')[0]
        g.samples = {sample: {}}
        g.identified = {}
        g.identified[sample] = args.infile
        g.visualize()
492 | 
493 | 
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
496 | 


--------------------------------------------------------------------------------
/guideseq/log.py:
--------------------------------------------------------------------------------
 1 | """
 2 | log.py
 3 | =====
 4 | 
 5 | Setup logging utils for nested module logging
 6 | 
 7 | Adapted from the accepted answer here: http://stackoverflow.com/questions/7621897/python-logging-module-globally
 8 | """
 9 | 
10 | import logging
11 | 
12 | 
def createCustomLogger(name):
    """Return the logger called ``name``, set to DEBUG with a stream handler.

    Records are formatted as ``[date time][LEVEL][module] message``.
    ``logging.getLogger`` returns the same object for the same name, so the
    handler is attached only when the logger has none yet; attaching one on
    every call would emit each record once per call made.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    if not logger.handlers:
        formatter = logging.Formatter(fmt='[%(asctime)s][%(levelname)s][%(module)s] %(message)s', datefmt='%m/%d %I:%M:%S%p')

        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
23 | 


--------------------------------------------------------------------------------
/guideseq/validation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | validation.py
  3 | =============
  4 | 
  5 | Contains utils for validating the filetype and existence of manifest-defined files/folders
  6 | 
  7 | """
  8 | 
  9 | import logging
 10 | import os
 11 | import sys
 12 | from distutils.spawn import find_executable
 13 | 
 14 | logger = logging.getLogger('root')
 15 | 
 16 | 
 17 | def exists(filepath):
 18 |     if not os.path.isfile(filepath):
 19 |         logger.error('{0} does not exist'.format(filepath))
 20 |         sys.exit()
 21 | 
 22 | 
 23 | def checkIfBinary(filepath):
 24 |     executable = find_executable(filepath)
 25 | 
 26 |     if executable is None:
 27 |         logger.error('Executable binary not found at {0}'.format(filepath))
 28 |         sys.exit()
 29 | 
 30 |     # First check if file exists
 31 |     exists(executable)
 32 | 
 33 |     # Check if file is a valid binary
 34 |     # Adapted from http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
 35 |     textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})
 36 |     is_binary_string = lambda bytes: bool(bytes.translate(None, textchars))
 37 | 
 38 |     if not is_binary_string(open(executable, 'rb').read(1024)):
 39 |         logger.error('{0} is not a valid binary'.format(executable))
 40 |         sys.exit()
 41 | 
 42 | 
 43 | def checkIfFasta(filepath):
 44 |     # First check if file exists
 45 |     exists(os.path.abspath(filepath))
 46 | 
 47 | 
 48 | def checkIfFolder(folderpath):
 49 |     # Check if the folder exists
 50 |     if not os.path.isdir(os.path.abspath(folderpath)):
 51 |         logger.error('{0} is not a valid folder path'.format(folderpath))
 52 |         sys.exit()
 53 | 
 54 | 
 55 | def checkIfValidUndemultiplexed(undemultiplexed):
 56 |     # Check if read1, read2, index1, and index2 exist
 57 |     fields = ['forward', 'reverse', 'index1', 'index2']
 58 | 
 59 |     if set(fields) != set(undemultiplexed.keys()):
 60 |         logger.error('Undemultiplexed field must contain references to "forward", "reverse", "index1", "index2"')
 61 |         sys.exit()
 62 | 
 63 |     invalid_file = False
 64 |     for field in fields:
 65 |         if not os.path.isfile(undemultiplexed[field]):
 66 |             logger.error('"read1" undemultiplexed field does not reference a valid file')
 67 |             invalid_file = True
 68 | 
 69 |     if invalid_file:
 70 |         sys.exit()
 71 | 
 72 | 
 73 | def checkIfValidSamples(samples):
 74 |     # Check if control is one of the samples
 75 |     if 'control' not in samples:
 76 |         logger.error('A control sample must be specified')
 77 |         sys.exit()
 78 | 
 79 |     if len(samples.keys()) == 0:
 80 |         logger.error('No samples defined')
 81 |         sys.exit()
 82 | 
 83 |     for sample in samples:
 84 |         if 'barcode1' not in samples[sample] or 'barcode2' not in samples[sample]:
 85 |             logger.error('barcode1 and barcode2 must be specified for {0} sample'.format(sample))
 86 |             sys.exit()
 87 |         if 'target' not in samples[sample]:
 88 |             logger.error('target sequence must be specified for {0} sample'.format(sample))
 89 |             sys.exit()
 90 | 
 91 | 
 92 | def validateManifest(manifest_data):
 93 |     # Check if manifest contains the required fields
 94 |     fields = ['bwa', 'bedtools', 'reference_genome', 'output_folder', 'samples', 'undemultiplexed']
 95 |     missing_fields = False
 96 | 
 97 |     for field in fields:
 98 |         if field not in manifest_data.keys():
 99 |             logger.error('"{0}" field must be specified in manifest'.format(field))
100 |             missing_fields = True
101 | 
102 |     if missing_fields:
103 |         sys.exit()
104 | 
105 |     # Now validate each field
106 |     checkIfBinary(manifest_data['bwa'])
107 |     checkIfBinary(manifest_data['bedtools'])
108 |     checkIfFasta(manifest_data['reference_genome'])
109 |     checkIfValidUndemultiplexed(manifest_data['undemultiplexed'])
110 |     checkIfValidSamples(manifest_data['samples'])
111 | 


--------------------------------------------------------------------------------
/guideseq/visualization.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import svgwrite
  4 | import os
  5 | import logging
  6 | import argparse
  7 | 
  8 | 
  9 | logger = logging.getLogger('root')
 10 | logger.propagate = False
 11 | 
# NOTE(review): boxWidth and v_spacing are not referenced in the part of this
# file visible here — confirm they are still used before relying on them.
boxWidth = 10
# Side length of one square nucleotide cell drawn by visualizeOfftargets.
box_size = 15
v_spacing = 3

# Fill colour of a cell, looked up by the character it shows.
colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', '-': '#FFFFFF'}
 17 | 
 18 | def parseSitesFile(infile):
 19 |     offtargets = []
 20 |     total_seq = 0
 21 |     with open(infile, 'r') as f:
 22 |         f.readline()
 23 |         for line in f:
 24 |             line = line.rstrip('\n')
 25 |             line_items = line.split('\t')
 26 |             offtarget_reads = line_items[9]
 27 |             no_bulge_offtarget_sequence = line_items[19]
 28 |             bulge_offtarget_sequence = line_items[24]
 29 |             target_seq = line_items[35]
 30 |             realigned_target_seq = line_items[36]
 31 | 
 32 |             if no_bulge_offtarget_sequence != '' or bulge_offtarget_sequence != '':
 33 |                 if no_bulge_offtarget_sequence:
 34 |                     total_seq += 1
 35 |                 if bulge_offtarget_sequence:
 36 |                     total_seq += 1
 37 |                 offtargets.append({'seq': no_bulge_offtarget_sequence.strip(),
 38 |                                    'bulged_seq': bulge_offtarget_sequence.strip(),
 39 |                                    'reads': int(offtarget_reads.strip()),
 40 |                                    'target_seq': target_seq.strip(),
 41 |                                    'realigned_target_seq': realigned_target_seq.strip()
 42 |                                    })
 43 |     offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
 44 |     return offtargets, target_seq, total_seq
 45 | 
 46 | 
 47 | def visualizeOfftargets(infile, outfile, title=None, PAM="NGG"):
 48 |     # Note: PAM is not currently used
 49 |     output_folder = os.path.dirname(outfile)
 50 |     if not os.path.exists(output_folder):
 51 |         os.makedirs(output_folder)
 52 | 
 53 |     # Get offtargets array from file
 54 |     offtargets, target_seq, total_seq = parseSitesFile(infile)
 55 | 
 56 |     # Initiate canvas
 57 |     dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))
 58 | 
 59 |     if title is not None:
 60 |         # Define top and left margins
 61 |         x_offset = 20
 62 |         y_offset = 50
 63 |         dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
 64 |     else:
 65 |         # Define top and left margins
 66 |         x_offset = 20
 67 |         y_offset = 20
 68 | 
 69 |     # Draw ticks
 70 |     tick_locations = [1, len(target_seq)]  # limits
 71 |     if target_seq.find('N') >= 0:
 72 |         if target_seq.index('N') > len(target_seq)/2:  # PAM on the right end
 73 |             tick_locations += range(len(target_seq) + 1)[::10][1:]  # intermediate values
 74 |             tick_locations += range(len(target_seq) + 1)[len(target_seq) - 2: len(target_seq)]  # complementing PAM
 75 |             tick_locations.sort()
 76 |             tick_legend = [str(x) for x in tick_locations[:-3][::-1]] + ['P', 'A', 'M']
 77 |         else:
 78 |             tick_locations += [range(4, len(target_seq) + 1)[::10][1]]
 79 |             tick_locations += range(2, 4) + [5]
 80 |             tick_locations.sort()
 81 |             tick_legend = ['P', 'A', 'M'] + [str(x) for x in [str(x - 4) for x in tick_locations[3:]]]
 82 |         for x, y in zip(tick_locations, tick_legend):
 83 |             dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))
 84 |     else:
 85 |         tick_locations = [1, len(target_seq)]
 86 |         tick_locations += range(len(target_seq) + 1)[::10][1:]
 87 |         for x in tick_locations:
 88 |             dwg.add(dwg.text(str(x), insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))
 89 | 
 90 |     for x,y in zip(tick_locations, tick_legend):
 91 |         dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))
 92 | 
 93 |     # Draw reference sequence row
 94 |     for i, c in enumerate(target_seq):
 95 |         y = y_offset
 96 |         x = x_offset + i * box_size
 97 |         dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
 98 |         dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))
 99 | 
100 |     dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
101 | 
102 |     # Draw aligned sequence rows
103 |     y_offset += 1  # leave some extra space after the reference row
104 |     line_number = 0  # keep track of plotted sequences
105 |     for j, seq in enumerate(offtargets):
106 |         realigned_target_seq = offtargets[j]['realigned_target_seq']
107 |         no_bulge_offtarget_sequence = offtargets[j]['seq']
108 |         bulge_offtarget_sequence = offtargets[j]['bulged_seq']
109 | 
110 |         if no_bulge_offtarget_sequence != '':
111 |             k = 0
112 |             line_number += 1
113 |             y = y_offset + line_number * box_size
114 |             for i, (c, r) in enumerate(zip(no_bulge_offtarget_sequence, target_seq)):
115 |                 x = x_offset + k * box_size
116 |                 if r == '-':
117 |                     if 0 < k < len(target_seq):
118 |                         x = x_offset + (k - 0.25) * box_size
119 |                         dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
120 |                         dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
121 |                 elif c == r:
122 |                     dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
123 |                     k += 1
124 |                 elif r == 'N':
125 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
126 |                     k += 1
127 |                 else:
128 |                     dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
129 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
130 |                     k += 1
131 |         if bulge_offtarget_sequence != '':
132 |             k = 0
133 |             line_number += 1
134 |             y = y_offset + line_number * box_size
135 |             for i, (c, r) in enumerate(zip(bulge_offtarget_sequence, realigned_target_seq)):
136 |                 x = x_offset + k * box_size
137 |                 if r == '-':
138 |                     if 0 < k < len(realigned_target_seq):
139 |                         x = x_offset + (k - 0.25) * box_size
140 |                         dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
141 |                         dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
142 |                 elif c == r:
143 |                     dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
144 |                     k += 1
145 |                 elif r == 'N':
146 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
147 |                     k += 1
148 |                 else:
149 |                     dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
150 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
151 |                     k += 1
152 | 
153 |         if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
154 |             reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 2) - 2),
155 |                                   fill='black', style="font-size:15px; font-family:Courier")
156 |             dwg.add(reads_text)
157 |         else:
158 |             reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 1) + 5),
159 |                                   fill='black', style="font-size:15px; font-family:Courier")
160 |             dwg.add(reads_text)
161 |             reads_text02 = dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, y_offset + box_size * (line_number + 1) + 5),
162 |                                   fill='black', style="font-size:23px; font-family:Courier")
163 |             dwg.add(reads_text02)
164 |     dwg.save()
165 | 
166 | 
def main():
    """Command-line entry point for rendering an off-target plot.

    Three options are mandatory: ``--identified`` (input table), ``--outfile``
    (output path) and ``--title`` (plot title). The parsed namespace is echoed
    to stdout and the values are then forwarded to ``visualizeOfftargets``.
    """
    cli = argparse.ArgumentParser(description='Plot visualization plots for aligned reads.')
    required_options = (
        ("--identified", "Full path to sample identified output"),
        ("--outfile", "Full path to output file"),
        ("--title", "Plot title"),
    )
    for flag, description in required_options:
        cli.add_argument(flag, help=description, required=True)
    options = cli.parse_args()

    print(options)

    visualizeOfftargets(options.identified, options.outfile, title=options.title)
177 | 
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":

    main()
181 | 


--------------------------------------------------------------------------------
/guideseq/visualization2.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import svgwrite
  3 | import sys
  4 | import os
  5 | import logging
  6 | 
# Module logger, fetched under the name 'root'; propagation is switched off so
# its records are not handed on to ancestor loggers.
logger = logging.getLogger('root')
logger.propagate = False

# Drawing geometry. Of these, only box_size is referenced by the functions in
# this file: it is the side length of each nucleotide cell and the vertical
# distance between consecutive rows.
boxWidth = 10
box_size = 15
v_spacing = 3

# Fill color (hex) used for the box drawn behind each sequence character.
colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#FFFFFF'}
 15 | 
 16 | 
 17 | def parseSitesFile(infile):
 18 | 	offtargets = []
 19 | 	total_seq = 0
 20 | 	with open(infile, 'r') as f:
 21 | 		f.readline()
 22 | 		for line in f:
 23 | 			line = line.rstrip('\n')
 24 | 			line_items = line.split('\t')
 25 | 			offtarget_reads = line_items[11]
 26 | 			no_bulge_offtarget_sequence = line_items[24]
 27 | 			bulge_offtarget_sequence = line_items[29]
 28 | 			target_seq = line_items[40]
 29 | 			realigned_target_seq = line_items[41]
 30 | 
 31 | 			if no_bulge_offtarget_sequence != '' or bulge_offtarget_sequence != '':
 32 | 				if no_bulge_offtarget_sequence:
 33 | 					total_seq += 1
 34 | 				if bulge_offtarget_sequence:
 35 | 					total_seq += 1
 36 | 				offtargets.append({'seq': no_bulge_offtarget_sequence.strip(),
 37 | 								   'bulged_seq': bulge_offtarget_sequence.strip(),
 38 | 								   'reads': int(offtarget_reads.strip()),
 39 | 								   'target_seq': target_seq.strip(),
 40 | 								   'realigned_target_seq': realigned_target_seq.strip()
 41 | 								   })
 42 | 	offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
 43 | 	return offtargets, target_seq, total_seq
 44 | 
 45 | 
 46 | def visualizeOfftargets(infile, outfile, title,PAM):
 47 | 
 48 | 	output_folder = os.path.dirname(outfile)
 49 | 	if not os.path.exists(output_folder):
 50 | 		os.makedirs(output_folder)
 51 | 
 52 | 	# Get offtargets array from file
 53 | 	offtargets, target_seq, total_seq = parseSitesFile(infile)
 54 | 
 55 | 	# Initiate canvas
 56 | 	dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))
 57 | 
 58 | 	if title is not None:
 59 | 		# Define top and left margins
 60 | 		x_offset = 20
 61 | 		y_offset = 50
 62 | 		dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
 63 | 	else:
 64 | 		# Define top and left margins
 65 | 		x_offset = 20
 66 | 		y_offset = 20
 67 | 
 68 | 	# Draw ticks
 69 | 	# tick_locations = [1, len(target_seq)]  # limits
 70 | 	# if target_seq.index('N') > len(target_seq)/2:  # PAM on the right end
 71 | 		# tick_locations += range(len(target_seq) + 1)[::10][1:]  # intermediate values
 72 | 		# tick_locations += range(len(target_seq) + 1)[len(target_seq) - 2: len(target_seq)]  # complementing PAM
 73 | 		# tick_locations.sort()
 74 | 		# tick_legend = [str(x) for x in tick_locations[:-3][::-1]] + ['P', 'A', 'M']
 75 | 	# else:
 76 | 		# tick_locations += [range(3, len(target_seq) + 1)[::10][1]]
 77 | 		# tick_locations += range(2, 5)
 78 | 		# tick_locations.sort()
 79 | 		# tick_legend = ['P', 'A', 'M'] + [str(x) for x in [str(x-3) for x in tick_locations[3:]]]
 80 | 	## Assume PAM is on the right end
 81 | 	tick_locations = []
 82 | 	tick_legend = []
 83 | 	PAM_index = target_seq.index(PAM)
 84 | 	print (PAM_index)
 85 | 	count = 0
 86 | 	for i in range(PAM_index,0,-1):
 87 | 		print (i)
 88 | 		count = count+1
 89 | 		if count % 10 == 0:
 90 | 			tick_legend.append(count)
 91 | 			tick_locations.append(i)
 92 | 	tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
 93 | 	tick_locations+=range(PAM_index+1,len(target_seq)+1)
 94 | 	
 95 | 
 96 | 
 97 | 	for x,y in zip(tick_locations, tick_legend):
 98 | 		dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))
 99 | 
100 | 	# Draw reference sequence row
101 | 	for i, c in enumerate(target_seq):
102 | 		y = y_offset
103 | 		x = x_offset + i * box_size
104 | 		if i < PAM_index:
105 | 			dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
106 | 		else:
107 | 			dwg.add(dwg.rect((x, y), (box_size, box_size), fill="#B3B3B3"))
108 | 		dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))
109 | 
110 | 	dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
111 | 
112 | 	# Draw aligned sequence rows
113 | 	y_offset += 1  # leave some extra space after the reference row
114 | 	line_number = 0  # keep track of plotted sequences
115 | 	for j, seq in enumerate(offtargets):
116 | 		realigned_target_seq = offtargets[j]['realigned_target_seq']
117 | 		no_bulge_offtarget_sequence = offtargets[j]['seq']
118 | 		bulge_offtarget_sequence = offtargets[j]['bulged_seq']
119 | 
120 | 		if no_bulge_offtarget_sequence != '':
121 | 			k = 0
122 | 			line_number += 1
123 | 			y = y_offset + line_number * box_size
124 | 			for i, (c, r) in enumerate(zip(no_bulge_offtarget_sequence, target_seq)):
125 | 				x = x_offset + k * box_size
126 | 				if r == '-':
127 | 					if 0 < k < len(target_seq):
128 | 						x = x_offset + (k - 0.25) * box_size
129 | 						if i < PAM_index:
130 | 							dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
131 | 						else:
132 | 							dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill="#FFFFFF"))
133 | 						dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
134 | 				elif c == r:
135 | 					dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
136 | 					k += 1
137 | 				elif r == 'N':
138 | 					dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
139 | 					k += 1
140 | 				else:
141 | 					if i < PAM_index:
142 | 						dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
143 | 					else:
144 | 						dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill="#FFFFFF"))
145 | 					dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
146 | 					k += 1
147 | 		if bulge_offtarget_sequence != '':
148 | 			k = 0
149 | 			line_number += 1
150 | 			y = y_offset + line_number * box_size
151 | 			for i, (c, r) in enumerate(zip(bulge_offtarget_sequence, realigned_target_seq)):
152 | 				x = x_offset + k * box_size
153 | 				if r == '-':
154 | 					if 0 < k < len(realigned_target_seq):
155 | 						x = x_offset + (k - 0.25) * box_size
156 | 						if i < PAM_index:
157 | 							dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
158 | 						else:
159 | 							dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill="#FFFFFF"))
160 | 						dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
161 | 				elif c == r:
162 | 					dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
163 | 					k += 1
164 | 				elif r == 'N':
165 | 					dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
166 | 					k += 1
167 | 				else:
168 | 					if i < PAM_index:
169 | 						dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
170 | 					else:
171 | 						dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill="#FFFFFF"))
172 | 					dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
173 | 					k += 1
174 | 
175 | 		if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
176 | 			reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 2) - 2),
177 | 								  fill='black', style="font-size:15px; font-family:Courier")
178 | 			dwg.add(reads_text)
179 | 		else:
180 | 			reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 1) + 5),
181 | 								  fill='black', style="font-size:15px; font-family:Courier")
182 | 			dwg.add(reads_text)
183 | 			reads_text02 = dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, y_offset + box_size * (line_number + 1) + 5),
184 | 								  fill='black', style="font-size:23px; font-family:Courier")
185 | 			dwg.add(reads_text02)
186 | 	dwg.save()
187 | 
188 | 
def main():
	"""Command-line entry point: ``INFILE OUTFILE TITLE PAM``.

	Prints the usage line and returns when fewer than four arguments are
	given. Errors raised while rendering are allowed to propagate, so a
	real failure is not misreported as a usage problem.
	"""
	if len(sys.argv) < 5:
		print('Usage: python visualization.py INFILE OUTFILE TITLE PAM')
		return
	visualizeOfftargets(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
195 | 
196 | 
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()
199 | 


--------------------------------------------------------------------------------
/guideseq/visualization_bk.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import svgwrite
  3 | import sys
  4 | import os
  5 | import logging
  6 | 
# Module logger, fetched under the name 'root'; propagation is switched off so
# its records are not handed on to ancestor loggers.
logger = logging.getLogger('root')
logger.propagate = False

# Drawing geometry. Of these, only box_size is referenced by the functions in
# this file: it is the side length of each nucleotide cell and the vertical
# distance between consecutive rows.
boxWidth = 10
box_size = 15
v_spacing = 3

# Fill color (hex) used for the box drawn behind each sequence character.
colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#FFFFFF'}
 15 | 
 16 | 
 17 | def parseSitesFile(infile):
 18 |     offtargets = []
 19 |     total_seq = 0
 20 |     with open(infile, 'r') as f:
 21 |         f.readline()
 22 |         for line in f:
 23 |             line = line.rstrip('\n')
 24 |             line_items = line.split('\t')
 25 |             offtarget_reads = line_items[11]
 26 |             no_bulge_offtarget_sequence = line_items[24]
 27 |             bulge_offtarget_sequence = line_items[29]
 28 |             target_seq = line_items[40]
 29 |             realigned_target_seq = line_items[41]
 30 | 
 31 |             if no_bulge_offtarget_sequence != '' or bulge_offtarget_sequence != '':
 32 |                 if no_bulge_offtarget_sequence:
 33 |                     total_seq += 1
 34 |                 if bulge_offtarget_sequence:
 35 |                     total_seq += 1
 36 |                 offtargets.append({'seq': no_bulge_offtarget_sequence.strip(),
 37 |                                    'bulged_seq': bulge_offtarget_sequence.strip(),
 38 |                                    'reads': int(offtarget_reads.strip()),
 39 |                                    'target_seq': target_seq.strip(),
 40 |                                    'realigned_target_seq': realigned_target_seq.strip()
 41 |                                    })
 42 |     offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
 43 |     return offtargets, target_seq, total_seq
 44 | 
 45 | 
 46 | def visualizeOfftargets(infile, outfile, title=None):
 47 | 
 48 |     output_folder = os.path.dirname(outfile)
 49 |     if not os.path.exists(output_folder):
 50 |         os.makedirs(output_folder)
 51 | 
 52 |     # Get offtargets array from file
 53 |     offtargets, target_seq, total_seq = parseSitesFile(infile)
 54 | 
 55 |     # Initiate canvas
 56 |     dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))
 57 | 
 58 |     if title is not None:
 59 |         # Define top and left margins
 60 |         x_offset = 20
 61 |         y_offset = 50
 62 |         dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
 63 |     else:
 64 |         # Define top and left margins
 65 |         x_offset = 20
 66 |         y_offset = 20
 67 | 
 68 |     # Draw ticks
 69 |     tick_locations = [1, len(target_seq)]  # limits
 70 |     if target_seq.index('N') > len(target_seq)/2:  # PAM on the right end
 71 |         tick_locations += range(len(target_seq) + 1)[::10][1:]  # intermediate values
 72 |         tick_locations += range(len(target_seq) + 1)[len(target_seq) - 2: len(target_seq)]  # complementing PAM
 73 |         tick_locations.sort()
 74 |         tick_legend = [str(x) for x in tick_locations[:-3][::-1]] + ['P', 'A', 'M']
 75 |     else:
 76 |         tick_locations += [range(3, len(target_seq) + 1)[::10][1]]
 77 |         tick_locations += range(2, 5)
 78 |         tick_locations.sort()
 79 |         tick_legend = ['P', 'A', 'M'] + [str(x) for x in [str(x-3) for x in tick_locations[3:]]]
 80 | 
 81 |     for x,y in zip(tick_locations, tick_legend):
 82 |         dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))
 83 | 
 84 |     # Draw reference sequence row
 85 |     for i, c in enumerate(target_seq):
 86 |         y = y_offset
 87 |         x = x_offset + i * box_size
 88 |         dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
 89 |         dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))
 90 | 
 91 |     dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
 92 | 
 93 |     # Draw aligned sequence rows
 94 |     y_offset += 1  # leave some extra space after the reference row
 95 |     line_number = 0  # keep track of plotted sequences
 96 |     for j, seq in enumerate(offtargets):
 97 |         realigned_target_seq = offtargets[j]['realigned_target_seq']
 98 |         no_bulge_offtarget_sequence = offtargets[j]['seq']
 99 |         bulge_offtarget_sequence = offtargets[j]['bulged_seq']
100 | 
101 |         if no_bulge_offtarget_sequence != '':
102 |             k = 0
103 |             line_number += 1
104 |             y = y_offset + line_number * box_size
105 |             for i, (c, r) in enumerate(zip(no_bulge_offtarget_sequence, target_seq)):
106 |                 x = x_offset + k * box_size
107 |                 if r == '-':
108 |                     if 0 < k < len(target_seq):
109 |                         x = x_offset + (k - 0.25) * box_size
110 |                         dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
111 |                         dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
112 |                 elif c == r:
113 |                     dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
114 |                     k += 1
115 |                 elif r == 'N':
116 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
117 |                     k += 1
118 |                 else:
119 |                     dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
120 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
121 |                     k += 1
122 |         if bulge_offtarget_sequence != '':
123 |             k = 0
124 |             line_number += 1
125 |             y = y_offset + line_number * box_size
126 |             for i, (c, r) in enumerate(zip(bulge_offtarget_sequence, realigned_target_seq)):
127 |                 x = x_offset + k * box_size
128 |                 if r == '-':
129 |                     if 0 < k < len(realigned_target_seq):
130 |                         x = x_offset + (k - 0.25) * box_size
131 |                         dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
132 |                         dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
133 |                 elif c == r:
134 |                     dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
135 |                     k += 1
136 |                 elif r == 'N':
137 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
138 |                     k += 1
139 |                 else:
140 |                     dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
141 |                     dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
142 |                     k += 1
143 | 
144 |         if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
145 |             reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 2) - 2),
146 |                                   fill='black', style="font-size:15px; font-family:Courier")
147 |             dwg.add(reads_text)
148 |         else:
149 |             reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 1) + 5),
150 |                                   fill='black', style="font-size:15px; font-family:Courier")
151 |             dwg.add(reads_text)
152 |             reads_text02 = dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, y_offset + box_size * (line_number + 1) + 5),
153 |                                   fill='black', style="font-size:23px; font-family:Courier")
154 |             dwg.add(reads_text02)
155 |     dwg.save()
156 | 
157 | 
def main():
    """Command-line entry point: ``INFILE OUTFILE [TITLE]``.

    With fewer than two arguments the usage line is printed. The title is
    taken from the third argument only when exactly three are supplied;
    otherwise the plot is drawn without one.
    """
    argv = sys.argv
    if len(argv) < 3:
        print('Usage: python visualization.py INFILE OUTFILE [TITLE]')
        return
    plot_title = argv[3] if len(argv) == 4 else None
    visualizeOfftargets(argv[1], argv[2], title=plot_title)
167 | 
168 | 
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
171 | 


--------------------------------------------------------------------------------
/guideseq_flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/guideseq_flowchart.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | HTSeq
2 | PyYAML
3 | pyfaidx
4 | svgwrite
5 | regex
6 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [nosetests]
2 | verbosity=1
3 | detailed-errors=1
4 | exe=1
5 | where=test/


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from setuptools import setup, find_packages
 5 | import guideseq
 6 | 
 7 | import os
 8 | if os.path.isfile("README.MD"):
 9 | 	with open("README.MD", "r") as fh:
10 | 		long_description = fh.read()
11 | else:
12 | 	long_description="guide-seq"
13 | 
14 | 
# Package metadata and build configuration. The version is read from the
# imported guideseq package, so that package must be importable when this
# script runs.
setup(
	name='guide_seq',
	version=str(guideseq.__version__),
	description="An easy to use bioinformatic pipeline for the GUIDE-seq assay.",
	author="Shengdar Q Tsai, Martin Aryee, Ved V Topkar",
	author_email='STSAI4@mgh.harvard.edu, Aryee.Martin@mgh.harvard.edu, vedtopkar@gmail.com',
	url='https://github.com/tsailabSJ/guideseq',
	# packages=find_packages(),
	packages=[
		'guideseq',
		'umi',
	],
	# 'umi' is exposed as a top-level package backed by the guideseq/umi folder.
	package_dir={'guideseq':
				 'guideseq','umi':'guideseq/umi'},
	
	# Pipeline modules that are additionally installed as standalone scripts.
	scripts=['guideseq/guideseq.py','guideseq/alignReads.py','guideseq/visualization.py',
		'guideseq/filterBackgroundSites.py','guideseq/identifyOfftargetSites.py','guideseq/log.py',
		'guideseq/validation.py'],
	# NOTE(review): 'test' is not listed in `packages` above; confirm this
	# entry has the intended effect.
	package_data={'test': ["test/*"]},
	# NOTE(review): the license is given as AGPL here, but the classifier
	# below declares GPLv2; confirm which one is intended.
	license="AGPL",
	include_package_data=True,
	long_description=long_description,
	long_description_content_type='text/markdown',
	keywords='guideseq',
	classifiers=[
		'Development Status :: 4 - Beta',
		'Intended Audience :: Science/Research',
		'Topic :: Scientific/Engineering :: Bio-Informatics',
		'Topic :: Scientific/Engineering :: Visualization',
		'Topic :: Scientific/Engineering :: Information Analysis',
		'License :: OSI Approved :: GNU General Public License v2 (GPLv2)',
		'Operating System :: Unix',
		'Natural Language :: English',
		"Programming Language :: Python :: 2",
		'Programming Language :: Python :: 3'
	]
)
52 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 


--------------------------------------------------------------------------------
/test/data/demultiplexed/undetermined.i1.fastq:
--------------------------------------------------------------------------------
1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 1:N:0:0
2 | CAGGCATG
3 | +
4 | CCCCCCFF
5 | 


--------------------------------------------------------------------------------
/test/data/demultiplexed/undetermined.i2.fastq:
--------------------------------------------------------------------------------
1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 2:N:0:0
2 | TGGATTGTTTTATGTC
3 | +
4 | CBCCCFFFFFFFGGGG
5 | 


--------------------------------------------------------------------------------
/test/data/demultiplexed/undetermined.r1.fastq:
--------------------------------------------------------------------------------
1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 1:N:0:0
2 | TTCAATAAACTGGGGACAGACTGAGGCAATTACATCATAAACTCTTATTTTTAAAATGAATTAAAAAGAAACCTTTTTGACGGTTTAATTGAGTTGTCATATGTATCACCGACTGCCCATAGAGAGGACTCCAGTCACCAGGCATGATCTC
3 | +
4 | CCCDCFFFFFFFGG2FEGGGGGHHHFGFHHHHHHHHHHHHHHHHHHHHHHHHHHHHHIHHHHHHHHHGGHHHHHHHHHGGHGGGGGHHHHHGHHHHHHHHHHEGDDDFHHFGGGGHHHHHHHHHHGEHHGHHHHHHHHHHHGHHHGHD3F3
5 | 


--------------------------------------------------------------------------------
/test/data/demultiplexed/undetermined.r2.fastq:
--------------------------------------------------------------------------------
1 | @M01326:74:000000000-A6B33:1:2112:16279:24208 2:N:0:0
2 | ACATATGACAACTCAATTAAACCGTCAAAAAGGTTTCTTTTTAATTCATTTTAAAAATAAGAGTTTATGATGTAATTGCCTCAGTCTGTCCCCAGTTTATTGAAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTGACATAAAAAAAAA
3 | +
4 | BCBBCFFFFFFCGGGGGGGGGGHGGGGGGHHGGHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHGHHHHHHHHHHHHHEHHHHHHFHAGFHHHGHGFFHHHHHHHGGGFHHHGEEGEEFGHHGHHHHHH?FFHHHHGHHHH2BF//
5 | 


--------------------------------------------------------------------------------
/test/data/filtered/EMX1_backgroundFiltered.txt:
--------------------------------------------------------------------------------
1 | 1:236259170-236261754	1473	1486	chr1:236259170-236261754_1486_7	EMX1.sam	1486	ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT	7	0	7	0.0	33	0	33	0.0	2	5	3.16227766017	7.116178749862878															EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
2 | 


--------------------------------------------------------------------------------
/test/data/identified/EMX1_identifiedOfftargets.txt:
--------------------------------------------------------------------------------
1 | Chromosome	Min.Position	Max.Position	Name	Filename	Position	WindowSequence	+.mi	-.mi	bi.sum.mi	bi.geometric_mean.mi	+.total	-.total	total.sum	total.geometric_mean	primer1.mi	primer2.mi	primer.geometric_mean	position.stdev	Site_SubstitutionsOnly.Sequence	Site_SubstitutionsOnly.NumSubstitutions	Site_SubstitutionsOnly.Strand	Site_SubstitutionsOnly.Start	Site_SubstitutionsOnly.End	Site_GapsAllowed.Sequence	Site_GapsAllowed.Length	Site_GapsAllowed.Score	Site_GapsAllowed.Substitutions	Site_GapsAllowed.Insertions	Site_GapsAllowed.Deletions	Site_GapsAllowed.Strand	Site_GapsAllowed.Start	Site_GapsAllowed.End	Cell	Targetsite	TargetSequence	RealignedTargetSequence
2 | 15:44108746-44110769	1007	1025	chr15:44108746-44110769_1017_189	EMX1.sam	1017	GTAGACAAGAGTCTAAGCAGAAGAAGAAGAGAGCCACTACCCAACCATCT	116	73	189	92.0217365626	258	148	406	195.407267009	96	80	87.6356092008	4.931631338038255	GAGTCTAAGCAGAAGAAGAAGAG	3	+	1000	1023										EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
3 | 1:236259170-236261754	1465	1486	chr1:236259170-236261754_1486_7	EMX1.sam	1486	ATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGT	7	0	7	0.0	33	0	33	0.0	2	5	3.16227766017	7.116178749862878															EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
4 | 1:236259170-236261754	1531	1539	chr1:236259170-236261754_1531_5	EMX1.sam	1531	GGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTACT	0	5	5	0.0	0	5	5	0.0	1	2	1.41421356237	2.947456530637899															EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
5 | 2:73159981-73162004	1008	1024	chr2:73159981-73162004_1017_489	EMX1.sam	1017	AAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGG	243	246	489	244.49539873	619	541	1160	578.68730762	236	231	233.486616319	4.710360920354193	GAGTCCGAGCAGAAGAAGAAGGG	0	+	1000	1023										EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
6 | 3:197899267-197901348	1075	1081	chr3:197899267-197901348_1080_10	EMX1.sam	1080	TTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGG	0	10	10	0.0	0	32	32	0.0	9	1	3.0	2.5495097567963922															EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
7 | 6:9117792-9119815	1007	1007	chr6:9117792-9119815_1007_4	EMX1.sam	1007	ATGTCCTCAGAGTTCTGTCCATTCTTCTTCTGCTCAGACGTTTTGTCTGA	1	3	4	1.73205080757	1	9	10	3.0	2	2	2.0	0.0	ACGTCTGAGCAGAAGAAGAATGG	3	-	1000	1023										EMX_site1	EMX1	GAGTCCGAGCAGAAGAAGAANGG	none
8 | 


--------------------------------------------------------------------------------
/test/data/identified/control_identifiedOfftargets.txt:
--------------------------------------------------------------------------------
1 | Chromosome	Min.Position	Max.Position	Name	Filename	Position	WindowSequence	+.mi	-.mi	bi.sum.mi	bi.geometric_mean.mi	+.total	-.total	total.sum	total.geometric_mean	primer1.mi	primer2.mi	primer.geometric_mean	position.stdev	Site_SubstitutionsOnly.Sequence	Site_SubstitutionsOnly.NumSubstitutions	Site_SubstitutionsOnly.Strand	Site_SubstitutionsOnly.Start	Site_SubstitutionsOnly.End	Site_GapsAllowed.Sequence	Site_GapsAllowed.Length	Site_GapsAllowed.Score	Site_GapsAllowed.Substitutions	Site_GapsAllowed.Insertions	Site_GapsAllowed.Deletions	Site_GapsAllowed.Strand	Site_GapsAllowed.Start	Site_GapsAllowed.End	Cell	Targetsite	TargetSequence	RealignedTargetSequence
2 | 1:236259170-236261754	1473	1490	chr1:236259170-236261754_1481_7	control.sam	1481	TCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCA	1	6	7	2.44948974278	1	9	10	3.0	2	5	3.16227766017	5.535341001239219															Control	control	None	none
3 | 1:236259170-236261754	1521	1531	chr1:236259170-236261754_1523_14	control.sam	1523	GGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGT	0	14	14	0.0	0	18	18	0.0	7	7	7.0	3.7094473981982814															Control	control	None	none
4 | 3:197899267-197901348	1035	1040	chr3:197899267-197901348_1040_3	control.sam	1040	TAGGGTTGGGTTAGGGTTAGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAG	3	0	3	0.0	5	0	5	0.0	1	1	1.0	2.0548046676563256															Control	control	None	none
5 | 


--------------------------------------------------------------------------------
/test/data/undemultiplexed/undemux.i1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.i1.fastq.gz


--------------------------------------------------------------------------------
/test/data/undemultiplexed/undemux.i2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.i2.fastq.gz


--------------------------------------------------------------------------------
/test/data/undemultiplexed/undemux.r1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.r1.fastq.gz


--------------------------------------------------------------------------------
/test/data/undemultiplexed/undemux.r2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aryeelab/guideseq/997b89242dec2ac4a1a63273025aad371e448b8b/test/data/undemultiplexed/undemux.r2.fastq.gz


--------------------------------------------------------------------------------
/test/data/visualization/EMX1_identifiedOfftargets_offtargets.svg:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8" ?>
2 | <svg baseProfile="full" height="100%" version="1.1" width="100%" xmlns="http://www.w3.org/2000/svg" xmlns:ev="http://www.w3.org/2001/xml-events" xmlns:xlink="http://www.w3.org/1999/xlink"><defs /><text style="font-size:20px; font-family:Courier" x="20" y="30">EMX1_identifiedOfftargets</text><text style="font-size:10px; font-family:Courier" x="22" y="48">1</text><text style="font-size:10px; font-family:Courier" x="352" y="48">23</text><text style="font-size:10px; font-family:Courier" x="157" y="48">10</text><text style="font-size:10px; font-family:Courier" x="307" y="48">20</text><rect fill="#F5F500" height="15" width="15" x="20" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="23" y="62">G</text><rect fill="#FF5454" height="15" width="15" x="35" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="38" y="62">A</text><rect fill="#F5F500" height="15" width="15" x="50" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="53" y="62">G</text><rect fill="#00D118" height="15" width="15" x="65" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="68" y="62">T</text><rect fill="#26A8FF" height="15" width="15" x="80" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="83" y="62">C</text><rect fill="#26A8FF" height="15" width="15" x="95" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="98" y="62">C</text><rect fill="#F5F500" height="15" width="15" x="110" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="113" y="62">G</text><rect fill="#FF5454" height="15" width="15" x="125" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="128" y="62">A</text><rect fill="#F5F500" height="15" width="15" x="140" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="143" y="62">G</text><rect fill="#26A8FF" height="15" width="15" x="155" y="50" /><text fill="black" 
style="font-size:15px; font-family:Courier" x="158" y="62">C</text><rect fill="#FF5454" height="15" width="15" x="170" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="173" y="62">A</text><rect fill="#F5F500" height="15" width="15" x="185" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="188" y="62">G</text><rect fill="#FF5454" height="15" width="15" x="200" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="203" y="62">A</text><rect fill="#FF5454" height="15" width="15" x="215" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="218" y="62">A</text><rect fill="#F5F500" height="15" width="15" x="230" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="233" y="62">G</text><rect fill="#FF5454" height="15" width="15" x="245" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="248" y="62">A</text><rect fill="#FF5454" height="15" width="15" x="260" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="263" y="62">A</text><rect fill="#F5F500" height="15" width="15" x="275" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="278" y="62">G</text><rect fill="#FF5454" height="15" width="15" x="290" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="293" y="62">A</text><rect fill="#FF5454" height="15" width="15" x="305" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="308" y="62">A</text><rect fill="#B3B3B3" height="15" width="15" x="320" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="323" y="62">N</text><rect fill="#F5F500" height="15" width="15" x="335" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="338" y="62">G</text><rect fill="#F5F500" height="15" width="15" x="350" y="50" /><text fill="black" style="font-size:15px; font-family:Courier" x="353" y="62">G</text><text 
style="font-size:15px; font-family:Courier" x="381" y="62">Reads</text><text fill="black" style="font-size:10px; font-family:Courier" x="24.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="39.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="54.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="69.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="84.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="99.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="114.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="129.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="144.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="159.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="174.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="189.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="204.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="219.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="234.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="249.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="264.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="279.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="294.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="309.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="324.5" y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="339.5" 
y="86">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="354.5" y="86">•</text><text fill="black" style="font-size:15px; font-family:Courier" x="380" y="88">489</text><text fill="black" style="font-size:10px; font-family:Courier" x="24.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="39.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="54.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="69.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="84.5" y="101">•</text><rect fill="#00D118" height="15" width="15" x="95" y="90" /><text fill="black" style="font-size:15px; font-family:Courier" x="98" y="102">T</text><rect fill="#FF5454" height="15" width="15" x="110" y="90" /><text fill="black" style="font-size:15px; font-family:Courier" x="113" y="102">A</text><text fill="black" style="font-size:10px; font-family:Courier" x="129.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="144.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="159.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="174.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="189.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="204.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="219.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="234.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="249.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="264.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="279.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="294.5" y="101">•</text><text 
fill="black" style="font-size:10px; font-family:Courier" x="309.5" y="101">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="324.5" y="101">•</text><rect fill="#FF5454" height="15" width="15" x="335" y="90" /><text fill="black" style="font-size:15px; font-family:Courier" x="338" y="102">A</text><text fill="black" style="font-size:10px; font-family:Courier" x="354.5" y="101">•</text><text fill="black" style="font-size:15px; font-family:Courier" x="380" y="103">189</text><rect fill="#FF5454" height="15" width="15" x="20" y="105" /><text fill="black" style="font-size:15px; font-family:Courier" x="23" y="117">A</text><rect fill="#26A8FF" height="15" width="15" x="35" y="105" /><text fill="black" style="font-size:15px; font-family:Courier" x="38" y="117">C</text><text fill="black" style="font-size:10px; font-family:Courier" x="54.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="69.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="84.5" y="116">•</text><rect fill="#00D118" height="15" width="15" x="95" y="105" /><text fill="black" style="font-size:15px; font-family:Courier" x="98" y="117">T</text><text fill="black" style="font-size:10px; font-family:Courier" x="114.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="129.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="144.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="159.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="174.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="189.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="204.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="219.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="234.5" 
y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="249.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="264.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="279.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="294.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="309.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="324.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="339.5" y="116">•</text><text fill="black" style="font-size:10px; font-family:Courier" x="354.5" y="116">•</text><text fill="black" style="font-size:15px; font-family:Courier" x="380" y="118">4</text></svg>


--------------------------------------------------------------------------------
/test/demultiplex_manifest.yaml:
--------------------------------------------------------------------------------
 1 | output_folder: test/output
 2 | 
 3 | demultiplex_min_reads: 1000
 4 | 
 5 | undemultiplexed:
 6 |     forward: test/data/undemultiplexed/undemux.r1.fastq.gz
 7 |     reverse: test/data/undemultiplexed/undemux.r2.fastq.gz
 8 |     index1: test/data/undemultiplexed/undemux.i1.fastq.gz
 9 |     index2: test/data/undemultiplexed/undemux.i2.fastq.gz
10 | 
11 | samples:
12 |     control:
13 |         target:
14 |         barcode1: CTCTCTAC
15 |         barcode2: CTCTCTAT
16 |         description: Control
17 | 
18 |     EMX1:
19 |         target: GAGTCCGAGCAGAAGAAGAANGG
20 |         barcode1: TAGGCATG
21 |         barcode2: TAGATCGC
22 |         description: EMX_site1
23 | 


--------------------------------------------------------------------------------
/test/large_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This script downloads a full GUIDE-Seq dataset and runs the analysis pipeline.
 3 | # It should be run from the test directory.
 4 | 
 5 | cd large_test || exit 1  # abort rather than clone/download into the wrong directory
 6 | 
 7 | # Create an output directory with a commit id hash suffix and point the `output` symlink at it
 8 | OUTDIR="output.$(git log --pretty=format:'%h' -n 1)"
 9 | mkdir -p "$OUTDIR"
10 | ln -sfn "$OUTDIR" output  # -n: replace an existing `output` symlink instead of linking inside its old target
11 | 
12 | # Install bwa (build tag 0.7.9a from source and prepend it to PATH)
13 | git clone https://github.com/lh3/bwa.git
14 | cd bwa || exit 1  # without the clone, the checkout/make below would run in the enclosing repository
15 | git checkout tags/0.7.9a
16 | make
17 | cd ..
18 | PATH="$(pwd)/bwa:$PATH"
19 | 
20 | # Install bedtools (build tag v2.25.0 from source and prepend its bin/ to PATH)
21 | git clone https://github.com/arq5x/bedtools2.git
22 | cd bedtools2 || exit 1
23 | git checkout tags/v2.25.0
24 | make
25 | cd ..
26 | PATH="$(pwd)/bedtools2/bin:$PATH"
27 | 
28 | # Download test data FASTQs and manifest
29 | wget http://aryee.mgh.harvard.edu/guideseq/data/guideseq_test_fastq.zip
30 | unzip guideseq_test_fastq.zip
31 | 
32 | # Download the reference genome
33 | wget http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta
34 | 
35 | # Run analysis pipeline
36 | python ../../guideseq/guideseq.py all -m test_manifest.yaml
37 | 
38 | # Check the output tables against the reference MD5 checksums
39 | cd output/filtered || exit 1  # md5sum -c resolves the listed file names relative to this directory
40 | md5sum -c ../../reference_output/md5.txt
41 | 


--------------------------------------------------------------------------------
/test/large_test/reference_output/EMX1_backgroundFiltered.txt:
--------------------------------------------------------------------------------
1 | chr1	236260648	236260654	chr1_236260603_164	EMX1.sam	517	1	236260603	GCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTC	160	4	164	25.2982212813	314	4	318	35.4400902933	133	26	58.804761712	18.5089768011										EMX1_U2OS	EMX1	GAGTCCGAGCAGAAGAAGAANGG
2 | chr1	236260691	236260709	chr1_236260677_32	EMX1.sam	518	1	236260677	GACTCAGAATGGAGCAGGTGACCAGGGGTGACTCAGAATGGAGCAGGTGA	7	25	32	13.2287565553	31	28	59	29.4618397253	14	17	15.4272486205	9.32693602086										EMX1_U2OS	EMX1	GAGTCCGAGCAGAAGAAGAANGG
3 | chr2	9877830	9877831	chr2_9877830_36	EMX1.sam	3108	2	9877830	CAGTACCTCCCACTCCCCCAGTGCCCCCCACTCCTCCTAGTACCCCCATT	36	0	36	0.0	39	0	39	0.0	6	27	12.7279220614	1.6996731712										EMX1_U2OS	EMX1	GAGTCCGAGCAGAAGAAGAANGG
4 | 


--------------------------------------------------------------------------------
/test/large_test/reference_output/VEGFA_site1_backgroundFiltered.txt:
--------------------------------------------------------------------------------
1 | chr1	236260648	236260649	chr1_236260619_18	VEGFA_site1.sam	841	1	236260619	GACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGCGA	18	0	18	0.0	19	0	19	0.0	5	12	7.74596669241	13.0478700783										VEGFA_site1_U2OS	VEGFA_site1	GGGTGGGGGGAGTTTGCTCCNGG
2 | chr1	236260691	236260709	chr1_236260693_19	VEGFA_site1.sam	843	1	236260693	GGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGT	0	19	19	0.0	0	25	25	0.0	11	8	9.38083151965	8.38152730712										VEGFA_site1_U2OS	VEGFA_site1	GGGTGGGGGGAGTTTGCTCCNGG
3 | chr2	9877830	9877831	chr2_9877830_119	VEGFA_site1.sam	4347	2	9877830	CAGTACCTCCCACTCCCCCAGTGCCCCCCACTCCTCCTAGTACCCCCATT	119	0	119	0.0	617	0	617	0.0	33	81	51.7010638188	1.0										VEGFA_site1_U2OS	VEGFA_site1	GGGTGGGGGGAGTTTGCTCCNGG
4 | 


--------------------------------------------------------------------------------
/test/large_test/reference_output/VEGFA_site2_backgroundFiltered.txt:
--------------------------------------------------------------------------------
1 | chr1	121485100	121485106	chr1_121485107_44	VEGFA_site2.sam	1168	1	121485107	ACAGATGAATTCTCAGTAACTTCCTTGTGTTGTGTGTATTCAACTCACAG	30	14	44	20.4939015319	76	36	112	52.3067873225	20	23	21.4476105895	9.2269064409										VEGFA_site2_U2OS	VEGFA_site2	GACCCCCTCCACCCCGCCTCNGG
2 | chr2	9877830	9877831	chr2_9877830_133	VEGFA_site2.sam	11661	2	9877830	CAGTACCTCCCACTCCCCCAGTGCCCCCCACTCCTCCTAGTACCCCCATT	133	0	133	0.0	297	0	297	0.0	33	96	56.2849891179	0.816496580928										VEGFA_site2_U2OS	VEGFA_site2	GACCCCCTCCACCCCGCCTCNGG
3 | 


--------------------------------------------------------------------------------
/test/large_test/reference_output/VEGFA_site3_backgroundFiltered.txt:
--------------------------------------------------------------------------------
1 | chr1	236260648	236260654	chr1_236260643_273	VEGFA_site3.sam	455	1	236260643	GGGGTGACTCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCA	35	238	273	91.2688336728	35	239	274	91.4603739332	6	260	39.4968353163	11.292504645										VEGFA_site3_U2OS	VEGFA_site3	GGTGAGTGAGTGTGTGCGTGNGG
2 | chr1	236260691	236260709	chr1_236260699_88	VEGFA_site3.sam	456	1	236260699	CAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTA	16	72	88	33.941125497	102	171	273	132.068164218	27	58	39.5727178748	9.71494545992										VEGFA_site3_U2OS	VEGFA_site3	GGTGAGTGAGTGTGTGCGTGNGG
3 | chr2	242838693	242838704	chr2_242838710_5	VEGFA_site3.sam	3057	2	242838710	GTGTGGGTGTTGGGGTGTGTGTGTTGGGGTGTGGGTGTGGGGGTGTGGGT	4	1	5	2.0	4	2	6	2.82842712475	3	2	2.44948974278	6.11228271597	GGGGTGTGGGTGTGGGGGTGTGG	5	23	chr2	242838710	242838733	chr2_242838710_5	1	+	VEGFA_site3_U2OS	VEGFA_site3	GGTGAGTGAGTGTGTGCGTGNGG
4 | 


--------------------------------------------------------------------------------
/test/large_test/reference_output/md5.txt:
--------------------------------------------------------------------------------
1 | e5b7a9b887ecead4a44de490cf9dff2a  EMX1_backgroundFiltered.txt
2 | dc7b726532518eaa1933086e41277c61  VEGFA_site1_backgroundFiltered.txt
3 | fe98c1eb14adbe175a70ecc0b9b452dc  VEGFA_site2_backgroundFiltered.txt
4 | 1533f99dcd11f332e32a655ba25a6efc  VEGFA_site3_backgroundFiltered.txt
5 | 


--------------------------------------------------------------------------------
/test/scripts/bwa_index_alignment.sh:
--------------------------------------------------------------------------------
 1 | # Run bwa to generate index
 2 | 
 3 | bwa index ~/GRCh37/Homo_sapiens_assembly19.fasta
 4 | 
 5 | # Run paired end mapping to generate SAM files
 6 | 
 7 | bwa mem ~/GRCh37/Homo_sapiens_assembly19.fasta emx1.r1.fastq.gz emx1.r2.fastq.gz > ../output/emx1.sam
 8 | bwa mem ~/GRCh37/Homo_sapiens_assembly19.fasta control.r1.fastq.gz control.r2.fastq.gz > ../output/control.sam
 9 | 
10 | 


--------------------------------------------------------------------------------
/test/scripts/compile_dependencies.sh:
--------------------------------------------------------------------------------
 1 | cd test  # bwa and bedtools2 are cloned and built under test/
 2 | git clone https://github.com/lh3/bwa.git
 3 | cd bwa
 4 | git checkout tags/0.7.9a  # build the 0.7.9a tag
 5 | make
 6 | cd ..
 7 | PATH=`pwd`/bwa:$PATH  # prepend the built bwa directory; NOTE(review): only visible to the caller if this file is sourced - confirm how it is invoked
 8 | git clone https://github.com/arq5x/bedtools2.git
 9 | cd bedtools2
10 | git checkout tags/v2.25.0  # build the v2.25.0 tag
11 | make
12 | cd ..
13 | PATH=`pwd`/bedtools2/bin:$PATH  # prepend the bedtools2 bin directory
14 | cd ..  # back to the directory the script started in
15 | echo $PATH  # print the resulting PATH


--------------------------------------------------------------------------------
/test/scripts/prepare_test_data.sh:
--------------------------------------------------------------------------------
 1 | ## This script generates two test FASTQ datasets:
 2 | ##  1. An undemultiplexed dataset representing several barcoded samples with molecular indexing.
 3 | ##     This dataset represents raw data from a MiSeq run following the GUIDE-Seq protocol
 4 | ##     described in Tsai et al., 2014 (PMID XXXX; TODO: fill in the real identifier)
 5 | ##  2. A two-sample dataset containing reads from a control and an EMX1 guide experiment
 6 | ##     that overlap with a small set of test regions:
 7 | ##     the on-target location, 3 off-target locations and two DSB hotspots.
 8 | ##     The reads representing the same template molecule (i.e. those with the same
 9 | ##     molecular barcode) have been consolidated.
10 | ##
11 | ## Raw input dataset:
12 | ## /data/joung/sequencing_fastq/131007_M01326_0075_000000000-A6B33/fastq_with_indexes
13 | ## -rw-rw----. 1 ma695 aryee 2.7G Oct 31 12:44 guideseq_test_fastq.zip
14 | ## -rw-rw----. 1 st680 joung 120M Oct 14 14:52 Undetermined_S0_L001_I1_001.fastq.gz
15 | ## -rw-rw----. 1 st680 joung 221M Oct 14 14:53 Undetermined_S0_L001_I2_001.fastq.gz
16 | ## -rw-rw----. 1 st680 joung 1.1G Oct 14 14:58 Undetermined_S0_L001_R1_001.fastq.gz
17 | ## -rw-rw----. 1 st680 joung 1.3G Oct 14 14:59 Undetermined_S0_L001_R2_001.fastq.gz
18 | ##
19 | ## EMX1 has barcode P706 (TAGGCATG), A01 (TAGATCGC).
20 | ## Ignoring the first base and concatenating gives AGGCATGAGATCGC.
21 | ## EMX1 target sequence: GAGTCCGAGCAGAAGAAGAANGG
22 | ##
23 | ## Oligo control has barcode P707 (CTCTCTAC), A02 (CTCTCTAT).
24 | ## Ignoring the first base and concatenating gives TCTCTACTCTCTAT.
25 | 
26 | ON_TARGET="2:73160981-73161004" # on-target location (chrom:start-end)
27 | OFF_TARGET="15:44109746-44109769 6:9118792-9118815 2:218378101-218378124"
28 | DSB_HOTSPOTS="1:236260170-236260754 3:197900267-197900348"
29 | 
30 | UMI_PKG_DIR="../../guideseq/umi"
31 | 
32 | # Align the full undemultiplexed R1/R2 reads with bwa mem (timed)
33 | INPUT_DIR="/data/joung/sequencing_fastq/131007_M01326_0075_000000000-A6B33/fastq_with_indexes"
34 | BWA_INDEX="/data/aryee/pub/genomes/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/genome.fa"
35 | module load aryee/bwa-0.7.9a # site-specific environment module
36 | time bwa mem $BWA_INDEX $INPUT_DIR/Undetermined_S0_L001_R1_001.fastq.gz $INPUT_DIR/Undetermined_S0_L001_R2_001.fastq.gz > undemux.sam
37 | 
38 | # Generate BAM:
39 | module load samtools/0.1.19
40 | samtools view -bS undemux.sam > undemux.bam
41 | 
42 | # Sort BAM (second argument is an output prefix; the result is indexed below as undemux.sorted.bam)
43 | samtools sort undemux.bam undemux.sorted
44 | 
45 | # Index the sorted BAM
46 | samtools index undemux.sorted.bam
47 | 
48 | # Get the names of reads that overlap with the selected test regions.
49 | samtools view undemux.sorted.bam $ON_TARGET $OFF_TARGET $DSB_HOTSPOTS | cut -f1 | sort | uniq > read_names.txt  # region variables are left unquoted so each space-separated region becomes its own argument
50 | 
51 | # Subset FASTQs to extract _all_ read pairs where at least one of the reads falls in a specified test region
52 | zcat $INPUT_DIR/Undetermined_S0_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.r1.fastq.gz
53 | zcat $INPUT_DIR/Undetermined_S0_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.r2.fastq.gz
54 | zcat $INPUT_DIR/Undetermined_S0_L001_I1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.i1.fastq.gz
55 | zcat $INPUT_DIR/Undetermined_S0_L001_I2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > undemux_all.i2.fastq.gz
56 | 
57 | # Demultiplex full target region FASTQs (presumably writes the per-sample emx1.*/control.* FASTQs read below; confirm against demultiplex.py)
58 | python $UMI_PKG_DIR/demultiplex.py --min_reads 1000 --read1 undemux_all.r1.fastq.gz --read2 undemux_all.r2.fastq.gz --index1 undemux_all.i1.fastq.gz --index2 undemux_all.i2.fastq.gz --sample_barcodes samplekey.txt
59 | 
60 | # Choose a subset of EMX1 (6000) and control (2000) read names; each FASTQ doubles as shuf's random source
61 | cat emx1.r1.fastq  | grep ^@M01326 | cut -f1 -d ' ' | sort | uniq | shuf --random-source emx1.r1.fastq -n 6000 > read_names_sample.txt
62 | cat control.r1.fastq  | grep ^@M01326 | cut -f1 -d ' ' | sort | uniq | shuf --random-source control.r1.fastq -n 2000 >> read_names_sample.txt
63 | 
64 | # Subset FASTQs to extract _a sample of_ read pairs where at least one of the reads falls in a specified test region
65 | zcat $INPUT_DIR/Undetermined_S0_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.r1.fastq.gz
66 | zcat $INPUT_DIR/Undetermined_S0_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.r2.fastq.gz
67 | zcat $INPUT_DIR/Undetermined_S0_L001_I1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.i1.fastq.gz
68 | zcat $INPUT_DIR/Undetermined_S0_L001_I2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names_sample.txt | gzip -c > undemux.i2.fastq.gz
69 | 
70 | # Demultiplex sub-sampled target region FASTQs
71 | python $UMI_PKG_DIR/demultiplex.py --min_reads 1000 --read1 undemux.r1.fastq.gz --read2 undemux.r2.fastq.gz --index1 undemux.i1.fastq.gz --index2 undemux.i2.fastq.gz --sample_barcodes samplekey.txt
72 | 
73 | # Consolidate reads with the same molecular index: tag reads with umitag.py, then run consolidate.py on R1 and R2
74 | for SAMPLE in emx1 control
75 | do
76 |     echo "Consolidating reads for $SAMPLE"
77 |     python $UMI_PKG_DIR/umitag.py --read1_in $SAMPLE.r1.fastq --read2_in $SAMPLE.r2.fastq --read1_out $SAMPLE.r1.umitagged.fastq --read2_out $SAMPLE.r2.umitagged.fastq --index1 $SAMPLE.i1.fastq --index2 $SAMPLE.i2.fastq
78 |     python $UMI_PKG_DIR/consolidate.py $SAMPLE.r1.umitagged.fastq $SAMPLE.r1.consolidated.fastq 15 0.9
79 |     python $UMI_PKG_DIR/consolidate.py $SAMPLE.r2.umitagged.fastq $SAMPLE.r2.consolidated.fastq 15 0.9
80 | done
81 | 
82 | # Copy test datasets to data dir (consolidated reads are gzipped under the plain <sample>.r[12].fastq.gz names)
83 | cp undemux.*.fastq.gz data
84 | for SAMPLE in emx1 control
85 | do
86 |     gzip -c $SAMPLE.r1.consolidated.fastq > data/$SAMPLE.r1.fastq.gz
87 |     gzip -c $SAMPLE.r2.consolidated.fastq > data/$SAMPLE.r2.fastq.gz
88 | done
89 | 


--------------------------------------------------------------------------------
/test/scripts/prepare_test_genome.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This script generates a small test genome FASTA (test_genome.fa) for use in unit tests
 3 | # Requirements: wget, samtools, bedtools
 4 | 
 5 | mkdir -p genome_prep
 6 | cd genome_prep || exit 1  # abort rather than download and concatenate in the wrong directory
 7 | 
 8 | # Download chromosomes 1 2 3 6 15
 9 | wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.1.fa.gz
10 | wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.2.fa.gz
11 | wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.3.fa.gz
12 | wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.6.fa.gz
13 | wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.15.fa.gz
14 | 
15 | cat *.fa.gz > Homo_sapiens.GRCh37.75.dna.subset.fa.gz
16 | gunzip Homo_sapiens.GRCh37.75.dna.subset.fa.gz
17 | samtools faidx Homo_sapiens.GRCh37.75.dna.subset.fa
18 | 
19 | # Pad test regions with 1kb on either side
20 | bedtools slop -i ../test_regions.bed -g Homo_sapiens.GRCh37.75.dna.subset.fa.fai -b 1000 > test_regions_padded.bed
21 | 
22 | # Extract test genome regions
23 | bedtools getfasta -fi Homo_sapiens.GRCh37.75.dna.subset.fa -bed test_regions_padded.bed -fo test_genome.fa
24 | 
25 | # Move genome fasta two levels up from genome_prep
26 | mv test_genome.fa ../..
27 | cd ..
28 | 
29 | 


--------------------------------------------------------------------------------
/test/scripts/prepare_test_genome_index.sh:
--------------------------------------------------------------------------------
 1 | # This script generates a subsetted genome index for use in unit tests
 2 | # The index is hosted at: http://aryee.mgh.harvard.edu/guideseq/data/Homo_sapiens.GRCh38.dna.subset.masked.fa.index.zip
 3 | # Requirements: wget, samtools, bedtools, git and make (to build bwa)
 4 | 
 5 | mkdir -p genome_prep
 6 | cd genome_prep || exit 1  # abort rather than download and concatenate in the wrong directory
 7 | 
 8 | # Download chromosomes 1 2 3 6 15
 9 | wget ftp://ftp.ensembl.org/pub/release-82/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz
10 | wget ftp://ftp.ensembl.org/pub/release-82/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.2.fa.gz
11 | wget ftp://ftp.ensembl.org/pub/release-82/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.3.fa.gz
12 | wget ftp://ftp.ensembl.org/pub/release-82/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.6.fa.gz
13 | wget ftp://ftp.ensembl.org/pub/release-82/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.15.fa.gz
14 | 
15 | cat *.fa.gz > Homo_sapiens.GRCh38.dna.subset.fa.gz
16 | gunzip Homo_sapiens.GRCh38.dna.subset.fa.gz
17 | samtools faidx Homo_sapiens.GRCh38.dna.subset.fa
18 | 
19 | # Pad test regions with 1kb on either side
20 | bedtools slop -i ../test_regions.bed -g Homo_sapiens.GRCh38.dna.subset.fa.fai -b 1000 > test_regions_padded.bed
21 | 
22 | # Generate complement bed file (i.e. non-test regions)
23 | bedtools complement -i test_regions_padded.bed -g Homo_sapiens.GRCh38.dna.subset.fa.fai > test_regions_complement.bed
24 | 
25 | # Mask non-test regions with Ns
26 | bedtools maskfasta -fi Homo_sapiens.GRCh38.dna.subset.fa -fo Homo_sapiens.GRCh38.dna.subset.masked.fa -bed test_regions_complement.bed
27 | 
28 | # Move the masked genome fasta up to the starting directory, where it is indexed below
29 | mv Homo_sapiens.GRCh38.dna.subset.masked.fa ..
30 | cd ..
31 | 
32 | # Get bwa (build tag 0.7.12 from source)
33 | git clone https://github.com/lh3/bwa.git
34 | cd bwa || exit 1  # without the clone, the checkout/make below would run in the enclosing repository
35 | git checkout tags/0.7.12
36 | make
37 | cd ..
38 | 
39 | # Index the genome
40 | bwa/bwa index Homo_sapiens.GRCh38.dna.subset.masked.fa


--------------------------------------------------------------------------------
/test/scripts/samplekey.txt:
--------------------------------------------------------------------------------
1 | emx1	AGGCATGAGATCGC
2 | control	TCTCTACTCTCTAT
3 | 


--------------------------------------------------------------------------------
/test/scripts/test_regions.bed:
--------------------------------------------------------------------------------
1 | 1	236260170	236260754
2 | 2	73160981	73161004 
3 | 2	218378101	218378124
4 | 3	197900267	197900348
5 | 6	9118792	9118815
6 | 15	44109746	44109769
7 | 


--------------------------------------------------------------------------------
/test/test_genome.fa:
--------------------------------------------------------------------------------
 1 | >1:236259170-236261754
 2 | ACTAACCCTGACTAGCCTGCTTATATTGCATCATCTATTTCTTCCCATGAAAACCATGATAAAGGCTCCTGCCCCCAGGTCCCATCACCCCAGCCTGCTGTCTTACCTGAGTACTTCTCCCTGTGGCCCTGCATGGAGTGCCATGCCTCCTGTCTCTAGGGACTGAGTATAACAAAAACCTTTTCCTTTATACCAATTATTTTCATATCTGCATGTCTTACCATACCCAATTAAAACAAATCTCAAATACAATTAAAACTCACTTCTGAGAAATCTTTGCCTAACCCAACCTCGCAAAGATTTTCTACATGTTTTCTTCTAGAACAGAGGTCACCAAGCTACGGCCTGTGGGCCAGTGGCTTGTTTTTCTAAATAAGGTTTTACTGGATTATGGCCATACTCTTCATTTTTGTAATGTTTATGGCTGCTTTTGTGCTATGACAGAGTTAAGTAGTCATGACAGAGACCACATGGCCCATAAGCCTAAACATCTGCTACCTCGCCCTTTAAGAAAAAGTAAGATGACCTCTGTTTTAAAAATATTATAGTTTTAGTTCTTACATTTAGGTCTGAGACCCTTTCTGAGTTAAACATTTTTATGATGTGAAGTAAAAGTTAAGGATATTTTTGTGTATGAACATCCACTTCATCCACAAAGCAGCAATGTTTCTTTTTCAAAAAAGACCATCTTTCCATGTTGAATTATGTTGGTCGTCATCAAAAATTAATTGACCATGTAGCTGTTTCTCAACTTTTTATTCTGTTCCATTGATCTTCGTGTCTATTCTTCTACCAATACCACCCTGTCACGTCTATTGTATCTTTCTAGTAAATCTTAAAATTGGGTAACATAAGCTCTCCATTTCCAGTTTCTGGAAAAATTTGTGTAGAATTTGTTGTAAATAAATTTTTGGTGCTGCAAAAGAAATAGCACTCAAACATAAGTTTAATTTTCTCAGCAAGGCAATTTTACTTCTCTAGAAGGGTGCGACTCGCAGATGGAGCAATGGCCAGAGCACACCTGAACAAGGGAGGGGAAGGGGTTCTGATTCCTGACACAGGTAGCCCCTACTGATGCGTCGTTCCCGTATTGGCTAGGGTTGGACTGCACAGTCTAAGCTAATTCCGATTGGCTACTTTAAAGAGAGCAGGGGTATGAGCCAGAGTGGCGGGGTGAGTAGTTTGGTGGGAAGGGTGGTTACAGAACAGGTGACTCAGGATGATTCAGGTCAGAGCAGGTGACCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGCGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGTGACTCAGAATGGAGCAGGTGACCAGGGGAATAGACGTTAACTACTGATTAGAACTGTTGGAAAAGGTTGTTTAGTGAAACTAGGGCTGAGGAGAACGAGGAAGTTCAACTTTAAAATGGAGAACAAAGAACTGAACATACTGACATACTGATTCTTTGAAGAGAAATTTAGAACTCACTGTATTCAACAAATTATTATTTTTGCTTTTAAGTGTCTGTGGAATTCACCGGTGATCCACCTGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCATTGTGCCCGGCCAAGAACAGCCTTCCATTTCTACCTTGCCTGTAGCCTCCCAAACCAAAATACGGGTCTCAACACCTGGTCCTTGATCTCCTTGACTCCCGTTGGACCCAAGAAGTATGGCTTCTCTCCCCTTTCCTCTGGTGGATGGCCCTGGACACAACCCACCTCACACTAACAGAGACAGAGGGGTGGTCCAAGGGAAG
TGAGTGAGAAGTTACCTGCAGAAGAGGGAACTTTTGAGAGAAGGAACCAGGTAGCTGTGAGGCCCTTGAAAACAGAAGGCTGTCAAGGCAGATTTGAAGGGAGAATATATGTGAATTAGCAAGTGCAGGAGATAGAGAGAGCACTAACTATTAGAAAGAAATGAAAATAGGGCCAGGCGCGGTGGCTCACACTTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGTGTATCACTTGAGGTCAGGAGTTCGAGTCCAGCCTGGCCAACATGGCGAAACCTCCCCACCATGTCTACTAAAAATACAAAAATTAGCTGGGTGTGATGGTGTGCACATGTAATCCCAACTACTTGGGAGGCTGAGGCAGGAGAATCTCTTGAACCTGGGAGGCGGAGGCTGCAGTGAGTCAAGATTGTGCCACTGCACTCCAGCCTGGGCGAAAGAATGAGACTCCATCAGAAAGGAAGAAAGGAAGGGAAGGGGAAAAGAAGGGAAGGGAAGGGAAGGGAAGGGAAGGGAAGGAAGGAAAGGAAGGGAAGGAAGGAAGGGAAAGGAAGAAAGGAAGGGAAGGGAAAAGAAGGGAAGGGAAG
 3 | >2:73159981-73162004
 4 | TCTCCTGACTGTTCCTTGTGTGACCTGTTCCCACATCTGGATGGGCTGCAGGAGCCAGTGCTGTGGGGACAGAAGGTCTGGAGCTGCCCGTGAAGGGCAGAATGCTGCCCTCAGACCCGCTTCCTCCCTGTCCTTGTCTGTCCAAGGAGAATGAGGTCTCACTGGTGGATTTCGGACTACCCTGAGGAGCTGGCACCTGAGGGACAAGGCCCCCCACCTGCCCAGCTCCAGCCTCTGATGAGGGGTGGGAGAGAGCTACATGAGGTTGCTAAGAAAGCCTCCCCTGAAGGAGACCACACAGTGTGTGAGGTTGGAGTCTCTAGCAGCGGGTTCTGTGCCCCCAGGGATAGTCTGGCTGTCCAGGCACTGCTCTTGATATAAACACCACCTCCTAGTTATGAAACCATGCCCATTCTGCCTCTCTGTATGGAAAAGAGCATGGGGCTGGCCCGTGGGGTGGTGTCCACTTTAGGCCCTGTGGGAGATCATGGGAACCCACGCAGTGGGTCATAGGCTCTCTCATTTACTACTCACATCCACTCTGTGAAGAAGCGATTATGATCTCTCCTCTAGAAACTCGTAGAGTCCCATGTCTGCCGGCTTCCAGAGCCTGCACTCCTCCACCTTGGCTTGGCTTTGCTGGGGCTAGAGGAGCTAGGATGCACAGCAGCTCTGTGACCCTTTGTTTGAGAGGAACAGGAAAACCACCCTTCTCTCTGGCCCACTGTGTCCTCTTCCTGCCCTGCCATCCCCTTCTGTGAATGTTAGACCCATGGGAGCAGCTGGTCAGAGGGGACCCCGGCCTGGGGCCCCTAACCCTATGTAGCCTCAGTCTTCCCATCAGGCTCTCAGCTCAGCCTGAGTGTTGAGGCCCCAGTGGCTGCTCTGGGGGCCTCCTGAGTTTCTCATCTGTGCCCCTCCCTCCCTGGCCCAGGTGAAGGTGTGGTTCCAGAACCGGAGGACAAAGTACAAACGGCAGAAGCTGGAGGAGGAAGGGCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAACCGGTGGCGCATTGCCACGAAGCAGGCCAATGGGGAGGACATCGATGTCACCTCCAATGACTAGGGTGGGCAACCACAAACCCACGAGGGCAGAGTGCTGCTTGCTGCTGGCCAGGCCCCTGCGTGGGCCCAAGCTGGACTCTGGCCACTCCCTGGCCAGGCTTTGGGGAGGCCTGGAGTCATGGCCCCACAGGGCTTGAAGCCCGGGGCCGCCATTGACAGAGGGACAAGCAATGGGCTGGCTGAGGCCTGGGACCACTTGGCCTTCTCCTCGGAGAGCCTGCCTGCCTGGGCGGGCCCGCCCGCCACCGCAGCCTCCCAGCTGCTCTCCGTGTCTCCAATCTCCCTTTTGTTTTGATGCATTTCTGTTTTAATTTATTTTCCAGGCACCACTGTAGTTTAGTGATCCCCAGTGTCCCCCTTCCCTATGGGAATAATAAAAGTCTCTCTCTTAATGACACGGGCATCCAGCTCCAGCCCCAGAGCCTGGGGTGGTAGATTCCGGCTCTGAGGGCCAGTGGGGGCTGGTAGAGCAAACGCGTTCAGGGCCTGGGAGCCTGGGGTGGGGTACTGGTGGAGGGGGTCAAGGGTAATTCATTAACTCCTCTCTTTTGTTGGGGGACCCTGGTCTCTACCTCCAGCTCCACAGCAGGAGAAACAGGCTAGACATAGGGAAGGGCCATCCTGTATCTTGAGGGAGGACAGGCCCAGGTCTTTCTTAACGTATTGAGAGGTGGGAATCAGGCCCAGGTAGTTCAATGGGAGAGGGAGAGTGCTTCCCTCTGCCTAGAGACTCTGGTGGCTTCTCCAGTTGAGGAGAAACCAGAGGAAAGGGGAGGATTGGGGTCTGGGGGAGGGAACACCATTCACAAAGGCTGACGGTTCCAGTCCGAAGTCGTGGGCCCACCAGGATGCTCACCTGTCCTTGGAGAACCGCTGGGCAGGTTGAGACTGCAG
AGACAGGGCTTAAGGCTGAGCCTGCAAC
 5 | >2:218377101-218379124
 6 | GCAATATCACAGATGATTCATCCCAAAAGGGCCCTCCAAAGGTCATCCTGGCAATGCCCTTGCCTTGCTTTGAAATGACTCTTGCCTGTTATAAATAGATGCAAATATTGAGTGGGTAGGGAGATTCTCCTATTCTTAAAGCTGTCAAGGTCAGGAGATGAATCTGCCCCCTCAGTCACCTGCTCACAGGTTCCCACGACTCTGACATTCAGGAAGCCAAGAAGAGATAAACTTCCTTTATCAGGATCCAATCTTCAACACTAGTCATTTGAAAACTATATCCAACTGTCACTTGTAGATCCTCTATCTCTCTGCATCTGCATCCGGAATACTGGCAGCAATGACGACAACAATAGCAGATCAGCATAAAACCAAAAACACATTCATAAACACACACACATACACACTACACACACACATACATACATACACACATATACACACAAACACACACATGCACACATGCATACATACATGTACACACACATGCATACACACATACACATGCATACGCCATACAAATGCATACACATACACATGCATACACACACATATACAAACATCCATGTATACACACACATACACACATGCATATACATATACACATACACACTACACACATACACACATACGTGCATACACACATACATGCATGCATACACATACACGCATACACACATCCATACATACACACAAACACACATGCATATACACACGCACACACACACAATTAGGCAGAGTCTCAGAGGAAAGAGGAAGATCTGTTGAACTGAAAAGTATCCTTAGAGAAAGAATTGGGAAACCAGAGGCAGAAAAGAATAGACTTTCATAGCCCCTGAGCACTCCAAAATATTGCAGAGTTTCCTCATCTACTTTCATTTTCTCCACCAGCCCAGCTTCACCCCTAGAGGTCAAGAAGGCTCTTTTTGATTCTGTCAAAGTCTCCCTGGCCAACCTCTTTGTATAAGAAAACAGAAAAACATTCAGACACGAACCTGACCCTTTATTCTCCTGCTTAGACTCCTTCACTGGCATCCCCTATCTAAGCCCAGCTAAGCTAAGTCACATAGATAAATAGCTACTTTACCAAGTGGAATGTATCAAGTGTGATAAGAAATGCAGGGAGGAACTCAGTGAAAGAAGAAATTAATTCCAACTGGAGAGATCCAGTTCCACTGAGAAGGTGGTATTATACAGAGCCCCAGAGCAGGCATAGAGGTAGAAAGACTCCAAAGTGCAATGTTCTAAGAAACAGTGAGTGTGCTGGGCTGTAATGAAGAATGCATATCGTGAGGTGGCAGGATATATTGTCATCAGAGAGTAGGCTGGGGCCATGTCAGGGAACACCTGAAATGTAAAATTTCCTGTATCCTTGAATCACTTTTTTGGCTTCACTTTTCTTTTTCTTTTTTTTTTTTTTTTTGAGACGGAGTCTTGCTTTGTCGCCCAGGCTGGAGTACAGTGGCACGATCTCAGCTCACTGCAACCTCCGCCTCTCAGGTTCAAGCAATTCTCCTGTCTCAGCCTCCTGAGTAGCTGGGGATTACAGGCGCACACCACCACGCCTGGCTAATTTTTGTATTTTTTTAGTAGAGACAGGGTTTCACCAGGTTGGCCAGGCTGGTTCCGAACTCATGACCTCGTGATCTGCCTGCCTCGGCCTCCCAAAGTGCTGAGATTACAGGCGTGAGCCACCATGCTCAGCCTTGGCTTCACTTTTCTAATGTCTCCACATTTGGGATCCAGAATGGTTATCATATACCACAAGCCTGGTCATGTTTGATGCATCAAGATTTATAGCTTTTAAAGTGCTTTGAATTATAGACTAATAGATTAATAGATATTAATTAATAGATAAAAATAACAAAAATAGTCCACCTTTGGAAGTAATATGCTTAATGTGCTCTATGTAACCTTGGAGTGAACTGTTTCTGATAACTTCTTGTGGCCCAAAATGTTATCCTGCCCTAAAGAAATACTAACTGGCTCAGTTGA
AGATGGGCATGGTAGTTTCCATGTGAGG
 7 | >3:197899267-197901348
 8 | ACGGATTGCTTTGTGTACTTTGGGAAACTTAACAATGTGGTCTACAAATCCACAAATAAGATACATTTTTACATTTATTGGAAGTTTAATTTCCTTAAGTAATGTCTTATAATTTCCCTCATCTAAGTCTTGTCGTTTCATTCCATTTATTCCTAAGTATAATATTGCTATTGGTATTATTTAAGGTAGAATTTTCATAATTTGGTTTAGAGATTATTCATTCCTAGCATATACATATAAAATGGAATGTTTGGCCAGGCACCCGGGCTCATACCTGTAACCCAAGCAGGTTGAGAGGCTGAGGAAGGGTTAGGGTTAGGGTTGGGGTTGGGGTTGGGGTTAGGCTTAGGGCTTAGGGCTAGGGCTAGGGCTAGGGCTAGAGTTAGGGTTGGGTTAGGGTTGGGTTAGGGTAGGGTTAGGGTTAGGGGTTAGGGGTTAGGGTTCGGGTTCGGGTTTGGGTTATGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGGTTCGGGTTAGGGTTCAGGTTAGGGTTCTGGTTGGGTTTAGTGTTAGGGTTTAGGGTTCGGGTTTGGGTTAGGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAAGGATTAAGGGTTAAGGGTCAGGGTCAGGGGTTTGGGTCAAGGGTTAGGGTTAGGGGTTAAGAGTTAGGGGTTAGGGATTATGGTTTGGGTGAGGGGTGAGGGGTGAGGGTGAGGGTTAGGGTTAGCGTTTTAGGGTTATGGTTAGGGTTAAGGGTTAGGGTTAGGGGTTAGGGGTTAAGGGTTAGGGGTAGGATAAGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTAAGGATTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGGTTCGGGTTAGGGTTCAGGTTAGGGTTCTGGTTGGGTTTAGTGTTAGGGTTTAGGGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTCGGGTTTAGGGTTCAGGTTTATGGTTCGGGTTAGGGTTCAGGTTAGGTTTCTGGTTGGGTTTAGTGTTAGGGTTTAGGGTTCGGGTTTGGGTTAGGGGTTAGGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAAGGGTTAAGGGTCAGGGTCAGGGTCAGGGGTTTGGGTCAAGGGTTAGGGTCAAGTGTTAGGGTTAGGGGTTAAGAGTTAAGGCTTAGGGATTATGGTTTGGGTGAGGGGTGAGGGGTGAGGGTGAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGATGTGGGTGAGGGTGAGGATGAAGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGTTTAGGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAAGGGTTAAGGGTCAGGGTCAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTGTGGGTGAGGGTGAGGATGAAGGTTAAGGGTTAGGTTTAGGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAAGGGTTAAGGGTCAGGGTCAGGGGTTTGGGTCAAGGGTTAGGGTCAAGGGTTAGGGTTAGGGGTTAAGAGTTAGGGGTTAGGGATTATGGTTTGGGTGAGGGGTGAGGGGTGAGGGTGAGGGTTAGGGTTAGCGTTTTAGGGTTATGGTTTGGGTTAAGGGTTAGGGTTAGGGGTTAGGGGTTAAGGGTTAGGGGTAGGATAAGGGTAAGGAT
TAGGGTTAGGGTCAGGGTAAGGGTAAGGGTAAGGATTAGGGTTAGGATTAGGGTTAGGGTTAGGGTTAGGGTTTTAGGGTTAGGGT
 9 | >6:9117792-9119815
10 | AATTCAAACCCTGATTCTGTCATTCAGTTATGTAAACACAAGCAAGTAAATTCACCTCTCTGAAATTTAGTGTTTATCAGTAAAGTGGAAAAAACACTTAACTACAGTATTAAAAGAGAATTCCATAAGGTAACTAGCAGACAGTGCTCAGTATCACATTTGGCATTATTTCATATTCATTCAATTCAATCCTATGCTAATATAGCTGAAATGTCCTCATGCCGGACTGTTTGTAATGAGTAGAAGAATAATCCTACATAAAGCTCCCTACGTCTCTGTTCAATTTAATTCAATTTGACCAAGATGTATTGAGCAGCTACTACATGCAAGGCTCACCGACATAGTGATGCCAACATGAGCTACATTTTATTGAGTACTTACTCTGCTTCAGGAACTCGCTAAGAGTTTCATTACCTTATATACTTTGATTCTTACAACAACATGAGAGAGGGGTGTTGTTCAAGATGTTTCACAAGTAGCATAGTTCTGCCCGTAGTACATTGCCTGTATGGGAAGCTCAAAGGGCCTATTTTACAGAAGAGAAAATTAAGGATTTGAGATATTAAAGGAGCTCATCTTTGCTATAGCAAGTGGCCAAACTTTTAGGGCTGTTTCATGAGACTGTAATTCATGGGCTACCATTGTGAATCAAGGTAAGTCCTTCCCATCGCTGTGGCTCTCAGTTCTAATTCTCTTGTTTGGGGTTCTGTTATATAGCAAGTGAATACAGTTTTCATAAGTCAGTGCATCCAGAAAGCTTTGTTTCAGATGTTCCAAGCCACTGGTTTCTCAGTCAGAAATATTTATTGCCTCTTTTCATTGCCATTAAAGGTTCAAGCTCAGAATGCTTTCTAACACCTGTTAATATTAATACAATTTCAGTAGTAGCATTAAGGAATGCCAGTTCTGGGTTGTGGGGGTCCCTAGGCCCACACCAGCAATGCTCTCGTCTTCCTGCAGAGGTTCTGCCAGTGCCTCAAGAATGTCCTCAGAGTTCTGTCCATTCTTCTTCTGCTCAGACGTTTTGTCTGAAAGTATTTTCCCAGGCCAGAGCTGGACTCTGTATAGAGTACAGAGGAAGGGCCAAGGTATGACTTTACTCCCACCACTTATATCTGAGAGCACAGGCACAGAGGGCAGTTTGTCACAAATCTGAGTGTGCCCGAGCTACATTCATGAATAATAAATAGGCTTCATATTGCCTCACATTCCCATTCCCATCCCAAGTATGAGGCTGGCTTTGTTGACTGCAAATGAAAACTGCTTATTTATAGGGAGGAAGATGGGAGTGGAAGGGCAGATTTTAGCGGGTTGGAAATCATATTAATAAATAAGCTGAGAAATATTTGACTTGGGTAAATGCTCAGGTAGAATTTGAACATTGAGAATCATGCATTCATCTGTGTGTAAGAAATAAGAGTCAAGGATAATCCTATAGCTTAATCCCCCAAAATATGATCCACAGAGCATCCTGTGTATTTCAGAATGCAACAGGAATGTGTGCGTATGCTCTTGGATTTCGACGAATGACTTCGAGCCCTGGGATCTGTTGCCTTGTGAACTTTTGCAGCCTGTCAAACCACAGTCTAGTAGTAAGAGAAACCGTCCTCTGCCTGGTCCAAAGATGCCAATGTGGATAGTTTGTCGTCCTTGAAATGTGGTGTGACCCTCACAGCTACCACGTTTCTGAAGGCATCAGAAAAATCAACCATACTTAAGAGTTAAAATACCGCACGTATCATACATTTTTAAAAGTCAGAGTAACAAGAGTATATGGAATAAGAAGATGAAGCAATCCACTAATCACCATTTTAATCTCACTGTCAAGTCAGCTTTATATATATTTTGCCTACATAGTTTCTTAGGAGTAGTGTCTCCTTGTAAATGTGCTGATTGCCTCTTAATTAAGTGATCTTATTTATAATACTAAAGAAACTCAGTAGAGACAAAAATGAGAATCACATTTTAAACTATTAGACATAAGGTCAAAAATAA
AGGCTTATTGCAAGGCAGCATTATCATC
11 | >15:44108746-44110769
12 | GCACTGTTAATCTTCAAGTGGCAAAGTTATTAGGACATAAATACTTATTAAGTTACTTGGCTCTAACTACTTAATAACTAAAATAAACAGAACTGTTTTGGCCCAGGAGCTCTGTGAAAAAACTCCTGGGACAATTATGAACGGTGAAAGTTCAGGACTCCTGGCCTACTCTATAAAGTAATGTCTCCATCAATATATAACATACAAAATCCCTCACTGGGAGGCTCTCTTTCTACCATGAGACCTTTATAAGGTCTTTTCTCCTACTTCAGTGTCATTTCCCCTTATTAGCATTTAGAATAAGTAGATGTATAAAATCAATTTACCAAAGTTTATAAAACATCTTTGTGCTAAAAAGACTCCTCTCAACAGCCAACAAATGGTCTGTGTTTTCTATTCCCATAAGGAGTTGAGTAGGTCCTTTAGGCCAGAAGTAACCTGCTGTTTCCTGAAGCTGCCACTTTGTAGCTTTGGTCATGTAACTTTCTTAGCCCCTTTGTGCTCTTAGTTTCCTCCTTTTGTGAAATGGGGATAATAGCACCTATTTCATAAAGTTATTGTATGGATTACAAGAATTAGCAAATGTAATCTACTTGGTAGTACTATTCAAGTATTAGCTATTATTATTATTATTAAGCTGTAGAGGTTCATGTTCAAGTAAACACAATCACAGACATTACCTCTCTTCCACATCTTCACTAATACGGTTCTGTAAACGCCGTAGCCGGGGGTCACTGGATGAATCCTCCTCCTGTTCCTCAGGCTCTGCTTCTTGTTCTTTGGCTTTCTTAATGAACTGAAATTCTTCATCCTCCTCATCTGAGGACTCCATAGGGGCATAGTCTGGCCTTTTTCCGGACACATAACGCTTTACCTTCACTTTTTCCATTGAAATCTCACCTGGGCGAGAAAGGTAACTTATGTTTCAGTAGCCTCTTTCTCAATGTGCTTCAACCCATCACGGCCTTTGCAAATAGAGCCCTTTATTCATAGTAGACAAGAGTCTAAGCAGAAGAAGAAGAGAGCCACTACCCAACCATCTACTCTTCTAATGGTGTTTTCCTACAAAGGCCAAGTCATGAGACTGCATCCTTGTGAAAGCCAACACTGATGATAATGAGGCTTACCTTGAGTACAATGAAGTAGAGGAAGGTAGGCAGTGAAACAGTAGAAAAAAGTCCCCCCCCAAAAAGGCAGACTGCATCCATCACAAATTCATGGTATCCCACCTCAACTATACCCTTAAACAAATTATTTGTAACAGTGCCCAGCACATAGTAAGGGTAATTTCTGCAGGAACATAAAACTGCTCAGGCATTCTTGTAGTTCCCTCTGGAATTTCCATGGCAGCCTTTATAACATACTGCCACATGACTCAATATTCTAATCTTACCTATTTCTACCTTCTTTTCCTTTACCTTATGTGTTATCAAACATGCAAGAGTATGTAATCTCTTACAAATACAATTCTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGTGGTCTCGGCTCACTGCAACCTCCGCCTCCCGGGTTCAAGTGATTCTTCTGCCTCAGCTTCCTGAGTAGCTGGGACTACAGACGCATGCCACCATGCCCAGCTAATTTTTTGTATTTTTAGTAGAGATGGGGTTTCACCGTATTAGCCATGATGGTCTCGATCTCCTGACCTCATGATCCGCCCACCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACAGCACCCAGCCTACAAATACAAACAATTCTAGTGAACAAAGACAGTGAAAACTAACCAACTTATTTTATGAGTATAACCTTGATAACCAAATTGGACAGTTCAAAGGAAAACTTTAGGTGGATTTCACTTACAAACACAAATACAAAAATACTAAATAACGTAAGAGCAAATTTATCCCAGCAATATAAAAACTACCAAAAAAAAAAAAAGAAACAAAAAAAAAAAACCCCTAGAC
CAACACAACTTATTAGAATTTAGCCCAA
13 | 


--------------------------------------------------------------------------------
/test/test_guideseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | test_guideseq
  6 | ----------------------------------
  7 | 
  8 | Tests for `guideseq` module.
  9 | """
 10 | 
 11 | import yaml
 12 | import unittest
 13 | import os
 14 | import shutil
 15 | import utils
 16 | from guideseq import guideseq
 17 | 
# Barcode sequence -> sample name mapping (not referenced elsewhere in this file).
TEST_SAMPLE_BARCODES = {'AGGCATGAGATCGC': 'mysample', 'GACTCCTGCGATAT': 'sample2'}
# Paths to the undemultiplexed input FASTQs; written into the manifest under
# the 'undemultiplexed' key by FullPipelineTestCase.setUp.
TEST_UNDEMULTIPLEXED_FILES = {'forward': 'data/undemultiplexed/undemux.r1.fastq.gz',
                  'reverse': 'data/undemultiplexed/undemux.r2.fastq.gz',
                  'index1': 'data/undemultiplexed/undemux.i1.fastq.gz',
                  'index2': 'data/undemultiplexed/undemux.i2.fastq.gz'}
# Paths to already-demultiplexed EMX1 FASTQs (not referenced elsewhere in this file).
TEST_DEMULTIPLEXED_FILES = {'read1': 'data/demultiplexed/EMX1.r1.fastq',
                            'read2': 'data/demultiplexed/EMX1.r2.fastq',
                            'index1': 'data/demultiplexed/EMX1.i1.fastq',
                            'index2': 'data/demultiplexed/EMX1.i2.fastq'}
# Per-sample settings written into the manifest under the 'samples' key.
# The 'control' sample has no target sequence (None).
TEST_SAMPLES = {
                'control':{
                 'barcode1':'CTCTCTAC',
                 'description':'Control',
                 'barcode2':'CTCTCTAT',
                 'target':None
                },
                'EMX1':{
                 'barcode1':'TAGGCATG',
                 'description':'EMX_site1',
                 'barcode2':'TAGATCGC',
                 'target':'GAGTCCGAGCAGAAGAAGAANGG'
                }
               }

TEST_SAMPLE_NAME = 'EMX1'
# Folder (relative to the working directory) that receives all pipeline output.
TEST_OUTPUT_PATH = 'test_output'
TEST_MIN_READS = 1000
TEST_DEMULTIPLEX_MANIFEST_PATH = os.path.join(TEST_OUTPUT_PATH, 'demultiplex_manifest.yaml')
# Manifest file generated by setUp and parsed by the pipeline under test.
TEST_MANIFEST_PATH = os.path.join(TEST_OUTPUT_PATH, 'test_manifest.yaml')

# External tool names written into the manifest.
# NOTE(review): presumably resolved via PATH by the pipeline -- confirm.
TEST_BWA_PATH = 'bwa'
TEST_BEDTOOLS_PATH = 'bedtools'

TEST_REFERENCE_GENOME = 'test_genome.fa'

# Folders holding the expected ("correct") output of each pipeline stage.
CORRECT_DEMULTIPLEXED_OUTPUT = 'data/demultiplexed'
CORRECT_UMITAGGED_OUTPUT = 'data/umitagged'
# NOTE: the name is misspelled ("CONSOLDIATED") but is referenced by this exact
# name in testFullPipeline, so it must not be renamed in isolation.
CORRECT_CONSOLDIATED_OUTPUT = 'data/consolidated'
CORRECT_ALIGNED_OUTPUT = 'data/aligned'
CORRECT_IDENTIFIED_OUTPUT = 'data/identified'
CORRECT_FILTERED_OUTPUT = 'data/filtered'

CORRECT_ALL_OUTPUT = 'data'
 61 | 
 62 | class FullPipelineTestCase(unittest.TestCase):
 63 | 
 64 |     def setUp(self):
 65 |         # Create the test output folder
 66 |         os.makedirs(TEST_OUTPUT_PATH)
 67 | 
 68 |         # Create the test demultiplexing YAML
 69 |         test_manifest_data = {}
 70 |         test_manifest_data['undemultiplexed'] = TEST_UNDEMULTIPLEXED_FILES
 71 |         test_manifest_data['demultiplex_min_reads'] = TEST_MIN_READS
 72 |         test_manifest_data['samples'] = TEST_SAMPLES
 73 |         test_manifest_data['output_folder'] = TEST_OUTPUT_PATH
 74 |         test_manifest_data['bwa'] = TEST_BWA_PATH
 75 |         test_manifest_data['bedtools'] = TEST_BEDTOOLS_PATH
 76 |         test_manifest_data['reference_genome'] = TEST_REFERENCE_GENOME
 77 | 
 78 |         with open(TEST_MANIFEST_PATH, 'w') as f:
 79 |             f.write(yaml.dump(test_manifest_data, default_flow_style=False))
 80 | 
 81 | 
 82 |     def testFullPipeline(self):
 83 |         g = guideseq.GuideSeq()
 84 |         g.parseManifest(TEST_MANIFEST_PATH)
 85 | 
 86 |         # Demultiplex and test the demultiplex output
 87 |         g.demultiplex()
 88 |         self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'demultiplexed'), CORRECT_DEMULTIPLEXED_OUTPUT))
 89 | 
 90 |         # UMITag and test the umitagging output
 91 |         g.umitag()
 92 |         self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'umitagged'), CORRECT_UMITAGGED_OUTPUT))
 93 | 
 94 |         # Consolidate and test the consolidation output
 95 |         g.consolidate()
 96 |         self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'consolidated'), CORRECT_CONSOLDIATED_OUTPUT))
 97 | 
 98 |         # Align and test the alignment output
 99 |         g.alignReads()
100 |         self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'aligned'), CORRECT_ALIGNED_OUTPUT))
101 | 
102 |         # Identify offtargets and test the output
103 |         g.identifyOfftargetSites()
104 |         self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'identified'), CORRECT_IDENTIFIED_OUTPUT))
105 | 
106 |         # Filter background sites and test if correct
107 |         g.filterBackgroundSites()
108 |         self.assertTrue(utils.checkFolderEquality(os.path.join(TEST_OUTPUT_PATH, 'filtered'), CORRECT_FILTERED_OUTPUT))
109 | 
110 | 
111 |     def tearDown(self):
112 |         # Delete temp output
113 |         #shutil.rmtree(TEST_OUTPUT_PATH)
114 |         pass
115 | 
116 | if __name__ == '__main__':
117 |     unittest.main()


--------------------------------------------------------------------------------
/test/test_manifest.yaml:
--------------------------------------------------------------------------------
 1 | reference_genome: test_genome.fa
 2 | output_folder: output
 3 | 
 4 | bwa: bwa
 5 | bedtools: bedtools
 6 | 
 7 | demultiplex_min_reads: 1000
 8 | window_size: 25
 9 | max_mismatches: 7
10 | 
11 | undemultiplexed:
12 |     forward: data/undemultiplexed/undemux.r1.fastq.gz
13 |     reverse: data/undemultiplexed/undemux.r2.fastq.gz
14 |     index1: data/undemultiplexed/undemux.i1.fastq.gz
15 |     index2: data/undemultiplexed/undemux.i2.fastq.gz
16 | 
17 | samples:
18 |     control:
19 |         target:  
20 |         barcode1: CTCTCTAC
21 |         barcode2: CTCTCTAT
22 |         description: Control
23 | 
24 |     EMX1:
25 |         target: GAGTCCGAGCAGAAGAAGAANGG
26 |         barcode1: TAGGCATG
27 |         barcode2: TAGATCGC
28 |         description: EMX_site1
29 | 


--------------------------------------------------------------------------------
/test/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import inspect
 4 | import filecmp
 5 | from itertools import islice
 6 | 
 7 | def checkFolderEquality(folder1, folder2):
 8 |     """
 9 |     Given two folders, check if there are the same number of files,
10 |     that the names of files are the same, and that the files with the same
11 |     names are the same.
12 |     """
13 | 
14 |     folder1_files = [x for x in os.listdir(folder1) if not x.startswith('.')]
15 |     folder2_files = [x for x in os.listdir(folder2) if not x.startswith('.')]
16 | 
17 |     if set(folder1_files) != set(folder2_files):
18 |         print 'Folders do not have the same filenames.'
19 |         return False
20 | 
21 |     for f in folder1_files:
22 |         file1 = os.path.join(folder1, f)
23 |         file2 = os.path.join(folder2, f)
24 | 
25 |         if f.split('.')[-1] == 'sam':
26 |             with open(file1, 'r') as a, open(file2, 'r') as b:
27 |                 for line1, line2 in zip(a,b):
28 |                     if line1.startswith('@'):
29 |                         continue
30 |                     elif line1 != line2:
31 |                         return False
32 |         else:
33 |             if not filecmp.cmp(file1, file2):
34 |                 print '{0} does not match between folders.'.format(f)
35 |                 return False
36 | 
37 |     return True
38 | 
39 | 
def head(filepath, n=10):
    """
    Print the first `n` lines of the file at `filepath` to standard output.

    Fewer lines are printed when the file is shorter than `n`.
    """
    with open(filepath) as f:
        for line in islice(f, n):
            # Each line already ends with its own newline; strip it so that
            # print() does not emit an extra blank line after every line.
            print(line.rstrip('\n'))
44 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # Tox (http://tox.testrun.org/) is a tool for running tests
 2 | # in multiple virtualenvs. This configuration file will run the
 3 | # test suite on all supported python versions. To use it, "pip install tox"
 4 | # and then run "tox" from this directory.
 5 | 
 6 | [tox]
 7 | envlist = py27
 8 | skipsdist = false
 9 | 
10 | [testenv]
11 | deps = -rrequirements.txt
12 | commands = nosetests -w test/ --exe -v
13 | whitelist_externals=*


--------------------------------------------------------------------------------