├── ._README.MD
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.MD
├── build_pypi.sh
├── changeseq
├── NUC_SIMPLE
├── __init__.py
├── alignReads.py
├── callVariants.py
├── changeseq.py
├── findCleavageSites.py
├── log.py
├── mergeReads.py
├── referenceFree.py
├── refseq_gene_name.list
├── refseq_gene_name.py
├── test.yaml
├── utility.py
├── validation.py
└── visualization.py
├── conda_build
├── conda_build_config.yaml
└── meta.yaml
├── example_output.png
├── requirements.txt
├── scripts
├── NUC_SIMPLE
├── __init__.py
├── site_pvalue.R
├── test.py
├── test_align.py
└── test_ga.py
├── setup.cfg
├── setup.py
└── test
├── CIRCLEseq_MergedTest.yaml
├── CIRCLEseq_StandardTest.yaml
├── __init__.py
├── data
├── MergedOutput
│ ├── aligned
│ │ ├── TestSample.bam
│ │ ├── TestSample.bam.bai
│ │ ├── TestSample.sam
│ │ ├── TestSample_sorted.bam
│ │ ├── control_TestSample.bam
│ │ ├── control_TestSample.bam.bai
│ │ ├── control_TestSample.sam
│ │ └── control_TestSample_sorted.bam
│ ├── fastq
│ │ ├── TestSample_merged.fastq.gz
│ │ └── control_TestSample_merged.fastq.gz
│ ├── identified
│ │ ├── TestSample_CONTROL_coordinates.txt
│ │ ├── TestSample_NUCLEASE_coordinates.txt
│ │ ├── TestSample_count.txt
│ │ ├── TestSample_identified_matched.txt
│ │ └── TestSample_identified_unmatched.txt
│ └── visualization
│ │ └── TestSample_offtargets.svg
├── StandardOutput
│ ├── aligned
│ │ ├── TestSample.bam
│ │ ├── TestSample.bam.bai
│ │ ├── TestSample.sam
│ │ ├── TestSample_sorted.bam
│ │ ├── control_TestSample.bam
│ │ ├── control_TestSample.bam.bai
│ │ ├── control_TestSample.sam
│ │ └── control_TestSample_sorted.bam
│ ├── identified
│ │ ├── TestSample_CONTROL_coordinates.txt
│ │ ├── TestSample_NUCLEASE_coordinates.txt
│ │ ├── TestSample_count.txt
│ │ ├── TestSample_identified_matched.txt
│ │ └── TestSample_identified_unmatched.txt
│ ├── variants
│ │ ├── TestSample_Variants.txt
│ │ └── TestSample_mpileupCall.txt
│ └── visualization
│ │ └── TestSample_offtargets.svg
└── input
│ ├── CIRCLEseq_test_genome.fa
│ ├── CIRCLEseq_test_genome.fa.amb
│ ├── CIRCLEseq_test_genome.fa.ann
│ ├── CIRCLEseq_test_genome.fa.bwt
│ ├── CIRCLEseq_test_genome.fa.fai
│ ├── CIRCLEseq_test_genome.fa.pac
│ ├── CIRCLEseq_test_genome.fa.sa
│ ├── TEST.r1.fastq.gz
│ ├── TEST.r2.fastq.gz
│ ├── TEST_control.r1.fastq.gz
│ └── TEST_control.r2.fastq.gz
├── scripts
├── CIRCLEseq_prepare_test_data.sh
├── CIRCLEseq_prepare_test_reference.sh
├── CIRCLEseq_test_bed.R
└── Test.sh
├── test_circleseq_merged.py
├── test_circleseq_std.py
└── utils.py
/._README.MD:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/._README.MD
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | changeseq.egg-info/
4 | *.py[cod]
5 | test/output
6 | # macOS Finder metadata (merge conflict between HEAD and add-testing resolved; both patterns kept)
7 | *.DS_Store
8 | .DS_Store
11 |
12 | # PyCharm Pref Folder
13 | .idea
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Packages
19 | *.egg
20 | *.egg-info
21 | dist
22 | build
23 | eggs
24 | parts
25 | bin
26 | var
27 | sdist
28 | develop-eggs
29 | .installed.cfg
30 | lib64
31 |
32 | # Installer logs
33 | pip-log.txt
34 |
35 | # Unit test / coverage reports
36 | .coverage
37 | .tox
38 | nosetests.xml
39 | htmlcov
40 |
41 | # Translations
42 | *.mo
43 |
44 | # Mr Developer
45 | .mr.developer.cfg
46 | .project
47 | .pydevproject
48 |
49 | # Complexity
50 | output/*.html
51 | output/*/index.html
52 |
53 | # Sphinx
54 | docs/_build
55 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Config file for automatic testing at travis-ci.org

language: python

python:
  - "2.7"

# Build the external tools (bwa, samtools, bcftools) from source and put them on PATH.
before_install:
  - cd test
  - git clone https://github.com/lh3/bwa.git
  - cd bwa
  - git checkout tags/v0.7.13
  - make
  - cd ..
  - PATH=`pwd`/bwa:$PATH
  - git clone https://github.com/samtools/htslib.git
  - cd htslib
  - git checkout tags/1.3
  - make
  - cd ..
  - git clone https://github.com/samtools/samtools.git
  - cd samtools
  - git checkout tags/1.3
  - make
  - cd ..
  - PATH=`pwd`/samtools:$PATH
  - cd ..
  # Cloned over https:// like the clones above: GitHub no longer serves the
  # unauthenticated git:// protocol, so git:// URLs fail to clone.
  - git clone https://github.com/samtools/htslib.git
  - cd htslib
  - git checkout tags/1.7
  - cd ..
  - git clone https://github.com/samtools/bcftools.git
  - cd bcftools
  - git checkout tags/1.6
  - make
  - cd ..
  - PATH=`pwd`/bcftools:$PATH

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
  - pip install --upgrade pip setuptools wheel
  - pip install --only-binary=numpy,scipy numpy scipy
  - pip install -r requirements.txt

# command to run tests, e.g. python setup.py test
script:
  cd test && nosetests --exe -v
48 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 |
2 | [![Version][version-shield]][version-url]
3 | [![Python versions][python-shield]][python-url]
4 | [![Platforms][platform-shield]][python-url]
5 |
6 |
7 | # CHANGE-seq: Circularization for High-throughput Analysis Nuclease Genome-wide Effects by Sequencing
8 |
9 | This is a repository for CHANGE-seq analytical software, which takes sample-specific paired-end FASTQ files as input and produces a list of CHANGE-seq detected off-target cleavage sites as output.
10 |
11 | # Summary
12 |
13 | This package implements a pipeline that takes in reads from the CHANGE-seq assay and returns detected cleavage sites as output. The individual pipeline steps are:
14 |
15 | 1. **Merge**: Merge read1 and read2 for easier mapping to genome.
16 | 2. **Read Alignment**: Merged paired end reads from the assay are aligned to the reference genome using the BWA-MEM algorithm with default parameters (Li. H, 2009).
17 | 3. **Cleavage Site Identification**: Mapped sites are analyzed to determine which represent high-quality cleavage sites.
18 | 4. **Visualization of Results**: Identified on-target and off-target cleavage sites are rendered as a color-coded alignment map for easy analysis of results.
19 |
20 | # Installation
21 |
22 | The easiest way to install the CHANGE-seq pipeline is via conda.
23 |
24 | ```
25 |
26 | conda create -n changeseq -c conda-forge -c bioconda -c anaconda -c omnia -c tsailabSJ changeseq
27 |
28 | source activate changeseq
29 |
30 | changeseq.py -h
31 |
32 | ## BWA 0.7.17 and samtools 1.9 are automatically installed
33 |
34 | ## If Homer is available, the identified off-targets will be annotated using "annotatePeaks.pl", specify the genome version in the YAML file.
35 |
36 |
37 | ```
38 |
39 | Alternatively, you can git clone this repository and install
40 |
41 | ```
42 |
43 | git clone https://github.com/tsailabSJ/changeseq
44 |
45 | cd changeseq
46 |
47 | pip install -r requirements.txt
48 |
49 | python setup.py install
50 |
51 | changeseq.py -h
52 |
53 | ## Please install BWA and samtools if you choose this option
54 |
55 | ```
56 |
57 | ## Download Reference Genome
58 |
59 | The CHANGEseq package requires a reference genome for read mapping. You can use any genome of your choosing, but for all of our testing and original CHANGE-seq analyses we use hg19 ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)). Be sure to (g)unzip the FASTA file before use if it is compressed.
60 |
61 | # Usage
62 |
63 | The change-seq pipeline requires a manifest yaml file specifying input files, output directory, and pipeline parameters. Once the yaml file is created, users can simply run ``changeseq.py all --manifest /path/to/manifest.yaml``
64 |
65 |
66 | Below is an example ``manifest.yaml`` file::
67 |
68 | reference_genome: /data/joung/genomes/Homo_sapiens_assembly19.fasta
69 | analysis_folder: /data/joung/CHANGE-Seq/test2
70 |
71 | bwa: bwa
72 | samtools: samtools
73 |
74 | read_threshold: 4
75 | window_size: 3
76 | mapq_threshold: 50
77 | start_threshold: 1
78 | gap_threshold: 3
79 | mismatch_threshold: 6
80 | search_radius: 30
81 | merged_analysis: True
82 |
83 | samples:
84 | U2OS_exp1_VEGFA_site_1:
85 | target: GGGTGGGGGGAGTTTGCTCCNGG
86 | read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R1_001.fastq.gz
87 | read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R2_001.fastq.gz
88 | controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz
89 | controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz
90 | description: U2OS_exp1
91 | U2OS_exp1_EMX1:
92 | target: GAGTCCGAGCAGAAGAAGAANGG
93 | read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R1_001.fastq.gz
94 | read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R2_001.fastq.gz
95 | controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz
96 | controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz
97 | description: U2OS_exp1
98 |
99 | ## Quickstart
100 |
101 | ```
102 |
103 | git clone https://github.com/tsailabSJ/changeseq
104 |
105 | cd changeseq/test
106 |
107 | changeseq.py all --manifest CIRCLEseq_MergedTest.yaml
108 |
109 | ```
110 |
111 | ## Example Output
112 |
113 | 
114 |
115 | # Writing A Manifest File
116 | When running the end-to-end analysis functionality of the CHANGEseq package a number of inputs are required. To simplify the formatting of these inputs and to encourage reproducibility, these parameters are inputted into the pipeline via a manifest formatted as a YAML file. YAML files allow easy-to-read specification of key-value pairs. This allows us to easily specify our parameters. The following fields are required in the manifest:
117 |
118 | - `reference_genome`: The absolute path to the reference genome FASTA file.
119 | - `analysis_folder`: The absolute path to the folder in which all pipeline outputs will be saved.
120 | - `bwa`: The absolute path to the `bwa` executable
121 | - `samtools`: The absolute path to the `samtools` executable
122 | - `read_threshold`: The minimum number of reads at a location for that location to be called as a site. We recommend leaving it to the default value of 4.
123 | - `window_size`: Size of the sliding window, we recommend leaving it to the default value of 3.
124 | - `mapq_threshold`: Minimum read mapping quality score. We recommend leaving it to the default value of 50.
125 | - `start_threshold`: Tolerance for breakpoint location. We recommend leaving it to the default value of 1.
126 | - `gap_threshold`: Distance between breakpoints. We recommend leaving it to the default value of 3 for Cas9.
127 | - `mismatch_threshold`: Number of tolerated gaps in the fuzzy target search step. We recommend leaving it to the default value of 6.
128 | - `read_length`: Fastq file read length, default is 151.
129 | - `PAM`: PAM sequence, default is NGG.
130 | - `genome`: used for homer peak annotation, e.g., hg19, hg38, mm9, or mm10.
131 | - `merged_analysis`: Whether or not the paired read merging step should take place. Default is True.
132 | - `samples`: Lists the samples you wish to analyze and the details for each. Each sample name should be nested under the top level samples key, and each sample detail should be nested under the sample name. See the sample manifest for an example.
133 | - For each sample, you must provide the following parameters:
134 | - `target`: Target sequence for that sample. Accepts degenerate bases.
135 | - `read1`: The absolute path to the .FASTQ(.gz) file containing the read1 reads.
136 | - `read2`: The absolute path to the .FASTQ(.gz) file containing the read2 reads.
137 | - `controlread1`: The absolute path to the .FASTQ(.gz) file containing the control read1 reads.
138 | - `controlread2`: The absolute path to the .FASTQ(.gz) file containing the control read2 reads.
139 | - `description`: A brief description of the sample
140 |
141 |
142 | # Pipeline Output
143 | When running the full pipeline, the results of each step are written to the `analysis_folder` given in the manifest, in a separate folder for each step. The output folders and their respective contents are as follows:
144 |
145 | - `analysis_folder/aligned`: Contains an alignment `.sam`, alignment `.bam`, sorted `bam`, and `.bai` index file for each sample.
146 | - `analysis_folder/fastq`: Merged `.fastq.gz` files for each sample.
147 | - `analysis_folder/identified`: Contains tab-delimited `.txt` files for each sample containing the identified DSBs, control DSBs, filtered DSBs, and read quantification.
148 | - `analysis_folder/visualization`: Contains a `.svg` vector image representing an alignment of all detected off-targets to the targetsite for each sample.
149 |
150 | # FAQ
151 |
152 | ## Homer installation
153 |
154 | ```
155 |
156 | conda install -c bioconda homer
157 |
158 | # To install genome annotation
159 | # Ref: http://homer.ucsd.edu/homer/introduction/configure.html
160 |
161 | ## Suppose you want to install hg19, follow the command here:
162 |
163 | annotatePeaks.pl xxx hg19
164 |
165 | ## You should be able to see:
166 |
167 | !!!!Genome hg19 not found in /rgs01/project_space/tsaigrp/Genomics/common/anaconda3/envs/changeseq/share/homer-4.11-2/.//config.txt
168 |
169 | To check if is available, run "perl /rgs01/project_space/tsaigrp/Genomics/common/anaconda3/envs/changeseq/share/homer-4.11-2/.//configureHomer.pl -list"
170 | If so, add it by typing "perl /rgs01/project_space/tsaigrp/Genomics/common/anaconda3/envs/changeseq/share/homer-4.11-2/.//configureHomer.pl -install hg19"
171 |
172 | ## Copy and paste the perl command to install genome annotation
173 | ```
174 |
175 |
176 |
177 | [version-shield]: https://img.shields.io/conda/v/tsailabsj/changeseq.svg
178 | [version-url]: https://anaconda.org/tsailabSJ/changeseq
179 | [python-shield]: https://img.shields.io/pypi/pyversions/changeseq.svg
180 | [python-url]: https://pypi.python.org/pypi/changeseq
181 | [platform-shield]: https://anaconda.org/tsailabsj/changeseq/badges/platforms.svg
182 |
--------------------------------------------------------------------------------
/build_pypi.sh:
--------------------------------------------------------------------------------
# Build the source and wheel distributions, then upload them with twine.
# Stop at the first failing command so a broken build is never uploaded.
set -e

python setup.py sdist
python setup.py bdist_wheel
twine upload dist/*
4 |
5 |
--------------------------------------------------------------------------------
/changeseq/NUC_SIMPLE:
--------------------------------------------------------------------------------
1 | #
2 | # This matrix was created by Todd Lowe 12/10/92
3 | #
4 | # Uses ambiguous nucleotide codes, probabilities rounded to
5 | # nearest integer
6 | #
7 | # Lowest score = -5, Highest score = 10
8 | #
9 | # Modified by Shengdar Tsai 1/23/16
10 | A T G C N
11 | A 10 -5 -5 -5 10
12 | T -5 10 -5 -5 10
13 | G -5 -5 10 -5 10
14 | C -5 -5 -5 10 10
15 | N 10 10 10 10 10
--------------------------------------------------------------------------------
/changeseq/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Using __init__.py to organize the structure.
3 | """
4 |
5 | __version__ = "1.2.9.1"
--------------------------------------------------------------------------------
/changeseq/alignReads.py:
--------------------------------------------------------------------------------
1 | """
2 | alignReads
3 | """
4 |
5 | from __future__ import print_function
6 |
7 | import subprocess
8 | import os
9 | import logging
10 |
# Module-wide logger, looked up under the name 'root'.
# propagate=False keeps its records from being passed on to the handlers of ancestor loggers.
logger = logging.getLogger('root')
logger.propagate = False
13 |
def alignReads(BWA_path, HG19_path, read1, read2, outfile):
    """
    Align paired-end reads with ``bwa mem`` and post-process the result with samtools.

    Parameters:
        BWA_path: bwa command (executable name or path).
        HG19_path: reference genome FASTA. It is indexed with ``bwa index``
            first when any of the expected index files is missing.
        read1, read2: FASTQ files handed to ``bwa mem``.
        outfile: path of the SAM file to write. The part of its basename
            before the first '.' is used as the sample name for the BAM
            files written next to it.

    Files written in the directory of ``outfile``: the SAM file,
    ``<sample>.bam`` (``samtools sort``) plus its index (``samtools index``),
    and ``<sample>_sorted.bam`` (``samtools sort -n``).
    ``samtools`` is looked up on PATH.

    Raises:
        subprocess.CalledProcessError: when bwa or samtools exits non-zero.
    """
    import shlex  # local import: only needed to tokenise BWA_path

    sample_name = os.path.basename(outfile).split('.')[0]
    output_folder = os.path.dirname(outfile)
    base_name = os.path.join(output_folder, sample_name)
    sam_filename = outfile
    bam_filename = base_name + '.bam'
    name_sorted_bam_filename = base_name + '_sorted.bam'

    # dirname() is '' for a bare file name; os.makedirs('') would raise.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Commands are run as argument lists (no shell), so paths containing
    # spaces or shell metacharacters are passed through unchanged.
    bwa_command = shlex.split(BWA_path)

    # Check if genome is already indexed by bwa
    index_files_extensions = ['.pac', '.amb', '.ann', '.bwt', '.sa']
    genome_indexed = all(os.path.isfile(HG19_path + extension)
                         for extension in index_files_extensions)

    # If the genome is not already indexed, index it
    if not genome_indexed:
        logger.info('Genome index files not detected. Running BWA to generate indices.')
        bwa_index_command = bwa_command + ['index', HG19_path]
        logger.info('Running bwa command: %s', ' '.join(bwa_index_command))
        # check_call (not call): a failed indexing run must stop the pipeline
        # here instead of surfacing later as a confusing alignment error.
        subprocess.check_call(bwa_index_command)
        logger.info('BWA genome index generated')
    else:
        logger.info('BWA genome index found.')

    # Run paired end alignment against the genome, writing bwa's stdout to the SAM file
    logger.info('Running paired end mapping for {0}'.format(sample_name))
    bwa_alignment_command = bwa_command + ['mem', HG19_path, read1, read2]
    logger.info('%s > %s', ' '.join(bwa_alignment_command), sam_filename)
    with open(sam_filename, 'w') as sam_file:
        subprocess.check_call(bwa_alignment_command, stdout=sam_file)
    logger.info('Paired end mapping for {0} completed.'.format(sample_name))

    # SAM -> sorted BAM, BAM index, then a second BAM sorted by read name
    samtools_steps = [
        (['samtools', 'sort', '-o', bam_filename, sam_filename],
         'Sorting by coordinate position for {0} complete.'),
        (['samtools', 'index', bam_filename],
         'Indexing for {0} complete.'),
        (['samtools', 'sort', '-o', name_sorted_bam_filename, '-n', bam_filename],
         'Sorting for {0} by name complete.'),
    ]
    for command, done_message in samtools_steps:
        logger.info(' '.join(command))
        subprocess.check_call(command)
        logger.info(done_message.format(sample_name))
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/changeseq/callVariants.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import subprocess
4 | import sys
5 | import os
6 | import argparse
7 | import regex
8 | import re
9 | import HTSeq
10 | import pyfaidx
11 | from findCleavageSites import get_sequence, regexFromSequence, alignSequences, reverseComplement, extendedPattern, realignedSequences
12 |
13 |
14 | """
15 | Run samtools:mpileup and get all identified variants in the window sequences
16 | """
def _remove_tree(path):
    """Delete the directory tree at ``path`` if it exists."""
    import shutil  # local import: only needed for scratch-directory cleanup

    if os.path.exists(path):
        shutil.rmtree(path)


def snpCall(matched_file, reference, bam_file, out, search_radius):
    """
    Run samtools mpileup and bcftools call on the window around every matched site.

    Parameters:
        matched_file: tab-delimited file with one header row. For each row,
            column 0 is used as chromosome, columns 6 and 7 as window bounds
            (widened by ``search_radius`` on both sides), and columns 26 and 3
            are joined into the per-region file name.
        reference: reference FASTA passed to ``samtools mpileup --fasta-ref``.
        bam_file: BAM file passed to ``samtools mpileup``.
        out: output basename including its path; ``out + '_mpileupCall.txt'``
            is written and three scratch directories are created next to it
            and removed again at the end.
        search_radius: number of bases added on each side of the window.

    Returns:
        List of rows ``[basename, site tag, chromosome, one-based position,
        reference, variant, quality, genotype, depth, PL]``, the same rows
        that are written to ``out + '_mpileupCall.txt'``.

    Raises:
        subprocess.CalledProcessError: when an external command exits non-zero.
    """
    basename = os.path.basename(out)
    output_folder = os.path.dirname(out)

    # Collect (chromosome, windowStart, windowEnd, region_basename) per matched site.
    # Plain 'r' mode: the 'U' flag was removed in Python 3.11 and strip() below
    # already discards any trailing '\r'.
    regions = []
    with open(matched_file, 'r') as f:
        f.readline()  # skip the header row
        for line in f:
            site = line.strip().split('\t')
            regions.append((site[0],
                            int(site[6]) - search_radius,
                            int(site[7]) + search_radius,
                            '_'.join([site[26], site[3]])))

    print('Running samtools:mpileup for %s' % basename, file=sys.stderr)
    out_vcf = os.path.join(output_folder, basename + '_mpileup_output')
    _remove_tree(out_vcf)
    os.makedirs(out_vcf)
    with open(os.path.join(out_vcf, 'logFile_mpileup'), 'w') as process_mpileup:
        for chromosome, windowStart, windowEnd, region_basename in regions:
            region = '%s:%s-%s' % (chromosome, windowStart, windowEnd)
            output = os.path.join(out_vcf, region_basename + '.vcf')

            cl_vcf = 'samtools mpileup -v --region %s --fasta-ref %s %s > %s' % (region, reference, bam_file, output)
            subprocess.check_call(cl_vcf, shell=True, env=os.environ.copy(), stderr=process_mpileup, stdout=process_mpileup)

    print('Collecting variants for %s' % basename, file=sys.stderr)
    out_bcf = os.path.join(output_folder, basename + '_output_bcftools')
    _remove_tree(out_bcf)
    os.makedirs(out_bcf)
    with open(os.path.join(out_bcf, 'logFile_bcftools'), 'w') as process_bcftools:
        vcf_files = [f for f in os.listdir(out_vcf) if os.path.isfile(os.path.join(out_vcf, f))]
        for arch in vcf_files:
            if not arch.startswith('.') and arch.endswith('.vcf'):
                name = arch[:-4]  # drop '.vcf'
                output = os.path.join(out_bcf, name + '_BCFcall.vcf')

                cl_bcf = 'bcftools call -v -c %s > %s' % (os.path.join(out_vcf, arch), output)
                subprocess.check_call(cl_bcf, shell=True, env=os.environ.copy(), stderr=process_bcftools, stdout=process_bcftools)

    print('Collecting significant variant calls for %s' % basename, file=sys.stderr)
    out_svc = os.path.join(output_folder, basename + '_output_svc')
    _remove_tree(out_svc)
    os.makedirs(out_svc)
    with open(os.path.join(out_svc, 'logFile_svc'), 'w') as process_svc:
        bcf_files = [f for f in os.listdir(out_bcf) if os.path.isfile(os.path.join(out_bcf, f))]
        for arch in bcf_files:
            if not arch.startswith('.') and arch.endswith('.vcf'):
                name = arch[:-12]  # drop '_BCFcall.vcf'
                output = os.path.join(out_svc, name + '_SIGNFcall.txt')

                # Drop the '##' meta lines, then the first remaining line (the column header).
                cl_sed = "sed -n '/##/!p' %s | awk 'FNR>1' > %s" % (os.path.join(out_bcf, arch), output)
                subprocess.check_call(cl_sed, shell=True, env=os.environ.copy(), stderr=process_svc, stdout=process_svc)

    print('Consolidating all the significant variant calls for %s' % basename, file=sys.stderr)
    header = ['targetsite', 'site_name', 'chromosome', 'one_based_position', 'reference', 'variant', 'quality', 'genotype', 'depth', 'PL']
    variants = []

    svc_files = [f for f in os.listdir(out_svc) if os.path.isfile(os.path.join(out_svc, f))]
    for arch in svc_files:
        if not arch.startswith('.') and arch.endswith('.txt'):
            tag = arch[:-14]  # drop '_SIGNFcall.txt'
            with open(os.path.join(out_svc, arch), 'r') as f:
                reads = f.readlines()

            for line in reads:
                item = line.split()
                # The depth entry is the 4th ';'-separated field of column 7
                # for INDEL records and the 1st one otherwise.
                depth_field = 3 if 'INDEL' in item[7] else 0
                genotype = str(int(item[9][0])) + '|' + str(int(item[9][2]))
                depth = item[7].split(';')[depth_field][3:]
                likelihoods = '_'.join(item[9][4:].split(','))
                variants.append([basename, tag] + item[:2] + item[3:6] + [genotype, depth, likelihoods])

    with open(out + '_mpileupCall.txt', 'w') as out_file:
        print(*header, sep='\t', file=out_file)
        for item in variants:
            print(*item, sep='\t', file=out_file)

    print('Cleaning up directive for %s' % basename, file=sys.stderr)
    for scratch_dir in (out_vcf, out_bcf, out_svc):
        _remove_tree(scratch_dir)

    print('Done running samtools:mpileup for %s' % basename, file=sys.stderr)
    return variants
116 |
117 |
118 | """
119 | Obtain variant off-target sequences
120 | """
def realignVariantBulge(bulge_sequence, window_sequence_variant, bulge_strand):
    """
    Return the stretch of the variant window that matches a gapped off-target sequence.

    The gap characters ('-') are removed from ``bulge_sequence``, the remaining
    bases are searched case-insensitively in ``window_sequence_variant`` (or in
    its reverse complement when ``bulge_strand`` is not '+'), and every gap is
    put back at the position it had in ``bulge_sequence``. The returned bases
    keep the letter case found in the window.

    A sequence without any gap is returned without an inserted '-', and an
    empty sequence yields an empty string.

    Raises:
        AttributeError: when the gap-free sequence is not found in the window
            (``re.search`` returns None).
    """
    bseq = bulge_sequence.replace('-', '')
    if bulge_strand == '+':
        search_space = window_sequence_variant
    else:
        search_space = reverseComplement(window_sequence_variant)
    m_bulge = re.search(bseq, search_space, re.I)
    matched_bases = m_bulge.group()

    # Walk the gapped template: copy '-' as is, otherwise take the next matched base.
    realigned = []
    consumed = 0
    for symbol in bulge_sequence:
        if symbol == '-':
            realigned.append('-')
        else:
            realigned.append(matched_bases[consumed])
            consumed += 1
    return ''.join(realigned)
130 |
131 |
def SNPreader(snp_file):
    """
    Load variant rows into an unstranded HTSeq.GenomicArray of objects.

    Each row of ``snp_file`` must hold ten values: basename, snpID, chromosome,
    one-based position, reference, variant, quality, genotype, depth and PL.
    The array cell at the (zero-based) position stores
    ``[position, reference, variant, genotype, quality, key]`` where ``key``
    is ``basename + '_' + chromosome``.
    """
    snp_array = HTSeq.GenomicArray("auto", stranded=False, typecode='O')

    for record in snp_file:
        # snpID, depth and PL are unpacked only to enforce the ten-column shape.
        sample, _snp_id, chrom, pos_one_based, ref_allele, alt_allele, qual, gt, _depth, _pl = record
        zero_based = int(pos_one_based) - 1
        sample_chrom_key = '_'.join([sample, chrom])
        locus = HTSeq.GenomicInterval(chrom, zero_based, zero_based + 1, ".")
        snp_array[locus] = [zero_based, ref_allele, alt_allele, gt, qual, sample_chrom_key]
    return snp_array
141 |
142 |
def arrayOffTargets(matched_file, search_radius):
    """
    Index the rows of a matched-sites file by site name.

    The first line of ``matched_file`` is skipped as a header. For every other
    tab-delimited row, column 3 is the site name, column 0 the chromosome and
    columns 6/7 the window bounds, widened by ``search_radius`` on both sides.

    Returns:
        A pair of dicts keyed by site name: the split row, and an unstranded
        HTSeq.GenomicInterval covering the widened window.
    """
    rows_by_name, windows_by_name = {}, {}

    with open(matched_file, 'r') as handle:
        handle.readline()
        for row in handle:
            fields = row.strip().split('\t')

            chrom = fields[0]
            window_start = int(fields[6]) - search_radius
            window_end = int(fields[7]) + search_radius
            site_name = fields[3]

            rows_by_name[site_name] = fields
            # one genomic interval per window sequence
            windows_by_name[site_name] = HTSeq.GenomicInterval(chrom, window_start, window_end, ".")
    return rows_by_name, windows_by_name
162 |
163 |
def snpAdjustment(matched_file, snp_file, out, mismatch_threshold, search_radius):
    """
    Re-derive off-target sequences after applying called variants to each site window.

    Writes ``out + '_Variants.txt'``: a header row first, then one row for every
    site whose no-bulge and/or bulged off-target sequence differs from the one
    stored in ``matched_file`` once the variants are substituted into the
    window sequence.

    Parameters:
        matched_file: tab-delimited matched-sites file, read through arrayOffTargets.
        snp_file: iterable of variant rows, loaded through SNPreader.
        out: output basename including its path; its basename is also used to
            build the key that selects this sample's variants.
        mismatch_threshold: passed to alignSequences as ``max_score``.
        search_radius: passed to arrayOffTargets to widen each window.
    """
    # Create/truncate the output file and write the column header.
    output_file = open(out + '_Variants.txt', 'w')
    print('Chromosome', 'Start', 'End', 'Name', 'ReadCount', 'Strand',
          'Variant_WindowSequence',
          'Variant_Site_SubstitutionsOnly.Sequence', 'Variant_Site_SubstitutionsOnly.NumSubstitutions',
          'Variant_Site_SubstitutionsOnly.Strand',
          'Variant_Site_GapsAllowed.Sequence', 'Variant_Site_GapsAllowed.Length',
          'Variant_Site_GapsAllowed.Substitutions', 'Variant_Site_GapsAllowed.Insertions', 'Variant_Site_GapsAllowed.Deletions',
          'Variant_Site_GapsAllowed.Strand',
          'Cell', 'Targetsite', 'TargetSequence', 'Variant_RealignedTargetSequence',
          'Reference', 'Variant', 'Genotype', 'Quality',
          sep='\t', file=output_file)
    output_file.close()

    basename = os.path.basename(out)
    offtargets, gi_offtargets = arrayOffTargets(matched_file, search_radius)
    ga_snp = SNPreader(snp_file)

    for name in offtargets:
        variant_flag = False
        site = offtargets[name]
        gi = gi_offtargets[name]

        # Columns of the matched-file row that are used below.
        chromosome = site[0]
        window_sequence = site[9]
        window_sequence = window_sequence.upper()
        cell, targetsite = site[25:27]
        TargetSequence = site[28]
        output01 = site[0:6]
        output03 = [cell, targetsite, TargetSequence]
        # presumably the stored no-bulge / bulged off-target sequences - verify against the matched-file layout
        ots_nb, ots_bu = site[10], site[15]

        # obtain variant window sequence
        wkey = '_'.join([basename, chromosome])
        insert_start, insert_end, insert_var, snp_data = list(), list(), list(), {}

        # Collect every variant stored inside this window that belongs to this sample/chromosome key.
        for i, v in ga_snp[gi].steps():
            if v:
                position, reference, variant, genotype, quality, key = v
                if key == wkey:
                    # Only the first alternative allele is used.
                    variant = variant.split(',')[0]
                    # n is the offset of the variant position inside the window.
                    for n, pos in enumerate(range(gi.start, gi.end)):
                        if pos == int(position):
                            insert_var.append(variant.lower())
                            insert_start.append(n)
                            end_pos = n + len(reference)
                            insert_end.append(end_pos)
                    snp_data[str(position)] = [position, reference, variant, genotype, quality]

        # Rebuild the window: reference bases between variants stay upper-case,
        # each variant allele is inserted in lower-case in place of its reference allele.
        tri = 0
        window_sequence_variant = ''
        for i in range(len(insert_var)):
            variant = insert_var[i]
            pos = insert_start[i]
            window_sequence_variant += window_sequence[tri:pos] + variant.lower()
            tri = insert_end[i]
        window_sequence_variant += window_sequence[tri:]

        # variant off-target sequences: only proceed if there is a variant in the window sequence
        window_sequence_var = window_sequence_variant.upper()
        if window_sequence_var != window_sequence:
            offtarget_sequence_no_bulge, mismatches, offtarget_sequence_length, chosen_alignment_strand_m, start_no_bulge, end_no_bulge, \
                realigned_target, \
                bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end = \
                alignSequences(TargetSequence, window_sequence_var, max_score=mismatch_threshold)

            variant_ots_no_bulge, variant_ots_bulge = '', ''

            # get variant sequence if the off-target sequences have changed by considering the variant window
            if ots_nb != offtarget_sequence_no_bulge:
                variant_flag = True
                # Search the mixed-case window so the reported sequence keeps the lower-case variant bases.
                if chosen_alignment_strand_m == '+':
                    m_no_bulge = re.search(offtarget_sequence_no_bulge, window_sequence_variant, re.I)
                else:
                    m_no_bulge = re.search(offtarget_sequence_no_bulge, reverseComplement(window_sequence_variant), re.I)
                variant_ots_no_bulge = m_no_bulge.group()

            if ots_bu != bulged_offtarget_sequence:
                variant_flag = True
                variant_ots_bulge = realignVariantBulge(bulged_offtarget_sequence, window_sequence_variant, chosen_alignment_strand_b)

            # collect and write variant data if we have variant off-target sequence(s)
            if variant_flag:
                # Join the per-variant values of this window with ':' separators.
                total_genotype, total_reference, total_variant, total_quality = '', '', '', ''
                for pos in snp_data:
                    position, reference, variant, genotype, quality = snp_data[pos]
                    if total_genotype != '':
                        total_genotype += ''.join([':', genotype])
                        total_reference += ''.join([':', reference])
                        total_variant += ''.join([':', variant])
                        total_quality += ''.join([':', quality])
                    else:
                        total_genotype += ''.join([genotype])
                        total_reference += ''.join([reference])
                        total_variant += ''.join([variant])
                        total_quality += ''.join([quality])

                output02 = [variant_ots_no_bulge, mismatches, chosen_alignment_strand_m,
                            variant_ots_bulge, length, substitutions, insertions, deletions, chosen_alignment_strand_b]
                output04 = [total_reference, total_variant, total_genotype, total_quality]
                output_line = output01 + [window_sequence_variant] + output02 + output03 + [realigned_target] + output04

                # The file is reopened in append mode for every reported site.
                with open(out + '_Variants.txt', 'a') as output_file:
                    print(*output_line, sep='\t', file=output_file)
268 |
269 |
270 | """
271 | Main function
272 | """
def getVariants(matched_file, ref, bam_file, out, search_radius, mismatch_threshold):
    """
    Call variants around the matched sites, then write the variant-adjusted off-target sequences.

    Parameters:
        matched_file: matched-sites file handed to snpCall and snpAdjustment.
        ref: reference genome FASTA handed to snpCall.
        bam_file: BAM file handed to snpCall.
        out: output basename including its path; its directory is created
            when it does not exist yet.
        search_radius: number of bases added on each side of a site window.
        mismatch_threshold: score threshold handed to snpAdjustment.
    """
    basename = os.path.basename(out)
    output_folder = os.path.dirname(out)
    # dirname() is '' when `out` has no directory part; os.makedirs('') would raise.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    snp_file = snpCall(matched_file, ref, bam_file, out, search_radius)

    print('Obtaining Variant Off-Target Sequences for %s' % basename, file=sys.stderr)
    snpAdjustment(matched_file, snp_file, out, mismatch_threshold, search_radius)
283 |
284 |
def main():
    """Parse the command-line options and run getVariants with them."""
    # (flag, add_argument keyword options), kept in the order shown by --help.
    option_specs = [
        ('--matched_file', dict(help="full_path_to/matched file in 'identified' folder", required=True)),
        ('--ref', dict(help="Reference Genome Fasta", required=True)),
        ('--bam', dict(help="Sorted BAM file", required=True)),
        ('--search_radius', dict(help="Search radius around the position window", default=20, type=int)),
        ('--mismatch_threshold', dict(help='Maximum score threshold', default=7, type=int)),
        ('--out', dict(help="Output file basename, with full path", required=True)),
    ]

    parser = argparse.ArgumentParser(description='Implement samtools:mpileup to identify genomic variants and adjust the off-target sequence when required.')
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    opts = parser.parse_args()

    getVariants(opts.matched_file, opts.ref, opts.bam, opts.out, opts.search_radius, opts.mismatch_threshold)
296 |
297 | if __name__ == "__main__":
298 | main()
299 |
--------------------------------------------------------------------------------
/changeseq/changeseq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 |
4 | """
5 | circleseq.py as the wrapper for CIRCLE-seq analysis
6 | """
7 |
8 | from alignReads import alignReads
9 | from visualization import visualizeOfftargets
10 | from mergeReads import mergeReads
11 | import argparse
12 | import os
13 | import sys
14 | import subprocess
15 | import traceback
16 | import log
17 | import yaml
18 | import validation
19 | import findCleavageSites
20 | import callVariants
21 |
# Logger shared by the pipeline modules; created under the name 'root' by the
# project's log module (stream handler, DEBUG level).
logger = log.createCustomLogger('root')
# Absolute directory containing this script, with symlinks resolved.
p_dir = os.path.dirname(os.path.realpath(__file__))
24 |
class CircleSeq:
    """Pipeline driver.

    Holds the analysis settings read from a YAML manifest and runs the
    individual steps: merge/align, site identification, visualization and
    variant calling.
    """

    # Optional manifest keys copied onto the instance, under the same name,
    # whenever the manifest defines them.
    _OPTIONAL_KEYS = ('search_radius', 'window_size', 'mapq_threshold', 'start_threshold',
                      'gap_threshold', 'mismatch_threshold', 'read_threshold',
                      'merged_analysis', 'all_chromosomes', 'variant_analysis',
                      'PAM', 'read_length', 'read_count_cutoff')

    def __init__(self):
        """Set the default value of every optional setting."""
        self.search_radius = 20
        self.window_size = 3
        self.mapq_threshold = 50
        self.start_threshold = 1
        self.gap_threshold = 3
        self.mismatch_threshold = 6
        self.read_threshold = 6
        self.merged_analysis = True
        self.all_chromosomes = False
        self.variant_analysis = False
        self.genome = None
        self.refseq_names = None
        # Same defaults parseManifest() applies when the manifest omits them.
        self.PAM = "NGG"
        self.read_length = 151
        self.read_count_cutoff = 6

    def parseManifest(self, manifest_path, sample='all'):
        """Load and validate the manifest, then populate this instance.

        manifest_path: path of the YAML manifest.
        sample: name of the single sample to keep, or 'all' for every sample.

        Creates the output sub-folders inside analysis_folder. Exits the
        process with status 1 when the manifest cannot be used, or when
        merged_analysis and variant_analysis are both enabled.
        """
        logger.info('Loading manifest...')

        with open(manifest_path, 'r') as f:
            # safe_load builds plain data only. yaml.load() without a Loader is
            # deprecated and can construct arbitrary objects from the file.
            manifest_data = yaml.safe_load(f)

        try:
            # Validate manifest data
            validation.validateManifest(manifest_data)

            self.BWA_path = manifest_data['bwa']
            self.reference_genome = manifest_data['reference_genome']
            self.analysis_folder = manifest_data['analysis_folder']

            # User overrides for the optional settings.
            for key in self._OPTIONAL_KEYS:
                if key in manifest_data:
                    setattr(self, key, manifest_data[key])

            if 'genome' in manifest_data:
                self.genome = manifest_data['genome']
            if self.genome in ['hg38', 'hg19']:
                # NOTE(review): this path is later read as a tab-separated
                # table by the visualization step - confirm the .py file is
                # the intended one.
                self.refseq_names = p_dir+"/refseq_gene_name.py"

            # Do not allow to run variant_analysis with merged_analysis
            if self.merged_analysis and self.variant_analysis:
                logger.error('merged_analysis is not compatible with variant_analysis. Please remove one option.')
                sys.exit(1)

            if sample == 'all':
                self.samples = manifest_data['samples']
            else:
                self.samples = {sample: manifest_data['samples'][sample]}

            # Make folders for output
            for folder in ['aligned', 'identified', 'fastq', 'visualization', 'variants']:
                output_folder = os.path.join(self.analysis_folder, folder)
                if not os.path.exists(output_folder):
                    os.makedirs(output_folder)

        except Exception:
            logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
            # Keep the underlying cause visible instead of only the generic hint.
            logger.error(traceback.format_exc())
            sys.exit(1)

    def alignReads(self):
        """Align every sample and its control with BWA.

        With merged_analysis the read pairs are first merged into one FASTQ
        per sample and aligned as single reads; otherwise read1/read2 are
        aligned as pairs. Exits with status 1 on any error.
        """
        aligned_dir = os.path.join(self.analysis_folder, 'aligned')

        if self.merged_analysis:
            logger.info('Merging reads...')
            try:
                self.merged = {}
                fastq_dir = os.path.join(self.analysis_folder, 'fastq')
                for sample in self.samples:
                    reads = self.samples[sample]
                    sample_merge_path = os.path.join(fastq_dir, sample + '_merged.fastq.gz')
                    control_sample_merge_path = os.path.join(fastq_dir, 'control_' + sample + '_merged.fastq.gz')
                    mergeReads(reads['read1'], reads['read2'], sample_merge_path)
                    mergeReads(reads['controlread1'], reads['controlread2'], control_sample_merge_path)

                    sample_alignment_path = os.path.join(aligned_dir, sample + '.sam')
                    control_sample_alignment_path = os.path.join(aligned_dir, 'control_' + sample + '.sam')

                    # An empty second read file is passed for the merged reads.
                    alignReads(self.BWA_path, self.reference_genome,
                               sample_merge_path, '', sample_alignment_path)
                    alignReads(self.BWA_path, self.reference_genome,
                               control_sample_merge_path, '', control_sample_alignment_path)

                    self.merged[sample] = sample_alignment_path
                logger.info('Finished merging and aligning reads.')

            except Exception:
                logger.error('Error aligning')
                logger.error(traceback.format_exc())
                sys.exit(1)
        else:
            logger.info('Aligning reads...')
            try:
                self.aligned = {}
                self.aligned_sorted = {}
                for sample in self.samples:
                    reads = self.samples[sample]
                    sample_alignment_path = os.path.join(aligned_dir, sample + '.sam')
                    control_sample_alignment_path = os.path.join(aligned_dir, 'control_' + sample + '.sam')
                    alignReads(self.BWA_path, self.reference_genome,
                               reads['read1'], reads['read2'], sample_alignment_path)
                    alignReads(self.BWA_path, self.reference_genome,
                               reads['controlread1'], reads['controlread2'],
                               control_sample_alignment_path)
                    self.aligned[sample] = sample_alignment_path
                    self.aligned_sorted[sample] = os.path.join(aligned_dir, sample + '_sorted.bam')
                logger.info('Finished aligning reads to genome.')

            except Exception:
                logger.error('Error aligning')
                logger.error(traceback.format_exc())
                sys.exit(1)

    def mergeAlignReads(self):
        """Merge the read pairs and align them (the `merge` sub-command).

        The command-line entry point calls this method for `merge`. Merging
        is implemented by the merged branch of alignReads(), so that branch is
        forced for this call and the previous setting restored afterwards.
        """
        previous = self.merged_analysis
        self.merged_analysis = True
        try:
            self.alignReads()
        finally:
            self.merged_analysis = previous

    def findCleavageSites(self):
        """Run findCleavageSites.compare() for every sample against its control.

        Exits with status 1 on any error.
        """
        logger.info('Identifying off-target cleavage sites.')

        try:
            aligned_dir = os.path.join(self.analysis_folder, 'aligned')
            # BAM file suffix depends on the analysis mode.
            suffix = '.bam' if self.merged_analysis else '_sorted.bam'
            for sample in self.samples:
                sorted_bam_file = os.path.join(aligned_dir, sample + suffix)
                control_sorted_bam_file = os.path.join(aligned_dir, 'control_' + sample + suffix)
                identified_sites_file = os.path.join(self.analysis_folder, 'identified', sample)
                logger.info('Window: {0}, MAPQ: {1}, Gap: {2}, Start {3}, Mismatches {4}, Search_Radius {5}'.format(self.window_size, self.mapq_threshold, self.gap_threshold, self.start_threshold, self.mismatch_threshold, self.search_radius))
                # NOTE(review): read_count_cutoff receives read_threshold, not
                # self.read_count_cutoff - confirm this is intended.
                findCleavageSites.compare(self.reference_genome, sorted_bam_file, control_sorted_bam_file, self.samples[sample]['target'],
                                          self.search_radius, self.window_size, self.mapq_threshold, self.gap_threshold,
                                          self.start_threshold, self.mismatch_threshold, sample, self.samples[sample]['description'],
                                          identified_sites_file, self.all_chromosomes, merged=self.merged_analysis,
                                          read_count_cutoff=self.read_threshold, read_length=self.read_length)
        except Exception:
            logger.error('Error identifying off-target cleavage site.')
            logger.error(traceback.format_exc())
            sys.exit(1)

    def visualize(self):
        """Draw the off-target plot of every sample.

        A failure is logged per sample and does not stop the remaining
        samples (e.g. a sample without any off-target site).
        """
        logger.info('Visualizing off-target sites')

        for sample in self.samples:
            if sample != 'control':
                try:
                    infile = os.path.join(self.analysis_folder, 'identified', sample + '_identified_matched.txt')
                    outfile = os.path.join(self.analysis_folder, 'visualization', sample + '_offtargets')
                    visualizeOfftargets(infile, outfile, title=sample, PAM=self.PAM,
                                        genome=self.genome, refseq_names=self.refseq_names)
                except Exception:
                    logger.error('Error visualizing off-target sites: %s'%(sample))
                    logger.error(traceback.format_exc())
        logger.info('Finished visualizing off-target sites')

    def callVariants(self):
        """Run callVariants.getVariants() per sample when variant_analysis is on.

        Does nothing when variant_analysis is off. Exits with status 1 on
        any error.
        """
        try:
            if self.variant_analysis:
                logger.info('Identifying genomic variants')

                for sample in self.samples:
                    sorted_bam_file = os.path.join(self.analysis_folder, 'aligned', sample + '.bam')
                    identified_sites_file = os.path.join(self.analysis_folder, 'identified', sample + '_identified_matched.txt')
                    variants_basename = os.path.join(self.analysis_folder, 'variants', sample)
                    logger.info('Mismatches {0}, Search_Radius {1}'.format(self.mismatch_threshold, self.search_radius))
                    callVariants.getVariants(identified_sites_file, self.reference_genome, sorted_bam_file,
                                             variants_basename, self.search_radius, self.mismatch_threshold)

                logger.info('Finished identifying genomic variants')

        except Exception:
            logger.error('Error identifying genomic variants.')
            logger.error(traceback.format_exc())
            sys.exit(1)

    def parallel(self, manifest_path, lsf, run='all'):
        """Submit one job per sample by prefixing the `lsf` command.

        manifest_path: manifest forwarded to each job.
        lsf: submission command line; split on whitespace.
        run: sub-command each job executes.
        """
        logger.info('Submitting parallel jobs')
        current_script = __file__

        try:
            for sample in self.samples:
                cmd = 'python {0} {1} --manifest {2} --sample {3}'.format(current_script, run, manifest_path, sample)
                logger.info(cmd)
                # The whole command travels as one trailing argument.
                subprocess.call(lsf.split() + [cmd])
            logger.info('Finished job submission')

        except Exception:
            logger.error('Error submitting jobs.')
            logger.error(traceback.format_exc())

    def referenceFree(self):
        """Placeholder: reference-free discovery is not wired in here."""
        pass
268 |
def parse_args():
    """Build the sub-command command line and parse sys.argv.

    Returns the argparse namespace; `command` holds the chosen sub-command.
    A sub-command is mandatory: invoking the script without one prints a
    usage error and exits.
    """
    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(description='Individual Step Commands',
                                       help='Use this to run individual steps of the pipeline',
                                       dest='command')
    # Python 3 made sub-commands optional; without this a bare invocation
    # parses to command=None and the caller silently does nothing.
    subparsers.required = True

    def add_step(name, help_text):
        # Every per-sample step takes the same two options.
        step = subparsers.add_parser(name, help=help_text)
        step.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
        step.add_argument('--sample', '-s', help='Specify sample to process (default is all)', default='all')

    add_step('all', 'Run all steps of the pipeline')

    parallel_parser = subparsers.add_parser('parallel', help='Run all steps of the pipeline in parallel')
    parallel_parser.add_argument('--manifest', '-m', help='Specify the manifest Path', required=True)
    parallel_parser.add_argument('--lsf', '-l', help='Specify LSF CMD', default='bsub -R rusage[mem=32000] -P Genomics -q standard')
    parallel_parser.add_argument('--run', '-r', help='Specify which steps of pipeline to run (all, align, identify, visualize, variants)', default='all')

    for name, help_text in (('align', 'Run alignment only'),
                            ('merge', 'Merge paired end reads'),
                            ('identify', 'Run identification only'),
                            ('visualize', 'Run visualization only'),
                            ('variants', 'Run variants analysis only'),
                            ('reference-free', 'Run reference-free discovery only')):
        add_step(name, help_text)

    return parser.parse_args()
310 |
def main():
    """Parse the command line and run the requested pipeline step(s)."""
    args = parse_args()
    command = args.command

    if command == 'parallel':
        # Job submission always works on every sample of the manifest.
        pipeline = CircleSeq()
        pipeline.parseManifest(args.manifest)
        pipeline.parallel(args.manifest, args.lsf, args.run)
        return

    # Sub-command -> CircleSeq methods to call, in order.
    steps_by_command = {
        'all': ('alignReads', 'findCleavageSites', 'visualize', 'callVariants'),
        'align': ('alignReads',),
        'identify': ('findCleavageSites',),
        'merge': ('mergeAlignReads',),
        'visualize': ('visualize',),
        'variants': ('callVariants',),
    }
    steps = steps_by_command.get(command)
    if steps is None:
        # Any other command (e.g. 'reference-free') has no action here.
        return

    pipeline = CircleSeq()
    pipeline.parseManifest(args.manifest, args.sample)
    for step in steps:
        getattr(pipeline, step)()


if __name__ == '__main__':
    main()
348 |
--------------------------------------------------------------------------------
/changeseq/log.py:
--------------------------------------------------------------------------------
1 | """
2 | log.py
3 | =====
4 |
5 | Setup logging utils for nested module logging
6 |
7 | Adapted from the accepted answer here: http://stackoverflow.com/questions/7621897/python-logging-module-globally
8 | """
9 |
10 | import logging
11 |
def createCustomLogger(name):
    """Return the logger called `name`, set to DEBUG with a stream handler.

    logging.getLogger() returns the same object for the same name, so the
    handler is attached only when the logger has none yet; calling this
    function again for the same name must not duplicate every message.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    if not logger.handlers:
        formatter = logging.Formatter(fmt='[%(asctime)s][%(levelname)s][%(module)s] %(message)s', datefmt='%m/%d %I:%M:%S%p')
        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger
22 |
--------------------------------------------------------------------------------
/changeseq/mergeReads.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import argparse
3 | import itertools
4 | import gzip
5 | from utility import reverseComplement, fq
6 |
def mergeReads(fastq1_filename, fastq2_filename, out):
    """Merge each read pair into one FASTQ record and gzip it to `out`.

    The merged sequence is the reverse complement of read 1 followed by
    read 2; the merged quality string is the reversed read 1 qualities
    followed by those of read 2. Header and separator come from read 1.
    Iteration stops at the end of the shorter input.
    """
    # itertools.izip exists only on Python 2; the builtin zip is already lazy
    # on Python 3.
    lazy_zip = getattr(itertools, 'izip', zip)
    fastq1_file = fq(fastq1_filename)
    fastq2_file = fq(fastq2_filename)

    with gzip.open(out, 'wb') as o:
        for r1, r2 in lazy_zip(fastq1_file, fastq2_file):
            merged_sequence = reverseComplement(r1[1]) + r2[1]
            merged_quality_scores = r1[3][::-1] + r2[3]
            record = '\n'.join([r1[0], merged_sequence, r1[2], merged_quality_scores]) + '\n'
            # The stream is binary: text must be encoded on Python 3, while a
            # Python 2 str already is bytes.
            if not isinstance(record, bytes):
                record = record.encode('utf-8')
            o.write(record)
19 |
def main():
    """Command-line entry point: merge the two read files into --out."""
    cli = argparse.ArgumentParser(description='Merge CIRCLE-seq reads for alignment.')
    for flag, help_text in (('--read1', 'Read 1 filename'),
                            ('--read2', 'Read 2 filename'),
                            ('--out', 'Output filename')):
        cli.add_argument(flag, help=help_text, required=True)

    opts = cli.parse_args()
    mergeReads(opts.read1, opts.read2, opts.out)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/changeseq/referenceFree.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import argparse
4 | import itertools
5 | import regex
6 | import re
7 | import gzip
8 | import sys
9 | import collections
10 | from findCleavageSites import regexFromSequence, alignSequences, reverseComplement, extendedPattern, realignedSequences
11 |
12 | """
13 | FASTQ generator function from umi package
14 | """
def fq(file):
    """Yield FASTQ records as [header, sequence, separator, quality] lists.

    Lines are yielded exactly as read, trailing newline included. A file
    whose name ends in '.gz' is read through gzip. Reading stops at end of
    file; a truncated final record is yielded with empty strings for the
    missing lines.
    """
    if file.endswith('.gz'):
        # Text mode on Python 3 so records are str, as for plain files; on
        # Python 2 (where str is bytes) binary mode already yields str.
        fastq = gzip.open(file, 'rt' if str is not bytes else 'rb')
    else:
        fastq = open(file, 'r')
    with fastq as f:
        while True:
            l1 = f.readline()
            if not l1:
                break
            l2 = f.readline()
            l3 = f.readline()
            l4 = f.readline()
            yield [l1, l2, l3, l4]
29 |
30 | """
31 | Main function to find off-target sites in reference-free fashion
32 | """
def analyze(fastq1_filename, fastq2_filename, targetsite, out_base, name='', cells='', mismatch_threshold=7):
    """Tabulate off-target candidates found directly in the read pairs.

    Each pair is joined (reverse complement of read 1 + read 2) and the
    slice [130:170] of the joined sequence is aligned against `targetsite`
    with alignSequences(..., max_score=mismatch_threshold).

    Writes `out_base + '.txt'` with one "sequence count" line per distinct
    off-target, most frequent first, plus one FASTA file per off-target
    holding every joined read that produced it. Progress, in millions of
    reads, goes to stderr. `name` and `cells` are accepted but not used.
    """
    # itertools.izip exists only on Python 2; the builtin zip is already lazy
    # on Python 3.
    lazy_zip = getattr(itertools, 'izip', zip)

    read_count = 0
    c = collections.Counter()
    d = collections.defaultdict(list)

    fastq1_file = fq(fastq1_filename)
    fastq2_file = fq(fastq2_filename)
    for r1, r2 in lazy_zip(fastq1_file, fastq2_file):
        r1_sequence = r1[1].rstrip('\n')
        r2_sequence = r2[1].rstrip('\n')
        joined_seq = reverseComplement(r1_sequence) + r2_sequence
        truncated_joined_seq = joined_seq[130:170]

        sequence_data = alignSequences(targetsite, truncated_joined_seq, max_score=mismatch_threshold)
        # Only the off-target sequence is used below; the unpacking documents
        # the layout of the first seven fields.
        offtarget, mismatch, length, strand, start, end, realigned_target = sequence_data[:7]

        if offtarget:
            c[offtarget] += 1
            d[offtarget].append(joined_seq)

        read_count += 1
        if not read_count % 100000:
            print(read_count/float(1000000), end=" ", file=sys.stderr)

    print('Finished tabulating reference-free discovery counts.', file=sys.stderr)
    out_filename = out_base + '.txt'

    with open(out_filename, 'w') as o:
        for target_sequence, target_count in c.most_common():
            print(target_sequence, target_count, file=o)
            off_target_fasta_filename = '{0}_{1:04d}_{2}.fasta'.format(out_base, target_count, target_sequence)
            with open(off_target_fasta_filename, 'w') as off_target_fasta_file:
                # FASTA entries are numbered from 1 within each off-target.
                for j, sequence in enumerate(d[target_sequence], 1):
                    print('>{0:04d}_{1}_{2}'.format(target_count, target_sequence, j), file=off_target_fasta_file)
                    print(sequence, file=off_target_fasta_file)
71 |
def join_write_output(fastq1_filename, fastq2_filename, out):
    """Write every joined read pair to `out` as a FASTA-like entry.

    The entry header is '>' followed by the read 1 header line as read from
    the file (its own trailing newline ends the header line); the sequence
    is the reverse complement of read 1 followed by read 2.
    """
    # itertools.izip exists only on Python 2; the builtin zip is already lazy
    # on Python 3.
    lazy_zip = getattr(itertools, 'izip', zip)
    fastq1_file = fq(fastq1_filename)
    fastq2_file = fq(fastq2_filename)

    with open(out, 'w') as o:
        for r1, r2 in lazy_zip(fastq1_file, fastq2_file):
            header = '>{0}'.format(r1[0])
            r1_sequence = r1[1].rstrip('\n')
            r2_sequence = r2[1].rstrip('\n')
            joined_seq = reverseComplement(r1_sequence) + r2_sequence
            # r1[0] still carries its newline, hence end=''.
            print(header, end='', file=o)
            print(joined_seq, file=o)
84 |
85 |
def main():
    """Command-line entry point for the reference-free analysis."""
    cli = argparse.ArgumentParser(description='Identify off-target candidates from Illumina short read sequencing data.')

    # (flag, add_argument keywords), listed in the order shown by --help.
    option_table = [
        ('--fq1', dict(help='FASTQ Read 1', required=True)),
        ('--fq2', dict(help='FASTQ Read 2', required=True)),
        ('--targetsite', dict(help='Targetsite Sequence', required=True)),
        ('--name', dict(help='Targetsite Name', required=False)),
        ('--cells', dict(help='Cells', required=False)),
        ('--mismatch_threshold', dict(help='Maximum score threshold', default=7, type=int)),
        ('--out', dict(help='Output file base', required=True)),
    ]
    for flag, keywords in option_table:
        cli.add_argument(flag, **keywords)

    opts = cli.parse_args()
    analyze(opts.fq1, opts.fq2, opts.targetsite, opts.out,
            opts.name, opts.cells, opts.mismatch_threshold)


if __name__ == "__main__":
    main()
101 |
--------------------------------------------------------------------------------
/changeseq/test.yaml:
--------------------------------------------------------------------------------
1 | reference_genome: /Users/shengdar/genomes/Homo_sapiens_assembly19.fasta
2 | analysis_folder: /Users/shengdar/Local/circleseq-test/merged
3 |
4 | bwa: bwa
5 | samtools: samtools
6 |
7 | read_threshold: 4
8 | window_size: 3
9 | mapq_threshold: 50
10 | start_threshold: 1
11 | gap_threshold: 3
12 | mismatch_threshold: 6
13 | merged_analysis: True
14 |
15 | samples:
16 | U2OS_exp1_VEGFA_site_1:
17 | target: GGGTGGGGGGAGTTTGCTCCNGG
18 | read1: /Users/shengdar/Local/circleseq-test/1_S1_subset_100000_R1.fastq
19 | read2: /Users/shengdar/Local/circleseq-test/1_S1_subset_100000_R2.fastq
20 | controlread1: /Users/shengdar/Local/circleseq-test/4_S4_subset_R1.fastq
21 | controlread2: /Users/shengdar/Local/circleseq-test/4_S4_subset_R2.fastq
22 | description: U2OS_exp1_VEGFA_site_1
--------------------------------------------------------------------------------
/changeseq/utility.py:
--------------------------------------------------------------------------------
1 | import string
2 | import re
3 | import gzip
4 | """
5 | FASTQ generator function from umi package
6 | """
7 |
def fq(file):
    """Yield FASTQ records as [header, sequence, separator, quality] lists.

    The trailing newline of every line is removed. A file whose name ends in
    '.gz' is read through gzip. Reading stops at end of file or at the first
    record whose header line is empty.
    """
    if file.endswith('.gz'):
        # Text mode on Python 3 so lines are str and rstrip('\n') applies; on
        # Python 2 (where str is bytes) binary mode already yields str.
        fastq = gzip.open(file, 'rt' if str is not bytes else 'rb')
    else:
        fastq = open(file, 'r')
    with fastq as f:
        while True:
            l1 = f.readline().rstrip('\n')
            if not l1:
                break
            l2 = f.readline().rstrip('\n')
            l3 = f.readline().rstrip('\n')
            l4 = f.readline().rstrip('\n')
            yield [l1, l2, l3, l4]
22 |
def reverseComplement(sequence):
    """Return the reverse complement of an upper-case DNA string.

    A, C, G and T are complemented; every other character is kept as is and
    only changes position.
    """
    # string.maketrans exists only on Python 2; str.maketrans replaces it on
    # Python 3.
    maketrans = getattr(string, 'maketrans', None) or str.maketrans
    transtab = maketrans("ACGT", "TGCA")
    return sequence.translate(transtab)[::-1]
26 |
--------------------------------------------------------------------------------
/changeseq/validation.py:
--------------------------------------------------------------------------------
1 | """
2 | validation.py
3 | =============
4 |
5 | Contains utils for validating the filetype and existence of manifest-defined files/folders
6 |
7 | """
8 |
9 | import logging
10 | import os
11 | import sys
12 | from distutils.spawn import find_executable
13 |
14 | logger = logging.getLogger('root')
15 |
16 |
def exists(filepath):
    """Log an error and exit unless `filepath` is an existing regular file."""
    if not os.path.isfile(filepath):
        logger.error('{0} does not exist'.format(filepath))
        # Non-zero status: a bare sys.exit() reports success to the caller.
        sys.exit(1)
21 |
22 |
def checkIfBinary(filepath):
    """Log an error and exit unless `filepath` resolves to a binary executable.

    `filepath` may be a path or a command name that is looked up on PATH.
    The first 1024 bytes of the resolved file must contain at least one
    byte outside the text range; otherwise the file is rejected.
    """
    try:
        from distutils.spawn import find_executable as locate
    except ImportError:
        # distutils is not shipped with newer Python versions.
        from shutil import which as locate

    executable = locate(filepath)

    if executable is None:
        logger.error('Executable binary not found at {0}'.format(filepath))
        sys.exit(1)

    # First check if file exists
    exists(executable)

    # Check if file is a valid binary
    # Adapted from http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
    textchars = bytearray({7,8,9,10,12,13,27} | set(range(0x20, 0x100)) - {0x7f})

    # Close the handle explicitly instead of leaving it to the garbage collector.
    with open(executable, 'rb') as handle:
        head = handle.read(1024)

    # Deleting every text byte leaves something only for binary content.
    if not head.translate(None, textchars):
        logger.error('{0} is not a valid binary'.format(executable))
        sys.exit(1)
41 |
42 |
def checkIfFasta(filepath):
    """Check the reference file; only its existence is verified, via exists()."""
    absolute_path = os.path.abspath(filepath)
    exists(absolute_path)
46 |
47 |
def checkIfFolder(folderpath):
    """Log an error and exit unless `folderpath` is an existing directory."""
    if not os.path.isdir(os.path.abspath(folderpath)):
        logger.error('{0} is not a valid folder path'.format(folderpath))
        # Non-zero status: a bare sys.exit() reports success to the caller.
        sys.exit(1)
53 |
54 |
def checkIfValidUndemultiplexed(undemultiplexed):
    """Validate the `undemultiplexed` mapping; log and exit when it is invalid.

    The mapping must have exactly the keys 'forward', 'reverse', 'index1'
    and 'index2', each referencing an existing file. Every missing file is
    reported before exiting.
    """
    fields = ['forward', 'reverse', 'index1', 'index2']

    if set(fields) != set(undemultiplexed.keys()):
        logger.error('Undemultiplexed field must contain references to "forward", "reverse", "index1", "index2"')
        sys.exit(1)

    invalid_file = False
    for field in fields:
        if not os.path.isfile(undemultiplexed[field]):
            # Name the field that actually failed (the message used to say
            # "read1" for every field).
            logger.error('"{0}" undemultiplexed field does not reference a valid file'.format(field))
            invalid_file = True

    if invalid_file:
        sys.exit(1)
71 |
72 |
def checkIfValidSamples(samples):
    """Validate the `samples` mapping; log and exit at the first problem.

    At least one sample must be defined, and every sample must provide
    'read1', 'read2', 'controlread1', 'controlread2' and 'target'.
    """
    if not samples:
        logger.error('No samples defined')
        sys.exit(1)

    for sample in samples:
        entry = samples[sample]
        if 'read1' not in entry or 'read2' not in entry:
            logger.error('read1 and read2 must be specified for {0} sample'.format(sample))
            sys.exit(1)
        if 'controlread1' not in entry or 'controlread2' not in entry:
            logger.error('controlread1 and controlread2 must be specified for {0} sample'.format(sample))
            sys.exit(1)
        if 'target' not in entry:
            logger.error('target sequence must be specified for {0} sample'.format(sample))
            sys.exit(1)
93 |
def validateManifest(manifest_data):
    """Validate the parsed manifest; log and exit when it is unusable.

    Every missing required field is reported before exiting. 'samtools' is
    part of the required list because its entry is read unconditionally
    below; leaving it out used to fail with a bare KeyError instead of a
    message naming the field.
    """
    fields = ['bwa', 'samtools', 'reference_genome', 'analysis_folder', 'samples']
    missing_fields = False

    for field in fields:
        if field not in manifest_data:
            logger.error('"{0}" field must be specified in manifest'.format(field))
            missing_fields = True

    if missing_fields:
        sys.exit(1)

    # Now validate each field
    checkIfBinary(manifest_data['bwa'])
    checkIfBinary(manifest_data['samtools'])
    checkIfFasta(manifest_data['reference_genome'])
    checkIfValidSamples(manifest_data['samples'])
--------------------------------------------------------------------------------
/changeseq/visualization.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import svgwrite
4 | import os
5 | import logging
6 | import argparse
7 | import pandas as pd
8 |
9 | ### 2017-October-11: Adapt plots to new output; inputs are managed using "argparse".
10 |
# Reuse the logger registered under the name 'root'; records are not passed
# on to the handlers of ancestor loggers.
logger = logging.getLogger('root')
logger.propagate = False

# Drawing constants. box_size is used when sizing the SVG canvas.
# NOTE(review): presumably pixel units; boxWidth and v_spacing have no visible use here - confirm.
boxWidth = 10
box_size = 15
v_spacing = 3

# Fill color per sequence character: one distinct color for each of G, A, T
# and C; grey (#B3B3B3) for N, R and '-'.
colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#B3B3B3'}
# The remaining ambiguity codes and '.' get the same grey.
for c in ['Y','S','W','K','M','B','D','H','V','.']:
    colors[c] = "#B3B3B3"
22 |
def refseqID_to_HGNC_symbol(x,myDict):
    """Replace the identifier inside an annotation's parentheses by its symbol.

    The identifier is the second whitespace-separated token of `x`, cut at
    its first comma, with parentheses removed. When it is a key of `myDict`
    every occurrence in `x` is replaced by the mapped value; in all other
    cases `x` is returned unchanged.
    """
    if "(" not in x:
        return x

    second_token = x.split()[1]
    identifier = second_token.split(",")[0]
    for paren in (")", "("):
        identifier = identifier.replace(paren, "")

    if identifier not in myDict:
        return x
    return x.replace(identifier, myDict[identifier])
32 |
def reformat_homer_annotation(r):
    """Return the row's annotation; intergenic rows get the gene name appended.

    `r` must give attribute access to `Annotation` and item access to
    'Gene Name' (e.g. a pandas row).
    """
    annotation = r.Annotation
    if annotation != "Intergenic":
        return annotation
    return "%s (%s)"%(annotation, r['Gene Name'])
def parse_homer(identified,homer_output,genome,refseq_names=None):
    """Annotate the identified sites with annotatePeaks.pl and save the result.

    identified: tab-separated sites file with a 'Genomic Coordinate' column.
    homer_output: path that receives the raw annotatePeaks.pl output.
    genome: genome argument handed to annotatePeaks.pl.
    refseq_names: optional table used to turn identifiers into gene symbols.

    Returns the path of the annotated copy of `identified` ('.annot.tsv').
    """
    import subprocess  # local import: not among this module's top-level imports

    select_col = "Annotation"
    # Argument list plus explicit redirection: no shell is involved, so paths
    # containing spaces or shell metacharacters are passed through intact.
    with open(homer_output, 'w') as homer_file:
        status = subprocess.call(["annotatePeaks.pl", identified, genome], stdout=homer_file)
    if status != 0:
        logger.error('annotatePeaks.pl exited with status %s', status)

    df = pd.read_csv(identified,sep="\t")
    df.index = df['Genomic Coordinate'].to_list()
    df2 = pd.read_csv(homer_output,sep="\t",index_col=0)
    df2[select_col] = df2.apply(reformat_homer_annotation,axis=1)
    # Assignment aligns on the index (genomic coordinate vs. first column of
    # the annotatePeaks.pl table).
    df['Annotation'] = df2[select_col]
    if refseq_names is not None:
        myDict = parse_HGNC(refseq_names)
        df['Annotation'] = [refseqID_to_HGNC_symbol(x,myDict) for x in df.Annotation]

    # Swap only the final extension; a name without '.txt' gets the suffix
    # appended so the input file is never overwritten.
    root, ext = os.path.splitext(identified)
    out = (root if ext == ".txt" else identified) + ".annot.tsv"
    df.to_csv(out,sep="\t",index=False)
    return out
52 |
def get_int(x):
    """Return `x` as an int (via float, so '3.0' gives 3), or '' when impossible.

    '' is returned for anything that is not a finite number: empty or
    non-numeric text, None, NaN and infinities.
    """
    try:
        # int() is inside the try as well: NaN raises ValueError and infinity
        # raises OverflowError at this step.
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        return ""
59 |
def parse_HGNC(f):
    """Read a tab-separated table and map its '#name' column to 'name2'.

    Rows with a missing value in either column are dropped.
    """
    refseq_col, symbol_col = "#name", "name2"
    table = pd.read_csv(f,sep="\t")
    pairs = table[[refseq_col, symbol_col]].dropna()
    pairs.index = pairs[refseq_col].to_list()
    return pairs[symbol_col].to_dict()
def parseSitesFile(infile):
    """Read the sites table and return (offtargets, target_seq, total_seq).

    The first line is skipped as header. Tab-separated columns used, by
    index: 3 coordinate, 4 read count, 7 sequence without bulge,
    8 mismatch count, 9 sequence with bulge, 14 target sequence,
    15 realigned target sequence, 16 annotation (optional).

    offtargets: one dict per row that has at least one of the two
        sequences, sorted by read count, highest first.
    target_seq: target sequence of the last row read, '' when the file has
        no data row.
    total_seq: number of non-empty sequences over the kept rows.
    """
    offtargets = []
    total_seq = 0
    # Defined up front so a header-only file returns cleanly instead of
    # raising UnboundLocalError at the return statement.
    target_seq = ''
    with open(infile, 'r') as f:
        f.readline()
        for line in f:
            line_items = line.rstrip('\n').split('\t')
            offtarget_reads = line_items[4]
            no_bulge_offtarget_sequence = line_items[7]
            bulge_offtarget_sequence = line_items[9]
            target_seq = line_items[14]
            realigned_target_seq = line_items[15]
            coord = line_items[3]
            num_mismatch = get_int(line_items[8])
            # The annotation column exists only in annotated files.
            annot = line_items[16] if len(line_items) > 16 else ""

            if no_bulge_offtarget_sequence != '' or bulge_offtarget_sequence != '':
                if no_bulge_offtarget_sequence:
                    total_seq += 1
                if bulge_offtarget_sequence:
                    total_seq += 1
                offtargets.append({'seq': no_bulge_offtarget_sequence.strip(),
                                   'bulged_seq': bulge_offtarget_sequence.strip(),
                                   'reads': int(offtarget_reads.strip()),
                                   'coord': str(coord),
                                   'annot': str(annot),
                                   'num_mismatch': str(num_mismatch),
                                   'target_seq': target_seq.strip(),
                                   'realigned_target_seq': realigned_target_seq.strip()
                                   })
    offtargets.sort(key=lambda x: x['reads'], reverse=True)
    return offtargets, target_seq, total_seq
111 |
112 | # 3/6/2020 Yichao
def check_mismatch(a,b):
    """Return True when the IUPAC codes `a` and `b` share no nucleotide.

    Both codes are upper-cased and expanded through Biopython's
    ambiguous_dna_values table; a code absent from that table raises
    KeyError.
    """
    from Bio.Data import IUPACData
    expansions = IUPACData.ambiguous_dna_values
    bases_a = set(expansions[a.upper()])
    bases_b = set(expansions[b.upper()])
    return bases_a.isdisjoint(bases_b)
123 | from Bio import SeqUtils
def find_PAM(seq,PAM):
    """Return the index of `PAM` inside `seq`.

    Lookup order: literal occurrence of `PAM` (first match); then an
    ambiguity-aware match (SeqUtils.nt_search) on the first len(PAM)
    characters; then on the last len(PAM) characters. When nothing matches a
    message is printed and 20 is returned.
    """
    try:
        return seq.index(PAM)
    except ValueError:
        # No literal occurrence: fall back to the ambiguity-aware searches.
        pass

    # PAM on the left
    left_search = SeqUtils.nt_search(seq[:len(PAM)], PAM)
    # nt_search returns the pattern followed by the match positions, so more
    # than one element means at least one hit.
    if len(left_search)>1:
        return left_search[1]

    right_search = SeqUtils.nt_search(seq[-len(PAM):], PAM)
    if len(right_search)>1:
        return len(seq)-len(PAM)

    print ("PAM: %s not found in %s. Set PAM index to 20"%(PAM,seq))
    return 20
140 |
def visualizeOfftargets(infile, outfile, title, PAM, genome=None,refseq_names=None):
    """Draw the identified off-target sites as an SVG alignment map.

    The sites file is read with ``parseSitesFile``. The target sequence is
    drawn as a coloured reference row with tick labels placed relative to the
    PAM, and each off-target contributes one row for its un-bulged sequence
    and/or one row for its bulged sequence. Read count, mismatch count and
    coordinates are printed next to each site; an annotation column is added
    when ``genome`` is given. The drawing is saved to ``outfile + '.svg'``.

    Args:
        infile: path of the identified-sites file.
        outfile: output path prefix; ``.svg`` is appended. Its parent folder
            is created when it does not exist.
        title: plot title, or None for no title (and a smaller top margin).
        PAM: PAM sequence, used to position the tick labels.
        genome: when not None, ``infile`` is first passed through
            ``parse_homer`` (with ``outfile + ".raw.homer.tsv"``) and the
            returned path is parsed instead.
        refseq_names: forwarded to ``parse_homer``.
    """
    output_folder = os.path.dirname(outfile)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create the folder when there actually is one.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if genome is not None:
        infile = parse_homer(infile,outfile+".raw.homer.tsv",genome,refseq_names=refseq_names)
    # Get offtargets array from file
    offtargets, target_seq, total_seq = parseSitesFile(infile)

    # Initiate canvas; the height grows with the number of plotted rows
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))

    # Define top and left margins; a title needs extra room at the top
    x_offset = 20
    if title is not None:
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
    else:
        y_offset = 20

    # Draw ticks
    ## Generic PAM: it can be on the left or on the right end of the target
    tick_locations = []
    tick_legend = []
    PAM_index = find_PAM(target_seq,PAM)
    # Walk leftwards from the PAM and label every 10th position
    count = 0
    for i in range(PAM_index,0,-1):
        count = count+1
        if count % 10 == 0:
            tick_legend.append(count)
            tick_locations.append(i)
    if len(PAM)>=3:
        tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
    else:
        tick_legend+=["PAM"]+['-']*(len(PAM)-3)
    tick_locations+=range(PAM_index+1,len(target_seq)+1)
    if PAM_index == 0:
        # PAM on the left: restart, label the PAM first, then count rightwards
        tick_legend = []
        tick_locations = []
        tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
        tick_locations+=range(1,len(PAM)+1)
        count = 0
        for i in range(len(PAM)+1,len(target_seq)+1):
            count = count+1
            if count % 10 == 0 or count == 1:
                tick_legend.append(count)
                tick_locations.append(i)
    for x,y in zip(tick_locations, tick_legend):
        dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))
    dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
    dwg.add(dwg.text('Mismatches', insert=(box_size * (len(target_seq) + 1) + 90, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
    dwg.add(dwg.text('Coordinates', insert=(box_size * (len(target_seq) + 1) + 200, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
    if genome is not None:
        dwg.add(dwg.text('Annotation', insert=(box_size * (len(target_seq) + 1) + 450, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))

    # Draw aligned sequence rows
    y_offset += 1  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences
    for seq in offtargets:
        realigned_target_seq = seq['realigned_target_seq']
        no_bulge_offtarget_sequence = seq['seq']
        bulge_offtarget_sequence = seq['bulged_seq']

        if no_bulge_offtarget_sequence != '':
            k = 0  # column index; not advanced for gap ('-') reference positions
            line_number += 1
            y = y_offset + line_number * box_size
            for c, r in zip(no_bulge_offtarget_sequence, target_seq):
                x = x_offset + k * box_size
                if r == '-':
                    # Extra base relative to the reference: small box between columns
                    if 0 < k < len(target_seq):
                        x = x_offset + (k - 0.25) * box_size
                        dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                        dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
                elif c == r:
                    # Match: draw a bullet
                    dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
                elif r == 'N':
                    # Reference is N: print the base without a coloured box
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
                    k += 1
                else:
                    # Mismatch: coloured box with the off-target base
                    dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
                    k += 1
        if bulge_offtarget_sequence != '':
            k = 0
            line_number += 1
            y = y_offset + line_number * box_size
            for c, r in zip(bulge_offtarget_sequence, realigned_target_seq):
                x = x_offset + k * box_size
                if r == '-':
                    if 0 < k < len(realigned_target_seq):
                        x = x_offset + (k - 0.25) * box_size
                        dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                        dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
                elif c == r:
                    dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
                elif r == 'N':
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
                    k += 1
                else:
                    dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
                    k += 1

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            # Single row for this site: put the labels on that row
            reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 2) - 2),
                                  fill='black', style="font-size:15px; font-family:Courier")
            dwg.add(reads_text)
            mismatch_text = dwg.text(seq['num_mismatch'], insert=(box_size * (len(target_seq) + 1) + 130, y_offset + box_size * (line_number + 2) - 2),
                                     fill='black', style="font-size:15px; font-family:Courier")
            dwg.add(mismatch_text)
            mismatch_text = dwg.text(seq['coord'], insert=(box_size * (len(target_seq) + 1) + 200, y_offset + box_size * (line_number + 2) - 2),
                                     fill='black', style="font-size:15px; font-family:Courier")
            dwg.add(mismatch_text)
            if genome is not None:
                annot_text = dwg.text(seq['annot'], insert=(box_size * (len(target_seq) + 1) + 450, y_offset + box_size * (line_number + 2) - 2),
                                      fill='black', style="font-size:15px; font-family:Courier")
                dwg.add(annot_text)
        else:
            # Two rows for this site: centre the labels between them and join
            # the rows with a closing brace
            reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 1) + 5),
                                  fill='black', style="font-size:15px; font-family:Courier")
            dwg.add(reads_text)
            mismatch_text = dwg.text(seq['num_mismatch'], insert=(box_size * (len(target_seq) + 1) + 130, y_offset + box_size * (line_number + 1) + 5),
                                     fill='black', style="font-size:15px; font-family:Courier")
            dwg.add(mismatch_text)
            mismatch_text = dwg.text(seq['coord'], insert=(box_size * (len(target_seq) + 1) + 200, y_offset + box_size * (line_number + 1) + 5),
                                     fill='black', style="font-size:15px; font-family:Courier")
            dwg.add(mismatch_text)
            if genome is not None:
                annot_text = dwg.text(seq['annot'], insert=(box_size * (len(target_seq) + 1) + 450, y_offset + box_size * (line_number + 1) + 5),
                                      fill='black', style="font-size:15px; font-family:Courier")
                dwg.add(annot_text)
            reads_text02 = dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, y_offset + box_size * (line_number + 1) + 5),
                                    fill='black', style="font-size:23px; font-family:Courier")
            dwg.add(reads_text02)
    dwg.save()
316 |
def main():
    """Parse the command-line options and render the off-target plot."""
    cli = argparse.ArgumentParser(description='Plot visualization plots for re-aligned reads.')

    # Mandatory options: (short flag, long flag, help text)
    mandatory = (
        ("-f", "--identified_file", "FullPath/output file from reAlignment_circleseq.py"),
        ("-o", "--outfile", "FullPath/VIZ"),
        ("-t", "--title", "Plot title"),
    )
    for short_flag, long_flag, help_text in mandatory:
        cli.add_argument(short_flag, long_flag, help=help_text, required=True)

    # Optional options, all defaulting to None
    optional = (
        ("-g", "--genome", "if specified, homer annotation will be performed"),
        ("-a", "--annotation", "refseqID to gene name mapping"),
    )
    for short_flag, long_flag, help_text in optional:
        cli.add_argument(short_flag, long_flag, help=help_text, default=None)

    cli.add_argument("--PAM", help="PAM sequence", default="NGG")

    args = cli.parse_args()

    print(args)

    visualizeOfftargets(
        args.identified_file,
        args.outfile,
        args.title,
        args.PAM,
        genome=args.genome,
        refseq_names=args.annotation,
    )
330 |
331 | if __name__ == "__main__":
332 |
333 | main()
334 |
--------------------------------------------------------------------------------
/conda_build/conda_build_config.yaml:
--------------------------------------------------------------------------------
1 | python:
2 | - 2.7
3 |
--------------------------------------------------------------------------------
/conda_build/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "changeseq" %}
2 | {% set version = "1.2.8" %}
3 | {% set file_ext = "tar.gz" %}
4 | {% set hash_type = "sha256" %}
5 | {% set hash_value = "42dde92e84e63369e4c0f2d6f1135952a6478644df9a6f303d3f93507e1f6573" %}
6 |
7 | package:
8 | name: "{{ name|lower }}"
9 | version: "{{ version }}"
10 |
11 | source:
12 | fn: '{{ name }}-{{ version }}.{{ file_ext }}'
13 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.{{ file_ext }}
14 | '{{ hash_type }}': '{{ hash_value }}'
15 |
16 | build:
17 | number: 0
18 | script: python setup.py install --single-version-externally-managed --record=record.txt
19 |
20 | requirements:
21 | host:
22 | - pip
23 | - python
24 | run:
25 | - python
26 | - bwa=0.7.17
27 | - htseq
28 | - matplotlib
29 | - numpy
30 | - pandas
31 | - pyfaidx
32 | - pygments
33 | - pysam
34 | - pyyaml
35 | - regex
36 | - scipy
37 | - setuptools
38 | - sqlite
39 | - statsmodels
40 | - svgwrite
41 | - yaml
42 | - zlib
43 | - htslib=1.9
44 | - samtools=1.9
45 |
46 | test:
47 | imports:
48 | - changeseq
49 |
50 | about:
51 | home: https://github.com/tsailabSJ/changeseq
52 | license: GNU General Public License v2 (GPLv2)
53 | license_family: GPL2
54 | license_file: ''
55 | summary: Bioinformatic pipeline for the CHANGE-seq assay.
56 | description: "[![Version][version-shield]][version-url]\n[![Python versions][python-shield]][python-url]\n[![Platforms][platform-shield]][python-url]\n\n\n# CHANGE-seq: Circularization for High-throughput\
57 | \ Analysis Nuclease Genome-wide Effects by Sequencing\n\nThis is a repository for CHANGE-seq analytical software, which takes sample-specific paired-end FASTQ files as input and produces a list of CHANGE-seq\
58 | \ detected off-target cleavage sites as output.\n\n# Summary\n\nThis package implements a pipeline that takes in reads from the CHANGE-seq assay and returns detected cleavage sites as output. The individual\
59 | \ pipeline steps are:\n\n1. **Merge**: Merge read1 and read2 for easier mapping to genome.\n2. **Read Alignment**: Merged paired end reads from the assay are aligned to the reference genome using the\
60 | \ BWA-MEM algorithm with default parameters (Li. H, 2009).\n3. **Cleavage Site Identification**: Mapped sites are analyzed to determine which represent high-quality cleavage sites.\n4. **Visualization\
61 | \ of Results**: Identified on-target and off-target cleavage sites are rendered as a color-coded alignment map for easy analysis of results.\n\n# Installation\n\nThe easiest way to install change-seq\
62 | \ pipeline is via conda.\n\n```\n\nconda create -n changeseq -c conda-forge -c bioconda -c anaconda -c omnia -c tsailabSJ changeseq\n\nsource activate changeseq\n\nchangeseq.py -h\n\n## BWA 0.7.17 and\
63 | \ samtools 1.9 are automatically installed\n\n```\n\nAlternatively, you can git clone this repository and install\n\n```\n\ngit clone https://github.com/tsailabSJ/changeseq\n\ncd changeseq\n\npip install\
64 | \ -r requirements.txt\n\npython setup.py install\n\nchangeseq.py -h\n\n## Please install BWA and samtools if you choose this option\n\n```\n\n## Download Reference Genome\n\nThe CHANGEseq package requires\
65 | \ a reference genome for read mapping. You can use any genome of your choosing, but for all of our testing and original CHANGE-seq analyses we use hg19 ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)).\
66 | \ Be sure to (g)unzip the FASTA file before use if it is compressed.\n\n# Usage\n\nThe change-seq pipeline requires a manifest yaml file specifying input files, output directory, and pipeline parameters.\
67 | \ Once the yaml file is created, users can simply run ``change_seq.py all --manifest /path/to/manifest.yaml``\n\n\nBelow is an example ``manifest.yaml`` file::\n\n reference_genome: /data/joung/genomes/Homo_sapiens_assembly19.fasta\n\
68 | \ analysis_folder: /data/joung/CHANGE-Seq/test2\n\n bwa: bwa\n samtools: samtools\n\n read_threshold: 4\n window_size: 3\n mapq_threshold: 50\n start_threshold: 1\n gap_threshold:\
69 | \ 3\n mismatch_threshold: 6\n search_radius: 30\n merged_analysis: True\n\n samples:\n U2OS_exp1_VEGFA_site_1:\n target: GGGTGGGGGGAGTTTGCTCCNGG\n read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R1_001.fastq.gz\n\
70 | \ read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R2_001.fastq.gz\n controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz\n\
71 | \ controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz\n description: U2OS_exp1\n U2OS_exp1_EMX1:\n target:\
72 | \ GAGTCCGAGCAGAAGAAGAANGG\n read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R1_001.fastq.gz\n read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R2_001.fastq.gz\n\
73 | \ controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz\n controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz\n\
74 | \ description: U2OS_exp1\n\n## Quickstart\n\n```\n\ngit clone https://github.com/tsailabSJ/changeseq\n\ncd changeseq/test\n\nchangeseq.py all --manifest CIRCLEseq_MergedTest.yaml\n\n```\n\
75 | \n# Writing A Manifest File\nWhen running the end-to-end analysis functionality of the CHANGEseq package a number of inputs are required. To simplify the formatting of these inputs and to encourage\
76 | \ reproducibility, these parameters are inputted into the pipeline via a manifest formatted as a YAML file. YAML files allow easy-to-read specification of key-value pairs. This allows us to easily specify\
77 | \ our parameters. The following fields are required in the manifest:\n\n- `reference_genome`: The absolute path to the reference genome FASTA file.\n- `output_folder`: The absolute path to the folder\
78 | \ in which all pipeline outputs will be saved.\n- `bwa`: The absolute path to the `bwa` executable\n- `samtools`: The absolute path to the `samtools` executable\n- `read_threshold`: The minimum number\
79 | \ of reads at a location for that location to be called as a site. We recommend leaving it to the default value of 4.\n- `window_size`: Size of the sliding window, we recommend leaving it to the default\
80 | \ value of 3.\n- `mapq_threshold`: Minimum read mapping quality score. We recommend leaving it to the default value of 50.\n- `start_threshold`: Tolerance for breakpoint location. We recommend leaving\
81 | \ it to the default value of 1.\n- `gap_threshold`: Distance between breakpoints. We recommend leaving it to the default value of 3 for Cas9.\n- `mismatch_threshold`: Number of tolerated gaps in the\
82 | \ fuzzy target search step. We recommend leaving it to the default value of 6.\n- `read_length`: Fastq file read length, default is 151.\n- `PAM`: PAM sequence, default is NGG.\n- `merged_analysis`:\
83 | \ Whether or not the paired read merging step should take place, e.g. True\n- `samples`: Lists the samples you wish to analyze and the details for each. Each sample name should be nested under the top level samples\
84 | \ key, and each sample detail should be nested under the sample name. See the sample manifest for an example.\n - For each sample, you must provide the following parameters:\n - `target`:\
85 | \ Target sequence for that sample. Accepts degenerate bases.\n - `read1`: The absolute path to the .FASTQ(.gz) file containing the read1 reads.\n - `read2`: The absolute path to the .FASTQ(.gz)\
86 | \ file containing the read2 reads.\n - `controlread1`: The absolute path to the .FASTQ(.gz) file containing the control read1 reads.\n - `controlread2`: The absolute path to the .FASTQ(.gz)\
87 | \ file containing the control read2 reads.\n - `description`: A brief description of the sample\n\n\n# Pipeline Output\nWhen running the full pipeline, the results of each step are outputted\
88 | \ to the `output_folder` in a separate folder for each step. The output folders and their respective contents are as follows:\n\n- `output_folder/aligned`: Contains an alignment `.sam`, alignment `.bam`,\
89 | \ sorted `bam`, and `.bai` index file for each sample.\n- `output_folder/fastq`: Merged `.fastq.gz` files for each sample.\n- `output_folder/identified`: Contains tab-delimited `.txt` files for each\
90 | \ sample containing the identified DSBs, control DSBs, filtered DSBs, and read quantification.\n- `output_folder/visualization`: Contains a `.svg` vector image representing an alignment of all detected\
91 | \ off-targets to the targetsite for each sample.\n\n# FAQ\n\nNone yet, we will keep this updated as needed.\n\n[version-shield]: https://img.shields.io/conda/v/tsailabsj/changeseq.svg\n[version-url]:\
92 | \ https://anaconda.org/tsailabSJ/changeseq\n[python-shield]: https://img.shields.io/pypi/pyversions/changeseq.svg\n[python-url]: https://pypi.python.org/pypi/changeseq\n[platform-shield]: https://anaconda.org/tsailabsj/changeseq/badges/platforms.svg\n\
93 | \n\n"
94 | doc_url: ''
95 | dev_url: ''
96 |
97 | extra:
98 | recipe-maintainers:
99 | - YichaoOU
100 |
--------------------------------------------------------------------------------
/example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/example_output.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | argparse>=1.4.0
2 | PyYAML>=3.11
3 | regex>=2018.01.10
4 | HTSeq>=0.6.1p1
5 | pyfaidx>=0.2.7
6 | statsmodels>=0.6.1
7 | pysam>=0.9.1.4
8 | svgwrite>=1.1.6
9 | numpy>=1.11.1
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/scripts/NUC_SIMPLE:
--------------------------------------------------------------------------------
1 | #
2 | # This matrix was created by Todd Lowe 12/10/92
3 | #
4 | # Uses ambiguous nucleotide codes, probabilities rounded to
5 | # nearest integer
6 | #
7 | # Lowest score = -5, Highest score = 10
8 | #
9 | # Modified by Shengdar Tsai 1/23/16
10 | A T G C N
11 | A 10 -5 -5 -5 10
12 | T -5 10 -5 -5 10
13 | G -5 -5 10 -5 10
14 | C -5 -5 -5 10 10
15 | N 10 10 10 10 10
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
# Author tag for the scripts package.
__author__ = 'shengdar'
2 |
--------------------------------------------------------------------------------
/scripts/site_pvalue.R:
--------------------------------------------------------------------------------
#!/apps/lab/aryee/R/R-3.2.3/bin/Rscript --vanilla

# Adds a p-value column to a per-site read count table.
#
# Arguments (positional):
#   1. infile  - tab-delimited table with a header row; the columns
#                Control_Position_Reads and Nuclease_Position_Reads are used
#   2. outfile - path of the tab-delimited table to write (input + pvalue)
#
# Side effect: writes diagnostic plots to pvalue_diagnostics.pdf in the
# current working directory.

# Usage example using test data from the circleseq repository:
# ./site_pvalue.R ../test/U2OS_EMX1_counts.txt ../test/U2OS_EMX1_counts_pval.txt

# Usage example using a larger test dataset on erisone:
# ./site_pvalue.R /data/joung/CIRCLE-Seq/complete_analysis/160122_937aa31/output/U2OS_EMX1_counts.txt U2OS_EMX1_counts_pval.txt

library("ggplot2")
library("scales")

args <- commandArgs(TRUE)
infile <- args[1]
outfile <- args[2]
#infile <- "../test/U2OS_EMX1_counts.txt"
# Read in counts (comment.char = "" so that '#' in the data is not treated as a comment)
message("Reading ", infile)
dat <- read.delim(infile, comment.char = "", header=TRUE)

# Condition on having observed at least one read
bg <- dat$Control_Position_Reads
bg <- bg[bg>0]

# Model control distribution as exponential
# The rate is 1/mean of the non-zero control counts; the p-value is the
# upper-tail probability of the nuclease count under that exponential.
message("Calculating p-values")
lambda <- mean(bg)
pval <- 1 - pexp(dat$Nuclease_Position_Reads, rate=1/lambda)
dat$pvalue <- pval

# Model control distribution empirically
# This empirical p-value is only used in the diagnostic scatter plot below;
# it is not added to the output table.
background_cdf <- ecdf(bg)
pval_empirical <- 1 - background_cdf(dat$Nuclease_Position_Reads)

message("Saving diagnostic plots to pvalue_diagnostics.pdf")
# Diagnostic plots
pdf(file="pvalue_diagnostics.pdf", width=6, height=2.5)
# Histogram of (1 + control reads), x limited to 0-100, log10 y axis
p <- ggplot(dat, aes(1+Control_Position_Reads)) + scale_x_continuous(limits=c(0,100)) + scale_y_log10(labels=comma) + geom_histogram(binwidth=2, na.rm=TRUE) + theme_bw() + ggtitle("Control_Position_Reads")
suppressWarnings(print(p))
# Same histogram for (1 + nuclease reads)
p <- ggplot(dat, aes(1+Nuclease_Position_Reads)) + scale_x_continuous(limits=c(0,100)) + scale_y_log10(labels=comma) + geom_histogram(binwidth=2, na.rm=TRUE) + theme_bw() + ggtitle("Nuclease_Position_Reads")
suppressWarnings(print(p))
# Compare the two p-value models on a random sample of at most 10000 sites
idx <- sample(length(pval), min(length(pval), 10000))
plot(pval_empirical[idx], pval[idx], xlab="Empirical p-value", ylab="Exponential model p-value")
abline(0,1)
dev.off()

message("Writing output table ", outfile)
write.table(dat, file=outfile, sep="\t", quote=FALSE, row.names=FALSE)
48 |
49 |
--------------------------------------------------------------------------------
/scripts/test.py:
--------------------------------------------------------------------------------
1 | import regex
2 | import nwalign as nw
3 | import swalign
4 | import string
5 |
def reverseComplement(sequence):
    """Return the reverse complement of a DNA sequence.

    Only the upper-case bases A, C, G and T are complemented; any other
    character (for example N or lower-case letters) is kept as is.
    """
    # string.maketrans exists only on Python 2; Python 3 provides the same
    # helper as str.maketrans. Pick whichever is available.
    maketrans = getattr(str, "maketrans", None) or string.maketrans
    return sequence.translate(maketrans("ACGT", "TGCA"))[::-1]
9 |
def regexFromSequence(seq, lookahead=True, indels=1, errors=7):
    """
    Build two fuzzy-regex patterns from a sequence that may contain the
    ambiguity codes N, Y, R, W and S.

    Each base is replaced by its character class. When `lookahead` is true
    the pattern is wrapped in a `(?b:...)` group. The first returned pattern
    allows up to `errors` substitutions; the second additionally allows up to
    `indels` insertions and `indels` deletions, each weighted three times as
    much as a substitution within the `errors` budget.

    Returns a (substitution_only_pattern, gapped_pattern) tuple. A character
    outside the supported alphabet raises KeyError.
    """
    iupac_classes = {
        'N': '[ATCGN]',
        'Y': '[CTY]',
        'R': '[AGR]',
        'W': '[ATW]',
        'S': '[CGS]',
        'A': 'A',
        'T': 'T',
        'C': 'C',
        'G': 'G',
    }

    core = ''.join(iupac_classes[base] for base in seq)
    if lookahead:
        core = '(?b:{0})'.format(core)

    substitution_only = '{0}{{s<={1}}}'.format(core, errors)
    gapped = '{0}{{i<={1},d<={1},s<={2},3i+3d+1s<={2}}}'.format(core, indels, errors)
    return substitution_only, gapped
36 |
def alignSequences(targetsite_sequence, window_sequence, max_mismatches=7):
    """
    Given a targetsite and window, use a fuzzy regex to align the targetsite to
    the window. Returns the best match.

    Four searches are run: substitution-only and gapped, each on the window
    and on its reverse complement. Every hit is scored as
    substitutions + 3 * (insertions + deletions) and the hit with the lowest
    score wins; ties keep the earlier search ('+' before '-', standard
    before gapped).

    Returns:
        [match_sequence, distance, length, strand, start, end] for the chosen
        hit, where distance is the plain sum of the fuzzy counts and strand is
        the strand of the chosen hit, or a list of six empty strings when
        nothing matched.
    """
    query_regex_standard, query_regex_gap = regexFromSequence(targetsite_sequence, errors=max_mismatches)

    # Try both strands
    reverse_window = reverseComplement(window_sequence)
    alignments = [
        ('+', 'standard', regex.search(query_regex_standard, window_sequence, regex.BESTMATCH)),
        ('-', 'standard', regex.search(query_regex_standard, reverse_window, regex.BESTMATCH)),
        ('+', 'gapped', regex.search(query_regex_gap, window_sequence, regex.BESTMATCH)),
        ('-', 'gapped', regex.search(query_regex_gap, reverse_window, regex.BESTMATCH)),
    ]

    lowest_distance_score = None
    chosen_alignment = None
    chosen_strand = None
    for strand, alignment_type, match in alignments:
        if match is None:
            continue
        substitutions, insertions, deletions = match.fuzzy_counts
        distance_score = substitutions + (insertions + deletions) * 3
        print(match, distance_score)
        # Keep the closest hit. A perfect hit scores 0 and must be selectable,
        # so the running best starts as None rather than 0.
        if lowest_distance_score is None or distance_score < lowest_distance_score:
            chosen_alignment = match
            chosen_strand = strand
            lowest_distance_score = distance_score

    if chosen_alignment is None:
        return [''] * 6

    match_sequence = chosen_alignment.group()
    distance = sum(chosen_alignment.fuzzy_counts)
    length = len(match_sequence)
    start = chosen_alignment.start()
    end = chosen_alignment.end()
    return [match_sequence, distance, length, chosen_strand, start, end]
72 |
73 |
74 |
75 |
76 | # if forward_alignment is None and reverse_alignment is None:
77 | # return ['', '', '', '', '', '']
78 | # else:
79 | # if forward_alignment is None and reverse_alignment is not None:
80 | # strand = '-'
81 | # alignment = reverse_alignment
82 | # elif reverse_alignment is None and forward_alignment is not None:
83 | # strand = '+'
84 | # alignment = forward_alignment
85 | # elif forward_alignment is not None and reverse_alignment is not None:
86 | # forward_distance = sum(forward_alignment.fuzzy_counts)
87 | # reverse_distance = sum(reverse_alignment.fuzzy_counts)
88 | #
89 | # if forward_distance > reverse_distance:
90 | # strand = '-'
91 | # alignment = reverse_alignment
92 | # else:
93 | # strand = '+'
94 | # alignment = forward_alignment
95 | #
96 | # match_sequence = alignment.group()
97 | # distance = sum(alignment.fuzzy_counts)
98 | # length = len(match_sequence)
99 | # start = alignment.start()
100 | # end = alignment.end()
101 | #
102 | # return [match_sequence, distance, length, strand, start, end]
103 |
def alignSequences2(ref_seq, query_seq):
    """Locally align the reference against the query on both strands with swalign.

    The strand whose alignment has strictly more matches than the other and
    at least ``len(ref_seq) - 8`` matches is reported. The reported window is
    the aligned query span widened by the reference bases left unaligned on
    either side.

    Returns:
        [sequence, mismatches, length, strand, start, end] for the winning
        strand, or six empty strings when neither strand qualifies.
    """
    match_score = 2
    mismatch_score = -1
    ref_length = len(ref_seq)
    matches_required = len(ref_seq) - 1 - 7  # allow up to 8 mismatches
    matrix = swalign.NucleotideScoringMatrix(match_score, mismatch_score)
    aligner = swalign.LocalAlignment(matrix, gap_penalty=-3, gap_extension_penalty=-100, prefer_gap_runs=True)  # you can also choose gap penalties, etc...
    # aligner = swalign.LocalAlignment(matrix, gap_penalty=-10, gap_extension_penalty=-0.5, prefer_gap_runs=True)
    fwd = aligner.align(ref_seq, query_seq)
    rev = aligner.align(ref_seq, reverseComplement(query_seq))

    if fwd.matches >= matches_required and fwd.matches > rev.matches:
        winner, strand = fwd, "+"
    elif rev.matches >= matches_required and rev.matches > fwd.matches:
        winner, strand = rev, "-"
    else:
        return [""] * 6

    # Pad the aligned query span by the unaligned reference flanks
    start = winner.q_pos - winner.r_pos
    end = winner.q_end + (ref_length - winner.r_end)
    return [winner.query[start:end], ref_length - winner.matches - 1, end - start, strand, start, end]
130 |
131 |
def main():
    """Align one hard-coded target site to one hard-coded window and print the result."""
    # Earlier test inputs, kept for reference:
    # target = 'TTTNCTGATGGTCCATGTCTGTTACTC'
    # windowsequence = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    # windowsequence = 'GGCCTGAGTCCGAGCAGAAGCAAGAAGGGCTCCCATCACATCAAC'

    target_site = 'TTTNGGGACGGGGAGAAGGAAAAGAGG'
    window = 'AATTTGGGGGGATTCATTACTCTATTTGGATTTGTTAGGGAGGAAGGCAGGTGGGATTTTTCTTCTCATTCTTATCTCTTTCCTTCTTCCCGTCCCAGAAAGAAACTAAGAATAATAACCAAATTATTAAAATGACTCACCGCCCTTCCA'

    result = alignSequences(target_site, window, max_mismatches=7)
    print(result)
143 |
144 |
145 | if __name__ == "__main__":
146 | main()
--------------------------------------------------------------------------------
/scripts/test_align.py:
--------------------------------------------------------------------------------
1 | import nwalign as nw
2 | import Levenshtein as l
3 | import difflib
4 | import os
5 |
def main():
    """Print the Levenshtein distance, edit operations and matching blocks for a fixed sequence pair."""
    # Earlier test inputs, kept for reference:
    # a = 'GCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAAC'
    # b = 'GAGTCGAGCAGAAGAAGAANGG'

    window = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    target = 'TTTNCTGATGGTCCATGTCTGTTACTC'

    print(l.distance(window, target))
    edit_ops = l.editops(window, target)
    print(edit_ops)
    print(l.matching_blocks(l.editops(window, target), window, target))
17 |
18 |
19 |
20 | if __name__ == "__main__":
21 | main()
--------------------------------------------------------------------------------
/scripts/test_ga.py:
--------------------------------------------------------------------------------
1 | import HTSeq
2 |
def main():
    """Store a value on an interval of an HTSeq GenomicArray and print the steps of a wider query interval."""
    genomic_array = HTSeq.GenomicArray("auto", typecode='O', stranded=False)
    position = HTSeq.GenomicPosition('chr1', 123203, '.')

    stored_interval = HTSeq.GenomicInterval("chr1", 100000, 101000, ".")
    genomic_array[stored_interval] = [0.05, 0.002, 0.04, 0.005]

    query_interval = HTSeq.GenomicInterval("chr1", 100000, 130000, ".")

    for step_interval, step_value in genomic_array[query_interval].steps():
        print(step_interval, step_value)
13 |
14 | if __name__ == "__main__":
15 | main()
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [nosetests]
2 | verbosity=1
3 | detailed-errors=1
4 | exe=1
5 | where=test/
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # from distutils.core import setup
5 | from setuptools import setup, find_packages
6 | import changeseq
7 | ## conda skeleton can't find readme
8 | import os
# Use the README as the PyPI long description when it is available; fall back
# to a short placeholder when the file is not found in the working directory.
if os.path.isfile("README.MD"):
    with open("README.MD", "r") as fh:
        long_description = fh.read()
else:
    long_description="change-seq"

setup(
    name='changeseq',
    # The version is read from the changeseq package itself, so the package
    # must be importable when this script runs.
    version=str(changeseq.__version__), # update visualization, run homer peak annotation if available
    description="Bioinformatic pipeline for the CHANGE-seq assay.",
    author="Shengdar Q Tsai, Martin Aryee, Ved V Topkar, Jose Malagon-Lopez",
    author_email='STSAI4@mgh.harvard.edu, Aryee.Martin@mgh.harvard.edu, vedtopkar@gmail.com, jose.lopez@mail.harvard.edu',
    url='https://github.com/tsailabSJ/changeseq',
    # packages=['changeseq','data'],
    packages=find_packages(),
    # package_dir={'changeseq':'changeseq'},
    # NOTE(review): this is the literal string 'LICENSE', not a license name;
    # the classifiers below declare GPLv2 - confirm which value is intended.
    license='LICENSE',
    # Every pipeline module is also installed as a command-line script.
    scripts=['changeseq/changeseq.py','changeseq/alignReads.py','changeseq/visualization.py',
             'changeseq/callVariants.py','changeseq/findCleavageSites.py','changeseq/log.py',
             'changeseq/mergeReads.py','changeseq/referenceFree.py','changeseq/utility.py',
             'changeseq/validation.py','changeseq/refseq_gene_name.py'],
    package_data={'test': ["test/*"]},
    # package_data={'':["README.md","data/refseq_gene_name.py"]},
    include_package_data=True,
    long_description=long_description,
    long_description_content_type='text/markdown' ,
    keywords='changeseq',
    # NOTE(review): only Python 2 classifiers are declared here.
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'Topic :: Scientific/Engineering :: Visualization',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'License :: OSI Approved :: GNU General Public License v2 (GPLv2)',
        'Operating System :: Unix',
        'Natural Language :: English',
        "Programming Language :: Python :: 2",
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7'
    ]
)
50 |
--------------------------------------------------------------------------------
/test/CIRCLEseq_MergedTest.yaml:
--------------------------------------------------------------------------------
1 | reference_genome: data/input/CIRCLEseq_test_genome.fa
2 | analysis_folder: data/MergedOutput
3 |
4 | bwa: bwa
5 | samtools: samtools
6 |
7 | window_size: 3
8 | mapq_threshold: 50
9 | start_threshold: 1
10 | gap_threshold: 3
11 | mismatch_threshold: 6
12 | merged_analysis: True
13 |
14 | samples:
15 | TestSample:
16 | target: GAGTCCGAGCAGAAGAAGAANGG
17 | read1: data/input/TEST.r1.fastq.gz
18 | read2: data/input/TEST.r2.fastq.gz
19 | controlread1: data/input/TEST_control.r1.fastq.gz
20 | controlread2: data/input/TEST_control.r2.fastq.gz
21 | description: TestCell
22 |
--------------------------------------------------------------------------------
/test/CIRCLEseq_StandardTest.yaml:
--------------------------------------------------------------------------------
1 | reference_genome: data/input/CIRCLEseq_test_genome.fa
2 | analysis_folder: data/StandardOutput
3 |
4 | bwa: bwa
5 | samtools: samtools
6 |
7 | window_size: 3
8 | mapq_threshold: 50
9 | start_threshold: 1
10 | gap_threshold: 3
11 | mismatch_threshold: 6
12 | merged_analysis: False
13 | variant_analysis: True
14 |
15 | samples:
16 | TestSample:
17 | target: GAGTCCGAGCAGAAGAAGAANGG
18 | read1: data/input/TEST.r1.fastq.gz
19 | read2: data/input/TEST.r2.fastq.gz
20 | controlread1: data/input/TEST_control.r1.fastq.gz
21 | controlread2: data/input/TEST_control.r2.fastq.gz
22 | description: TestCell
23 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/__init__.py
--------------------------------------------------------------------------------
/test/data/MergedOutput/aligned/TestSample.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/TestSample.bam
--------------------------------------------------------------------------------
/test/data/MergedOutput/aligned/TestSample.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/TestSample.bam.bai
--------------------------------------------------------------------------------
/test/data/MergedOutput/aligned/TestSample_sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/TestSample_sorted.bam
--------------------------------------------------------------------------------
/test/data/MergedOutput/aligned/control_TestSample.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/control_TestSample.bam
--------------------------------------------------------------------------------
/test/data/MergedOutput/aligned/control_TestSample.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/control_TestSample.bam.bai
--------------------------------------------------------------------------------
/test/data/MergedOutput/aligned/control_TestSample_sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/control_TestSample_sorted.bam
--------------------------------------------------------------------------------
/test/data/MergedOutput/fastq/TestSample_merged.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/fastq/TestSample_merged.fastq.gz
--------------------------------------------------------------------------------
/test/data/MergedOutput/fastq/control_TestSample_merged.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/fastq/control_TestSample_merged.fastq.gz
--------------------------------------------------------------------------------
/test/data/MergedOutput/identified/TestSample_CONTROL_coordinates.txt:
--------------------------------------------------------------------------------
1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read2_start_position Read2_strand
2 |
--------------------------------------------------------------------------------
/test/data/MergedOutput/identified/TestSample_NUCLEASE_coordinates.txt:
--------------------------------------------------------------------------------
1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read2_start_position Read2_strand
2 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
3 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
4 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
5 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
6 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
7 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
8 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
9 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
10 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
11 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
12 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
13 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
14 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
15 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
16 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
17 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
18 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
19 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
20 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
21 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
22 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
23 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
24 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
25 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
26 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
27 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
28 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
29 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 +
30 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
31 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
32 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
33 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 +
34 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
35 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
36 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
37 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
38 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
39 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
40 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
41 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
42 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
43 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
44 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
45 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
46 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
47 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 +
48 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
49 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
50 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
51 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
52 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
53 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
54 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
55 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
56 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
57 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
58 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
59 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
60 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
61 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
62 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
63 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
64 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 +
65 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
66 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
67 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
68 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
69 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 +
70 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
71 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
72 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
73 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
74 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
75 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
76 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
77 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
78 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
79 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 +
80 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
81 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
82 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
83 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
84 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
85 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
86 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
87 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
88 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
89 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
90 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
91 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
92 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
93 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
94 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
95 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
96 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
97 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
98 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
99 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
100 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
101 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
102 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
103 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
104 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 +
105 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10004 - 8 10006 +
106 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
107 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
108 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
109 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
110 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
111 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
112 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
113 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
114 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
115 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
116 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
117 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
118 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
119 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
120 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
121 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
122 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
123 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
124 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
125 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
126 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
127 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
128 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
129 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
130 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
131 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
132 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
133 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
134 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
135 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
136 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
137 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
138 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
139 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
140 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
141 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
142 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
143 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
144 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
145 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
146 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
147 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
148 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
149 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
150 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
151 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
152 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
153 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
154 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
155 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
156 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
157 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
158 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
159 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
160 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
161 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
162 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
163 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
164 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
165 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
166 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
167 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
168 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
169 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
170 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
171 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
172 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
173 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
174 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
175 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
176 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
177 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 +
178 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
179 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
180 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
181 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
182 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
183 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
184 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
185 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
186 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
187 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
188 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
189 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
190 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
191 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
192 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
193 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
194 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
195 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
196 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
197 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
198 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
199 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
200 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
201 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
202 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
203 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
204 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
205 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
206 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
207 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
208 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
209 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
210 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
211 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
212 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
213 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
214 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
215 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
216 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
217 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
218 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
219 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
220 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
221 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10005 - 1 10006 +
222 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
223 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
224 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
225 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
226 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
227 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
228 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
229 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
230 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
231 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
232 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
233 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
234 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
235 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
236 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10005 - 1 10006 +
237 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
238 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
239 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 +
240 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
241 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
242 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
243 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
244 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
245 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
246 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
247 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
248 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
249 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
250 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
251 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
252 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
253 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
254 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
255 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
256 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
257 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
258 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
259 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
260 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
261 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 +
262 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
263 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
264 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
265 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
266 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
267 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
268 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
269 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
270 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
271 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
272 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
273 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
274 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
275 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
276 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
277 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
278 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
279 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
280 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
281 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
282 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
283 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
284 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
285 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
286 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
287 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
288 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
289 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 +
290 |
--------------------------------------------------------------------------------
/test/data/MergedOutput/identified/TestSample_count.txt:
--------------------------------------------------------------------------------
1 | #Chromosome zero_based_Position Nuclease_Position_Reads Control_Position_Reads Nuclease_Window_Reads Control_Window_Reads p_Value narrow_p_Value control_p_Value control_narrow_p_Value
2 | 1 10004 60.0 0.0 124.0 0.0 0.0 0.0 0.0 0.0
3 | 1 10005 62.0 0.0 124.0 0.0 0.0 0.0 0.0 0.0
4 | 1 10006 2.0 0.0 124.0 0.0 0.0 0.0 0.0 0.0
5 | 8 10004 1.0 0.0 146.0 0.0 0.0 0.0 0.0 0.0
6 | 8 10005 72.0 0.0 146.0 0.0 0.0 0.0 0.0 0.0
7 | 8 10006 73.0 0.0 146.0 0.0 0.0 0.0 0.0 0.0
8 | 2 10016 103.0 0.0 206.0 0.0 0.0 0.0 0.0 0.0
9 | 2 10017 97.0 0.0 206.0 0.0 0.0 0.0 0.0 0.0
10 | 2 10018 6.0 0.0 206.0 0.0 0.0 0.0 0.0 0.0
11 | 12 10005 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0
12 | 12 10006 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0
13 | 4 10016 28.0 0.0 56.0 0.0 0.0 0.0 0.0 0.0
14 | 4 10017 28.0 0.0 56.0 0.0 0.0 0.0 0.0 0.0
15 |
--------------------------------------------------------------------------------
/test/data/MergedOutput/identified/TestSample_identified_matched.txt:
--------------------------------------------------------------------------------
1 | Chromosome Start End Genomic Coordinate Nuclease_Read_Count Strand Control_Read_Count Site_Sequence Site_Substitution_Number Site_Sequence_Gaps_Allowed File_Name Cell Target_site Full_Name Target_Sequence Realigned_Target_Sequence
2 | 12 10000 10023 12:10000-10023 44 - 0.0 GAGTTAGAGCAGAAAAAAAATGG 4 TestSample.bam TestCell TestSample TestSample_TestCell_12:10000-10023_44 GAGTCCGAGCAGAAGAAGAANGG none
3 | 1 10000 10023 1:10000-10023 124 - 0.0 GAAGTAGAGCAGAAGAAGAAGCG 5 AAGT-AGAGCAGAAGAAGAAGCG TestSample.bam TestCell TestSample TestSample_TestCell_1:10000-10023_124 GAGTCCGAGCAGAAGAAGAANGG GAGTCCGAGCAGAAGAAGAANGG
4 | 2 10000 10023 2:10000-10023 206 + 0.0 GAGTCCGAGCAGAAGAAGAAGGG 0 TestSample.bam TestCell TestSample TestSample_TestCell_2:10000-10023_206 GAGTCCGAGCAGAAGAAGAANGG none
5 | 4 10000 10023 4:10000-10023 56 + 0.0 CACTCCAAGTAGAAGAAGAAAAG 5 TestSample.bam TestCell TestSample TestSample_TestCell_4:10000-10023_56 GAGTCCGAGCAGAAGAAGAANGG none
6 | 8 10000 10023 8:10000-10023 146 - 0.0 AAGGCCAAGCAGAAGAGTAATGG 5 TestSample.bam TestCell TestSample TestSample_TestCell_8:10000-10023_146 GAGTCCGAGCAGAAGAAGAANGG none
7 |
--------------------------------------------------------------------------------
/test/data/MergedOutput/identified/TestSample_identified_unmatched.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/identified/TestSample_identified_unmatched.txt
--------------------------------------------------------------------------------
/test/data/MergedOutput/visualization/TestSample_offtargets.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/aligned/TestSample.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/TestSample.bam
--------------------------------------------------------------------------------
/test/data/StandardOutput/aligned/TestSample.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/TestSample.bam.bai
--------------------------------------------------------------------------------
/test/data/StandardOutput/aligned/TestSample_sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/TestSample_sorted.bam
--------------------------------------------------------------------------------
/test/data/StandardOutput/aligned/control_TestSample.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/control_TestSample.bam
--------------------------------------------------------------------------------
/test/data/StandardOutput/aligned/control_TestSample.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/control_TestSample.bam.bai
--------------------------------------------------------------------------------
/test/data/StandardOutput/aligned/control_TestSample_sorted.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/control_TestSample_sorted.bam
--------------------------------------------------------------------------------
/test/data/StandardOutput/identified/TestSample_CONTROL_coordinates.txt:
--------------------------------------------------------------------------------
1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read1_start_position Read2_strand
2 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/identified/TestSample_NUCLEASE_coordinates.txt:
--------------------------------------------------------------------------------
1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read1_start_position Read2_strand
2 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
3 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
4 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 -
5 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
6 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
7 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
8 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
9 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
10 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
11 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
12 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
13 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
14 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
15 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
16 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
17 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
18 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
19 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
20 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
21 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 -
22 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
23 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
24 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
25 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
26 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
27 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
28 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
29 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
30 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
31 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 +
32 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
33 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10005 -
34 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
35 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
36 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
37 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
38 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
39 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
40 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
41 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 +
42 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
43 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
44 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
45 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
46 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
47 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
48 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
49 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
50 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
51 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
52 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
53 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
54 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
55 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
56 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
57 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
58 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
59 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 +
60 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
61 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
62 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
63 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 - 8 10006 +
64 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 +
65 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
66 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10005 - 12 10006 +
67 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
68 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
69 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
70 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
71 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
72 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
73 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
74 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
75 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
76 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
77 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 -
78 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
79 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
80 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
81 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
82 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
83 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
84 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
85 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
86 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
87 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
88 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
89 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
90 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
91 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
92 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
93 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
94 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
95 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
96 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
97 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
98 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
99 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
100 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
101 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
102 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
103 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
104 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
105 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
106 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
107 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 +
108 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10004 -
109 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
110 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
111 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
112 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
113 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
114 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 +
115 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
116 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
117 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
118 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
119 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
120 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
121 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
122 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
123 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
124 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
125 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
126 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
127 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10005 - 12 10006 +
128 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
129 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
130 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
131 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
132 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
133 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 +
134 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
135 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
136 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10005 -
137 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
138 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10005 - 12 10006 +
139 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
140 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
141 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
142 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
143 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
144 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
145 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
146 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
147 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
148 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
149 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
150 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
151 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
152 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
153 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
154 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
155 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
156 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
157 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
158 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
159 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
160 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
161 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
162 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
163 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
164 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
165 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 +
166 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
167 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
168 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
169 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
170 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
171 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
172 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
173 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
174 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
175 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
176 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
177 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
178 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
179 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
180 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
181 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
182 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
183 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
184 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
185 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
186 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
187 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
188 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
189 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
190 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
191 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
192 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 - 1 10005 +
193 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
194 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
195 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
196 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
197 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 -
198 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
199 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
200 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
201 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
202 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
203 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
204 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
205 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10004 -
206 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
207 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
208 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
209 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
210 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
211 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
212 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
213 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
214 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
215 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
216 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
217 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
218 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
219 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
220 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 +
221 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10006 + 1 10005 -
222 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
223 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
224 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
225 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
226 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
227 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
228 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
229 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
230 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
231 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 - 8 10006 +
232 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
233 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
234 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
235 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
236 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
237 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
238 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 - 1 10006 +
239 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 +
240 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
241 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
242 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
243 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
244 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
245 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
246 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
247 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
248 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 -
249 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
250 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 +
251 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10005 -
252 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 -
253 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 -
254 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 -
255 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
256 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
257 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
258 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 - 8 10006 +
259 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 +
260 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 - 1 10005 +
261 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
262 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
263 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
264 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
265 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 +
266 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 +
267 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 -
268 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/identified/TestSample_count.txt:
--------------------------------------------------------------------------------
1 | #Chromosome zero_based_Position Nuclease_Position_Reads Control_Position_Reads Nuclease_Window_Reads Control_Window_Reads p_Value narrow_p_Value control_p_Value control_narrow_p_Value
2 | 1 10004 47.0 0.0 108.0 0.0 0.0 0.0 0.0 0.0
3 | 1 10005 59.0 0.0 108.0 0.0 0.0 0.0 0.0 0.0
4 | 1 10006 2.0 0.0 108.0 0.0 0.0 0.0 0.0 0.0
5 | 8 10004 1.0 0.0 126.0 0.0 0.0 0.0 0.0 0.0
6 | 8 10005 59.0 0.0 126.0 0.0 0.0 0.0 0.0 0.0
7 | 8 10006 66.0 0.0 126.0 0.0 0.0 0.0 0.0 0.0
8 | 12 10004 1.0 0.0 70.0 0.0 0.0 0.0 0.0 0.0
9 | 12 10005 21.0 0.0 70.0 0.0 0.0 0.0 0.0 0.0
10 | 12 10006 48.0 0.0 70.0 0.0 0.0 0.0 0.0 0.0
11 | 2 10016 107.0 0.0 184.0 0.0 0.0 0.0 0.0 0.0
12 | 2 10017 73.0 0.0 184.0 0.0 0.0 0.0 0.0 0.0
13 | 2 10018 4.0 0.0 184.0 0.0 0.0 0.0 0.0 0.0
14 | 4 10016 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0
15 | 4 10017 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0
16 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/identified/TestSample_identified_matched.txt:
--------------------------------------------------------------------------------
1 | Chromosome Start End Name ReadCount Strand MappingPositionStart MappingPositionEnd WindowName WindowSequence Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End FileName Cell Targetsite FullName TargetSequence RealignedTargetSequence Position.Pvalue Narrow.Pvalue Position.Control.Pvalue Narrow.Control.Pvalue
2 | 12 10000 10023 12:10000-10023 70 - 10004 10007 12:[10004,10007)/. TTTTCACTTTCCTTTACCATTTTTTTTCTGCTCTAACTCTACC GAGTTAGAGCAGAAAAAAAATGG 4 - 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_12:10000-10023_70 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0
3 | 1 10000 10023 1:10000-10023 108 - 10004 10007 1:[10004,10007)/. GAACTTGCGGAAGGTCCGCTTCTTCTTCTGCTCTACTTCTGCC GAAGTAGAGCAGAAGAAGAAGCG 5 - 10000 10023 AAGTA-GAGCAGAAGAAGAAGCG 22 6 3 0 1 - 10000 10022 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_1:10000-10023_108 GAGTCCGAGCAGAAGAAGAANGG GAGTCCGAGCAGAAGAAGAANGG 0.0 0.0 0.0 0.0
4 | 2 10000 10023 2:10000-10023 184 + 10016 10019 2:[10016,10019)/. GCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAAC GAGTCCGAGCAGAAGAAGAAGGG 0 + 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_2:10000-10023_184 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0
5 | 4 10000 10023 4:10000-10023 44 + 10016 10018 4:[10016,10018)/. TGATCACTCCAAGTAGAAGAAGAAAAGCTAGCTTCCATATAA CACTCCAAGTAGAAGAAGAAAAG 5 + 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_4:10000-10023_44 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0
6 | 8 10000 10023 8:10000-10023 126 - 10004 10007 8:[10004,10007)/. GCACTAGAATCCCAGGCCATTACTCTTCTGCTTGGCCTTTTGG AAGGCCAAGCAGAAGAGTAATGG 5 - 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_8:10000-10023_126 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0
7 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/identified/TestSample_identified_unmatched.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/identified/TestSample_identified_unmatched.txt
--------------------------------------------------------------------------------
/test/data/StandardOutput/variants/TestSample_Variants.txt:
--------------------------------------------------------------------------------
1 | Chromosome Start End Name ReadCount Strand Variant_WindowSequence Variant_Site_SubstitutionsOnly.Sequence Variant_Site_SubstitutionsOnly.NumSubstitutions Variant_Site_SubstitutionsOnly.Strand Variant_Site_GapsAllowed.Sequence Variant_Site_GapsAllowed.Length Variant_Site_GapsAllowed.Substitutions Variant_Site_GapsAllowed.Insertions Variant_Site_GapsAllowed.Deletions Variant_Site_GapsAllowed.Strand Cell Targetsite TargetSequence Variant_RealignedTargetSequence Reference Variant Genotype Quality
2 | 4 10000 10023 4:10000-10023 44 + TGATCACTCCAAGcAGAAGAAGAAAAGCTAGCTTCCATATAA CACTCCAAGcAGAAGAAGAAAAG 4 + TestCell TestSample GAGTCCGAGCAGAAGAAGAANGG none T C 1|1 162.998
3 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/variants/TestSample_mpileupCall.txt:
--------------------------------------------------------------------------------
1 | targetsite site_name chromosome one_based_position reference variant quality genotype depth PL
2 | TestSample TestSample_12:10000-10023 12 10000 A T 162.998 1|1 25 196_75_0
3 | TestSample TestSample_12:10000-10023 12 10025 A G 119.008 0|1 23 149_0_159
4 | TestSample TestSample_4:10000-10023 4 10010 T C 162.998 1|1 23 196_69_0
5 |
--------------------------------------------------------------------------------
/test/data/StandardOutput/visualization/TestSample_offtargets.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/test/data/input/CIRCLEseq_test_genome.fa.amb:
--------------------------------------------------------------------------------
1 | 100270 5 0
2 |
--------------------------------------------------------------------------------
/test/data/input/CIRCLEseq_test_genome.fa.ann:
--------------------------------------------------------------------------------
1 | 100270 5 11
2 | 0 2 (null)
3 | 0 20178 0
4 | 0 8 (null)
5 | 20178 20023 0
6 | 0 1 (null)
7 | 40201 20023 0
8 | 0 12 (null)
9 | 60224 20023 0
10 | 0 4 (null)
11 | 80247 20023 0
12 |
--------------------------------------------------------------------------------
/test/data/input/CIRCLEseq_test_genome.fa.bwt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/CIRCLEseq_test_genome.fa.bwt
--------------------------------------------------------------------------------
/test/data/input/CIRCLEseq_test_genome.fa.fai:
--------------------------------------------------------------------------------
1 | 2 20178 3 20178 20179
2 | 8 20023 20185 20023 20024
3 | 1 20023 40212 20023 20024
4 | 12 20023 60240 20023 20024
5 | 4 20023 80267 20023 20024
6 |
--------------------------------------------------------------------------------
/test/data/input/CIRCLEseq_test_genome.fa.pac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/CIRCLEseq_test_genome.fa.pac
--------------------------------------------------------------------------------
/test/data/input/CIRCLEseq_test_genome.fa.sa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/CIRCLEseq_test_genome.fa.sa
--------------------------------------------------------------------------------
/test/data/input/TEST.r1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST.r1.fastq.gz
--------------------------------------------------------------------------------
/test/data/input/TEST.r2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST.r2.fastq.gz
--------------------------------------------------------------------------------
/test/data/input/TEST_control.r1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST_control.r1.fastq.gz
--------------------------------------------------------------------------------
/test/data/input/TEST_control.r2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST_control.r2.fastq.gz
--------------------------------------------------------------------------------
/test/scripts/CIRCLEseq_prepare_test_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #####################################################################################
3 | ### CIRCLEseq_prepare_test_data.sh: assemble the fastq files for the test
4 | #####################################################################################
5 | ### Regions
6 | on_target="2:73160981-73161004"
7 | off_target01="8:120587494-120587517"
8 | off_target02="1:234492858-234492881"
9 | off_target03="12:73504668-73504691"
10 | off_target04="4:48639390-48639413"
11 | hotspots="1:121485221-121485228"
12 |
13 | ### Get the names of reads that overlap with the selected test regions
14 | samtools view sample.bam $on_target $off_target01 $off_target02 $off_target03 $off_target04 $hotspots | cut -f1 | sort | uniq > sample_read_names.txt
15 | samtools view control.bam $on_target $off_target01 $off_target02 $off_target03 $off_target04 $hotspots | cut -f1 | sort | uniq > control_read_names.txt
16 | cat sample_read_names.txt control_read_names.txt > read_names.txt
17 |
18 | ### Subset FASTQs to extract _all_ read pairs where at least one of the reads falls in a specified test region
19 | zcat fastq/128_S3_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST.r1.fastq.gz
20 | zcat fastq/128_S3_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST.r2.fastq.gz
21 | zcat fastq/Negative_S1_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST_control.r1.fastq.gz
22 | zcat fastq/Negative_S1_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST_control.r2.fastq.gz
23 |
--------------------------------------------------------------------------------
/test/scripts/CIRCLEseq_prepare_test_reference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #####################################################################################
3 | ### CIRCLEseq_prepare_test_reference.sh: assemble the reference genome for the test
4 | #####################################################################################
5 | ### Get chromosomes
6 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.1.fa.gz
7 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.2.fa.gz
8 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.4.fa.gz
9 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.8.fa.gz
10 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.12.fa.gz
11 |
12 | ### Assemble reference
13 | cat *.fa.gz > Homo_sapiens.GRCh37.subset.fa.gz
14 | gunzip Homo_sapiens.GRCh37.subset.fa.gz
15 | samtools faidx Homo_sapiens.GRCh37.subset.fa
16 |
17 | ### Pad test regions with 10kb on either side
18 | bedtools slop -i CIRCLEseq_test.bed -g Homo_sapiens.GRCh37.subset.fa.fai -b 10000 > CIRCLEseq_test_padded.bed
19 |
20 | ### Extract sequences from reference file for each padded interval
21 | bedtools getfasta -fi Homo_sapiens.GRCh37.subset.fa -bed CIRCLEseq_test_padded.bed -fo CIRCLEseq_test_genome.fa -name
22 |
--------------------------------------------------------------------------------
/test/scripts/CIRCLEseq_test_bed.R:
--------------------------------------------------------------------------------
1 | ######################################################################################################
2 | ### CIRCLEseq_test_bed.R: make a BED file with regions including
3 | ### on-target site, 2 off-target sites without variants,
4 | ### 2 off-target sites with variants, and 1 region without off-targets.
5 | ######################################################################################################
6 | bed = data.frame(chr=c('2', '8', '1', '12', '4'), start=c(73160981, 120587494, 234492858, 73504668, 48639390), end=c(73161159, 120587517, 234492881, 73504691, 48639413), name=c('2', '8', '1', '12', '4'))
7 |
8 | write.table(bed, 'CIRCLEseq_test.bed', quote=FALSE, row.names=FALSE, col.names=FALSE, sep='\t')
9 |
10 | ### Information about the sites
11 | on_target="2:73160981-73161004"
12 | off_target01="8:120587494-120587517"
13 | off_target02="1:234492858-234492881"
14 | off_target_with_variantWindowOnly="12:73504668-73504691"
15 | off_target_with_variants="4:48639390-48639413"
16 | hotspots="2:73161104-73161159"
17 | ######################################################################################################
18 | ######################################################################################################
19 |
--------------------------------------------------------------------------------
/test/scripts/Test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python ../../circleseq/circleseq.py all --manifest ../CIRCLEseq_MergedTest.yaml
3 |
4 | python ../../circleseq/circleseq.py all --manifest ../CIRCLEseq_StandardTest.yaml
5 |
--------------------------------------------------------------------------------
/test/test_circleseq_merged.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | test_circleseq_merged
6 | ----------------------------------
7 |
8 | Tests for `circleseq` module.
9 | """
10 |
11 | import yaml
12 | import unittest
13 | import os
14 | import shutil
15 | import utils
16 | from circleseq import circleseq
17 |
18 | TEST_OUTPUT_PATH = 'tmp'
19 |
20 | TEST_MANIFEST_PATH = os.path.join('CIRCLEseq_MergedTest.yaml')
21 |
22 | CORRECT_ALIGNED_OUTPUT = 'data/MergedOutput/aligned'
23 | CORRECT_IDENTIFIED_OUTPUT = 'data/MergedOutput/identified'
24 | CORRECT_MERGED_OUTPUT = 'data/MergedOutput/merged'
25 | CORRECT_VISUALIZATION_OUTPUT = 'data/MergedOutput/visualization'
26 |
27 | CORRECT_ALL_OUTPUT = 'data/MergedOutput'
28 |
class FullPipelineTestCase(unittest.TestCase):
    """Run the merged-read pipeline stage by stage against reference output."""

    def setUp(self):
        """No per-test fixtures are needed."""

    def testFullPipeline(self):
        """Execute each stage and compare its output folder to the expected one."""
        pipeline = circleseq.CircleSeq()
        pipeline.parseManifest(TEST_MANIFEST_PATH)

        # (stage to run, output sub-folder it fills, folder holding the
        # expected files); order matters because each stage consumes the
        # output of the previous one.
        stages = (
            (pipeline.alignReads, "aligned", CORRECT_ALIGNED_OUTPUT),
            (pipeline.findCleavageSites, 'identified', CORRECT_IDENTIFIED_OUTPUT),
            (pipeline.visualize, 'visualization', CORRECT_VISUALIZATION_OUTPUT),
        )

        for run_stage, subfolder, expected_folder in stages:
            run_stage()
            produced_folder = os.path.join(pipeline.analysis_folder, subfolder)
            self.assertTrue(
                utils.checkFolderEquality(produced_folder, expected_folder)
            )

    def tearDown(self):
        """Nothing to clean up."""
53 |
54 | if __name__ == '__main__':
55 | unittest.main()
--------------------------------------------------------------------------------
/test/test_circleseq_std.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | test_circleseq_std
6 | ----------------------------------
7 |
8 | Tests for `circleseq` module.
9 | """
10 |
11 | import yaml
12 | import unittest
13 | import os
14 | import shutil
15 | import utils
16 | from circleseq import circleseq
17 |
18 | TEST_OUTPUT_PATH = 'tmp'
19 |
20 | TEST_MANIFEST_PATH = os.path.join('CIRCLEseq_StandardTest.yaml')
21 |
22 | CORRECT_ALIGNED_OUTPUT = 'data/StandardOutput/aligned'
23 | CORRECT_IDENTIFIED_OUTPUT = 'data/StandardOutput/identified'
24 | CORRECT_VARIANTS_OUTPUT = 'data/StandardOutput/variants'
25 | CORRECT_VISUALIZATION_OUTPUT = 'data/StandardOutput/visualization'
26 |
27 | CORRECT_ALL_OUTPUT = 'data'
28 |
class FullPipelineTestCase(unittest.TestCase):
    """Run the standard pipeline stage by stage against reference output."""

    def setUp(self):
        """No per-test fixtures are needed."""

    def testFullPipeline(self):
        """Execute each stage and compare its output folder to the expected one."""
        pipeline = circleseq.CircleSeq()
        pipeline.parseManifest(TEST_MANIFEST_PATH)

        # (stage to run, output sub-folder it fills, folder holding the
        # expected files); order matters because each stage consumes the
        # output of the previous one.
        stages = (
            (pipeline.alignReads, "aligned", CORRECT_ALIGNED_OUTPUT),
            (pipeline.findCleavageSites, 'identified', CORRECT_IDENTIFIED_OUTPUT),
            (pipeline.visualize, 'visualization', CORRECT_VISUALIZATION_OUTPUT),
            (pipeline.callVariants, 'variants', CORRECT_VARIANTS_OUTPUT),
        )

        for run_stage, subfolder, expected_folder in stages:
            run_stage()
            produced_folder = os.path.join(pipeline.analysis_folder, subfolder)
            self.assertTrue(
                utils.checkFolderEquality(produced_folder, expected_folder)
            )

    def tearDown(self):
        """Nothing to clean up."""
57 |
58 | if __name__ == '__main__':
59 | unittest.main()
--------------------------------------------------------------------------------
/test/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import os
4 | import sys
5 | import inspect
6 | import filecmp
7 | from itertools import islice
8 |
def checkFolderEquality(folder1, folder2):
    """
    Given two folders, check if there are the same number of files,
    that the names of files are the same, and that the files with the same
    names are the same.

    Entries whose name starts with '.' are ignored in both folders.
    Files ending in '.sam' are compared record by record with their '@'
    header lines excluded; every other file is compared byte for byte.

    Returns True when the folders match, otherwise prints the reason and
    returns False.
    """

    folder1_files = [x for x in os.listdir(folder1) if not x.startswith('.')]
    folder2_files = [x for x in os.listdir(folder2) if not x.startswith('.')]

    if set(folder1_files) != set(folder2_files):
        print('Folders do not have the same filenames.')
        return False

    for f in folder1_files:
        file1 = os.path.join(folder1, f)
        file2 = os.path.join(folder2, f)

        if f.split('.')[-1] == 'sam':
            matches = _samRecordsMatch(file1, file2)
        else:
            # shallow=False forces a content comparison; the default only
            # looks at the os.stat() signature when it is identical.
            matches = filecmp.cmp(file1, file2, shallow=False)

        if not matches:
            print('{0} does not match between folders.'.format(f))
            return False

    return True


def _samRecordsMatch(file1, file2):
    """
    Return True when two SAM files contain the same non-header lines in the
    same order.

    Lines starting with '@' are filtered out of each file independently, so
    a differing number of header lines does not shift the comparison, and
    both files are read to the end so that a truncated file is detected.
    """
    with open(file1, 'r') as a, open(file2, 'r') as b:
        records1 = (line for line in a if not line.startswith('@'))
        records2 = (line for line in b if not line.startswith('@'))
        while True:
            # None marks exhaustion; a real line is never None, so a file
            # that ends early compares unequal to the longer one.
            line1 = next(records1, None)
            line2 = next(records2, None)
            if line1 != line2:
                return False
            if line1 is None:
                return True
40 |
41 |
def head(filepath, n=10):
    """
    Print the first n lines of the file at filepath.

    Each line is passed to print() unchanged, so it keeps its own trailing
    newline in addition to the one print() appends.
    """
    with open(filepath) as stream:
        leading_lines = islice(stream, n)
        for text in leading_lines:
            print(text)
46 |
--------------------------------------------------------------------------------