├── ._README.MD ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.MD ├── build_pypi.sh ├── changeseq ├── NUC_SIMPLE ├── __init__.py ├── alignReads.py ├── callVariants.py ├── changeseq.py ├── findCleavageSites.py ├── log.py ├── mergeReads.py ├── referenceFree.py ├── refseq_gene_name.list ├── refseq_gene_name.py ├── test.yaml ├── utility.py ├── validation.py └── visualization.py ├── conda_build ├── conda_build_config.yaml └── meta.yaml ├── example_output.png ├── requirements.txt ├── scripts ├── NUC_SIMPLE ├── __init__.py ├── site_pvalue.R ├── test.py ├── test_align.py └── test_ga.py ├── setup.cfg ├── setup.py └── test ├── CIRCLEseq_MergedTest.yaml ├── CIRCLEseq_StandardTest.yaml ├── __init__.py ├── data ├── MergedOutput │ ├── aligned │ │ ├── TestSample.bam │ │ ├── TestSample.bam.bai │ │ ├── TestSample.sam │ │ ├── TestSample_sorted.bam │ │ ├── control_TestSample.bam │ │ ├── control_TestSample.bam.bai │ │ ├── control_TestSample.sam │ │ └── control_TestSample_sorted.bam │ ├── fastq │ │ ├── TestSample_merged.fastq.gz │ │ └── control_TestSample_merged.fastq.gz │ ├── identified │ │ ├── TestSample_CONTROL_coordinates.txt │ │ ├── TestSample_NUCLEASE_coordinates.txt │ │ ├── TestSample_count.txt │ │ ├── TestSample_identified_matched.txt │ │ └── TestSample_identified_unmatched.txt │ └── visualization │ │ └── TestSample_offtargets.svg ├── StandardOutput │ ├── aligned │ │ ├── TestSample.bam │ │ ├── TestSample.bam.bai │ │ ├── TestSample.sam │ │ ├── TestSample_sorted.bam │ │ ├── control_TestSample.bam │ │ ├── control_TestSample.bam.bai │ │ ├── control_TestSample.sam │ │ └── control_TestSample_sorted.bam │ ├── identified │ │ ├── TestSample_CONTROL_coordinates.txt │ │ ├── TestSample_NUCLEASE_coordinates.txt │ │ ├── TestSample_count.txt │ │ ├── TestSample_identified_matched.txt │ │ └── TestSample_identified_unmatched.txt │ ├── variants │ │ ├── TestSample_Variants.txt │ │ └── TestSample_mpileupCall.txt │ └── visualization │ │ └── TestSample_offtargets.svg └── input │ ├── 
CIRCLEseq_test_genome.fa │ ├── CIRCLEseq_test_genome.fa.amb │ ├── CIRCLEseq_test_genome.fa.ann │ ├── CIRCLEseq_test_genome.fa.bwt │ ├── CIRCLEseq_test_genome.fa.fai │ ├── CIRCLEseq_test_genome.fa.pac │ ├── CIRCLEseq_test_genome.fa.sa │ ├── TEST.r1.fastq.gz │ ├── TEST.r2.fastq.gz │ ├── TEST_control.r1.fastq.gz │ └── TEST_control.r2.fastq.gz ├── scripts ├── CIRCLEseq_prepare_test_data.sh ├── CIRCLEseq_prepare_test_reference.sh ├── CIRCLEseq_test_bed.R └── Test.sh ├── test_circleseq_merged.py ├── test_circleseq_std.py └── utils.py /._README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/._README.MD -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | changeseq.egg-info/ 4 | *.py[cod] 5 | test/output 6 | <<<<<<< HEAD 7 | *.DS_Store 8 | ======= 9 | .DS_Store 10 | >>>>>>> add-testing 11 | 12 | # PyCharm Pref Folder 13 | .idea 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Packages 19 | *.egg 20 | *.egg-info 21 | dist 22 | build 23 | eggs 24 | parts 25 | bin 26 | var 27 | sdist 28 | develop-eggs 29 | .installed.cfg 30 | lib64 31 | 32 | # Installer logs 33 | pip-log.txt 34 | 35 | # Unit test / coverage reports 36 | .coverage 37 | .tox 38 | nosetests.xml 39 | htmlcov 40 | 41 | # Translations 42 | *.mo 43 | 44 | # Mr Developer 45 | .mr.developer.cfg 46 | .project 47 | .pydevproject 48 | 49 | # Complexity 50 | output/*.html 51 | output/*/index.html 52 | 53 | # Sphinx 54 | docs/_build 55 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "2.7" 7 | 8 | before_install: 9 | - cd 
test 10 | - git clone https://github.com/lh3/bwa.git 11 | - cd bwa 12 | - git checkout tags/v0.7.13 13 | - make 14 | - cd .. 15 | - PATH=`pwd`/bwa:$PATH 16 | - git clone https://github.com/samtools/htslib.git 17 | - cd htslib 18 | - git checkout tags/1.3 19 | - make 20 | - cd .. 21 | - git clone https://github.com/samtools/samtools.git 22 | - cd samtools 23 | - git checkout tags/1.3 24 | - make 25 | - cd .. 26 | - PATH=`pwd`/samtools:$PATH 27 | - cd .. 28 | - git clone git://github.com/samtools/htslib.git 29 | - cd htslib 30 | - git checkout tags/1.7 31 | - cd .. 32 | - git clone git://github.com/samtools/bcftools.git 33 | - cd bcftools 34 | - git checkout tags/1.6 35 | - make 36 | - cd .. 37 | - PATH=`pwd`/bcftools:$PATH 38 | 39 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 40 | install: 41 | - pip install --upgrade pip setuptools wheel 42 | - pip install --only-binary=numpy,scipy numpy scipy 43 | - pip install -r requirements.txt 44 | 45 | # command to run tests, e.g. python setup.py test 46 | script: 47 | cd test && nosetests --exe -v 48 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | 2 | [![Version][version-shield]][version-url] 3 | [![Python versions][python-shield]][python-url] 4 | [![Platforms][platform-shield]][python-url] 5 | 6 | 7 | # CHANGE-seq: Circularization for High-throughput Analysis Nuclease Genome-wide Effects by Sequencing 8 | 9 | This is a repository for CHANGE-seq analytical software, which takes sample-specific paired-end FASTQ files as input and produces a list of CHANGE-seq detected off-target cleavage sites as output. 10 | 11 | # Summary 12 | 13 | This package implements a pipeline that takes in reads from the CHANGE-seq assay and returns detected cleavage sites as output. The individual pipeline steps are: 14 | 15 | 1. 
**Merge**: Merge read1 and read2 for easier mapping to genome. 16 | 2. **Read Alignment**: Merged paired end reads from the assay are aligned to the reference genome using the BWA-MEM algorithm with default parameters (Li. H, 2009). 17 | 3. **Cleavage Site Identification**: Mapped sites are analyzed to determine which represent high-quality cleavage sites. 18 | 4. **Visualization of Results**: Identified on-target and off-target cleavage sites are rendered as a color-coded alignment map for easy analysis of results. 19 | 20 | # Installation 21 | 22 | The easiest way to install the change-seq pipeline is via conda. 23 | 24 | ``` 25 | 26 | conda create -n changeseq -c conda-forge -c bioconda -c anaconda -c omnia -c tsailabSJ changeseq 27 | 28 | source activate changeseq 29 | 30 | changeseq.py -h 31 | 32 | ## BWA 0.7.17 and samtools 1.9 are automatically installed 33 | 34 | ## If Homer is available, the identified off-targets will be annotated using "annotatePeaks.pl", specify the genome version in the YAML file. 35 | 36 | 37 | ``` 38 | 39 | Alternatively, you can git clone this repository and install 40 | 41 | ``` 42 | 43 | git clone https://github.com/tsailabSJ/changeseq 44 | 45 | cd changeseq 46 | 47 | pip install -r requirements.txt 48 | 49 | python setup.py install 50 | 51 | changeseq.py -h 52 | 53 | ## Please install BWA and samtools if you choose this option 54 | 55 | ``` 56 | 57 | ## Download Reference Genome 58 | 59 | The CHANGEseq package requires a reference genome for read mapping. You can use any genome of your choosing, but for all of our testing and original CHANGE-seq analyses we use hg19 ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)). Be sure to (g)unzip the FASTA file before use if it is compressed. 60 | 61 | # Usage 62 | 63 | The change-seq pipeline requires a manifest yaml file specifying input files, output directory, and pipeline parameters. 
Once the yaml file is created, users can simply run ``changeseq.py all --manifest /path/to/manifest.yaml`` 64 | 65 | 66 | Below is an example ``manifest.yaml`` file:: 67 | 68 | reference_genome: /data/joung/genomes/Homo_sapiens_assembly19.fasta 69 | analysis_folder: /data/joung/CHANGE-Seq/test2 70 | 71 | bwa: bwa 72 | samtools: samtools 73 | 74 | read_threshold: 4 75 | window_size: 3 76 | mapq_threshold: 50 77 | start_threshold: 1 78 | gap_threshold: 3 79 | mismatch_threshold: 6 80 | search_radius: 30 81 | merged_analysis: True 82 | 83 | samples: 84 | U2OS_exp1_VEGFA_site_1: 85 | target: GGGTGGGGGGAGTTTGCTCCNGG 86 | read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R1_001.fastq.gz 87 | read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R2_001.fastq.gz 88 | controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz 89 | controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz 90 | description: U2OS_exp1 91 | U2OS_exp1_EMX1: 92 | target: GAGTCCGAGCAGAAGAAGAANGG 93 | read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R1_001.fastq.gz 94 | read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R2_001.fastq.gz 95 | controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz 96 | controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz 97 | description: U2OS_exp1 98 | 99 | ## Quickstart 100 | 101 | ``` 102 | 103 | git clone https://github.com/tsailabSJ/changeseq 104 | 105 | cd changeseq/test 106 | 107 | changeseq.py all --manifest CIRCLEseq_MergedTest.yaml 108 | 109 | ``` 110 | 111 | ## Example Output 112 | 113 | ![x](example_output.png) 114 | 115 | # Writing A Manifest File 116 | When running the end-to-end analysis 
functionality of the CHANGEseq package, a number of inputs are required. To simplify the formatting of these inputs and to encourage reproducibility, these parameters are inputted into the pipeline via a manifest formatted as a YAML file. YAML files allow easy-to-read specification of key-value pairs. This allows us to easily specify our parameters. The following fields are required in the manifest: 117 | 118 | - `reference_genome`: The absolute path to the reference genome FASTA file. 119 | - `analysis_folder`: The absolute path to the output folder in which all pipeline outputs will be saved. 120 | - `bwa`: The absolute path to the `bwa` executable 121 | - `samtools`: The absolute path to the `samtools` executable 122 | - `read_threshold`: The minimum number of reads at a location for that location to be called as a site. We recommend leaving it to the default value of 4. 123 | - `window_size`: Size of the sliding window, we recommend leaving it to the default value of 3. 124 | - `mapq_threshold`: Minimum read mapping quality score. We recommend leaving it to the default value of 50. 125 | - `start_threshold`: Tolerance for breakpoint location. We recommend leaving it to the default value of 1. 126 | - `gap_threshold`: Distance between breakpoints. We recommend leaving it to the default value of 3 for Cas9. 127 | - `mismatch_threshold`: Number of tolerated gaps in the fuzzy target search step. We recommend leaving it to the default value of 6. 128 | - `read_length`: Fastq file read length, default is 151. 129 | - `PAM`: PAM sequence, default is NGG. 130 | - `genome`: used for homer peak annotation, e.g., hg19, hg38, mm9, or mm10. 131 | - `merged_analysis`: Whether or not the paired read merging step should take place. Default is True. 132 | - `samples`: Lists the samples you wish to analyze and the details for each. Each sample name should be nested under the top level samples key, and each sample detail should be nested under the sample name. See the sample manifest for an example. 
133 | - For each sample, you must provide the following parameters: 134 | - `target`: Target sequence for that sample. Accepts degenerate bases. 135 | - `read1`: The absolute path to the .FASTQ(.gz) file containing the read1 reads. 136 | - `read2`: The absolute path to the .FASTQ(.gz) file containing the read2 reads. 137 | - `controlread1`: The absolute path to the .FASTQ(.gz) file containing the control read1 reads. 138 | - `controlread2`: The absolute path to the .FASTQ(.gz) file containing the control read2 reads. 139 | - `description`: A brief description of the sample 140 | 141 | 142 | # Pipeline Output 143 | When running the full pipeline, the results of each step are outputted to the `output_folder` in a separate folder for each step. The output folders and their respective contents are as follows: 144 | 145 | - `output_folder/aligned`: Contains an alignment `.sam`, alignment `.bam`, sorted `bam`, and `.bai` index file for each sample. 146 | - `output_folder/fastq`: Merged `.fastq.gz` files for each sample. 147 | - `output_folder/identified`: Contains tab-delimited `.txt` files for each sample containing the identified DSBs, control DSBs, filtered DSBs, and read quantification. 148 | - `output_folder/visualization`: Contains a `.svg` vector image representing an alignment of all detected off-targets to the targetsite for each sample. 
149 | 150 | # FAQ 151 | 152 | ## Homer installation 153 | 154 | ``` 155 | 156 | conda install -c bioconda homer 157 | 158 | # To install genome annotation 159 | # Ref: http://homer.ucsd.edu/homer/introduction/configure.html 160 | 161 | ## Suppose you want to install hg19, follow the command here: 162 | 163 | annotatePeaks.pl xxx hg19 164 | 165 | ## You should be able to see: 166 | 167 | !!!!Genome hg19 not found in /rgs01/project_space/tsaigrp/Genomics/common/anaconda3/envs/changeseq/share/homer-4.11-2/.//config.txt 168 | 169 | To check if is available, run "perl /rgs01/project_space/tsaigrp/Genomics/common/anaconda3/envs/changeseq/share/homer-4.11-2/.//configureHomer.pl -list" 170 | If so, add it by typing "perl /rgs01/project_space/tsaigrp/Genomics/common/anaconda3/envs/changeseq/share/homer-4.11-2/.//configureHomer.pl -install hg19" 171 | 172 | ## Copy and paste the perl command to install genome annotation 173 | ``` 174 | 175 | 176 | 177 | [version-shield]: https://img.shields.io/conda/v/tsailabsj/changeseq.svg 178 | [version-url]: https://anaconda.org/tsailabSJ/changeseq 179 | [python-shield]: https://img.shields.io/pypi/pyversions/changeseq.svg 180 | [python-url]: https://pypi.python.org/pypi/changeseq 181 | [platform-shield]: https://anaconda.org/tsailabsj/changeseq/badges/platforms.svg 182 | -------------------------------------------------------------------------------- /build_pypi.sh: -------------------------------------------------------------------------------- 1 | python setup.py sdist 2 | python setup.py bdist_wheel 3 | twine upload dist/* 4 | 5 | -------------------------------------------------------------------------------- /changeseq/NUC_SIMPLE: -------------------------------------------------------------------------------- 1 | # 2 | # This matrix was created by Todd Lowe 12/10/92 3 | # 4 | # Uses ambiguous nucleotide codes, probabilities rounded to 5 | # nearest integer 6 | # 7 | # Lowest score = -4, Highest score = 5 8 | # 9 | # Modified by 
def alignReads(BWA_path, HG19_path, read1, read2, outfile, samtools_path='samtools'):
    """Align reads to the reference with BWA-MEM and build sorted, indexed BAM files.

    Steps, in order:

    1. Run ``bwa index`` on the reference when any of the BWA index files
       (``.pac``, ``.amb``, ``.ann``, ``.bwt``, ``.sa``) is missing.
    2. Run ``bwa mem`` and write its SAM output to ``outfile``.
    3. Coordinate-sort the SAM into ``<sample>.bam`` and index that BAM.
    4. Write a name-sorted copy to ``<sample>_sorted.bam``.

    ``<sample>`` is the part of the ``outfile`` basename before its first dot;
    every output is written into the directory of ``outfile``.

    Commands are executed as argument lists, without a shell, so paths that
    contain spaces or shell metacharacters are passed through unchanged.
    NOTE: as a consequence ``~`` and environment variables inside paths are not
    expanded; pass already-expanded paths.

    Args:
        BWA_path: command used to invoke bwa. It is split with shell-like
            rules, so a prefix such as ``"env VAR=1 bwa"`` still works and a
            path containing spaces can be quoted.
        HG19_path: path to the reference genome FASTA.
        read1: path to the first FASTQ file.
        read2: path to the second FASTQ file. An empty value is left out of
            the ``bwa mem`` command (with the former shell string an empty
            ``read2`` vanished from the command line in the same way).
        outfile: path of the SAM file to write.
        samtools_path: command used to invoke samtools; defaults to looking
            up ``samtools`` on ``PATH`` as before.

    Raises:
        subprocess.CalledProcessError: if bwa or samtools exits non-zero.
        OSError: if an executable cannot be started or ``outfile`` cannot be
            opened for writing.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    import shlex

    sample_name = os.path.basename(outfile).split('.')[0]
    output_folder = os.path.dirname(outfile)
    base_name = os.path.join(output_folder, sample_name)
    sam_filename = outfile
    bam_filename = base_name + '.bam'
    name_sorted_bam_filename = base_name + '_sorted.bam'

    # os.path.dirname() is '' for a bare filename and os.makedirs('') raises,
    # so only create the folder when there is one to create.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    bwa_command = shlex.split(BWA_path)
    samtools_command = shlex.split(samtools_path)

    # Check if genome is already indexed by bwa
    index_files_extensions = ['.pac', '.amb', '.ann', '.bwt', '.sa']
    genome_indexed = all(os.path.isfile(HG19_path + extension)
                         for extension in index_files_extensions)

    # If the genome is not already indexed, index it
    if not genome_indexed:
        logger.info('Genome index files not detected. Running BWA to generate indices.')
        bwa_index_command = bwa_command + ['index', HG19_path]
        logger.info('Running bwa command: %s', ' '.join(bwa_index_command))
        # check_call (not call): aligning against a half-built index must not
        # be attempted when indexing fails.
        subprocess.check_call(bwa_index_command)
        logger.info('BWA genome index generated')
    else:
        logger.info('BWA genome index found.')

    # Run paired end alignment against the genome
    logger.info('Running paired end mapping for {0}'.format(sample_name))
    read_files = [path for path in (read1, read2) if path]
    bwa_alignment_command = bwa_command + ['mem', HG19_path] + read_files
    samtools_sam_to_bam_command = samtools_command + ['sort', '-o', bam_filename, sam_filename]
    samtools_index_command = samtools_command + ['index', bam_filename]
    samtools_sort_by_name_command = samtools_command + ['sort', '-o', name_sorted_bam_filename,
                                                        '-n', bam_filename]

    # Open the outfile and redirect the output of the alignment to it.
    logger.info('%s > %s', ' '.join(bwa_alignment_command), sam_filename)
    with open(sam_filename, 'w') as sam_file:
        subprocess.check_call(bwa_alignment_command, stdout=sam_file)
    logger.info('Paired end mapping for {0} completed.'.format(sample_name))

    # Convert SAM to a coordinate-sorted BAM file
    logger.info(' '.join(samtools_sam_to_bam_command))
    subprocess.check_call(samtools_sam_to_bam_command)
    logger.info('Sorting by coordinate position for {0} complete.'.format(sample_name))

    # Index BAM file
    logger.info(' '.join(samtools_index_command))
    subprocess.check_call(samtools_index_command)
    logger.info('Indexing for {0} complete.'.format(sample_name))

    # Sort BAM file by name
    logger.info(' '.join(samtools_sort_by_name_command))
    subprocess.check_call(samtools_sort_by_name_command)
    logger.info('Sorting for {0} by name complete.'.format(sample_name))
__future__ import print_function 2 | 3 | import subprocess 4 | import sys 5 | import os 6 | import argparse 7 | import regex 8 | import re 9 | import HTSeq 10 | import pyfaidx 11 | from findCleavageSites import get_sequence, regexFromSequence, alignSequences, reverseComplement, extendedPattern, realignedSequences 12 | 13 | 14 | """ 15 | Run samtools:mpileup and get all identified variants in the window sequences 16 | """ 17 | def snpCall(matched_file, reference, bam_file, out, search_radius): 18 | basename = os.path.basename(out) 19 | output_folder = os.path.dirname(out) 20 | 21 | # open matched file 22 | regions = list() 23 | with open(matched_file, 'rU') as f: 24 | f.readline() 25 | for line in f: 26 | site = line.strip().split('\t') 27 | # chromosome, windowStart, windowEnd, strand, bam, region_basename (=Targetsite_Name) 28 | regions.append([site[0], int(site[6]) - search_radius, int(site[7]) + search_radius, '*', bam_file, '_'.join([site[26], site[3]])]) 29 | 30 | print('Running samtools:mpileup for %s' % basename, file=sys.stderr) 31 | out_vcf = os.path.join(output_folder, basename + '_mpileup_output') 32 | if os.path.exists(out_vcf): 33 | subprocess.check_call('rm -r %s' % out_vcf, shell=True, env=os.environ.copy()) 34 | os.makedirs(out_vcf) 35 | process_mpileup = open(os.path.join(out_vcf, 'logFile_mpileup'), 'w') 36 | 37 | for item in regions: 38 | chromosome, windowStart, windowEnd, strand, bam_file, region_basename = item 39 | region = '%s%s%s%s%s' % (chromosome, ":", int(windowStart), "-", int(windowEnd)) 40 | output = os.path.join(out_vcf, region_basename + '.vcf') 41 | 42 | cl_vcf = 'samtools mpileup -v --region %s --fasta-ref %s %s > %s' % (region, reference, bam_file, output) 43 | subprocess.check_call(cl_vcf, shell=True, env=os.environ.copy(), stderr=process_mpileup, stdout=process_mpileup) 44 | process_mpileup.close() 45 | 46 | print('Collecting variants for %s' % basename, file=sys.stderr) 47 | out_bcf = os.path.join(output_folder, basename + 
'_output_bcftools') 48 | if os.path.exists(out_bcf): 49 | subprocess.check_call('rm -r %s' % out_bcf, shell=True, env=os.environ.copy()) 50 | os.makedirs(out_bcf) 51 | process_bcftools = open(os.path.join(out_bcf, 'logFile_bcftools'), 'w') 52 | 53 | vcf_files = [f for f in os.listdir(out_vcf) if os.path.isfile(os.path.join(out_vcf, f))] 54 | for arch in vcf_files: 55 | if not arch.startswith('.') and arch.endswith('.vcf'): 56 | name = arch[:-4] 57 | output = os.path.join(out_bcf, name + '_BCFcall.vcf') 58 | 59 | cl_bcf = 'bcftools call -v -c %s > %s' % (os.path.join(out_vcf, arch), output) 60 | subprocess.check_call(cl_bcf, shell=True, env=os.environ.copy(), stderr=process_bcftools, stdout=process_bcftools) 61 | process_bcftools.close() 62 | 63 | print('Collecting significant variant calls for %s' % basename, file=sys.stderr) 64 | out_svc = os.path.join(output_folder, basename + '_output_svc') 65 | if os.path.exists(out_svc): 66 | subprocess.check_call('rm -r %s' % out_svc, shell=True, env=os.environ.copy()) 67 | os.makedirs(out_svc) 68 | process_svc = open(os.path.join(out_svc, 'logFile_svc'), 'w') 69 | 70 | bcf_files = [f for f in os.listdir(out_bcf) if os.path.isfile(os.path.join(out_bcf, f))] 71 | for arch in bcf_files: 72 | if not arch.startswith('.') and arch.endswith('.vcf'): 73 | name = arch[:-12] 74 | output = os.path.join(out_svc, name + '_SIGNFcall.txt') 75 | 76 | cl_sed = "sed -n '/##/!p' %s | awk 'FNR>1' > %s" % (os.path.join(out_bcf, arch), output) 77 | subprocess.check_call(cl_sed, shell=True, env=os.environ.copy(), stderr=process_svc, stdout=process_svc) 78 | process_svc.close() 79 | 80 | print('Consolidating all the significant variant calls for %s' % basename, file=sys.stderr) 81 | header = ['targetsite', 'site_name', 'chromosome', 'one_based_position', 'reference', 'variant', 'quality', 'genotype', 'depth', 'PL'] 82 | variants = list() 83 | 84 | svc_files = [f for f in os.listdir(out_svc) if os.path.isfile(os.path.join(out_svc, f))] 85 | for arch 
def realignVariantBulge(bulge_sequence, window_sequence_variant, bulge_strand):
    """Recover a gapped off-target sequence as it appears in the variant window.

    The gap characters ('-') are removed from ``bulge_sequence``, the remaining
    bases are located case-insensitively in ``window_sequence_variant`` (or in
    its reverse complement when ``bulge_strand`` is not '+'), and the gaps are
    put back at the positions they occupy in ``bulge_sequence``.  The returned
    bases keep the letter case found in the searched sequence.

    Args:
        bulge_sequence: off-target sequence that may contain zero or more '-'.
        window_sequence_variant: window sequence carrying the variant bases.
        bulge_strand: '+' searches the window as given; any other value
            searches its reverse complement.

    Returns:
        The matched bases with '-' re-inserted at every original gap position.
        A sequence without gaps is returned without any '-' added.

    Raises:
        ValueError: if the ungapped sequence does not occur in the searched
            sequence.
    """
    bseq = bulge_sequence.replace('-', '')
    if bulge_strand == '+':
        searched_sequence = window_sequence_variant
    else:
        searched_sequence = reverseComplement(window_sequence_variant)

    # The sequence is data, not a pattern: escape it so every character is
    # matched literally.
    m_bulge = re.search(re.escape(bseq), searched_sequence, re.I)
    if m_bulge is None:
        raise ValueError('Sequence %r not found on strand %r of the variant window'
                         % (bseq, bulge_strand))

    # Walk the gapped template: copy each gap, and replace each base with the
    # corresponding base of the match.  This restores every gap (not only the
    # first) and adds none when the template has no gap.
    matched_bases = iter(m_bulge.group())
    return ''.join('-' if base == '-' else next(matched_bases)
                   for base in bulge_sequence)
def arrayOffTargets(matched_file, search_radius):
    """Index the sites of a matched file by name.

    The first line of ``matched_file`` is skipped as a header.  Every other
    line is stripped and split on tabs; column 0 is used as the chromosome,
    column 3 as the site name, and columns 6 and 7 as the integer start and
    end, which are widened by ``search_radius`` on each side.

    Returns:
        A pair of dictionaries keyed by site name: the first maps to the list
        of column values of the line, the second to the unstranded
        ``HTSeq.GenomicInterval`` of the widened window.  A name that occurs
        more than once keeps its last line.
    """
    sites_by_name = {}
    intervals_by_name = {}

    with open(matched_file, 'r') as handle:
        handle.readline()  # header line carries no site
        for record in handle:
            fields = record.strip().split('\t')

            chrom = fields[0]
            window_start = int(fields[6]) - search_radius
            window_end = int(fields[7]) + search_radius
            site_name = fields[3]

            sites_by_name[site_name] = fields
            # one genomic interval per window sequence
            intervals_by_name[site_name] = HTSeq.GenomicInterval(chrom, window_start, window_end, ".")

    return sites_by_name, intervals_by_name
| variant_flag = False 184 | site = offtargets[name] 185 | gi = gi_offtargets[name] 186 | 187 | chromosome = site[0] 188 | window_sequence = site[9] 189 | window_sequence = window_sequence.upper() 190 | cell, targetsite = site[25:27] 191 | TargetSequence = site[28] 192 | output01 = site[0:6] 193 | output03 = [cell, targetsite, TargetSequence] 194 | ots_nb, ots_bu = site[10], site[15] 195 | 196 | # obtain variant window sequence 197 | wkey = '_'.join([basename, chromosome]) 198 | insert_start, insert_end, insert_var, snp_data = list(), list(), list(), {} 199 | 200 | for i, v in ga_snp[gi].steps(): 201 | if v: 202 | position, reference, variant, genotype, quality, key = v 203 | if key == wkey: 204 | variant = variant.split(',')[0] 205 | for n, pos in enumerate(range(gi.start, gi.end)): 206 | if pos == int(position): 207 | insert_var.append(variant.lower()) 208 | insert_start.append(n) 209 | end_pos = n + len(reference) 210 | insert_end.append(end_pos) 211 | snp_data[str(position)] = [position, reference, variant, genotype, quality] 212 | 213 | tri = 0 214 | window_sequence_variant = '' 215 | for i in range(len(insert_var)): 216 | variant = insert_var[i] 217 | pos = insert_start[i] 218 | window_sequence_variant += window_sequence[tri:pos] + variant.lower() 219 | tri = insert_end[i] 220 | window_sequence_variant += window_sequence[tri:] 221 | 222 | # variant off-target sequences: only proceed if there is a variant in the window sequence 223 | window_sequence_var = window_sequence_variant.upper() 224 | if window_sequence_var != window_sequence: 225 | offtarget_sequence_no_bulge, mismatches, offtarget_sequence_length, chosen_alignment_strand_m, start_no_bulge, end_no_bulge, \ 226 | realigned_target, \ 227 | bulged_offtarget_sequence, length, score, substitutions, insertions, deletions, chosen_alignment_strand_b, bulged_start, bulged_end = \ 228 | alignSequences(TargetSequence, window_sequence_var, max_score=mismatch_threshold) 229 | 230 | variant_ots_no_bulge, 
variant_ots_bulge = '', '' 231 | 232 | # get variant sequence if the off-target sequences have changed by considering the variant window 233 | if ots_nb != offtarget_sequence_no_bulge: 234 | variant_flag = True 235 | if chosen_alignment_strand_m == '+': 236 | m_no_bulge = re.search(offtarget_sequence_no_bulge, window_sequence_variant, re.I) 237 | else: 238 | m_no_bulge = re.search(offtarget_sequence_no_bulge, reverseComplement(window_sequence_variant), re.I) 239 | variant_ots_no_bulge = m_no_bulge.group() 240 | 241 | if ots_bu != bulged_offtarget_sequence: 242 | variant_flag = True 243 | variant_ots_bulge = realignVariantBulge(bulged_offtarget_sequence, window_sequence_variant, chosen_alignment_strand_b) 244 | 245 | # collect and write variant data if we have variant off-target sequence(s) 246 | if variant_flag: 247 | total_genotype, total_reference, total_variant, total_quality = '', '', '', '' 248 | for pos in snp_data: 249 | position, reference, variant, genotype, quality = snp_data[pos] 250 | if total_genotype != '': 251 | total_genotype += ''.join([':', genotype]) 252 | total_reference += ''.join([':', reference]) 253 | total_variant += ''.join([':', variant]) 254 | total_quality += ''.join([':', quality]) 255 | else: 256 | total_genotype += ''.join([genotype]) 257 | total_reference += ''.join([reference]) 258 | total_variant += ''.join([variant]) 259 | total_quality += ''.join([quality]) 260 | 261 | output02 = [variant_ots_no_bulge, mismatches, chosen_alignment_strand_m, 262 | variant_ots_bulge, length, substitutions, insertions, deletions, chosen_alignment_strand_b] 263 | output04 = [total_reference, total_variant, total_genotype, total_quality] 264 | output_line = output01 + [window_sequence_variant] + output02 + output03 + [realigned_target] + output04 265 | 266 | with open(out + '_Variants.txt', 'a') as output_file: 267 | print(*output_line, sep='\t', file=output_file) 268 | 269 | 270 | """ 271 | Main function 272 | """ 273 | def getVariants(matched_file, 
def main():
    """Command-line entry point: parse the options and run ``getVariants``."""
    cli = argparse.ArgumentParser(
        description='Implement samtools:mpileup to identify genomic variants and adjust the off-target sequence when required.')

    # (flag, add_argument keywords), registered in this order so that the
    # generated usage/help text is unchanged.
    option_table = (
        ('--matched_file', dict(help="full_path_to/matched file in 'identified' folder", required=True)),
        ('--ref', dict(help="Reference Genome Fasta", required=True)),
        ('--bam', dict(help="Sorted BAM file", required=True)),
        ('--search_radius', dict(help="Search radius around the position window", default=20, type=int)),
        ('--mismatch_threshold', dict(help='Maximum score threshold', default=7, type=int)),
        ('--out', dict(help="Output file basename, with full path", required=True)),
    )
    for flag, keywords in option_table:
        cli.add_argument(flag, **keywords)

    opts = cli.parse_args()

    getVariants(opts.matched_file, opts.ref, opts.bam, opts.out,
                opts.search_radius, opts.mismatch_threshold)
class CircleSeq:
    """Holds the manifest settings and runs the individual pipeline steps.

    Typical use is ``parseManifest()`` followed by one or more of
    ``alignReads()``, ``findCleavageSites()``, ``visualize()`` and
    ``callVariants()``.
    """

    # Manifest keys that override the same-named attribute when present.
    _OVERRIDABLE_SETTINGS = (
        'search_radius', 'window_size', 'mapq_threshold', 'start_threshold',
        'gap_threshold', 'mismatch_threshold', 'read_threshold',
        'merged_analysis', 'all_chromosomes', 'variant_analysis', 'genome',
    )

    def __init__(self):
        self.search_radius = 20
        self.window_size = 3
        self.mapq_threshold = 50
        self.start_threshold = 1
        self.gap_threshold = 3
        self.mismatch_threshold = 6
        self.read_threshold = 6
        self.merged_analysis = True
        self.all_chromosomes = False
        self.variant_analysis = False
        self.genome = None
        self.refseq_names = None
        # Same defaults parseManifest() applies when the manifest omits these keys;
        # set here too so the attributes always exist.
        self.PAM = "NGG"
        self.read_length = 151
        self.read_count_cutoff = 6

    def parseManifest(self, manifest_path, sample='all'):
        """Load the YAML manifest, validate it and copy its settings onto self.

        Parameters:
            manifest_path: path to the YAML manifest.
            sample: name of a single sample to keep, or 'all' for every sample.

        Exits the process (after logging) when the manifest is invalid or when
        merged_analysis and variant_analysis are both enabled. Also creates the
        output sub-folders under ``analysis_folder``.
        """
        logger.info('Loading manifest...')

        with open(manifest_path, 'r') as f:
            # safe_load builds plain Python objects only; yaml.load() without an
            # explicit Loader is unsafe and is rejected by recent PyYAML releases.
            manifest_data = yaml.safe_load(f)

        try:
            # Validate manifest data
            validation.validateManifest(manifest_data)

            self.BWA_path = manifest_data['bwa']
            self.reference_genome = manifest_data['reference_genome']
            self.analysis_folder = manifest_data['analysis_folder']

            # Optional settings: keep the current value unless the manifest overrides it.
            for key in self._OVERRIDABLE_SETTINGS:
                if key in manifest_data:
                    setattr(self, key, manifest_data[key])
            if 'genome' in manifest_data and self.genome in ['hg38', 'hg19']:
                self.refseq_names = p_dir+"/refseq_gene_name.py"

            # User-configurable PAM, read length and read-count cutoff (with defaults).
            self.PAM = manifest_data.get('PAM', "NGG")
            self.read_length = manifest_data.get('read_length', 151)
            self.read_count_cutoff = manifest_data.get('read_count_cutoff', 6)

            # Do not allow to run variant_analysis with merged_analysis
            if self.merged_analysis and self.variant_analysis:
                logger.error('merged_analysis is not compatible with variant_analysis. Please remove one option.')
                sys.exit()

            if sample == 'all':
                self.samples = manifest_data['samples']
            else:
                self.samples = {sample: manifest_data['samples'][sample]}

            # Make folders for output
            for folder in ['aligned', 'identified', 'fastq', 'visualization', 'variants']:
                output_folder = os.path.join(self.analysis_folder, folder)
                if not os.path.exists(output_folder):
                    os.makedirs(output_folder)

        except Exception:
            logger.error('Incorrect or malformed manifest file. Please ensure your manifest contains all required fields.')
            # Record the underlying cause, as the other pipeline steps do.
            logger.error(traceback.format_exc())
            sys.exit()

    def alignReads(self):
        """Align every sample and its control; merge read pairs first when merged_analysis is on.

        Records the SAM path per sample in ``self.merged`` (merged analysis) or in
        ``self.aligned`` / ``self.aligned_sorted`` (paired analysis). Exits the
        process after logging if any step raises.
        """
        fastq_folder = os.path.join(self.analysis_folder, 'fastq')
        aligned_folder = os.path.join(self.analysis_folder, 'aligned')

        if self.merged_analysis:
            logger.info('Merging reads...')
            try:
                self.merged = {}
                for sample in self.samples:
                    reads = self.samples[sample]
                    sample_merge_path = os.path.join(fastq_folder, sample + '_merged.fastq.gz')
                    control_sample_merge_path = os.path.join(fastq_folder, 'control_' + sample + '_merged.fastq.gz')
                    mergeReads(reads['read1'], reads['read2'], sample_merge_path)
                    mergeReads(reads['controlread1'], reads['controlread2'], control_sample_merge_path)

                    sample_alignment_path = os.path.join(aligned_folder, sample + '.sam')
                    control_sample_alignment_path = os.path.join(aligned_folder, 'control_' + sample + '.sam')

                    # Merged reads are aligned as single-end input: the read2 slot is ''.
                    alignReads(self.BWA_path, self.reference_genome,
                               sample_merge_path, '', sample_alignment_path)
                    alignReads(self.BWA_path, self.reference_genome,
                               control_sample_merge_path, '', control_sample_alignment_path)

                    self.merged[sample] = sample_alignment_path
                logger.info('Finished merging and aligning reads.')

            except Exception:
                logger.error('Error aligning')
                logger.error(traceback.format_exc())
                sys.exit()
        else:
            logger.info('Aligning reads...')
            try:
                self.aligned = {}
                self.aligned_sorted = {}
                for sample in self.samples:
                    reads = self.samples[sample]
                    sample_alignment_path = os.path.join(aligned_folder, sample + '.sam')
                    control_sample_alignment_path = os.path.join(aligned_folder, 'control_' + sample + '.sam')
                    alignReads(self.BWA_path, self.reference_genome,
                               reads['read1'], reads['read2'], sample_alignment_path)
                    alignReads(self.BWA_path, self.reference_genome,
                               reads['controlread1'], reads['controlread2'], control_sample_alignment_path)
                    self.aligned[sample] = sample_alignment_path
                    self.aligned_sorted[sample] = os.path.join(aligned_folder, sample + '_sorted.bam')
                logger.info('Finished aligning reads to genome.')

            except Exception:
                logger.error('Error aligning')
                logger.error(traceback.format_exc())
                sys.exit()

    def findCleavageSites(self):
        """Run findCleavageSites.compare() for every sample against its control.

        Exits the process after logging if any sample raises.
        """
        logger.info('Identifying off-target cleavage sites.')

        try:
            aligned_folder = os.path.join(self.analysis_folder, 'aligned')
            # Merged analysis reads <sample>.bam; paired analysis reads <sample>_sorted.bam.
            bam_suffix = '.bam' if self.merged_analysis else '_sorted.bam'
            for sample in self.samples:
                sorted_bam_file = os.path.join(aligned_folder, sample + bam_suffix)
                control_sorted_bam_file = os.path.join(aligned_folder, 'control_' + sample + bam_suffix)
                identified_sites_file = os.path.join(self.analysis_folder, 'identified', sample)
                logger.info('Window: {0}, MAPQ: {1}, Gap: {2}, Start {3}, Mismatches {4}, Search_Radius {5}'.format(self.window_size, self.mapq_threshold, self.gap_threshold, self.start_threshold, self.mismatch_threshold, self.search_radius))
                # NOTE(review): read_count_cutoff is fed from read_threshold here;
                # self.read_count_cutoff parsed from the manifest is not used - confirm intent.
                findCleavageSites.compare(self.reference_genome, sorted_bam_file, control_sorted_bam_file, self.samples[sample]['target'],
                                          self.search_radius, self.window_size, self.mapq_threshold, self.gap_threshold,
                                          self.start_threshold, self.mismatch_threshold, sample, self.samples[sample]['description'],
                                          identified_sites_file, self.all_chromosomes, merged=self.merged_analysis,
                                          read_count_cutoff=self.read_threshold, read_length=self.read_length)
        except Exception:
            logger.error('Error identifying off-target cleavage site.')
            logger.error(traceback.format_exc())
            sys.exit()

    def visualize(self):
        """Draw the off-target plot for every sample except one named 'control'.

        Each sample is handled in its own try/except so that one failing sample
        (e.g. one without off-targets) does not stop the remaining samples.
        """
        logger.info('Visualizing off-target sites')

        for sample in self.samples:
            if sample == 'control':
                continue
            try:
                infile = os.path.join(self.analysis_folder, 'identified', sample + '_identified_matched.txt')
                outfile = os.path.join(self.analysis_folder, 'visualization', sample + '_offtargets')
                visualizeOfftargets(infile, outfile, title=sample, PAM=self.PAM,
                                    genome=self.genome, refseq_names=self.refseq_names)
            except Exception:
                logger.error('Error visualizing off-target sites: %s'%(sample))
                logger.error(traceback.format_exc())
        logger.info('Finished visualizing off-target sites')

    def callVariants(self):
        """Run callVariants.getVariants() per sample when variant_analysis is enabled.

        Does nothing when variant_analysis is off. Exits the process after
        logging if any sample raises.
        """
        try:
            if self.variant_analysis:
                logger.info('Identifying genomic variants')

                for sample in self.samples:
                    sorted_bam_file = os.path.join(self.analysis_folder, 'aligned', sample + '.bam')
                    identified_sites_file = os.path.join(self.analysis_folder, 'identified', sample + '_identified_matched.txt')
                    variants_basename = os.path.join(self.analysis_folder, 'variants', sample)
                    logger.info('Mismatches {0}, Search_Radius {1}'.format(self.mismatch_threshold, self.search_radius))
                    callVariants.getVariants(identified_sites_file, self.reference_genome, sorted_bam_file,
                                             variants_basename, self.search_radius, self.mismatch_threshold)

                logger.info('Finished identifying genomic variants')

        except Exception:
            logger.error('Error identifying genomic variants.')
            logger.error(traceback.format_exc())
            sys.exit()

    def parallel(self, manifest_path, lsf, run='all'):
        """Submit one job per sample by prefixing this script's command line with `lsf`.

        Parameters:
            manifest_path: manifest path forwarded to each job via --manifest.
            lsf: scheduler command; split on whitespace and used as the argv prefix.
            run: sub-command each job executes.

        Errors are logged but do not terminate the process.
        """
        logger.info('Submitting parallel jobs')
        current_script = __file__

        try:
            for sample in self.samples:
                cmd = 'python {0} {1} --manifest {2} --sample {3}'.format(current_script, run, manifest_path, sample)
                logger.info(cmd)
                # The whole command is passed to the scheduler as one trailing argument.
                subprocess.call(lsf.split() + [cmd])
            logger.info('Finished job submission')

        except Exception:
            logger.error('Error submitting jobs.')
            logger.error(traceback.format_exc())

    def referenceFree(self):
        """Placeholder; currently does nothing."""
        pass
def main():
    """Dispatch the parsed sub-command to the matching CircleSeq pipeline steps.

    'parallel' parses the full manifest and submits one job per sample. Every
    other handled command parses the manifest for the requested sample and runs
    its steps in order. Commands without an entry here do nothing.
    """
    args = parse_args()

    if args.command == 'parallel':
        c = CircleSeq()
        c.parseManifest(args.manifest)
        c.parallel(args.manifest, args.lsf, args.run)
        return

    # Ordered CircleSeq method names to run for each sub-command.
    steps_by_command = {
        'all': ('alignReads', 'findCleavageSites', 'visualize', 'callVariants'),
        'align': ('alignReads',),
        'identify': ('findCleavageSites',),
        'merge': ('alignReads',),
        'visualize': ('visualize',),
        'variants': ('callVariants',),
    }
    steps = steps_by_command.get(args.command)
    if steps is None:
        return

    c = CircleSeq()
    c.parseManifest(args.manifest, args.sample)
    if args.command == 'merge':
        # CircleSeq has no mergeAlignReads(); merging is the merged_analysis path
        # of alignReads(), so select that path explicitly for this command.
        c.merged_analysis = True
    for step in steps:
        getattr(c, step)()
def mergeReads(fastq1_filename, fastq2_filename, out):
    """Join each read pair into one FASTQ record and write it gzip-compressed.

    For every pair the merged sequence is reverseComplement(read1) + read2 and
    the merged quality string is reversed(read1 quality) + read2 quality. The
    header and separator lines are taken from read 1.

    Parameters:
        fastq1_filename: read 1 FASTQ path.
        fastq2_filename: read 2 FASTQ path.
        out: path of the gzip file to write.
    """
    fastq1_file = fq(fastq1_filename)
    fastq2_file = fq(fastq2_filename)
    # itertools.izip only exists on Python 2; the builtin zip is the lazy
    # equivalent on Python 3.
    pair_reads = getattr(itertools, 'izip', zip)

    with gzip.open(out, 'wb') as o:
        for r1, r2 in pair_reads(fastq1_file, fastq2_file):
            merged_sequence = reverseComplement(r1[1]) + r2[1]
            merged_quality_scores = r1[3][::-1] + r2[3]
            record = '\n'.join([r1[0], merged_sequence, r1[2], merged_quality_scores]) + '\n'
            # The handle is binary: encode text on Python 3, pass Python 2 byte
            # strings through untouched.
            if not isinstance(record, bytes):
                record = record.encode('utf-8')
            o.write(record)


def main():
    """Command-line entry point: parse the arguments and run mergeReads()."""
    parser = argparse.ArgumentParser(description='Merge CIRCLE-seq reads for alignment.')
    parser.add_argument('--read1', help='Read 1 filename', required=True)
    parser.add_argument('--read2', help='Read 2 filename', required=True)
    parser.add_argument('--out', help='Output filename', required=True)

    args = parser.parse_args()

    mergeReads(args.read1, args.read2, args.out)
def analyze(fastq1_filename, fastq2_filename, targetsite, out_base, name='', cells='', mismatch_threshold=7):
    """Tabulate off-target candidates straight from read pairs, without a reference.

    Each pair is joined as reverseComplement(read1) + read2, the slice
    [130:170] of the joined sequence is aligned to `targetsite`, and reads are
    grouped by the off-target sequence that alignment reports.

    Writes ``<out_base>.txt`` with one "sequence count" line per off-target
    (most common first) and one FASTA file per off-target holding its joined reads.

    Parameters:
        fastq1_filename: read 1 FASTQ path.
        fastq2_filename: read 2 FASTQ path.
        targetsite: target site sequence to align against.
        out_base: prefix for all output files.
        name: accepted but not used by this function.
        cells: accepted but not used by this function.
        mismatch_threshold: passed to alignSequences as max_score.
    """
    read_count = 0
    c = collections.Counter()
    d = collections.defaultdict(list)

    fastq1_file = fq(fastq1_filename)
    fastq2_file = fq(fastq2_filename)
    # itertools.izip only exists on Python 2; builtin zip is lazy on Python 3.
    pair_reads = getattr(itertools, 'izip', zip)
    for r1, r2 in pair_reads(fastq1_file, fastq2_file):
        r1_sequence = r1[1].rstrip('\n')
        r2_sequence = r2[1].rstrip('\n')
        joined_seq = reverseComplement(r1_sequence) + r2_sequence
        truncated_joined_seq = joined_seq[130:170]

        sequence_data = alignSequences(targetsite, truncated_joined_seq, max_score=mismatch_threshold)
        offtarget = sequence_data[0]

        if offtarget:
            c[offtarget] += 1
            d[offtarget].append(joined_seq)

        read_count += 1
        # Progress indicator on stderr every 100,000 read pairs (in millions).
        if not read_count % 100000:
            print(read_count/float(1000000), end=" ", file=sys.stderr)

    print('Finished tabulating reference-free discovery counts.', file=sys.stderr)
    out_filename = out_base + '.txt'

    with open(out_filename, 'w') as o:
        for target_sequence, target_count in c.most_common():
            print(target_sequence, target_count, file=o)
            off_target_fasta_filename = '{0}_{1:04d}_{2}.fasta'.format(out_base, target_count, target_sequence)
            with open(off_target_fasta_filename, 'w') as off_target_fasta_file:
                for j, sequence in enumerate(d[target_sequence], 1):
                    print('>{0:04d}_{1}_{2}'.format(target_count, target_sequence, j), file=off_target_fasta_file)
                    print(sequence, file=off_target_fasta_file)


def join_write_output(fastq1_filename, fastq2_filename, out):
    """Write every read pair as a FASTA record of reverseComplement(read1) + read2.

    Parameters:
        fastq1_filename: read 1 FASTQ path.
        fastq2_filename: read 2 FASTQ path.
        out: path of the FASTA file to write.
    """
    fastq1_file = fq(fastq1_filename)
    fastq2_file = fq(fastq2_filename)
    # itertools.izip only exists on Python 2; builtin zip is lazy on Python 3.
    pair_reads = getattr(itertools, 'izip', zip)

    with open(out, 'w') as o:
        for r1, r2 in pair_reads(fastq1_file, fastq2_file):
            header = '>{0}'.format(r1[0])
            r1_sequence = r1[1].rstrip('\n')
            r2_sequence = r2[1].rstrip('\n')
            joined_seq = reverseComplement(r1_sequence) + r2_sequence
            # The header is printed without an added newline; the sequence
            # lines are stripped above while r1[0] is used as read.
            print(header, end='', file=o)
            print(joined_seq, file=o)


def main():
    """Command-line entry point: parse the arguments and run analyze()."""
    parser = argparse.ArgumentParser(description='Identify off-target candidates from Illumina short read sequencing data.')
    parser.add_argument('--fq1', help='FASTQ Read 1', required=True)
    parser.add_argument('--fq2', help='FASTQ Read 2', required=True)
    parser.add_argument('--targetsite', help='Targetsite Sequence', required=True)
    parser.add_argument('--name', help='Targetsite Name', required=False)
    parser.add_argument('--cells', help='Cells', required=False)
    parser.add_argument('--mismatch_threshold', help='Maximum score threshold', default=7, type=int)
    parser.add_argument('--out', help='Output file base', required=True)
    args = parser.parse_args()

    analyze(args.fq1, args.fq2, args.targetsite, args.out, args.name, args.cells, args.mismatch_threshold)
# Complement table for upper-case A/C/G/T, built once. str.maketrans exists on
# Python 3; on Python 2 the `or` falls through to string.maketrans.
_COMPLEMENT_TABLE = (getattr(str, 'maketrans', None) or string.maketrans)("ACGT", "TGCA")


def _readFastqLine(handle):
    """Return the next line of `handle` as text, without its trailing newline."""
    line = handle.readline()
    # gzip handles opened with 'rb' return bytes on Python 3; on Python 2 the
    # line is already a str and is left untouched.
    if not isinstance(line, str):
        line = line.decode('utf-8')
    return line.rstrip('\n')


def fq(file):
    """Yield FASTQ records as [header, sequence, separator, quality] lists.

    FASTQ generator function from umi package. Files whose name ends in '.gz'
    are read through gzip; anything else is opened as plain text. Trailing
    newlines are stripped from every line. Iteration stops at end of file or
    at the first record whose header line is empty.
    """
    # Literal suffix test; the previous pattern '.gz$' treated '.' as a wildcard.
    if file.endswith('.gz'):
        fastq = gzip.open(file, 'rb')
    else:
        fastq = open(file, 'r')
    with fastq as f:
        while True:
            l1 = _readFastqLine(f)
            if not l1:
                break
            l2 = _readFastqLine(f)
            l3 = _readFastqLine(f)
            l4 = _readFastqLine(f)
            yield [l1, l2, l3, l4]


def reverseComplement(sequence):
    """Return the reverse complement of `sequence`.

    Only upper-case A, C, G and T are complemented; every other character is
    kept as-is (the string is still reversed).
    """
    return sequence.translate(_COMPLEMENT_TABLE)[::-1]
def checkIfFasta(filepath):
    """Check that the reference file exists (delegates to exists(), which exits if not)."""
    # First check if file exists
    exists(os.path.abspath(filepath))


def checkIfFolder(folderpath):
    """Log an error and exit when `folderpath` is not an existing directory."""
    # Check if the folder exists
    if not os.path.isdir(os.path.abspath(folderpath)):
        logger.error('{0} is not a valid folder path'.format(folderpath))
        sys.exit()


def checkIfValidUndemultiplexed(undemultiplexed):
    """Validate the undemultiplexed manifest section.

    The mapping must have exactly the keys "forward", "reverse", "index1" and
    "index2", and each value must be the path of an existing file. Every
    invalid path is reported before exiting, so one run shows all problems.

    Parameters:
        undemultiplexed: mapping of field name to file path.
    """
    # Check if forward, reverse, index1, and index2 exist
    fields = ['forward', 'reverse', 'index1', 'index2']

    if set(fields) != set(undemultiplexed.keys()):
        logger.error('Undemultiplexed field must contain references to "forward", "reverse", "index1", "index2"')
        sys.exit()

    invalid_file = False
    for field in fields:
        if not os.path.isfile(undemultiplexed[field]):
            # Name the offending field; the message used to say "read1" for every field.
            logger.error('"{0}" undemultiplexed field does not reference a valid file'.format(field))
            invalid_file = True

    if invalid_file:
        sys.exit()
def validateManifest(manifest_data):
    """Check that the manifest has every required field, then validate each one.

    Every missing field is logged before exiting, so one run reports all of
    them. "samtools" is part of the required set because its value is
    validated below; without the check a missing key surfaced as a bare
    KeyError instead of the intended message.

    Parameters:
        manifest_data: mapping loaded from the manifest file.
    """
    # Check if manifest contains the required fields
    fields = ['bwa', 'samtools', 'reference_genome', 'analysis_folder', 'samples']
    missing_fields = [field for field in fields if field not in manifest_data]

    for field in missing_fields:
        logger.error('"{0}" field must be specified in manifest'.format(field))

    if missing_fields:
        sys.exit()

    # Now validate each field
    checkIfBinary(manifest_data['bwa'])
    checkIfBinary(manifest_data['samtools'])
    checkIfFasta(manifest_data['reference_genome'])
    checkIfValidSamples(manifest_data['samples'])
def refseqID_to_HGNC_symbol(x, myDict):
    """Replace the ID inside an annotation's parentheses with its mapped symbol.

    The ID is taken from the second whitespace-separated token, up to the
    first comma, with parentheses removed. `x` is returned unchanged when it
    has no "(", has fewer than two tokens, or the ID is not a key of `myDict`.
    """
    if "(" in x:
        tokens = x.split()
        # A single-token annotation has no second token to read the ID from.
        if len(tokens) < 2:
            return x
        ID = tokens[1].split(",")[0].replace(")", "").replace("(", "")
        if ID in myDict:
            return x.replace(ID, myDict[ID])
    return x


def reformat_homer_annotation(r):
    """Return the row's Annotation, appending "(Gene Name)" for Intergenic rows."""
    if r.Annotation == "Intergenic":
        return "%s (%s)" % (r.Annotation, r['Gene Name'])
    return r.Annotation


def parse_homer(identified, homer_output, genome, refseq_names=None):
    """Annotate the identified-sites table with annotatePeaks.pl and save a copy.

    Runs ``annotatePeaks.pl <identified> <genome> > <homer_output>``, joins its
    Annotation column onto the identified table by 'Genomic Coordinate', and
    optionally maps IDs to symbols through the `refseq_names` table.

    Returns:
        Path of the written table: `identified` with ".txt" replaced by ".annot.tsv".
    """
    select_col = "Annotation"
    # NOTE: arguments are interpolated unquoted into a shell command line.
    command = "annotatePeaks.pl %s %s > %s" % (identified, genome, homer_output)
    os.system(command)
    df = pd.read_csv(identified, sep="\t")
    df.index = df['Genomic Coordinate'].to_list()
    df2 = pd.read_csv(homer_output, sep="\t", index_col=0)
    df2[select_col] = df2.apply(reformat_homer_annotation, axis=1)
    # Index-aligned assignment: rows are matched on the genomic coordinate.
    df['Annotation'] = df2[select_col]
    if refseq_names is not None:
        myDict = parse_HGNC(refseq_names)
        df['Annotation'] = [refseqID_to_HGNC_symbol(x, myDict) for x in df.Annotation]
    out = identified.replace(".txt", ".annot.tsv")
    df.to_csv(out, sep="\t", index=False)
    return out


def get_int(x):
    """Return `x` converted to int via float, or "" when it cannot be converted.

    Going through float accepts values such as "3.0". None, empty or
    non-numeric strings, and non-finite values (nan/inf) all give "".
    """
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        return ""


def parse_HGNC(f):
    """Read a tab-separated table and return a dict mapping column "#name" to "name2".

    Rows with a missing value in either column are dropped.
    """
    refseq = "#name"
    symbol = "name2"
    df = pd.read_csv(f, sep="\t")
    df = df[[refseq, symbol]]
    df = df.dropna()
    df.index = df[refseq].to_list()
    return df[symbol].to_dict()


def parseSitesFile(infile):
    """Parse a tab-separated identified-sites file into off-target records.

    The first line is skipped as a header. Columns are read by index:
    3 coordinate, 4 read count, 7 off-target sequence without bulge,
    8 mismatch count, 9 bulged off-target sequence, 14 target sequence,
    15 realigned target sequence, 16 annotation (optional, "" when absent).
    Rows where both off-target sequences are empty are skipped.

    Returns:
        (offtargets, target_seq, total_seq): the records sorted by read count,
        highest first; the target sequence of the last row read ('' when the
        file has no data rows); and the number of non-empty off-target
        sequences, with bulged and non-bulged counted separately.
    """
    offtargets = []
    total_seq = 0
    # Defined up front so a header-only file returns '' instead of raising
    # UnboundLocalError at the return statement.
    target_seq = ''
    with open(infile, 'r') as f:
        f.readline()
        for line in f:
            line_items = line.rstrip('\n').split('\t')
            offtarget_reads = line_items[4]
            no_bulge_offtarget_sequence = line_items[7]
            bulge_offtarget_sequence = line_items[9]
            target_seq = line_items[14]
            realigned_target_seq = line_items[15]
            coord = line_items[3]
            num_mismatch = get_int(line_items[8])
            try:
                annot = line_items[16]
            except IndexError:
                # Files that were never annotated have no 17th column.
                annot = ""

            if no_bulge_offtarget_sequence == '' and bulge_offtarget_sequence == '':
                continue
            if no_bulge_offtarget_sequence:
                total_seq += 1
            if bulge_offtarget_sequence:
                total_seq += 1
            offtargets.append({'seq': no_bulge_offtarget_sequence.strip(),
                               'bulged_seq': bulge_offtarget_sequence.strip(),
                               'reads': int(offtarget_reads.strip()),
                               'coord': str(coord),
                               'annot': str(annot),
                               'num_mismatch': str(num_mismatch),
                               'target_seq': target_seq.strip(),
                               'realigned_target_seq': realigned_target_seq.strip()
                               })
    offtargets = sorted(offtargets, key=lambda x: x['reads'], reverse=True)
    return offtargets, target_seq, total_seq


# 3/6/2020 Yichao
def check_mismatch(a, b):
    """Return True when IUPAC codes `a` and `b` share no base, i.e. they mismatch."""
    from Bio.Data import IUPACData
    dna_dict = IUPACData.ambiguous_dna_values
    set_a = dna_dict[a.upper()]
    set_b = dna_dict[b.upper()]
    return not set(set_a).intersection(set_b)
def _draw_aligned_row(dwg, sequence, reference, x_offset, y):
    """Draw one off-target sequence as a row aligned against *reference*."""
    k = 0  # column in the reference row; a '-' in the reference does not advance it
    for c, r in zip(sequence, reference):
        x = x_offset + k * box_size
        if r == '-':
            # Extra base relative to the reference: a smaller box shifted a
            # quarter column to the left. Nothing is drawn at column 0.
            if 0 < k < len(reference):
                x = x_offset + (k - 0.25) * box_size
                dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
        elif c == r:
            # Same base as the reference: a bullet replaces the letter.
            dwg.add(dwg.text(u"\u2022", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
            k += 1
        elif r == 'N':
            # Reference is N: print the base without a colored box.
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1
        else:
            # Differs from the reference: colored box plus the base letter.
            dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
            dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Courier"))
            k += 1


def _draw_row_labels(dwg, site, label_x, text_y, with_annotation):
    """Write the reads, mismatch, coordinate and optional annotation columns for one site."""
    style = "font-size:15px; font-family:Courier"
    dwg.add(dwg.text(str(site['reads']), insert=(label_x + 20, text_y), fill='black', style=style))
    dwg.add(dwg.text(site['num_mismatch'], insert=(label_x + 130, text_y), fill='black', style=style))
    dwg.add(dwg.text(site['coord'], insert=(label_x + 200, text_y), fill='black', style=style))
    if with_annotation:
        dwg.add(dwg.text(site['annot'], insert=(label_x + 450, text_y), fill='black', style=style))


def visualizeOfftargets(infile, outfile, title, PAM, genome=None,refseq_names=None):
    """Render the identified off-target sites as an SVG alignment figure.

    Args:
        infile: path of the sites table read by ``parseSitesFile``.
        outfile: output path prefix; the figure is saved as ``outfile + '.svg'``.
        title: text drawn above the figure, or None for no title.
        PAM: PAM sequence, located in the target with ``find_PAM`` to place
            the position ticks.
        genome: when not None, ``infile`` is first passed through
            ``parse_homer`` (writing ``outfile + '.raw.homer.tsv'``) and an
            'Annotation' column is drawn.
        refseq_names: forwarded to ``parse_homer``.

    The sites are drawn in the order returned by ``parseSitesFile``. A site
    with both a non-bulged and a bulged alignment takes two rows, joined by
    a closing brace next to a single set of labels.
    """
    output_folder = os.path.dirname(outfile)
    # dirname() is '' for a bare file name, and os.makedirs('') raises, so
    # only create the folder when there is one to create.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    with_annotation = genome is not None
    if with_annotation:
        infile = parse_homer(infile,outfile+".raw.homer.tsv",genome,refseq_names=refseq_names)
    # Get offtargets array from file
    offtargets, target_seq, total_seq = parseSitesFile(infile)

    # Initiate canvas
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(u'100%', 100 + total_seq*(box_size + 1)))

    # Define top and left margins; a title needs extra room at the top.
    x_offset = 20
    if title is not None:
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
    else:
        y_offset = 20

    # Draw ticks. The PAM can be on the left or on the right of the target.
    PAM_index = find_PAM(target_seq,PAM)
    if len(PAM)>=3:
        pam_legend = ['P', 'A', 'M']+['-']*(len(PAM)-3)
    else:
        pam_legend = ["PAM"]
    # Ticks are kept as (1-based position, label) pairs so that a label can
    # never drift away from its position, whatever the PAM length.
    ticks = []
    if PAM_index == 0:
        # PAM on the left: label the PAM, then number the protospacer from
        # the PAM outwards (1, 10, 20, ...).
        ticks.extend(zip(range(1,len(PAM)+1), pam_legend))
        for count, position in enumerate(range(len(PAM)+1,len(target_seq)+1), 1):
            if count % 10 == 0 or count == 1:
                ticks.append((position, count))
    else:
        # PAM on the right: number every 10th base counting back from the
        # PAM, then label the PAM itself.
        for count, position in enumerate(range(PAM_index,0,-1), 1):
            if count % 10 == 0:
                ticks.append((position, count))
        ticks.extend(zip(range(PAM_index+1,len(target_seq)+1), pam_legend))
    for position, label in ticks:
        dwg.add(dwg.text(label, insert=(x_offset + (position - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Courier"))

    # Column headers
    label_x = box_size * (len(target_seq) + 1)
    dwg.add(dwg.text('Reads', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
    dwg.add(dwg.text('Mismatches', insert=(label_x + 90, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
    dwg.add(dwg.text('Coordinates', insert=(label_x + 200, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))
    if with_annotation:
        dwg.add(dwg.text('Annotation', insert=(label_x + 450, y_offset + box_size - 3), style="font-size:15px; font-family:Courier"))

    # Draw aligned sequence rows
    y_offset += 1  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences
    for site in offtargets:
        realigned_target_seq = site['realigned_target_seq']
        no_bulge_offtarget_sequence = site['seq']
        bulge_offtarget_sequence = site['bulged_seq']

        if no_bulge_offtarget_sequence != '':
            line_number += 1
            y = y_offset + line_number * box_size
            _draw_aligned_row(dwg, no_bulge_offtarget_sequence, target_seq, x_offset, y)
        if bulge_offtarget_sequence != '':
            line_number += 1
            y = y_offset + line_number * box_size
            _draw_aligned_row(dwg, bulge_offtarget_sequence, realigned_target_seq, x_offset, y)

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            # Single row for this site: labels sit on that row's baseline.
            _draw_row_labels(dwg, site, label_x, y_offset + box_size * (line_number + 2) - 2, with_annotation)
        else:
            # Two rows for this site: labels sit between them, next to a
            # closing brace that groups the pair.
            text_y = y_offset + box_size * (line_number + 1) + 5
            _draw_row_labels(dwg, site, label_x, text_y, with_annotation)
            dwg.add(dwg.text(u"\u007D", insert=(label_x + 7, text_y), fill='black', style="font-size:23px; font-family:Courier"))
    dwg.save()
parser.add_argument("-g","--genome", help="if specified, homer annotation will be performed", default=None) 323 | parser.add_argument("-a","--annotation", help="refseqID to gene name mapping", default=None) 324 | parser.add_argument("--PAM", help="PAM sequence", default="NGG") 325 | args = parser.parse_args() 326 | 327 | print(args) 328 | 329 | visualizeOfftargets(args.identified_file, args.outfile, args.title, args.PAM,args.genome,args.annotation) 330 | 331 | if __name__ == "__main__": 332 | 333 | main() 334 | -------------------------------------------------------------------------------- /conda_build/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 2.7 3 | -------------------------------------------------------------------------------- /conda_build/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "changeseq" %} 2 | {% set version = "1.2.8" %} 3 | {% set file_ext = "tar.gz" %} 4 | {% set hash_type = "sha256" %} 5 | {% set hash_value = "42dde92e84e63369e4c0f2d6f1135952a6478644df9a6f303d3f93507e1f6573" %} 6 | 7 | package: 8 | name: "{{ name|lower }}" 9 | version: "{{ version }}" 10 | 11 | source: 12 | fn: '{{ name }}-{{ version }}.{{ file_ext }}' 13 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.{{ file_ext }} 14 | '{{ hash_type }}': '{{ hash_value }}' 15 | 16 | build: 17 | number: 0 18 | script: python setup.py install --single-version-externally-managed --record=record.txt 19 | 20 | requirements: 21 | host: 22 | - pip 23 | - python 24 | run: 25 | - python 26 | - bwa=0.7.17 27 | - htseq 28 | - matplotlib 29 | - numpy 30 | - pandas 31 | - pyfaidx 32 | - pygments 33 | - pysam 34 | - pyyaml 35 | - regex 36 | - scipy 37 | - setuptools 38 | - sqlite 39 | - statsmodels 40 | - svgwrite 41 | - yaml 42 | - zlib 43 | - htslib=1.9 44 | - samtools=1.9 45 | 46 | test: 47 | imports: 48 | - changeseq 49 
| 50 | about: 51 | home: https://github.com/tsailabSJ/changeseq 52 | license: GNU General Public License v2 (GPLv2) 53 | license_family: GPL2 54 | license_file: '' 55 | summary: Bioinformatic pipeline for the CHANGE-seq assay. 56 | description: "[![Version][version-shield]][version-url]\n[![Python versions][python-shield]][python-url]\n[![Platforms][platform-shield]][python-url]\n\n\n# CHANGE-seq: Circularization for High-throughput\ 57 | \ Analysis of Nuclease Genome-wide Effects by Sequencing\n\nThis is a repository for CHANGE-seq analytical software, which takes sample-specific paired-end FASTQ files as input and produces a list of CHANGE-seq\ 58 | \ detected off-target cleavage sites as output.\n\n# Summary\n\nThis package implements a pipeline that takes in reads from the CHANGE-seq assay and returns detected cleavage sites as output. The individual\ 59 | \ pipeline steps are:\n\n1. **Merge**: Merge read1 and read2 for easier mapping to genome.\n2. **Read Alignment**: Merged paired end reads from the assay are aligned to the reference genome using the\ 60 | \ BWA-MEM algorithm with default parameters (Li. H, 2009).\n3. **Cleavage Site Identification**: Mapped sites are analyzed to determine which represent high-quality cleavage sites.\n4. 
**Visualization\ 61 | \ of Results**: Identified on-target and off-target cleavage sites are rendered as a color-coded alignment map for easy analysis of results.\n\n# Installation\n\nThe easiest way to install the change-seq\ 62 | \ pipeline is via conda.\n\n```\n\nconda create -n changeseq -c conda-forge -c bioconda -c anaconda -c omnia -c tsailabSJ changeseq\n\nsource activate changeseq\n\nchangeseq.py -h\n\n## BWA 0.7.17 and\ 63 | \ samtools 1.9 are automatically installed\n\n```\n\nAlternatively, you can git clone this repository and install\n\n```\n\ngit clone https://github.com/tsailabSJ/changeseq\n\ncd changeseq\n\npip install\ 64 | \ -r requirements.txt\n\npython setup.py install\n\nchangeseq.py -h\n\n## Please install BWA and samtools if you choose this option\n\n```\n\n## Download Reference Genome\n\nThe CHANGEseq package requires\ 65 | \ a reference genome for read mapping. You can use any genome of your choosing, but for all of our testing and original CHANGE-seq analyses we use hg19 ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)).\ 66 | \ Be sure to (g)unzip the FASTA file before use if it is compressed.\n\n# Usage\n\nThe change-seq pipeline requires a manifest yaml file specifying input files, output directory, and pipeline parameters.\ 67 | \ Once the yaml file is created, users can simply run ``changeseq.py all --manifest /path/to/manifest.yaml``\n\n\nBelow is an example ``manifest.yaml`` file::\n\n reference_genome: /data/joung/genomes/Homo_sapiens_assembly19.fasta\n\ 68 | \ analysis_folder: /data/joung/CHANGE-Seq/test2\n\n bwa: bwa\n samtools: samtools\n\n read_threshold: 4\n window_size: 3\n mapq_threshold: 50\n start_threshold: 1\n gap_threshold:\ 69 | \ 3\n mismatch_threshold: 6\n search_radius: 30\n merged_analysis: True\n\n samples:\n U2OS_exp1_VEGFA_site_1:\n target: GGGTGGGGGGAGTTTGCTCCNGG\n read1: 
/data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R1_001.fastq.gz\n\ 70 | \ read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/1_S1_L001_R2_001.fastq.gz\n controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz\n\ 71 | \ controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz\n description: U2OS_exp1\n U2OS_exp1_EMX1:\n target:\ 72 | \ GAGTCCGAGCAGAAGAAGAANGG\n read1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R1_001.fastq.gz\n read2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/2_S2_L001_R2_001.fastq.gz\n\ 73 | \ controlread1: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R1_001.fastq.gz\n controlread2: /data/joung/sequencing_fastq/150902_M01326_0235_000000000-AHLT8/fastq/4_S4_L001_R2_001.fastq.gz\n\ 74 | \ description: U2OS_exp1\n\n## Quickstart\n\n```\n\ngit clone https://github.com/tsailabSJ/changeseq\n\ncd changeseq/test\n\nchangeseq.py all --manifest CIRCLEseq_MergedTest.yaml\n\n```\n\ 75 | \n# Writing A Manifest File\nWhen running the end-to-end analysis functionality of the CHANGEseq package a number of inputs are required. To simplify the formatting of these inputs and to encourage\ 76 | \ reproducibility, these parameters are inputted into the pipeline via a manifest formatted as a YAML file. YAML files allow easy-to-read specification of key-value pairs. This allows us to easily specify\ 77 | \ our parameters. 
The following fields are required in the manifest:\n\n- `reference_genome`: The absolute path to the reference genome FASTA file.\n- `output_folder`: The absolute path to the folder\ 78 | \ in which all pipeline outputs will be saved.\n- `bwa`: The absolute path to the `bwa` executable\n- `samtools`: The absolute path to the `samtools` executable\n- `read_threshold`: The minimum number\ 79 | \ of reads at a location for that location to be called as a site. We recommend leaving it to the default value of 4.\n- `window_size`: Size of the sliding window, we recommend leaving it to the default\ 80 | \ value of 3.\n- `mapq_threshold`: Minimum read mapping quality score. We recommend leaving it to the default value of 50.\n- `start_threshold`: Tolerance for breakpoint location. We recommend leaving\ 81 | \ it to the default value of 1.\n- `gap_threshold`: Distance between breakpoints. We recommend leaving it to the default value of 3 for Cas9.\n- `mismatch_threshold`: Number of tolerated gaps in the\ 82 | \ fuzzy target search step. We recommend leaving it to the default value of 6.\n- `read_length`: Fastq file read length, default is 151.\n- `PAM`: PAM sequence, default is NGG.\n- `merged_analysis`:\ 83 | \ Whether or not the paired read merging step should take place (`True` or `False`).\n- `samples`: Lists the samples you wish to analyze and the details for each. Each sample name should be nested under the top level samples\ 84 | \ key, and each sample detail should be nested under the sample name. See the sample manifest for an example.\n - For each sample, you must provide the following parameters:\n - `target`:\ 85 | \ Target sequence for that sample. 
Accepts degenerate bases.\n - `read1`: The absolute path to the .FASTQ(.gz) file containing the read1 reads.\n - `read2`: The absolute path to the .FASTQ(.gz)\ 86 | \ file containing the read2 reads.\n - `controlread1`: The absolute path to the .FASTQ(.gz) file containing the control read1 reads.\n - `controlread2`: The absolute path to the .FASTQ(.gz)\ 87 | \ file containing the control read2 reads.\n - `description`: A brief description of the sample\n\n\n# Pipeline Output\nWhen running the full pipeline, the results of each step are outputted\ 88 | \ to the `output_folder` in a separate folder for each step. The output folders and their respective contents are as follows:\n\n- `output_folder/aligned`: Contains an alignment `.sam`, alignment `.bam`,\ 89 | \ sorted `bam`, and `.bai` index file for each sample.\n- `output_folder/fastq`: Merged `.fastq.gz` files for each sample.\n- `output_folder/identified`: Contains tab-delimited `.txt` files for each\ 90 | \ sample containing the identified DSBs, control DSBs, filtered DSBs, and read quantification.\n- `output_folder/visualization`: Contains a `.svg` vector image representing an alignment of all detected\ 91 | \ off-targets to the targetsite for each sample.\n\n# FAQ\n\nNone yet, we will keep this updated as needed.\n\n[version-shield]: https://img.shields.io/conda/v/tsailabsj/changeseq.svg\n[version-url]:\ 92 | \ https://anaconda.org/tsailabSJ/changeseq\n[python-shield]: https://img.shields.io/pypi/pyversions/changeseq.svg\n[python-url]: https://pypi.python.org/pypi/changeseq\n[platform-shield]: https://anaconda.org/tsailabsj/changeseq/badges/platforms.svg\n\ 93 | \n\n" 94 | doc_url: '' 95 | dev_url: '' 96 | 97 | extra: 98 | recipe-maintainers: 99 | - YichaoOU 100 | -------------------------------------------------------------------------------- /example_output.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/example_output.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse>=1.4.0 2 | PyYAML>=3.11 3 | regex>=2018.01.10 4 | HTSeq>=0.6.1p1 5 | pyfaidx>=0.2.7 6 | statsmodels>=0.6.1 7 | pysam>=0.9.1.4 8 | svgwrite>=1.1.6 9 | numpy>=1.11.1 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/NUC_SIMPLE: -------------------------------------------------------------------------------- 1 | # 2 | # This matrix was created by Todd Lowe 12/10/92 3 | # 4 | # Uses ambiguous nucleotide codes, probabilities rounded to 5 | # nearest integer 6 | # 7 | # Lowest score = -4, Highest score = 5 8 | # 9 | # Modified by Shengdar Tsai 1/23/16 10 | A T G C N 11 | A 10 -5 -5 -5 10 12 | T -5 10 -5 -5 10 13 | G -5 -5 10 -5 10 14 | C -5 -5 -5 10 10 15 | N 10 10 10 10 10 -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'shengdar' 2 | -------------------------------------------------------------------------------- /scripts/site_pvalue.R: -------------------------------------------------------------------------------- 1 | #!/apps/lab/aryee/R/R-3.2.3/bin/Rscript --vanilla 2 | 3 | # Usage example using test data from the circleseq repository: 4 | # ./site_pvalue.R ../test/U2OS_EMX1_counts.txt ../test/U2OS_EMX1_counts_pval.txt 5 | 6 | # Usage example using a larger test dataset on erisone: 7 | # ./site_pvalue.R /data/joung/CIRCLE-Seq/complete_analysis/160122_937aa31/output/U2OS_EMX1_counts.txt U2OS_EMX1_counts_pval.txt 8 | 9 | library("ggplot2") 10 | library("scales") 11 | 12 | args <- commandArgs(TRUE) 13 | infile <- args[1] 14 | outfile <- args[2] 15 | #infile 
def reverseComplement(sequence):
    """Return the reverse complement of a DNA *sequence*.

    Only the upper-case bases A, C, G and T are complemented; every other
    character (for example N or lower-case letters) is kept as it is, which
    matches what translating with an "ACGT" -> "TGCA" table does.

    ``string.maketrans`` exists only on Python 2, so the complement is
    looked up in a plain dict, which works on both Python 2 and 3.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement.get(base, base) for base in reversed(sequence))
def alignSequences(targetsite_sequence, window_sequence, max_mismatches=7):
    """Align a target site to a window with fuzzy regexes and return the best match.

    The target site is searched, with and without indels, on the window and
    on its reverse complement. Each hit is scored as
    ``substitutions + 3 * (insertions + deletions)`` and the hit with the
    LOWEST score wins; on a tie the earlier candidate is kept, so ungapped
    alignments are preferred over gapped ones.

    Args:
        targetsite_sequence: target site, passed to ``regexFromSequence``.
        window_sequence: sequence to search in.
        max_mismatches: passed to ``regexFromSequence`` as ``errors``.

    Returns:
        ``[match_sequence, distance, length, strand, start, end]`` for the
        chosen alignment, where ``distance`` is the sum of its fuzzy counts
        and ``start``/``end`` are offsets in the string that was searched
        (the reverse-complemented window when ``strand`` is '-').
        A list of six empty strings when nothing matched.
    """
    query_regex_standard, query_regex_gap = regexFromSequence(targetsite_sequence, errors=max_mismatches)
    reverse_window = reverseComplement(window_sequence)

    # Try both strands, ungapped first.
    alignments = [
        ('+', 'standard', regex.search(query_regex_standard, window_sequence, regex.BESTMATCH)),
        ('-', 'standard', regex.search(query_regex_standard, reverse_window, regex.BESTMATCH)),
        ('+', 'gapped', regex.search(query_regex_gap, window_sequence, regex.BESTMATCH)),
        ('-', 'gapped', regex.search(query_regex_gap, reverse_window, regex.BESTMATCH)),
    ]

    best_distance_score = None
    chosen_alignment = None
    chosen_strand = None
    for strand, alignment_type, match in alignments:
        if match is None:
            continue
        substitutions, insertions, deletions = match.fuzzy_counts
        distance_score = substitutions + (insertions + deletions) * 3
        print(match, distance_score)
        # Lower is better. Starting from None rather than 0 lets a perfect
        # match (score 0) be selected.
        if best_distance_score is None or distance_score < best_distance_score:
            chosen_alignment = match
            chosen_strand = strand  # remember the strand of the winner, not of the last candidate
            best_distance_score = distance_score

    if chosen_alignment is None:
        return [''] * 6

    match_sequence = chosen_alignment.group()
    distance = sum(chosen_alignment.fuzzy_counts)
    length = len(match_sequence)
    start = chosen_alignment.start()
    end = chosen_alignment.end()
    return [match_sequence, distance, length, chosen_strand, start, end]
def main():
    """Align a hard-coded target site to a hard-coded window and print the result."""
    target_site = 'TTTNGGGACGGGGAGAAGGAAAAGAGG'
    window = 'AATTTGGGGGGATTCATTACTCTATTTGGATTTGTTAGGGAGGAAGGCAGGTGGGATTTTTCTTCTCATTCTTATCTCTTTCCTTCTTCCCGTCCCAGAAAGAAACTAAGAATAATAACCAAATTATTAAAATGACTCACCGCCCTTCCA'

    alignment = alignSequences(target_site, window, max_mismatches=7)
    print(alignment)
def main():
    """Print the Levenshtein distance, edit operations and matching blocks for a fixed pair of sequences."""
    window = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    target = 'TTTNCTGATGGTCCATGTCTGTTACTC'

    print(l.distance(window, target))
    operations = l.editops(window, target)
    print(operations)
    # The blocks are derived from a fresh editops() call, as before.
    print(l.matching_blocks(l.editops(window, target), window, target))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Packaging script for the changeseq distribution (setuptools based).

# from distutils.core import setup
from setuptools import setup, find_packages
# The package is imported only to read changeseq.__version__ below.
# NOTE(review): this means the package must be importable when setup.py runs - confirm
# that changeseq/__init__.py does not import third-party dependencies.
import changeseq
## conda skeleton can't find readme
import os
# Use the README as the long description when it is present; otherwise fall
# back to a short placeholder so the build still succeeds.
if os.path.isfile("README.MD"):
    with open("README.MD", "r") as fh:
        long_description = fh.read()
else:
    long_description="change-seq"

setup(
    name='changeseq',
    version=str(changeseq.__version__), # update visualization, run homer peak annotation if available
    description="Bioinformatic pipeline for the CHANGE-seq assay.",
    author="Shengdar Q Tsai, Martin Aryee, Ved V Topkar, Jose Malagon-Lopez",
    author_email='STSAI4@mgh.harvard.edu, Aryee.Martin@mgh.harvard.edu, vedtopkar@gmail.com, jose.lopez@mail.harvard.edu',
    url='https://github.com/tsailabSJ/changeseq',
    # packages=['changeseq','data'],
    packages=find_packages(),
    # package_dir={'changeseq':'changeseq'},
    license='LICENSE',
    # Every module of the package is also installed as a stand-alone script.
    scripts=['changeseq/changeseq.py','changeseq/alignReads.py','changeseq/visualization.py',
             'changeseq/callVariants.py','changeseq/findCleavageSites.py','changeseq/log.py',
             'changeseq/mergeReads.py','changeseq/referenceFree.py','changeseq/utility.py',
             'changeseq/validation.py','changeseq/refseq_gene_name.py'],
    package_data={'test': ["test/*"]},
    # package_data={'':["README.md","data/refseq_gene_name.py"]},
    include_package_data=True,
    long_description=long_description,
    long_description_content_type='text/markdown' ,
    keywords='changeseq',
    # NOTE(review): only Python 2 classifiers are declared here - confirm
    # whether Python 3 is supported before adding it.
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'Topic :: Scientific/Engineering :: Visualization',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'License :: OSI Approved :: GNU General Public License v2 (GPLv2)',
        'Operating System :: Unix',
        'Natural Language :: English',
        "Programming Language :: Python :: 2",
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7'
    ]
)
variant_analysis: True 14 | 15 | samples: 16 | TestSample: 17 | target: GAGTCCGAGCAGAAGAAGAANGG 18 | read1: data/input/TEST.r1.fastq.gz 19 | read2: data/input/TEST.r2.fastq.gz 20 | controlread1: data/input/TEST_control.r1.fastq.gz 21 | controlread2: data/input/TEST_control.r2.fastq.gz 22 | description: TestCell 23 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/__init__.py -------------------------------------------------------------------------------- /test/data/MergedOutput/aligned/TestSample.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/TestSample.bam -------------------------------------------------------------------------------- /test/data/MergedOutput/aligned/TestSample.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/TestSample.bam.bai -------------------------------------------------------------------------------- /test/data/MergedOutput/aligned/TestSample_sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/TestSample_sorted.bam -------------------------------------------------------------------------------- /test/data/MergedOutput/aligned/control_TestSample.bam: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/control_TestSample.bam -------------------------------------------------------------------------------- /test/data/MergedOutput/aligned/control_TestSample.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/control_TestSample.bam.bai -------------------------------------------------------------------------------- /test/data/MergedOutput/aligned/control_TestSample_sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/aligned/control_TestSample_sorted.bam -------------------------------------------------------------------------------- /test/data/MergedOutput/fastq/TestSample_merged.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/fastq/TestSample_merged.fastq.gz -------------------------------------------------------------------------------- /test/data/MergedOutput/fastq/control_TestSample_merged.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/fastq/control_TestSample_merged.fastq.gz -------------------------------------------------------------------------------- /test/data/MergedOutput/identified/TestSample_CONTROL_coordinates.txt: -------------------------------------------------------------------------------- 1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr 
Read2_start_position Read2_strand 2 | -------------------------------------------------------------------------------- /test/data/MergedOutput/identified/TestSample_NUCLEASE_coordinates.txt: -------------------------------------------------------------------------------- 1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read2_start_position Read2_strand 2 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 3 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 4 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 5 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 6 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 7 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 8 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 9 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 10 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 11 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 12 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 13 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 14 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 15 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 16 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 17 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 18 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 19 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 20 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 21 | 
TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 22 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 23 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 24 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 25 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 26 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 27 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 28 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 29 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 + 30 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 31 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 32 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 33 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 + 34 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 35 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 36 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 37 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 38 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 39 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 40 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 41 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 42 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 43 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 44 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 
- 2 10017 + 45 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 46 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 47 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 + 48 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 49 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 50 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 51 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 52 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 53 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 54 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 55 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 56 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 57 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 58 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 59 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 60 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 61 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 62 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 63 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 64 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 + 65 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 66 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 67 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 68 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample.bam 2 10016 - 2 10017 + 69 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 + 70 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 71 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 72 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 73 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 74 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 75 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 76 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 77 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 78 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 79 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10018 + 80 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 81 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 82 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 83 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 84 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 85 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 86 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 87 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 88 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 89 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 90 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 91 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 92 | TestSample 
GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 93 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 94 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 95 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 96 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 97 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 98 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 99 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 100 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 101 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 102 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 103 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 104 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 2 10016 - 2 10017 + 105 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10004 - 8 10006 + 106 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 107 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 108 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 109 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 110 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 111 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 112 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 113 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 114 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 115 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 
10005 - 8 10006 + 116 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 117 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 118 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 119 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 120 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 121 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 122 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 123 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 124 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 125 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 126 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 127 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 128 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 129 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 130 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 131 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 132 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 133 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 134 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 135 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 136 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 137 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 138 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 139 | TestSample 
GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 140 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 141 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 142 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 143 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 144 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 145 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 146 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 147 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 148 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 149 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 150 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 151 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 152 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 153 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 154 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 155 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 156 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 157 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 158 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 159 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 160 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 161 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 162 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample.bam 8 10005 - 8 10006 + 163 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 164 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 165 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 166 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 167 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 168 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 169 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 170 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 171 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 172 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 173 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 174 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 175 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 176 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 177 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 8 10005 - 8 10006 + 178 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 179 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 180 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 181 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 182 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 183 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 184 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 185 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 186 | 
TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 187 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 188 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 189 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 190 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 191 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 192 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 193 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 194 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 195 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 196 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 197 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 198 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 199 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 200 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 201 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 202 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 203 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 204 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 205 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 206 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 207 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 208 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 209 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample.bam 1 10004 - 1 10005 + 210 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 211 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 212 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 213 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 214 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 215 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 216 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 217 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 218 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 219 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 220 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 221 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10005 - 1 10006 + 222 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 223 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 224 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 225 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 226 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 227 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 228 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 229 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 230 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 231 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 232 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 233 | 
TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 234 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 235 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 236 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10005 - 1 10006 + 237 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 238 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 239 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 1 10004 - 1 10005 + 240 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 241 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 242 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 243 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 244 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 245 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 246 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 247 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 248 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 249 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 250 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 251 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 252 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 253 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 254 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 255 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 256 | TestSample 
GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 257 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 258 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 259 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 260 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 261 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 12 10005 - 12 10006 + 262 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 263 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 264 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 265 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 266 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 267 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 268 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 269 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 270 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 271 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 272 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 273 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 274 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 275 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 276 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 277 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 278 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 279 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample.bam 4 10016 - 4 10017 + 280 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 281 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 282 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 283 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 284 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 285 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 286 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 287 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 288 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 289 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample.bam 4 10016 - 4 10017 + 290 | -------------------------------------------------------------------------------- /test/data/MergedOutput/identified/TestSample_count.txt: -------------------------------------------------------------------------------- 1 | #Chromosome zero_based_Position Nuclease_Position_Reads Control_Position_Reads Nuclease_Window_Reads Control_Window_Reads p_Value narrow_p_Value control_p_Value control_narrow_p_Value 2 | 1 10004 60.0 0.0 124.0 0.0 0.0 0.0 0.0 0.0 3 | 1 10005 62.0 0.0 124.0 0.0 0.0 0.0 0.0 0.0 4 | 1 10006 2.0 0.0 124.0 0.0 0.0 0.0 0.0 0.0 5 | 8 10004 1.0 0.0 146.0 0.0 0.0 0.0 0.0 0.0 6 | 8 10005 72.0 0.0 146.0 0.0 0.0 0.0 0.0 0.0 7 | 8 10006 73.0 0.0 146.0 0.0 0.0 0.0 0.0 0.0 8 | 2 10016 103.0 0.0 206.0 0.0 0.0 0.0 0.0 0.0 9 | 2 10017 97.0 0.0 206.0 0.0 0.0 0.0 0.0 0.0 10 | 2 10018 6.0 0.0 206.0 0.0 0.0 0.0 0.0 0.0 11 | 12 10005 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 12 | 12 10006 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 13 | 4 10016 28.0 0.0 56.0 0.0 0.0 0.0 0.0 0.0 14 | 4 10017 28.0 0.0 56.0 0.0 0.0 0.0 0.0 0.0 15 | -------------------------------------------------------------------------------- 
/test/data/MergedOutput/identified/TestSample_identified_matched.txt: -------------------------------------------------------------------------------- 1 | Chromosome Start End Genomic Coordinate Nuclease_Read_Count Strand Control_Read_Count Site_Sequence Site_Substitution_Number Site_Sequence_Gaps_Allowed File_Name Cell Target_site Full_Name Target_Sequence Realigned_Target_Sequence 2 | 12 10000 10023 12:10000-10023 44 - 0.0 GAGTTAGAGCAGAAAAAAAATGG 4 TestSample.bam TestCell TestSample TestSample_TestCell_12:10000-10023_44 GAGTCCGAGCAGAAGAAGAANGG none 3 | 1 10000 10023 1:10000-10023 124 - 0.0 GAAGTAGAGCAGAAGAAGAAGCG 5 AAGT-AGAGCAGAAGAAGAAGCG TestSample.bam TestCell TestSample TestSample_TestCell_1:10000-10023_124 GAGTCCGAGCAGAAGAAGAANGG GAGTCCGAGCAGAAGAAGAANGG 4 | 2 10000 10023 2:10000-10023 206 + 0.0 GAGTCCGAGCAGAAGAAGAAGGG 0 TestSample.bam TestCell TestSample TestSample_TestCell_2:10000-10023_206 GAGTCCGAGCAGAAGAAGAANGG none 5 | 4 10000 10023 4:10000-10023 56 + 0.0 CACTCCAAGTAGAAGAAGAAAAG 5 TestSample.bam TestCell TestSample TestSample_TestCell_4:10000-10023_56 GAGTCCGAGCAGAAGAAGAANGG none 6 | 8 10000 10023 8:10000-10023 146 - 0.0 AAGGCCAAGCAGAAGAGTAATGG 5 TestSample.bam TestCell TestSample TestSample_TestCell_8:10000-10023_146 GAGTCCGAGCAGAAGAAGAANGG none 7 | -------------------------------------------------------------------------------- /test/data/MergedOutput/identified/TestSample_identified_unmatched.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/MergedOutput/identified/TestSample_identified_unmatched.txt -------------------------------------------------------------------------------- /test/data/MergedOutput/visualization/TestSample_offtargets.svg: -------------------------------------------------------------------------------- 1 | 2 | 
TestSample1020PAMGAGTCCGAGCAGAAGAAGAANGGReadsG206AGAGTT146AGTAGCA-AGC124}CCATAA56TAAAT44 -------------------------------------------------------------------------------- /test/data/StandardOutput/aligned/TestSample.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/TestSample.bam -------------------------------------------------------------------------------- /test/data/StandardOutput/aligned/TestSample.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/TestSample.bam.bai -------------------------------------------------------------------------------- /test/data/StandardOutput/aligned/TestSample_sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/TestSample_sorted.bam -------------------------------------------------------------------------------- /test/data/StandardOutput/aligned/control_TestSample.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/control_TestSample.bam -------------------------------------------------------------------------------- /test/data/StandardOutput/aligned/control_TestSample.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/control_TestSample.bam.bai 
-------------------------------------------------------------------------------- /test/data/StandardOutput/aligned/control_TestSample_sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/aligned/control_TestSample_sorted.bam -------------------------------------------------------------------------------- /test/data/StandardOutput/identified/TestSample_CONTROL_coordinates.txt: -------------------------------------------------------------------------------- 1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read1_start_position Read2_strand 2 | -------------------------------------------------------------------------------- /test/data/StandardOutput/identified/TestSample_NUCLEASE_coordinates.txt: -------------------------------------------------------------------------------- 1 | #Name Targetsite_Sequence Cells BAM Read1_chr Read1_start_position Read1_strand Read2_chr Read1_start_position Read2_strand 2 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 3 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 4 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 - 5 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 6 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 7 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 8 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 9 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 10 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 11 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 
12 10006 + 12 10005 - 12 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 13 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 14 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 15 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 16 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 17 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 18 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 19 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 20 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 21 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 - 22 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 23 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 24 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 25 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 26 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 27 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 28 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 29 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 30 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 31 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 + 32 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 33 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample_sorted.bam 1 10005 + 1 10005 - 34 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 35 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 36 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 37 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 38 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 39 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 40 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 41 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 + 42 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 43 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 44 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 45 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 46 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 47 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 48 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 49 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 50 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 51 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 52 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 53 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 54 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 55 | TestSample GAGTCCGAGCAGAAGAAGAANGG 
TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 56 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 57 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 58 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 59 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 + 60 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 61 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 62 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 63 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 - 8 10006 + 64 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 + 65 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 66 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10005 - 12 10006 + 67 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 68 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 69 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 70 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 71 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 72 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 73 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 74 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 75 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 76 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 77 | TestSample 
GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 - 78 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 79 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 80 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 81 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 82 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 83 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 84 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 85 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 86 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 87 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 88 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 89 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 90 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 91 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 92 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 93 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 94 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 95 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 96 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 97 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 98 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 99 | 
TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 100 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 101 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 102 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 103 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 104 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 105 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 106 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 107 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 + 108 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10004 - 109 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 110 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 111 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 112 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 113 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 114 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 + 115 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 116 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 117 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 118 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 119 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 120 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample_sorted.bam 2 10016 - 2 10017 + 121 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 122 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 123 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 124 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 125 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 126 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 127 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10005 - 12 10006 + 128 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 129 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 130 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 131 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 132 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 133 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 + 134 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 135 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 136 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10005 - 137 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 138 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10005 - 12 10006 + 139 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 140 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 141 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 142 | TestSample 
GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 143 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 144 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 145 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 146 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 147 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 148 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 149 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 150 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 151 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 152 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 153 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 154 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 155 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 156 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 157 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 158 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 159 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 160 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 161 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 162 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 163 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 
+ 8 10005 - 164 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 165 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10016 + 166 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 167 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 168 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 169 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 170 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 171 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 172 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 173 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 174 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 175 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 176 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 177 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 178 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 179 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 180 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 181 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 182 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 183 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 184 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 185 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell 
TestSample_sorted.bam 2 10017 + 2 10016 - 186 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 187 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 188 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 189 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 190 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 191 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 192 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 - 1 10005 + 193 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 194 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 195 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 196 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 197 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10017 + 4 10016 - 198 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 199 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 200 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 201 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 202 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 203 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 204 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 205 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10004 - 206 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 207 | TestSample 
GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 208 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 209 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 210 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 211 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 212 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 213 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 214 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 215 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 216 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 217 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 218 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 219 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 220 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 4 10016 - 4 10017 + 221 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10006 + 1 10005 - 222 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 223 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 224 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 225 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 226 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 227 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 228 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 
10016 - 2 10017 + 229 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 230 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 231 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 - 8 10006 + 232 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 233 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 234 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 235 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 236 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 237 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 238 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 - 1 10006 + 239 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10018 + 240 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 241 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 242 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 243 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 244 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 245 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 246 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 247 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 248 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 + 2 10016 - 249 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 250 | TestSample GAGTCCGAGCAGAAGAAGAANGG 
TestCell TestSample_sorted.bam 8 10005 - 8 10006 + 251 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10005 - 252 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10017 + 2 10016 - 253 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 + 1 10004 - 254 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 + 12 10005 - 255 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 256 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 257 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 258 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 - 8 10006 + 259 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 2 10016 - 2 10017 + 260 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10005 - 1 10005 + 261 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 262 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 263 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 264 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 265 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 1 10004 - 1 10005 + 266 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 12 10006 - 12 10006 + 267 | TestSample GAGTCCGAGCAGAAGAAGAANGG TestCell TestSample_sorted.bam 8 10006 + 8 10005 - 268 | -------------------------------------------------------------------------------- /test/data/StandardOutput/identified/TestSample_count.txt: -------------------------------------------------------------------------------- 1 | #Chromosome zero_based_Position Nuclease_Position_Reads Control_Position_Reads Nuclease_Window_Reads Control_Window_Reads p_Value narrow_p_Value 
control_p_Value control_narrow_p_Value 2 | 1 10004 47.0 0.0 108.0 0.0 0.0 0.0 0.0 0.0 3 | 1 10005 59.0 0.0 108.0 0.0 0.0 0.0 0.0 0.0 4 | 1 10006 2.0 0.0 108.0 0.0 0.0 0.0 0.0 0.0 5 | 8 10004 1.0 0.0 126.0 0.0 0.0 0.0 0.0 0.0 6 | 8 10005 59.0 0.0 126.0 0.0 0.0 0.0 0.0 0.0 7 | 8 10006 66.0 0.0 126.0 0.0 0.0 0.0 0.0 0.0 8 | 12 10004 1.0 0.0 70.0 0.0 0.0 0.0 0.0 0.0 9 | 12 10005 21.0 0.0 70.0 0.0 0.0 0.0 0.0 0.0 10 | 12 10006 48.0 0.0 70.0 0.0 0.0 0.0 0.0 0.0 11 | 2 10016 107.0 0.0 184.0 0.0 0.0 0.0 0.0 0.0 12 | 2 10017 73.0 0.0 184.0 0.0 0.0 0.0 0.0 0.0 13 | 2 10018 4.0 0.0 184.0 0.0 0.0 0.0 0.0 0.0 14 | 4 10016 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 15 | 4 10017 22.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 16 | -------------------------------------------------------------------------------- /test/data/StandardOutput/identified/TestSample_identified_matched.txt: -------------------------------------------------------------------------------- 1 | Chromosome Start End Name ReadCount Strand MappingPositionStart MappingPositionEnd WindowName WindowSequence Site_SubstitutionsOnly.Sequence Site_SubstitutionsOnly.NumSubstitutions Site_SubstitutionsOnly.Strand Site_SubstitutionsOnly.Start Site_SubstitutionsOnly.End Site_GapsAllowed.Sequence Site_GapsAllowed.Length Site_GapsAllowed.Score Site_GapsAllowed.Substitutions Site_GapsAllowed.Insertions Site_GapsAllowed.Deletions Site_GapsAllowed.Strand Site_GapsAllowed.Start Site_GapsAllowed.End FileName Cell Targetsite FullName TargetSequence RealignedTargetSequence Position.Pvalue Narrow.Pvalue Position.Control.Pvalue Narrow.Control.Pvalue 2 | 12 10000 10023 12:10000-10023 70 - 10004 10007 12:[10004,10007)/. TTTTCACTTTCCTTTACCATTTTTTTTCTGCTCTAACTCTACC GAGTTAGAGCAGAAAAAAAATGG 4 - 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_12:10000-10023_70 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0 3 | 1 10000 10023 1:10000-10023 108 - 10004 10007 1:[10004,10007)/. 
GAACTTGCGGAAGGTCCGCTTCTTCTTCTGCTCTACTTCTGCC GAAGTAGAGCAGAAGAAGAAGCG 5 - 10000 10023 AAGTA-GAGCAGAAGAAGAAGCG 22 6 3 0 1 - 10000 10022 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_1:10000-10023_108 GAGTCCGAGCAGAAGAAGAANGG GAGTCCGAGCAGAAGAAGAANGG 0.0 0.0 0.0 0.0 4 | 2 10000 10023 2:10000-10023 184 + 10016 10019 2:[10016,10019)/. GCCTGAGTCCGAGCAGAAGAAGAAGGGCTCCCATCACATCAAC GAGTCCGAGCAGAAGAAGAAGGG 0 + 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_2:10000-10023_184 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0 5 | 4 10000 10023 4:10000-10023 44 + 10016 10018 4:[10016,10018)/. TGATCACTCCAAGTAGAAGAAGAAAAGCTAGCTTCCATATAA CACTCCAAGTAGAAGAAGAAAAG 5 + 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_4:10000-10023_44 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0 6 | 8 10000 10023 8:10000-10023 126 - 10004 10007 8:[10004,10007)/. GCACTAGAATCCCAGGCCATTACTCTTCTGCTTGGCCTTTTGG AAGGCCAAGCAGAAGAGTAATGG 5 - 10000 10023 TestSample_sorted.bam TestCell TestSample TestSample_TestCell_8:10000-10023_126 GAGTCCGAGCAGAAGAAGAANGG none 0.0 0.0 0.0 0.0 7 | -------------------------------------------------------------------------------- /test/data/StandardOutput/identified/TestSample_identified_unmatched.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/StandardOutput/identified/TestSample_identified_unmatched.txt -------------------------------------------------------------------------------- /test/data/StandardOutput/variants/TestSample_Variants.txt: -------------------------------------------------------------------------------- 1 | Chromosome Start End Name ReadCount Strand Variant_WindowSequence Variant_Site_SubstitutionsOnly.Sequence Variant_Site_SubstitutionsOnly.NumSubstitutions Variant_Site_SubstitutionsOnly.Strand Variant_Site_GapsAllowed.Sequence Variant_Site_GapsAllowed.Length 
Variant_Site_GapsAllowed.Substitutions Variant_Site_GapsAllowed.Insertions Variant_Site_GapsAllowed.Deletions Variant_Site_GapsAllowed.Strand Cell Targetsite TargetSequence Variant_RealignedTargetSequence Reference Variant Genotype Quality 2 | 4 10000 10023 4:10000-10023 44 + TGATCACTCCAAGcAGAAGAAGAAAAGCTAGCTTCCATATAA CACTCCAAGcAGAAGAAGAAAAG 4 + TestCell TestSample GAGTCCGAGCAGAAGAAGAANGG none T C 1|1 162.998 3 | -------------------------------------------------------------------------------- /test/data/StandardOutput/variants/TestSample_mpileupCall.txt: -------------------------------------------------------------------------------- 1 | targetsite site_name chromosome one_based_position reference variant quality genotype depth PL 2 | TestSample TestSample_12:10000-10023 12 10000 A T 162.998 1|1 25 196_75_0 3 | TestSample TestSample_12:10000-10023 12 10025 A G 119.008 0|1 23 149_0_159 4 | TestSample TestSample_4:10000-10023 4 10010 T C 162.998 1|1 23 196_69_0 5 | -------------------------------------------------------------------------------- /test/data/StandardOutput/visualization/TestSample_offtargets.svg: -------------------------------------------------------------------------------- 1 | 2 | TestSample20101PAMGAGTCCGAGCAGAAGAAGAANGGReadsG184AGAGTT126AGTAGCAA-GC108}TAAAT70CCATAA44 -------------------------------------------------------------------------------- /test/data/input/CIRCLEseq_test_genome.fa.amb: -------------------------------------------------------------------------------- 1 | 100270 5 0 2 | -------------------------------------------------------------------------------- /test/data/input/CIRCLEseq_test_genome.fa.ann: -------------------------------------------------------------------------------- 1 | 100270 5 11 2 | 0 2 (null) 3 | 0 20178 0 4 | 0 8 (null) 5 | 20178 20023 0 6 | 0 1 (null) 7 | 40201 20023 0 8 | 0 12 (null) 9 | 60224 20023 0 10 | 0 4 (null) 11 | 80247 20023 0 12 | 
-------------------------------------------------------------------------------- /test/data/input/CIRCLEseq_test_genome.fa.bwt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/CIRCLEseq_test_genome.fa.bwt -------------------------------------------------------------------------------- /test/data/input/CIRCLEseq_test_genome.fa.fai: -------------------------------------------------------------------------------- 1 | 2 20178 3 20178 20179 2 | 8 20023 20185 20023 20024 3 | 1 20023 40212 20023 20024 4 | 12 20023 60240 20023 20024 5 | 4 20023 80267 20023 20024 6 | -------------------------------------------------------------------------------- /test/data/input/CIRCLEseq_test_genome.fa.pac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/CIRCLEseq_test_genome.fa.pac -------------------------------------------------------------------------------- /test/data/input/CIRCLEseq_test_genome.fa.sa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/CIRCLEseq_test_genome.fa.sa -------------------------------------------------------------------------------- /test/data/input/TEST.r1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST.r1.fastq.gz -------------------------------------------------------------------------------- /test/data/input/TEST.r2.fastq.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST.r2.fastq.gz -------------------------------------------------------------------------------- /test/data/input/TEST_control.r1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST_control.r1.fastq.gz -------------------------------------------------------------------------------- /test/data/input/TEST_control.r2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tsailabSJ/changeseq/73e1424c3713140e0d04ba227786a0144e43faaa/test/data/input/TEST_control.r2.fastq.gz -------------------------------------------------------------------------------- /test/scripts/CIRCLEseq_prepare_test_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ##################################################################################### 3 | ### CIRCLEseq_prepare_test_data.sh: assemble the fastq files for the test 4 | ##################################################################################### 5 | ### Regions 6 | on_target="2:73160981-73161004" 7 | off_target01="8:120587494-120587517" 8 | off_target02="1:234492858-234492881" 9 | off_target03="12:73504668-73504691" 10 | off_target04="4:48639390-48639413" 11 | hotspots="1:121485221-121485228" 12 | 13 | ### Get the names of reads that overlap with the selected test regions 14 | samtools view sample.bam $on_target $off_target01 $off_target02 $off_target03 $off_target04 $hotspots | cut -f1 | sort | uniq > sample_read_names.txt 15 | samtools view control.bam $on_target $off_target01 $off_target02 $off_target03 $off_target04 $hotspots | cut -f1 | sort | uniq > control_read_names.txt 16 | cat sample_read_names.txt control_read_names.txt 
> read_names.txt 17 | 18 | ### Subset FASTQs to extract _all_ read pairs where at least one of the reads falls in a specified test region 19 | zcat fastq/128_S3_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST.r1.fastq.gz 20 | zcat fastq/128_S3_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST.r2.fastq.gz 21 | zcat fastq/Negative_S1_L001_R1_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST_control.r1.fastq.gz 22 | zcat fastq/Negative_S1_L001_R2_001.fastq.gz | grep -F -A3 --no-group-separator -f read_names.txt | gzip -c > TEST_control.r2.fastq.gz 23 | -------------------------------------------------------------------------------- /test/scripts/CIRCLEseq_prepare_test_reference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ##################################################################################### 3 | ### CIRCLEseq_prepare_test_reference.sh: assemble the test reference 4 | ##################################################################################### 5 | ### Get chromosomes 6 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.1.fa.gz 7 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.2.fa.gz 8 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.4.fa.gz 9 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.8.fa.gz 10 | wget ftp://ftp.ensembl.org/pub/grch37/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.12.fa.gz 11 | 12 | ### Assemble reference 13 | cat *.fa.gz > Homo_sapiens.GRCh37.subset.fa.gz 14 | gunzip Homo_sapiens.GRCh37.subset.fa.gz 15 | samtools faidx Homo_sapiens.GRCh37.subset.fa 16 | 17 | ### Pad test regions with 10kb on either side 18 
| bedtools slop -i CIRCLEseq_test.bed -g Homo_sapiens.GRCh37.subset.fa.fai -b 10000 > CIRCLEseq_test_padded.bed 19 | 20 | ### Extract sequences from reference file for each padded interval 21 | bedtools getfasta -fi Homo_sapiens.GRCh37.subset.fa -bed CIRCLEseq_test_padded.bed -fo CIRCLEseq_test_genome.fa -name 22 | -------------------------------------------------------------------------------- /test/scripts/CIRCLEseq_test_bed.R: -------------------------------------------------------------------------------- 1 | ###################################################################################################### 2 | ### CIRCLEseq_test_bed.R: make bed file with regions including 3 | ### on-target site, 2 off-target sites without variants, 4 | ### 2 off-target sites with variants, and 1 region without off-targets. 5 | ###################################################################################################### 6 | bed = data.frame(chr=c('2', '8', '1', '12', '4'), start=c(73160981, 120587494, 234492858, 73504668, 48639390), end=c(73161159, 120587517, 234492881, 73504691, 48639413), name=c('2', '8', '1', '12', '4')) 7 | 8 | write.table(bed, 'CIRCLEseq_test.bed', quote=FALSE, row.names=FALSE, col.names=FALSE, sep='\t') 9 | 10 | ### Information about the sites 11 | on_target="2:73160981-73161004" 12 | off_target01="8:120587494-120587517" 13 | off_target02="1:234492858-234492881" 14 | off_target_with_variantWindowOnly="12:73504668-73504691" 15 | off_target_with_variants="4:48639390-48639413" 16 | hotspots="2:73161104-73161159" 17 | ###################################################################################################### 18 | ###################################################################################################### 19 | -------------------------------------------------------------------------------- /test/scripts/Test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python 
../../circleseq/circleseq.py all --manifest ../CIRCLEseq_MergedTest.yaml 3 | 4 | python ../../circleseq/circleseq.py all --manifest ../CIRCLEseq_StandardTest.yaml 5 | -------------------------------------------------------------------------------- /test/test_circleseq_merged.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_circleseq_merged 6 | ---------------------------------- 7 | 8 | Tests for `circleseq` module: runs the pipeline configured by CIRCLEseq_MergedTest.yaml and compares each output folder with the reference copies under data/MergedOutput. 9 | """ 10 | 11 | import yaml 12 | import unittest 13 | import os 14 | import shutil 15 | import utils 16 | from circleseq import circleseq 17 | 18 | TEST_OUTPUT_PATH = 'tmp' 19 | 20 | TEST_MANIFEST_PATH = os.path.join('CIRCLEseq_MergedTest.yaml') 21 | 22 | CORRECT_ALIGNED_OUTPUT = 'data/MergedOutput/aligned' 23 | CORRECT_IDENTIFIED_OUTPUT = 'data/MergedOutput/identified' 24 | CORRECT_MERGED_OUTPUT = 'data/MergedOutput/merged'  # NOTE(review): not referenced in this module - confirm the reference folder exists before relying on it 25 | CORRECT_VISUALIZATION_OUTPUT = 'data/MergedOutput/visualization' 26 | 27 | CORRECT_ALL_OUTPUT = 'data/MergedOutput' 28 | 29 | class FullPipelineTestCase(unittest.TestCase): 30 | 31 | def setUp(self): 32 | pass 33 | 34 | def testFullPipeline(self):  # Runs align -> find cleavage sites -> visualize, checking each stage's folder right after it runs 35 | c = circleseq.CircleSeq() 36 | c.parseManifest(TEST_MANIFEST_PATH) 37 | 38 | # Align and test the alignment output 39 | c.alignReads() 40 | self.assertTrue(utils.checkFolderEquality(os.path.join(c.analysis_folder, "aligned"), CORRECT_ALIGNED_OUTPUT)) 41 | 42 | # Find cleavage sites and test the identified output 43 | c.findCleavageSites() 44 | self.assertTrue(utils.checkFolderEquality(os.path.join(c.analysis_folder, 'identified'), CORRECT_IDENTIFIED_OUTPUT)) 45 | 46 | # Visualize filtered sites and test the visualization output 47 | c.visualize() 48 | self.assertTrue(utils.checkFolderEquality(os.path.join(c.analysis_folder, 'visualization'), CORRECT_VISUALIZATION_OUTPUT)) 49 | 50 | 51 | def tearDown(self): 52 | pass 53 | 54 | if __name__ == '__main__': 55 | unittest.main() 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
test_circleseq_std
----------------------------------

End-to-end test for the `circleseq` module, driven by the
CIRCLEseq_StandardTest.yaml manifest. Every pipeline stage is run in turn and
its output folder is compared with the stored reference output.
"""

import yaml
import unittest
import os
import shutil
import utils
from circleseq import circleseq

TEST_OUTPUT_PATH = 'tmp'

TEST_MANIFEST_PATH = os.path.join('CIRCLEseq_StandardTest.yaml')

CORRECT_ALIGNED_OUTPUT = 'data/StandardOutput/aligned'
CORRECT_IDENTIFIED_OUTPUT = 'data/StandardOutput/identified'
CORRECT_VARIANTS_OUTPUT = 'data/StandardOutput/variants'
CORRECT_VISUALIZATION_OUTPUT = 'data/StandardOutput/visualization'

CORRECT_ALL_OUTPUT = 'data'


class FullPipelineTestCase(unittest.TestCase):
    """Run the whole pipeline and check each stage against reference data."""

    def setUp(self):
        pass

    def testFullPipeline(self):
        """Execute the stages in order, verifying each output folder."""
        pipeline = circleseq.CircleSeq()
        pipeline.parseManifest(TEST_MANIFEST_PATH)

        # (method to call, sub-folder it writes, folder with expected files)
        stages = (
            # Align and test the alignment output
            ('alignReads', "aligned", CORRECT_ALIGNED_OUTPUT),
            # Find cleavage sites
            ('findCleavageSites', 'identified', CORRECT_IDENTIFIED_OUTPUT),
            # Visualize filtered sites
            ('visualize', 'visualization', CORRECT_VISUALIZATION_OUTPUT),
            # Look for genomic variants
            ('callVariants', 'variants', CORRECT_VARIANTS_OUTPUT),
        )

        for method_name, subfolder, expected_folder in stages:
            # Resolve the method only when its stage is reached, so a failure
            # surfaces at the same point as in a hand-written sequence.
            getattr(pipeline, method_name)()
            produced_folder = os.path.join(pipeline.analysis_folder, subfolder)
            self.assertTrue(
                utils.checkFolderEquality(produced_folder, expected_folder))

    def tearDown(self):
        pass


if __name__ == '__main__':
    unittest.main()
# test/utils.py: helpers shared by the pipeline test modules.
from __future__ import print_function

import os
import sys
import inspect
import filecmp
from itertools import islice


def _samRecords(path):
    """Return the lines of the SAM file at `path` that are not '@' headers."""
    # Read eagerly: the files compared here are small test fixtures, and a
    # list makes a length difference between two files visible.
    with open(path, 'r') as handle:
        return [line for line in handle if not line.startswith('@')]


def checkFolderEquality(folder1, folder2):
    """
    Given two folders, check that they hold the same file names and that
    files with the same name have the same content.

    Names starting with '.' are ignored. Files whose extension is 'sam' are
    compared on their non-header lines only ('@' lines are skipped), so two
    SAM files may differ in their headers and still be considered equal.
    Every other file is compared byte for byte.

    Returns True when the folders match, otherwise prints the reason and
    returns False.
    """

    folder1_files = [x for x in os.listdir(folder1) if not x.startswith('.')]
    folder2_files = [x for x in os.listdir(folder2) if not x.startswith('.')]

    if set(folder1_files) != set(folder2_files):
        print('Folders do not have the same filenames.')
        return False

    # Sorted so the first reported mismatch does not depend on listdir order.
    for f in sorted(folder1_files):
        file1 = os.path.join(folder1, f)
        file2 = os.path.join(folder2, f)

        if f.split('.')[-1] == 'sam':
            # Comparing whole record lists (rather than zipping the two
            # files) also catches a file that is a truncated copy of the other.
            same = _samRecords(file1) == _samRecords(file2)
        else:
            # shallow=False forces a content comparison even when size and
            # modification time happen to coincide.
            same = filecmp.cmp(file1, file2, shallow=False)

        if not same:
            print('{0} does not match between folders.'.format(f))
            return False

    return True


def head(filepath, n=10):
    """Print the first `n` lines of the file at `filepath`."""
    with open(filepath) as f:
        for line in islice(f, n):
            # The line keeps its own newline; strip it so print() does not
            # emit a blank line after every line.
            print(line.rstrip('\n'))