├── test ├── stitch ├── contigs.txt ├── seq_a.txt ├── seq_b.txt └── test.py ├── stitch ├── __init__.py ├── length_filter.py ├── fasta.py └── stitch.py ├── .gitignore ├── LICENSE.md └── README.md /test/stitch: -------------------------------------------------------------------------------- 1 | ../stitch -------------------------------------------------------------------------------- /stitch/__init__.py: -------------------------------------------------------------------------------- 1 | from stitch import * -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | s*short.txt 3 | *.fastq 4 | -------------------------------------------------------------------------------- /test/contigs.txt: -------------------------------------------------------------------------------- 1 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG 2 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG 3 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG -------------------------------------------------------------------------------- /test/seq_a.txt: -------------------------------------------------------------------------------- 1 | @sequence_1 2 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG 3 | +sequence_1 4 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff 5 | @sequence_2 6 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG 7 | +sequence_2 8 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff 9 | @sequence_3 10 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG 11 | +sequence_3 12 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff -------------------------------------------------------------------------------- /test/seq_b.txt: -------------------------------------------------------------------------------- 1 | @sequence_1 2 | CCCCCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTT 3 | +sequence_1 4 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff 5 | @sequence_2 6 | AAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGGG 7 | +sequence_2 8 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff 9 | @sequence_3 10 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG 11 | +sequence_3 12 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff -------------------------------------------------------------------------------- /stitch/length_filter.py: -------------------------------------------------------------------------------- 1 | from fasta import * 2 | import sys 3 | 4 | filename = sys.argv[1] 5 | 6 | for minimum, maximum in ((50, 59), (60, 69), (70, 79), (80, 89), (90, 99), (100, 129), (130, 200)): 7 | handle = open(filename) 8 | output = open('%s-%s-%s' % (filename, minimum, maximum), 'w') 9 | counter = 0 10 | for fasta in Fasta(handle): 11 | if len(fasta) > maximum: 12 | continue 13 | if len(fasta) < minimum: 14 | continue 15 | print >> output, fasta 16 | counter += 1 17 | print minimum, maximum, counter 18 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from stitch import * 4 | 5 | def test_headers(): 6 | ''' make sure headers are parsed correctly ''' 7 | with open('seq_a.txt') as handle: 8 | records = [i for i in Fasta(handle)] 9 | 10 | assert records.pop(0).header == 'sequence_1' 11 | 12 | def test_alignments(): 13 | with open('seq_a.txt') as ha, open('seq_b.txt') as hb: 14 | records = [(a, b) for (a, b) in zip(Fasta(ha), Fasta(hb))] 15 | 16 | contigs = [ i.strip() for i in open('contigs.txt').readlines() ] 17 | 18 | for i, j in records: 19 | c = contigs.pop(0) 20 | res = Stitch(i, j) 21 | 22 | # TODO, also check quality scores 23 | # print res.pretty 24 | seq = res.record.seq 25 | try: 26 | assert seq == c 27 | except AssertionError: 28 | print 'ERROR: %s' % i.header 29 | print 'expected: %s' % i.seq 30 | print 'got: %s' % seq 31 | 32 | 33 | if __name__ == '__main__': 34 | test_headers() 35 | test_alignments() -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Austin Davis-Richardson 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /stitch/fasta.py: -------------------------------------------------------------------------------- 1 | import string 2 | _complement = string.maketrans('GATCRYgatcry','CTAGYRctagyr') 3 | 4 | class Fasta: 5 | ''' iterates through a fastq file, returning dnaobj objects ''' 6 | 7 | def __init__(self, handle, filetype='fastq'): 8 | self.filetype = filetype 9 | self.handle = handle 10 | self.county = 0 11 | 12 | def __iter__(self): 13 | if self.filetype == 'fastq': 14 | counter = 0 15 | rec = { 0: '', 1: '', 2: '', 3: '' } 16 | for line in self.handle: 17 | if counter < 3: 18 | rec[counter] = line.strip() 19 | counter += 1 20 | elif counter == 3: 21 | rec[counter] = line.strip() 22 | counter = 0 23 | yield Dna(rec[0], rec[1], rec[3]) 24 | 25 | 26 | class Dna: 27 | ''' An object representing either a FASTA or FASTQ record ''' 28 | 29 | def __init__(self, header, sequence, quality = False): 30 | self.header = header.lstrip('@').rstrip('\n') 31 | self.seq = sequence 32 | self.qual = quality 33 | if quality: 34 | self.type = 'fastq' 35 | else: 36 | self.type = 'fasta' 37 | 38 | if len(self.seq) != len(self.qual): 39 | raise IOError, \ 40 | 'Seq length and qual length do not agree: %s' % (self.header) 41 | 42 | def __str__(self): 43 | ''' returns a FASTA/Q formatted string ''' 44 | if not self.qual: 45 | return ('>%s\nself.sequence\n') % \ 46 | (self.header, self.seq) 47 | else: 48 | return('@%s\n%s\n+%s\n%s') % \ 49 | (self.header, self.seq, self.header, self.qual) 50 | 51 | def __len__(self): 52 | return len(self.seq) 53 | 54 | def __repr__(self): 55 | return '' % (self.type, self.header) 56 | 57 | @property 58 | def complement(self): 59 | ''' returns complement of sequence ''' 60 | return self.seq.translate(_complement) 61 | 62 | @property 63 | def revcomp(self): 64 | ''' returns reverse complement of sequence ''' 65 | return self.complement[::-1] 66 | 67 | @property 68 | def rqual(self): 69 | ''' returns reverse quality''' 70 | return self.qual[::-1] 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stitch 2 | 3 | **No longer maintained**. Please use [Pandsaseq](https://github.com/neufeld/pandaseq). 4 | 5 | Austin G. Davis-Richardson 6 | 7 | 8 | Stitch assembles overlapping paired-end reads into a single contig for each 9 | pair. This increases the read length and hopefully the quality of a _de novo_ 10 | or reference assembly. Stitch is multi-threaded and will automatically use all 11 | cores on your system unless told otherwise. Stitch currently only reads FASTQ 12 | format. QSEQ and FASTA formats to come. Reads that are not found to overlap are 13 | dumped in a file called `-singletons` and are in FASTQ format. These 14 | can then be trimmed and combined with contigs to do a _de novo_ assembly. 15 | 16 | Stitch is licensed under the MIT open source license. See `LICENSE.md` for 17 | details. 18 | 19 | # Citing Stitch 20 | 21 | Please us the following citation: 22 | 23 | Brown, Christopher T., et al. "Gut microbiome metagenomics analysis suggests a 24 | functional model for the development of autoimmunity for type 1 diabetes." PloS 25 | one 6.10 (2011): e25792. 26 | 27 | Or, in BibTex: 28 | 29 | ``` 30 | @article{brown2011gut, 31 | title={Gut microbiome metagenomics analysis suggests a functional model for the development of autoimmunity for type 1 diabetes}, 32 | author={Brown, Christopher T and Davis-Richardson, Austin G and Giongo, Adriana and Gano, Kelsey A and Crabb, David B and Mukherjee, Nabanita and Casella, George and Drew, Jennifer C and Ilonen, Jorma and Knip, Mikael and others}, 33 | journal={PloS one}, 34 | volume={6}, 35 | number={10}, 36 | pages={e25792}, 37 | year={2011}, 38 | publisher={Public Library of Science} 39 | } 40 | ``` 41 | 42 | ## Requirements 43 | 44 | - Python 2.6, 2.7 45 | - Mac OS X or Linux (Windows might work but hasn't been tested) 46 | 47 | ## Alignment Algorithm 48 | 49 | Stitch aligns overlapping paired end reads by counting the number of matching 50 | nucleotides in an overlapping window. The window that provides the highest 51 | number of matching nucleotides wins. 52 | 53 | The consensus sequence is generated thusly, 54 | 55 | A 5' =============== =================================> 3' 56 | B 3' <================================= =========== 5' 57 | C 5' =============== ================================= ===========> 3' 58 | <--------- "the middle" ----------> 59 | 60 | In the region dubbed "the middle", the nucleotide with the highest 61 | corresponding quality score is used (if there is a mismatch). If there is a 62 | match, then that nucleotide is used (and the highest quality score is given). 63 | In the case of a tie, a 'N' is used and the quality score is unchanged. 64 | 65 | Stitch assumes that read A is 3'-5' and read B is 5'-3'. So read B is 66 | automatically reverse-complemented in the alignment procedure. (I know I should 67 | make this an option but I haven't yet). 68 | 69 | ## Usage 70 | 71 | **NOTE** - Stitch expects reads to be of the same length! 72 | 73 | Invoke, _comme ca_ 74 | 75 | Usage: stitch.py -i -j -o 76 | 77 | More options, 78 | 79 | -h, --help show this help message and exit 80 | -i FILEA, --first=FILEA 81 | first fastq file 82 | -j FILEB, --second=FILEB 83 | second fastq file 84 | -o PREFIX, --output=PREFIX 85 | output prefix (omit to print to stdout) 86 | -t THREADS, --threads=THREADS 87 | number of threads (default = all available) 88 | -p, --pretty_output displays overlapping contigs in a nice way. 89 | -s SCORE, --score=SCORE 90 | minimum percent identity (default = 25) 91 | -b TABLE, --table=TABLE 92 | output overlap length to a text file 93 | 94 | ## Bugs/Feature requests 95 | 96 | - If you have any, [let me know](https://github.com/audy/stitch/issues). Thx! 97 | -------------------------------------------------------------------------------- /stitch/stitch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # STITCH 4 | # Constructs contigs from overlapping paired-end Illumina sequencing reads. 5 | 6 | # Austin G. Davis-Richardson 7 | # adavisr@ufl.edu 8 | # working under Drs. Adriana Giongo & Eric Triplett 9 | # at The University of Florida, Gainesville, FL, USA 10 | 11 | # https://www.github.com/audy/stitch 12 | 13 | from fasta import Dna, Fasta 14 | from itertools import izip, imap, dropwhile 15 | from multiprocessing import Pool 16 | import os 17 | import sys 18 | from time import time 19 | 20 | 21 | def stitch(*args, **kwargs): 22 | ''' The stitcher ''' 23 | 24 | filea = kwargs.get('filea') 25 | fileb = kwargs.get('fileb') 26 | prefix = kwargs.get('prefix', None) 27 | score = kwargs.get('score', 35) 28 | pretty = kwargs.get('pretty', None) 29 | threads = kwargs.get('threads', None) 30 | table = kwargs.get('table', None) 31 | 32 | score = 20 33 | 34 | 35 | if not (filea or fileb): 36 | raise Exception, 'stitch(filea=\'filea\', fileb=\'fileb\')' 37 | 38 | if prefix: 39 | dudsa = open('%s-nh-s1.fastq' % prefix, 'w') 40 | dudsb = open('%s-nh-s2.fastq' % prefix, 'w') 41 | outfile = open('%s-contigs.fastq' % prefix , 'w') 42 | 43 | if table: 44 | htable = open(table, 'w') 45 | 46 | seqsa = open(filea, 'r') 47 | seqsb = open(fileb, 'r') 48 | 49 | # Ready.. Set.. 50 | numcontigs, numtotes, overlaps = 0, 0, 0 51 | starttime = time() 52 | p = Pool(threads) 53 | 54 | # Go! 55 | for i in p.imap(doStitch, izip(Fasta(seqsa), Fasta(seqsb))): 56 | numtotes += 1 57 | if i.score > score: 58 | numcontigs += 1 59 | overlaps += i.overlap 60 | 61 | if prefix: 62 | print >> outfile, '%s' % i.record 63 | 64 | if pretty: 65 | print >> sys.stdout, '>%s (%.3f)' % (i.reca.header, i.score) 66 | print >> sys.stdout, i.pretty 67 | 68 | if table: 69 | print >> htable, i.overlap 70 | else: 71 | reca, recb = i.originals 72 | 73 | if prefix: 74 | print >> dudsa, reca 75 | print >> dudsb, recb 76 | 77 | # Clean-up & inform the user 78 | duration = time() - starttime 79 | print >> sys.stderr, \ 80 | 'Made %s contigs out of %s reads in %.2f seconds (%.2f per sec)' % \ 81 | (numcontigs, numtotes, duration, numtotes/duration) 82 | try: 83 | print >> sys.stderr, \ 84 | 'Average overlap was %.2f\n' % (float(overlaps)/numcontigs) 85 | except ZeroDivisionError: 86 | print >> sys.stderr, 'no contigs :(\n' 87 | 88 | 89 | class Stitch: 90 | ''' Stitches together two overlapping Illumina reads using Doubleblast ''' 91 | def __init__(self, reca, recb): 92 | self.reca = reca 93 | self.recb = recb 94 | self.record = False 95 | self.overlap = 0 96 | self.pretty = '' 97 | self.score = 0.0 98 | self.find_overlaps() 99 | 100 | @property 101 | def originals(self): 102 | return (self.reca, self.recb) 103 | 104 | def find_overlaps(self): 105 | ''' Alignment algorithm, returns new DNA object of contig ''' 106 | 107 | # reverse complement second sequence 108 | # this should be made into an option 109 | a, b = self.reca.seq, self.recb.revcomp 110 | 111 | # convert quality score to integers 112 | qa, qb = [ [ ord(i) for i in j ] for j in [ self.reca.qual, self.recb.qual[::-1] ] ] 113 | 114 | scores = { 115 | 'eq': +1, 116 | 'un': -1, 117 | 'N': 0, 118 | } 119 | 120 | alignments = {} 121 | 122 | for n in range(len(a)): 123 | 124 | # get overlapping region 125 | ta = a[n:] 126 | 127 | # because 'string'[:-0] == '' 128 | if n > 0: 129 | tb = b[:-n] 130 | else: 131 | tb = b 132 | 133 | # score overlap 134 | score = 0 135 | for i, j in zip(ta, tb): 136 | if i == j: 137 | score += scores['eq'] 138 | elif i != j: 139 | score += scores['un'] 140 | if 'N' in [i, j]: 141 | score += scores['N'] 142 | 143 | alignments[score] = n 144 | 145 | best_score = max(alignments.keys()) 146 | best_index = alignments[best_score] 147 | 148 | # GENERATE CONTIG 149 | 150 | # beginning 151 | if best_index == 0: 152 | beginning = a 153 | qual_beg = qa 154 | else: 155 | beginning = a[:best_index] 156 | qual_beg = qa[:best_index] 157 | 158 | # middle 159 | middle = [] 160 | qual_middle = [] 161 | for (i, qi), (j, qj) in zip(zip(a[best_index:], qa[best_index:]), \ 162 | zip(b[:-best_index], qb[:-best_index])): 163 | if i == j: 164 | middle.append(i) 165 | qual_middle.append(max([qi, qj])) 166 | elif i != j: 167 | # take best quality 168 | if qi > qj: # i wins 169 | middle.append(i) 170 | qual_middle.append(qi) 171 | elif qi < qj: # j wins 172 | middle.append(j) 173 | qual_middle.append(qj) 174 | elif qi == qj: # tie 175 | middle.append('N') 176 | qual_middle.append(qi) 177 | else: 178 | raise Exception 179 | else: 180 | raise Exception 181 | 182 | middle = ''.join(middle) 183 | qual_middle = qual_middle 184 | 185 | assert len(middle) == len(qual_middle) 186 | 187 | # end 188 | if best_index == 0: 189 | end = '' 190 | qual_end = [] 191 | else: 192 | end = b[best_index:] 193 | qual_end = qb[best_index:] 194 | 195 | # concatenate 196 | newseq = beginning + middle + end 197 | newqual = ''.join(chr(i) for i in qual_beg + qual_middle + qual_end) 198 | 199 | # double-check 200 | assert len(newseq) == len(newqual) 201 | 202 | # print >> sys.stderr, " b: %s \n m: %s \n e: %s\n\n" % ( beginning, middle, end ) 203 | # print >> sys.stderr, newseq 204 | 205 | # generate pretty print view 206 | self.pretty = '1:%s\n2:%s\nC:%s\n' % \ 207 | (a + '-'*(best_index-1), 208 | '-'*(best_index-1) + b, newseq) 209 | 210 | # create a new record sequence for the contig 211 | self.record = Dna(self.reca.header, newseq, newqual) 212 | self.score = best_score 213 | 214 | def get_args(): 215 | from optparse import OptionParser 216 | ''' Parses command-line arguments, returns parser object ''' 217 | 218 | parser = OptionParser( 219 | description="""Stitch - Tool for creating contigs from overlapping 220 | paried-end illumina reads.""", 221 | usage='-i -j -o ') 222 | 223 | # TODO: add option to specify orientation of reads. 224 | parser.add_option('-i', '--first', dest='filea', 225 | help='first fastq file') 226 | 227 | parser.add_option('-j', '--second', dest='fileb', 228 | help='second fastq file') 229 | 230 | parser.add_option('-o', '--output', dest='prefix', 231 | help='output prefix (omit to print to stdout)') 232 | 233 | parser.add_option('-t', '--threads', dest='threads', default=None, 234 | type=int, help='number of threads (default = all available)') 235 | 236 | parser.add_option('-p', '--pretty_output', dest='pretty', default=False, 237 | action='store_true', 238 | help='displays overlapping contigs in a nice way.') 239 | 240 | parser.add_option('-s', '--score', dest='score', default=35, 241 | help='minimum percent identity (default = 25)', type=float) 242 | 243 | return parser 244 | 245 | def doStitch(recs): 246 | ''' Used by Pool.imap to create stitch jobs ''' 247 | 248 | try: 249 | reca, recb = recs 250 | return Stitch(reca, recb) 251 | 252 | except KeyboardInterrupt: 253 | # BUG Pool() has a habit of not exiting when a CTRL-C is passed 254 | # So far, I haven't found a way around this except by killing 255 | # the process by hand 256 | # Note: os.abort() may work 257 | print 'Ouch!' 258 | quit() 259 | 260 | def main(): 261 | ''' run from command-line ''' 262 | 263 | # parse arguments 264 | parser = get_args() 265 | (options, args) = parser.parse_args() 266 | 267 | # verify two files specified 268 | if not (options.filea and options.fileb): 269 | print >> sys.stderr, 'Usage: %s %s' % \ 270 | (parser.get_prog_name(), parser.usage) 271 | sys.exit() 272 | 273 | # output everything to stdout if no output prefix is specified 274 | if not (options.prefix): 275 | print >> sys.stderr, 'Warning: no outputfile' 276 | dudsa, dudsb, outfile = sys.stdout, sys.stdout, sys.stdout 277 | 278 | # initiate the stiching! 279 | try: 280 | stitch( filea = options.filea, 281 | fileb = options.fileb, 282 | prefix = options.prefix, 283 | threads = options.threads, 284 | pretty = options.pretty, 285 | score = options.score) 286 | 287 | except KeyboardInterrupt: 288 | print >> sys.stderr, 'Ouch!' 289 | quit() 290 | 291 | if __name__ == '__main__': 292 | main() 293 | --------------------------------------------------------------------------------