├── test
    ├── stitch
    ├── contigs.txt
    ├── seq_a.txt
    ├── seq_b.txt
    └── test.py
├── stitch
    ├── __init__.py
    ├── length_filter.py
    ├── fasta.py
    └── stitch.py
├── .gitignore
├── LICENSE.md
└── README.md


/test/stitch:
--------------------------------------------------------------------------------
1 | ../stitch


--------------------------------------------------------------------------------
/stitch/__init__.py:
--------------------------------------------------------------------------------
1 | from stitch import *


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | s*short.txt
3 | *.fastq
4 | 


--------------------------------------------------------------------------------
/test/contigs.txt:
--------------------------------------------------------------------------------
1 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG
2 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG
3 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG


--------------------------------------------------------------------------------
/test/seq_a.txt:
--------------------------------------------------------------------------------
 1 | @sequence_1
 2 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG
 3 | +sequence_1
 4 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
 5 | @sequence_2
 6 | AAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGG
 7 | +sequence_2
 8 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
 9 | @sequence_3
10 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG
11 | +sequence_3
12 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff


--------------------------------------------------------------------------------
/test/seq_b.txt:
--------------------------------------------------------------------------------
 1 | @sequence_1
 2 | CCCCCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTT
 3 | +sequence_1
 4 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
 5 | @sequence_2
 6 | AAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGGG
 7 | +sequence_2
 8 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
 9 | @sequence_3
10 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGG
11 | +sequence_3
12 | ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff


--------------------------------------------------------------------------------
/stitch/length_filter.py:
--------------------------------------------------------------------------------
 1 | from fasta import *
 2 | import sys
 3 | 
 4 | filename = sys.argv[1]
 5 | 
 6 | for minimum, maximum in ((50, 59), (60, 69), (70, 79), (80, 89), (90, 99), (100, 129), (130, 200)):
 7 |     handle = open(filename)
 8 |     output = open('%s-%s-%s' % (filename, minimum, maximum), 'w')
 9 |     counter = 0
10 |     for fasta in Fasta(handle):
11 |         if len(fasta) > maximum:
12 |             continue
13 |         if len(fasta) < minimum:
14 |             continue
15 |         print >> output, fasta
16 |         counter += 1
17 |     print minimum, maximum, counter
18 | 


--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from stitch import *
 4 | 
 5 | def test_headers():
 6 |     ''' make sure headers are parsed correctly '''
 7 |     with open('seq_a.txt') as handle:
 8 |         records = [i for i in Fasta(handle)]
 9 |         
10 |     assert records.pop(0).header == 'sequence_1'
11 | 
def test_alignments():
    ''' stitch every read pair and compare the contig with contigs.txt

    Pairs are taken in file order from seq_a.txt / seq_b.txt and the n-th
    pair is compared with the n-th line of contigs.txt.  A mismatch is
    reported on stdout instead of being raised, so every pair is checked in
    one run.
    '''
    with open('seq_a.txt') as ha, open('seq_b.txt') as hb:
        records = list(zip(Fasta(ha), Fasta(hb)))

    # read the expected contigs inside a `with` so the file gets closed
    with open('contigs.txt') as handle:
        contigs = [line.strip() for line in handle]

    for index, (i, j) in enumerate(records):
        # IndexError here means contigs.txt has fewer lines than read pairs
        c = contigs[index]
        res = Stitch(i, j)

        # TODO, also check quality scores
        # print res.pretty
        seq = res.record.seq

        # plain comparison instead of assert/except: still works under -O
        if seq != c:
            print('ERROR:    %s' % i.header)
            # the expected value is the contig from contigs.txt, not read A
            print('expected: %s' % c)
            print('got:      %s' % seq)
31 | 
32 | 
# Run both checks, in order, when this file is executed directly as a script.
if __name__ == '__main__':
    test_headers()
    test_alignments()


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | Copyright (c) 2016 Austin Davis-Richardson 
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | of this software and associated documentation files (the "Software"), to deal
 6 | in the Software without restriction, including without limitation the rights
 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | copies of the Software, and to permit persons to whom the Software is
 9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/stitch/fasta.py:
--------------------------------------------------------------------------------
 1 | import string
# Translation table used by Dna.complement: maps G<->C, A<->T and the
# codes R<->Y (upper and lower case) to their complements.
_complement = string.maketrans('GATCRYgatcry','CTAGYRctagyr')
 3 | 
 4 | class Fasta:
 5 |     ''' iterates through a fastq file, returning dnaobj objects '''
 6 | 
 7 |     def __init__(self, handle, filetype='fastq'):
 8 |         self.filetype = filetype
 9 |         self.handle = handle
10 |         self.county = 0
11 | 
12 |     def __iter__(self):
13 |         if self.filetype == 'fastq':
14 |             counter = 0
15 |             rec = { 0: '', 1: '', 2: '', 3: '' }
16 |             for line in self.handle:
17 |                 if counter < 3:
18 |                     rec[counter] = line.strip()
19 |                     counter += 1
20 |                 elif counter == 3:
21 |                     rec[counter] = line.strip()
22 |                     counter = 0
23 |                     yield Dna(rec[0], rec[1], rec[3])
24 | 
25 | 
class Dna:
    ''' An object representing either a FASTA or FASTQ record

    header   -- record name; leading "@" characters and trailing newlines
                are removed
    sequence -- nucleotide string
    quality  -- quality string with one character per base, or False (the
                default) for a FASTA record without qualities

    Attributes: header, seq, qual and type ('fastq' when a non-empty
    quality string was given, otherwise 'fasta').

    Raises IOError when a quality string is supplied whose length differs
    from the length of the sequence.
    '''

    def __init__(self, header, sequence, quality = False):
        self.header = header.lstrip('@').rstrip('\n')
        self.seq = sequence
        self.qual = quality
        self.type = 'fastq' if quality else 'fasta'

        # Only compare lengths when a quality value was actually supplied:
        # len(False) raises TypeError, which used to make it impossible to
        # build a FASTA record at all.
        if quality is not False and quality is not None:
            if len(self.seq) != len(self.qual):
                raise IOError(
                    'Seq length and qual length do not agree: %s' % (self.header))

    def __str__(self):
        ''' returns a FASTA/Q formatted string (no trailing newline) '''
        if not self.qual:
            # Two placeholders for two values; the old template contained the
            # literal text "self.sequence" and raised TypeError.
            return '>%s\n%s' % (self.header, self.seq)
        return '@%s\n%s\n+%s\n%s' % \
            (self.header, self.seq, self.header, self.qual)

    def __len__(self):
        ''' length of the sequence '''
        return len(self.seq)

    def __repr__(self):
        return '<dnaobj.%s instance: %s>' % (self.type, self.header)

    @property
    def complement(self):
        ''' returns complement of sequence '''
        return self.seq.translate(_complement)

    @property
    def revcomp(self):
        ''' returns reverse complement of sequence '''
        return self.complement[::-1]

    @property
    def rqual(self):
        ''' returns reverse quality'''
        return self.qual[::-1]
71 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Stitch
 2 | 
 3 | **No longer maintained**. Please use [PANDAseq](https://github.com/neufeld/pandaseq).
 4 | 
 5 | Austin G. Davis-Richardson
 6 | <harekrishna@gmail.com>
 7 | 
 8 | Stitch assembles overlapping paired-end reads into a single contig for each
 9 | pair. This increases the read length and hopefully the quality of a _de novo_
10 | or reference assembly. Stitch is multi-threaded and will automatically use all
11 | cores on your system unless told otherwise. Stitch currently only reads FASTQ
12 | format. QSEQ and FASTA formats to come. Reads that are not found to overlap are
13 | dumped in a file called `<prefix>-singletons` and are in FASTQ format. These
14 | can then be trimmed and combined with contigs to do a _de novo_ assembly.
15 | 
16 | Stitch is licensed under the MIT open source license. See `LICENSE.md` for
17 | details.
18 | 
19 | # Citing Stitch
20 | 
21 | Please use the following citation:
22 | 
23 | Brown, Christopher T., et al. "Gut microbiome metagenomics analysis suggests a
24 | functional model for the development of autoimmunity for type 1 diabetes." PloS
25 | one 6.10 (2011): e25792.
26 | 
27 | Or, in BibTex:
28 | 
29 | ```
30 | @article{brown2011gut,
31 |   title={Gut microbiome metagenomics analysis suggests a functional model for the development of autoimmunity for type 1 diabetes},
32 |   author={Brown, Christopher T and Davis-Richardson, Austin G and Giongo, Adriana and Gano, Kelsey A and Crabb, David B and Mukherjee, Nabanita and Casella, George and Drew, Jennifer C and Ilonen, Jorma and Knip, Mikael and others},
33 |   journal={PloS one},
34 |   volume={6},
35 |   number={10},
36 |   pages={e25792},
37 |   year={2011},
38 |   publisher={Public Library of Science}
39 | }
40 | ```
41 | 
42 | ## Requirements
43 | 
44 | - Python 2.6, 2.7
45 | - Mac OS X or Linux (Windows might work but hasn't been tested)
46 | 
47 | ## Alignment Algorithm
48 | 
49 | Stitch aligns overlapping paired end reads by counting the number of matching
50 | nucleotides in an overlapping window. The window that provides the highest
51 | number of matching nucleotides wins.
52 | 
53 | The consensus sequence is generated thusly,
54 | 
55 |     A 5' =============== =================================> 3'
56 |     B                3' <================================= ===========  5'
57 |     C 5' =============== ================================= ===========> 3'
58 |                         <--------- "the middle" ---------->
59 | 
60 | In the region dubbed "the middle", the nucleotide with the highest
61 | corresponding quality score is used (if there is a mismatch). If there is a
62 | match, then that nucleotide is used (and the highest quality score is given).
63 | In the case of a tie, an 'N' is used and the quality score is unchanged.
64 | 
65 | Stitch assumes that read A is 3'-5' and read B is 5'-3'. So read B is
66 | automatically reverse-complemented in the alignment procedure. (I know I should
67 | make this an option but I haven't yet).
68 | 
69 | ## Usage
70 | 
71 | **NOTE** - Stitch expects reads to be of the same length!
72 | 
73 | Invoke, _comme ca_
74 | 
75 |     Usage: stitch.py -i <fastq file 1> -j <fastq file 2> -o <output prefix>
76 | 
77 | More options,
78 | 
79 |     -h, --help            show this help message and exit
80 |     -i FILEA, --first=FILEA
81 |                         first fastq file
82 |     -j FILEB, --second=FILEB
83 |                         second fastq file
84 |     -o PREFIX, --output=PREFIX
85 |                         output prefix (omit to print to stdout)
86 |     -t THREADS, --threads=THREADS
87 |                         number of threads (default = all available)
88 |     -p, --pretty_output   displays overlapping contigs in a nice way.
89 |     -s SCORE, --score=SCORE
90 |                         minimum percent identity (default = 25)
91 |     -b TABLE, --table=TABLE
92 |                         output overlap length to a text file
93 | 
94 | ## Bugs/Feature requests
95 | 
96 |  - If you have any, [let me know](https://github.com/audy/stitch/issues). Thx!
97 | 


--------------------------------------------------------------------------------
/stitch/stitch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # STITCH
  4 | #  Constructs contigs from overlapping paired-end Illumina sequencing reads.
  5 | 
  6 | # Austin G. Davis-Richardson
  7 | #  adavisr@ufl.edu
  8 | #  working under Drs. Adriana Giongo & Eric Triplett
  9 | #  at The University of Florida, Gainesville, FL, USA
 10 | 
 11 | # https://www.github.com/audy/stitch
 12 | 
 13 | from fasta import Dna, Fasta
 14 | from itertools import izip, imap, dropwhile
 15 | from multiprocessing import Pool
 16 | import os
 17 | import sys
 18 | from time import time
 19 | 
 20 | 
 21 | def stitch(*args, **kwargs):
 22 |     ''' The stitcher '''
 23 | 
 24 |     filea = kwargs.get('filea')
 25 |     fileb = kwargs.get('fileb')
 26 |     prefix = kwargs.get('prefix', None)
 27 |     score = kwargs.get('score', 35)
 28 |     pretty = kwargs.get('pretty', None)
 29 |     threads = kwargs.get('threads', None)
 30 |     table = kwargs.get('table', None)
 31 | 
 32 |     score = 20
 33 | 
 34 | 
 35 |     if not (filea or fileb):
 36 |         raise Exception, 'stitch(filea=\'filea\', fileb=\'fileb\')'
 37 | 
 38 |     if prefix:
 39 |         dudsa = open('%s-nh-s1.fastq' % prefix, 'w')
 40 |         dudsb = open('%s-nh-s2.fastq' % prefix, 'w')
 41 |         outfile = open('%s-contigs.fastq' % prefix , 'w')
 42 | 
 43 |     if table:
 44 |         htable = open(table, 'w')
 45 | 
 46 |     seqsa = open(filea, 'r')
 47 |     seqsb = open(fileb, 'r')
 48 | 
 49 |     # Ready.. Set..
 50 |     numcontigs, numtotes, overlaps = 0, 0, 0
 51 |     starttime = time()
 52 |     p = Pool(threads)
 53 | 
 54 |     # Go!
 55 |     for i in p.imap(doStitch, izip(Fasta(seqsa), Fasta(seqsb))):
 56 |         numtotes += 1
 57 |         if i.score > score:
 58 |             numcontigs += 1
 59 |             overlaps += i.overlap
 60 | 
 61 |             if prefix:
 62 |                 print >> outfile, '%s' % i.record
 63 | 
 64 |             if pretty:
 65 |                 print >> sys.stdout, '>%s (%.3f)' % (i.reca.header, i.score)
 66 |                 print >> sys.stdout, i.pretty
 67 | 
 68 |             if table:
 69 |                 print >> htable, i.overlap
 70 |         else:
 71 |             reca, recb = i.originals
 72 | 
 73 |             if prefix:
 74 |                 print >> dudsa, reca
 75 |                 print >> dudsb, recb
 76 | 
 77 |     # Clean-up & inform the user
 78 |     duration = time() - starttime
 79 |     print >> sys.stderr, \
 80 |         'Made %s contigs out of %s reads in %.2f seconds (%.2f per sec)' % \
 81 |         (numcontigs, numtotes, duration, numtotes/duration)
 82 |     try:
 83 |         print >> sys.stderr, \
 84 |             'Average overlap was %.2f\n' % (float(overlaps)/numcontigs)
 85 |     except ZeroDivisionError:
 86 |         print >> sys.stderr, 'no contigs :(\n'
 87 | 
 88 | 
 89 | class Stitch:
 90 |     ''' Stitches together two overlapping Illumina reads using Doubleblast '''
 91 |     def __init__(self, reca, recb):
 92 |         self.reca = reca
 93 |         self.recb = recb
 94 |         self.record = False
 95 |         self.overlap = 0
 96 |         self.pretty = ''
 97 |         self.score = 0.0
 98 |         self.find_overlaps()
 99 | 
100 |     @property
101 |     def originals(self):
102 |         return (self.reca, self.recb)
103 | 
104 |     def find_overlaps(self):
105 |         ''' Alignment algorithm, returns new DNA object of contig '''
106 | 
107 |         # reverse complement second sequence
108 |         # this should be made into an option
109 |         a, b = self.reca.seq, self.recb.revcomp
110 | 
111 |         # convert quality score to integers
112 |         qa, qb = [ [ ord(i) for i in j ] for j in [ self.reca.qual, self.recb.qual[::-1] ] ]
113 | 
114 |         scores = {
115 |             'eq': +1,
116 |             'un': -1,
117 |             'N': 0,
118 |         }
119 | 
120 |         alignments = {}
121 | 
122 |         for n in range(len(a)):
123 | 
124 |             # get overlapping region
125 |             ta = a[n:]
126 | 
127 |             # because 'string'[:-0] == ''
128 |             if n > 0:
129 |                 tb = b[:-n]
130 |             else:
131 |                 tb = b
132 | 
133 |             # score overlap
134 |             score = 0
135 |             for i, j in zip(ta, tb):
136 |                 if i == j:
137 |                     score += scores['eq']
138 |                 elif i != j:
139 |                     score += scores['un']
140 |                 if 'N' in [i, j]:
141 |                     score += scores['N']
142 | 
143 |             alignments[score] = n
144 | 
145 |         best_score = max(alignments.keys())
146 |         best_index = alignments[best_score]
147 | 
148 |         # GENERATE CONTIG
149 | 
150 |         # beginning
151 |         if best_index == 0:
152 |             beginning = a
153 |             qual_beg  = qa
154 |         else:
155 |             beginning = a[:best_index]
156 |             qual_beg  = qa[:best_index]
157 | 
158 |         # middle
159 |         middle = []
160 |         qual_middle = []
161 |         for (i, qi), (j, qj) in zip(zip(a[best_index:], qa[best_index:]), \
162 |                     zip(b[:-best_index], qb[:-best_index])):
163 |             if i == j:
164 |                 middle.append(i)
165 |                 qual_middle.append(max([qi, qj]))
166 |             elif i != j:
167 |                 # take best quality
168 |                 if qi > qj: # i wins
169 |                     middle.append(i)
170 |                     qual_middle.append(qi)
171 |                 elif qi < qj: # j wins
172 |                     middle.append(j)
173 |                     qual_middle.append(qj)
174 |                 elif qi == qj: # tie
175 |                     middle.append('N')
176 |                     qual_middle.append(qi)
177 |                 else:
178 |                     raise Exception
179 |             else:
180 |                 raise Exception
181 | 
182 |         middle = ''.join(middle)
183 |         qual_middle = qual_middle
184 | 
185 |         assert len(middle) == len(qual_middle)
186 | 
187 |         # end
188 |         if best_index == 0:
189 |             end = ''
190 |             qual_end = []
191 |         else:
192 |             end = b[best_index:]
193 |             qual_end = qb[best_index:]
194 | 
195 |         # concatenate
196 |         newseq  = beginning + middle + end
197 |         newqual = ''.join(chr(i) for i in qual_beg + qual_middle + qual_end)
198 | 
199 |         # double-check
200 |         assert len(newseq) == len(newqual)
201 | 
202 |         # print >> sys.stderr, " b: %s \n m: %s \n e: %s\n\n"  % ( beginning, middle, end )
203 |         # print >> sys.stderr, newseq
204 | 
205 |         # generate pretty print view
206 |         self.pretty = '1:%s\n2:%s\nC:%s\n' % \
207 |                         (a + '-'*(best_index-1),
208 |                         '-'*(best_index-1) + b, newseq)
209 | 
210 |         # create a new record sequence for the contig
211 |         self.record = Dna(self.reca.header, newseq, newqual)
212 |         self.score = best_score
213 | 
def get_args():
    ''' Builds the command-line parser, returns the OptionParser object

    The caller is expected to call parse_args() on the returned parser.
    '''
    from optparse import OptionParser

    parser = OptionParser(
        description="""Stitch - Tool for creating contigs from overlapping
        paired-end illumina reads.""",
        usage='-i <fastq file 1> -j <fastq file 2> -o <output prefix>')

    # TODO: add option to specify orientation of reads.
    parser.add_option('-i', '--first', dest='filea',
        help='first fastq file')

    parser.add_option('-j', '--second', dest='fileb',
        help='second fastq file')

    parser.add_option('-o', '--output', dest='prefix',
        help='output prefix (omit to print to stdout)')

    parser.add_option('-t', '--threads', dest='threads', default=None,
        type=int, help='number of threads (default = all available)')

    parser.add_option('-p', '--pretty_output', dest='pretty', default=False,
        action='store_true',
        help='displays overlapping contigs in a nice way.')

    # %default is expanded by optparse, so the help text cannot drift away
    # from the real default again
    parser.add_option('-s', '--score', dest='score', default=35,
        help='minimum percent identity (default = %default)', type=float)

    parser.add_option('-b', '--table', dest='table', default=None,
        help='output overlap length to a text file')

    return parser
244 | 
def doStitch(recs):
    ''' Used by Pool.imap to create stitch jobs

    recs -- a (reca, recb) pair of reads

    Returns the Stitch object built from the pair.  Any error other than
    KeyboardInterrupt is propagated to the caller.
    '''

    try:
        reca, recb = recs
        return Stitch(reca, recb)

    except KeyboardInterrupt:
        # BUG Pool() has a habit of not exiting when a CTRL-C is passed
        # So far, I haven't found a way around this except by killing
        # the process by hand
        # Note: os.abort() may work
        # The message goes to stderr because stdout may carry the results;
        # sys.exit() replaces quit(), which is meant for interactive use.
        sys.stderr.write('Ouch!\n')
        sys.exit()
259 | 
def main():
    ''' run from command-line

    Parses the command-line options and hands them to stitch().  Exits with
    status 1 when an input file is missing or the run is interrupted.
    '''

    # parse arguments
    parser = get_args()
    (options, args) = parser.parse_args()

    # verify two files specified
    if not (options.filea and options.fileb):
        sys.stderr.write('Usage: %s %s\n' %
                         (parser.get_prog_name(), parser.usage))
        # a usage error is a failure, so do not exit with status 0
        sys.exit(1)

    # warn when no prefix was given: stitch() only writes contig and
    # singleton files when it receives one
    if not options.prefix:
        sys.stderr.write('Warning: no outputfile\n')

    # initiate the stitching!
    try:
        stitch( filea   = options.filea,
                fileb   = options.fileb,
                prefix  = options.prefix,
                threads = options.threads,
                pretty  = options.pretty,
                score   = options.score,
                # getattr: the parser may not define a table option
                table   = getattr(options, 'table', None))

    except KeyboardInterrupt:
        sys.stderr.write('Ouch!\n')
        sys.exit(1)
290 | 
# Start the command-line interface only when this file is run as a script.
if __name__ == '__main__':
    main()
293 | 


--------------------------------------------------------------------------------