├── .travis.yml
├── COPYING
├── F132-01 SSPACE_Basic_User_Manual_v2.0.pdf
├── F132-02 SSPACE_Basic_Tutorial_v2.0.pdf
├── README
├── SSPACE_Basic.pl
├── SSPACE_Basic_v2.0.pl
├── bin
    ├── ExtendOrFormatContigs.pl
    ├── PairingAndScaffolding.pl
    └── readLibFiles.pl
├── dotlib
    └── DotLib.pm
├── example
    ├── contigs_abyss.fasta
    ├── ecoli_scaffolds_no_extension.summaryfile.txt
    └── libraries.txt
└── tools
    ├── TQS.py
    ├── TQS.readme
    ├── TQSexport.py
    ├── TQSfastq.py
    ├── TRIMMING_PAIRED_READS.README
    ├── estimate_insert_size.pl
    ├── fq_all2std.pl
    ├── qseq2fasta.pl
    ├── qseq2fastq.pl
    └── sam_bam2tab.pl


/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: "perl"
 2 | perl:
 3 |   - 5.24-shrplib
 4 | 
 5 | addons:
 6 |   apt:
 7 |     packages:
 8 |       - bowtie
 9 | 
10 | install: []
11 | 
12 | script:
13 |   - wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001665/SRR001665_1.fastq.gz
14 |   - wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001665/SRR001665_2.fastq.gz
15 |   - gunzip SRR001665_?.fastq.gz
16 |   - ./SSPACE_Basic.pl -l example/libraries.txt -s example/contigs_abyss.fasta -k 5 -a 0.7 -x 0 -b ecoli_scaffolds_no_extension
17 |   # Check that the output is correct
18 |   - diff -u example/ecoli_scaffolds_no_extension.summaryfile.txt .
19 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/F132-01 SSPACE_Basic_User_Manual_v2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nsoranzo/sspace_basic/4fe5c275c94b36e02d1b69438a2f7e022ecb58bc/F132-01 SSPACE_Basic_User_Manual_v2.0.pdf


--------------------------------------------------------------------------------
/F132-02 SSPACE_Basic_Tutorial_v2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nsoranzo/sspace_basic/4fe5c275c94b36e02d1b69438a2f7e022ecb58bc/F132-02 SSPACE_Basic_Tutorial_v2.0.pdf


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | Scaffolding Pre-Assemblies After Contig Extension (SSPACE)
  2 | 
  3 | SSPACE BASIC
  4 | © 2011 Marten Boetzer, Walter Pirovano
  5 | © 2014,2016 Nicola Soranzo
  6 | email: nicola.soranzo@earlham.ac.uk
  7 | 
  8 | NOTICE
  9 | ======
 10 | 
 11 | This is mainly a repository to store the last open source release (GNU GPL 2.0
 12 | license) of the "basic" version of SSPACE before it was discontinued. SSPACE
 13 | "standard", a newer but non-open-source version of SSPACE, is available at
 14 | https://www.baseclear.com/services/bioinformatics/basetools/sspace-standard/
 15 | 
 16 | I have added a few patches of mine and I am open to external contributions, but
 17 | I do not offer support or plan any further development.
 18 | 
 19 | Description
 20 | -----------
 21 | 
 22 | SSPACE is a script able to extend and scaffold pre-assembled contigs using one or more mate pairs or paired-end libraries, or even a combination.
 23 | 
 24 | Implementation and requirements
 25 | -------------------------------
 26 | 
 27 | SSPACE is implemented in Perl and runs on Linux, MacOS and Windows. SSPACE requires bowtie and bowtie-build commands to be in your PATH, for more information about Bowtie see http://bowtie-bio.sourceforge.net/ .
 28 | 
 29 | SSPACE is built based on SSAKE. Code of SSAKE is changed to be able to extend and scaffold pre-assembled contigs for multiple paired reads libraries.
 30 | 
 31 | PLEASE READ:
 32 | SSPACE tracks in memory all contigs. That means that the memory usage will increase drastically with the size of your contig data set. In addition, during contig extension single reads are extracted and mapped to the contigs. Unmapped reads are stored in memory. Again, the more reads that can not map, the bigger the dataset and the more memory is used. Just be aware of these limitations and don't be surprised if you observe a lot of data swapping to disk if you attempt to run SSPACE on a machine with little RAM.
 33 | 
 34 | Contig extension might not be suited to work with 454-type read pair libraries, simply because recurring base insertion/deletion errors, such as those commonly seen in homopolymeric regions, will not cluster well in the context of the SSAKE contig extension algorithm scheme. In addition, long 454 reads are less likely to map against the contigs, thus fewer read pairs are found and scaffolding is based on fewer read pairs. One possibility is to allow gaps during mapping using the '-g' parameter.
 35 | 
 36 | Citing SSPACE
 37 | ------------
 38 | 
 39 | Thank you for using, developing and promoting this free software.
 40 | If you use SSPACE for your research, please cite:
 41 | 
 42 | Boetzer M, Henkel CV, Jansen HJ, Butler D and Pirovano W. 2010. Scaffolding pre-assembled contigs using SSPACE. Bioinformatics. 27(4):578-579
 43 | 
 44 | Running SSPACE
 45 | -------------
 46 | 
 47 | e.g. perl SSPACE_Basic.pl -l libraries.txt -s contigs.fasta -x 0 -m 32 -o 20 -t 0 -k 5 -a 0.70 -n 15 -p 0 -v 0 -z 0 -g 0 -T 1 -b standard_out
 48 | 
 49 | Usage: ./SSPACE_Basic.pl
 50 | 
 51 | General parameters:
 52 |    -l  Library file containing two paired read files with insert size, error and orientation (see Manual for more information). Also possible to insert .tab files with pairing information (REQUIRED)
 53 |    -s  FASTA file containing contig sequences used for extension. Inserted paired reads are mapped to extended and non-extended contigs (REQUIRED)
 54 |    -x  Indicate whether to extend the contigs of -s using paired reads in -l (-x 1=extension, -x 0=no extension, default -x 0)
 55 | 
 56 | Extension parameters:
 57 |    -m  Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 32)
 58 |    -o  Minimum number of reads needed to call a base during an extension (default -o 20)
 59 |    -t  Trim up to -t base(s) on the contig end when all possibilities have been exhausted for an extension (default -t 0)
 60 |    -u  Single FASTA/FASTQ file containing unpaired sequence reads (optional)
 61 |    -r  Minimum base ratio used to accept an overhang consensus base (default -r 0.9)
 62 | 
 63 | Scaffolding parameters:
 64 |    -z  Minimum contig length used for scaffolding. Filters out contigs below this value (default -z 0)
 65 |    -k  Minimum number of links (read pairs) to compute scaffold (default -k 5)
 66 |    -a  Maximum link ratio between two best contig pairs. Higher values lead to less accurate scaffolding (default -a 0.7)
 67 |    -n  Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n 15)
 68 | 
 69 | Bowtie parameters:
 70 |    -g  Maximum number of allowed gaps during mapping with Bowtie. Corresponds to the -v option in Bowtie. A higher number of allowed gaps can lead to less accurate scaffolding (default -g 0)
 71 |    -T  Specifies the number of threads in Bowtie. Corresponds to the -p/--threads option in Bowtie (default -T 1)
 72 | 
 73 | Additional options:
 74 |    -b  Base name for your output files (default -b standard_output)
 75 |    -v  Runs in verbose mode (-v 1=yes, -v 0=no, default -v 0)
 76 |    -p  Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p 0)
 77 | 
 78 | 
 79 | How it works
 80 | ------------
 81 | 
 82 | The program consists of several steps, a short overview.
 83 | 
 84 | The first steps are reading the data and filtering it. The protocol is slightly different depending on whether -x is set to 0 or 1. We treat the two cases separately here;
 85 | 
 86 | With -x 0 the steps are;
 87 | 1) Read -l library file;
 88 | 	A) For each library in the -l library file. Store the reads in appropriate format. Paired reads are stored in a new file with a similar read name for easy tracking of the paired read. Format is;
 89 | 
 90 | >read1.1
 91 | AGCTGATAGATGAT
 92 | >read1.2
 93 | GATGATAGATAGAC
 94 | 
 95 | 2) Convert the inserted contig file to appropriate format.
 96 | 
 97 | With -x 1 the steps are;
 98 | 
 99 | 1) Read -l library file;
100 | 	A) For each library in the -l library file. Store the reads in appropriate format, similar as step 1A.
101 | 	B) For all libraries
102 | 	- store the single reads to a new file. Only reads containing only ACGT characters are stored.
103 | 2) Extend the pre-assembled contigs
104 | 	A) Map single reads of step 1B to (-s) contig file with Bowtie.
105 | 	B) Read unmapped reads into memory.
106 | 	C) Go through each contig in the (-s) contig file, and try to extend the contig. The new contigs are stored in a new file.
107 | 
108 | 
109 | After producing either a formatted or an extended contig file, the next step is to go through each library in the -l library file and map the filtered paired reads of step 1A to the new contigs;
110 | 
111 | 3) Use Bowtie to map single reads of 1A to either the formatted or extended contigs. Map only reads that are on the edges of the contigs. Only reads that map to only one contig are stored in a file. Position and orientation of each read is stored in the file.
112 | 4) Retrieve the position of each found read.
113 | 5) Pair contigs if both reads of a paired-read are found, store the pairing information into memory. In addition, store the sequence of the pair into memory. If the sequence of a pair is already used for pairing contigs, it is not used again.
114 | 6) Pair contigs based on the number of links (-k) and link ratio (-a)
115 | 7) Merge, orient and order the contigs to produce scaffolds.
116 | 
117 | 8) If multiple libraries are in -l file, the produced scaffolds in FASTA format are the input for the new library. Steps 3 till 8 are repeated for each library.
118 | 
119 | A more detailed view of the six main steps is given below.
120 | 
121 | Detailed view
122 | ------------
123 | 
124 | 
125 | 1. Reading libraries
126 | Both FASTA/FASTQ files inserted at the -l library file are read, converted and stored in a new file. This new file is used for mapping with Bowtie (step 4), where the new naming of the headers makes it easy to backtrack the original read pair.
127 | 
128 | >read1.1 (read from file 1)
129 | ACGATGCTAT
130 | 
131 | >read1.2 (read from file 2)
132 | ACCGCGCCCC
133 | 
134 | If -x 1 is set, for contig extension, single reads containing only ACGT characters are stored in a new file. The single reads are mapped to contigs at the next step.
135 | 
136 | 2. Mapping when -x 1
137 | To extend contigs, only reads that are not already present on the contigs should be used. Otherwise, reads are re-used, which causes erroneous contigs and also causes reads to map to multiple locations/contigs (step 4). To filter these reads out, Bowtie is used. Bowtie maps the single reads produced at step 1 to the (-s) pre-assembled contigs. A file is generated with reads that did not map to the contigs. The unmapped read file is read into memory, populating a hash table keyed by unique sequence reads with pairing values representing the number of sequence occurrences. The hash is used for contig extension in the next section.
138 | 
139 | 3. Extending when -x 1
140 | Contigs are extended, when -x is set to 1, using the unmapped reads with a method developed by SSAKE. With SSAKE, contig extension is initiated by generating the longest 3'-most word (k-mer) from the unassembled read u that is shorter than the sequence read length l.  Every possible 3'-most k-mer will be generated from u and used in turn for the search until the word length is smaller than a user-defined minimum, m.  Meanwhile, all perfectly overlapping reads will be collected in an array and further considered for 3' extension once the k-mer search is done.  At the same time, a hash table c will store every base along with a coverage count for every position of the overhang (or stretches of bases hanging off the seed sequence u).
141 | 
142 | Once the search is complete, a consensus sequence is derived from the hash table c, taking the most represented base at each position of the overhang.  To be considered for the consensus, each base has to be covered by user-defined -o (set to 20 by default).  If there's a tie (two bases at a specific position have the same coverage count), the prominent base is below a user-defined ratio r, the coverage -o is too low or the end of the overhang is reached, the consensus extension terminates and the consensus overhang is joined to the contig.  All reads overlapping are searched against the newly formed sequence and, if found, are removed from the hash table and prefix tree. If they are not part of the consensus, they will be used to extend other contigs, if applicable.  If no overlapping reads match the newly formed contig, the extension is terminated from that end and SSAKE resumes with a new contig.  That prevents infinite looping through low complexity DNA sequences. In the former case, the extension resumes using the new [l-m] space to search for joining k-mers.
143 | 
144 | The process of progressively cycling through 3'-most k-mer is repeated after every contig extension until nothing else can be done on that side.  Since only left-most searches are possible with a prefix tree, when all possibilities have been exhausted for the 3' extension, the complementary strand of the contiguous sequence generated is used to extend the contig on the 5' end.  The DNA prefix tree is used to limit the search space by segregating sequence reads and their reverse-complemented counterparts by their first eleven 5' end bases.
145 | 
146 | There are three ways to control the stringency in SSPACE:
147 | 1) Disallow contig extension if the coverage is too low (-o). Higher -o values lead to shorter contigs, but minimizes sequence misassemblies.
148 | 2) Adjust the minimum overlap -m allowed between the contig and short sequence reads. Higher m values lead to more accurate contigs at the cost of decreased contiguity.
149 | 3) Set the minimum base ratio -r to higher values
150 | 
151 | After the sequence assembly, a file is generated with .extendedcontigs.fasta extension in the 'intermediate_results' folder. This file contains both extended and non-extended contigs.
152 | 
153 | The next steps are looped through each library, present in the (-l) library file.
154 | 
155 | 4. Mapping unique paired reads
156 | 
157 | At step 1, pairs of each library were filtered. Reads containing N's are unable to correctly map to the contigs, therefore they are not used by Bowtie. Bowtie maps the single reads to the contigs, produced either after extending (if -x 1), or after formatting (if -x 0), or after step 5 if multiple libraries are inserted on -l.
158 | 
159 | Before mapping, contigs are shortened, reducing the search space for Bowtie. Only edges of the contigs are considered for mapping. Cutting of edges is determined by taking the maximal allowed distance inserted by the user in the library file (insert size and insert standard deviation). The maximal distance is insert_size + (insert_size * insert_stdev). For example, with an insert size of 500 and a deviation of 0.5, the maximal distance is 750. In this case, the first 750 bases and the last 750 bases are extracted from the contig sequence.
160 | 
161 | ------------------------------------------
162 |            |                  |                			
163 | ------------                  ------------
164 |    750bp                          750bp
165 | 
166 | This step reduces the search space by merging the two sequences, divided by a 'N' character.
167 | 
168 | The algorithm of mapping goes through each pair and checks its occurrence on the edges of the contigs. If both reads are found, the reads of the pair are stored and contigs could be paired in the next step. Otherwise, it is not stored and the read pair is not used for contig pairing. If a pair is previously found and used for contig pairing, the pair is not considered again. Otherwise, the same links between contigs would be found based on the same read pair, which can generate misleading results.
169 | 
170 | If either of the two reads of a read pair occur on multiple contigs, one can not tell which contig should be paired. For example, the left read occurs at contigs 1 and 3, and the right read at contig 2. For this situation it is impossible to tell if contigs 1 and 2 should be paired, or contigs 1 and 3. Therefore, reads that occur multiple times on contigs are not considered for contig pairing.
171 | 
172 | 5a. Building scaffolds
173 | The final step is scaffolding. SSPACE uses an updated version of the SSAKE scaffolder for this. For each read pair, putative contig pairs (pre-scaffolding stage) are tallied based on the position/location of the paired reads on different contigs. Contig pairs are only considered if the calculated distance between them satisfies the mean distance specified (fourth column in -l file) while allowing for a deviation (fifth column in -l file), also defined by the user. Only contig pairs having a valid gap or overlap are allowed to proceed to the scaffolding stage.
174 | Please note that this stage accepts redundancy of contig pairs (i.e. a given contig may link to multiple contigs, and the number of links (spanning pairs) between any given contig pair is recorded, along with a mean putative gap or overlap(-)).
175 | 
176 | Once pairing between contigs is complete, the scaffolds are built using contigs as seeds. Every contig is used in turn until all have been incorporated into a scaffold.
177 | 
178 | Consider the following contig pairs (AB, AC and rAD):
179 | 
180 |     A         B
181 | ========= ========
182 |   ->       <-
183 |    ->        <-
184 |     ->      <-
185 |        ->       <-
186 | 
187 |     A       C
188 | ========= ======
189 |   ->        <-
190 |     ->        <-
191 | 
192 |    rA        D           equivalent to rDA, in this order
193 | ========= =======
194 |       ->   <-
195 |      ->   <-
196 |        ->   <-
197 | 
198 | Two parameters control scaffolding (-k and -a).  The -k option specifies the minimum number of links (read pairs) a valid contig pair MUST have to be considered.  The -a option specifies the maximum ratio between the best two contig pairs for a given contig being extended.  For example, contig A shares 4 links with B and 2 links with C, in this orientation.  contig rA (reverse) also shares 3 links with D.   When it's time to extend contig A (with the options -k and -a set to 2 and 0.7, respectively), both contig pairs AB and AC are considered.  Since C (second-best) has 2 links and B (best) has 4 (2/4) = 0.5 below the maximum ratio of 0.7, A will be linked with B in the scaffold and C will be kept for another extension. If AC had 3 links the resulting ratio (0.75), above the user-defined maximum 0.7 would have caused the extension to terminate at A, with both B and C considered for a different scaffold.  A maximum links ratio of 1 (not recommended) means that the best two candidate contig pairs have the same number of links -- SSPACE will accept the first one since both have a valid gap/overlap. The above method was adopted from SSAKE. SSPACE improved this method by introducing another method if a contig can link to more than one alternative. Both methods (original SSAKE method and our method) for handling alternatives are explained below;
199 | 
200 | In version 2-0 of SSPACE an additional ratio is used to generate more reliable scaffolds, especially for libraries with large insert sizes. This ratio is used as an additional control for the scaffolding process. A contig with multiple links should satisfy both ratios in order to form a scaffold. The rules for scaffolding contigs with multiple alternative contig connections are explained in more detail below.
201 | 
202 | If a contig can be linked to more than one alternative, connections between these alternatives are searched and linked together if a connection is found. Otherwise a ratio is calculated between the two best alternatives. If this ratio is below a threshold (-a) a connection with the best scoring alternative is established. The two methods are shown below;
203 | 
204 | The first method;
205 | A has 10 links with B
206 | A has 5 links with C
207 | B has 10 links with C;
208 | 
209 | Result is a scaffold containing A-B-C
210 | 
211 | The second method (only used if first method did not produce a scaffold) is based on two ratios. The first ratio (ratio1) is based on the number of links, while the second ratio (ratio2) is based on the number of links and the used search space. This will be explained using an example;
212 | 
213 | If we have an insert size of 450 and contig A has two alternative connections with two contigs (B and C), with the following details;
214 | 
215 | A and B with;
216 | 	gap = 100
217 | 	links = 19
218 | 	size of B is 100bp
219 | 
220 | A and C with;
221 | 	gap = 400
222 | 	links = 9
223 | 	size of C is 1000bp
224 | 
225 | Ratio1 is simply calculated by dividing the number of links of the contig with the lowest number of links by the number of links of the contig with the highest number of links;
226 | 
227 | Here, this is 9/19 (C/B) = 0.47.
228 | 
229 | 
230 | Ratio2 is calculated by incorporating the insert size. SSPACE first determines the amount of search space that was used for searching links.
231 | 
232 | In figure, where each character represents 50bp, this looks something like;
233 | 
234 | 	     <100bp>
235 |            ==(B)
236 | gap=100   /
237 | 	   /
238 | (A)======
239 | 	   \
240 | gap=400   \		
241 |            ------====================(C)
242 | 			     <1000bp>
243 |          *********
244 |       < SEARCH SPACE >
245 | 
246 | Legenda;
247 | * = search space
248 | = = contig
249 | - = gap
250 | 
251 | Now we calculate the used space on contigs (B) and (C) that was used for pairing with contig (A). In principle, this is just calculating the number of nucleotides that fall into the SEARCH SPACE.
252 | For contig B, we can see that the whole contig falls into the SEARCH SPACE. Therefore, the space = 100bp
253 | For contig C, we can see that only the first 50bp of the contig falls into the SEARCH SPACE. Therefore, the space = 50bp.
254 | 
255 | Next, we estimate the number of links per space, by dividing the total number of links with the found space;
256 | For contig B, this is 19 links per 100 bp space = 0.19 links per space
257 | For contig C, this is 9 links per 50 bp space = 0.18 links per space
258 | 
259 | Ratio2 is then calculated by dividing the two numbers; 0.18/0.19 = 0.95. If both ratio1 and ratio2 are below the -a ratio threshold, the scaffold is A-B. Otherwise, no reliable scaffold can be formed and the scaffold extension is stopped.
260 | 
261 | 5b. Left scaffold extension
262 | When a scaffold extension is terminated on one side, the scaffold is extended on the "left", by looking for contig pairs that involve the reverse of the seed (in this example, rD).  With AB and AC having 4 and 2 links, respectively and rD being the only pair on the left, the final scaffolds outputted by SSPACE would be:
263 | 
264 | 1) rD-A-B
265 | 2) C
266 | 
267 | SSPACE outputs a .scaffolds file with linkage information between contigs (see "Understanding the .scaffolds csv file" below)
268 | Accurate scaffolding depends on many factors.  Number and nature of repeats in your target sequence, optimum adjustments of insert size, error, parameters -k and -a and data quality/size of sequence set (more doesn't mean better) will all affect SSPACE's ability to build scaffolds.
269 | 
270 | 
271 | 6. Merging contigs
272 | SSAKE scaffolder produces links between contigs and determines the possible gap between them. For a positive gap, m number of N's will be placed between them if a gap of size m is predicted to occur. When a negative gap is generated, a putative overlap is predicted to occur. The adjacent contigs are searched for overlap within a window given at -n option till 50 bp. If an overlap was found, contigs are merged and the region is marked with lowercase nucleotides. Otherwise, if no overlap was detected, a single "n" will be placed between the contigs. A short overview of this step with three examples;
273 | 
274 | >contig_1
275 | AGCTAGTCGTAGCTTGTAC
276 | >contig_2
277 | ACGTAGTGATATTATTGTC
278 | 
279 | Example 1:
280 | A link between contig_1 and contig_2 is found, with a putative gap of 10. In the final output, the gap is indicated by 10 N's between the two contigs.
281 | 
282 | Link = contig_1 with contig_2. Gap = 10;
283 | AGCTAGTCGTAGCTTGTACNNNNNNNNNNACGTAGTGATATTATTGTC
284 | 
285 | Example 2;
286 | A link between contig_1 and contig_2 is found, with a putative gap of -10. When using the -n 10 option, no overlap was found and a small <n> is inserted between the two contigs.
287 | 
288 | Link = contig_1 with contig_2. Gap = -10. -n = 10;
289 | AGCTAGTCGTAGCTTGTACnACGTAGTGATATTATTGTC
290 | 
291 | Example 3;
292 | A link between contig_3 and contig_4 is found, with a putative gap of -10. When using the -n 10 option, an overlap of 13 nucleotides was found, indicated in lower case in the final output.
293 | 
294 | >contig_3
295 | AGTGTTAGATAGTTATAGA
296 | >contig_4
297 | AGATAGTTATAGAAGTAGT
298 | 
299 | Link = contig_3 with contig_4. Gap = -10. -n = 10;
300 | AGTGTTagatagttatagaAGTAGT
301 | 
302 | TIP: The summary file calculates the mean and median insert size based on mapping of paired reads on a single contig. For more reliable gap and overlap estimation, one may consider to change the insert size in the library file with the calculated mean.
303 | 
304 | 
305 | Input sequences
306 | ---------------
307 | 
308 | FASTA FILES:
309 | >ILLUMINA-52179E_0001:3:1:1062:15216#0/2
310 | ATNGGGTTTTTCAACTGCTAAGTCAGCAGGCTTTTCACCCTTCAACATC
311 | >ILLUMINA-52179E_0001:3:1:1062:4837#0/2
312 | ANNAACTCGTGCCGTTAAAGGTGGTCTTGCATTTCAGAAAGCTCACCAG
313 | 
314 | FASTQ files:
315 | @ILLUMINA-52179E_0001:3:1:1062:15216#0/2
316 | ATNGGGTTTTTCAACTGCTAAGTCAGCAGGCTTTTCACCCTTCAACATC
317 | +ILLUMINA-52179E_0001:3:1:1062:15216#0/2
318 | OOBOLJ[HHO`_aaa`a_]aaaY[`Za[Y[F]]VZWX]WZ^Z^^^O[XY
319 | @ILLUMINA-52179E_0001:3:1:1062:4837#0/2
320 | ANNAACTCGTGCCGTTAAAGGTGGTCTTGCATTTCAGAAAGCTCACCAG
321 | +ILLUMINA-52179E_0001:3:1:1062:4837#0/2
322 | OBBOO^^^^^bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb`bbbb`
323 | 
324 | General points:
325 | -Files present in the -l library file should either be in FASTA or FASTQ format, which is automatically determined by the program. For each paired read, one of the reads should be in the first file, and the other one in the second file. The paired reads are required to be on the same line in both files.
326 | -the header (given after "@" character for FASTQ or ">" for FASTA) of contig and paired-read data files could be of any format. No typical naming convention is needed. Duplicate names are also allowed.
327 | -Quality values of the FASTQ files are not used.
328 | -To be considered, sequences have to be longer than 16 nt or -m (but can be of different lengths).  If they are shorter, the program will simply omit them from the process.
329 | -Reads containing ambiguous bases, like <N> and <.>, and characters other than ACGT will be ignored entirely in input FASTA/FASTQ files inserted with -l option.
330 | -Contigs (inserted with -s option) containing ambiguous bases, like <N> and <.>, and characters other than ACGT are not ignored. However, contigs having these other characters can prevent proper contig extension when they are at the beginning or end of the sequence.
331 | -Spaces in any FASTQ and FASTA file are NOT permitted and will either not be considered or result in execution failure
332 | -For Bowtie, option -v 0 is used, which corresponds to zero mismatches allowed on mapping. In addition bowtie's -m 1 option is used; only reads that map exactly to one contig (both in normal and reverse complement) are outputted. Pairs that are present on multiple contigs are not used for scaffolding. Results are stored in the folder 'bowtieoutput'. For information about Bowtie see http://bowtie-bio.sourceforge.net/ .
333 | 
334 | 
335 | FASTA header of .extendedcontigs.fasta file
336 | ------------
337 | 
338 | e.g.
339 | >extcontig27|size52|read193|cov92.79|seed:PreAssembledCtg0027
340 | 
341 | contig id# = 27, this contig is extended during extension step. If not extended, the contig is named >contig27
342 | size (G) = 52 nt. Size of the contig.
343 | number of reads (N) = 193. Number of reads for extension.
344 | cov [coverage] (C) = 92.79. the coverage (C) is calculated using the total number (T) of consensus bases [sum(L)] provided by the assembled sequences divided by the contig size:
345 | 
346 | C = T / G
347 | seed = PreAssembledCtg0027. Header of the original pre-assembled contig file.
348 | 
349 | Output files
350 | ------------
351 | Each file starts with the basename given at the -b parameter. First, four main files are generated in the current working directory;
352 | 
353 | (basename).final.scaffolds.fasta       :: text file; Final scaffolds produced by SSPACE.
354 | (basename).final.evidence:: text file; Produced scaffolds including the initial numbered contigs.
355 | (basename).logfile       :: text file; Logs execution time / errors
356 | (basename).summaryfile:: text file; Gives a summary after every step. Summary of number of inserted sequences, filtered sequences, contig sequences, mapping stats, pairing stats and contig/scaffold size summaries.
357 | 
358 | 
359 | In addition, four folders are generated, each having a number of files;
360 | 
361 | 'reads' folder;
362 | (basename).(libname).file(libnumber).fasta:: FASTA file; Converted files of the paired-read data, each two consecutive sequences are pairs. This file is used as input for both the contig extension as the scaffolding step.
363 | 
364 | 'bowtieoutput' folder;
365 | Four files are generated by bowtie;
366 | (basename).bowtieIndex.* :: index file; Index files generated by 'bowtie-build'. Produced for each library.
367 | 
368 | For further information about the outputs of Bowtie, see the Bowtie manual ( http://bowtie-bio.sourceforge.net/ ).
369 | 
370 | 
371 | 'pairinfo' folder;
372 | (basename).(libname).pairing_distribution.csv:: comma-separated file; 1st column is the calculated distance for each pair (template) with reads that assembled logically within the same contig.  2nd column is the number of pairs at that distance. Produced for each library.
373 | (basename).(libname).pairing_issues:: text file; Lists all pairing issues encountered between contig pairs and illogical/out-of-bounds pairing. Produced for each library.
374 | 
375 | 'intermediate_results' folder;
376 | (basename).extendedcontigs.fasta   :: FASTA file; All contig sequences. Both extended and non-extended contigs. Extended contigs are named ">ext_contig" , while non-extended are named ">contig" in the header. Only produced when -x 1.
377 | 
378 | (basename).formattedcontigs.fasta   :: FASTA file; Original contig sequences. Formatted to appropriate input for scaffolding. Only produced when -x 0.
379 | 
380 | (basename).(libname).scaffolds :: comma-separated file; see below. Produced for each library.
381 | 
382 | (basename).(libname).scaffolds.fasta :: FASTA file; All merged/unmerged contigs within scaffolds are listed.  The overlap sequence between contigs (>= -n bases) will be shown in lower case within the merged contig.  Note that *perfect* sequence overlap has to occur between 2 predicted adjacent contigs of a scaffold in order to merge. Only merging of two contigs is established if a negative gap is determined. When two consecutive contigs do not physically overlap, then gaps will be padded with Ns of length corresponding to the predicted gap size m (refer to Understanding the .scaffolds csv file below) and predicted but undetected overlaps with a single (n).
383 | 
384 | (basename).(libname).scaffolds.evidence :: text file; Produced scaffolds including the initial numbered contigs (-s option). (refer to Understanding the .evidence file below).
385 | 
386 | (basename).(libname).foundlinks :: text file; Links between the contigs/scaffolds and their correspond gapsize.
387 | 
388 | (basename).(libname).repeats :: text file; Contig-edges having multiple links with other contigs.
389 | 
390 | 
391 | 'dotfiles' folder;
392 | (basename).(libname).visual_scaffolds.dot :: dot file; This file can be used to visualise the contigs orientation and order on the scaffolds. The .dot file can be converted to any format using the GraphViz package using the 'dot' command (www.graphviz.org). Each dotfile is cut into 5mb parts, otherwise the scaffolds can't be converted and visualised properly.
393 | 
394 | 
395 | Understanding the .scaffolds csv file
396 | -------------------------------------
397 | 
398 | scaffold1,7484,f127Z7068k12a0.58m42_f3090z62k7a0.14m76_f1473z354
399 | 
400 | Each column is separated by a comma;
401 | column 1: a unique scaffold identifier
402 | column 2: the sum of all contig sizes that made it to the scaffold/supercontig
403 | column 3: a contig chain representing the layout:
404 | 
405 | e.g.
406 | f127Z7068k12a0.58m42_f3090z62k7a0.14m76_f1473z354
407 | 
408 | means: contig f127 (strand=f/+), size (z) 7068 (Z if contig was used as the seed sequence) has 12 links (k), link ratio of 0.58 (a) with a mean gap of 42nt (m) with forward (f) contig 3090 (size 62) on the right.  If m values are negative, it's just that a possible overlap was calculated using the mean distance supplied by the user and the position of the reads flanking the contig.
409 | Negative m values imply that there's a possible overlap between the contigs. But since the pairing distance distribution usually follows a Normal/Gaussian distribution, some distances are expected to be larger than the median size expected/observed.  In reality, if the exact size was known between each paired-reads, we wouldn't expect much negative m values unless a break occurred during the contig extension (likely due to base errors/SNPs).
410 | 
411 | 
412 | 
413 | Understanding the .scaffolds.fasta file
414 | -------------------------------------
415 | 
416 | scaffold13.1|size84140|tigs14
417 | 
418 | Each column represents;
419 | name of the scaffold
420 | size of the scaffold
421 | number contigs in scaffold
422 | 
423 | Each initial contig inputted at -s option stored in a scaffold is written to the .evidence file. This file is explained below.
424 | 
425 | Understanding the .scaffolds.evidence file
426 | -------------------------------------
427 | 
428 | >scaffold1.1|size9058|tigs5
429 | f_tig5|size728|links12|gaps100
430 | r_tig1|size2726|links10|gaps89
431 | f_tig100|size3687|links4|gaps-46|merged40
432 | f_tig91|size238|links6|gaps392
433 | f_tig120|size1112
434 | 
435 | The first line indicates the scaffold, which is the same as in the .scaffolds.fasta file. Next, for each contig the connection (orientation, links and gaps) with other contigs are given. The second line for example means forward contig 5 with size 728 has 12 links and a gap of 100bp with reverse contig 1. If a line ends with <merged>, it means that the contig has overlap with the next contig, and they are merged. For contig f_tig100, 40 nucleotides had an overlap with contig f_tig91.
436 | 
437 | 
438 | Producing visualisation of scaffolds with .dot file using -p parameter
439 | -------------------------------------
440 | 
441 | To visualize the scaffolds of the .dot file, GraphViz should be downloaded at (www.graphviz.org). GraphViz converts the .dot file to any desired output using the 'dot' function. For example to convert the .dot to a .ps format;
442 | 
443 | dot -Tps2 (basename).(libname).visual_scaffolds.dot -o MYOUTPUT.ps
444 | 
445 | This will produce a postscript (.ps) file. For other options, see the manual of GraphViz.
446 | 
447 | 
448 | 
449 | How does the .tab file work
450 | ---------------------------
451 | 
452 | The .tab file is a tab-delimited file containing information about the positions of the reads on the contigs. On each line, positions of both reads are given.
453 | 
454 | A typical .tab file line looks like;
455 | 
456 | contig1	100	150	contig1	300	250
457 | 
458 | Here, the first read is found at contig1 with start and end at position 100 and 150, respectively. Meaning that the read is found at the positive strand (+).
459 | The second read is found at contig1 at start and end at position 300 and 250, respectively. Meaning that the read is found at the negative strand (-).
460 | 
461 | In figure;
462 |       	      read1    read2
463 | 			---->    <----
464 | contig1 ----------------------------------------------------
465 | 
466 | 
467 | Another line may look like;
468 | 
469 | contig2	300	350	contig3	100	550
470 | 
471 | Here, the first read is found at contig2 with start and end at position 300 and 350, respectively. Meaning that the read is found at the positive strand (+).
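472 | The second read is found at contig3 with start and end at position 100 and 550, respectively. Since the start is smaller than the end, this read is at the positive strand (+); for a read at the negative strand (-), start and end would be turned around.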
473 | 
474 | In figure;
475 |       	            read1
476 | 			      ---->    	  read2
477 | contig2 ------------------------      <----
478 | contig3					-------------
479 | 
480 | Normally, SSPACE parses the output of Bowtie directly to the above format and uses this information to pair the contigs and to determine the insert size. With the .tab format, users can put directly the mapping positions of the reads into SSPACE, which is much faster. Also, this way users can make use of their favorite read mapper and put the results into SSPACE.
481 | 
482 | To work properly, the input contigs (-s option) should have the same name as the contigs in the .tab file, as explained in the MANUAL. Since the TAB file can be used in combination with other TAB files and also FASTA/FASTQ files, as well as multiple libraries, the original mappings should be updated after each library. Therefore, contigs are stored in memory, and their position in scaffolds is updated after each scaffold formation. An example;
483 | 
484 | contig2 (200bp) is linked with contig3 (200bp) with a gap of 10bp
485 | 
486 | 		contig3				contig2
487 | scaf1------------------------NNNNNNNNNN--------------
488 | 
489 | the contigs are then updated to new positions;
490 | -contig3 is at position 1-200 at scaf1
491 | -contig2 is at position 211-410 at scaf1
492 | 
493 | The most commonly used output formats of read mappers are the .sam format and its equivalent binary format .bam. A script is attached in the 'tools' folder in the SSPACE package, which converts .sam/.bam files to .tab format. See the TUTORIAL for an example of how such a process looks.
494 | 
495 | SSPACE does not
496 | --------------
497 | 
498 | -Take into consideration base quality scores.  It is up to the user to process the sequence data before clustering with SSPACE. Python scripts (TQS.py, TQSfastq.py, TQSexport.py) are provided to help trim poor quality bases off Illumina sequences. Refer to TQS.readme and TRIMMING_PAIRED_READS.README included in this distribution (in the ./tools subdirectory) for information on how to run those programs
499 | -Consider sequence reads having any character other than A,C,G,T; it will skip these reads entirely while reading the FASTA file.
500 | -Only input of FASTA or FASTQ is possible. For conversion to these formats use the fq_all2std.pl function in the ./tools directory.
501 | 


--------------------------------------------------------------------------------
/SSPACE_Basic.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | #AUTHOR
  4 | # Marten Boetzer and Walter Pirovano (c) 2010
  5 | # SSAKE-based Scaffolding of Pre-Assembled Contigs after Extension (SSPACE)
  6 | # walter.pirovano@baseclear.com
  7 | 
  8 | #NAME
  9 | #   SSPACE Marten Boetzer - Walter Pirovano November 2011
 10 | 
 11 | #SYNOPSIS
 12 | #   SSAKE-based Scaffolding of Pre-Assembled Contigs after Extension (SSPACE)
 13 | 
 14 | #DOCUMENTATION
 15 | #   README, MANUAL and TUTORIAL distributed with this software @ www.baseclear.com
 16 | #   Boetzer M, Henkel VJ, Jansen HJ, Butler D and Pirovano W. 2011. Scaffolding pre-assembled contigs using SSPACE. Bioinformatics 27(4) p578-9.
 17 | #   http://www.baseclear.com/sequencing/data-analysis/bioinformatics-tools/
 18 | #   We hope this code is useful to you -- Please send comments & suggestions to Walter.Pirovano@baseclear.com
 19 | #   If you use either the SSPACE code or ideas, please cite our work appropriately and accurately
 20 | 
 21 | #LICENSE
 22 | #   SSPACE Copyright (c) 2010-2011 BaseClear B.V. The Netherlands. All rights reserved.
 23 | #   SSAKE Copyright (c) 2006-2010 Canada's Michael Smith Genome Science Centre. All rights reserved.
 24 | 
 25 | #   This program is free software; you can redistribute it and/or
 26 | #   modify it under the terms of the GNU General Public License
 27 | #   as published by the Free Software Foundation; either version 2
 28 | #   of the License, or (at your option) any later version.
 29 | 
 30 | #   This program is distributed in the hope that it will be useful,
 31 | #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 32 | #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 33 | #   GNU General Public License for more details.
 34 | 
 35 | #   note: insert size and distance between pairing reads are used interchangeably
 36 | 
 37 | #MAJOR CHANGES ON SSAKE V3.4 TO FORM SSPACE
 38 | #   -New scaffolding feature dealing with contigs having multiple alternatives
 39 | #   -Separate scripts to decrease the memory usage
 40 | #   -Automatic filtering of reads and duplicate mate pairs
 41 | #   -Option for contig extension on unfiltered reads
 42 | #           -Removed tracking of reads during contig extension
 43 | #           -Mapping single reads to extended and non extended contigs
 44 | #   -Single reads mapped more than once to a contig are removed for scaffolding
 45 | #   -A summary file is generated containing detailed information about the scaffolding process
 46 | #   -An evidence file is generated which indicates the contigs present in the scaffolds
 47 | #   -Optional; Scaffolds and their contigs are visualised by generating a .dot file
 48 | 
 49 | #MAJOR CHANGES ON SSPACE Basic v2.0;
 50 | 
 51 | # GENERAL
 52 | #   -Last column of the library file should be the orientation of the reads, instead of indication of being reverse complement or not. Options are FR, FF, RF and RR.
 53 | #   -Fixed some bugs in the summary file and removed some useless information.
 54 | #   -Included the -z option which specifies the minimal contig length that will be used for scaffolding. Contigs below this length are discarded for scaffolding.
 55 | #   -Included the possibility to include TAB delimited files with read mapping information, format is; ctg1 start1 end1 ctg2 start2 end2
 56 | #             - if a read is reverse complement on a contig, start and end should be turned around e.g. ctg1 100 150 ctg2 150 100 indicates that the second read is reverse complement on ctg2
 57 | #             - No contig filtering can be applied if TAB delimited files are included
 58 | #             - See MANUAL for more information of how to use the tab file option
 59 | #   -Included some scripts to convert a .sam file to a .tab file
 60 | 
 61 | # BOWTIE
 62 | #   -Included the -g option to specify maximum allowed gaps for Bowtie. This option corresponds to the -v option in Bowtie.
 63 | #   -Now able to do multithreaded Bowtie using the -T option (-T 3 does 3 threads). This option corresponds to the -p option in Bowtie.
 64 | 
 65 | # READING FILES:
 66 | #   -Speeded up the reading of the library files for a single threaded run
 67 | #   -Now able to read multiple libraries at once using the multithread -T option. -T 3 reads three files at the same time.
 68 | 
 69 | # CONTIG EXTENSION
 70 | #   -Included the -r option for contig extension (default is 0.9).
 71 | #   -Speeded up and reduced the memory usage during the contig extension.
 72 | #             - SSPACE reads in the output of Bowtie at once, rather than reading it from the output file.
 73 | #             - Faster check for presence of subsequence of a read, thereby able to faster check for overlapping sequences with the contig.
 74 | 
 75 | # SCAFFOLDING
 76 | #   -Combined the functions readBowtie and pairContigs, which saves runtime and memory.
 77 | #   -Saving runtime by reading Bowtie results in at once, instead of reading it from Bowtie's output file.
 78 | #   -Included a pre-filtering step of multiple alternative contig links before scaffolding. This step was previously done during scaffolding, now it's a step before scaffolding. It reduces the number of errors within the scaffolds.
 79 | #   -Additional check to connect two alternative contigs, making the scaffolds more reliable, especially with mate pair libraries. The search space is included in the calculation of the ratio, rather than looking at the number of links only. See the README file for more information.
 80 | #   -Calculation of mean insert size based on mapped read pairs on same contig. Users can choose this value for better estimation of gap sizes. Especially for paired-end sequences.
 81 | 
 82 | #   -Fixed a bug in the mergeContigs function. Indication of contigs merged in previous libraries were not displayed in the final .evidence file.
 83 | 
 84 | #-------------------------------------------------LOAD PACKAGES AND DEFINE VARIABLES
 85 |   use strict;
 86 |   use Storable;
 87 |   use Getopt::Std;
 88 |   use File::Path;
 89 |   use File::Basename;
 90 | 
 91 |   #Specify path to DotLib
 92 |   use FindBin qw($Bin);
 93 |   use lib "$Bin/dotlib/";
 94 |   use DotLib;
 95 | 
  #Getopt::Std stores the value of every switch in the package variable $opt_<letter>;
  #all switches listed below take an argument (each letter is followed by ':')
  use vars qw($opt_m $opt_o $opt_v $opt_p $opt_k $opt_a $opt_z $opt_s $opt_b $opt_n $opt_l $opt_x $opt_u $opt_t $opt_T $opt_g $opt_r);
  getopts('m:o:v:p:k:a:z:s:b:n:l:x:u:t:T:g:r:');
  #Built-in defaults, assigned position by position:
  #  base_overlap=20, min_overlap=32, verbose=0, MIN_READ_LENGTH=16, SEQ_SLIDE=1, min_base_ratio=0.9,
  #  min_links=5, max_link_ratio=0.7, unpaired_file="no-u", max_trim=0, base_name="standard_output",
  #  max_count_trim=10, min_tig_overlap=15, doplot=0, extending=0, threads=1, minContigLength=0,
  #  gaps=0, unpaired=0, gapclosure=0
  my ($base_overlap, $min_overlap, $verbose, $MIN_READ_LENGTH, $SEQ_SLIDE, $min_base_ratio, $min_links, $max_link_ratio, $unpaired_file, $max_trim, $base_name, $max_count_trim, $min_tig_overlap, $doplot, $extending, $threads, $minContigLength, $gaps, $unpaired, $gapclosure) = (20, 32, 0, 16, 1, 0.9, 5, 0.7, "no-u", 0, "standard_output", 10, 15, 0, 0, 1, 0, 0, 0, 0);

  #Version tag shown in the start-up message and the separator line used in the log/summary output
  my $version = "[SSPACE_Basic v2.1]";
  my $seplines = ("-" x 60)."\n";
  my ($MAX, $MAX_TOP, $TRACK_COUNT) = (0, 100, 1);# $MAX_TOP is the very maximum anchoring edge sequence that will be searched
103 | 
104 | #-------------------------------------------------READ OPTIONS
105 | 
106 |   if(!($opt_l) || !($opt_s)){
107 |     print STDERR "ERROR: Parameter -l is required. Please insert a library file\n" if(!$opt_l);
108 |     print STDERR "ERROR: Parameter -s is required. Please insert a contig FASTA file\n" if(!$opt_s);
109 |     my $error_msg = <<"END_MSG";
110 | \nUsage:\n
111 | ============ General Parameters ============\n
112 | -l  Library file containing two paired read files with insert size, error and either mate pair or paired end indication (REQUIRED)\n
113 | -s  FASTA file containing contig sequences used for extension. Inserted pairs are mapped to extended and non-extended contigs (REQUIRED)\n
114 | -x  Indicate whether to extend the contigs of -s using paired reads in -l (-x 1=extension, -x 0=no extension, default -x $extending)\n
115 | ============ Extension Parameters ============\n
116 | -m  Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m $min_overlap)\n
117 | -o  Minimum number of reads needed to call a base during an extension (default -o $base_overlap)\n
118 | -t  Trim up to -t base(s) on the contig end when all possibilities have been exhausted for an extension (default -t $max_trim)\n
119 | -u  FASTA/FASTQ file containing unpaired sequence reads (optional)\n
120 | -r  Minimum base ratio used to accept a overhang consensus base (default -r $min_base_ratio)\n
121 | ============ Scaffolding Parameters ============\n
122 | -z  Minimum contig length used for scaffolding. Filters out contigs below this value (default -z $minContigLength)\n
123 | -k  Minimum number of links (read pairs) to compute scaffold (default -k $min_links)\n
124 | -a  Maximum link ratio between two best contig pairs. *Higher values lead to least accurate scaffolding* (default -a $max_link_ratio)\n
125 | -n  Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n $min_tig_overlap)\n
126 | ============ Bowtie Parameters ============\n
127 | -g  Maximum number of allowed gaps during mapping with Bowtie. Corresponds to the -v option in Bowtie. *Higher number of allowed gaps can lead to least accurate scaffolding* (default -g $gaps)\n
128 | -T  Specify the number of threads in Bowtie. Corresponds to the -p/--threads option in Bowtie (default -T $threads)\n
129 | ============ Additional Parameters ============\n
130 | -b  Base name for your output files (default -b $base_name)\n
131 | -v  Runs in verbose mode (-v 1=yes, -v 0=no, default -v $verbose)\n
132 | -p  Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p $doplot)
133 | END_MSG
134 |     die $error_msg;
135 |   }
136 | 
137 |   my $libraryfile = $opt_l if ($opt_l);
138 |   my $filecontig = $opt_s if($opt_s);
139 |   $extending = $opt_x if($opt_x eq 1);
140 |   $min_overlap = $opt_m if ($opt_m);
141 |   $base_overlap = $opt_o if ($opt_o);
142 |   $max_trim = $opt_t if ($opt_t);
143 |   $unpaired_file = $opt_u if($opt_u);
144 |   $min_base_ratio = $opt_r if ($opt_r);
145 |   $minContigLength = $opt_z if($opt_z);
146 |   $min_links = $opt_k if ($opt_k);
147 |   $max_link_ratio = $opt_a if ($opt_a);
148 |   $min_tig_overlap = $opt_n if($opt_n);
149 |   $gaps = $opt_g if($opt_g);
150 |   $threads = $opt_T if ($opt_T);
151 |   $base_name = $opt_b if($opt_b);
152 |   $verbose = $opt_v if ($opt_v);
153 |   $doplot = $opt_p if($opt_p);
154 | 
155 | #-------------------------------------------------CHECKING PARAMETERS
156 |   die "ERROR: Invalid (-l) library file $libraryfile ...Exiting.\n" if(! -e $libraryfile);
157 |   die "ERROR: Invalid (-s) contig file $filecontig ...Exiting.\n" if(! -e $filecontig);
158 |   die "ERROR: -x must be either 0 or 1. Your inserted -x is $extending...Exiting.\n" if(!($extending == 0 || $extending == 1));
159 |   die "ERROR: -m must be a number between 15-50. Your inserted -m is $min_overlap ...Exiting.\n" if(!($min_overlap =~ /^\d+$/) || $min_overlap < 10 || $min_overlap > 50);
160 |   die "ERROR: -o must be set to 1 or higher. Your inserted -o is $base_overlap ...Exiting.\n" if($base_overlap < 1);
161 |   die "ERROR: -t must be a positive integer. Your inserted -t is $max_trim ...Exiting.\n" if(!($max_trim =~ /^\d+$/));
162 |   die "ERROR: Invalid unpaired file $unpaired_file -- fatal\n" if(! -e $unpaired_file && $opt_u);
163 |   die "ERROR: -r must be a number between 0.0 and 1.0. Your inserted -r is $min_base_ratio ...Exiting.\n" if($min_base_ratio < 0 || $min_base_ratio > 1);
164 |   die "ERROR: -z must be a positive integer. Your inserted -z is $minContigLength...Exiting.\n" if (!($minContigLength =~ /^\d+$/));
165 |   die "ERROR: -k must be a positive integer. Your inserted -k is $min_links ...Exiting.\n" if(!($min_links =~ /^\d+$/));
166 |   die "ERROR: -a must be a number between 0.0 and 1.0. Your inserted -a is $max_link_ratio ...Exiting.\n" if($max_link_ratio < 0 || $max_link_ratio > 1);
167 |   die "ERROR: -n must be a positive integer. Your inserted -n is $min_tig_overlap ...Exiting.\n" if (!($min_tig_overlap =~ /^\d+$/));
168 |   die "ERROR: -g must be a positive integer between 0 and 3. Your inserted -g is $gaps...Exiting.\n" if (!($gaps =~ /^\d+$/) || $gaps > 3);
169 |   die "ERROR: -T must be a positive integer. Your inserted -T is $threads...Exiting.\n" if (!($threads =~ /^\d+$/));
170 |   die "ERROR: -p must be either 0 or 1. Your inserted -p is $doplot...Exiting.\n" if(!($doplot == 0 || $doplot == 1));
171 | 
172 | #-------------------------------------------------check library file;
173 |   open(FILELIB, "< $libraryfile");
174 |   my ($min_allowed, $library, $fileA, $fileB, $insert_size, $insert_stdev, $orientation);
175 |   my $countline=0;
176 |   while(<FILELIB>){
177 |     chomp;
178 |     $countline++;
179 |     my @line = split(/\s+/, $_);
180 |     if($#line >= 0){
181 |       if($opt_l){
182 |         die "ERROR: Line $countline in your library file ($libraryfile) contains $#line spaces, which should be 5 spaces. Check that no spaces are within the file names.\n" if($#line != 5);
183 | 
184 |         my ($library, $fileA, $fileB, $insert_size, $insert_stdev, $orientation) = split(/\s+/, $_);
185 |         if($fileA ne "TAB"){
186 |           die "ERROR: Invalid file in library $library: $fileA -- fatal\n" if(! -e $fileA);
187 |         }else{
188 |           die "ERROR: Can't apply filtering using the -z option (-z = $minContigLength) and insertion of a TAB file -- fatal\n" if($minContigLength > 0);
189 |         }
190 |         die "ERROR: Invalid file in library $library: $fileB -- fatal\n" if(! -e $fileB);
191 |         die "ERROR: Insert size should be higher than or equal to 0. Your library $library has insert size of $insert_size. Exiting.\n" if(!($insert_size>0) || !($insert_size =~ /^\d+$/));
192 |         die "ERROR: Insert stdev must be a number between 0.00 and 1.00. Your library $library has insert size of $insert_stdev. Exiting.\n" if($insert_stdev < 0 || $insert_stdev > 1 || !($insert_stdev * 1 eq $insert_stdev));
193 |         die "ERROR: Orientation must have length of 2 characters and should contain one of the following; FR, FF, FR or RF. Your library $library has orientation of $orientation ...Exiting.\n" if(!(length($orientation) == 2) || !($orientation =~ /[FR][FR]/));
194 |       }
195 |     }
196 |   }
197 |   close FILELIB;
198 | #-------------------------------------------------Make folder structure
  #Working directories, created relative to the current directory
  mkpath('intermediate_results');
  mkpath('pairinfo');
  mkpath('reads');
  mkpath('bowtieoutput');

  #The unpaired reads are only handed on when the -u file exists and extension (-x 1) is requested;
  #otherwise $unpaired keeps its default
  $unpaired = $unpaired_file if (-e $opt_u && $extending == 1);
#-------------------------------------------------Print input parameters
  #Location of the formatted contigs; this variable is later redirected to the newest FASTA file
  my $contig = "intermediate_results/" . $base_name .  ".formattedcontigs.fasta";

  my $log = $base_name . ".logfile.txt";
  my $summaryfile = $base_name.".summaryfile.txt";
  #Both files are opened with '>' so that output of a previous run with the same base name is discarded
  open (LOG, ">$log") || die "Can't write to $log -- fatal\n";

  open (SUMFILE, ">$summaryfile") || die "Can't open $summaryfile -- fatal\n";
  close SUMFILE;

  #Echo the settings of this run to the screen and the log file
  my $init_message =  "Your inserted inputs on $version at ".getDate().":\nRequired inputs: \n\t-l = $libraryfile\n\t-s = $filecontig\n\t-b = $base_name\n\n";
  $init_message .= "Optional inputs:\n\t-x = $extending\n\t-z = $minContigLength\n\t-k = $min_links\n";
  $init_message .=  "\t-a = $max_link_ratio\n\t-n = $min_tig_overlap\n\t-T = $threads\n\t-p = $doplot\n\n";

  #The extension settings are only listed when extension is switched on
  $init_message .= "Contig extension inputs:\n\t-o = $base_overlap\n\t-t = $max_trim\n\t-m = $min_overlap\n\t-r = $min_base_ratio\n\n" if($extending == 1);

  &printMessage($init_message);
  #The log is closed before the helper scripts run
  close LOG;
223 | #-------------------------------------------------READING AND CONVERTING INPUT SEQUENCES
  #Helper scripts are run as separate perl processes. checkStatus() ends the run unless the
  #helper left a 'process_OK' directory behind.
  system("perl $Bin/bin/readLibFiles.pl $libraryfile $base_name $extending $unpaired $min_overlap $threads");
  checkStatus();
#-------------------------------------------------FORMATTING OR EXTENDING CONTIGS
  system("perl $Bin/bin/ExtendOrFormatContigs.pl $contig $base_name $extending $filecontig $MIN_READ_LENGTH $base_overlap $min_overlap $min_base_ratio $max_trim $verbose $Bin $minContigLength $libraryfile $gaps $threads");
  checkStatus();
#--------------------------------------------------UPDATE SUMMARY FILE
  #Reopen both files in append mode
  open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
  open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";

  #write summary of initial contigs
  #$sumfile accumulates the summary text; it is printed to the summary file at the very end of the run
  my $sumfile .= "\nSUMMARY: \n".$seplines."\tInserted contig file;\n";
  $sumfile = &writesummaryfiles($filecontig, "contig", $sumfile);
  #write summary of extended contigs
  my $extended_tig = "intermediate_results/" . $base_name .  ".extendedcontigs.fasta";
  $sumfile .= "\tAfter extension;\n" if($extending);
  $sumfile = &writesummaryfiles($extended_tig, "contig", $sumfile) if($extending);

  #write summary of filtered contigs
  if($minContigLength > 0){
    $sumfile .= "\tAfter filtering (z >= $minContigLength);\n";
    $sumfile = &writesummaryfiles($contig, "contig", $sumfile);
  }else{
    #without filtering, scaffolding starts from the extended contigs when extension was performed
    $contig = $extended_tig if($extending);
  }
  &FlushFiles();
  close LOG;
  close SUMFILE;
251 | 
252 | #--------------------------------------------------GO THROUGH EACH LIBRARY AND SCAFFOLD
253 |   open(FILELIB, "< $libraryfile") || die "Can't open $libraryfile -- fatal\n";
254 |   my ($lib, $fileA, $fileB, $insert_size, $insert_stdev, $pair, $headscaffolds, $prevlib, $mergedtigs, $evidencefile);
255 | 
256 |   while(<FILELIB>){
257 |     chomp;
258 |     &FlushFiles();
259 |     ($lib, $fileA, $fileB, $insert_size, $insert_stdev, $orientation) = split(/\s+/, $_);
260 |     next if($lib eq $prevlib || $lib eq '');
261 | 
262 |     my $tabfile = 0;
263 |     $tabfile = 1 if($fileA eq "TAB");
264 | 
265 |     $prevlib = $lib;
266 |     $min_allowed = -1 * ($insert_stdev * $insert_size);
267 | 
268 |     open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";
269 |     &printMessage("\nLIBRARY $lib\n".$seplines);
270 |     close LOG;
271 | 
272 |     open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
273 |     print SUMFILE "\n\nLIBRARY $lib STATS:\n".("#" x 80),"\n";
274 |     close SUMFILE;
275 | 
276 |     my $scaffold = "intermediate_results/" . $base_name . ".$lib.scaffolds";
277 |     $mergedtigs = "intermediate_results/" . $base_name . ".$lib.scaffolds.fasta";
278 |     my $issues = "pairinfo/" . $base_name . ".$lib.pairing_issues";
279 |     my $distribution = "pairinfo/" . $base_name . ".$lib.pairing_distribution.csv";
280 | 
281 | #-------------------------------------------------MAPPING READ PAIRS USING FILTERED FASTA FILE
282 |     mkpath("tmp.$base_name");
283 | #-------------------------------------------------Scaffold the contigs and generate .scaffold file
284 |     system("perl $Bin/bin/PairingAndScaffolding.pl $Bin $gaps $contig $base_name $issues $distribution $verbose $lib $insert_size $min_allowed $scaffold $min_links $max_link_ratio $orientation $threads") if(!$tabfile);
285 |     system("perl $Bin/bin/PairingAndScaffolding.pl $Bin $gaps $contig $base_name $issues $distribution $verbose $lib $insert_size $min_allowed $scaffold $min_links $max_link_ratio $orientation $threads $tabfile $fileB $filecontig $evidencefile") if($tabfile);
286 |     checkStatus();
287 | 
288 |     #retrieve the contigs that were stored
289 |     my $contigstored = "tmp.$base_name/contigs.stored";
290 |     my $contigs = retrieve("$contigstored");
291 | #-------------------------------------------------Generate .fasta file and .evidence file with scaffolds
292 |     open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";
293 |     ($headscaffolds, $evidencefile) = &mergeContigs($scaffold, $contigs, $mergedtigs, 50, $verbose, $min_tig_overlap,$max_count_trim);
294 |     $contig = $mergedtigs;
295 | #-------------------------------------------------write summary of scaffolds
296 |     $sumfile .= "\tAfter scaffolding $lib:\n";
297 |     $sumfile = &writesummaryfiles($mergedtigs, "scaffold", $sumfile);
298 | 
299 | #-------------------------------------------------
300 |     open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
301 |     print SUMFILE ("#" x 80),"\n";
302 |     close SUMFILE;
303 |     &printMessage("\n$seplines");
304 |     $contigs = (''); undef $contigs;
305 | 
306 |     my $removedir = "tmp.$base_name";
307 |     rmtree([$removedir, 'blurfl/quux']);  #remove 'tmp' folder
308 |   }#END OF LIBRARY LOOP
309 | 
310 |   #-------------------------------------------------END OF LIBRARIES. PRINT SUMMARY TO FILE AND END SESSION
311 |   my $finalfile = $base_name . ".final.scaffolds.fasta";
312 |   my $finalevfile = $base_name . ".final.evidence";
313 | 
314 |   open (EVID, $evidencefile);
315 |   open (FINALEV, "> $finalevfile");
316 |   while(<EVID>){
317 |     print FINALEV $_;
318 |   }
319 | 
320 |   open (SCAF, $mergedtigs);
321 |   open (FINAL, "> $finalfile");
322 |   while(<SCAF>){
323 |     print FINAL $_;
324 |   }
325 | 
326 |   #make .dot file for visualisation
327 |   &visualiseScaffolds($base_name.".visual_scaffolds", $evidencefile) if($doplot);
328 | 
329 |   open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
330 |   &printMessage("\n=>".getDate().": Creating summary file\n");
331 |   print SUMFILE $sumfile.$seplines;
332 |   my $time = (time - $^T);
333 |   my $minutes = int ($time / 60);
334 |   $time = $time % 60;
335 |   &printMessage(("*" x 50)."\n\nProcess run succesfully on ".getDate()." in $minutes"." minutes and $time"." seconds\n\n\n");
336 |   close SCAF;
337 |   close FINAL;
338 |   close EVID;
339 |   close FINALEV;
340 |   close LOG;
341 |   close SUMFILE;
342 |   #END OF MAIN PROGRAM
343 | 
344 | ###MAKE A .FASTA FILE OF THE FOUND SCAFFOLDS. EITHER MERGE TWO CONTIGS WHEN A OVERLAP OF -n EXISTS OR PLACE A GAP
sub mergeContigs{
   #Arguments:
   #  $scaffold        - scaffold file to read; one scaffold per line, comma separated. Field 0 is the
   #                     scaffold name, field 2 the chain of contigs joined by '_'. Every contig looks like
   #                     <f|r><number>z<size> optionally followed by k<links> and m<estimated distance>
   #  $contigs         - hash reference; $contigs->{<number>}{'seq'} is the sequence of a contig
   #  $mergedtigs      - FASTA file to write the scaffold sequences to
   #  $chunk           - number of bases at the contig ends that are inspected for an overlap
   #  $verbose         - print progress details when true
   #  $min_tig_overlap - minimum overlap length required to merge two adjacent contigs
   #  $max_count_trim  - maximum number of bases that may be trimmed while searching an overlap
   #Also reads the file-level $headscaffolds (evidence lines produced for the previous library).
   #Returns a hash reference with the evidence lines per scaffold (keyed by the scaffold counter)
   #and the name of the evidence file that was written.

   my ($scaffold, $contigs, $mergedtigs, $chunk, $verbose,$min_tig_overlap,$max_count_trim) = @_;

   &printMessage("\n=>".getDate().": Merging contigs and creating FASTA file of scaffolds\n");

   open(IN,$scaffold) || die "can't read $scaffold -- fatal\n";

   #The evidence file gets the name of the FASTA file with the extension replaced. Only a literal
   #'.fasta' at the very end is replaced (the former pattern /.fasta/ could hit 'fasta' anywhere in
   #the path); if there is no such extension the suffix is appended so that the two output files
   #can never be the same file.
   my $evidence_file = $mergedtigs;
   $evidence_file .= ".evidence" unless($evidence_file =~ s/\.fasta$/.evidence/);
   open(SCAFS,">$evidence_file") || die "can't write to $evidence_file -- fatal\n";
   open(OUT,">$mergedtigs") || die "can't write to $mergedtigs -- fatal\n";
   #number of evidence entries of the previous library; 0 for the first library
   my $scafhashcount = keys ( %$headscaffolds );
   my $scaffoldHashStart;
   my ($tot,$sct,$ct_merge, $step) = (0,0,0,100);
   while(<IN>){### each line is a scaffold
      chomp;
      my @a = split(/\,/);
      my @tig;

      if($a[2]=~/\_/){
         @tig = split(/\_/,$a[2]);
      }else{
         push @tig, $a[2];
      }
      #progress counter on the screen, updated every 100 scaffolds
      if(++$sct == $step){
        CounterPrint($sct);
        $step = $step + 100;
      }
      my ($ct,$tigsum,$mct,$prev,$word,$template,$seq,$prevseq,$headconcat,$prevEstimatedDistance, $prevLinks) = (0,0,0,"NA","NA","NA","","","","");
      foreach my $t (@tig){### each contig
         $ct++;

         if($t=~/([fr])(\d+)z(\d+)(\S+)?/i){

            my $orient = $1;
            my $tnum=$2;
            my $head = $orient . $tnum;
            my $search = "tig" . $tnum;
            my $other = $4;
            $tot+= $3;
            $tigsum +=$3;

            #number of links and estimated distance to the next contig, when present
            my ($estimatedDistance, $links) = ("", "");
            $estimatedDistance = $1 if($other=~/m((\-)?\d+)/);
            $links = $1 if($other=~/k((\-)?\d+)/);
            print "\tSC $a[0] - TIG $ct.  pattern: $t search: $search totalTigSize: $tot Orientation: $orient Gap/Overlap estimated distance: $estimatedDistance\n" if($verbose);

            my $count_trim = 0;

            $seq = $contigs->{$tnum}{'seq'};
            $seq = reverseComplement($seq) if($orient eq "r");
            chomp $seq;
            #evidence line(s) describing this contig
            my $prev;
            if($scafhashcount >0){
              #the contig is a scaffold of the previous library: take over its evidence lines
              $prev = $headscaffolds->{$tnum}{'head'};
              $prev =~ s/^\n//;
              chomp $prev;
              delete $headscaffolds->{$tnum};
              chomp $prev;
              if($orient eq "r"){  ###Reverse all contigs if the whole scaffold is a reverse complement. ftig -> rtig and rtig -> ftig
                my @prevarray = split("\n", $prev);
                if($#prevarray >=0){
                  my $newprev="";
                  my ($tnum, $sizetig, $links, $gap, $prevline, $merge) = ("","","","","","");
                  #walk the lines from last to first; the link/gap/merge information of a line
                  #is attached to the line that now precedes it
                  for(my $i = $#prevarray; $i >= 0; $i--){

                    my @info = split(/\|/, $prevarray[$i]);
                    if($#info eq 1){
                      ($tnum, $sizetig) = split(/\|/, $prevarray[$i]);
                    }else{
                      ($tnum, $sizetig, $links, $gap, $merge) = split(/\|/, $prevarray[$i]);
                    }
                    $tnum =~ tr/fr/rf/;
                    if($prevline ne ""){
                      $newprev .= $prevline."|".$links."|".$gap."\n" if($merge eq "");
                      $newprev .= $prevline."|".$links."|".$gap."|".$merge."\n" if($merge ne "");
                    }
                   $prevline = $tnum."|".$sizetig;
                  }
                  $newprev .= $prevline;
                  $prev = $newprev;
                }
              }
            }
            else{
              $prev = "$orient"."_$search|size".length($seq);
            }
              $prev .= "|links$links|gaps$estimatedDistance" if($links ne "");


            #print "$prev\n";
            if($word ne "NA"){
               #not the first contig of the scaffold: try to merge it with the sequence built so far
               #####
               if(length($seq)<=$chunk){
                  $template = $seq;
               }else{
                  $template = substr($seq,0,$chunk);
               }

               ##### word search
               #Only attempted when the previous contig reported a distance of 0 or less. The end of the
               #previous contig ($word) is shortened from the left until it occurs in the start of this
               #contig; once it gets shorter than $min_tig_overlap one more base is trimmed from the
               #right end of $word and the search restarts, at most $max_count_trim times.
               my $dynamic_word = $word;
               if($prevEstimatedDistance <= 0){
                 SCAN:
                 until($template =~ /$dynamic_word/){
                   $dynamic_word = substr($dynamic_word,1,length($dynamic_word));
                   if(length($dynamic_word) < $min_tig_overlap){
                     $count_trim++;
                     last SCAN if($count_trim >= $max_count_trim);
                     $dynamic_word = substr($word,0,length($word)-$count_trim);
                   }
                 }
               }
               if($prevEstimatedDistance <= 0  && $seq =~ /^\S{0,$max_count_trim}$dynamic_word(.*)/){### will grab the left-most match which is ok
                  my $tail = $1;
                  my $all = "ERROR_";
                  #keep everything in front of the last occurrence of the overlap in the previous sequence
                  while($prevseq =~ /^(.*)$dynamic_word/ig){
                     $all = $1;
                  }
                  print "$prevseq **** $all **** WORD:$word *** DWord:$dynamic_word *** COUNTTRIM:$count_trim\n" if($all=~/ERROR/);

                  #the overlapping bases are written in lower case
                  $prevseq = $all . lc($dynamic_word) . $tail;
                  my $overlap = length($dynamic_word);
                  $ct_merge++;
                  print "$ct_merge. GROUNDS FOR MERGING ($overlap nt overlap) !!!\n" if($verbose);
                  $headconcat .= "|merged$overlap"."\n".$prev;
               }else{
                  ### ADDED RLW 5.MAR.2010
                  #no overlap found: join with a single 'n', or with as many N's as the estimated gap
                  if($prevEstimatedDistance <= 0){
                     $prevseq .= "n" . $seq
                  }else{
                     $prevseq .= ("N" x $prevEstimatedDistance) . $seq;
                  }
                  $headconcat .= "\n".$prev;

               }
            }else{
               #first contig of the scaffold
               $prevseq = $seq;
               $headconcat = "\n".$prev;
               $mct++;
            }

            ##### For the next search
            if(length($seq)<=$chunk){
               $word = $seq;
            }else{
               $word = substr($seq,length($seq)-$chunk,$chunk); ### this will be the next word to search with
            }
            $prevEstimatedDistance = $estimatedDistance;
            $prevLinks = $links;
         }#tig regex

      }#each tig
      my $scsz = length($prevseq);
      $scaffoldHashStart->{$sct}{'head'} = $headconcat;

      #$headconcat starts with a newline, so the last index of @line equals the number of evidence lines
      my @line = split(/\n/, $headconcat);
      print SCAFS ">$a[0]|size$scsz|tigs".($#line)."$headconcat\n\n";
      print OUT ">$a[0]|size$scsz\n$prevseq\n";
      $prevseq = '';
   }
   close IN;
   close SCAFS;
   close OUT;
   CounterPrint("                ");
   undef $contigs;
   &FlushFiles();
   return ($scaffoldHashStart, $evidence_file);
}
514 | ###WRITE SUMMARY STATISTICS FOR ALL CONTIGS OR SCAFFOLDS
sub writesummaryfiles{
  #Appends size statistics of a FASTA file to the summary text.
  #Arguments:
  #  $input_file - FASTA file to summarise
  #  $insert     - name of the kind of sequence ("contig" or "scaffold"), used in the labels
  #  $sumfile    - summary text collected so far
  #Returns $sumfile extended with the number of sequences, total length, number of N's,
  #maximum, minimum and average length and the N50.
  #Dies when the input file can not be opened.
  my ($input_file, $insert, $sumfile) = @_;

  open (my $infile, "<", $input_file) || die "Can't open input file $input_file.\n";

  my ($counter, $sum, $seq, $totalNcount) = (0, 0, "", 0);
  my @lengths;
  while (<$infile>) {
    s/\r\n/\n/;
    chomp;
    #a sequence is finished by the next header line or by the last line of the file;
    #the last line still belongs to the sequence and is added before it is counted
    my $last_line = eof($infile);
    $seq .= $_ if($last_line);
    if (/^[>]/ || $last_line) {
      if($counter > 0){
         push(@lengths, length($seq));
         $sum += length($seq);
         my $Ncount = () = $seq =~ /[Nn]/g;
         $totalNcount += $Ncount;
         $seq = "";
      }
      $counter++;
    }
    else {
       $seq .= $_;
    }
  }
  close ($infile);

  #the end of the file was counted as well
  $counter--;
  $counter = 0 if($counter < 0);   #empty file
  my $half_length = $sum/2;

  #N50: length of the sequence at which the running total of the lengths, sorted from
  #long to short, reaches half of the total length
  my @sorted = sort { $b <=> $a } @lengths;
  my ($foundN50, $sumN50) = (0, 0);
  foreach my $len (@sorted){
    $sumN50 += $len;
    if($sumN50 >= $half_length){
      $foundN50 = $len;
      last;
    }
  }

  #a file without sequences must not end the run with a division by zero
  my $average = ($counter > 0) ? int($sum/$counter) : 0;

  $sumfile .= "\t\tTotal number of $insert"."s = $counter\n";
  $sumfile .= "\t\tSum (bp) = ". $sum. "\n";
  $sumfile .= "\t\t\tTotal number of N's = $totalNcount\n";
  $sumfile .= "\t\t\tSum (bp) no N's = ". ($sum-$totalNcount)."\n";
  $sumfile .= "\t\tMax $insert size = ". $sorted[0]."\n";
  $sumfile .= "\t\tMin $insert size = ". $sorted[-1]."\n";
  $sumfile .= "\t\tAverage $insert size = ".$average."\n";
  $sumfile .= "\t\tN50 = ". $foundN50. "\n\n";

  return $sumfile;
}
567 | 
568 | 
569 | ###FUNCTION TO GENERATE A VISUALISATION OF THE SCAFFOLDS AND THEIR CONTIGS IN .DOT FORMAT
sub visualiseScaffolds{
   #Writes the scaffolds described in an evidence file as clusters in .dot files.
   #Arguments:
   #  $dotname  - base name of the .dot files; they are written as dotfiles/<dotname>.part<N>.dot
   #  $evidence - evidence file to read
   #A new part is started at a scaffold boundary once the current file is larger than $sizecutoff bytes.
   my ($dotname, $evidence) = @_;
   my ($filext, $sizecutoff) = (1, 5000000);
   mkpath('dotfiles');
   my $filename2 = "dotfiles/$dotname.part".$filext.".dot";
   &printMessage("\n=>".getDate().": Producing .dot file for visualisation\n");

   open(IN,$evidence) || die "can't read $evidence -- fatal\n";
   open(DOT, ">$filename2") || die "can't open $filename2 -- fatal\n";
   printHeader(\*DOT, undef);
   my ($prevtig, $prevgap, $prevlinks, $scafcount) = ("","","",0);
   while(<IN>){
     chomp;
     my $line = $_;

     if ($line =~ /^[>]/){
       #header of a scaffold: close the previous cluster and start a new one
       endCluster(\*DOT) if($scafcount > 0);
       my $filesize = -s $filename2;
       if($filesize > $sizecutoff){
         printFooter(\*DOT);
         close(DOT);
         $filext++;
         #every part belongs in the 'dotfiles' folder (parts after the first used to be
         #written to the current directory)
         $filename2 = "dotfiles/$dotname.part".$filext.".dot";
         open(DOT, ">$filename2") || die "can't open $filename2 -- fatal\n";
         printHeader(\*DOT, undef);
       }
       $scafcount++;
       $line =~ tr/[>\|]/ /;
       startCluster(\*DOT, $scafcount, "$line");
       ($prevtig, $prevgap, $prevlinks) = ("","","");
     }
     elsif($line =~ /^[fr]/){
        #contig line: <orientation>_<contig>|<size>, optionally followed by |links<n>|gaps<n>
        my @info = split(/\|/, $line);
        my ($tnum, $sizetig, $links, $gap);
        if($#info eq 1){
          ($tnum, $sizetig) = split(/\|/, $line);
        }else{
          ($tnum, $sizetig, $links, $gap) = split(/\|/, $line);
        }
        my ($orient, $tig) = split(/_/,$tnum);
        my $ori=-1;
        #only the numbers behind the 'gaps' and 'links' labels are needed
        my (undef, $gap2) = split(/gaps/,$gap);
        my (undef, $links2) = split(/links/,$links);
        $ori = 1 if($orient eq "f");
        printNode(\*DOT, $tig, "$tig ($sizetig)", $ori);
        #the edge to this contig is labelled with the gap and links stored on the previous contig line
        printEdge(\*DOT, $prevtig, $tig, "gap = $prevgap links = $prevlinks", undef) if($prevtig ne "");

        $prevtig = $tig;
        $prevgap = $gap2;
        $prevlinks = $links2;
     }
   }
   endCluster(\*DOT) if($scafcount > 0);
   printFooter(\*DOT);
   close(DOT);
   close IN;
}
628 | 
629 | 
630 | ###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
sub reverseComplement{
   #Returns the reverse complement of the given sequence. Only the upper case bases
   #A, T, G and C are complemented; every other character is reversed unchanged.
   #A lexical copy is used so that the caller's $_ is left alone, and 'scalar' makes
   #sure the string is reversed in list context too (a bare reverse() returns an
   #empty list there).
   my $seq = shift;
   $seq =~ tr/ATGC/TACG/;
   return scalar reverse($seq);
}
636 | 
637 | ###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
sub printMessage{
  #Writes the message to the currently selected output handle (normally the screen)
  #and to the LOG file handle
  my ($text) = @_;
  print $text;
  print LOG $text;
}
643 | 
644 | ###FUNCTION TO GET THE CURRENT DATE
sub getDate{
  #Current local date and time as one string (localtime in scalar context)
  return scalar(localtime);
}
649 | 
650 | ###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
sub CounterPrint{
  #Shows a counter on the current output line: the carriage return moves the cursor
  #to the start of the line so that the previous value is overwritten
  my ($shown) = @_;
  print "\r" . $shown;
  $|++;   #switch on autoflush of the selected handle
}
656 | 
657 | ###FLUSHES THE SUMMARY AND LOG FILE
sub FlushFiles{
  #Switches on autoflush for the summary and the log file handle and after that for
  #the currently selected handle; the selected handle itself is not changed
  foreach my $handle (\*SUMFILE, \*LOG){
    my $selected = select($handle);
    $| = 1;
    select($selected);
  }
  $|++;
}
663 | #########END MAIN SCRIPT
664 | 
665 | 
sub checkStatus{
  #Checks whether the helper script that just ran finished properly. A helper creates the
  #directory 'process_OK' as its last step; when it is missing the failure is reported and
  #the program exits with status 1, otherwise the marker is removed for the next helper.
  if(!(-d "process_OK")){
    &printMessage(("*" x 50)."\n\nProcess failed on ".getDate()."\n\n\n");
    exit 1;
  }
  #only the marker is removed (the former call also deleted a directory named
  #'blurfl/quux', a placeholder taken from the File::Path documentation)
  rmtree("process_OK");
}
670 | 


--------------------------------------------------------------------------------
/SSPACE_Basic_v2.0.pl:
--------------------------------------------------------------------------------
1 | SSPACE_Basic.pl


--------------------------------------------------------------------------------
/bin/ExtendOrFormatContigs.pl:
--------------------------------------------------------------------------------
  1 |   ###############################################################
  2 |   #Marten Boetzer 1-03-2010                                    #
  3 |   #SSPACE perl subscript ExtendOrFormatContigs.pl               #
  4 |   #This script, based on the the -x parameter;                  #
  5 |   #  -Formats the contigs to appropriate format (-x 0)          #
  6 |   #  -Extends the contigs with available unmapped reads (-x 1)  #
  7 |   ###############################################################
  8 | 
  9 |   use strict;
 10 |   use File::Basename;
 11 |   use File::Path;
 12 | 
  my ($MAX, $MAX_TOP, $TRACK_COUNT) = (0, 100, 1);

  my $seplines = ("-" x 60)."\n";

  #Positional command line arguments; the argument at index 10 is not read by this script
  my $contig = $ARGV[0];
  my $base_name = $ARGV[1];
  my $extending = $ARGV[2];
  my $filecontig = $ARGV[3];
  my $MIN_READ_LENGTH = $ARGV[4];
  my $base_overlap = $ARGV[5];
  my $min_overlap = $ARGV[6];
  my $min_base_ratio = $ARGV[7];
  my $max_trim = $ARGV[8];
  my $verbose = $ARGV[9];
  my $minContigLength = $ARGV[11];
  my $libraryfile = $ARGV[12];
  my $gaps = $ARGV[13];
  my $threads = $ARGV[14];

  my $log = $base_name . ".logfile.txt";
  my $summaryfile = $base_name.".summaryfile.txt";

  #Both files are opened in append mode, so existing content is kept
  open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
  open (LOG, ">>$log") || die "Can't write to logfile$log -- fatal\n";
  my $filenameOutExt = $base_name . ".singlereads.fasta";
  my ($bin);
  #With extension (-x 1) the contigs are extended first and only formatted afterwards when a
  #minimum contig length is set; without extension the contigs are always formatted
  if($extending == 1){

    &ExtendContigs($base_name, $filecontig, $filenameOutExt);
    print SUMFILE "\n" if($minContigLength > 0);
    &FormatContigs() if($minContigLength > 0);
  }else{
    &FormatContigs();
  }

  close SUMFILE;
  close LOG;

  #Marker directory that is created as the last step, i.e. only when everything above did not die
  mkpath('process_OK');
 52 | #--------------------------------------------------
 53 | 
 54 | ###EXTEND CONTIGS WITH UNMAPPED READS
 55 | sub ExtendContigs{
 56 |   my ($base_name, $filecontig, $filenameOutExt) = @_;
 57 |   my ($seq);
 58 |   #-------------------------------------------------NOW MAP SINGLE READS TO INITIAL CONTIGS FILE.
 59 |   my $readfile = "reads/" . $filenameOutExt;
 60 |   &getUnmappedReads($filecontig, $readfile);
 61 |   #-------------------------------------------------CONTIG EXTENSION USING UNMAPPED PAIRS STORED IN $SET
 62 |   &printMessage("\n=>".getDate().": Contig extension initiated\n");
 63 |   my $outfileTig =  "intermediate_results/" . $base_name .  ".extendedcontigs.fasta";
 64 | 
 65 |   open (TIG, ">$outfileTig") || die "Can't write to $outfileTig -- fatal\n";
 66 |   #--------------------------------------------ASSEMBLY START
 67 | 
 68 |  ASSEMBLY:
 69 |    open(IN, $filecontig) || die "Can't open $filecontig -- fatal\n";
 70 |    my ($exttig_count, $counter, $NCount, $orig_mer, $prevhead) = (0, 0, 0, 0, '');
 71 |    while(<IN>){
 72 |       s/\r\n/\n/;
 73 |       chomp;
 74 |       $seq.= uc($_) if(eof(IN));
 75 |       if (/\>(\S+)/ || eof(IN)){
 76 |          my $head=$1;
 77 |          $orig_mer = length($seq);
 78 |          if($seq ne ''){
 79 |              $NCount++ if($seq=~/([NX])/i);
 80 |              my $start_sequence = uc($seq);
 81 |              my $reads_needed = 1;                        #tracks coverage
 82 |              my $total_bases = $orig_mer * $reads_needed;
 83 | 
 84 |              ($seq, $reads_needed, $total_bases) = doExtension("3", $orig_mer, $seq, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $counter, $max_trim) if($orig_mer >= $MIN_READ_LENGTH && $orig_mer >= $min_overlap);
 85 | 
 86 |              my $seqrc = reverseComplement($seq);
 87 |              ($seqrc, $reads_needed, $total_bases) = doExtension("5", $orig_mer, $seqrc, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $counter, $max_trim) if($orig_mer >= $MIN_READ_LENGTH && $orig_mer >= $min_overlap);
 88 | 
 89 |              my $leng = length($seqrc);
 90 |              my $reversetig = reverseComplement($seqrc);                   ### return to sequence, as inputted
 91 |              if($leng > $orig_mer){ ### commented out: && $start_sequence ne $seqrc && $start_sequence ne $reversetig
 92 |                my $cov =  $total_bases / $leng;
 93 |                printf TIG ">extcontig%i|size%i|read%i|cov%.2f|seed:$prevhead\n%s\n", ($counter, $leng, $reads_needed, $cov, $reversetig);    #print contigs to file
 94 |                $exttig_count++;
 95 |              }else{
 96 |                my $cov = $reads_needed = 0;
 97 |                my $singlet_leng = length($start_sequence);
 98 |                printf TIG ">contig%i|size%i|read%i|cov%.2f|seed:$prevhead\n%s\n", ($counter, $leng, $reads_needed, $cov, $reversetig);    #print singlets to file
 99 |              }
100 |          }
101 |          CounterPrint(++$counter);
102 |          $prevhead = $head;
103 |          $seq='';
104 |       }else{
105 |          $seq .= uc($_);
106 |       }
107 |    }
108 |   CounterPrint("                ");
109 |   print SUMFILE "\tNumber of contig sequences =".($counter-1). "\n";
110 |   print SUMFILE "\t\tNumber of contigs containing N's (may prevent proper contig extension) = $NCount\n";
111 | 
112 |   print SUMFILE "\tNumber of contigs extended = $exttig_count\n".$seplines;
113 |   close IN;
114 |   $filecontig = $outfileTig;
115 |   if($@){
116 |      my $message = $@;
117 |      &printMessage("\nSomething went wrong running $0 ".getDate()."\n$message\n");
118 |   }
119 |   close TIG;
120 | }
121 | 
###STORE CONTIGS TO APPROPRIATE FORMAT WHEN CONTIGS WILL NOT BE EXTENDED
# Reads the FASTA file named by the file-level $filecontig and rewrites every
# contig whose length is at least $minContigLength to the file named by the
# file-level $contig. Each record gets a header of the form
# ">contig<N>|size<len>|read0|cov0.00|seed:<original header>" followed by the
# upper-cased sequence on a single line. Takes no arguments.
# The original header is passed to printf as a %s argument instead of being
# interpolated into the format string, so a '%' in a FASTA header can no
# longer be misread as a conversion. A failing close of the output file
# (e.g. a full disk) is now fatal instead of silently ignored.
sub FormatContigs{
   &printMessage("\n=>".getDate().": Storing contigs to format for scaffolding\n");
   open (TIG, ">$contig") || die "Can't write to $contig -- fatal\n";
   open(IN, $filecontig) || die "Can't open $filecontig -- fatal\n";
   my ($counter, $seq, $prevhead, $step) = (0, '', '', 100);
   while(<IN>){
      s/\r\n/\n/;
      chomp;
      #the last line of the file is appended before the final record is flushed
      $seq.= uc($_) if(eof(IN));
      #a header line (or the end of the file) closes the record collected so far
      if (/\>(\S+)/ || eof(IN)){
        my $head=$1;
        my $length_seq = length($seq);
        if($seq ne '' && $length_seq >= $minContigLength){
          #progress is shown every 100 stored contigs
          if(++$counter == $step){
            CounterPrint($counter);
            $step = $step + 100;
          }
          printf TIG ">contig%i|size%i|read%i|cov%.2f|seed:%s\n%s\n", ($counter, $length_seq, 0, 0.00, $prevhead, $seq);
        }
        $prevhead = $head;
        $seq = '';
      }else{
         $seq .= uc($_);
      }
   }
   CounterPrint("                ");
   close IN;
   close(TIG) || die "Can't finish writing $contig -- fatal\n";
}
152 | 
###EXTEND CONTIGS
# Extends the end of $seq with reads stored in the file-level $bin hash.
# Parameters:
#   $direction      - label ("3" or "5" at the call sites); only used in verbose messages
#   $orig_mer       - length of the seed sequence; capped to $MAX below
#   $seq            - sequence to extend; bases are appended to its end
#   $reads_needed   - running number of reads used so far
#   $total_bases    - running number of bases of the reads used so far
#   $min_overlap    - smallest contig suffix that is searched for in the reads
#   $base_overlap   - minimum coverage required at an overhang position
#   $min_base_ratio - minimum share of the most frequent base at a position
#   $verbose        - when true, progress details are printed
#   $tig_count      - not used inside this sub
#   $max_trim       - number of times one base may be cut off the end to retry
# Returns the list ($seq, $reads_needed, $total_bases) with updated values.
sub doExtension{

   my ($direction, $orig_mer, $seq, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $tig_count, $max_trim) = @_;

   my $previous = $seq;
   my ($extended, $trim_ct) = (1, 0);

   if($orig_mer > $MAX){$orig_mer=$MAX;}  ### Deals with special cases where the seed sequences are different from the read set (and possibly very large) - goal here is not to increase sequence coverage of seed, but rather to extend it.

   # outer loop: one pass per allowed trimming round
   TRIM:
   while($trim_ct <= $max_trim){
      # inner loop: keep extending for as long as the last round added bases
      while($extended){

         my ($pos, $current_reads, $current_bases, $span) = (0, 0, 0, "");

         ### Added 19March08
         if(length($seq) >= $MAX){   # $seq is length of contig being extended -- if larger than largest read, make sure the largest read could align and all subsequent rds.
            $span = $MAX - $TRACK_COUNT;
         }else{
            $span = length($seq) - $TRACK_COUNT;
         }
         my $startspan = $span;
         # $overhang->{position}{base} = summed read count for that base
         my $overhang = {};
         my @overlapping_reads = ();
         # pre-fill the counts with 0 for positions 1 .. 2 * $orig_mer
         for (my $x=1;$x <= ($orig_mer * 2);$x++){
            ($overhang->{$x}{'A'}, $overhang->{$x}{'C'}, $overhang->{$x}{'G'}, $overhang->{$x}{'T'}) = (0, 0, 0, 0);
         }

         ### COLLECT SEQUENCES
         while ($span >= $min_overlap){  # will slide the subseq, until the user-defined min overlap size

            $pos = length($seq) - $span;
            print "MAX:$MAX, SPAN:$span, POS:$pos" if ($verbose);

            my $subseq = substr($seq, $pos, $span);              #make a sub-sequence of length l-(1..i) for searching
            my $sub = substr($subseq, 0, 10);                    #grab first 10 nucleotides and get all reads having this subset stored in $bin
            my $subset = $bin->{$sub};                           #Will grab everything even the reverse complement ones
            print "####$direction' SEARCH Position:$pos Span:$span - Subseq:$subseq Previous:$previous\n" if ($verbose);
            ### SEARCH -- this cycles through limited k-mer space
            foreach my $pass (keys %$subset){
               # only reads that start with the whole contig suffix are used
               my $pos = index($pass, $subseq);
               if($pos==0){
                  # the part of the read that sticks out beyond the contig end
                  my $dangle = substr($pass, $pos+length($subseq));
                  #can we align perfectly that subseq to another rd start?
                  print "\n", "=" x 80, "\n$direction'- FOUND sequence: $pass -> subset: $subseq -> overhang: $dangle\n", "=" x 80, "\n\n" if ($verbose);

                  # Collect all overhangs
                  push @overlapping_reads, $pass;                  ### all overlapping reads
                  my @over = split(//, $dangle);
                  my $ct_oh = 0;

                  foreach my $bz(@over){
                    $ct_oh++;                                     ### tracks overhang position passed the seed
                    # weight each base by the number of times the read was seen
                    $overhang->{$ct_oh}{$bz} += $bin->{$sub}{$pass};
                    print "$ct_oh - $bz = $overhang->{$ct_oh}{$bz}\n" if($verbose);
                  }
               }
            }
            $span--;
         }#while overlap >= user-defined -m minimum

         my $consensus = "";
         print "Finished Collecting Overlapping Reads - BUILDING CONSENSUS...\n" if ($verbose);
        # print Dumper(@overlapping_reads) if ($verbose);

         ### Build consensus
         # walk the overhang positions in ascending order and stop at the first
         # position that is under-covered or has no clear best base
         CONSENSUS:
         foreach my $ohpos (sort {$a<=>$b} keys %$overhang){
            if($ohpos){

               my $coverage = $overhang->{$ohpos}{'A'}+$overhang->{$ohpos}{'C'}+$overhang->{$ohpos}{'G'}+$overhang->{$ohpos}{'T'};
               print "pos:$ohpos cov:$coverage A:$overhang->{$ohpos}{'A'} C:$overhang->{$ohpos}{'C'} G:$overhang->{$ohpos}{'G'} T:$overhang->{$ohpos}{'T'}\n" if($verbose);
               if ($coverage < $base_overlap){
                  print "COVERAGE BELOW THRESHOLD: $coverage < -o $base_overlap @ $ohpos :: will extend by: $consensus\n" if ($verbose);
                  last CONSENSUS;
               }
               my $baselist = $overhang->{$ohpos};
               my ($ct_dna, $previous_bz) = (0, "");
               # bases sorted by descending count: the first pass only remembers
               # the best base, the second pass compares it with the runner-up
               BASE:
               foreach my $bz (sort {$baselist->{$b}<=>$baselist->{$a}} keys %$baselist){
                 if($ct_dna){## the two most abundant bases at that position
                     if($previous_bz ne "" && ($baselist->{$previous_bz} / $coverage) >= $min_base_ratio && $baselist->{$previous_bz} > $baselist->{$bz}){### a simple consensus btw top 2
                        $consensus .= $previous_bz;                                         ### build consensus
                        print "Added base $previous_bz (cov = $baselist->{$previous_bz}) to $consensus **\n" if ($verbose);
                        last BASE;
                     }else{
                        print "ISSUES EXTENDING: best base = $previous_bz (cov=$baselist->{$previous_bz}) at $ohpos.  Second-Best: $bz (cov=$baselist->{$bz}) (ratio best=$baselist->{$previous_bz} / total=$coverage) >= $min_base_ratio (-r) -- will terminate with $consensus\n" if($verbose);
                        last CONSENSUS;
                     }
                  }
                  $previous_bz = $bz;
                  $ct_dna++;
               }
            }
         }

         ### deal with sequence reads making up the consensus/newly formed contig
         if($consensus ne ""){

            print "Will extend $seq\nwith: $consensus\n\n" if($verbose);
            my $temp_sequence = $seq . $consensus;  ## this is the contig extension
            my $integral = 0;
            # tail of the candidate contig: the initial search span plus the new bases
            my $position = length($temp_sequence) - ($startspan + length($consensus));
            my $temp_sequence_end = substr($temp_sequence, $position);
            foreach my $ro (@overlapping_reads){
               # a read counts as support when it occurs completely inside that tail
               if(index($temp_sequence_end, $ro) >= 0){
                 $integral=1;
                 my $sub = substr($ro, 0, 10);
                 $current_reads = $bin->{$sub}{$ro};
                 $current_bases = length($ro) * $current_reads;
                 $reads_needed += $current_reads;
                 $total_bases += $current_bases;
                 # a used read (and its reverse complement) is removed from $bin
                 deleteData($ro);
               }
            }
            if(! $integral){### no reads are found overlapping with the consensus might be indicative of low complexity regions -- Stop the extension
               print "No overlapping reads agree with the consensus sequence.   Stopping extension" if ($verbose);
               $extended = 0;
            }else{
               $seq = $temp_sequence;
               $temp_sequence = "";
               print "New Contig is: $seq\n" if ($verbose);
               $extended = 1;
            }
            $previous = $seq;
         }else{### no consensus built, will stop the extension
            $extended = 0;
         }

      }###while get the OK for extension

      $trim_ct++;
      if ($trim_ct <= $max_trim){
         last TRIM if (length($seq) <= $MIN_READ_LENGTH); #terminate assembly if trimming becomes too agressive
         # cut the last base off and try to extend again
         $seq = substr($seq, 0, -1);
         $extended = 1;
         print "\n$direction prime EXTENSION ROUND $trim_ct COMPLETE UNTIL $max_trim nt TRIMMED OFF => TRIMMED SEQUENCE:$seq\n\n" if ($verbose);
      }

   }### while trimming within bounds

   print "\n*** NOTHING ELSE TO BE DONE IN $direction prime- PERHAPS YOU COULD DECREASE THE MINIMUM OVERLAP -m (currently set to -m $min_overlap) ***\n\n" if ($verbose);

   return $seq, $reads_needed, $total_bases;
}
299 | 
300 | 
###DELETE READ DATA IF IT HAS BEEN USED FOR EXTENDING A CONTIG
# Parameter: the read sequence that was used.
# Removes the read and its reverse complement from the file-level $bin hash,
# where every read is filed under its first 10 bases.
sub deleteData {
   my ($read) = @_;

   #scalar assignment: reverseComplement is called in scalar context
   my $read_rc = reverseComplement($read);

   #the reverse-complement entry goes first, then the read itself
   delete $bin->{substr($read_rc, 0, 10)}{$read_rc};
   delete $bin->{substr($read, 0, 10)}{$read};
}
313 | 
###COLLECT THE READS THAT BOWTIE CANNOT MAP TO THE CONTIGS
# Parameters:
#   $contigFile - FASTA file a bowtie index is built from
#   $readfiles  - accepted for compatibility with the caller; not used
# For every library named in the first column of the file-level $libraryfile,
# all existing files "reads/<base_name>.<lib>.file<N>.fa" (N = 1, 2, ...) plus
# "reads/<base_name>.singlereads.fasta" are mapped with bowtie. Every read that
# bowtie reports as unmapped (reference name '*' in the SAM output) is stored,
# together with its reverse complement, in the file-level $bin hash under its
# first 10 bases. $MAX is raised to the longest such read, and the number of
# unmapped reads is written to the summary file.
# Every file<N> is now registered (previously only file1 was, whatever N was),
# and the bowtie pipe is closed so that a failing bowtie run is fatal.
sub getUnmappedReads{
  my ($contigFile, $readfiles) = @_;
  my $library = "start";

  #obtain sequences to map against the contigs
  open(FILELIB, "< $libraryfile") || die "Can't open $libraryfile -- fatal\n";
  my %files;
  while(<FILELIB>){
    my ($lib) = split(/\s+/, $_);
    my $i = 1;
    while(-e "reads/$base_name.$lib.file$i.fa"){
      $files{"reads/$base_name.$lib.file$i.fa"}++;
      $i++;
    }
  }
  close FILELIB;
  my $unpaired = "reads/$base_name.singlereads.fasta";
  $files{$unpaired}++ if(-e $unpaired);
  #comma separated list for bowtie; sorted so the order is reproducible
  my $fnames = join(",", sort keys %files);

  #build bowtie index of contigs and map reads to the index
  my $bowtieout = $base_name . ".$library.bowtieIndex";
  die "Contig file ($contigFile) not found. Exiting...\n" if(!(-e $contigFile));
  &printMessage("\n=>".getDate().": Building Bowtie index for contigs\n");
  system("bowtie-build $contigFile bowtieoutput/$bowtieout --quiet --noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values
  &printMessage("\n=>".getDate().": Mapping reads to Bowtie index\n");
  my $procline = "bowtie -p $threads -v $gaps bowtieoutput/$bowtieout -f $fnames --quiet -S |";

  #map reads with bowtie and obtain unmapped reads. Store the unmapped reads into a hash and use them for contig extension
  open(IN, "$procline") || die "Can't open bowtie output -- fatal\n";
  my ($counter, $step) = (0, 100000);
  while(<IN>){
    my @t = split(/\t/);
    #third SAM column is the reference name; '*' marks an unmapped read
    next if ($t[2] ne '*');
    if(++$counter == $step){
      CounterPrint($counter);
      $step = $step + 100000;
    }
    my $read = $t[9];
    my $orig_mer = length($read);
    my $rc = reverseComplement($read);
    $MAX=$orig_mer if ($orig_mer > $MAX);
    $bin->{substr($read, 0, 10)}{$read}++;
    $bin->{substr($rc, 0, 10)}{$rc}++;
  }
  #closing a pipe waits for bowtie and reports a non-zero exit status
  close(IN) || die "\nBowtie error; $?";

  print SUMFILE "CONTIG EXTENSION:\n".$seplines;
  print SUMFILE "\tNumber of unmapped reads used for contig extension = $counter\n";
  CounterPrint((" " x length($counter)));
}
367 | 
###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
# Parameter: a DNA sequence.
# Returns the sequence with A<->T and G<->C exchanged (upper case only; every
# other character is kept as it is) and the order of the characters reversed.
# The work is done on a private copy, so the caller's $_ is left untouched,
# and "scalar reverse" makes the result the same in list and scalar context.
sub reverseComplement{
   my ($sequence) = @_;
   $sequence =~ tr/ATGC/TACG/;
   return scalar reverse($sequence);
}
374 | 
###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
# Parameter: the text (normally a running count) to show.
# The text is prefixed with a carriage return so that it is drawn over the
# previous counter, and autoflush is switched on for the currently selected
# output handle so the counter shows up immediately.
sub CounterPrint{
  my ($progress) = @_;
  my $line = "\r" . $progress;
  print $line;
  $|++;
}
381 | 
###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
# Parameter: the text to report.
# The text is written unchanged, first to the currently selected output
# handle and then to the LOG file handle.
sub printMessage{
  my ($text) = @_;
  print $text;
  print LOG $text;
}
388 | 
###FUNCTION TO GET THE CURRENT DATE
# Returns the current local time as the string that localtime yields in
# scalar context, e.g. "Thu Sep  9 12:30:05 2010".
sub getDate{
  return scalar(localtime);
}
394 | 
###FLUSHES THE SUMMARY AND LOG FILE
# Turns on autoflush for the SUMFILE and LOG handles (each is selected just
# long enough to set $|, after which the previously selected handle is
# restored) and finally for the currently selected output handle as well.
sub FlushFiles{
  foreach my $handle (\*SUMFILE, \*LOG){
    my $previous = select($handle);
    $| = 1;
    select($previous);
  }
  $|++;
}
401 | 
###CHECKS THAT THE "process_OK" MARKER DIRECTORY EXISTS
# If the directory "process_OK" is missing, a failure banner is written via
# printMessage and the program exits with status 1 (a bare "exit" reported
# success, status 0, to the calling process). Otherwise the marker is removed
# again so that the next check starts from a clean state.
# Only "process_OK" is removed: the former second entry 'blurfl/quux' was the
# placeholder path from the File::Path documentation and could delete an
# unrelated directory of that name.
sub checkStatus{
  if(!(-d "process_OK")){
    &printMessage(("*" x 50)."\n\nProcess failed on ".getDate()."\n\n\n");
    exit 1;
  }
  rmtree(["process_OK"]);
}
406 | 
407 | #########END ExtendOrFormatContigs.pl
408 | 


--------------------------------------------------------------------------------
/bin/PairingAndScaffolding.pl:
--------------------------------------------------------------------------------
  1 |   ###################################################
  2 |   #Marten Boetzer 14-07-2011                        #
  3 |   #SSPACE perl subscript PairingAndScaffolding.pl   #
  4 |   #This script;                                     #
  5 |   #  -reads the contig sequences in a hash          #
  6 |   #  -stores Bowtie output in a hash                #
  7 |   #  -pairs the contigs                             #
  8 |   #  -generates scaffolds                           #
  9 |   ###################################################
 10 | 
 11 | 
 12 |   #THIS VERSION OF SCAFFOLDING FIRST ORDERS THE CONTIGS BASED ON THE NUMBER OF INGOING LINKS AND STARTS AT LOWEST LEVEL. AFTER ALL THESE CONTIGS ARE SCAFFOLDED, INGOING LINKS ARE RECALCULATED OF REMAINING CONTIGS, ITERATIVELY. 
 13 |   #ALSO, EACH CONTIG IS REPRESENTED ONCE IN THE SCAFFOLDS. 
 14 |   #METHOD OF SCAFFOLDING IS; IF MORE THAN ONE LINK, CHECK IF THOSE LINKS HAVE CONNECTION WITH EACH OTHER. IF SO, COMBINE THEM IN THE SCAFFOLD. IF NOT, ESTIMATE RATIO AND ONLY ALLOW EXTENSION OF SCAFFOLD IF IT'S BELOW THE RATIO THRESHOLD GIVEN BY THE USER.
 15 |   #FUTURE: INCLUDE NUMBER OF REPEATS THAT ARE POSSIBLY PRESENT
 16 |   use strict;
 17 |   use Storable;
 18 |   use File::Path;
 19 |   use File::Basename;
  # separator line written to the summary file
  my $seplines = ("-" x 60)."\n";
  # Positional command-line arguments. $ARGV[0] is not read by this script.
  my $gaps = $ARGV[1];             # handed to mapReadsWithBowtie
  my $contig = $ARGV[2];           # contig FASTA file read into the contig hash
  my $base_name = $ARGV[3];        # prefix of log, summary, read and tmp files
  my $issues = $ARGV[4];           # file opened for writing as PET
  my $distribution = $ARGV[5];
  my $verbose = $ARGV[6];
  my $library = $ARGV[7];          # library name, part of the read file names
  my $insert_size = $ARGV[8];
  my $min_allowed = $ARGV[9];
  my $scaffold = $ARGV[10];
  my $min_links = $ARGV[11];       # minimum number of links (see determineRepeats)
  my $max_link_ratio = $ARGV[12];
  my $ori = $ARGV[13];
  my $threads = $ARGV[14];         # handed to mapReadsWithBowtie
  my $tab = $ARGV[15];             # true: pairs come from a TAB file instead of bowtie
  my $tabfile = $ARGV[16];         # TAB file read by parseTabFile
  my $origctg = $ARGV[17];         # contig file read by updateContigs
  my $prev_evidence = $ARGV[18];   # evidence file of a previous library, or ''

  # insert size plus / minus $min_allowed
  # NOTE(review): $low_iz is the sum and $up_iz the difference, which reads
  # naturally only if $min_allowed is negative -- confirm with the caller.
  my ($low_iz, $up_iz) = ($insert_size + $min_allowed, $insert_size - $min_allowed);
  my $bowtiefile = "bowtieoutput/" . $base_name . ".$library.mapped";
  my $log = $base_name . ".logfile.txt";
  my $summaryfile = $base_name.".summaryfile.txt";

  # file-level counters; $ct_both counts the lines of the TAB file in parseTabFile
  my ($total_for_median, $step,$ct_illogical, $ct_ok_contig, $ct_ok_pairs, $ct_problem_pairs, $ct_iz_issues, $ct_single, $ct_both, $counter)= (0,100000,0,0,0,0,0,0,0,0 );
  # $pair: contig links (read in determineRepeats); $tigOnScafHash: contig
  # placement filled by parseEvidenceFile; $tigHash: header => contig number,
  # filled by updateContigs
  my ($pair,$err,$track_insert, $tigOnScafHash, $tigHash);
  my $pair_found = 0;

  # log and summary are appended to, the issues file is created anew
  open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";
  open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
  open(PET, ">$issues") || die "Can't open $issues for writing -- fatal\n";
#-------------------------------------------------READ CONTIGS INTO HASH AND STORE THEIR LENGTH. NEXT; PAIR THE CONTIGS
  if($tab){
    # TAB input: load contig placement of the previous library (if any) and
    # number the original contig headers
    parseEvidenceFile($prev_evidence) if($prev_evidence ne '');
    &updateContigs($origctg);
  }
  # $contigstored: path of the stored contig hash; $tig_length: contig number => length
  my ($contigstored, $tig_length) = &readFileContigHash($contig);
  if(!$tab){
    # this $up_iz is a new lexical that is only visible inside this block
    my $up_iz = ($insert_size - $min_allowed);
    # write a FASTA file that only holds the contig edges
    my $newcontig = processContig($contig, $up_iz) ;

    # comma separated list of the read files file1, file2, ... of this library
    my $fname = "reads/$base_name.$library.file1.fa";
    if(-e $fname){
      my $fname2 = "reads/$base_name.$library.file2.fa";
      my $i = 2;
      while(-e $fname2){
        $fname = "$fname,$fname2";
        $i++;
        $fname2 = "reads/$base_name.$library.file$i.fa";
      }
    }
    mapReadsWithBowtie($newcontig, $fname,$gaps, $threads);
  }else{
    parseTabFile($tigHash);
  }
  &printResultsPairing();

#-------------------------------------------------BUILDING SCAFFOLDS
  &buildScaffolds($pair, $tig_length, $verbose, $scaffold, $library);
  # drop the large structures
  ($pair, $tig_length) = ('',''); undef $pair; undef $tig_length;

  close SUMFILE;
  close LOG;
  # marker directory created once the script has run to the end
  # (presumably checked by the calling script -- TODO confirm)
  mkpath('process_OK');
 85 | 
 86 | #-------------------------------------------------
 87 | 
 88 | ###FUNCTION TO PARSE THE TAB FILE
 89 | sub parseTabFile{
 90 |   my ($tigHash) = @_;
 91 |   open(TAB, "$tabfile") || die "Can't open $tabfile for reading -- fatal\n";
 92 |   my $lower = ($up_iz+200);
 93 |   my $step = 1000000;
 94 |   &printMessage("\n=>".getDate().": Parsing Tab file\n");
 95 |   while(<TAB>){
 96 |     chomp;
 97 |     if(++$ct_both == $step){
 98 |       CounterPrint($ct_both);
 99 |       $step = $step + 1000000;
100 |     }
101 |     my ($tig1,$start1,$end1,$tig2,$start2,$end2) = split(/\t/);
102 | 
103 |     #check if the contig in the tab file is also present in the inserted contig fasta file
104 |     if(!defined($tigHash->{$tig1})){
105 |       die "\nERROR: could not find an header containing $tig1 at line number $ct_both. Exit...\n";
106 |     }
107 |     if(!defined($tigHash->{$tig2})){
108 |       die "ERROR: could not find an header containing $tig2 at line number $ct_both. Exit...\n";
109 |     }
110 |     my $ctg1 = $tigHash->{$tig1};
111 |     my $ctg2 = $tigHash->{$tig2};
112 |     my ($track1, $track2) =  ("","");
113 |     
114 |     #if multiple libraries were used, update the contig positions in the TAB File by finding its position in the scaffolds
115 |     if($prev_evidence ne ''){
116 |       $start1 = $start1 + $tigOnScafHash->{$ctg1}{'begin'};
117 |       $end1 = $end1 + $tigOnScafHash->{$ctg1}{'begin'};
118 |       $start2 = $start2 + $tigOnScafHash->{$ctg2}{'begin'};
119 |       $end2 = $end2 + $tigOnScafHash->{$ctg2}{'begin'};
120 |       if($tigOnScafHash->{$ctg1}{'direction'} eq "r"){
121 |         my $tmp_start = ($tigOnScafHash->{$ctg1}{'end'}+$tigOnScafHash->{$ctg1}{'begin'}) - $start1;
122 |         my $tmp_end = ($tigOnScafHash->{$ctg1}{'end'}+$tigOnScafHash->{$ctg1}{'begin'}) - $end1;
123 |         $start1 = $tmp_start;
124 |         $end1 = $tmp_end;
125 |       }
126 |       if($tigOnScafHash->{$ctg2}{'direction'} eq "r"){
127 |         my $tmp_start = ($tigOnScafHash->{$ctg2}{'end'}+$tigOnScafHash->{$ctg2}{'begin'}) - $start2;
128 |         my $tmp_end = ($tigOnScafHash->{$ctg2}{'end'}+$tigOnScafHash->{$ctg2}{'begin'}) - $end2;
129 |         $start2 = $tmp_start;
130 |         $end2 = $tmp_end;
131 |       }
132 | 
133 |       $ctg1 = $tigOnScafHash->{$ctg1}{'scaf'};
134 |       $ctg2 = $tigOnScafHash->{$ctg2}{'scaf'};
135 |      if($start1 < $lower || ($end1 > ($tig_length->{$ctg1}-$lower))){
136 |        $track1 = "$ctg1"."|$start1"."|$end1";
137 |      }
138 |      if($start2 < $lower || ($end2 > ($tig_length->{$ctg2}-$lower))){
139 |        $track2 = "$ctg2"."|$start2"."|$end2";
140 |      }
141 |     }else{ #if it is the first library, just use the positions in the TAB file
142 |       if($start1 < $lower || ($end1 > ($tig_length->{$ctg1}-$lower))){
143 |         $track1 = "$ctg1"."|$start1"."|$end1";
144 |       }
145 |       if($start2 < $lower || ($end2 > ($tig_length->{$ctg2}-$lower))){
146 |         $track2 = "$ctg2"."|$start2"."|$end2";
147 |       }
148 | 
149 |     }
150 |     #pair the contigs based on the information provided in the TAB file
151 |     pairContigs($track1, $track2, "seq$ct_both.1", "seq$ct_both.2") if($track1 ne "" && $track2 ne "");
152 |   }
153 |   CounterPrint("                ");
154 | }
155 | 
###FUNCTION TO STORE ONLY THE EDGES OF THE CONTIGS. ONLY READS ARE MAPPED TO THESE EDGES, SAVING TIME FOR BUILDING THE INDEX WITH BOWTIE, AND MAPPING THE READS TO THE CONTIGS
# Parameters:
#   $contigfile - FASTA file with the contigs
#   $max_dist   - distance used to size the edges; an edge is ($max_dist + 200) bases
# Writes "tmp.<base_name>/subset_contigs.fasta" in which every non-empty
# contig is named by its running number. A contig longer than twice the edge
# size plus 100 is reduced to its first and last edge joined by "NNN"; shorter
# contigs are written in full. Returns the name of the file written.
sub processContig{
  my ($contigfile, $max_dist) = @_;

  my $edge = $max_dist + 200;

  open(IN,$contigfile) || die "can't read $contigfile -- fatal\n";
  my $contigfilesub = "tmp.$base_name/subset_contigs.fasta";
  open(OUT,">$contigfilesub") || die "can't write to $contigfilesub -- fatal\n";
  my $sequence = '';
  my $written = 0;
  while(my $line = <IN>){
    chomp $line;
    my $last_line = eof(IN);
    #an ordinary sequence line is only collected
    if($line !~ /\>(\S+)/ && !$last_line){
      $sequence .= uc($line);
      next;
    }
    #header line or end of file: finish the record collected so far
    $sequence .= uc($line) if($last_line);
    if($sequence ne ''){
      $written++;
      my $record = $sequence;
      if(length($sequence) > (($edge * 2)+100)){
        my $head_part = substr($sequence, 0, $edge);
        my $tail_part = substr($sequence, length($sequence) - $edge);
        $record = $head_part."NNN".$tail_part;
      }
      print OUT ">$written\n$record\n";
    }
    $sequence = '';
  }
  close IN;
  close OUT;
  return $contigfilesub;
}
193 | 
###FUNCTION TO PARSE THE EVIDENCE FILE, ONLY USED IF TAB FILE IS INSERTED
#Function determines the position of the contigs on the scaffolds, information is used to update the contigs of the TAB file
# Parameter: name of the evidence file.
# A line starting with '>' opens a new scaffold. Every other non-empty line is
# split on '|' into contig ("<direction>_tig<number>"), "size<N>", links,
# "gaps<N>" and "merged<N>". For each contig number the file-level
# $tigOnScafHash receives:
#   'begin'     - offset of the contig on its scaffold
#   'end'       - 'begin' plus the contig size
#   'scaf'      - running number of the scaffold
#   'direction' - the text in front of "_tig"
# 'end' used to be stored as the offset of the *next* contig, i.e. including
# the gap (or minus the merged overlap), although begin + size was computed
# and then left unused. parseTabFile mirrors positions of reversed contigs
# around begin/end, so the old value shifted them by the gap size.
# The file handle is now lexical and closed when parsing is done.
sub parseEvidenceFile{
   my ($file) = @_;
   open(my $evidence, '<', $file) || die "Can't open $file -- fatal\n";
   my $scaf = 0;
   my $totalsize= 0;
   while(<$evidence>){
      chomp;
      if(/^>/){
        $scaf++;
        $totalsize=0;
      }else{
        my ($tig, $size, $links, $gap, $merge) = split(/\|/,$_);
        if(defined($tig) && $tig ne ""){
          my ($direction, $tig2) = split(/_tig/,$tig);
          $tigOnScafHash->{$tig2}{'begin'} = $totalsize;

          my (undef, $size2) = split(/size/,$size);
          my $end = $totalsize + $size2;
          #advance the offset to where the next contig of this scaffold starts
          if(defined($merge) && $merge ne ""){
            my (undef, $merge2) = split(/merged/,$merge);
            $totalsize = $totalsize + ($size2 - $merge2);
          }elsif(defined($gap) && $gap ne ""){
            my (undef, $gap2) = split(/gaps/,$gap);
            #a negative gap is counted as a gap of one base
            $gap2 = 1 if($gap2 < 0);
            $totalsize = $totalsize + $size2 + $gap2;
          }else{
            $totalsize = $totalsize + $size2;
          }
          $tigOnScafHash->{$tig2}{'scaf'} = $scaf;
          $tigOnScafHash->{$tig2}{'end'} = $end;
          $tigOnScafHash->{$tig2}{'direction'} = $direction;
        }
      }
   }
   close $evidence;
}
232 | 
###FUNCTION TO UPDATE THE ORIGINAL CONTIG FILE INSERTED BY THE USER, SO MULTIPLE TAB FILES OF SEVERAL LIBRARIES CAN BE INSERTED
# Parameters:
#   $file   - FASTA file with the original contigs
#   $update - accepted for compatibility with callers; not used
# Numbers the contigs in file order (1, 2, ...) and stores the number in the
# file-level $tigHash under the first word of the contig's header line.
# A contig is registered when the next header line or the end of the file is
# reached. The sequences themselves are not needed here and are no longer
# collected in memory; the file handle is lexical and closed after reading.
sub updateContigs{
  my ($file, $update) = @_;

  &printMessage("\n=>".getDate().": Updating contig file\n");

  my ($countContig, $prevhead) = (0, '');
  open(my $in, '<', $file) || die "Can't open $file -- fatal\n";
  while(<$in>){
     if (/\>(\S+)/ || eof($in)){
       my $head=$1;
       if($prevhead ne ''){
         ++$countContig;
         $tigHash->{$prevhead} = $countContig;
       }
       $prevhead = $head;
     }
  }
  close $in;
  CounterPrint("                ");
  &FlushFiles();
}
260 | 
#READ THE CONTIG TO A HASH AND STORE THIS HASH
# Parameter: FASTA file with the contigs.
# Numbers the contigs in file order (1, 2, ...). For every contig the header
# (first word after '>') and the sequence are collected and written with
# Storable's store to "tmp.<base_name>/contigs.stored"; the in-memory copy is
# dropped afterwards.
# Returns the list (name of the stored file, hash ref contig number => length).
# The file handle is lexical and closed after reading, so the global IN handle
# is no longer left open.
sub readFileContigHash{
  my ($file) = @_;

  &printMessage("\n=>".getDate().": Reading contig file\n");

  my ($contigs, $tig_length);
  my ($countContig, $seq, $prevhead, $step) = (0, "", '', 1000);
  open(my $in, '<', $file) || die "Can't open $file -- fatal\n";
  while(<$in>){
     my $line = $_;
     chomp $line;
     #the last line of the file is appended before the final record is stored
     $seq.= $line if(eof($in));
     #a header line (or the end of the file) closes the record collected so far
     if (/\>(\S+)/ || eof($in)){
       my $head=$1;
       if($prevhead ne ''){
         # NOTE(review): the first report comes at 1000 contigs, later ones
         # every 100000 -- confirm whether the two step sizes are meant to differ.
         if(++$countContig == $step){
           CounterPrint($countContig);
           $step = $step + 100000;
         }
           $tig_length->{$countContig} = length($seq);
           $contigs->{$countContig}{'name'} = $prevhead;
           $contigs->{$countContig}{'seq'} = $seq;
       }
       $prevhead = $head;
       $seq='';
     }else{
        $seq.=$line;
     }
  }
  close $in;
  CounterPrint("                ");
  &FlushFiles();
  my $contigstore = "tmp.$base_name/contigs.stored";
  store \%$contigs, "$contigstore";
  undef $contigs;
  return ($contigstore, $tig_length);
}
298 | 
###FUNCTION THAT FILTERS OUT THE REPEATS BY FINDING CONTIGS THAT HAVE MULTIPLE LINKS WITH OTHER CONTIGS
# For both ends of every contig ("r<id>" then "f<id>", contigs ordered by
# decreasing length) the two best supported partners are looked up in the
# file-level $pair hash. When those two partners are themselves linked, the
# chain end -> partner1 -> partner2 is fixed with establishLink(). Otherwise
# the two candidates are compared by link count (ratio 1) and by links per
# search space (ratio 2); depending on the outcome either all links of this
# end, or all links except the best one, are scheduled for removal.
#
# Arguments:
#   $tig_length - hash ref: contig id => contig length
#   $repeathash - accepted for interface compatibility; never read
# Returns:
#   hash ref $removeHash->{end}{partner} (count of removal requests), or undef
#   when nothing has to be removed.
# Uses the file-level variables $pair, $min_links, $max_link_ratio and
# $insert_size, and prints the links it finds to the already opened TMP handle.
sub determineRepeats{
  my ($tig_length, $repeathash) = @_;
  my $removeHash;
  #go through each contig
  foreach my $tig (sort {$tig_length->{$b}<=>$tig_length->{$a}} keys %$tig_length){
    #inspect the reverse end first, then the forward end
    foreach my $dtig ("r" . $tig, "f" . $tig){
      my $list = $pair->{$dtig} || {};  #get contig pairs from $tig
      my $seen_it = {};
      my $matchhash = {};
      my $ct = 0;
      #Go through each contig pair and get the number of links and gapsize
      foreach my $match (sort {$list->{$b}{'links'}<=>$list->{$a}{'links'}} keys %$list){
        #list assignment instead of "my ... if": the latter leaves the variable in an undefined state when the match fails
        my ($matchnum) = $match =~ /[fr](\w+)/;
        next if(!($list->{$match}{'links'} >= $min_links));
        print TMP "$dtig has $list->{$match}{'links'} links with $match and gap of ".int($list->{$match}{'gaps'}/$list->{$match}{'links'})." bases\n";
        #keep at most the two best partners, one entry per partner contig
        if(!defined $seen_it->{$matchnum} && $ct < 2){
          $ct++;
          $matchhash->{$match}{'links'} = $list->{$match}{'links'};
          $matchhash->{$match}{'gaps'} = $list->{$match}{'gaps'};
          $matchhash->{$match}{'ratio'} = $list->{$match}{'gaps'}/$list->{$match}{'links'};
          $seen_it->{$matchnum}++;
        }
      }
      #candidates ordered by mean gap, smallest first
      my @arraymatch = sort {$matchhash->{$a}{'ratio'}<=>$matchhash->{$b}{'ratio'}} keys %$matchhash;
      #only determine if contig is a repeat if it has more than 1 link with other contigs
      next if(@arraymatch < 2);

      my $listmatch = $pair->{$arraymatch[0]};
      #if the top two pairs of $tig have link with each other, establish their link so they are combined in scaffolding stage
      #NOTE: this lookup autovivifies an empty entry in $pair when the two candidates are not linked (original behaviour, kept)
      if($listmatch->{$arraymatch[1]}{'links'} >= $min_links){
        $pair = establishLink($dtig, $arraymatch[0], $pair);
        $pair = establishLink($arraymatch[0], $arraymatch[1], $pair);
        next;
      }

      #otherwise, the contig has multiple links and is likely a repeat
      my @linkmatch = sort {$matchhash->{$b}{'links'}<=>$matchhash->{$a}{'links'}} keys %$matchhash;

      #check for two ratio's between the two best contig pairs. One is a ratio of the links, other is the number of links per searchspace.
      #If either one of the two ratio's is above the user-defined (-a) ratio, the original contig is treated as a repeat

      #estimate the ratio of the links of the two best contig pairs (ratio 1)
      my $link1 = $matchhash->{$linkmatch[1]}{'links'};
      my $link2 = $matchhash->{$linkmatch[0]}{'links'};
      my $ratio1 = sprintf("%.2f", $link1 / $link2);        ## relative ratio of the two most abundant contig pairs
      my $first = $linkmatch[0];

      #estimate the number of links per gap for the two best contig pairs and divide them (ratio 2)
      my $gapPerSpace1 = estimateLinksPerGap($matchhash, $linkmatch[0], $insert_size, $tig_length);
      my $gapPerSpace2 = estimateLinksPerGap($matchhash, $linkmatch[1], $insert_size, $tig_length);
      my ($ratio2, $second);
      if($gapPerSpace1 > $gapPerSpace2){
        $second = $linkmatch[0];
        $ratio2 = $gapPerSpace2/$gapPerSpace1;
      }else{
        $second = $linkmatch[1];
        $ratio2 = $gapPerSpace1/$gapPerSpace2;
      }
      my $revdtig = $dtig;
      $revdtig =~ tr/fr/rf/;

      #if one of the two ratio's is above the user-defined (-a) option (or both measures disagree on the best partner),
      #contig is a repeat and all links with this contig are removed;
      #otherwise only the link with the most likely partner ($first) is kept
      my $is_repeat = ($ratio2 >= $max_link_ratio || $ratio1 >= $max_link_ratio || $first ne $second);
      foreach my $linksort (keys %$list){
        next if(!$is_repeat && $linksort eq $first);
        my $revlinksort = $linksort;
        $revlinksort =~ tr/fr/rf/;
        $removeHash->{$dtig}{$linksort}++;
        $removeHash->{$revdtig}{$revlinksort}++;
      }
    }
  }
  return $removeHash;
}
392 | 
###FUNCTION TO ESTABLISH A LINK BETWEEN CONTIGS SO THESE CONTIGS ARE PAIRED DURING SCAFFOLDING
# Makes $tig1 -> $tig2 the only remaining connection on the facing ends of both
# contigs: every other partner of $tig1, and every other partner of the
# opposite end of $tig2, is deleted from $pair together with its mirrored
# entry. The hash is modified in place and also returned.
sub establishLink{
  my ($tig1, $tig2, $pair) = @_;

  #the same contig ends seen from the opposite strand (f <-> r)
  (my $revtig1 = $tig1) =~ tr/fr/rf/;
  (my $revtig2 = $tig2) =~ tr/fr/rf/;

  #remove all competitors of $tig2 among the partners of $tig1
  my $outgoing = $pair->{$tig1};
  foreach my $other (keys %$outgoing){
    next if($other eq $tig2 || $other eq $revtig2);
    delete $pair->{$tig1}{$other};
    (my $mirror = $other) =~ tr/fr/rf/;
    delete $pair->{$mirror}{$revtig1};
  }

  #remove all competitors of $tig1 among the partners of the reversed $tig2
  my $incoming = $pair->{$revtig2};
  foreach my $other (keys %$incoming){
    next if($other eq $tig1 || $other eq $revtig1);
    delete $pair->{$revtig2}{$other};
    (my $mirror = $other) =~ tr/fr/rf/;
    delete $pair->{$mirror}{$tig2};
  }

  return $pair;
}
419 | 
###DETERMINE THE NUMBER OF LINKS PER GAP, BASED ON INSERT SIZE
# Arguments:
#   $linkhash    - hash ref: contig end => { 'links' => count, 'ratio' => mean gap }
#   $tig1        - contig end to evaluate ("f<id>" or "r<id>")
#   $insert_size - insert size of the library
#   $length_hash - hash ref: contig id => contig length
# Returns the number of links divided by the part of the contig in which
# linking reads can be located (the "search space"), or 0 when that space is
# empty (previously this case died with "Illegal division by zero").
sub estimateLinksPerGap{
  my ($linkhash, $tig1, $insert_size, $length_hash) = @_;
  #list assignment instead of "my ... if", whose result is undefined when the match fails
  my ($t1) = $tig1 =~ /[fr](\w+)/;

  #a negative mean gap (overlap) is treated as no gap at all
  my $mean_gap = $linkhash->{$tig1}{'ratio'};
  my $gap = ($mean_gap < 0) ? 0 : int($mean_gap);

  #the search space is bounded either by the insert size (minus the gap) or by the contig itself
  my $space;
  if(($length_hash->{$t1}+$gap) >= $insert_size){
    $space = int($insert_size - $gap);
  }else{
    $space = $length_hash->{$t1};
  }
  return 0 if(!$space);
  return $linkhash->{$tig1}{'links'}/$space;
}
435 | 
###FUNCTION TO BUILD THE SCAFFOLDS
# Arguments:
#   $pair       - hash ref with the contig links ($pair->{end}{partner}{links|gaps})
#   $tig_length - hash ref: contig id => length (entries are deleted as contigs are used)
#   $verbose    - not used in this function
#   $scaffold   - path of the scaffold file to write
#   $lib        - not used in this function
# Steps:
#   1. determineRepeats() lists ambiguous links (it prints to the TMP handle opened here);
#   2. those links are removed from $pair and repeated edges are reported;
#   3. every contig, longest first, seeds a scaffold that is extended to the
#      right and to the left with computeLayout(); one line per scaffold is written.
# Also uses the file-level $base_name, $library, $min_links, $seplines and the SUMFILE handle.
sub buildScaffolds{
   my ($pair, $tig_length, $verbose, $scaffold, $lib) = @_;
   &printMessage("\n=>".getDate().": Building scaffolds file\n");

   open (my $sc_fh, '>', $scaffold) || die "Can't write to $scaffold -- fatal\n";
   my ($sc_ct, $numrepeat) = (0,0);
   my ($repeathash, $seen_start);

   #determine the repeats and remove any link if contig is a repeat
   #if contig has multiple links, but one considered to be the 'best', establish this contig-pair by removing the links with other contigs
   #TMP stays a bareword handle because determineRepeats prints to it
   open (TMP, '>', "intermediate_results/$base_name"."_$library.foundlinks") || die "Can't write to intermediate_results/$base_name"."_$library.foundlinks -- fatal\n";
   $repeathash = determineRepeats($tig_length, $repeathash);
   close TMP;

   open (my $repeat_fh, '>', "intermediate_results/$base_name"."_$library.repeats") || die "Can't write to intermediate_results/$base_name"."_$library.repeats -- fatal\n";
   foreach my $rep (sort keys %$repeathash){
     my ($tig) = $rep =~ /[fr](\w+)/;
     my $ls = $repeathash->{$rep};
     my ($num_match,$repline) = (0,"");
     foreach my $rep2 (sort keys %$ls){
       #only sufficiently supported links are reported, but every listed link is removed
       if($pair->{$rep}{$rep2}{'links'} >= $min_links){
         $repline.="\twith $rep2 (links = $pair->{$rep}{$rep2}{'links'})\n";
         $num_match++;
       }
       delete $pair->{$rep}{$rep2};
       delete $pair->{$rep2}{$rep};
     }
     if($num_match > 1){
       $numrepeat++;
       print $repeat_fh "Contig $rep (size = $tig_length->{$tig}) has $num_match multiple links;\n";
       print $repeat_fh "$repline\n";
     }
   }
   close $repeat_fh;
   print SUMFILE "REPEATS: \n";
   print SUMFILE "\tNumber of repeated edges = $numrepeat\n$seplines\n";

   #go through each contig and find contig pairs left and right, forming scaffolds
   SEED:
   foreach my $tig (sort {$tig_length->{$b}<=>$tig_length->{$a}} keys %$tig_length){
      my $ftig = "f" . $tig;
      my $rtig = "r" . $tig;

      ##should prevent re-using a contig as seed if it's already been incorporated into a scaffold
      next SEED if(defined $seen_start->{$tig});

      CounterPrint(++$sc_ct);
      my $chainleft = "";
      my $chainright = $ftig . "Z" . $tig_length->{$tig};
      my $total = $tig_length->{$tig};
      ($total, $chainright, $seen_start) = &computeLayout("R", $chainright, $ftig, $pair, $tig_length, $total, $seen_start, $tig);
      ($total, $chainleft, $seen_start) = &computeLayout("L", $chainleft, $rtig, $pair, $tig_length, $total, $seen_start, $tig);

      delete $pair->{$ftig};
      delete $pair->{$rtig};
      delete $tig_length->{$tig};
      $seen_start->{$tig}++;
      #named $layout so it no longer shadows the $scaffold file name parameter
      my $layout = $chainleft . $chainright;
      print $sc_fh "scaffold" . $sc_ct . ",$total,$layout\n";
   }
   CounterPrint("                ");
   close $sc_fh;
   &FlushFiles();
}
500 | 
# links contigs together into a chain - must satisfy user-defined criterions (-k -a)
#
# Arguments:
#   $ext             - "R" or "L"; passed on to getChain to decide on which side the chain grows
#   $chain           - layout string built so far
#   $tig             - contig end ("f<id>" / "r<id>") from which to extend
#   $pair            - hash ref with the contig links ($pair->{end}{partner}{links|gaps})
#   $tig_length      - hash ref: contig id => length; contigs added to the chain are deleted from it
#   $total           - summed contig length of the scaffold so far
#   $seen_start      - hash ref of contig ids that are already part of a scaffold
#   $orig_tig_number - id of the seed contig, which is never accepted as a partner
# Returns the list ($total, $chain, $seen_start).
# Also reads the file-level $min_links and $max_link_ratio.
sub computeLayout{
   my ($ext, $chain, $tig, $pair, $tig_length, $total, $seen_start, $orig_tig_number) = @_;
   my $orig_tig = $tig;
   my $extension = 1;
   # each pass of this loop tries to extend the chain from the current contig end $tig
   EXTENSION:
   while($extension){
      my $tnum = $1 if($tig=~/[fr](\w+)/);
      my $tnumf = "f" . $tnum;
      my $tnumr = "r" . $tnum;
      # NOTE(review): this $ratio is the one passed to getChain below. The "my $ratio" further
      # down declares a new variable that is only visible inside its own block, so the value
      # handed to getChain is always this initial 0 - confirm whether that is intended.
      my $ratio = 0.00;
      if(!defined $seen_start->{$tnum}){ #if already seen in scaffold, do not use it again
        $seen_start->{$tnum}++ if($tnumf ne $orig_tig);
         my $list = $pair->{$tig};
         my $matchhash;
         my ($match1,$link1,$gaps1,$match2,$link2,$gaps2,$cntloop, $countmatches)=("",0,0,"",0,0,0,0);
         my $ct=0;
         # collect up to two partners in order of decreasing link count; the loop stops at the
         # first partner that has too few links, is already used, or is the seed contig
         LINK:
         foreach my $match (sort {$list->{$b}{'links'}<=>$list->{$a}{'links'}} keys %$list){
            my $matchnum = $1 if($match=~/[fr](\w+)/);
            if($list->{$match}{'links'} >= $min_links && !defined $seen_start->{$matchnum} && $matchnum ne $orig_tig_number && $ct < 2){
              $ct++;
              $matchhash->{$match}{'links'} = $list->{$match}{'links'};
              $matchhash->{$match}{'gaps'} = $list->{$match}{'gaps'};
              $matchhash->{$match}{'ratio'} = $list->{$match}{'gaps'}/$list->{$match}{'links'};
              $countmatches++;
            }else{
              last LINK;
            }
         }
         # $foundlinks becomes 1 when two candidates are NOT sufficiently linked to each other
         my $foundlinks = 0;
         if($countmatches > 1){
           # candidates ordered by mean gap, smallest first
           my @arraymatch;
           foreach my $ratiosort (sort {$matchhash->{$a}{'ratio'}<=>$matchhash->{$b}{'ratio'}} keys %$matchhash){
             push @arraymatch, $ratiosort;
           }
           my $nummatch = $#arraymatch;
           # NOTE: the lookup below autovivifies an empty entry in $pair when the candidates are not linked
           for(my $i=0; $i <= $nummatch && $foundlinks < 1; $i++){
             my $listmatch = $pair->{$arraymatch[$i]};
              for(my $j=$i+1; $j <= $nummatch && $foundlinks < 1; $j++){
                 my $linkmatch = $listmatch->{$arraymatch[$j]}{'links'};
                 $foundlinks = 1 if(!($linkmatch >= $min_links));
              }
           }
           my $tignum = $1 if($arraymatch[$nummatch]=~/[fr](\w+)/);
           $countmatches=0 if(!$foundlinks && defined $seen_start->{$tignum});
         }if($foundlinks && $countmatches > 1){
             # two unlinked candidates: keep only the best supported one when the ratio of
             # their link counts does not exceed $max_link_ratio
             my @linkmatch;
             foreach my $linksort (sort {$matchhash->{$b}{'links'}<=>$matchhash->{$a}{'links'}} keys %$matchhash){
               push @linkmatch, $linksort;
             }
             my $linkhash;
             my $link1 = $matchhash->{$linkmatch[1]}{'links'};
             my $link2 = $matchhash->{$linkmatch[0]}{'links'};
             my $ratio = $link1 / $link2;        ## relative ratio of the two most abundant contig pairs
             $ratio = sprintf("%.2f", $ratio);

             if($ratio <= $max_link_ratio){
               foreach my $mat (keys %$matchhash){
                 delete $matchhash->{$mat} if($mat ne $linkmatch[0]);
               }
               $foundlinks = 0;
               $countmatches = 1;
             }
         }
         if((!$foundlinks) && $countmatches > 0){
           # append the accepted partner(s) in order of increasing mean gap
           my $nummatch =0;
           my @chainlist;
           my @tiglist;
           foreach my $incl_matches (sort {$matchhash->{$a}{'ratio'}<=>$matchhash->{$b}{'ratio'}} keys %$matchhash){
             if($tig ne $incl_matches){
               $nummatch++;
               my $listmatch = $pair->{$tig};
               my $tempnum = $1 if($incl_matches =~ /[fr](\w+)/);
               my $link2 = $listmatch->{$incl_matches}{'links'};
               my $mean2 = $listmatch->{$incl_matches}{'gaps'}/$link2;

               # every appended contig except the last is marked as used right away; the last
               # one becomes the new $tig and is marked at the top of the next pass
               $seen_start->{$tempnum}++if($nummatch < $countmatches);

               ($chain, $total, $tig) = &getChain($chain, $ext, $link2, $mean2, $incl_matches, $tempnum, $ratio, $tig_length, $total);
               delete $tig_length->{$tempnum};
             }
           }
           $extension = 1;

         }else{
           # no acceptable partner: stop extending
           $extension = 0;
           last EXTENSION;
         }
      }else{
           # the current contig is already part of a scaffold: stop extending
           $extension = 0;
           last EXTENSION;
      }
   }
   return $total, $chain, $seen_start;
}
597 | 
###function to combine contigs into a scaffold
# Arguments:
#   $chain      - layout string built so far
#   $ext        - "R" appends the contig to the chain, anything else prepends it
#   $link       - number of links supporting the connection
#   $mean       - mean gap of the connection (truncated to an integer in the output)
#   $match      - contig end that is added ("f<id>" or "r<id>")
#   $tempnum    - contig id of $match
#   $ratio      - ratio value written after "a" in the layout string
#   $tig_length - hash ref: contig id => length
#   $total      - summed contig length so far
# Returns ($chain, $total, $tig) where $tig is $match, the end from which the
# caller continues the extension.
sub getChain{
  my ($chain, $ext, $link, $mean, $match, $tempnum, $ratio, $tig_length, $total) = @_;
  my $tig = $match;
  my $evidence = "k" . $link . "a" . $ratio . "m" . int($mean) . "_";
  if($ext eq "R"){
    $chain .= $evidence . $match . "z" . $tig_length->{$tempnum};
  }else{
    #on the left side the contig is written in the opposite orientation.
    #The previous code used $1 after a failed match for the non-"r" case, which only
    #worked because $1 still held the capture of a match done by the caller;
    #the contig id is now taken from $tempnum instead.
    my $temp_match = "";
    if($match =~ /^r(\d+)/){
      $temp_match = "f" . $1;
    }else{
      $temp_match = "r" . $tempnum;
    }
    $chain = $temp_match . "z" . $tig_length->{$tempnum} . $evidence . $chain;
  }

  $total += $tig_length->{$tempnum};
  return ($chain, $total, $tig);
}
613 | 
614 | 
###GET THE DISTANCE BETWEEN TWO PAIRED READS
# Returns the gap (positive) or overlap (negative) between two contigs i and j,
# i.e. the insert size minus the part of the insert that is covered by the
# contigs:
#
#   L  ------  --------- R
#   i    ->        <-    j
#        ....  ......    covered part of the insert
#        ============    insert_size
sub getDistance{
   my ($insert_size, $length_i, $start_i, $start_j) = @_;

   #bases from the read start to the end of contig i, plus bases from the start of contig j to its read
   my $covered = ($length_i - $start_i) + $start_j;

   return $insert_size - $covered;
}
630 | 
###Pair contigs based on mapping of two reads
#
# Arguments:
#   $trackA, $trackB - mapping of read A / read B as "contig|start|end" (the format returned
#                      by StoreResults, where start > end marks a read on the reverse strand)
#   $read_a, $read_b - the two read sequences; only used in the messages printed here
#
# When the reads map to different contigs, the gap between the contigs is estimated with
# getDistance() and, if it is at least $min_allowed, one link is added to $pair for the
# contig pair and for its mirrored (reverse strand) representation. Otherwise the pair is
# counted in $err and reported to the PET handle.
# When both reads map to the same contig, the distance between them is recorded in
# $track_insert and checked against $low_iz / $up_iz.
#
# Uses the file-level variables $ori, $tig_length, $insert_size, $min_allowed, $low_iz,
# $up_iz, $verbose, $pair, $err, $track_insert, $total_for_median and the $ct_* counters.
sub pairContigs{
  my ($trackA, $trackB, $read_a, $read_b) = @_;
  my ($tig_a, $A_start, $A_end) = split(/\|/, $trackA);
  my ($tig_b, $B_start, $B_end) = split(/\|/, $trackB);
  # $ori holds one character per read (presumably the expected read orientations of the
  # library - confirm where $ori is set); "R" for the first read, or "F" for the second
  # read, swaps start and end of that read so the logic below sees a -> <- pair
  my ($ori_1,$ori_2) = split(//, $ori);
  if($ori_1 eq "R"){
    my ($tmp_A_start, $tmp_A_end) = ($A_start, $A_end);
    ($A_start, $A_end) = ($tmp_A_end, $tmp_A_start);
  } 
  if($ori_2 eq "F"){
    my ($tmp_B_start,$tmp_B_end) = ($B_start,$B_end);
    ($B_start,$B_end) = ($tmp_B_end,$tmp_B_start);
 }
  my $ftig_a = "f" . $tig_a;
  my $ftig_b = "f" . $tig_b;
  my $rtig_a = "r" . $tig_a;
  my $rtig_b = "r" . $tig_b;
  my $A_length = $tig_length->{$tig_a};
  my $B_length = $tig_length->{$tig_b};
  # NOTE(review): the numeric test is redundant - whenever the ids differ numerically they
  # also differ as strings, so the "ne" comparison alone decides this condition
  if (($tig_a != $tig_b) || ($tig_a ne $tig_b)){####paired reads located on <> contigs
    ####Determine most likely possibility
    if ($A_start < $A_end){
      if ($B_end < $B_start){####-> <- :::  A-> <-B  /  rB -> <- rA
        my $d = &getDistance($insert_size, $A_length, $A_start, $B_start);
        print "A-> <-B  WITH $tig_a -> <- $tig_b GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Alen, Astart,Bstart\n" if($verbose);
        if($d >= $min_allowed){
          # the link is stored twice: as fA -> fB and as its mirror rB -> rA
          $pair->{$ftig_a}{$ftig_b}{'links'}++;
          $pair->{$ftig_a}{$ftig_b}{'gaps'} += $d;
          $pair->{$rtig_b}{$rtig_a}{'links'}++;
          $pair->{$rtig_b}{$rtig_a}{'gaps'} += $d;
          $ct_ok_pairs++;
        }else{
          my $err_pair = $ftig_a . "-". $ftig_b;
          $err->{$err_pair}{'links'}++;
          $err->{$err_pair}{'gaps'} += $d;
          $ct_problem_pairs++;
           print PET "Pairs unsatisfied in distance within a contig pair.  A-> <-B  WITH tig#$tig_a -> $d <- tig#$tig_b, A=$A_length nt (start:$A_start, end:$A_end) B=$B_length nt (start:$B_start, end:$B_end) CALCULATED DISTANCE APART: $d < $min_allowed\n";
        }
      }else{#### -> -> ::: A-> <-rB  / B-> <-rA
        # read B points the same way as read A, so contig B is used reverse complemented
        # and the position of read B is counted from the other end of the contig
        my $rB_start = $B_length - $B_start;
        my $d = &getDistance($insert_size, $A_length, $A_start, $rB_start);
        print "A-> <-rB  WITH $tig_a -> <- r.$tig_b GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Alen,Astart,rBstart\n"if($verbose);
        if($d >= $min_allowed){
          $pair->{$ftig_a}{$rtig_b}{'links'}++;
          $pair->{$ftig_a}{$rtig_b}{'gaps'} += $d;
          $pair->{$ftig_b}{$rtig_a}{'links'}++;
          $pair->{$ftig_b}{$rtig_a}{'gaps'} += $d;
          $ct_ok_pairs++;
        }else{
          my $err_pair = $ftig_a . "-". $rtig_b;
          $err->{$err_pair}{'links'}++;
          $err->{$err_pair}{'gaps'} += $d;
          $ct_problem_pairs++;
          print PET "Pairs unsatisfied in distance within a contig pair.  A-> <-rB  WITH tig#$tig_a -> $d <- tig#r.$tig_b, A=$A_length  nt (start:$A_start, end:$A_end) B=$B_length nt (start:$B_start, end:$B_end) CALCULATED DISTANCE APART: $d < $min_allowed\n";
        }
      }
    }else{
      if ($B_end > $B_start){####<-  -> ::: B-> <-A / rA -> <- rB
        my $d = &getDistance($insert_size, $B_length, $B_start, $A_start);
        print "B-> <-A  WITH $tig_b -> <- $tig_a GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Blen,Bstart,Astart\n" if($verbose);
        if($d >= $min_allowed){
          $pair->{$ftig_b}{$ftig_a}{'links'}++;
          $pair->{$ftig_b}{$ftig_a}{'gaps'} += $d;
          $pair->{$rtig_a}{$rtig_b}{'links'}++;
          $pair->{$rtig_a}{$rtig_b}{'gaps'} += $d;
          $ct_ok_pairs++;
        }else{
          my $err_pair = $ftig_b . "-". $ftig_a;
          $err->{$err_pair}{'links'}++;
          $err->{$err_pair}{'gaps'} += $d;
          $ct_problem_pairs++;
          print PET "Pairs unsatisfied in distance within a contig pair.  B-> <-A  WITH tig#$tig_b -> $d <- tig#$tig_a, B=$B_length nt (start:$B_start, end:$B_end) A=$A_length nt (start:$A_start, end:$A_end) CALCULATED DISTANCE APART: $d < $min_allowed\n";
        }
      }else{                          ####<- <-  :::  rB-> <-A / rA-> <-B
        my $rB_start = $B_length - $B_start;
        my $d = &getDistance($insert_size, $B_length, $rB_start, $A_start);
        print "rB-> <-A WITH r.$tig_b -> <- $tig_a GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Blen,rBstart,Astart\n" if($verbose);
        if($d >= $min_allowed){
          $pair->{$rtig_b}{$ftig_a}{'links'}++;
          $pair->{$rtig_b}{$ftig_a}{'gaps'} += $d;
          $pair->{$rtig_a}{$ftig_b}{'links'}++;
          $pair->{$rtig_a}{$ftig_b}{'gaps'} += $d;
          $ct_ok_pairs++;
        }else{
          my $err_pair = $rtig_b . "-". $ftig_a;
          $err->{$err_pair}{'links'}++;
          $err->{$err_pair}{'gaps'} += $d;
          $ct_problem_pairs++;
          print PET "Pairs unsatisfied in distance within a contig pair.  rB-> <-A WITH tig#r.$tig_b -> $d <- tig#$tig_a, B=$B_length nt (start:$B_start, end:$B_end) A=$A_length nt (start:$A_start, end:$A_end) CALCULATED DISTANCE APART: $d < $min_allowed\n";
        }
      }
    }
  }else{###Clone, paired reads located on the same contig -- could be used to investigate misassemblies
      print "Pair ($read_a and $read_b) located on same contig $tig_a ($A_length nt)\n" if ($verbose);
      my $pet_size = 0;

      # only pairs whose reads point towards each other contribute to the insert size statistics
      if ($A_start > $B_start && ($B_start < $B_end) && ($A_start > $A_end)){    # B --> <-- A
        $total_for_median++;
        $pet_size = $A_start - $B_start;
        $track_insert->{$pet_size}++;
        if($pet_size >= $low_iz && $pet_size <= $up_iz){
           $ct_ok_contig++;
        }else{
          print PET "Pairs unsatisfied in distance within a contig.  Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end CALCULATED DISTANCE APART: $pet_size\n";
          $ct_iz_issues++;
        }
    }elsif($B_start > $A_start && ($B_start > $B_end) && ($A_start < $A_end)){ # A --> <-- B
      $total_for_median++;
      $pet_size = $B_start - $A_start;
      $track_insert->{$pet_size}++;
      if($pet_size >= $low_iz && $pet_size <= $up_iz){
        $ct_ok_contig++;
      }else{
        print PET "Pairs unsatisfied in distance within a contig.  Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end CALCULATED DISTANCE APART: $pet_size\n";
        $ct_iz_issues++;
      }
    }else{
      # any other combination of read directions/positions is counted as illogical
      $ct_illogical++;
      print PET "Pairs unsatisfied in pairing logic within a contig.  Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end\n";
    }
  }
}
754 | 
###Print read pairing results to the summary file, including estimation of mean and median insert size
# Writes, in this order:
#   - a summary of the rejected contig pairs (from $err) to PET, which is then closed;
#   - the insert size distribution ("size,count" per line) to the file $distribution;
#   - the read pair statistics and the estimated mean/median insert size to SUMFILE.
# Returns the file-level $pair hash.
# Uses the file-level variables $err, $track_insert, $total_for_median, $distribution,
# $insert_size, $min_allowed, $ct_both and the $ct_* counters.
sub printResultsPairing{
   print PET "------------- Putative issues with contig pairing - Summary  ----------------\n";
   foreach my $err_pair (sort {$err->{$b}{'links'}<=>$err->{$a}{'links'}} keys %$err){
      my $mean_iz = 0;
      $mean_iz = $err->{$err_pair}{'gaps'} / $err->{$err_pair}{'links'} if ($err->{$err_pair}{'links'});
      print PET "Pair $err_pair has $err->{$err_pair}{'links'} links and mean distance = $mean_iz\n";
   }
   close PET;

   my $satisfied = $ct_ok_pairs + $ct_ok_contig;
   my $unsatisfied = $ct_problem_pairs + $ct_iz_issues + $ct_illogical;
   my $ct_both_reads = $ct_both * 2;

   #write distribution file
   open (my $csv_fh, '>', $distribution) || die "Can't open $distribution for writing -- fatal";
   my ($median_ins, $record, $sumX, $sumX2) = (0,0,0,0);
   my $median_bin = int($total_for_median/2);

   foreach my $is (sort {$a<=>$b} keys %$track_insert){
     my $observations = $track_insert->{$is};
     if($observations > 0){
       #whole bin at once instead of one loop iteration per read pair
       $record += $observations;
       $sumX += $is * $observations;
       $sumX2 += ($is * $is) * $observations;
       #the median is the first insert size at which the cumulative count reaches half of the pairs
       $median_ins = $is if($record >= $median_bin && $median_ins == 0);
     }
     print $csv_fh "$is,$track_insert->{$is}\n";
   }
   my ($mean_ins,$sigma) = (0,0);
   if($sumX > 0 && $record > 0){
     $mean_ins = int($sumX/$record);
     #clamp so that rounding can never hand a negative value to sqrt
     my $variance = $sumX2/$record - $mean_ins*$mean_ins;
     $variance = 0 if($variance < 0);
     $sigma = sprintf("%.2f",sqrt($variance));
   }
   close $csv_fh;

   print SUMFILE "READ PAIRS STATS:\n";
   print SUMFILE "\tAssembled pairs: $ct_both ($ct_both_reads sequences)\n";
   print SUMFILE "\t\tSatisfied in distance/logic within contigs (i.e. -> <-, distance on target: $insert_size +/$min_allowed): $ct_ok_contig\n";
   print SUMFILE "\t\tUnsatisfied in distance within contigs (i.e. distance out-of-bounds): $ct_iz_issues\n";
   print SUMFILE "\t\tUnsatisfied pairing logic within contigs (i.e. illogical pairing ->->, <-<- or <-->): $ct_illogical\n";
   print SUMFILE "\t\t---\n";
   print SUMFILE "\t\tSatisfied in distance/logic within a given contig pair (pre-scaffold): $ct_ok_pairs\n";
   print SUMFILE "\t\tUnsatisfied in distance within a given contig pair (i.e. calculated distances out-of-bounds): $ct_problem_pairs\n";
   print SUMFILE "\t\t---\n";
   print SUMFILE "\tTotal satisfied: $satisfied\tunsatisfied: $unsatisfied\n\n";
   print SUMFILE "\n\tEstimated insert size statistics (based on $total_for_median pairs): \n";
   print SUMFILE "\t\tMean insert size = $mean_ins\n";
   print SUMFILE "\t\tMedian insert size = $median_ins\n";
  # print SUMFILE "\t\tInsert size deviation = $sigma\n$seplines\n";

   &FlushFiles();
   return $pair;
}
809 | 
###Function that maps the readfiles to the contigs
# Arguments:
#   $contigFile  - fasta file with the contigs to index
#   $singlereads - comma separated list of read files
#   $gaps        - value passed to bowtie -v
#   $threads     - value passed to bowtie -p; also selects the output parser
# Builds the bowtie index in bowtieoutput/ and hands the bowtie command line
# (ending in "|") to the parser, which opens it as a pipe.
# Dies when an input file is missing or bowtie-build fails.
# Uses the file-level $base_name and $library.
sub mapReadsWithBowtie{
   my ($contigFile, $singlereads, $gaps, $threads) = @_;
   #building Index of contig and mapping reads to Index
   my $bowtieout = $base_name . ".$library.bowtieIndex";

   foreach my $read (split(/,/,$singlereads)){
     die "Single read file ($read) not found. Exiting...\n" if(!(-e $read));
   }
   #NOTE: this command line is still interpreted by the shell when the parser opens it
   my $procline = "bowtie -p $threads -v $gaps -m 1 bowtieoutput/$bowtieout --suppress 6,7 -f $singlereads --quiet --refidx |";
   die "Contig file ($contigFile) not found. Exiting...\n" if(!(-e $contigFile));
   &printMessage("\n=>".getDate().": Building Bowtie index for contigs\n");
   #list form: no shell involved, so file names with spaces or shell characters are passed on unchanged
   system("bowtie-build", $contigFile, "bowtieoutput/$bowtieout", "--quiet", "--noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values

   #Treat the output of Bowtie differently if multithreading is used or not
   if($threads <= 1){
     readBowtieOneThread($procline);
   }else{
     readBowtieMultThread($procline);
   }
}
829 | 
###Parse output of Bowtie when only one thread of Bowtie is used
# $input is the bowtie command line ending in "|"; it is opened as a pipe.
# Two consecutive output lines with the same read name (the part before the
# first "/") are treated as a pair. Every distinct pair of sequences (also
# checked in reverse complement) is passed to pairContigs() once.
# Writes the mapping statistics to SUMFILE; updates the file-level $ct_both
# and uses the file-level $up_iz and $seplines.
sub readBowtieOneThread{
   my ($input) = @_;
   my $lower = ($up_iz+200);
   my $sub = ($lower * 2) + 3;
   my ($prevline, $line, $prevread, $counter, $step, $pair_found) = ("","","",0, 1000000, 0);
   my ($seq1, $seq2, $track1, $track2, $count);

   &printMessage("\n=>".getDate().": Mapping reads to contigs. Reading bowtie output and pairing contigs\n");
   #two argument open on purpose: $input is a command line with a trailing "|"
   open(IN, "$input") || die "Can't open bowtie output -- fatal\n";
   #go through mapping results
   while($line = <IN>){
      if(++$counter == $step){
        CounterPrint($counter);
        $step = $step + 1000000;
      }
      my ($read) = split(/\//,$line);
      if($prevread eq $read){
        $pair_found++;
        ($seq1, $track1) = StoreResults($prevline, $lower, $sub);
        ($seq2, $track2) = StoreResults($line, $lower, $sub);
        my $combined = "$seq1:$seq2";
        my $revcombined = reverseComplement($combined);
        #use every distinct read pair only once
        if(!$count->{$combined} && !$count->{$revcombined}){
          $count->{$combined}++;
          pairContigs($track1, $track2, $seq1, $seq2);
          $ct_both++;
        }
      }
      $prevread = $read;
      $prevline = $line;
   }
   close IN;
   CounterPrint("                ");
   print SUMFILE "\nMAPPING READS TO CONTIGS:\n";
   print SUMFILE "$seplines\tNumber of single reads found on contigs = $counter\n";
   my $read_number_message = "\tNumber of pairs used for pairing contigs / total pairs = $ct_both / $pair_found\n";
   #print instead of printf: the message is data, not a format string
   print SUMFILE $read_number_message.$seplines."\n";
   &FlushFiles();
}
870 | 
###Parse output of Bowtie when multiple threads of Bowtie are used
# $input is the bowtie command line ending in "|"; it is opened as a pipe.
# The two reads of a pair are not necessarily adjacent in this output, so
# every line is remembered by read name in $readHash. The lines are kept in
# buckets of $step lines and only the current and the previous bucket are
# retained; a mate is therefore only found when it is still in one of them.
# Every distinct pair of sequences (also checked in reverse complement) is
# passed to pairContigs() once.
# Writes the mapping statistics to SUMFILE; updates the file-level $ct_both
# and uses the file-level $up_iz and $seplines.
sub readBowtieMultThread{
   my ($input) = @_;
   my $lower = ($up_iz+200);
   my $sub = ($lower * 2) + 3;
   my ($line, $step, $ct, $ctHash, $pair_found) = ("", 1000000, 0, 1, 0);
   my ($count, $readHash);

   &printMessage("\n=>".getDate().": Mapping reads to contigs. Reading bowtie output and pairing contigs\n");
   #two argument open on purpose: $input is a command line with a trailing "|"
   open(IN,"$input") || die "Can't open bowtie output -- fatal\n";
   while ($line = <IN>) {
     #start a new bucket every $step lines and drop the oldest one
     if(++$ct >= ($ctHash*$step)){
       CounterPrint($ct);
       delete $readHash->{($ctHash-1)};
       $ctHash++;
     }
     my ($readname) = split(/\t/,$line);
     my ($read,$readnum) = split(/\//,$readname);

     #look for the mate in the previous bucket first, then in the current one
     my $mate_line;
     if($readHash->{($ctHash-1)}{$read}){
       $mate_line = $readHash->{($ctHash-1)}{$read};
     }elsif(defined $readHash->{$ctHash}{$read}){
       $mate_line = $readHash->{$ctHash}{$read};
     }

     if(defined $mate_line){
       $pair_found++;
       my ($mate_seq, $mate_track) = StoreResults($mate_line, $lower, $sub);
       my ($this_seq, $this_track) = StoreResults($line, $lower, $sub);
       #read number 1 always goes first
       my ($seq1, $track1, $seq2, $track2);
       if($readnum == 1){
         ($seq1, $track1, $seq2, $track2) = ($this_seq, $this_track, $mate_seq, $mate_track);
       }else{
         ($seq1, $track1, $seq2, $track2) = ($mate_seq, $mate_track, $this_seq, $this_track);
       }
       my $combined = "$seq1:$seq2";
       my $revcombined = reverseComplement($combined);
       #use every distinct read pair only once
       if(!$count->{$combined} && !$count->{$revcombined}){
          $count->{$combined}++;
          pairContigs($track1, $track2, $seq1, $seq2);
          $ct_both++;
       }
     }
     $readHash->{$ctHash}{$read} = $line;
   }
   close IN;
   CounterPrint("                ");
   print SUMFILE "\nMAPPING READS TO CONTIGS:\n";
   print SUMFILE "$seplines\tNumber of single reads found on contigs = ".  $ct."\n";
   my $read_number_message = "\tNumber of pairs used for pairing contigs / total pairs = $ct_both / $pair_found\n";
   #print instead of printf: the message is data, not a format string
   print SUMFILE $read_number_message.$seplines."\n";
   &FlushFiles();
}
932 | 
###Convert one line of bowtie output into the read sequence and its position on the contig
# $input is a tab separated line: read name, strand, contig index, start, sequence.
# The contig index is increased by one to obtain the contig id. For contigs
# longer than (2 * $lower + 100), a start beyond $lower is converted to
# contig_length - ($sub - start).
# Returns ($seq, "contig|start|end"); for reads that are not on the "+" strand
# start and end are swapped and the sequence is reverse complemented.
# Uses the file-level $tig_length.
sub StoreResults{
  my ($input, $lower, $sub) = @_;
  my (undef, $strand, $tig, $start, $seq) = split(/\t/,$input);
  $tig++;

  if($start >  $lower && $tig_length->{$tig} > (($lower * 2)+100)){
    $start = $tig_length->{$tig} - ($sub - $start);
  }

  my $far_end = $start + length($seq);
  my ($startval, $endval);
  if($strand eq "+"){
    ($startval, $endval) = ($start, $far_end);
  }else{
    ($startval, $endval) = ($far_end, $start);
    $seq = reverseComplement($seq);
  }
  return $seq, "$tig|$startval|$endval";
}
954 | 
###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
# Returns the reversed string with A<->T and G<->C exchanged; all other
# characters (lower case bases, N, ':' ...) are kept as they are.
# Works on a private copy, so the caller's $_ is no longer overwritten, and
# returns the reversed string in list context as well (a bare reverse()
# yields an empty list there).
sub reverseComplement{
   my $sequence = shift;
   $sequence =~ tr/ATGC/TACG/;
   return scalar reverse($sequence);
}
961 | 
###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
# The carriage return moves the cursor back to the start of the line, so each
# call prints over the previous counter. Autoflush is switched on for the
# currently selected handle so the counter is shown immediately.
sub CounterPrint{
  my ($progress) = @_;
  print "\r" . $progress;
  $|++;
}
968 | 
###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
# Sends the same text to the currently selected output handle and to the
# global LOG filehandle.
sub printMessage{
  my ($text) = @_;
  print $text;
  print {*LOG} $text;
}
975 | 
###FUNCTION TO GET THE CURRENT DATE
# Returns the current local time as the string produced by localtime in
# scalar context.
sub getDate{
  return scalar(localtime());
}
981 | 
###FLUSHES THE SUMMARY AND LOG FILE
# Turns on autoflush for the SUMFILE and LOG handles and for the handle that
# was selected on entry; the selected handle is the same before and after.
sub FlushFiles{
  my $selected_on_entry = select(SUMFILE);
  $| = 1;
  select(LOG);
  $| = 1;
  select($selected_on_entry);
  return $|++;
}
988 | 
989 | #########END PairingAndScaffolding.pl
990 | 


--------------------------------------------------------------------------------
/bin/readLibFiles.pl:
--------------------------------------------------------------------------------
  1 |   #############################################################
  2 |   #Marten Boetzer 13-06-2011                                  #
  3 |   #SSPACE perl subscript readLibFiles.pl                      #
  4 |   #This script;                                               #
  5 |   #  -reads, converts and filters original input sequences    #
  6 |   #############################################################
  7 | 
  8 |   use Storable;
  9 |   use File::Path;
 10 |   use File::Basename;
 11 |   use threads;
 12 | 
 13 |   my $seplines = ("-" x 60)."\n";
 14 |   my $maxlen = 0;
 15 | 
 16 |   my $libraryfile = $ARGV[0];
 17 |   my $base_name = $ARGV[1];
 18 |   my $extending = $ARGV[2];
 19 |   my $unpaired_file = $ARGV[3];
 20 |   my $min_overlap = $ARGV[4];
 21 |   my $thread = $ARGV[5];
 22 |   my $log = $base_name . ".logfile.txt";
 23 |   my $summaryfile = $base_name.".summaryfile.txt";
 24 | 
 25 |   open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
 26 |   open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";
 27 | 
 28 |   my $filenameOutFilt = "filtered.readpairs.fasta";
 29 |   my $filenameOutExt = $base_name . ".singlereads.fasta";
 30 | 
 31 | #-------------------------------------------------READ UNPAIRED FILE CONTAINING SINGLE READS
 32 |   &readUnpairedFile($unpaired_file) if ($unpaired_file);
 33 | #-------------------------------------------------LOOP THROUGH EACH LIBRARY IN LIBRARYFILE AND STORE AND FILTER READS
 34 |   open(FILELIB, "< $libraryfile");
 35 | 
 36 |   my ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse, $libResHash);
 37 |   my ($prevlibrary, $ctlib) = ("",0);
 38 |   &printMessage("\n=>".getDate().": Reading, filtering and converting input sequences of library file initiated\n");
 39 | 
 40 |   while(<FILELIB>){
 41 |     chomp;
 42 |     ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse) = split(/\s+/, $_);
 43 | 
 44 |     next if($library eq "");
 45 |     $ctlib=0 if($library ne $prevlibrary && $prevlibrary ne "");
 46 |     $ctlib++;
 47 | 
 48 |     my ($fileBaseName1, $dirName1, $fileExtension1) = fileparse($fileA);
 49 |     my ($fileBaseName2, $dirName2, $fileExtension2) = fileparse($fileB);
 50 | 
 51 |     my $fname = "reads/$base_name.$library.filtered.readpairs.singles.fasta";
 52 |     my ($counter2, $Ncount2);
 53 |     #Process multiple files at the same time if multithreaded option is set (-T parameter larger than 1)
 54 |     if($fileA ne "TAB"  && $thread > 1){
 55 |        my $thr = threads->create(\&generateInputFiles, $library, $fileA, $fileB, $extending, $reverse, $fname, $ctlib);
 56 |        if(!($ctlib % $thread)){
 57 |          foreach my $thr (threads->list()) {
 58 |            my @res = $thr->join();
 59 |            ($lib,$nreads,$ncount) = split(/,/,$res[0]);
 60 |            $libResHash->{$lib}{'reads'}+=$nreads;
 61 |            $libResHash->{$lib}{'N'}+=$ncount;
 62 |          }
 63 |        }
 64 |     #otherwise, process only one file at a time
 65 |     }elsif($fileA ne "TAB" && $thread <=1){
 66 |       my $out = &generateInputFiles($library, $fileA, $fileB, $extending, $reverse, $fname, $ctlib);
 67 |       ($lib,$nreads,$ncount) = split(/,/,$out);
 68 |       $libResHash->{$lib}{'reads'}+=$nreads;
 69 |       $libResHash->{$lib}{'N'}+=$ncount;
 70 |     }
 71 |     #if user has inserted a TAB file, calculate read statistics
 72 |     if($fileA eq "TAB"){
 73 |       open FILE, "$fileB" or die $!;
 74 |       my ($fileBaseName2, $dirName2, $fileExtension2) = fileparse($fileB);
 75 |       print "Reading tabfile: $fileBaseName2...\n";
 76 |       $counter2++ while(<FILE>);
 77 |       $libResHash->{$lib}{'reads'}+=$counter2;
 78 |       $libResHash->{$lib}{'N'} = 0;
 79 |       close FILE;
 80 |     }
 81 |     $prevlibrary = $library;
 82 |   }
 83 |   #Process remaining reads
 84 |   if($fileA ne "TAB"){
 85 |     foreach my $thr (threads->list()) {
 86 |       my @res = $thr->join();
 87 |       ($lib,$nreads,$ncount) = split(/,/,$res[0]);
 88 |       $libResHash->{$lib}{'reads'}+=$nreads;
 89 |       $libResHash->{$lib}{'N'}+=$ncount;
 90 |     }
 91 |   }
 92 |   #Print read statistics to the summary file
 93 |   &printMessage("\n$seplines");
 94 |   foreach my $libs (keys %$libResHash){
 95 |     my $totcounter = $libResHash->{$libs}{'reads'};
 96 |     my $totNcount = $libResHash->{$libs}{'N'};
 97 |     my $filt = $totcounter-$totNcount;
 98 |     print SUMFILE "READING READS $libs:\n";
 99 |     print SUMFILE "$seplines\tTotal inserted pairs = $totcounter \n";
100 |     print SUMFILE "\tNumber of pairs containing N's = $totNcount \n\tRemaining pairs = $filt\n$seplines\n";
101 |   }
102 |   close FILELIB;
103 |   close SUMFILE;
104 |   close LOG;
105 | 
106 |   mkpath('process_OK'); #make directory, indicating that process has run OK
107 | 
108 | #--------------------------------------------------
109 | 
###CONVERT INPUT SEQUENCES BY REMOVING PAIRED READS HAVING AN 'N'
# Reads two mate files in lock-step (FASTA, or FASTQ when the first line of
# $fileA starts with '@') and writes every pair in which neither read
# contains an 'N' to reads/<base_name>.<lib>.file<libct>.fa.
#   $lib, $libct     - library name and file counter, used in the output name
#   $fileA, $fileB   - first and second mate files
#   $extension, $reverse, $fname - accepted for interface compatibility;
#                      not used for the output written here
# Returns the string "lib,total_pairs,pairs_with_N".
sub generateInputFiles{
  my ($lib, $fileA, $fileB, $extension, $reverse, $fname, $libct) = @_;
  my ($Ncount, $countsinglet, $fastq, $step) = (0,0,0,1000000);

  my $outname = "reads/$base_name.$lib.file$libct.fa";
  open (my $out_fh, ">", $outname) || die "Can't write to $outname -- fatal\n";

  #check if file is fastQ or fastA
  open(my $test_fh, "<", $fileA) || die "Can't open $fileA -- fatal\n";
  my $name = <$test_fh>;
  close $test_fh;
  $fastq = 1 if (defined $name && $name =~ /^[@]/);

  open(my $fhA, "<", $fileA) || die "Can't open $fileA -- fatal\n";
  open(my $fhB, "<", $fileB) || die "Can't open $fileB -- fatal\n";
  CounterPrint("Reading read-pairs $lib.$libct @ $countsinglet       ");
  while(defined(my $headerA = <$fhA>)) {
    my $headerB = <$fhB>;
    my $seq1 = <$fhA>;
    my $seq2 = <$fhB>;
    #FASTQ FORMAT: skip the separator and quality lines of both mates
    if($fastq){
      foreach my $fh ($fhA, $fhA, $fhB, $fhB){
        my $skipped = <$fh>;
      }
    }

    # Upper-case and strip the line ending. The old pattern s/^\r\n/\n/ was
    # anchored at the start of the line and so never removed the carriage
    # return of CRLF files. A missing line (truncated mate file) is treated
    # as an empty sequence.
    foreach my $seq ($seq1, $seq2){
      $seq = uc(defined $seq ? $seq : "");
      $seq =~ s/\r?\n\z//;
    }

    #if either read contains an N, do not use the pair for contig extension and for scaffolding
    if(index($seq1,"N") == -1 && index($seq2,"N") == -1){
       print $out_fh ">read$countsinglet/1\n$seq1\n>read$countsinglet/2\n$seq2\n";
    }else{
      $Ncount++;
    }
    if(++$countsinglet == $step){
      CounterPrint("Reading read-pairs $lib.$libct @ $countsinglet         ");
      $step = $step + 1000000;
    }

  }
  CounterPrint("\n") if($thread <= 1);
  CounterPrint((" " x 40));
  close $out_fh;
  close $fhB;
  close $fhA;
  return "$lib,$countsinglet,$Ncount";
}
154 | 
155 | #------------------READ UNPAIRED SINGLE READS FILE WHEN -u IS SET
156 | 
# Reads a FASTA or FASTQ file of unpaired reads (FASTQ when the first line
# starts with '@'), upper-cases each sequence and writes the reads without
# an 'N' to reads/$filenameOutExt as >0, >1, ... records. The totals are
# appended to the summary file (SUMFILE).
#   $file - path of the unpaired reads file
sub readUnpairedFile{
  my ($file) = @_;
  open(my $in_fh, "<", $file) || die "Can't open $file -- fatal\n";
  my $outname = "reads/$filenameOutExt";
  open(my $out_fh, ">", $outname) || die "Can't write to $outname -- fatal\n";

  printMessage("\n=>".getDate().": Reading, filtering and converting unpaired input sequences initiated\n");

  my ($counterext, $counter, $step, $fastq) = (0,0, 100000,0);

  # Peek at the first line to tell FASTQ from FASTA.
  open(my $test_fh, "<", $file) || die "Can't open $file -- fatal\n";
  my $name = <$test_fh>;
  close $test_fh;
  $fastq = 1 if (defined $name && $name =~ /^[@]/);

  while(defined(my $header = <$in_fh>)) {
    my $seq1 = <$in_fh>;
    $seq1 = uc(defined $seq1 ? $seq1 : "");
    $seq1 =~ s/\r?\n\z//;

    #FASTQ FORMAT: skip the separator and quality lines
    if ($fastq){
      my $separator = <$in_fh>;
      my $quality = <$in_fh>;
    }
    if(index($seq1, "N") == -1){
       print $out_fh ">$counterext\n$seq1\n";
       $counterext++;
    }
    if(++$counter == $step){
      CounterPrint($counter);
      $step = $step + 100000;
    }
  }
  CounterPrint("                ");

  print SUMFILE "READING UNPAIRED READS:\n";
  print SUMFILE "$seplines\tTotal inserted reads = $counter \n";
  print SUMFILE "\tNumber of reads containing N's = ".($counter-$counterext)."\n\tRemaining reads = $counterext\n";
  # The old code closed "OUTFILEext", a different name from the handle it
  # had opened ("OUTFILEExt"), so the output was never closed explicitly.
  close($out_fh) || die "Can't close $outname -- fatal\n";
  close $in_fh;
}
196 | 
###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
# Complements the upper-case bases A, T, G and C and reverses the string;
# all other characters are left unchanged apart from their position.
# Returns the resulting string regardless of the calling context.
sub reverseComplement{
   my ($dna) = @_;              # lexical copy - the global $_ is left alone
   $dna =~ tr/ATGC/TACG/;
   # Without scalar(), a list-context call would get reverse()'s list
   # behaviour (reversing the argument list) instead of a reversed string.
   return scalar reverse($dna);
}
203 | 
###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
# Emits "\r" plus the given text on the currently selected handle and then
# enables autoflush on that handle.
sub CounterPrint{
  my ($status_line) = @_;
  print "\r${status_line}";
  return $|++;
}
210 | 
###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
# Prints the message twice: once to the currently selected output handle and
# once to the global LOG filehandle.
sub printMessage{
  my ($note) = @_;
  print $note;
  print {*LOG} $note;
}
217 | 
###FUNCTION TO GET THE CURRENT DATE
# Returns localtime evaluated in scalar context, i.e. a human-readable
# timestamp string for the current local time.
sub getDate{
  my $now = localtime();
  return $now;
}
223 | 
###FLUSHES THE SUMMARY AND LOG FILE
# Enables autoflush on SUMFILE, on LOG and on the handle that is selected
# when the sub is called; that handle is selected again on return.
sub FlushFiles{
  my $original_handle = select();
  foreach my $handle_name ('SUMFILE', 'LOG'){
    select($handle_name);
    $| = 1;
  }
  select($original_handle);
  return $|++;
}
230 | 
231 | #########END readLibFiles.pl


--------------------------------------------------------------------------------
/dotlib/DotLib.pm:
--------------------------------------------------------------------------------
  1 | # $Id: DotLib.pm,v 1.3 2003/02/24 17:33:00 mpop Exp $
  2 | #
  3 | # DotLib.pm - set of procedures for generating .dot files
  4 | #
  5 | 
  6 | #  Copyright @ 2002, 2003, The Institute for Genomic Research (TIGR).  All
  7 | #  rights reserved.
  8 | 
  9 | 
 10 | =head1 Name
 11 | 
 12 | DotLib - library of routines for generating .dot files
 13 | 
 14 | =head1 Synopsis
 15 | 
 16 |     use DotLib;
 17 | 
 18 | =head1 Description
 19 | 
 20 |     A set of procedures used to create various .dot objects such as
 21 | file headers, file tails, components, nodes, edges, etc.
 22 | 
 23 | =cut
 24 | 
package DotLib;

use strict;


# Export setup. It lives in a BEGIN block so that @EXPORT is already filled
# in at compile time, when the "use vars @EXPORT" statement further down is
# processed.
BEGIN {
    use Exporter ();
    use vars qw(@EXPORT @EXPORT_OK @ISA %EXPORT_TAGS);

    @ISA         = qw(Exporter);
    # All six public routines are exported by default; nothing is offered
    # as an optional export and no export tags are defined.
    @EXPORT      = qw(&printHeader
                      &printFooter
		      &printNode
		      &printEdge
		      &startCluster
		      &endCluster
		      );
    %EXPORT_TAGS = ();
    @EXPORT_OK   = ();
}

# Module version plus the revision keyword string.
our $VERSION = '1.0'; 
our $REVISION = '$Revision: 1.3 $ ';
our $VERSION_STRING = "$VERSION ($REVISION)";

# Declare every exported name through the vars pragma.
use vars @EXPORT;
use vars @EXPORT_OK;
 52 | 
 53 | =over 4
 54 | 
 55 | =item B<my $ret = printHeader($file, $type);>
 56 | 
 57 | Prints a .dot header for the type of output specified in the $type variable.
 58 | Allowable types are "printer", "plotter".  If $type is undefined or not
 59 | passed, it generates a default header.  Returns 1 upon successful 
 60 | completion and 'undef' otherwise.
 61 | 
 62 | Example:
 63 | 
 64 |     my $err = printHeader(\*STDOUT, "plotter");
 65 | 
 66 | =cut
 67 | 
 68 | sub printHeader
 69 | {
 70 |     my $file = shift;
 71 |     my $type = shift;
 72 | 
 73 |     print $file "digraph ROOT {\n";
 74 |     print $file "  rankdir = LR\n";
 75 |     print $file "  orientation = landscape\n";
 76 |     print $file "  ranksep = 0.3\n";
 77 |     print $file "  nodesep = 0.3\n";
 78 |     print $file "  fontsize = 8\n";
 79 |     print $file "  margin = \".2,.2\"\n";
 80 | 	
 81 |     if ($type eq "printer"){
 82 | 	print $file "  ratio = auto\n";
 83 | 	print $file "  page = \"8.5,11\"\n";
 84 |     } elsif ($type eq "plotter"){
 85 | 	print $file "  ratio = auto\n";
 86 | 	print $file "  page = \"36,48\"\n";
 87 |     }
 88 |     
 89 |     print $file "\n";
 90 | 
 91 |     return 1;
 92 | } # printHeader
 93 | 
 94 | 
 95 | =item B<my $ret = printFooter($file);>
 96 | 
 97 | Prints a .dot footer (currently just a closed brace).  Returns 1 upon
 98 | successful completion and 'undef' otherwise.
 99 | 
100 | Example:
101 | 
102 |     my $err = printFooter(\*STDOUT);
103 | 
104 | =cut
105 | 
# Writes the closing brace that ends the .dot graph to the filehandle
# $file. Always returns 1.
sub printFooter
{
    my ($file) = @_;

    print {$file} "}\n";

    return 1;
} # printFooter
114 | 
115 | 
116 | =item B<my $ret = printNode($file, $id, $label, $ori);>
117 | 
118 | Prints a "contig" node with the specified id, label, and orientation.
119 | If orientation is 1 then the node is a forward facing arrow, otherwise
it is a backward facing arrow. Returns 1 upon successful completion
121 | and 'undef' otherwise.
122 | 
123 | Example:
124 | 
125 |     my $err = printNode(\*STDOUT, $node_id, "$node_id ($node_len)", 1);
126 | 
127 | =cut   
128 | 
# Writes one house-shaped node statement to the filehandle $file.
#   $id    - node identifier; every non-word character is replaced by "_"
#   $label - text placed in the label attribute, inserted as given
#   $ori   - 1 gives orientation -90, any other value gives 90
# Always returns 1.
sub printNode
{
    my ($file, $id, $label, $ori) = @_;

    $id =~ s/\W/_/g;
    my $angle = ($ori == 1) ? -90 : 90;

    my $node = sprintf(
        qq{    %s [ label = "%s" height = 0.2, fontsize = 8, shape = "house", orientation = %d ]\n},
        $id, $label, $angle
    );
    print {$file} $node;

    return 1;

} # printNode
150 | 
151 | 
152 | =item B<my $ret = printEdge($file, $nodeA, $nodeB, $label, $style);>
153 | 
154 | Prints an edge between two nodes with the specified label.  The style can
155 | be any of the GraphViz acceptable styles ("dotted", "solid", "dashed", 
156 | "invis") or undefined in which case the default is used. Returns 1 upon
157 | successful completion and 'undef' otherwise.
158 | 
159 | Example:
160 | 
161 |     my $err = printEdge(\*STDOUT, $nodeA, $nodeB, "A to B", "invis");
162 | 
163 | =cut   
164 | 
# Writes one edge statement "nodeA -> nodeB" to the filehandle $file.
#   $nodeA, $nodeB - node identifiers; non-word characters become "_"
#   $label         - edge label, inserted as given
#   $instyle       - optional GraphViz style name; "invis" additionally
#                    colours the edge white
# Always returns 1.
sub printEdge
{
    my ($file, $nodeA, $nodeB, $label, $instyle) = @_;

    $nodeA =~ s/\W/_/g;
    $nodeB =~ s/\W/_/g;

    # Start from an empty string so that an omitted style is not
    # interpolated as undef below.
    my $style = "";
    if (defined $instyle){
        $style = "style = \"" . $instyle . "\"";
        if ($instyle eq "invis"){
            $style .= " color = \"white\" ";
        }
    }

    my $edge = sprintf(
        qq{    %s -> %s [ label ="%s" fontsize = 8 %s ]\n},
        $nodeA, $nodeB, $label, $style
    );
    print {$file} $edge;

    return 1;
} # printEdge
188 | 
189 | =item B<my $err = startCluster($file, $id, $label);>
190 | 
191 | Starts a cluster in the .dot output file with the given label and id.
192 | Returns 1 upon successful completion and 'undef' otherwise.
193 | 
194 | Example:
195 |     
196 |     my $err = startCluster(\*STDOUT, $clust_id, "first cluster");
197 | 
198 | =cut
199 | 
# Opens a "subgraph cluster_<id>" section in the .dot output and writes its
# label line.
#   $file  - filehandle to print to
#   $id    - cluster identifier; non-word characters are replaced by "_"
#   $label - cluster label, inserted as given
# Always returns 1.
sub startCluster
{
    my ($file, $id, $label) = @_;

    $id =~ s/\W/_/g;

    print {$file} "  subgraph cluster_${id} {\n",
                  "    label = \"${label}\"\n";

    return 1;
} # startCluster
213 | 
214 | =item B<my $err = endCluster($file);>
215 | 
216 | Ends a cluster in the .dot output.  Returns 1 upon successful
217 | completion and 'undef' otherwise.
218 | 
219 | Example: 
220 | 
221 |     my $err = endCluster(\*STDOUT);
222 | 
223 | =cut
224 | 
# Writes the indented closing brace that ends a cluster section to the
# filehandle $file. Always returns 1.
sub endCluster
{
    my ($file) = @_;

    print {$file} "  }\n";

    return 1;
} # endCluster
233 | 
234 | 
235 | 1;
236 | 
237 | 
238 | 
239 | 


--------------------------------------------------------------------------------
/example/ecoli_scaffolds_no_extension.summaryfile.txt:
--------------------------------------------------------------------------------
 1 | READING READS lib1:
 2 | ------------------------------------------------------------
 3 | 	Total inserted pairs = 10408224 
 4 | 	Number of pairs containing N's = 61604 
 5 | 	Remaining pairs = 10346620
 6 | ------------------------------------------------------------
 7 | 
 8 | 
 9 | 
10 | LIBRARY lib1 STATS:
11 | ################################################################################
12 | 
13 | MAPPING READS TO CONTIGS:
14 | ------------------------------------------------------------
15 | 	Number of single reads found on contigs = 1949086
16 | 	Number of pairs used for pairing contigs / total pairs = 666142 / 683736
17 | ------------------------------------------------------------
18 | 
19 | READ PAIRS STATS:
20 | 	Assembled pairs: 666142 (1332284 sequences)
21 | 		Satisfied in distance/logic within contigs (i.e. -> <-, distance on target: 200 +/-150): 519701
22 | 		Unsatisfied in distance within contigs (i.e. distance out-of-bounds): 233
23 | 		Unsatisfied pairing logic within contigs (i.e. illogical pairing ->->, <-<- or <-->): 5
24 | 		---
25 | 		Satisfied in distance/logic within a given contig pair (pre-scaffold): 146175
26 | 		Unsatisfied in distance within a given contig pair (i.e. calculated distances out-of-bounds): 28
27 | 		---
28 | 	Total satisfied: 665876	unsatisfied: 266
29 | 
30 | 
31 | 	Estimated insert size statistics (based on 519934 pairs): 
32 | 		Mean insert size = 215
33 | 		Median insert size = 215
34 | REPEATS: 
35 | 	Number of repeated edges = 24
36 | ------------------------------------------------------------
37 | 
38 | ################################################################################
39 | 
40 | SUMMARY: 
41 | ------------------------------------------------------------
42 | 	Inserted contig file;
43 | 		Total number of contigs = 595
44 | 		Sum (bp) = 4545610
45 | 			Total number of N's = 0
46 | 			Sum (bp) no N's = 4545610
47 | 		Max contig size = 67081
48 | 		Min contig size = 100
49 | 		Average contig size = 7639
50 | 		N50 = 18242
51 | 
52 | 	After scaffolding lib1:
53 | 		Total number of scaffolds = 127
54 | 		Sum (bp) = 4545129
55 | 			Total number of N's = 5518
56 | 			Sum (bp) no N's = 4539611
57 | 		Max scaffold size = 268578
58 | 		Min scaffold size = 100
59 | 		Average scaffold size = 35788
60 | 		N50 = 94525
61 | 
62 | ------------------------------------------------------------
63 | 


--------------------------------------------------------------------------------
/example/libraries.txt:
--------------------------------------------------------------------------------
1 | lib1 SRR001665_1.fastq SRR001665_2.fastq 200 0.75 FR


--------------------------------------------------------------------------------
/tools/TQS.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | __doc__ = """
  4 | TQS
  5 | 
  6 | Trim Quality Solexa Sequences (TQS)
  7 | 
  8 | SYNOPSIS
  9 |    Quality trim solexa-Illumina sequence reads using user-defined thresholds 
 10 | """
 11 | __author__ = "Rene L. Warren"
 12 | __version__ = '1.0'
 13 | 
 14 | #LICENSE
 15 | #   Copyright (c) 2007 Canada's Michael Smith Genome Science Centre.  All rights reserved.
 16 | 
 17 | #   This program is free software; you can redistribute it and/or
 18 | #   modify it under the terms of the GNU General Public License
 19 | #   as published by the Free Software Foundation; either version 2
 20 | #   of the License, or (at your option) any later version.
 21 | 
 22 | #   This program is distributed in the hope that it will be useful,
 23 | #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 24 | #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 25 | #   GNU General Public License for more details.
 26 | 
 27 | import sys, os, re, string
 28 | from datetime import datetime
 29 | from optparse import OptionParser
 30 | 
 31 | 
 32 | def main():
 33 | 	usage = "Usage: %s --help"
 34 | 
 35 | 	parser = OptionParser()
 36 | 	parser.add_option("-f", "--sequence file", dest="seqfile",
 37 | 	                  help="Illumina sequence file - Output format from the 1G Genome Analyzer (_seq.txt):                                       7       1       255     669     AACCCCCACTCCTACAACGCCATCATTCCCCTCGAC",)
 38 | 	parser.add_option("-q", "--qual file", dest="qualfile",
 39 | 	                  help="A prb file containing all the Illumina intensities, as outputted by the 1G Genome Analyzer (_prb.txt)",)
 40 | 	parser.add_option("-l", "--length", dest="mer", type="int", default=36,
 41 | 	                  help="Length of sequence reads (i.e. Number of sequencing cycles, default=36)",)
 42 |         parser.add_option("-t", "--threshold", dest="threshold", type="int", default=5,
 43 |                           help="Base intensity threshold value (-40 to 40, default=5)",)
 44 |         parser.add_option("-d", "--difference", dest="diff", type="int", default=5,
 45 |                           help="Base intensity difference between top intensity and second best (1 to 80, default=5)",)
 46 |         parser.add_option("-c", "--consec", dest="consec", type="int", default=20,
 47 |                           help="Minimum number of consecutive bases passing threshold values (default=20)",)
 48 | 	parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
 49 | 	                  help="Runs in Verbose mode.",)
 50 | 	(opts, args) = parser.parse_args()
 51 | 	
 52 | 	try:
 53 | 		f = open(opts.seqfile)
 54 | 		seq = f.readlines()
 55 | 		f.close()
 56 | 	except Exception, e:
 57 | 		print "ERROR: Could not read from %s: %s" % (opts.seqfile, e)
 58 | 		print usage % (sys.argv[0:])
 59 | 		sys.exit()
 60 | 
 61 |         try:
 62 |                 f = open(opts.qualfile)
 63 |                 qual = f.readlines()
 64 |                 f.close()
 65 |         except Exception, e:
 66 |                 print "ERROR: Could not read from %s: %s" % (opts.qualfile, e)
 67 |                 print usage % (sys.argv[0:])
 68 |                 sys.exit()
 69 | 	
 70 | 
 71 | 	fasta = "%s_I%sD%sL%s.trim.fa" % (opts.seqfile,opts.threshold,opts.diff,opts.consec)
 72 | 	log = "%s.log" % opts.seqfile
 73 | 
 74 | 
 75 |         try:
 76 |                 FASTA = open(fasta, 'w')
 77 |         except:
 78 |                 print "ERROR: Can not write to %s" % fasta
 79 |                 sys.exit()
 80 | 
 81 | 	try:
 82 | 		LOG = open(log, 'w')
 83 | 	except:
 84 | 		print "ERROR: Can not write to %s" % log
 85 | 		sys.exit()
 86 | 	
 87 | 
 88 | 	if opts.mer < 15 or opts.mer > 200:
 89 | 		print "ERROR: -l must be a number between 15 and 200."
 90 | 		sys.exit()
 91 | 	
 92 | 	if opts.consec < 16 or opts.consec > opts.mer:
 93 | 		print "ERROR: -c must be a number between 16 and -l."
 94 | 		sys.exit()
 95 | 
 96 | 	LOG.write("""
 97 | Running:
 98 | %s
 99 | -f %s
100 | -q %s
101 | -l %s
102 | -c %s
103 | -t %s
104 | -d %s
105 | Fasta file: %s
106 | 
107 | """ % (sys.argv[0:],opts.seqfile, opts.qualfile, opts.mer, opts.consec, opts.threshold, opts.diff, fasta))
108 | 	
109 | 	t0 = datetime.now()
110 | 	LOG.write("\nReading Quality File: %s\n" % str(t0)[:len('2006-10-05 23:04')])
111 | 	trim_info = parseQualFile(opts.threshold, opts.diff, opts.consec, opts.mer, qual, opts.verbose, LOG)
112 |         t1 = datetime.now()
113 |         LOG.write("\n\nTrimming low quality bases: %s\n" % str(t1)[:len('2006-10-05 23:04')])
114 | 	readNTrim(trim_info, seq, opts.verbose, FASTA, LOG)
115 |         LOG.write("DNA sequences have been trimmed accordingly and placed in %s" % fasta)
116 | 	
117 | 	LOG.close()
118 | 	FASTA.close()
119 | 	return	
120 | 
121 | #--------------------------------------------------------------------------------------
def parseQualFile(threshold, difference, consecutive, read_length, qual, verbose, LOG):
    """
    Parse the lines of a solexa-illumina intensity (prb) file.

    Each line holds tab-separated quartets of integers, one quartet per
    position. A position passes when its highest value is at least
    `threshold` and exceeds the second highest by at least `difference`.
    A read is kept when it contains a run of at least `consecutive`
    passing positions; matching takes at most `read_length` positions.

    Return a dictionary keyed by 1-based read number whose values are
    {'start': s, 'end': e}, the slice bounds of the first such run.
    A one-line summary is written to LOG.
    """
    trim_info = {}
    ok_read = 0
    read_number = 0

    # The pattern depends only on the arguments, so compile it once.
    head_match_regex = re.compile(r"-{%i,%i}" % (consecutive, read_length))

    if verbose:
        print("Printing trimming pattern for all reads passing the set threshold values...\n")

    for line in qual:
        read_number += 1    ### this keeps track of the read order, respected between the prb and seq files
        concat = ""         ### concat builds a string of bases passing the user-defined filter
        for quartet in line.split("\t"):    ### cycle through each quartet (4 numbers per position)
            quadint = sorted((int(basequal) for basequal in quartet.split()), reverse=True)
            if not quadint:
                # An empty field (e.g. after a trailing tab) carries no
                # position; skip it instead of failing on quadint[0].
                continue
            basediff = quadint[0] - quadint[1]

            if quadint[0] < threshold or basediff < difference:
                concat += "x"
            else:
                concat += "-"

        head_match = head_match_regex.search(concat)
        if head_match is not None:
            ok_read += 1
            start, end = head_match.span()
            trim_info[read_number] = {'start': start, 'end': end}

            if verbose:
                sub = concat[start:end]
                # The "length" field reports the run length, not the end offset.
                print("passed seqs:%i line#%i %s (start trim:%i,length:%i) %s\n" % (ok_read, read_number, concat, start, end - start, sub))

    LOG.write("%i out of %i sequences passed your filter (I >= %i and D >= %i and L >= %i)\n" % (ok_read, read_number, threshold, difference, consecutive))

    return trim_info
175 | 
176 | 
177 | #--------------------------------------------------------------------------------------
def readNTrim(trim_info, seq, verbose, FASTA, LOG):
    """
    Trim the reads of a solexa/illumina sequence file.

    `seq` holds the lines of the sequence file: lane, tile, x, y and the
    DNA string, separated by tabs. For every read number present in
    `trim_info` the DNA string is cut to [start:end]. Trimmed reads made
    only of A/C/G/T that do not begin with one of the hard-coded gDNA
    linker sequences are written to FASTA as ">lane-tile-x-y" records.
    A one-line summary is written to LOG.
    """
    read_number = 0
    gDNAlinker_count = 0
    usable_reads = 0

    dna_sequence_field = re.compile('^[ACTG]+$')
    gDNAlinker1_field = re.compile('^ATCCCC[GA]A')
    gDNAlinker2_field = re.compile('^ATCTAACAG')

    if verbose:
        print("Printing trimmed sequences for all reads passing the set threshold values minus, excluding sequence containing linkers...\n")

    for line in seq:
        read_number += 1            ### tracks read number / will match order in prb file
        if read_number not in trim_info:
            # Reads that did not pass the quality filter are not parsed at
            # all, so blank or short lines among them cannot raise.
            continue

        info = line.rstrip('\r\n').split("\t")     ### lane tile xcoord ycoord DNAseq
        dna_string = info[4]
        start = trim_info[read_number]['start']
        end = trim_info[read_number]['end']
        trim_seq = dna_string[start:end]

        if not dna_sequence_field.match(trim_seq):      ### ambiguous bases: drop
            continue
        if gDNAlinker1_field.match(trim_seq) or gDNAlinker2_field.match(trim_seq):   ### matches gDNA linker
            gDNAlinker_count += 1
            continue

        usable_reads += 1
        FASTA.write(">%s-%s-%s-%s\n%s\n" % (info[0], info[1], info[2], info[3], trim_seq))
        if verbose:
            # The "length" field reports the trimmed length, not the end offset.
            print("line#%i %s (start trim:%i,length:%i) %s" % (read_number, dna_string, start, end - start, trim_seq))

    LOG.write("%i out of %i sequences appear to be usable, after filtering out sequences hard-coded in this program * %i gDNA linker sequences*\n" % (usable_reads, read_number, gDNAlinker_count))
    return
214 | 
# Run the command-line tool when the file is executed as a script.
if __name__ == '__main__':
    main()
    sys.exit()
219 | 


--------------------------------------------------------------------------------
/tools/TQS.readme:
--------------------------------------------------------------------------------
 1 | 
 2 | TQS
 3 | 
 4 | Trim Quality Solexa-Illumina Sequences (TQS)
 5 | 
 6 | SYNOPSIS
   Quality trim solexa-Illumina sequence reads using user-defined thresholds

Author:  Rene L. Warren
Version: 1.0
11 | 
12 | #LICENSE
13 | #   Copyright (c) 2007 Canada's Michael Smith Genome Science Centre.  All rights reserved.
14 | 
15 | #   This program is free software; you can redistribute it and/or
16 | #   modify it under the terms of the GNU General Public License
17 | #   as published by the Free Software Foundation; either version 2
18 | #   of the License, or (at your option) any later version.
19 | 
20 | #   This program is distributed in the hope that it will be useful,
21 | #   but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23 | #   GNU General Public License for more details.
24 | 
25 | Execution example
26 | ==================
27 | python TQS.py -f test_seq.txt -q test_prb.txt -l 36 -t 5 -d 5 -c 20
28 | 
29 | 
30 | Options
31 | =======
32 | python TQS.py --help
33 | 
34 | Usage: TQS.py [options]
35 | 
36 | Options:
37 |   -h, --help            show this help message and exit
38 |   -f SEQFILE, --sequence file=SEQFILE
39 |                         Illumina sequence file - Output format from the 1G
40 |                         Genome Analyzer (_seq.txt):
41 |                         7       1       255     669
42 |                         AACCCCCACTCCTACAACGCCATCATTCCCCTCGAC
43 |   -q QUALFILE, --qual file=QUALFILE
44 |                         A prb file containing all the Illumina intensities, as
45 |                         outputted by the 1G Genome Analyzer (_prb.txt)
46 |   -l MER, --length=MER  Length of sequence reads (i.e. Number of sequencing
47 |                         cycles, default=36)
48 |   -t THRESHOLD, --threshold=THRESHOLD
49 |                         Base intensity threshold value (-40 to 40, default=5)
50 |   -d DIFF, --difference=DIFF
51 |                         Base intensity difference between top intensity and
52 |                         second best (1 to 80, default=5)
53 |   -c CONSEC, --consec=CONSEC
54 |                         Minimum number of consecutive bases passing threshold
55 |                         values (default=20)
56 |   -v, --verbose         Runs in Verbose mode.
57 | 
58 | 
59 | Output
60 | ======
61 | 
62 | .log file: Indicates the option chosen and tracks the execution time
63 | .fa file: A single fasta file containing the sequence reads that passed the filter specified
64 | 


--------------------------------------------------------------------------------
/tools/TQSexport.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | __doc__ = """
  4 | TQS
  5 | 
  6 | Trim Quality Solexa Sequences (TQS)
  7 | 
  8 | SYNOPSIS
  9 |    Quality trim solexa-Illumina sequence reads using user-defined thresholds 
 10 | """
 11 | __author__ = "Rene L. Warren"
 12 | __version__ = '1.0'
 13 | 
 14 | #LICENSE
 15 | #   Copyright (c) 2007 Canada's Michael Smith Genome Science Centre.  All rights reserved.
 16 | 
 17 | #   This program is free software; you can redistribute it and/or
 18 | #   modify it under the terms of the GNU General Public License
 19 | #   as published by the Free Software Foundation; either version 2
 20 | #   of the License, or (at your option) any later version.
 21 | 
 22 | #   This program is distributed in the hope that it will be useful,
 23 | #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 24 | #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 25 | #   GNU General Public License for more details.
 26 | 
 27 | import sys, os, re, string, math
 28 | from datetime import datetime
 29 | from optparse import OptionParser
 30 | 
 31 | 
 32 | def main():
 33 | 	usage = "Usage: %s --help"
 34 | 
 35 | 	parser = OptionParser()
 36 | 	parser.add_option("-f", "--export file", dest="exportfile",
 37 | 	                  help="Illumina export file - Output format from the Genome Analyzer",)
 38 |         parser.add_option("-t", "--Phred quality threshold", dest="threshold", type="int", default=10,
 39 |                           help="Base intensity threshold value (Phred quality scores 0 to 40, default=10)",)
 40 |         parser.add_option("-c", "--consec", dest="consec", type="int", default=20,
 41 |                           help="Minimum number of consecutive bases passing threshold values (default=20)",)
 42 | 	parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
 43 | 	                  help="Runs in Verbose mode.",)
 44 | 	(opts, args) = parser.parse_args()
 45 | 	
 46 | 	try:
 47 | 		f = open(opts.exportfile)
 48 | 		seq = f.readlines()
 49 | 		f.close()
 50 | 	except Exception, e:
 51 | 		print "ERROR: Could not read from %s: %s" % (opts.exportfile, e)
 52 | 		print usage % (sys.argv[0:])
 53 | 		sys.exit()
 54 | 
 55 | 
 56 | 	fasta = "%s_T%sC%s.trim.fa" % (opts.exportfile,opts.threshold,opts.consec)
 57 | 	log = "%s.log" % opts.exportfile
 58 |         minimum_length = 15
 59 | 
 60 | 
 61 |         try:
 62 |                 FASTA = open(fasta, 'w')
 63 |         except:
 64 |                 print "ERROR: Can not write to %s" % fasta
 65 |                 sys.exit()
 66 | 
 67 | 	try:
 68 | 		LOG = open(log, 'w')
 69 | 	except:
 70 | 		print "ERROR: Can not write to %s" % log
 71 | 		sys.exit()
 72 | 	
 73 | 	if opts.consec < minimum_length:
 74 | 		print "ERROR: -c must be a number larger than %i." % (minimum_length)
 75 | 		sys.exit()
 76 | 
 77 | 	LOG.write("""
 78 | Running:
 79 | %s
 80 | -f %s
 81 | -c %s
 82 | -t %s
 83 | Fasta file: %s
 84 | 
 85 | """ % (sys.argv[0:],opts.exportfile, opts.consec, opts.threshold, fasta))
 86 | 	
 87 |         t1 = datetime.now()
 88 |         LOG.write("\n\nTrimming low quality bases: %s\n" % str(t1)[:len('2006-10-05 23:04')])
 89 | 	readNtrim(seq, opts.threshold, opts.consec, opts.verbose, FASTA, LOG)
 90 |         LOG.write("DNA sequences have been trimmed accordingly and placed in %s" % fasta)
 91 | 	
 92 | 	LOG.close()
 93 | 	FASTA.close()
 94 | 	return	
 95 | 
 96 | #--------------------------------------------------------------------------------------
 97 | def readNtrim(export, threshold, consecutive, verbose, FASTA, LOG):
 98 | 	"""
 99 | 	Parse a solexa-illumina export file
100 | 	SOLEXA3_77_30V9CAAXX		4	1	1068	522		1	GGACAGCTGACAGCTGTTAAGAAGGACCCTATGTTAAAGGAAATGGATAC	YYYYYYYYYYYJYY
101 | YYYYRYYYYYYYYYYYTTTTTOOOMOOOMMOOOOOG	chr13		36311743	F	50	52	121			187	R	N
102 | 	Return a Dictionary of sequence order number, with the index value and length to extract 
103 | 	"""
104 | 	trim_info = {}
105 | 	ok_read = 0
106 | 	read_number = 0
107 | 
108 | 	if verbose:
109 | 		print "Printing trimming pattern for all reads passing the set threshold values...\n"
110 | 	
111 | 	for line in export:
112 | 		read_number += 1
113 | 		concat = ""			### concat builds a string of bases passing the user-defined filter 
114 | 		info = line.split() 	        ### split info 
115 | 		illumina_encoded_qual = list(info[7])
116 | 		"""
117 | 		print "line%s\tseq:%s\tqual:%s\n" % (line,info[6],info[7])
118 | 		"""
119 | 		pos = 0
120 | 		for illumina_qual in illumina_encoded_qual:
121 | 			pos += 1
122 | 			Q = 10 * math.log(1 + 10 ** ((ord(illumina_qual) - 64) / 10.0)) / math.log(10)
123 | 			if Q < threshold:
124 | 				concat += "x"
125 | 			else:
126 | 				concat += "-"
127 | 			"""
128 | 			print "base#%i. Illumina qual (%s) == phredQ (%i)\n" % (pos,illumina_qual,Q)
129 | 			"""
130 | 
131 | 		seq_len = len(info[6])
132 |   		head_match_regex = re.compile("\-{%i,%i}" % (consecutive, seq_len)) 
133 | 		head_match = head_match_regex.search(concat)
134 |  		if head_match != None:
135 | 			ok_read += 1
136 | 			col = head_match.span()
137 |                         if not trim_info.has_key(read_number):
138 |                                 trim_info[read_number] = {}
139 | 
140 | 			start = int(col[0])	
141 | 			end = int(col[1])
142 | 
143 | 			pair = ""
144 | 			if info[5] == "1":
145 | 				pair = "a"
146 | 			elif info[5] == "2":
147 | 				pair = "b"
148 | 
149 |                         trim_seq = info[6][start:end]
150 |                         FASTA.write(">%s-%s-%s-%s%s\n%s\n" % (info[1],info[2],info[3],info[4],pair,trim_seq))
151 | 
152 | 			if verbose:
153 | 				print "passed seqs:%i line#%i %s (start trim:%i,end trim:%i) %s\n" % (ok_read, read_number, concat, start, end, trim_seq)
154 | 
155 | 	LOG.write("%i out of %i sequences passed your filter (-t >= %i and -c >= %i)\n" % (ok_read, read_number, threshold, consecutive))
156 | 
157 | 	return
158 | 
159 | 
160 | 
161 | if __name__ == '__main__':
162 | 	main()
163 | 	import time
164 | 	sys.exit()
165 | 


--------------------------------------------------------------------------------
/tools/TQSfastq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | __doc__ = """
  4 | TQS
  5 | 
  6 | Trim Quality Sequences (TQS)
  7 | 
  8 | SYNOPSIS
  9 |    Quality trim FASTQ sequence reads using user-defined thresholds 
 10 | """
 11 | __author__ = "Rene L. Warren"
 12 | __version__ = 'fastq'
 13 | 
 14 | #LICENSE
 15 | #   Copyright (c) 2007 Canada's Michael Smith Genome Science Centre.  All rights reserved.
 16 | 
 17 | #   This program is free software; you can redistribute it and/or
 18 | #   modify it under the terms of the GNU General Public License
 19 | #   as published by the Free Software Foundation; either version 2
 20 | #   of the License, or (at your option) any later version.
 21 | 
 22 | #   This program is distributed in the hope that it will be useful,
 23 | #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 24 | #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 25 | #   GNU General Public License for more details.
 26 | 
 27 | #   Modified by Lance Parsons at Princeton University's Lewis-Sigler Institute for Integrative Genomics
 28 | #   Adapted to trim "standard" FASTQ files (PHRED+33)
 29 | 
 30 | import sys, os, re, string, math
 31 | from datetime import datetime
 32 | from optparse import OptionParser
 33 | 
 34 | 
 35 | def main():
 36 | 	usage = "Usage: %s --help"
 37 | 
 38 | 	parser = OptionParser()
 39 | 	parser.add_option("-f", "--fastq file", dest="fastqfile",
 40 | 	                  help="fastq (fq) file - standard (ASCII+33) encoded PHRED quality scores / illumina (ASCII+64) encoded PHRED quality scores",)
 41 |         parser.add_option("-t", "--Phred quality threshold", dest="threshold", type="int", default=10,
 42 |                           help="Base intensity threshold value (Phred quality scores 0 to 40, default=10)",)
 43 |         parser.add_option("-c", "--consec", dest="consec", type="int", default=20,
 44 |                           help="Minimum number of consecutive bases passing threshold values (default=20)",)
 45 |         parser.add_option("-e", "--ASCII encoding type: 33 or 64", dest="encoding", type="int", default=64,
 46 |                           help="Type of ASCII encoding: 33 (standard) or 64 (illumina)  (default=64)",)
 47 | 	parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
 48 | 	                  help="Runs in Verbose mode.",)
 49 | 	(opts, args) = parser.parse_args()
 50 | 	
 51 | 	try:
 52 | 		f = open(opts.fastqfile)
 53 | 		seq = f.readlines()
 54 | 		f.close()
 55 | 	except Exception, e:
 56 | 		print "ERROR: Could not read from %s: %s" % (opts.fastqfile, e)
 57 | 		print usage % (sys.argv[0:])
 58 | 		sys.exit()
 59 | 
 60 | 
 61 | 	fasta = "%s_T%sC%sE%s.trim.fa" % (opts.fastqfile, opts.threshold, opts.consec, opts.encoding)
 62 | 	log = "%s.log" % opts.fastqfile
 63 |         minimum_length = 15
 64 | 
 65 | 
 66 |         try:
 67 |                 FASTA = open(fasta, 'w')
 68 |         except:
 69 |                 print "ERROR: Can not write to %s" % fasta
 70 |                 sys.exit()
 71 | 
 72 | 	try:
 73 | 		LOG = open(log, 'w')
 74 | 	except:
 75 | 		print "ERROR: Can not write to %s" % log
 76 | 		sys.exit()
 77 | 	
 78 | 	if opts.consec < minimum_length:
 79 | 		print "ERROR: -c must be a number larger than %i." % (minimum_length)
 80 | 		sys.exit()
 81 | 
 82 |         if opts.encoding != 33 and opts.encoding != 64:
 83 |                 print "ERROR: -e must be either 33 or 64."
 84 |                 sys.exit()
 85 | 
 86 | 	LOG.write("""
 87 | Running:
 88 | %s
 89 | -f %s
 90 | -c %s
 91 | -t %s
 92 | -e %s
 93 | Fasta file: %s
 94 | 
 95 | """ % (sys.argv[0:], opts.fastqfile, opts.consec, opts.threshold, opts.encoding, fasta))
 96 | 	
 97 |         t1 = datetime.now()
 98 |         LOG.write("\n\nTrimming low quality bases: %s\n" % str(t1)[:len('2006-10-05 23:04')])
 99 | 	readNtrim(seq, opts.threshold, opts.consec, opts.encoding, opts.verbose, FASTA, LOG)
100 |         LOG.write("DNA sequences have been trimmed accordingly and placed in %s" % fasta)
101 | 	
102 | 	LOG.close()
103 | 	FASTA.close()
104 | 	return	
105 | 
106 | #--------------------------------------------------------------------------------------
107 | def readNtrim(fastq, threshold, consecutive, encoding, verbose, FASTA, LOG):
108 | 	"""
109 | 	Return a Dictionary of sequence order number, with the index value and length to extract 
110 | 	"""
111 | 	trim_info = {}
112 | 	ok_read = 0
113 | 	read_number = 0
114 | 	record_line = 0
115 | 	
116 | 	if verbose:
117 | 		print "Printing trimming pattern for all reads passing the set threshold values...\n"
118 | 	
119 | 	for line in fastq:
120 | 		record_line += 1
121 | 		if record_line == 1:
122 | 			read_id = line.strip()
123 | 		elif record_line == 2:
124 | 			seq = line.strip()
125 | 		elif record_line == 3:
126 | 			qual_id = line.strip()
127 | 		elif record_line == 4:
128 | 			record_line = 0
129 | 			qual = line.strip()
130 | 			read_number += 1
131 | 			concat = ""			### concat builds a string of bases passing the user-defined filter 
132 | 			"""
133 | 			print "line%s\tseq:%s\tqual:%s\n" % (line,info[6],info[7])
134 | 			"""
135 | 			pos = 0
136 | 			for qual_char in qual:
137 | 				Q = (ord(qual_char) - encoding)
138 | 				pos += 1
139 | 				if Q < threshold:
140 | 					concat += "x"
141 | 				else:
142 | 					concat += "-"
143 | 				"""
144 | 				print "base#%i. Illumina qual (%s) == phredQ (%i)\n" % (pos,illumina_qual,Q)
145 | 				"""
146 | 	
147 | 			seq_len = len(seq)
148 | 	  		head_match_regex = re.compile("\-{%i,%i}" % (consecutive, seq_len)) 
149 | 			head_match = head_match_regex.search(concat)
150 | 	 		if head_match != None:
151 | 				ok_read += 1
152 | 				col = head_match.span()
153 | 	                        if not trim_info.has_key(read_number):
154 | 	                                trim_info[read_number] = {}
155 | 	
156 | 				start = int(col[0])	
157 | 				end = int(col[1])
158 | 	
159 | 	                        trim_seq = seq[start:end]
160 | 	                        FASTA.write(">%s\n%s\n" % (read_id, trim_seq))
161 | 	
162 | 				if verbose:
163 | 					print "%s\n%s\n%s\n passed seqs:%i line#%i %s (start trim:%i,end trim:%i) %s\n" % (read_id,seq,qual,ok_read, read_number, concat, start, end, trim_seq)
164 | 
165 | 	LOG.write("%i out of %i sequences passed your filter (-t >= %i and -c >= %i)\n" % (ok_read, read_number, threshold, consecutive))
166 | 
167 | 	return
168 | 
169 | 
170 | 
171 | if __name__ == '__main__':
172 | 	main()
173 | 	import time
174 | 	sys.exit()
175 | 


--------------------------------------------------------------------------------
/tools/TRIMMING_PAIRED_READS.README:
--------------------------------------------------------------------------------
 1 | December 2008/February 2010
 2 | Rene Warren
 3 | rwarren at bcgsc dot ca
 4 | warrenlr at gmail dot com
 5 | 
 6 | #--------------------------------
 7 | To trim reads using fastq as input, run TQSfastq.py on both PE files:
 8 | *Make sure you know whether the quality scores in your fastq files are encoded ASCII+33 (standard) or ASCII+64 (illumina)
 9 | 
10 | for options run:
11 | ./TQSfastq.py --help
12 | 
13 | Usage: TQSfastq.py [options]
14 | 
15 | Options:
16 |   -h, --help            show this help message and exit
17 |   -f FASTQFILE, --fastq file=FASTQFILE
18 |                         fastq (fq) file - standard (ASCII+33) encoded PHRED
19 |                         quality scores / illumina (ASCII+64) encoded PHRED
20 |                         quality scores
21 |   -t THRESHOLD, --Phred quality threshold=THRESHOLD
22 |                         Base intensity threshold value (Phred quality scores 0
23 |                         to 40, default=10)
24 |   -c CONSEC, --consec=CONSEC
25 |                         Minimum number of consecutive bases passing threshold
26 |                         values (default=20)
27 |   -e ENCODING, --ASCII encoding type: 33 or 64=ENCODING
28 |                         Type of ASCII encoding: 33 (standard) or 64 (illumina)
29 |                         (default=64)
30 |   -v, --verbose         Runs in Verbose mode.
31 | 
32 | 
33 | e.g. 
34 | ./qseq2fastq.pl s_3_1_0048_qseq.txt > s_3_1_0048_qseq.txt.fq
35 | ./qseq2fastq.pl s_3_2_0048_qseq.txt > s_3_2_0048_qseq.txt.fq
36 | ./TQSfastq.py -f s_3_1_0048_qseq.txt.fq -t 20 -c 36 -e 64
37 | ./TQSfastq.py -f s_3_2_0048_qseq.txt.fq -t 20 -c 36 -e 64
38 | 
39 | To join both for SSAKE's paired-end input, run:
40 | ./makePairedOutput2UNEQUALfiles.pl s_3_1_0048_qseq.txt.fq_T20C36E64.trim.fa s_3_2_0048_qseq.txt.fq_T20C36E64.trim.fa
41 | 
42 | This will create 2 files: paired.fa and unpaired.fa
43 | Run SSAKE: SSAKE -f paired.fa -g unpaired.fa -p 1
44 | 
45 | #--------------------------------
46 | *For those not interested in trimming their reads, but interested in joining 2 equal-record fasta files:
47 | 
48 | ./qseq2fasta.pl s_3_1_0048_qseq.txt > file1.fa
49 | ./qseq2fasta.pl s_3_2_0048_qseq.txt > file2.fa
50 | ./makePairedOutput2EQUALfiles.pl file1.fa file2.fa
51 | 
52 | #--------------------------------
53 | Many of you asked me whether you could trim paired-end (PE) Illumina reads with TQS.py
54 | The answer is yes. However, I never got a chance to write a formal script for PE reads, so I hacked together
55 | a work-around for trimming paired reads:
56 | 
57 | 
58 | 
59 | 1. run splitInput.pl (supplied in the ./tools directory) where all *_seq.txt and *_prb.txt are located:
60 | 
61 | Usage: ./splitInput.pl <directory where all *_seq and *_prb are> <# Illumina cycles (read length)>
62 | 
63 | 
64 | 2. run this perl one-liner to make a shell script:
65 | 
66 | ls -la | perl -ne 'if(/(s_\d+_\d+)_seq.txt.new/){print "TQS.py -f $1_seq.txt.new -q $1_prb.txt.new -l * -t * -d * -c *\n"}' > runTQS.sh
67 | *replace stars with your values
68 | 
69 | 
70 | 3. Run the shell script that the above command created. It runs TQS.py on ALL tiles sequentially (why not farm the jobs out on a compute cluster?!)
71 | 
72 |  chmod 755 runTQS.sh
73 |  ./runTQS.sh
74 | 
75 | 
76 | 4. Concatenate all trimmed reads
77 | 
78 |   cat s*.trim.fa > all_raw.fa
79 | 
80 |  
81 | 5. Make the paired output (that will become the input for ssake3.2.1 -f)
82 | using the 2nd script supplied in this directory.
83 | 
84 | ./makePairedOutput.pl all_raw.fa
85 | 
86 | 
87 | As always, feel free to contact me if you have any questions.
88 | Rene
89 | 


--------------------------------------------------------------------------------
/tools/estimate_insert_size.pl:
--------------------------------------------------------------------------------
  1 | ###################################################################################################################
  2 | #Marten Boetzer BaseClear B.v. 14-07-2011                                                                         #
  3 | #SSPACE perl subscript estimate_insert_size.pl                                                                    #
  4 | #This script;                                                                                                     #
  5 | #  -Estimates median insert size by mapping paired-reads on contigs                                               #
  6 | #  It goes through each contig and maps both reads, if a pair is mapped,                                          #
  7 | #  the orientation and insert size is estimated.                                                                  #
  8 | #  If sufficient pairs (given by the user) are found, the median insert size is                                   #
  9 | #  estimated, as well as a file with the distribution is generated which can be                                   #
 10 | #  used to visualize the insert size distribution.                                                                #
 11 | #                                                                                                                 #
 12 | #  To run this script;                                                                                            #
 13 | #  perl estimate_insert_size.pl <contigfile> <readfile1> <readfile2> <number_of_pairs> <orientation_of_pairs>     #
 14 |                                                                                                                   #
 15 | #  Output is the median insert size and a file with distribution of the insert size. Also, number of pairs for    #
 16 | #  each found orientation (FR, RF, FF and RR) are given.                                                          #
 17 | ###################################################################################################################
 18 | 
 19 | use File::Path;
 20 | use strict;
 21 | my $contigfile = $ARGV[0];
 22 | my $fileA = $ARGV[1];
 23 | my $fileB = $ARGV[2];
 24 | my $numpairs = $ARGV[3];      # optional; 10000 when omitted
 25 | my $orientation = $ARGV[4];   # optional; FR when omitted
 26 | 
 27 | die "ERROR: Can't find contig file: $contigfile -- fatal\n" if(! -e $contigfile);
 28 | die "ERROR: Can't find read file: $fileA -- fatal\n" if(! -e $fileA);
 29 | die "ERROR: Can't find read file: $fileB -- fatal\n" if(! -e $fileB);
 30 | if($numpairs eq ''){
 31 |   print "WARNING: No number of pairs are given, using 10000 pairs instead\n";
 32 |   $numpairs = 10000;
 33 | }
 34 | if($orientation eq ''){
 35 |   print "WARNING: No orientation of the pairs is given, using orientation FR instead\n";
 36 |   $orientation = "FR";
 37 | }
 38 | die "ERROR: You've inserted $numpairs, which does not seem to be an valid number. Exiting.\n" if(!($numpairs>0) || !($numpairs =~ /^\d+$/));
 39 | die "ERROR: Orientation must have length of 2 characters and should contain one of the following; FR, FF, RR or RF. You've inserted orientation of $orientation ...Exiting.\n" if(!(length($orientation) == 2) || !($orientation =~ /[FR][FR]/));
 40 | 
 41 | print "\n";
 42 | my $paircount = 0;
 43 | my ($direction, $insertsize);  # hash refs filled in by mapWithBowtie: orientation => count, insert size => count
 44 | mkpath('bowtieoutput');
 45 | open (CONT, $contigfile) || die "Can't open contig file $contigfile\n";
 46 | 
 47 | my ($seq,$name, $maxctg, $maxseq, $maxname)=("","",0,"","");
 48 | my $contignum = 0;
 49 | CONTIG:
 50 | while (<CONT>) {
 51 |   chomp;
 52 |   $seq.=$_ if(eof(CONT));
 53 |   if (/\>(\S+)/ || eof(CONT)){
 54 |     if($seq ne ""){
 55 |        $contignum++;
 56 |        if(length($seq) > $maxctg){
 57 |          $maxctg = length($seq);
 58 |          $maxseq = $seq;
 59 |          $maxname = $name;
 60 |        }
 61 |        if(eof(CONT)){
 62 |          # the whole file has been read: map the reads onto the longest contig only
 63 |          $seq = $maxseq;
 64 |          $name = $maxname;
 65 |          print "now at contig $name = size".length($seq)."\n";
 66 |          open (BOWCONT, ">bowtieoutput/bowtie_input.fa") || die "Can't write to bowtieoutput/bowtie_input.fa -- fatal\n";
 67 |          print BOWCONT ">$name\n$seq\n";
 68 |          close BOWCONT;
 69 |          ($paircount) = &mapWithBowtie($contignum,"bowtieoutput/bowtie_input.fa", $fileA, $fileB);
 70 |          last CONTIG if($paircount>=$numpairs);
 71 |        }
 72 |        $name = "";
 73 |        $seq = "";
 74 |     }
 75 |     $name = $1;
 76 |   }
 77 |   else {
 78 |      $seq .= $_;
 79 |   }
 80 | }
 81 | foreach my $d (keys %$direction){
 82 |   print "direction $d is found $direction->{$d} times\n";
 83 | }
 84 | # The median is taken over the recorded insert sizes, i.e. only pairs in the requested
 85 | # orientation; $paircount also counts pairs in other orientations and would shift it.
 86 | my ($median_ins,$record,$recorded) = (0,0,0);
 87 | $recorded += $insertsize->{$_} foreach (keys %$insertsize);
 88 | my $median_bin = int($recorded/2);
 89 | open (CSV, ">distribution.txt") || die "Can't open distribution.txt for writing -- fatal";
 90 | foreach my $is (sort {$a<=>$b} keys %$insertsize){
 91 |   for(my $i=0;$i<$insertsize->{$is};$i++){
 92 |     $record++;
 93 |     $median_ins = $is if($record >= $median_bin && $median_ins == 0);
 94 |   }
 95 |   print CSV "$is\t$insertsize->{$is}\n";
 96 | }
 97 | close CSV;
 98 | print "\nmedian = $median_ins\n\nSee the distribution in file 'distribution.txt'\n";
 99 | 
100 | 
101 | sub mapWithBowtie{
102 |   my ($fname,$contig, $fileA, $fileB) = @_;
103 |   my $bowtieout = "contig$fname.bowtieIndex";
104 |   system("bowtie-build $contig bowtieoutput/$bowtieout --quiet --noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values
105 | 
106 |   # With the index for $contig built, the read pairs of $fileA/$fileB are mapped in batches and tallied
107 |   # in the file-level $paircount, $direction and $insertsize. Returns $paircount.
108 |   # A first line starting with '@' means the read files are FASTQ (4 lines per read), otherwise 2 lines per read.
109 |   open(TEST, "< $fileA");
110 |   my $firstline = <TEST>;
111 |   close TEST;
112 |   my $fastq = ($firstline =~ /^[@]/) ? 1 : 0;
113 | 
114 |   open(FILEA, "< $fileA");
115 |   open(FILEB, "< $fileB");
116 |   my ($count, $batch) = (0, 0);   # pairs written in total / since the last bowtie run
117 |   open (BOWIN, ">bowtieoutput/bowtiein.$fname.fa") || die "Can't write to single file bowtieoutput/bowtiein.$fname.fa-- fatal\n";
118 |   while(<FILEA>) {
119 |     <FILEB>;
120 |     $count++; $batch++;
121 |     my $seq1 = <FILEA>;
122 |     chomp $seq1;
123 |     my $seq2 = <FILEB>;
124 |     chomp $seq2;
125 |     #FASTQ FORMAT: also skip the separator and quality lines
126 |     <FILEA>,<FILEA>,<FILEB>,<FILEB> if ($fastq);
127 |     print BOWIN ">read$count\n$seq1\n>read$count\n$seq2\n";  # both mates share one name; each sequence ends its own line
128 |     if($batch > $numpairs || eof(FILEA)){  # batch full, or last reads of the file: map what has been written
129 |       close BOWIN;
130 |       open(IN, "bowtie -p 1 -v 0 -m 1 bowtieoutput/$bowtieout --suppress 6,7 -f bowtieoutput/bowtiein.$fname.fa --quiet|") || die "Can't open bowtie output -- fatal\n";
131 |       my ($prevread, $prevline);
132 |       while(my $line = <IN>){
133 |         my @t1 = split(/\t/,$line);
134 |         if($prevread eq $t1[0]){  # two consecutive hits with the same read name are the mates of one pair
135 |           $paircount++;
136 |           my @t2 = split(/\t/,$prevline);
137 |           my ($start1, $start2, $end1,$end2);
138 |           # start and end are swapped for '-' hits, so start > end marks a reverse read
139 |           if($t1[1] eq "+"){
140 |             $end1 = $t1[3] + length($t1[4]);
141 |             $start1 = $t1[3];
142 |           }
143 |           else{
144 |             $start1 = $t1[3] + length($t1[4]);
145 |             $end1 = $t1[3];
146 |           }
147 |           if($t2[1] eq "+"){
148 |             $end2 = $t2[3] + length($t2[4]);
149 |             $start2 = $t2[3];
150 |           }
151 |           else{
152 |             $start2 = $t2[3] + length($t2[4]);
153 |             $end2 = $t2[3];
154 |           }
155 |           my ($dir1, $dir2);
156 |           $dir1 = "F" if($start1 < $end1);
157 |           $dir1 = "R" if($start1 > $end1);
158 |           $dir2 = "F" if($start2 < $end2);
159 |           $dir2 = "R" if($start2 > $end2);
160 |           $direction->{"$dir1$dir2"}++ if($start1 < $start2);
161 |           $direction->{"$dir2$dir1"}++ if($start2 < $start1);
162 |           my $diff = abs($start2-$start1);
163 |           if($orientation eq "$dir1$dir2" || $orientation eq "$dir2$dir1"){
164 |             $insertsize->{$diff}++;
165 |           }
166 |           return $paircount if($paircount >= $numpairs);
167 |         }
168 |         $prevread = $t1[0];
169 |         $prevline = $line;
170 |      }
171 |       close IN;
172 |       $batch = 0;  # next batch starts empty; it was reopened read-only before, so the same reads were mapped again
173 |       open (BOWIN, ">bowtieoutput/bowtiein.$fname.fa") || die "Can't write to single file bowtieoutput/bowtiein.$fname.fa-- fatal\n";
174 |     }
175 |   }
176 |   print "count = $paircount\n";
177 |   return $paircount;
178 | }
179 | 
180 | ###PRINTS A COUNTER ON THE SCREEN; THE LEADING "\r" OVERWRITES THE PREVIOUS LINE
181 | sub CounterPrint{
182 |   my ($message) = @_;
183 |   print "\r$message";
184 |   $| = 1;  # autoflush the selected handle from now on
185 | }
186 | 


--------------------------------------------------------------------------------
/tools/fq_all2std.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/perl -w
  2 | 
  3 | # Author: lh3
  4 | 
  5 | use strict;
  6 | use warnings;
  7 | use Getopt::Std;
  8 | use FindBin qw($Bin);
  9 | 
 10 | my $usage = qq(
 11 | Usage:   fq_all2std.pl <command> <in.txt> <lane_qseq (only for SRF conversions)>
 12 | 
 13 | Command: scarf2std      Convert SCARF format to the standard/Sanger FASTQ
 14 |          fqint2std      Convert FASTQ-int format to the standard/Sanger FASTQ
 15 |          sol2std        Convert Solexa/Illumina FASTQ to the standard FASTQ
 16 |          fa2std         Convert FASTA to the standard FASTQ
 17 |          fq2fa          Convert various FASTQ-like format to FASTA
 18 |          sol2scarf      Convert Solexa/Illumina FASTQ to the SCARF format
 19 |          qseq2srf       Convert Solexa/Illumina qseq format to the SRF format
 20 |          qseqin2srf     Convert Solexa/Illumina qseq + intensity/noise format to the SRF format
 21 |          instruction    Explanation to different format
 22 |          example        Show examples of various formats
 23 | 
 24 | Note:    Read/quality sequences MUST be presented in one line.
 25 | \n);
 26 | 
 27 | @ARGV >= 1 or die($usage);
 28 | 
 29 | # Solexa->Sanger quality conversion table: entry [q+64] is the Sanger
 30 | # (offset 33) character for Solexa quality q, for q from -64 to 64
 31 | my @conv_table = map {
 32 |   chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499))
 33 | } (-64..64);
 34 | 
 35 | # parsing command line: the first argument selects the converter
 36 | my $cmd = shift;
 37 | my %cmd_hash = (scarf2std=>\&scarf2std, fqint2std=>\&fqint2std, sol2std=>\&sol2std, fa2std=>\&fa2std,
 38 | 				sol2scarf=>\&sol2scarf, fq2fa=>\&fq2fa, qseq2srf=>\&qseq2srf,
 39 |                                 qseqin2srf=>\&qseqin2srf, example=>\&example, instruction=>\&instruction);
 40 | my $handler = $cmd_hash{$cmd};
 41 | if (defined($handler)) {
 42 |   # the two SRF converters are first called with the second remaining argument
 43 |   qseq2srf($ARGV[1])   if ($cmd eq 'qseq2srf');
 44 |   qseqin2srf($ARGV[1]) if ($cmd eq 'qseqin2srf');
 45 | 
 46 |   # every command, the SRF ones included, then runs through the dispatch table
 47 |   $handler->();
 48 | } else {
 49 |   die("** Unrecognized command $cmd");
 50 | }
 51 | 
 52 | sub fa2std {  # FASTA -> standard FASTQ, giving every base the same quality
 53 |   my %opts = (q=>25);
 54 |   getopts('q:', \%opts);
 55 |   my $q = chr($opts{q} + 33);  # quality character at offset 33
 56 |   warn("-- The default quality is set to $opts{q}. Use '-q' at the command line to change the default.\n");
 57 |   while (<>) {
 58 | 	if (/^>(\S+)/) {
 59 | 	  print "\@$1\n";
 60 | 	  $_ = <>; chomp;  # count bases only, so a last line without a newline still gets one quality per base
 61 | 	  print "$_\n+\n", $q x length($_), "\n";
 62 | 	}
 63 |   }
 64 | }
 65 | 
 66 | sub fq2fa {
 67 |   # FASTQ -> FASTA: keep the name and the sequence line of every record
 68 |   while (<>) {
 69 |     next unless (/^@(\S+)/);
 70 |     print ">$1\n";
 71 |     $_ = <>; print;   # sequence line, copied through unchanged
 72 |     <>; <>;           # separator and quality lines are dropped
 73 |   }
 74 | }
 75 | 
 76 | sub scarf2std {
 77 |   # SCARF -> standard FASTQ. On every input line ':' separates five name fields,
 78 |   # the read, and a whitespace-separated list of integer qualities.
 79 |   while (<>) {
 80 |     my @field = split(':', $_);
 81 |     print '@' . join('_', @field[0..4]) . "\n$field[5]\n+\n";
 82 |     my @score = split(/\s/, $field[6]);
 83 |     # each integer quality q is looked up at index q+64 of the conversion table
 84 |     print join('', map { $conv_table[$_+64] } @score) . "\n";
 85 |   }
 86 | }
 87 | 
 88 | sub fqint2std {
 89 |   # FASTQ-int -> standard FASTQ: the fourth line of a record holds integer
 90 |   # qualities, each looked up at index q+64 of the conversion table
 91 |   while (<>) {
 92 |     next unless (/^@/);
 93 |     print;                      # name line
 94 |     $_ = <>; print;             # sequence line
 95 |     <>;                         # separator line, replaced by a bare "+" below
 96 |     my @score = split(' ', scalar(<>));
 97 |     print "+\n" . join('', map { $conv_table[$_+64] } @score) . "\n";
 98 |   }
 99 | }
100 | 
101 | sub sol2std {
102 |   # Solexa/Illumina FASTQ -> standard FASTQ; quality characters are remapped through @conv_table
103 |   while (<>) {
104 | 	if (/^@/) {
105 | 	  print;
106 | 	  $_ = <>; print; $_ = <>; $_ = <>;
107 | 	  s/[\r\n]+$// if defined;  # the line terminator is not a quality: converting it appended a bogus '!'
108 | 	  my $qual = '';
109 | 	  $qual .= $conv_table[ord($_)] for (split('', $_));
110 | 	  print "+\n$qual\n";
111 | 	}
112 |   }
113 | }
114 | 
115 | sub sol2scarf {
116 |   # Solexa/Illumina FASTQ -> SCARF, one "name:sequence:qualities" line per record.
117 |   # Records are read as fixed groups of four lines (name, sequence, separator,
118 |   # qualities). Classifying every line by its first character, as was done before,
119 |   # mistakes a quality line that begins with '@' for the header of a new record.
120 |   my $counter = 0;
121 |   while (my $name = <>) {
122 |     next unless ($name =~ /^@/);  # skip anything in front of a record header
123 |     my $seq  = <>;
124 |     <>;                           # separator line
125 |     my $qual = <>;
126 |     foreach ($name, $seq, $qual) {
127 |       $_ = '' unless defined;     # truncated final record
128 |       chomp;
129 |     }
130 | 
131 |     # records are separated, not terminated, by newlines; the last newline follows the loop
132 |     print "\n" if ($counter);
133 |     $counter = 1;
134 |     print substr($name,1), ":$seq:$qual";
135 |   }
136 |   print "\n";
137 | }
138 | 
139 | sub qseq2srf {  # disabled converter: dies at once, so the call below is never reached
140 |   die "This routine is currently out of order. Compatibility could only be guaranteed until GAP version 1.3
141 | Formerly it could be called as follows: fq_all2std.pl qseq2srf s_3_*_qseq.txt\n";
142 | 
143 |   my $arg = shift;  # unreachable: the former implementation, handing its argument to illumina2srf
144 |   system("$Bin/srf-conversions/illumina2srf $arg");
145 | }
146 | 
147 | sub qseqin2srf {  # disabled converter: dies at once with re-implementation notes, so the call below is never reached
148 |   die "This routine is currently out of order. Compatibility could only be guaranteed until GAP version 1.3
149 | Formerly it could be called as follows: fq_all2std.pl qseq2srf -b s_3_*_qseq.txt
150 | 
151 | If you want to re-implement it do not forget to complete the pre-processing steps of the cif intensities and cnf noise files.
152 | A) To generate the cif intensities files, go to the Lane folder in Intensities and type:
153 | for ((i=001;i<=120;i++));
154 | do /data/scripts/next-gen/convert_sequence_format/srf-conversions/cifToTxt -l <lane> -t \$i;
155 | done
156 | B) To generate the cnf noise files, go to the Lane folder in Intensities and type:
157 | for ((i=001;i<=120;i++));
158 | do /data/scripts/next-gen/convert_sequence_format/srf-conversions/cnfToTxt -l <lane> -t \$i;
159 | done\n";
160 |   my $arg = shift;  # unreachable: the former implementation, handing its argument to illumina2srf -b
161 |   system("$Bin/srf-conversions/illumina2srf -b $arg");
162 | }
163 | 
164 | sub instruction {
165 |   # Prints a fixed explanation of how Solexa/Illumina FASTQ differs from the standard (Sanger) FASTQ.
166 |   print "
167 | FASTQ format is first used in the Sanger Institute, and therefore
168 | we take the Sanger specification as the standard FASTQ. Although
169 | Solexa/Illumina reads file looks pretty much like the standard
170 | FASTQ, they are different in that the qualities are scaled
171 | differently. In the quality string, if you can see a character
172 | with its ASCII code higher than 90, probably your file is in the
173 | Solexa/Illumina format.
174 | 
175 | Sometimes we also use an integer, instead of a single character,
176 | to explicitly show the qualities. In that case, negative
177 | qualities indicates that Solexa/Illumina qualities are used.
178 | 
179 | ";
180 | 
181 | }
182 | 
183 | # Print a small sample of each supported input format (solexa, scarf, fqint).
184 | sub example {
185 |   # Every sample starts with an empty line and ends with a newline.
186 |   my $scarf_sample = "\n" . <<'END_SCARF';
187 | USI-EAS50_1:4:2:710:120:GTCAAAGTAATAATAGGAGATTTGAGCTATTT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 19 23 23 23 18 23 23 23
188 | USI-EAS50_1:4:2:690:87:GTTTTTTTTTTTCTTTCCATTAATTTCCCTTT:23 23 23 23 23 23 23 23 23 23 23 23 12 23 23 23 23 23 16 23 23 9 18 23 23 23 12 23 18 23 23 23
189 | USI-EAS50_1:4:2:709:32:GAGAAGTCAAACCTGTGTTAGAAATTTTATAC:23 23 23 23 23 23 23 23 20 23 23 23 23 23 23 23 23 23 23 23 23 12 23 18 23 23 23 23 23 23 23 23
190 | USI-EAS50_1:4:2:886:890:GCTTATTTAAAAATTTACTTGGGGTTGTCTTT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
191 | USI-EAS50_1:4:2:682:91:GGGTTTCTAGACTAAAGGGATTTAACAAGTTT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 20 23 23 23 23 23 23 23 23 23 23 23 18 23 23 23 23
192 | USI-EAS50_1:4:2:663:928:GAATTTGTTTGAAGAGTGTCATGGTCAGATCT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
193 | END_SCARF
194 |   my $fqint_sample = "\n" . <<'END_FQINT';
195 | @4_1_912_360
196 | AAGGGGCTAGAGAAACACGTAATGAAGGGAGGACTC
197 | +4_1_912_360
198 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 21 40 40 40 40 40 40 40 40 40 26 40 40 14 39 40 40
199 | @4_1_54_483
200 | TAATAAATGTGCTTCCTTGATGCATGTGCTATGATT
201 | +4_1_54_483
202 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 40 40 40 28 40 40 40 40 40 40 16 40 40 5 40 40
203 | @4_1_537_334
204 | ATTGATGATGCTGTGCACCTAGCAAGAAGTTGCATA
205 | +4_1_537_334
206 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 21 29 40 40 33 40 40 33 40 40 33 31 40 40 40 40 18 26 40 -2
207 | @4_1_920_361
208 | AACGGCACAATCCAGGTTGATGCCTACGGCGGGTAC
209 | +4_1_920_361
210 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 40 40 40 40 40 40 40 40 31 40 40 40 40 40 40 15 5 -1 3
211 | @4_1_784_155
212 | AATGCATGCTTCGAATGGCATTCTCTTCAATCACGA
213 | +4_1_784_155
214 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 31 40 40 40 40 40
215 | @4_1_595_150
216 | AAAGACGTGGCCAGATGGGTGGCCAAGTGCCCGACT
217 | +4_1_595_150
218 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 30 40 40 40 40 40 40 40 40 40 20 40 40 40 40 40 14 40 40
219 | END_FQINT
220 |   my $solexa_sample = "\n" . <<'END_SOLEXA';
221 | @SLXA-B3_649_FC8437_R1_1_1_610_79
222 | GATGTGCAATACCTTTGTAGAGGAA
223 | +SLXA-B3_649_FC8437_R1_1_1_610_79
224 | YYYYYYYYYYYYYYYYYYWYWYYSU
225 | @SLXA-B3_649_FC8437_R1_1_1_397_389
226 | GGTTTGAGAAAGAGAAATGAGATAA
227 | +SLXA-B3_649_FC8437_R1_1_1_397_389
228 | YYYYYYYYYWYYYYWWYYYWYWYWW
229 | @SLXA-B3_649_FC8437_R1_1_1_850_123
230 | GAGGGTGTTGATCATGATGATGGCG
231 | +SLXA-B3_649_FC8437_R1_1_1_850_123
232 | YYYYYYYYYYYYYWYYWYYSYYYSY
233 | @SLXA-B3_649_FC8437_R1_1_1_362_549
234 | GGAAACAAAGTTTTTCTCAACATAG
235 | +SLXA-B3_649_FC8437_R1_1_1_362_549
236 | YYYYYYYYYYYYYYYYYYWWWWYWY
237 | @SLXA-B3_649_FC8437_R1_1_1_183_714
238 | GTATTATTTAATGGCATACACTCAA
239 | +SLXA-B3_649_FC8437_R1_1_1_183_714
240 | YYYYYYYYYYWYYYYWYWWUWWWQQ
241 | END_SOLEXA
242 | 
243 |   # Each section is a title, its underline and the sample, one per line.
244 |   my @sections = (
245 |     [ 'solexa', '======', $solexa_sample ],
246 |     [ 'scarf',  '=====',  $scarf_sample ],
247 |     [ 'fqint',  '=====',  $fqint_sample ],
248 |   );
249 |   my $report = "\n";
250 |   for my $section (@sections) {
251 |     $report .= join("\n", @$section) . "\n";
252 |   }
253 |   print $report;
254 | }
255 | 


--------------------------------------------------------------------------------
/tools/qseq2fasta.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Convert a qseq file to FASTA.
 4 | #
 5 | # Usage: qseq2fasta.pl <file>
 6 | #
 7 | # Every input line is split on whitespace. Fields 0 and 2-7 form the record
 8 | # header (">f0:f2:f3:f4:f5#f6/f7") and field 8 is written as the sequence.
 9 | # The records are saved to "<file>.fa". Lines with fewer than nine fields
10 | # are reported on STDERR and skipped.
11 | 
12 | use strict;
13 | use warnings;
14 | 
15 | if(@ARGV < 1){
16 |    die "Usage: $0 <file>\n";
17 | }
18 | 
19 | my $qseq = $ARGV[0];
20 | my $fasta = $qseq . ".fa";
21 | 
22 | # Three-argument open with lexical handles: a file name that starts with
23 | # '>' or ends with '|' is opened as a file, never as a mode or a command.
24 | open(my $in, "<", $qseq) or die "Can't open $qseq for reading ($!) --fatal.\n";
25 | open(my $out, ">", $fasta) or die "Can't open $fasta for writing ($!) --fatal.\n";
26 | 
27 | while (my $line = <$in>) {
28 |     chomp $line;
29 |     my @parts = split(/\s+/, $line);
30 |     if (@parts < 9) {
31 |         # Blank or truncated line: there is no sequence field to write.
32 |         warn "Skipping line $. of $qseq: fewer than 9 fields.\n";
33 |         next;
34 |     }
35 |     print {$out} ">$parts[0]:$parts[2]:$parts[3]:$parts[4]:$parts[5]#$parts[6]/$parts[7]\n";
36 |     print {$out} "$parts[8]\n";
37 | }
38 | 
39 | close $in;
40 | # Buffered write errors (e.g. a full disk) only show up when closing.
41 | close $out or die "Can't close $fasta after writing ($!) --fatal.\n";
42 | 
43 | exit;
44 | 


--------------------------------------------------------------------------------
/tools/qseq2fastq.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Convert a qseq file to FASTQ, written to STDOUT.
 4 | #
 5 | # Usage: qseq2fastq.pl <file>
 6 | #
 7 | # Every input line is split on tabs. Fields 0 and 2-7 form the record
 8 | # header ("@f0:f2:f3:f4:f5#f6/f7"), field 8 is printed as the sequence and
 9 | # field 9 as the quality string (copied unchanged). Lines with fewer than
10 | # ten fields are reported on STDERR and skipped.
11 | 
12 | use warnings;
13 | use strict;
14 | 
15 | if(@ARGV < 1){
16 |    die "Usage: $0 <file>\n";
17 | }
18 | 
19 | my $qseq = $ARGV[0];
20 | 
21 | # Three-argument open with a lexical handle: a file name that starts with
22 | # '>' or ends with '|' is opened as a file, never as a mode or a command.
23 | open(my $in, "<", $qseq) or die "Can't open $qseq for reading ($!) --fatal.\n";
24 | 
25 | while (my $line = <$in>) {
26 |     chomp $line;
27 |     my @parts = split(/\t/, $line);
28 |     if (@parts < 10) {
29 |         # Blank or truncated line: sequence and/or quality are missing.
30 |         warn "Skipping line $. of $qseq: fewer than 10 fields.\n";
31 |         next;
32 |     }
33 |     print "\@$parts[0]:$parts[2]:$parts[3]:$parts[4]:$parts[5]#$parts[6]/$parts[7]\n";
34 |     print "$parts[8]\n";
35 |     print "+\n";
36 |     print "$parts[9]\n";
37 | }
38 | 
39 | close $in;
40 | 


--------------------------------------------------------------------------------
/tools/sam_bam2tab.pl:
--------------------------------------------------------------------------------
 1 | ########################################################################
 2 | #Marten Boetzer BaseClear B.v. 26-07-2011                              #
 3 | #SSPACE perl sam_bam2Tab.pl                                            #
 4 | #This script;                                                          #
 5 | #  -converts a .sam file to a tab file containing;                     #
 6 | #      -contig of read 1                                               #
 7 | #      -start position of read 1                                       #
 8 | #      -end position of read 1                                         #
 9 | #      -contig of read 2                                               #
10 | #      -start position of read 2                                       #
11 | #      -end position of read 2                                         #
12 | #                                                                      #
13 | #  -Sam/Bam file should contain a read pair at consecutive             #
14 | #   lines where the first line contains the first read and             #
15 | #   second line the second read                                        #
16 | #   In order to have such a file, sort the sam file                    #
17 | #   before using this script with SAMTools command:                    #
18 | #   samtools view -uS <input.sam> | samtools sort -n - <input.sorted>  #
19 | #                                                                      #
20 | #  -This script requires samtools to be installed                      #
21 | #                                                                      #
22 | #  -Bam files should end with .bam extension                           #
23 | #                                                                      #
24 | #INPUT:                                                                #
25 | #   perl sam_bam2Tab.pl <samfile> <postfixread1> <postfixread2         #
26 | #                                                                      #
27 | #   example:                                                           #
28 | #   perl sam_bam2Tab.pl input.sorted.bam /1 /2 out.tab                 #
29 | #   or                                                                 #
30 | #   perl sam_bam2Tab.pl input.sorted.sam /1 /2 out.tab                 #
31 | #                                                                      #
32 | #   This means that the first read is ending with /1 while the         #
33 | #   second read ends with /2                                           #
34 |                                                                        #
35 | #OUTPUT:                                                               #
36 | #   Output of this script is saved into                                #
37 | ########################################################################
38 | 
39 | my $infile = $ARGV[0];
40 | my $postfix1 = $ARGV[1];
41 | my $postfix2 = $ARGV[2];
42 | my $outfile = $ARGV[3];
43 | die "length of postfix1 ($postfix1) has not same length of postfix2 ($postfix2). Exiting...\n" if(length($postfix1) != length($postfix2));
44 | 
45 | my $bam=($infile =~ /.bam$/)? 1:0;
46 | 
47 | if($bam){
48 |     open(SAM, "samtools view $infile |") or die "Can't open $infile for reading -- fatal\n";
49 | }else{
50 |     open(SAM, "$infile") || die "Can't open $infile for reading -- fatal\n";
51 | }
52 | open(OUT, ">$outfile") || die "Can't open $outfile for writing -- fatal\n";
53 | 
54 | my $step = 100000;
55 | my ($ct, $diffct, $read, $prevread, $prevline, $line);
56 | while($line = <SAM>){
57 |   next if($line =~ /^@/);
58 |   ($read, undef, $chrom) = split("\t", $line);
59 |   next if($chrom eq "*");
60 |   if($read !~ /$postfix1$/ && $read !~ /$postfix2$/){
61 |     warn("read $read had no suffix '$postfix1' or '$postfix2', please insert a correct suffix (e.g. '/1' and '/2')\n");
62 |   }
63 |   $read = substr($read,0,-(length($postfix1)));
64 |   if($prevread eq $read){
65 |     $pair_found++;
66 |     my ($line1, $line2) = ($prevline,$line);
67 |     if($prevread =~ /$postfix2$/){
68 |       $line1 = $line;
69 |       $line2 = $prevline;
70 |     }
71 |     my @arr1 = split("\t", $line1);
72 |     my @arr2 = split("\t", $line2);
73 | 
74 |     my ($tig1,$start1,$end1, $tig2,$start2,$end2) = ($arr1[2], $arr1[3], ($arr1[3]+length($arr1[9])), $arr2[2],$arr2[3],($arr2[3]+length($arr2[9])));
75 | 
76 |     if ($arr1[1] & 16) {
77 |       $end1 = $start1;
78 |       $start1 = $start1 + length($arr1[9]);
79 |     }
80 |     if ($arr2[1] & 16) {
81 |       $end2 = $start2;
82 |       $start2 = $start2 + length($arr2[9]);
83 |     }
84 |     print OUT "$tig1\t$start1\t$end1\t$tig2\t$start2\t$end2\n";
85 |   }
86 |   $prevread = $read;
87 |   $prevline = $line;
88 |   if(++$ct == $step){
89 |     CounterPrint("reads = $ct pairs = $pair_found");
90 |     $step = $step + 100000;
91 |   }
92 | }
93 | CounterPrint("\n");
94 | 
95 | sub CounterPrint{
96 |   my $countingMessager = shift;
97 |   print "\r$countingMessager";
98 |   $|++;
99 | }


--------------------------------------------------------------------------------