├── .travis.yml ├── COPYING ├── F132-01 SSPACE_Basic_User_Manual_v2.0.pdf ├── F132-02 SSPACE_Basic_Tutorial_v2.0.pdf ├── README ├── SSPACE_Basic.pl ├── SSPACE_Basic_v2.0.pl ├── bin ├── ExtendOrFormatContigs.pl ├── PairingAndScaffolding.pl └── readLibFiles.pl ├── dotlib └── DotLib.pm ├── example ├── contigs_abyss.fasta ├── ecoli_scaffolds_no_extension.summaryfile.txt └── libraries.txt └── tools ├── TQS.py ├── TQS.readme ├── TQSexport.py ├── TQSfastq.py ├── TRIMMING_PAIRED_READS.README ├── estimate_insert_size.pl ├── fq_all2std.pl ├── qseq2fasta.pl ├── qseq2fastq.pl └── sam_bam2tab.pl /.travis.yml: -------------------------------------------------------------------------------- 1 | language: "perl" 2 | perl: 3 | - 5.24-shrplib 4 | 5 | addons: 6 | apt: 7 | packages: 8 | - bowtie 9 | 10 | install: [] 11 | 12 | script: 13 | - wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001665/SRR001665_1.fastq.gz 14 | - wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001665/SRR001665_2.fastq.gz 15 | - gunzip SRR001665_?.fastq.gz 16 | - ./SSPACE_Basic.pl -l example/libraries.txt -s example/contigs_abyss.fasta -k 5 -a 0.7 -x 0 -b ecoli_scaffolds_no_extension 17 | # Check that the output is correct 18 | - diff -u example/ecoli_scaffolds_no_extension.summaryfile.txt . 19 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. 
By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /F132-01 SSPACE_Basic_User_Manual_v2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nsoranzo/sspace_basic/4fe5c275c94b36e02d1b69438a2f7e022ecb58bc/F132-01 SSPACE_Basic_User_Manual_v2.0.pdf -------------------------------------------------------------------------------- /F132-02 SSPACE_Basic_Tutorial_v2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nsoranzo/sspace_basic/4fe5c275c94b36e02d1b69438a2f7e022ecb58bc/F132-02 SSPACE_Basic_Tutorial_v2.0.pdf -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Scaffolding Pre-Assemblies After Contig Extension (SSPACE) 2 | 3 | SSPACE BASIC 4 | © 2011 Marten Boetzer, Walter Pirovano 5 | © 2014,2016 Nicola Soranzo 6 | email: nicola.soranzo@earlham.ac.uk 7 | 8 | NOTICE 9 | ====== 10 | 11 | This is mainly a repository to store the last open source release (GNU GPL 2.0 12 | license) of the "basic" version of SSPACE before it was discontinued. SSPACE 13 | "standard", a newer, but non-open source, version of SSPACE is available at 14 | https://www.baseclear.com/services/bioinformatics/basetools/sspace-standard/ 15 | 16 | I have added a few patches of mine and I am open to external contributions, but 17 | I do not offer support or plan any further development.
18 | 19 | Description 20 | ----------- 21 | 22 | SSPACE is a script able to extend and scaffold pre-assembled contigs using one or more mate pairs or paired-end libraries, or even a combination. 23 | 24 | Implementation and requirements 25 | ------------------------------- 26 | 27 | SSPACE is implemented in Perl and runs on Linux, MacOS and Windows. SSPACE requires bowtie and bowtie-build commands to be in your PATH, for more information about Bowtie see http://bowtie-bio.sourceforge.net/ . 28 | 29 | SSPACE is built based on SSAKE. Code of SSAKE is changed to be able to extend and scaffold pre-assembled contigs for multiple paired reads libraries. 30 | 31 | PLEASE READ: 32 | SSPACE tracks all contigs in memory. That means that the memory usage will increase drastically with the size of your contig data set. In addition, during contig extension single reads are extracted and mapped to the contigs. Unmapped reads are stored in memory. Again, the more reads that can not map, the bigger the dataset and the more memory is used. Just be aware of these limitations and don't be surprised if you observe a lot of data swapping to disk if you attempt to run SSPACE on a machine with little RAM. 33 | 34 | Contig extension might not be suited to work with 454-type read pair libraries. Simply because recurring base insertions/deletions errors, such as those commonly seen in homopolymeric regions, will not cluster well in the context of the SSAKE contig extension algorithm scheme. In addition, long 454 reads are less likely to map against the contigs, thus fewer read pairs are found and scaffolding is based on fewer read pairs. One possibility is to allow gaps during mapping using the '-g' parameter. 35 | 36 | Citing SSPACE 37 | ------------ 38 | 39 | Thank you for using, developing and promoting this free software. 40 | If you use SSPACE for your research, please cite: 41 | 42 | Boetzer M, Henkel CV, Jansen HJ, Butler D and Pirovano W. 2010.
Scaffolding pre-assembled contigs using SSPACE. Bioinformatics. 27(4):578-579 43 | 44 | Running SSPACE 45 | ------------- 46 | 47 | e.g. perl SSPACE_Basic.pl -l libraries.txt -s contigs.fasta -x 0 -m 32 -o 20 -t 0 -k 5 -a 0.70 -n 15 -p 0 -v 0 -z 0 -g 0 -T 1 -b standard_out 48 | 49 | Usage: ./SSPACE_Basic.pl 50 | 51 | General parameters: 52 | -l Library file containing two paired read files with insert size, error and orientation (see Manual for more information). Also possible to insert .tab files with pairing information (REQUIRED) 53 | -s FASTA file containing contig sequences used for extension. Inserted paired reads are mapped to extended and non-extended contigs (REQUIRED) 54 | -x Indicate whether to extend the contigs of -s using paired reads in -l (-x 1=extension, -x 0=no extension, default -x 0) 55 | 56 | Extension parameters: 57 | -m Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 32) 58 | -o Minimum number of reads needed to call a base during an extension (default -o 20) 59 | -t Trim up to -t base(s) on the contig end when all possibilities have been exhausted for an extension (default -t 0) 60 | -u Single FASTA/FASTQ file containing unpaired sequence reads (optional) 61 | -r Minimum base ratio used to accept an overhang consensus base (default -r 0.9) 62 | 63 | Scaffolding parameters: 64 | -z Minimum contig length used for scaffolding. Filters out contigs below this value (default -z 0) 65 | -k Minimum number of links (read pairs) to compute scaffold (default -k 5) 66 | -a Maximum link ratio between two best contig pairs. Higher values lead to less accurate scaffolding (default -a 0.7) 67 | -n Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n 15) 68 | 69 | Bowtie parameters: 70 | -g Maximum number of allowed gaps during mapping with Bowtie. Corresponds to the -v option in Bowtie.
A higher number of allowed gaps can lead to less accurate scaffolding (default -g 0) 71 | -T Specifies the number of threads in Bowtie. Corresponds to the -p/--threads option in Bowtie (default -T 1) 72 | 73 | Additional options: 74 | -b Base name for your output files (default -b standard_output) 75 | -v Runs in verbose mode (-v 1=yes, -v 0=no, default -v 0) 76 | -p Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p 0) 77 | 78 | 79 | How it works 80 | ------------ 81 | 82 | The program consists of several steps; a short overview follows. 83 | 84 | The first steps are reading the data and filtering it. The protocol is slightly different when -x is set to either 0 or 1. We treat them separately here; 85 | 86 | With -x 0 the steps are; 87 | 1) Read -l library file; 88 | A) For each library in the -l library file. Store the reads in appropriate format. Paired reads are stored in a new file with a similar read name for easy tracking of the paired read. Format is; 89 | 90 | >read1.1 91 | AGCTGATAGATGAT 92 | >read1.2 93 | GATGATAGATAGAC 94 | 95 | 2) Convert the inserted contig file to appropriate format. 96 | 97 | With -x 1 the steps are; 98 | 99 | 1) Read -l library file; 100 | A) For each library in the -l library file. Store the reads in appropriate format, similar as step 1A. 101 | B) For all libraries 102 | - store the single reads to a new file. Only reads containing only ACGT characters are stored. 103 | 2) Extend the pre-assembled contigs 104 | A) Map single reads of step 1B to (-s) contig file with Bowtie. 105 | B) Read unmapped reads into memory. 106 | C) Go through each contig in the (-s) contig file, and try to extend the contig. The new contigs are stored in a new file.
107 | 108 | 109 | After producing either a formatted or an extended contig file, the next step is to go through each library in the -l library file and map the filtered paired reads of step 1A to the new contigs; 110 | 111 | 3) Use Bowtie to map single reads of 1A to either the formatted or extended contigs. Map only reads that are on the edges of the contigs. Only reads that map to only one contig are stored in a file. Position and orientation of each read is stored in the file. 112 | 4) Retrieve the position of each found read. 113 | 5) Pair contigs if both reads of a paired-read are found, store the pairing information into memory. In addition, store the sequence of the pair into memory. If the sequence of a pair is already used for pairing contigs, it is not used again. 114 | 6) Pair contigs based on the number of links (-k) and link ratio (-a) 115 | 7) Merge, orient and order the contigs to produce scaffolds. 116 | 117 | 8) If multiple libraries are in -l file, the produced scaffolds in FASTA format are the input for the new library. Steps 3 till 8 are repeated for each library. 118 | 119 | A more detailed view of the six main steps is given below. 120 | 121 | Detailed view 122 | ------------ 123 | 124 | 125 | 1. Reading libraries 126 | Both FASTA/FASTQ files inserted at the -l library file are read, converted and stored in a new file. This new file is used for mapping with Bowtie (step 4), where the new naming of the headers makes it easy to backtrack the original read pair. 127 | 128 | >read1.1 (read from file 1) 129 | ACGATGCTAT 130 | 131 | >read1.2 (read from file 2) 132 | ACCGCGCCCC 133 | 134 | If -x 1 is set, for contig extension, single reads containing only ACGT characters are stored in a new file. The single reads are mapped to contigs at the next step. 135 | 136 | 2. Mapping when -x 1 137 | To extend contigs, only reads that are not already present on the contigs should be used.
Otherwise, reads are re-used and cause erroneous contigs, and also cause reads to be mapped to multiple locations/contigs (step 4). To filter these reads out, Bowtie is used. Bowtie maps the produced single reads at step 1 to the (-s) pre-assembled contigs. A file is generated with reads that did not map to the contigs. The unmapped read file is read in memory, populating a hash table keyed by unique sequence reads with pairing values representing the number of sequence occurrences. The hash is used for contig extension at the next section. 138 | 139 | 3. Extending when -x 1 140 | Contigs are extended, when -x set to 1, using the unmapped reads with a method developed by SSAKE. With SSAKE, contigs extension is initiated by generating the longest 3'-most word (k-mer) from the unassembled read u that is shorter than the sequence read length l. Every possible 3' most k-mers will be generated from u and used in turn for the search until the word length is smaller than a user-defined minimum, m. Meanwhile, all perfectly overlapping reads will be collected in an array and further considered for 3' extension once the k-mer search is done. At the same time, a hash table c will store every base along with a coverage count for every position of the overhang (or stretches of bases hanging off the seed sequence u). 141 | 142 | Once the search is complete, a consensus sequence is derived from the hash table c, taking the most represented base at each position of the overhang. To be considered for the consensus, each base has to be covered by user-defined -o (set to 20 by default). If there's a tie (two bases at a specific position have the same coverage count), the prominent base is below a user-defined ratio r, the coverage -o is too low or the end of the overhang is reached, the consensus extension terminates and the consensus overhang is joined to the contig.
All reads overlapping are searched against the newly formed sequence and, if found, are removed from the hash table and prefix tree. If they are not part of the consensus, they will be used to extend other contigs, if applicable. If no overlapping reads match the newly formed contig, the extension is terminated from that end and SSAKE resumes with a new contig. That prevents infinite looping through low complexity DNA sequences. In the former case, the extension resumes using the new [l-m] space to search for joining k-mers. 143 | 144 | The process of progressively cycling through 3'-most k-mer is repeated after every contig extension until nothing else can be done on that side. Since only left-most searches are possible with a prefix tree, when all possibilities have been exhausted for the 3' extension, the complementary strand of the contiguous sequence generated is used to extend the contig on the 5' end. The DNA prefix tree is used to limit the search space by segregating sequence reads and their reverse-complemented counterparts by their first eleven 5' end bases. 145 | 146 | There are three ways to control the stringency in SSPACE: 147 | 1) Disallow contig extension if the coverage is too low (-o). Higher -o values lead to shorter contigs, but minimizes sequence misassemblies. 148 | 2) Adjust the minimum overlap -m allowed between the contig and short sequence reads. Higher m values lead to more accurate contigs at the cost of decreased contiguity. 149 | 3) Set the minimum base ratio -r to higher values 150 | 151 | After the sequence assembly, a file is generated with .extendedcontigs.fasta extension in the 'intermediate_results' folder. This file contains both extended and non-extended contigs. 152 | 153 | The next steps are looped through each library, present in the (-l) library file. 154 | 155 | 4. Mapping unique paired reads 156 | 157 | At step 1, pairs of each library were filtered. 
Reads containing N's are unable to correctly map to the contigs, therefore they are not used by Bowtie. Bowtie maps the single reads to the contigs, produced either after extending (if -x 1), or after formatting (if -x 0), or after step 5 if multiple libraries are inserted on -l. 158 | 159 | Before mapping, contigs are shortened, reducing the search space for Bowtie. Only edges of the contigs are considered for mapping. Cutting of edges is determined by taking the maximal allowed distance inserted by the user in the library file (insert size and insert standard deviation). The maximal distance is insert_size + (insert_size * insert_stdev). For example, with an insert size of 500 and a deviation of 0.5, the maximal distance is 750. In this case, the first 750 bases and the last 750 bases are extracted from the contig sequence. 160 | 161 | ------------------------------------------ 162 | | | 163 | ------------ ------------ 164 | 750bp 750bp 165 | 166 | This step reduces the search space by merging the two sequences, divided by an 'N' character. 167 | 168 | The algorithm of mapping goes through each pair and checks its occurrence on the edges of the contigs. If both reads are found, the reads of the pair are stored and contigs could be paired in the next step. Otherwise, the pair is not stored and is not used for contig pairing. If a pair was previously found and used for contig pairing, the pair is not considered again. Otherwise the same links between contigs would be found based on the same read pair, which can generate misleading results. 169 | 170 | If either of the two reads of a read pair occurs on multiple contigs, one cannot tell which contigs should be paired. For example, the left read occurs at contigs 1 and 3, and the right read at contig 2. For this situation it is impossible to tell if contigs 1 and 2 should be paired, or contigs 3 and 2. Therefore, reads that occur multiple times on contigs are not considered for contig pairing. 171 | 172 | 5a.
Building scaffolds 173 | The final step is scaffolding. SSPACE uses an updated version of the SSAKE scaffolder for this. For each read pairs, putative contig pairs (pre-scaffolding stage) are tallied based on the position/location of the paired reads on different contigs. Contig pairs are only considered if the calculated distance between them satisfy the mean distance specified (fourth column in -l file) while allowing for a deviation (fifth column in -l file), also defined by the user. Only contig pairs having a valid gap or overlap are allowed to proceed to the scaffolding stage. 174 | Please note that this stage accepts redundancy of contig pairs (i.e. a given contig may link to multiple contigs, and the number of links (spanning pairs) between any given contig pair is recorded, along with a mean putative gap or overlap(-)). 175 | 176 | Once pairing between contigs is complete, the scaffolds are built using contigs as seeds. Every contig is used in turn until all have been incorporated into a scaffold. 177 | 178 | Consider the following contig pairs (AB, AC and rAD): 179 | 180 | A B 181 | ========= ======== 182 | -> <- 183 | -> <- 184 | -> <- 185 | -> <- 186 | 187 | A C 188 | ========= ====== 189 | -> <- 190 | -> <- 191 | 192 | rA D equivalent to rDA, in this order 193 | ========= ======= 194 | -> <- 195 | -> <- 196 | -> <- 197 | 198 | Two parameters control scaffolding (-k and -a). The -k option specifies the minimum number of links (read pairs) a valid contig pair MUST have to be considered. The -a option specifies the maximum ratio between the best two contig pairs for a given contig being extended. For example, contig A shares 4 links with B and 2 links with C, in this orientation. contig rA (reverse) also shares 3 links with D. When it's time to extend contig A (with the options -k and -a set to 2 and 0.7, respectively), both contig pairs AB and AC are considered. 
Since C (second-best) has 2 links and B (best) has 4 (2/4) = 0.5 below the maximum ratio of 0.7, A will be linked with B in the scaffold and C will be kept for another extension. If AC had 3 links the resulting ratio (0.75), above the user-defined maximum 0.7 would have caused the extension to terminate at A, with both B and C considered for a different scaffold. A maximum links ratio of 1 (not recommended) means that the best two candidate contig pairs have the same number of links -- SSPACE will accept the first one since both have a valid gap/overlap. The above method was adopted from SSAKE. SSPACE improved on this by introducing an additional method for the case where a contig can link to more than one alternative. Both methods (original SSAKE method and our method) for handling alternatives are explained below; 199 | 200 | In version 2.0 of SSPACE an additional ratio is used to generate more reliable scaffolds, especially for libraries with large insert sizes. This ratio is used as an additional control for the scaffolding process. A contig with multiple links should satisfy both ratios in order to form a scaffold. The rules for scaffolding contigs with multiple alternative contig connections are explained in more detail below. 201 | 202 | If a contig can be linked to more than one alternative, connections between these alternatives are searched and linked together if a connection is found. Otherwise a ratio is calculated between the two best alternatives. If this ratio is below a threshold (-a) a connection with the best scoring alternative is established. The two methods are shown below; 203 | 204 | The first method; 205 | A has 10 links with B 206 | A has 5 links with C 207 | B has 10 links with C; 208 | 209 | Result is a scaffold containing A-B-C 210 | 211 | The second method (only used if first method did not produce a scaffold) is based on two ratios.
The first ratio (ratio1) is based on the number of links, while the second ratio (ratio2) is based on the number of links and the used search space. This will be explained using an example; 212 | 213 | If we have an insert size of 450 and contig A has two alternative connections, to contigs B and C, with the following details; 214 | 215 | A and B with; 216 | gap = 100 217 | links = 19 218 | size of B is 100bp 219 | 220 | A and C with; 221 | gap = 400 222 | links = 9 223 | size of C is 1000bp 224 | 225 | Ratio1 is simply calculated by dividing the number of links of the alternative with the fewest links by the number of links of the alternative with the most links; 226 | 227 | Here, this is 9/19 (C/B) = 0.47. 228 | 229 | 230 | Ratio2 is calculated by incorporating the insert size. SSPACE first determines the amount of search space that was used for searching links. 231 | 232 | In figure, where each character represents 50bp, this looks something like; 233 | 234 | <100bp> 235 | ==(B) 236 | gap=100 / 237 | / 238 | (A)====== 239 | \ 240 | gap=400 \ 241 | ------====================(C) 242 | <1000bp> 243 | ********* 244 | < SEARCH SPACE > 245 | 246 | Legend; 247 | * = search space 248 | = = contig 249 | - = gap 250 | 251 | Now we calculate the used space on contigs (B) and (C) that was used for pairing with contig (A). In principle, this is just calculating the number of nucleotides that fall into the SEARCH SPACE. 252 | For contig B, we can see that the whole contig falls into the SEARCH SPACE. Therefore, the space = 100bp 253 | For contig C, we can see that only the first 50bp of the contig falls into the SEARCH SPACE. Therefore, the space = 50bp. 254 | 255 | Next, we estimate the number of links per space, by dividing the total number of links by the found space; 256 | For contig B, this is 19 links per 100 bp space = 0.19 links per space 257 | For contig C, this is 9 links per 50 bp space = 0.18 links per space 258 | 259 | Ratio2 is then calculated by dividing the two numbers; 0.18/0.19 = 0.95.
If both ratio1 and ratio2 are below the -a ratio threshold, the scaffold is A-B. Otherwise, no reliable scaffold can be formed and the scaffold extension is stopped. 260 | 261 | 5b. Left scaffold extension 262 | When a scaffold extension is terminated on one side, the scaffold is extended on the "left", by looking for contig pairs that involve the reverse of the seed (in this example, rD). With AB and AC having 4 and 2 links, respectively and rD being the only pair on the left, the final scaffolds outputted by SSPACE would be: 263 | 264 | 1) rD-A-B 265 | 2) C 266 | 267 | SSPACE outputs a .scaffolds file with linkage information between contigs (see "Understanding the .scaffolds csv file" below) 268 | Accurate scaffolding depends on many factors. Number and nature of repeats in your target sequence, optimum adjustments of insert size, error, parameters -k and -a and data quality/size of sequence set (more doesn't mean better) will all affect SSPACE's ability to build scaffolds. 269 | 270 | 271 | 6. Merging contigs 272 | SSAKE scaffolder produces links between contigs and determines the possible gap between them. For a positive gap, m number of N's will be placed between them if a gap of size m is predicted to occur. When a negative gap is generated, a putative overlap is predicted to occur. The adjacent contigs are searched for overlap within a window given at -n option till 50 bp. If an overlap was found, contigs are merged and the region is marked with lowercase nucleotides. Otherwise, if no overlap was detected, a single "n" will be placed between the contigs. A short overview of this step with three examples; 273 | 274 | >contig_1 275 | AGCTAGTCGTAGCTTGTAC 276 | >contig_2 277 | ACGTAGTGATATTATTGTC 278 | 279 | Example 1: 280 | A link between contig_1 and contig_2 is found, with a putative gap of 10. In the final output, the gaps is indicated by 10 N's between the two contigs. 281 | 282 | Link = contig_1 with contig_2. 
Gap = 10; 283 | AGCTAGTCGTAGCTTGTACNNNNNNNNNNACGTAGTGATATTATTGTC 284 | 285 | Example 2; 286 | A link between contig_1 and contig_2 is found, with a putative gap of -10. When using the -n 10 option, no overlap was found and a single lowercase "n" is inserted between the two contigs. 287 | 288 | Link = contig_1 with contig_2. Gap = -10. -n = 10; 289 | AGCTAGTCGTAGCTTGTACnACGTAGTGATATTATTGTC 290 | 291 | Example 3; 292 | A link between contig_3 and contig_4 is found, with a putative gap of -10. When using the -n 10 option, an overlap of 13 nucleotides was found, indicated in lower case in the final output. 293 | 294 | >contig_3 295 | AGTGTTAGATAGTTATAGA 296 | >contig_4 297 | AGATAGTTATAGAAGTAGT 298 | 299 | Link = contig_3 with contig_4. Gap = -10. -n = 10; 300 | AGTGTTagatagttatagaAGTAGT 301 | 302 | TIP: The summary file calculates the mean and median insert size based on mapping of paired reads on a single contig. For more reliable gap and overlap estimation, consider replacing the insert size in the library file with the calculated mean. 303 | 304 | 305 | Input sequences 306 | --------------- 307 | 308 | FASTA FILES: 309 | >ILLUMINA-52179E_0001:3:1:1062:15216#0/2 310 | ATNGGGTTTTTCAACTGCTAAGTCAGCAGGCTTTTCACCCTTCAACATC 311 | >ILLUMINA-52179E_0001:3:1:1062:4837#0/2 312 | ANNAACTCGTGCCGTTAAAGGTGGTCTTGCATTTCAGAAAGCTCACCAG 313 | 314 | FASTQ files: 315 | @ILLUMINA-52179E_0001:3:1:1062:15216#0/2 316 | ATNGGGTTTTTCAACTGCTAAGTCAGCAGGCTTTTCACCCTTCAACATC 317 | +ILLUMINA-52179E_0001:3:1:1062:15216#0/2 318 | OOBOLJ[HHO`_aaa`a_]aaaY[`Za[Y[F]]VZWX]WZ^Z^^^O[XY 319 | @ILLUMINA-52179E_0001:3:1:1062:4837#0/2 320 | ANNAACTCGTGCCGTTAAAGGTGGTCTTGCATTTCAGAAAGCTCACCAG 321 | +ILLUMINA-52179E_0001:3:1:1062:4837#0/2 322 | OBBOO^^^^^bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb`bbbb` 323 | 324 | General points: 325 | -Files present in the -l library file should either be in FASTA or FASTQ format, which is automatically determined by the program.
For each paired read, one of the reads should be in the first file, and the other one in the second file. The paired reads are required to be on the same line in both files. 326 | -the header (given after "@" character for FASTQ or ">" for FASTA) of contig and paired-read data files could be of any format. No typical naming convention is needed. Duplicate names are also allowed. 327 | -Quality values of the FASTQ files are not used. 328 | -To be considered, sequences have to be longer than 16 nt or -m (but can be of different lengths). If they are shorter, the program will simply omit them from the process. 329 | -Reads containing ambiguous bases, like <N> and <.>, and characters other than ACGT will be ignored entirely in input FASTA/FASTQ files inserted with -l option. 330 | -Contigs (inserted with -s option) containing ambiguous bases, like <N> and <.>, and characters other than ACGT are not ignored. However, contigs having these other characters can prevent proper contig extension when they are at the beginning or end of the sequence. 331 | -Spaces in any FASTQ and FASTA file are NOT permitted and will either not be considered or result in execution failure 332 | -For Bowtie, option -v 0 is used by default (adjustable with -g), which corresponds to zero mismatches allowed on mapping. In addition bowtie's -m 1 option is used; only reads that map exactly to one contig (both in normal and reverse complement) are outputted. Pairs that are present on multiple contigs are not used for scaffolding. Results are stored in the folder 'bowtieoutput'. For information about Bowtie see http://bowtie-bio.sourceforge.net/ . 333 | 334 | 335 | FASTA header of .extendedcontigs.fasta file 336 | ------------ 337 | 338 | e.g. 339 | >extcontig27|size52|read193|cov92.79|seed:PreAssembledCtg0027 340 | 341 | contig id# = 27, this contig is extended during extension step. If not extended, the contig is named >contig27 342 | size (G) = 52 nt. Size of the contig. 343 | number of reads (N) = 193. Number of reads for extension.
344 | cov [coverage] (C) = 92.79. the coverage (C) is calculated using the total number (T) of consensus bases [sum(L)] provided by the assembled sequences divided by the contig size: 345 | 346 | C = T / G 347 | seed = PreAssembledCtg0027. Header of the original pre-assembled contig file. 348 | 349 | Output files 350 | ------------ 351 | Each file name starts with the basename given with the -b parameter. First, four main files are generated in the current working directory; 352 | 353 | (basename).final.scaffolds.fasta :: text file; Final scaffolds produced by SSPACE. 354 | (basename).final.evidence:: text file; Produced scaffolds including the initial numbered contigs. 355 | (basename).logfile :: text file; Logs execution time / errors 356 | (basename).summaryfile:: text file; Gives a summary after every step. Summary of number of inserted sequences, filtered sequences, contig sequences, mapping stats, pairing stats and contig/scaffold size summaries. 357 | 358 | 359 | In addition, four folders are generated, each having a number of files; 360 | 361 | 'reads' folder; 362 | (basename).(libname).file(libnumber).fasta:: FASTA file; Converted files of the paired-read data, each two consecutive sequences are pairs. This file is used as input for both the contig extension and the scaffolding step. 363 | 364 | 'bowtieoutput' folder; 365 | Four files are generated by bowtie; 366 | (basename).bowtieIndex.* :: index file; Index files generated by 'bowtie-build'. Produced for each library. 367 | 368 | For further information about the outputs of Bowtie, see the Bowtie manual ( http://bowtie-bio.sourceforge.net/ ). 369 | 370 | 371 | 'pairinfo' folder; 372 | (basename).(libname).pairing_distribution.csv:: comma-separated file; 1st column is the calculated distance for each pair (template) with reads that assembled logically within the same contig. 2nd column is the number of pairs at that distance. Produced for each library.
373 | (basename).(libname).pairing_issues:: text file; Lists all pairing issues encountered between contig pairs and illogical/out-of-bounds pairing. Produced for each library. 374 | 375 | 'intermediate_results' folder; 376 | (basename).extendedcontigs.fasta :: FASTA file; All contig sequences. Both extended and non-extended contigs. Extended contigs are named ">ext_contig" , while non-extended are named ">contig" in the header. Only produced when -x 1. 377 | 378 | (basename).formattedcontigs.fasta :: FASTA file; Original contig sequences. Formatted to appropriate input for scaffolding. Only produced when -x 0. 379 | 380 | (basename).(libname).scaffolds :: comma-separated file; see below. Produced for each library. 381 | 382 | (basename).(libname).scaffolds.fasta :: FASTA file; All merged/unmerged contigs within scaffolds are listed. The overlap sequence between contigs (>= -n bases) will be shown in lower case within the merged contig. Note that *perfect* sequence overlap has to occur between 2 predicted adjacent contigs of a scaffold in order to merge. Only merging of two contigs is established if a negative gap is determined. When two consecutive contigs do not physically overlap, then gaps will be padded with Ns of length corresponding to the predicted gap size m (refer to Understanding the .scaffolds csv file below) and predicted but undetected overlaps with a single (n). 383 | 384 | (basename).(libname).scaffolds.evidence :: text file; Produced scaffolds including the initial numbered contigs (-s option). (refer to Understanding the .evidence file below). 385 | 386 | (basename).(libname).foundlinks :: text file; Links between the contigs/scaffolds and their correspond gapsize. 387 | 388 | (basename).(libname).repeats :: text file; Contig-edges having multiple links with other contigs. 
389 | 390 | 391 | 'dotfiles' folder; 392 | (basename).(libname).visual_scaffolds.dot :: dot file; This file can be used to visualise the contigs orientation and order on the scaffolds. The .dot file can be converted to any format using the GraphViz package using the 'dot' command (www.graphviz.org). Each dotfile is cut into 5mb parts, otherwise the scaffolds can't be converted and visualised properly. 393 | 394 | 395 | Understanding the .scaffolds csv file 396 | ------------------------------------- 397 | 398 | scaffold1,7484,f127Z7068k12a0.58m42_f3090z62k7a0.14m76_f1473z354 399 | 400 | Each column is separated by a comma; 401 | column 1: a unique scaffold identifier 402 | column 2: the sum of all contig sizes that made it to the scaffold/supercontig 403 | column 3: a contig chain representing the layout: 404 | 405 | e.g. 406 | f127Z7068k12a0.58m42_f3090z62k7a0.14m76_f1473z354 407 | 408 | means: contig f127 (strand=f/+), size (z) 7068 (Z if contig was used as the seed sequence) has 12 links (k), link ratio of 0.58 (a) with a mean gap of 42nt (m) with reverse (r) of contig 3090 (size 62) on the right. if m values are negative, it's just that a possible overlap was calculated using the mean distance supplied by the user and the position of the reads flanking the contig. 409 | Negative m values imply that there's a possible overlap between the contigs. But since the pairing distance distribution usually follows a Normal/Gaussian distribution, some distances are expected to be larger than the median size expected/observed. In reality, if the exact size was known between each paired-reads, we wouldn't expect much negative m values unless a break occurred during the contig extension (likely due to base errors/SNPs). 
410 | 411 | 412 | 413 | Understanding the .scaffolds.fasta file 414 | ------------------------------------- 415 | 416 | scaffold13.1|size84140|tigs14 417 | 418 | Each column represents; 419 | name of the scaffold 420 | size of the scaffold 421 | number of contigs in scaffold 422 | 423 | Each initial contig inputted at -s option stored in a scaffold is written to the .evidence file. This file is explained below. 424 | 425 | Understanding the .scaffolds.evidence file 426 | ------------------------------------- 427 | 428 | >scaffold1.1|size9058|tigs5 429 | f_tig5|size728|links12|gaps100 430 | r_tig1|size2726|links10|gaps89 431 | f_tig100|size3687|links4|gaps-46|merged40 432 | f_tig91|size238|links6|gaps392 433 | f_tig120|size1112 434 | 435 | The first line indicates the scaffold, which is the same as in the .scaffolds.fasta file. Next, for each contig the connection (orientation, links and gaps) with other contigs is given. The second line for example means forward contig 5 with size 728 has 12 links and a gap of 100bp with reverse contig 1. If a line ends with a 'merged' field (e.g. merged40), it means that the contig has overlap with the next contig, and they are merged. For contig f_tig100, 40 nucleotides had an overlap with contig f_tig91. 436 | 437 | 438 | Producing visualisation of scaffolds with .dot file using -p parameter 439 | ------------------------------------- 440 | 441 | To visualize the scaffolds of the .dot file, GraphViz should be downloaded at (www.graphviz.org). GraphViz converts the .dot file to any desired output using the 'dot' function. For example to convert the .dot to a .ps format; 442 | 443 | dot -Tps2 (basename).(libname).visual_scaffolds.dot -o MYOUTPUT.ps 444 | 445 | This will produce a postscript (.ps) file. For other options, see the manual of GraphViz. 446 | 447 | 448 | 449 | How does the .tab file work 450 | --------------------------- 451 | 452 | The .tab file is a tab-delimited file containing information about the positions of the reads on the contigs.
On each line, positions of both reads are given. 453 | 454 | A typical .tab file line looks like; 455 | 456 | contig1 100 150 contig1 300 250 457 | 458 | Here, the first read is found at contig1 with start and end at position 100 and 150, respectively. Meaning that the read is found at the positive strand (+). 459 | The second read is found at contig1 at start and end at position 300 and 250, respectively. Meaning that the read is found at the negative strand (-). 460 | 461 | In figure; 462 | read1 read2 463 | ----> <---- 464 | contig1 ---------------------------------------------------- 465 | 466 | 467 | Another line may look like; 468 | 469 | contig2 300 350 contig3 100 550 470 | 471 | Here, the first read is found at contig2 with start and end at position 300 and 350, respectively. Meaning that the read is found at the positive strand (+). 472 | The second read is found at contig3, a different contig than the first read; the figure below draws it on the negative strand (-). 473 | 474 | In figure; 475 | read1 476 | ----> read2 477 | contig2 ------------------------ <---- 478 | contig3 ------------- 479 | 480 | Normally, SSPACE parses the output of Bowtie directly to the above format and uses this information to pair the contigs and to determine the insert size. With the .tab format, users can put the mapping positions of the reads directly into SSPACE, which is much faster. Also, this way users can make use of their favorite read mapper and put the results into SSPACE. 481 | 482 | To work properly, the input contigs (-s option) should have the same name as the contigs in the .tab file, as explained in the MANUAL. Since the TAB file can be used in combination with other TAB files and also FASTA/FASTQ files, as well as multiple libraries, the original mappings should be updated after each library. Therefore, contigs are stored in memory, and their position in scaffolds is updated after each scaffold formation.
An example; 483 | 484 | contig2 (200bp) is linked with contig3 (200bp) with a gap of 10bp 485 | 486 | contig3 contig2 487 | scaf1------------------------NNNNNNNNNN-------------- 488 | 489 | the contigs are then updated to new positions; 490 | -contig3 is at position 1-200 at scaf1 491 | -contig2 is at position 210-410 at scaf1 492 | 493 | Most common used output format of read mappers are .sam format and their equivalent binary format .bam. A script is attached in the 'tools' folder in the SSPACE package, which converts .sam/.bam files to .tab format. See the TUTORIAL on an example on how such a process looks like. 494 | 495 | SSPACE does not 496 | -------------- 497 | 498 | -Take into consideration base quality scores. It is up to the user to process the sequence data before clustering with SSPACE. Python scripts (TQS.py, TQSfastq.py, TQSexport.fq) are provided to help trim poor quality bases off Illumina sequences. Refer to TQS.readme and TRIMMING_PAIRED_READS.README included in this distribution (in the ./tools subdirectory) for information on how to run those programs 499 | -Consider sequence read having any character other than A,C,G,T and will skip these reads entirely while reading the FASTA file. 500 | -Only input of FASTA or FASTQ is possible. For conversion to these formats use the fq_all2std.pl function in the ./tools directory. 
501 | -------------------------------------------------------------------------------- /SSPACE_Basic.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #AUTHOR 4 | # Marten Boetzer and Walter Pirovano (c) 2010 5 | # SSAKE-based Scaffolding of Pre-Assembled Contigs after Extension (SSPACE) 6 | # walter.pirovano@baseclear.com 7 | 8 | #NAME 9 | # SSPACE Marten Boetzer - Walter Pirovano November 2011 10 | 11 | #SYNOPSIS 12 | # SSAKE-based Scaffolding of Pre-Assembled Contigs after Extension (SSPACE) 13 | 14 | #DOCUMENTATION 15 | # README, MANUAL and TUTORIAL distributed with this software @ www.baseclear.com 16 | # Boetzer M, Henkel VJ, Jansen HJ, Butler D and Pirovano W. 2011. Scaffolding pre-assembled contigs using SSPACE. Bioinformatics 27(4) p578-9. 17 | # http://www.baseclear.com/sequencing/data-analysis/bioinformatics-tools/ 18 | # We hope this code is useful to you -- Please send comments & suggestions to Walter.Pirovano@baseclear.com 19 | # If you use either the SSPACE code or ideas, please cite our work appropriately and accurately 20 | 21 | #LICENSE 22 | # SSPACE Copyright (c) 2010-2011 BaseClear B.V. The Netherlands. All rights reserved. 23 | # SSAKE Copyright (c) 2006-2010 Canada's Michael Smith Genome Science Centre. All rights reserved. 24 | 25 | # This program is free software; you can redistribute it and/or 26 | # modify it under the terms of the GNU General Public License 27 | # as published by the Free Software Foundation; either version 2 28 | # of the License, or (at your option) any later version. 29 | 30 | # This program is distributed in the hope that it will be useful, 31 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 32 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 33 | # GNU General Public License for more details. 
34 | 35 | # note: insert size and distance between pairing reads are used interchangeably 36 | 37 | #MAJOR CHANGES ON SSAKE V3.4 TO FORM SSPACE 38 | # -New scaffolding feature dealing with contigs having multiple alternatives 39 | # -Separate scripts to decrease the memory usage 40 | # -Automatic filtering of reads and duplicate mate pairs 41 | # -Option for contig extension on unfiltered reads 42 | # -Removed tracking of reads during contig extension 43 | # -Mapping single reads to extended and non extended contigs 44 | # -Single reads mapped more than once to a contig are removed for scaffolding 45 | # -A summary file is generated containing detailed information about the scaffolding process 46 | # -An evidence file is generated which indicates the contigs present in the scaffolds 47 | # -Optional; Scaffolds and their contigs are visualised by generating a .dot file 48 | 49 | #MAJOR CHANGES ON SSPACE Basic v2.0; 50 | 51 | # GENERAL 52 | # -Last column of the library file should be the orientation of the reads, instead of indication of being reverse complement or not. Options are FR, FF, RF and RR. 53 | # -Fixed some bugs in the summary file and removed some useless information. 54 | # -Included the -z option which specifies the minimal contig length that will be used for scaffolding. Contigs below this length are discarded for scaffolding. 55 | # -Included the possibility to include TAB delimited files with read mapping information, format is; ctg1 start1 end1 ctg2 start2 end2 56 | # - if a read is reverse complement on a contig, start and end should be turned around e.g.
ctg1 100 150 ctg2 150 100 indicates that the second read is reverse complement on ctg2 57 | # - No contig filtering can be applied if TAB delimited files are included 58 | # - See MANUAL for more information of how to use the tab file option 59 | # -Included some scripts to convert a .sam file to a .tab file 60 | 61 | # BOWTIE 62 | # -Included the -g option to specify maximum allowed gaps for Bowtie. This option corresponds to the -v option in Bowtie. 63 | # -Now able to do multithreaded Bowtie using the -T option (-T 3 does 3 threads). This option corresponds to the -p option in Bowtie. 64 | 65 | # READING FILES: 66 | # -Speeded up the reading of the library files for a single threaded run 67 | # -Now able to read multiple libraries at once using the multithread -T option. -T 3 reads three files at the same time. 68 | 69 | # CONTIG EXTENSION 70 | # -Included the -r option for contig extension (default is 0.9). 71 | # -Speeded up and reduced the memory usage during the contig extension. 72 | # - SSPACE reads in the output of Bowtie at once, rather than reading it from the output file. 73 | # - Faster check for presence of subsequence of a read, thereby able to faster check for overlapping sequences with the contig. 74 | 75 | # SCAFFOLDING 76 | # -Combined the functions readBowtie and pairContigs, which saves runtime and memory. 77 | # -Saving runtime by reading Bowtie results in at once, instead of reading it from Bowtie's output file. 78 | # -Included a pre-filtering step of multiple alternative contig links before scaffolding. This step was previously done during scaffolding, now it's a step before scaffolding. It reduces the number of errors within the scaffolds. 79 | # -Additional check to connect two alternative contigs, making the scaffolds more reliable, especially with mate pair libraries. The search space is included in the calculation of the ratio, rather than looking at the number of links only. See the README file for more information. 
# -Calculation of mean insert size based on mapped read pairs on same contig. Users can choose this value for better estimation of gap sizes. Especially for paired-end sequences.

# -Fixed a bug in the mergeContigs function. Indication of contigs merged in previous libraries were not displayed in the final .evidence file.

#-------------------------------------------------LOAD PACKAGES AND DEFINE VARIABLES
use strict;
use Storable;
use Getopt::Std;
use File::Path;
use File::Basename;

#Specify path to DotLib
use FindBin qw($Bin);
use lib "$Bin/dotlib/";
use DotLib;

use vars qw($opt_m $opt_o $opt_v $opt_p $opt_k $opt_a $opt_z $opt_s $opt_b $opt_n $opt_l $opt_x $opt_u $opt_t $opt_T $opt_g $opt_r);
getopts('m:o:v:p:k:a:z:s:b:n:l:x:u:t:T:g:r:');
my ($base_overlap, $min_overlap, $verbose, $MIN_READ_LENGTH, $SEQ_SLIDE, $min_base_ratio, $min_links, $max_link_ratio, $unpaired_file, $max_trim, $base_name, $max_count_trim, $min_tig_overlap, $doplot, $extending, $threads, $minContigLength, $gaps, $unpaired, $gapclosure) = (20, 32, 0, 16, 1, 0.9, 5, 0.7, "no-u", 0, "standard_output", 10, 15, 0, 0, 1, 0, 0, 0, 0);

my $version = "[SSPACE_Basic v2.1]";
my $seplines = ("-" x 60)."\n";
my ($MAX, $MAX_TOP, $TRACK_COUNT) = (0, 100, 1);# $MAX_TOP is the very maximum anchoring edge sequence that will be searched

#-------------------------------------------------READ OPTIONS

if (!($opt_l) || !($opt_s)) {
    print STDERR "ERROR: Parameter -l is required. Please insert a library file\n" if (!$opt_l);
    print STDERR "ERROR: Parameter -s is required. Please insert a contig FASTA file\n" if (!$opt_s);
    my $error_msg = <<"END_MSG";
\nUsage:\n
============ General Parameters ============\n
-l Library file containing two paired read files with insert size, error and either mate pair or paired end indication (REQUIRED)\n
-s FASTA file containing contig sequences used for extension. Inserted pairs are mapped to extended and non-extended contigs (REQUIRED)\n
-x Indicate whether to extend the contigs of -s using paired reads in -l (-x 1=extension, -x 0=no extension, default -x $extending)\n
============ Extension Parameters ============\n
-m Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m $min_overlap)\n
-o Minimum number of reads needed to call a base during an extension (default -o $base_overlap)\n
-t Trim up to -t base(s) on the contig end when all possibilities have been exhausted for an extension (default -t $max_trim)\n
-u FASTA/FASTQ file containing unpaired sequence reads (optional)\n
-r Minimum base ratio used to accept a overhang consensus base (default -r $min_base_ratio)\n
============ Scaffolding Parameters ============\n
-z Minimum contig length used for scaffolding. Filters out contigs below this value (default -z $minContigLength)\n
-k Minimum number of links (read pairs) to compute scaffold (default -k $min_links)\n
-a Maximum link ratio between two best contig pairs. *Higher values lead to least accurate scaffolding* (default -a $max_link_ratio)\n
-n Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -n $min_tig_overlap)\n
============ Bowtie Parameters ============\n
-g Maximum number of allowed gaps during mapping with Bowtie. Corresponds to the -v option in Bowtie. *Higher number of allowed gaps can lead to least accurate scaffolding* (default -g $gaps)\n
-T Specify the number of threads in Bowtie. Corresponds to the -p/--threads option in Bowtie (default -T $threads)\n
============ Additional Parameters ============\n
-b Base name for your output files (default -b $base_name)\n
-v Runs in verbose mode (-v 1=yes, -v 0=no, default -v $verbose)\n
-p Make .dot file for visualisation (-p 1=yes, -p 0=no, default -p $doplot)
END_MSG
    die $error_msg;
}

# An option counts as "given" when it is defined and non-empty. Testing plain
# truthiness would silently discard an explicit 0 (e.g. -k 0 or -r 0) and
# would let an invalid -x value slip past the validation below.
my $given = sub { return (defined $_[0] && $_[0] ne ''); };

my $libraryfile = $opt_l;
my $filecontig  = $opt_s;
$extending       = $opt_x if ($given->($opt_x));
$min_overlap     = $opt_m if ($given->($opt_m));
$base_overlap    = $opt_o if ($given->($opt_o));
$max_trim        = $opt_t if ($given->($opt_t));
$unpaired_file   = $opt_u if ($given->($opt_u));
$min_base_ratio  = $opt_r if ($given->($opt_r));
$minContigLength = $opt_z if ($given->($opt_z));
$min_links       = $opt_k if ($given->($opt_k));
$max_link_ratio  = $opt_a if ($given->($opt_a));
$min_tig_overlap = $opt_n if ($given->($opt_n));
$gaps            = $opt_g if ($given->($opt_g));
$threads         = $opt_T if ($given->($opt_T));
$base_name       = $opt_b if ($given->($opt_b));
$verbose         = $opt_v if ($given->($opt_v));
$doplot          = $opt_p if ($given->($opt_p));

#-------------------------------------------------CHECKING PARAMETERS
die "ERROR: Invalid (-l) library file $libraryfile ...Exiting.\n" if (! -e $libraryfile);
die "ERROR: Invalid (-s) contig file $filecontig ...Exiting.\n" if (! -e $filecontig);
die "ERROR: -x must be either 0 or 1. Your inserted -x is $extending...Exiting.\n" if ($extending !~ /^[01]$/);
die "ERROR: -m must be a number between 10-50. Your inserted -m is $min_overlap ...Exiting.\n" if (!($min_overlap =~ /^\d+$/) || $min_overlap < 10 || $min_overlap > 50);
die "ERROR: -o must be set to 1 or higher. Your inserted -o is $base_overlap ...Exiting.\n" if ($base_overlap < 1);
die "ERROR: -t must be a positive integer. Your inserted -t is $max_trim ...Exiting.\n" if (!($max_trim =~ /^\d+$/));
die "ERROR: Invalid unpaired file $unpaired_file -- fatal\n" if (! -e $unpaired_file && $opt_u);
die "ERROR: -r must be a number between 0.0 and 1.0. Your inserted -r is $min_base_ratio ...Exiting.\n" if ($min_base_ratio < 0 || $min_base_ratio > 1);
die "ERROR: -z must be a positive integer. Your inserted -z is $minContigLength...Exiting.\n" if (!($minContigLength =~ /^\d+$/));
die "ERROR: -k must be a positive integer. Your inserted -k is $min_links ...Exiting.\n" if (!($min_links =~ /^\d+$/));
die "ERROR: -a must be a number between 0.0 and 1.0. Your inserted -a is $max_link_ratio ...Exiting.\n" if ($max_link_ratio < 0 || $max_link_ratio > 1);
die "ERROR: -n must be a positive integer. Your inserted -n is $min_tig_overlap ...Exiting.\n" if (!($min_tig_overlap =~ /^\d+$/));
die "ERROR: -g must be a positive integer between 0 and 3. Your inserted -g is $gaps...Exiting.\n" if (!($gaps =~ /^\d+$/) || $gaps > 3);
die "ERROR: -T must be a positive integer. Your inserted -T is $threads...Exiting.\n" if (!($threads =~ /^\d+$/) || $threads < 1);
die "ERROR: -p must be either 0 or 1. Your inserted -p is $doplot...Exiting.\n" if (!($doplot == 0 || $doplot == 1));

#-------------------------------------------------check library file;
# File-scoped state shared by the validation pass, the scaffolding loop, the
# finalisation code and mergeContigs (which reads $headscaffolds directly).
my ($min_allowed, $library, $lib, $fileA, $fileB, $insert_size, $insert_stdev, $orientation, $pair, $headscaffolds, $prevlib, $mergedtigs, $evidencefile);

open(my $libcheck_fh, '<', $libraryfile) || die "Can't open $libraryfile -- fatal\n";
my $countline = 0;
while (my $entry = <$libcheck_fh>) {
    chomp $entry;
    $countline++;
    my @line = split(/\s+/, $entry);
    next if (!@line);    # blank line

    die "ERROR: Line $countline in your library file ($libraryfile) contains $#line spaces, which should be 5 spaces. Check that no spaces are within the file names.\n" if ($#line != 5);

    # Lexical copies: this pass only validates and must not touch the
    # file-scoped variables used by the scaffolding loop further down.
    my ($library, $fileA, $fileB, $insert_size, $insert_stdev, $orientation) = split(/\s+/, $entry);
    if ($fileA ne "TAB") {
        die "ERROR: Invalid file in library $library: $fileA -- fatal\n" if (! -e $fileA);
    }
    else {
        die "ERROR: Can't apply filtering using the -z option (-z = $minContigLength) and insertion of a TAB file -- fatal\n" if ($minContigLength > 0);
    }
    die "ERROR: Invalid file in library $library: $fileB -- fatal\n" if (! -e $fileB);
    die "ERROR: Insert size should be an integer higher than 0. Your library $library has insert size of $insert_size. Exiting.\n" if (!($insert_size > 0) || !($insert_size =~ /^\d+$/));
    die "ERROR: Insert stdev must be a number between 0.00 and 1.00. Your library $library has insert stdev of $insert_stdev. Exiting.\n" if ($insert_stdev < 0 || $insert_stdev > 1 || !($insert_stdev * 1 eq $insert_stdev));
    die "ERROR: Orientation must have length of 2 characters and should contain one of the following; FF, FR, RF or RR. Your library $library has orientation of $orientation ...Exiting.\n" if (!(length($orientation) == 2) || !($orientation =~ /[FR][FR]/));
}
close $libcheck_fh;
#-------------------------------------------------Make folder structure
mkpath('intermediate_results');
mkpath('pairinfo');
mkpath('reads');
mkpath('bowtieoutput');

$unpaired = $unpaired_file if (defined $opt_u && -e $opt_u && $extending == 1);
#-------------------------------------------------Print input parameters
my $contig = "intermediate_results/" . $base_name . ".formattedcontigs.fasta";

my $log = $base_name . ".logfile.txt";
my $summaryfile = $base_name.".summaryfile.txt";
open(LOG, '>', $log) || die "Can't write to $log -- fatal\n";

# Create/truncate the summary file; the helper scripts append to it.
open(SUMFILE, '>', $summaryfile) || die "Can't open $summaryfile -- fatal\n";
close SUMFILE;

my $init_message = "Your inserted inputs on $version at ".getDate().":\nRequired inputs: \n\t-l = $libraryfile\n\t-s = $filecontig\n\t-b = $base_name\n\n";
$init_message .= "Optional inputs:\n\t-x = $extending\n\t-z = $minContigLength\n\t-k = $min_links\n";
$init_message .= "\t-a = $max_link_ratio\n\t-n = $min_tig_overlap\n\t-T = $threads\n\t-p = $doplot\n\n";

$init_message .= "Contig extension inputs:\n\t-o = $base_overlap\n\t-t = $max_trim\n\t-m = $min_overlap\n\t-r = $min_base_ratio\n\n" if ($extending == 1);

printMessage($init_message);
close LOG;
#-------------------------------------------------READING AND CONVERTING INPUT SEQUENCES
# List-form system() bypasses the shell, so paths containing spaces or shell
# metacharacters reach the helper scripts intact.
system($^X, "$Bin/bin/readLibFiles.pl", $libraryfile, $base_name, $extending, $unpaired, $min_overlap, $threads);
checkStatus();
#-------------------------------------------------FORMATTING OR EXTENDING CONTIGS
system($^X, "$Bin/bin/ExtendOrFormatContigs.pl", $contig, $base_name, $extending, $filecontig, $MIN_READ_LENGTH, $base_overlap, $min_overlap, $min_base_ratio, $max_trim, $verbose, $Bin, $minContigLength, $libraryfile, $gaps, $threads);
checkStatus();
#--------------------------------------------------UPDATE SUMMARY FILE
open(SUMFILE, '>>', $summaryfile) || die "Can't open $summaryfile -- fatal\n";
open(LOG, '>>', $log) || die "Can't write to $log -- fatal\n";

#write summary of initial contigs
my $sumfile = "\nSUMMARY: \n".$seplines."\tInserted contig file;\n";
$sumfile = writesummaryfiles($filecontig, "contig", $sumfile);
#write summary of extended contigs
my $extended_tig = "intermediate_results/" . $base_name . ".extendedcontigs.fasta";
$sumfile .= "\tAfter extension;\n" if ($extending);
$sumfile = writesummaryfiles($extended_tig, "contig", $sumfile) if ($extending);

#write summary of filtered contigs
if ($minContigLength > 0) {
    $sumfile .= "\tAfter filtering (z >= $minContigLength);\n";
    $sumfile = writesummaryfiles($contig, "contig", $sumfile);
}
else {
    $contig = $extended_tig if ($extending);
}
FlushFiles();
close LOG;
close SUMFILE;

#--------------------------------------------------GO THROUGH EACH LIBRARY AND SCAFFOLD
open(my $lib_fh, '<', $libraryfile) || die "Can't open $libraryfile -- fatal\n";

while (my $entry = <$lib_fh>) {
    chomp $entry;
    FlushFiles();
    ($lib, $fileA, $fileB, $insert_size, $insert_stdev, $orientation) = split(/\s+/, $entry);
    # Consecutive lines with the same library name are scaffolded once.
    next if ($lib eq $prevlib || $lib eq '');

    my $tabfile = 0;
    $tabfile = 1 if ($fileA eq "TAB");

    $prevlib = $lib;
    $min_allowed = -1 * ($insert_stdev * $insert_size);

    open(LOG, '>>', $log) || die "Can't write to $log -- fatal\n";
    printMessage("\nLIBRARY $lib\n".$seplines);
    close LOG;

    open(SUMFILE, '>>', $summaryfile) || die "Can't open $summaryfile -- fatal\n";
    print SUMFILE "\n\nLIBRARY $lib STATS:\n".("#" x 80),"\n";
    close SUMFILE;

    my $scaffold = "intermediate_results/" . $base_name . ".$lib.scaffolds";
    $mergedtigs = "intermediate_results/" . $base_name . ".$lib.scaffolds.fasta";
    my $issues = "pairinfo/" . $base_name . ".$lib.pairing_issues";
    my $distribution = "pairinfo/" . $base_name . ".$lib.pairing_distribution.csv";

    #-------------------------------------------------MAPPING READ PAIRS USING FILTERED FASTA FILE
    mkpath("tmp.$base_name");
    #-------------------------------------------------Scaffold the contigs and generate .scaffold file
    my @pairing_args = ($Bin, $gaps, $contig, $base_name, $issues, $distribution, $verbose, $lib, $insert_size, $min_allowed, $scaffold, $min_links, $max_link_ratio, $orientation, $threads);
    if ($tabfile) {
        push @pairing_args, $tabfile, $fileB, $filecontig;
        # The evidence file only exists once an earlier library has been merged.
        push @pairing_args, $evidencefile if (defined $evidencefile && $evidencefile ne '');
    }
    system($^X, "$Bin/bin/PairingAndScaffolding.pl", @pairing_args);
    checkStatus();

    #retrieve the contigs that were stored
    my $contigstored = "tmp.$base_name/contigs.stored";
    my $contigs = retrieve("$contigstored");
    #-------------------------------------------------Generate .fasta file and .evidence file with scaffolds
    # LOG deliberately stays open past the end of the loop: the messages
    # printed after the last library are written through it.
    open(LOG, '>>', $log) || die "Can't write to $log -- fatal\n";
    ($headscaffolds, $evidencefile) = mergeContigs($scaffold, $contigs, $mergedtigs, 50, $verbose, $min_tig_overlap, $max_count_trim);
    $contig = $mergedtigs;    # the next library scaffolds these scaffolds
    #-------------------------------------------------write summary of scaffolds
    $sumfile .= "\tAfter scaffolding $lib:\n";
    $sumfile = writesummaryfiles($mergedtigs, "scaffold", $sumfile);

    #-------------------------------------------------
    open(SUMFILE, '>>', $summaryfile) || die "Can't open $summaryfile -- fatal\n";
    print SUMFILE ("#" x 80),"\n";
    close SUMFILE;
    printMessage("\n$seplines");
    undef $contigs;

    rmtree("tmp.$base_name"); #remove 'tmp' folder
}#END OF LIBRARY LOOP
close $lib_fh;

#-------------------------------------------------END OF LIBRARIES.
# PRINT SUMMARY TO FILE AND END SESSION
#
# Copies the evidence and FASTA files of the last scaffolded library to the
# final output names, optionally writes the .dot visualisation, appends the
# collected statistics to the summary file and reports the elapsed time.
my $finalfile = $base_name . ".final.scaffolds.fasta";
my $finalevfile = $base_name . ".final.evidence";

# Both variables are only set inside the library loop; without them there is
# nothing to copy.
die "ERROR: No library was scaffolded, so no final scaffolds can be written -- fatal\n"
    if (!defined $evidencefile || !defined $mergedtigs);

foreach my $copy ([$evidencefile, $finalevfile], [$mergedtigs, $finalfile]) {
    my ($from, $to) = @$copy;
    open(my $src, '<', $from) || die "Can't open $from -- fatal\n";
    open(my $dst, '>', $to) || die "Can't write to $to -- fatal\n";
    while (my $line = <$src>) {
        print {$dst} $line;
    }
    close $src;
    close($dst) || die "Can't write to $to -- fatal\n";
}

#make .dot file for visualisation
visualiseScaffolds($base_name.".visual_scaffolds", $evidencefile) if ($doplot);

open(SUMFILE, '>>', $summaryfile) || die "Can't open $summaryfile -- fatal\n";
printMessage("\n=>".getDate().": Creating summary file\n");
print SUMFILE $sumfile.$seplines;
my $time = (time - $^T);    # seconds since the script started
my $minutes = int ($time / 60);
$time = $time % 60;
printMessage(("*" x 50)."\n\nProcess run succesfully on ".getDate()." in $minutes"." minutes and $time"." seconds\n\n\n");
close LOG;
close SUMFILE;
#END OF MAIN PROGRAM

###MAKE A .FASTA FILE OF THE FOUND SCAFFOLDS.
### EITHER MERGE TWO CONTIGS WHEN A OVERLAP OF -n EXISTS OR PLACE A GAP
#
# mergeContigs($scaffold, $contigs, $mergedtigs, $chunk, $verbose,
#              $min_tig_overlap, $max_count_trim)
#
#   $scaffold        path of the scaffold file to read; every line is one
#                    scaffold: comma-separated, field 0 is the scaffold name
#                    and field 2 a '_'-joined list of contig tokens of the
#                    form <f|r><number>z<size>, optionally followed by
#                    k<links> and m<estimated distance>
#   $contigs         hash ref, $contigs->{<number>}{'seq'} holds the sequence
#   $mergedtigs      path of the scaffold FASTA file to write; the evidence
#                    file gets the same name with .fasta replaced by .evidence
#   $chunk           number of bases at a contig edge used for the overlap search
#   $verbose         print progress details when true
#   $min_tig_overlap minimum overlap (bases) required to merge two contigs
#   $max_count_trim  maximum number of bases that may be skipped at a contig
#                    edge while searching for the overlap
#
# Also reads the file-scoped $headscaffolds (evidence entries from the
# previous library, keyed by contig number) so that earlier merges are carried
# over into the new evidence file.
#
# Returns ($scaffoldHashStart, $evidence_file): a hash ref keyed by the
# 1-based scaffold counter holding each scaffold's evidence text under
# 'head', and the path of the evidence file that was written.
sub mergeContigs{

    my ($scaffold, $contigs, $mergedtigs, $chunk, $verbose, $min_tig_overlap, $max_count_trim) = @_;

    printMessage("\n=>".getDate().": Merging contigs and creating FASTA file of scaffolds\n");

    open(my $in, '<', $scaffold) || die "can't read $scaffold -- fatal\n";

    my $evidence_file = $mergedtigs;
    # Only the real extension is replaced; an unanchored /.fasta/ would also
    # hit e.g. a base name that happens to contain "fasta".
    $evidence_file =~ s/\.fasta$/.evidence/;
    open(my $scafs, '>', $evidence_file) || die "can't write to $evidence_file -- fatal\n";
    open(my $out, '>', $mergedtigs) || die "can't write to $mergedtigs -- fatal\n";
    my $scafhashcount = keys ( %$headscaffolds );
    my $scaffoldHashStart;
    my ($tot, $sct, $ct_merge, $step) = (0, 0, 0, 100);
    while (my $line = <$in>) {### each line is a scaffold
        chomp $line;
        my @fields = split(/\,/, $line);
        my @tig;

        if ($fields[2] =~ /\_/) {
            @tig = split(/\_/, $fields[2]);
        }
        else {
            push @tig, $fields[2];
        }
        if (++$sct == $step) {
            CounterPrint($sct);
            $step = $step + 100;
        }
        my ($ct, $tigsum, $mct, $word, $template, $seq, $prevseq, $headconcat, $prevEstimatedDistance) = (0, 0, 0, "NA", "NA", "", "", "", "");
        my $prevLinks;
        foreach my $t (@tig) {### each contig
            $ct++;

            next unless ($t =~ /([fr])(\d+)z(\d+)(\S+)?/i);

            my $orient = $1;
            my $tnum = $2;
            my $search = "tig" . $tnum;
            my $other = $4;
            $tot += $3;
            $tigsum += $3;

            my ($estimatedDistance, $links) = ("", "");
            $estimatedDistance = $1 if ($other =~ /m((\-)?\d+)/);
            $links = $1 if ($other =~ /k((\-)?\d+)/);
            print "\tSC $fields[0] - TIG $ct. pattern: $t search: $search totalTigSize: $tot Orientation: $orient Gap/Overlap estimated distance: $estimatedDistance\n" if ($verbose);

            my $count_trim = 0;

            $seq = $contigs->{$tnum}{'seq'};
            $seq = reverseComplement($seq) if ($orient eq "r");
            chomp $seq;

            # Evidence text for this contig: taken over from the previous
            # library when there is one, otherwise built from scratch.
            my $prev;
            if ($scafhashcount > 0) {
                $prev = $headscaffolds->{$tnum}{'head'};
                $prev =~ s/^\n//;
                chomp $prev;
                delete $headscaffolds->{$tnum};
                chomp $prev;
                $prev = _reverseScaffoldHead($prev) if ($orient eq "r");
            }
            else {
                $prev = "$orient"."_$search|size".length($seq);
            }
            $prev .= "|links$links|gaps$estimatedDistance" if ($links ne "");

            if ($word ne "NA") {
                #####
                if (length($seq) <= $chunk) {
                    $template = $seq;
                }
                else {
                    $template = substr($seq, 0, $chunk);
                }

                ##### word search
                # Shorten the previous contig's tail from the left until it
                # is found in the start of this contig; once it drops below
                # -n, retry with one more base trimmed off its right end.
                my $dynamic_word = $word;
                if ($prevEstimatedDistance <= 0) {
                    SCAN:
                    until ($template =~ /\Q$dynamic_word\E/) {
                        $dynamic_word = substr($dynamic_word, 1, length($dynamic_word));
                        if (length($dynamic_word) < $min_tig_overlap) {
                            $count_trim++;
                            last SCAN if ($count_trim >= $max_count_trim);
                            $dynamic_word = substr($word, 0, length($word) - $count_trim);
                        }
                    }
                }
                if ($prevEstimatedDistance <= 0 && $seq =~ /^\S{0,$max_count_trim}\Q$dynamic_word\E(.*)/) {### will grab the left-most match which is ok
                    my $tail = $1;
                    my $all = "ERROR_";
                    while ($prevseq =~ /^(.*)\Q$dynamic_word\E/ig) {
                        $all = $1;
                    }
                    print "$prevseq **** $all **** WORD:$word *** DWord:$dynamic_word *** COUNTTRIM:$count_trim\n" if ($all =~ /ERROR/);

                    # The shared overlap is written in lower case.
                    $prevseq = $all . lc($dynamic_word) . $tail;
                    my $overlap = length($dynamic_word);
                    $ct_merge++;
                    print "$ct_merge. GROUNDS FOR MERGING ($overlap nt overlap) !!!\n" if ($verbose);
                    $headconcat .= "|merged$overlap"."\n".$prev;
                }
                else {
                    ### ADDED RLW 5.MAR.2010
                    if ($prevEstimatedDistance <= 0) {
                        $prevseq .= "n" . $seq;
                    }
                    else {
                        $prevseq .= ("N" x $prevEstimatedDistance) . $seq;
                    }
                    $headconcat .= "\n".$prev;
                }
            }
            else {
                # first contig of the scaffold
                $prevseq = $seq;
                $headconcat = "\n".$prev;
                $mct++;
            }

            ##### For the next search
            if (length($seq) <= $chunk) {
                $word = $seq;
            }
            else {
                $word = substr($seq, length($seq) - $chunk, $chunk); ### this will be the next word to search with
            }
            $prevEstimatedDistance = $estimatedDistance;
            $prevLinks = $links;
        }#each tig
        my $scsz = length($prevseq);
        $scaffoldHashStart->{$sct}{'head'} = $headconcat;

        my @headlines = split(/\n/, $headconcat);
        print {$scafs} ">$fields[0]|size$scsz|tigs".($#headlines)."$headconcat\n\n";
        print {$out} ">$fields[0]|size$scsz\n$prevseq\n";
        $prevseq = '';
    }
    close $in;
    close $scafs;
    close $out;
    CounterPrint("                ");
    undef $contigs;
    FlushFiles();
    return ($scaffoldHashStart, $evidence_file);
}

# Rewrites the evidence entries of a scaffold that is now used in reverse
# orientation: the entry order is reversed, f/r are swapped in each entry
# name, and every links/gap(/merged) annotation moves to the entry that now
# precedes the junction. Returns the input unchanged when it has no entries.
sub _reverseScaffoldHead{
    my ($head) = @_;

    my @entries = split("\n", $head);
    return $head if ($#entries < 0);

    my $reversed = "";
    my ($tig, $size, $links, $gap, $pending, $merge) = ("", "", "", "", "", "");
    for (my $i = $#entries; $i >= 0; $i--) {
        my @info = split(/\|/, $entries[$i]);
        if ($#info == 1) {
            # entry without annotations: $links/$gap/$merge keep their values
            ($tig, $size) = split(/\|/, $entries[$i]);
        }
        else {
            ($tig, $size, $links, $gap, $merge) = split(/\|/, $entries[$i]);
        }
        $tig =~ tr/fr/rf/;
        if ($pending ne "") {
            $reversed .= $pending."|".$links."|".$gap."\n" if ($merge eq "");
            $reversed .= $pending."|".$links."|".$gap."|".$merge."\n" if ($merge ne "");
        }
        $pending = $tig."|".$size;
    }
    $reversed .= $pending;
    return $reversed;
}
###WRITE SUMMARY STATISTICS FOR ALL CONTIGS OR
### SCAFFOLDS
#
# writesummaryfiles($input_file, $insert, $sumfile)
#
#   $input_file  FASTA file to summarise
#   $insert      label used in the report ("contig" or "scaffold")
#   $sumfile     summary text collected so far
#
# Returns $sumfile with the statistics of $input_file appended: number of
# sequences, total bases, number of N/n bases, max/min/average length and N50.
# An empty input yields zeros instead of a division-by-zero error.
sub writesummaryfiles{
    my ($input_file, $insert, $sumfile) = @_;

    open(my $in, '<', $input_file) || die "Can't open input file $input_file.\n";

    my ($sum, $totalNcount, $seq, $in_record) = (0, 0, "", 0);
    my @lengths;
    # Closes the record collected so far.
    my $flush = sub {
        push(@lengths, length($seq));
        $sum += length($seq);
        $totalNcount += ($seq =~ tr/Nn//);
        $seq = "";
    };
    while (my $line = <$in>) {
        $line =~ s/\r\n/\n/;
        chomp $line;
        if ($line =~ /^[>]/) {
            $flush->() if ($in_record);
            $in_record = 1;
        }
        elsif ($in_record) {
            $seq .= $line;
        }
    }
    $flush->() if ($in_record);
    close $in;

    my $counter = scalar @lengths;
    my $half_length = $sum / 2;
    my @sorted = sort { $b <=> $a } @lengths;    # longest first

    # N50: length of the sequence at which the running total of the
    # longest-first lengths reaches half of all bases.
    my ($sumN50, $foundN50) = (0, 0);
    foreach my $len (@sorted) {
        $sumN50 += $len;
        if ($sumN50 >= $half_length) {
            $foundN50 = $len;
            last;
        }
    }
    my $max_size = $counter ? $sorted[0] : 0;
    my $min_size = $counter ? $sorted[-1] : 0;
    my $average = $counter ? int($sum / $counter) : 0;

    $sumfile .= "\t\tTotal number of $insert"."s = $counter\n";
    $sumfile .= "\t\tSum (bp) = ". $sum. "\n";
    $sumfile .= "\t\t\tTotal number of N's = $totalNcount\n";
    $sumfile .= "\t\t\tSum (bp) no N's = ". ($sum-$totalNcount)."\n";
    $sumfile .= "\t\tMax $insert size = ". $max_size."\n";
    $sumfile .= "\t\tMin $insert size = ". $min_size."\n";
    $sumfile .= "\t\tAverage $insert size = ".$average."\n";
    $sumfile .= "\t\tN50 = ". $foundN50. "\n\n";

    return $sumfile;
}


###FUNCTION TO GENERATE A VISUALISATION OF THE SCAFFOLDS AND THEIR CONTIGS IN .DOT FORMAT
#
#   $dotname   base name of the .dot files (written to dotfiles/<name>.part<N>.dot)
#   $evidence  evidence file to read: '>' lines start a scaffold, lines
#              starting with f or r describe one contig as
#              <orient>_<tig>|<size>[|links<n>|gaps<n>...]
#
# A new part file is started at a scaffold boundary once the current part is
# larger than 5000000 bytes. Uses the DotLib print helpers.
sub visualiseScaffolds{
    my ($dotname, $evidence) = @_;
    my ($filext, $sizecutoff) = (1, 5000000);
    mkpath('dotfiles');
    my $filename2 = "dotfiles/$dotname.part".$filext.".dot";
    printMessage("\n=>".getDate().": Producing .dot file for visualisation\n");

    open(my $in, '<', $evidence) || die "can't read $evidence -- fatal\n";
    open(DOT, '>', $filename2) || die "can't open $filename2 -- fatal\n";
    printHeader(\*DOT, undef);
    my ($prevtig, $prevgap, $prevlinks, $scafcount) = ("", "", "", 0);
    while (my $line = <$in>) {
        chomp $line;

        if ($line =~ /^[>]/) {
            endCluster(\*DOT) if ($scafcount > 0);
            my $filesize = -s $filename2;
            if ($filesize > $sizecutoff) {
                printFooter(\*DOT);
                close(DOT);
                $filext++;
                # keep follow-up parts in dotfiles/ next to the first part
                $filename2 = "dotfiles/$dotname.part".$filext.".dot";
                open(DOT, '>', $filename2) || die "can't open $filename2 -- fatal\n";
                printHeader(\*DOT, undef);
            }
            $scafcount++;
            $line =~ tr/[>\|]/ /;
            startCluster(\*DOT, $scafcount, "$line");
            ($prevtig, $prevgap, $prevlinks) = ("", "", "");
        }
        elsif ($line =~ /^[fr]/) {
            my @info = split(/\|/, $line);
            my ($tnum, $sizetig, $links, $gap);
            if ($#info == 1) {
                ($tnum, $sizetig) = split(/\|/, $line);
            }
            else {
                ($tnum, $sizetig, $links, $gap) = split(/\|/, $line);
            }
            my ($orient, $tig) = split(/_/, $tnum);
            my (undef, $gap2) = split(/gaps/, $gap);
            my (undef, $links2) = split(/links/, $links);
            my $ori = ($orient eq "f") ? 1 : -1;
            printNode(\*DOT, $tig, "$tig ($sizetig)", $ori);
            # the edge to this contig carries the annotation stored with the previous one
            printEdge(\*DOT, $prevtig, $tig, "gap = $prevgap links = $prevlinks", undef) if ($prevtig ne "");

            $prevtig = $tig;
            $prevgap = $gap2;
            $prevlinks = $links2;
        }
    }
    endCluster(\*DOT) if ($scafcount > 0);
    printFooter(\*DOT);
    close(DOT);
    close $in;
}


###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
# Complements upper-case A/T/G/C (every other character is kept as is) and
# returns the reversed string. Works on a private copy, so the caller's $_ is
# left alone, and returns the string in list context as well.
sub reverseComplement{
    my ($sequence) = @_;
    $sequence =~ tr/ATGC/TACG/;
    return scalar reverse($sequence);
}

###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
sub printMessage{
    my ($message) = @_;
    print $message;
    print LOG $message;
}

###FUNCTION TO GET THE CURRENT DATE
sub getDate{
    return scalar(localtime);
}

###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
sub CounterPrint{
    my ($countingMessager) = @_;
    print "\r$countingMessager";
    $|++;
}

###FLUSHES THE SUMMARY AND LOG FILE
sub FlushFiles{
    # turn on autoflush for each handle, restoring the selected handle afterwards
    select((select(SUMFILE), $| = 1)[0]);
    select((select(LOG), $| = 1)[0]);
    $|++;
}
#########END MAIN SCRIPT


# Stops the run when the helper script that just finished did not leave its
# "process_OK" marker directory behind; otherwise removes the marker so the
# next helper starts clean.
sub checkStatus{
    if (!(-d "process_OK")) {
        printMessage(("*" x 50)."\n\nProcess failed on ".getDate()."\n\n\n");
        exit 1;
    }
    rmtree("process_OK");
}

--------------------------------------------------------------------------------
/SSPACE_Basic_v2.0.pl:
--------------------------------------------------------------------------------
1 | SSPACE_Basic.pl
--------------------------------------------------------------------------------
/bin/ExtendOrFormatContigs.pl:
--------------------------------------------------------------------------------
###############################################################
#Marten Boetzer 1-03-2010                                     #
#SSPACE perl subscript ExtendOrFormatContigs.pl               #
#This script, based on the -x parameter;                      #
#  -Formats the contigs to appropriate format (-x 0)          #
#  -Extends the contigs with available
unmapped reads (-x 1) # 7 | ############################################################### 8 | 9 | use strict; 10 | use File::Basename; 11 | use File::Path; 12 | 13 | my ($MAX, $MAX_TOP, $TRACK_COUNT) = (0, 100, 1); 14 | 15 | my $seplines = ("-" x 60)."\n"; 16 | 17 | my $contig = $ARGV[0]; 18 | my $base_name = $ARGV[1]; 19 | my $extending = $ARGV[2]; 20 | my $filecontig = $ARGV[3]; 21 | my $MIN_READ_LENGTH = $ARGV[4]; 22 | my $base_overlap = $ARGV[5]; 23 | my $min_overlap = $ARGV[6]; 24 | my $min_base_ratio = $ARGV[7]; 25 | my $max_trim = $ARGV[8]; 26 | my $verbose = $ARGV[9]; 27 | my $minContigLength = $ARGV[11]; 28 | my $libraryfile = $ARGV[12]; 29 | my $gaps = $ARGV[13]; 30 | my $threads = $ARGV[14]; 31 | 32 | my $log = $base_name . ".logfile.txt"; 33 | my $summaryfile = $base_name.".summaryfile.txt"; 34 | 35 | open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n"; 36 | open (LOG, ">>$log") || die "Can't write to logfile$log -- fatal\n"; 37 | my $filenameOutExt = $base_name . ".singlereads.fasta"; 38 | my ($bin); 39 | if($extending == 1){ 40 | 41 | &ExtendContigs($base_name, $filecontig, $filenameOutExt); 42 | print SUMFILE "\n" if($minContigLength > 0); 43 | &FormatContigs() if($minContigLength > 0); 44 | }else{ 45 | &FormatContigs(); 46 | } 47 | 48 | close SUMFILE; 49 | close LOG; 50 | 51 | mkpath('process_OK'); 52 | #-------------------------------------------------- 53 | 54 | ###EXTEND CONTIGS WITH UNMAPPED READS 55 | sub ExtendContigs{ 56 | my ($base_name, $filecontig, $filenameOutExt) = @_; 57 | my ($seq); 58 | #-------------------------------------------------NOW MAP SINGLE READS TO INITIAL CONTIGS FILE. 59 | my $readfile = "reads/" . $filenameOutExt; 60 | &getUnmappedReads($filecontig, $readfile); 61 | #-------------------------------------------------CONTIG EXTENSION USING UNMAPPED PAIRS STORED IN $SET 62 | &printMessage("\n=>".getDate().": Contig extension initiated\n"); 63 | my $outfileTig = "intermediate_results/" . 
$base_name . ".extendedcontigs.fasta"; 64 | 65 | open (TIG, ">$outfileTig") || die "Can't write to $outfileTig -- fatal\n"; 66 | #--------------------------------------------ASSEMBLY START 67 | 68 | ASSEMBLY: 69 | open(IN, $filecontig) || die "Can't open $filecontig -- fatal\n"; 70 | my ($exttig_count, $counter, $NCount, $orig_mer, $prevhead) = (0, 0, 0, 0, ''); 71 | while(){ 72 | s/\r\n/\n/; 73 | chomp; 74 | $seq.= uc($_) if(eof(IN)); 75 | if (/\>(\S+)/ || eof(IN)){ 76 | my $head=$1; 77 | $orig_mer = length($seq); 78 | if($seq ne ''){ 79 | $NCount++ if($seq=~/([NX])/i); 80 | my $start_sequence = uc($seq); 81 | my $reads_needed = 1; #tracks coverage 82 | my $total_bases = $orig_mer * $reads_needed; 83 | 84 | ($seq, $reads_needed, $total_bases) = doExtension("3", $orig_mer, $seq, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $counter, $max_trim) if($orig_mer >= $MIN_READ_LENGTH && $orig_mer >= $min_overlap); 85 | 86 | my $seqrc = reverseComplement($seq); 87 | ($seqrc, $reads_needed, $total_bases) = doExtension("5", $orig_mer, $seqrc, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $counter, $max_trim) if($orig_mer >= $MIN_READ_LENGTH && $orig_mer >= $min_overlap); 88 | 89 | my $leng = length($seqrc); 90 | my $reversetig = reverseComplement($seqrc); ### return to sequence, as inputted 91 | if($leng > $orig_mer){ ### commented out: && $start_sequence ne $seqrc && $start_sequence ne $reversetig 92 | my $cov = $total_bases / $leng; 93 | printf TIG ">extcontig%i|size%i|read%i|cov%.2f|seed:$prevhead\n%s\n", ($counter, $leng, $reads_needed, $cov, $reversetig); #print contigs to file 94 | $exttig_count++; 95 | }else{ 96 | my $cov = $reads_needed = 0; 97 | my $singlet_leng = length($start_sequence); 98 | printf TIG ">contig%i|size%i|read%i|cov%.2f|seed:$prevhead\n%s\n", ($counter, $leng, $reads_needed, $cov, $reversetig); #print singlets to file 99 | } 100 | } 101 | CounterPrint(++$counter); 102 | 
$prevhead = $head; 103 | $seq=''; 104 | }else{ 105 | $seq .= uc($_); 106 | } 107 | } 108 | CounterPrint(" "); 109 | print SUMFILE "\tNumber of contig sequences =".($counter-1). "\n"; 110 | print SUMFILE "\t\tNumber of contigs containing N's (may prevent proper contig extension) = $NCount\n"; 111 | 112 | print SUMFILE "\tNumber of contigs extended = $exttig_count\n".$seplines; 113 | close IN; 114 | $filecontig = $outfileTig; 115 | if($@){ 116 | my $message = $@; 117 | &printMessage("\nSomething went wrong running $0 ".getDate()."\n$message\n"); 118 | } 119 | close TIG; 120 | } 121 | 122 | ###STORE CONTIGS TO APPROPRIATE FORMAT WHEN CONTIGS WILL NOT BE EXTENDED 123 | sub FormatContigs{ 124 | &printMessage("\n=>".getDate().": Storing contigs to format for scaffolding\n"); 125 | open (TIG, ">$contig") || die "Can't write to $contig -- fatal\n"; 126 | open(IN, $filecontig) || die "Can't open $filecontig -- fatal\n"; 127 | my ($counter, $seq, $prevhead, $step) = (0, '', '', 100); 128 | while(){ 129 | s/\r\n/\n/; 130 | chomp; 131 | $seq.= uc($_) if(eof(IN)); 132 | if (/\>(\S+)/ || eof(IN)){ 133 | my $head=$1; 134 | my $length_seq = length($seq); 135 | if($seq ne '' && $length_seq >= $minContigLength){ 136 | if(++$counter == $step){ 137 | CounterPrint($counter); 138 | $step = $step + 100; 139 | } 140 | printf TIG ">contig%i|size%i|read%i|cov%.2f|seed:$prevhead\n%s\n", ($counter, $length_seq, 0, 0.00, $seq); 141 | } 142 | $prevhead = $head; 143 | $seq = ''; 144 | }else{ 145 | $seq .= uc($_); 146 | } 147 | } 148 | CounterPrint(" "); 149 | close IN; 150 | close TIG; 151 | } 152 | 153 | ###EXTEND CONTIGS 154 | sub doExtension{ 155 | 156 | my ($direction, $orig_mer, $seq, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $tig_count, $max_trim) = @_; 157 | 158 | my $previous = $seq; 159 | my ($extended, $trim_ct) = (1, 0); 160 | 161 | if($orig_mer > $MAX){$orig_mer=$MAX;} ### Deals with special cases where the seed sequences are different from 
the read set (and possibly very large) - goal here is not to increase sequence coverage of seed, but rather to extend it. 162 | 163 | TRIM: 164 | while($trim_ct <= $max_trim){ 165 | while($extended){ 166 | 167 | my ($pos, $current_reads, $current_bases, $span) = (0, 0, 0, ""); 168 | 169 | ### Added 19March08 170 | if(length($seq) >= $MAX){ # $seq is length of contig being extended -- if larger than largest read, make sure the largest read could align and all subsequent rds. 171 | $span = $MAX - $TRACK_COUNT; 172 | }else{ 173 | $span = length($seq) - $TRACK_COUNT; 174 | } 175 | my $startspan = $span; 176 | my $overhang = {}; 177 | my @overlapping_reads = (); 178 | for (my $x=1;$x <= ($orig_mer * 2);$x++){ 179 | ($overhang->{$x}{'A'}, $overhang->{$x}{'C'}, $overhang->{$x}{'G'}, $overhang->{$x}{'T'}) = (0, 0, 0, 0); 180 | } 181 | 182 | ### COLLECT SEQUENCES 183 | while ($span >= $min_overlap){ # will slide the subseq, until the user-defined min overlap size 184 | 185 | $pos = length($seq) - $span; 186 | print "MAX:$MAX, SPAN:$span, POS:$pos" if ($verbose); 187 | 188 | my $subseq = substr($seq, $pos, $span); #make a sub-sequence of length l-(1..i) for searching 189 | my $sub = substr($subseq, 0, 10); #grab first 10 nucleotides and get all reads having this subset stored in $bin 190 | my $subset = $bin->{$sub}; #Will grab everything even the reverse complement ones 191 | print "####$direction' SEARCH Position:$pos Span:$span - Subseq:$subseq Previous:$previous\n" if ($verbose); 192 | ### SEARCH -- this cycles through limited k-mer space 193 | foreach my $pass (keys %$subset){ 194 | my $pos = index($pass, $subseq); 195 | if($pos==0){ 196 | my $dangle = substr($pass, $pos+length($subseq)); 197 | #can we align perfectly that subseq to another rd start? 
198 | print "\n", "=" x 80, "\n$direction'- FOUND sequence: $pass -> subset: $subseq -> overhang: $dangle\n", "=" x 80, "\n\n" if ($verbose); 199 | 200 | # Collect all overhangs 201 | push @overlapping_reads, $pass; ### all overlapping reads 202 | my @over = split(//, $dangle); 203 | my $ct_oh = 0; 204 | 205 | foreach my $bz(@over){ 206 | $ct_oh++; ### tracks overhang position passed the seed 207 | $overhang->{$ct_oh}{$bz} += $bin->{$sub}{$pass}; 208 | print "$ct_oh - $bz = $overhang->{$ct_oh}{$bz}\n" if($verbose); 209 | } 210 | } 211 | } 212 | $span--; 213 | }#while overlap >= user-defined -m minimum 214 | 215 | my $consensus = ""; 216 | print "Finished Collecting Overlapping Reads - BUILDING CONSENSUS...\n" if ($verbose); 217 | # print Dumper(@overlapping_reads) if ($verbose); 218 | 219 | ### Build consensus 220 | CONSENSUS: 221 | foreach my $ohpos (sort {$a<=>$b} keys %$overhang){ 222 | if($ohpos){ 223 | 224 | my $coverage = $overhang->{$ohpos}{'A'}+$overhang->{$ohpos}{'C'}+$overhang->{$ohpos}{'G'}+$overhang->{$ohpos}{'T'}; 225 | print "pos:$ohpos cov:$coverage A:$overhang->{$ohpos}{'A'} C:$overhang->{$ohpos}{'C'} G:$overhang->{$ohpos}{'G'} T:$overhang->{$ohpos}{'T'}\n" if($verbose); 226 | if ($coverage < $base_overlap){ 227 | print "COVERAGE BELOW THRESHOLD: $coverage < -o $base_overlap @ $ohpos :: will extend by: $consensus\n" if ($verbose); 228 | last CONSENSUS; 229 | } 230 | my $baselist = $overhang->{$ohpos}; 231 | my ($ct_dna, $previous_bz) = (0, ""); 232 | BASE: 233 | foreach my $bz (sort {$baselist->{$b}<=>$baselist->{$a}} keys %$baselist){ 234 | if($ct_dna){## the two most abundant bases at that position 235 | if($previous_bz ne "" && ($baselist->{$previous_bz} / $coverage) >= $min_base_ratio && $baselist->{$previous_bz} > $baselist->{$bz}){### a simple consensus btw top 2 236 | $consensus .= $previous_bz; ### build consensus 237 | print "Added base $previous_bz (cov = $baselist->{$previous_bz}) to $consensus **\n" if ($verbose); 238 | last BASE; 239 | 
}else{ 240 | print "ISSUES EXTENDING: best base = $previous_bz (cov=$baselist->{$previous_bz}) at $ohpos. Second-Best: $bz (cov=$baselist->{$bz}) (ratio best=$baselist->{$previous_bz} / total=$coverage) >= $min_base_ratio (-r) -- will terminate with $consensus\n" if($verbose); 241 | last CONSENSUS; 242 | } 243 | } 244 | $previous_bz = $bz; 245 | $ct_dna++; 246 | } 247 | } 248 | } 249 | 250 | ### deal with sequence reads making up the consensus/newly formed contig 251 | if($consensus ne ""){ 252 | 253 | print "Will extend $seq\nwith: $consensus\n\n" if($verbose); 254 | my $temp_sequence = $seq . $consensus; ## this is the contig extension 255 | my $integral = 0; 256 | my $position = length($temp_sequence) - ($startspan + length($consensus)); 257 | my $temp_sequence_end = substr($temp_sequence, $position); 258 | foreach my $ro (@overlapping_reads){ 259 | if(index($temp_sequence_end, $ro) >= 0){ 260 | $integral=1; 261 | my $sub = substr($ro, 0, 10); 262 | $current_reads = $bin->{$sub}{$ro}; 263 | $current_bases = length($ro) * $current_reads; 264 | $reads_needed += $current_reads; 265 | $total_bases += $current_bases; 266 | deleteData($ro); 267 | } 268 | } 269 | if(! $integral){### no reads are found overlapping with the consensus might be indicative of low complexity regions -- Stop the extension 270 | print "No overlapping reads agree with the consensus sequence. 
Stopping extension" if ($verbose);
               $extended = 0;
            }else{
               $seq = $temp_sequence;
               $temp_sequence = "";
               print "New Contig is: $seq\n" if ($verbose);
               $extended = 1;
            }
            $previous = $seq;
         }else{### no consensus built, will stop the extension
            $extended = 0;
         }

      }###while get the OK for extension

      $trim_ct++;
      if ($trim_ct <= $max_trim){
         last TRIM if (length($seq) <= $MIN_READ_LENGTH); #terminate assembly if trimming becomes too agressive
         $seq = substr($seq, 0, -1);
         $extended = 1;
         print "\n$direction prime EXTENSION ROUND $trim_ct COMPLETE UNTIL $max_trim nt TRIMMED OFF => TRIMMED SEQUENCE:$seq\n\n" if ($verbose);
      }

   }### while trimming within bounds

   print "\n*** NOTHING ELSE TO BE DONE IN $direction prime- PERHAPS YOU COULD DECREASE THE MINIMUM OVERLAP -m (currently set to -m $min_overlap) ***\n\n" if ($verbose);

   return $seq, $reads_needed, $total_bases;
}


###DELETE READ DATA IF IT HAS BEEN USED FOR EXTENDING A CONTIG
#Removes a read and its reverse complement from the global read table $bin,
#which is keyed first by the 10-base prefix and then by the full sequence.
sub deleteData {
   my ($sequence) = @_;

   my $subnor = substr($sequence, 0, 10);
   my $comp_seq = reverseComplement($sequence);
   my $subrv = substr($comp_seq, 0, 10);

   #remove both orientations of the read from the hash table
   delete $bin->{$subrv}{$comp_seq};
   delete $bin->{$subnor}{$sequence};
}

###FUNCTION TO COLLECT THE READS THAT DO NOT MAP TO THE CONTIGS
#Builds a Bowtie index of $contigFile, maps every read file found under reads/
#and stores each unmapped read (and its reverse complement) in the global $bin.
#  $contigFile - FASTA file with the contigs to index
#  $readfiles  - accepted for interface compatibility; not used here
#Dies when the library file, the contig file or Bowtie cannot be used.
sub getUnmappedReads{
   my ($contigFile, $readfiles) = @_;
   my $library = "start";

   #obtain sequences to map against the contigs
   open(FILELIB, "< $libraryfile") || die "Can't open $libraryfile -- fatal\n";
   my $files;
   while(<FILELIB>){
      my ($lib) = split(/\s+/, $_);
      my $i = 1;
      #register every numbered read file of this library, not only the first
      while(-e "reads/$base_name.$lib.file$i.fa"){
         $files->{"reads/$base_name.$lib.file$i.fa"}++;
         $i++;
      }
   }
   close FILELIB;
   my $unpaired = "reads/$base_name.singlereads.fasta";
   $files->{$unpaired}++ if(-e $unpaired);
   my $fnames = join(",", keys %$files);

   #build bowtie index of contigs and map reads to the index
   my $bowtieout = $base_name . ".$library.bowtieIndex";
   die "Contig file ($contigFile) not found. Exiting...\n" if(!(-e $contigFile));
   &printMessage("\n=>".getDate().": Building Bowtie index for contigs\n");
   system("bowtie-build $contigFile bowtieoutput/$bowtieout --quiet --noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values
   &printMessage("\n=>".getDate().": Mapping reads to Bowtie index\n");
   my $procline = "bowtie -p $threads -v $gaps bowtieoutput/$bowtieout -f $fnames --quiet -S |";

   #map reads with bowtie and obtain unmapped reads. Store the unmapped reads into a hash and use them for contig extension
   open(IN, "$procline") || die "Can't open bowtie output -- fatal\n";
   my ($counter, $step) = (0, 100000);
   my ($rc, $orig_mer);
   while(<IN>){
      my @t = split(/\t/);
      next if ($t[2] ne '*');   #column 3 of the SAM line is '*' for an unmapped read
      if(++$counter == $step){
         CounterPrint($counter);
         $step = $step + 100000;
      }
      $orig_mer = length($t[9]);
      $rc=reverseComplement($t[9]);
      $MAX=$orig_mer if ($orig_mer > $MAX);
      $bin->{substr($t[9], 0, 10)}{$t[9]}++;
      $bin->{substr($rc, 0, 10)}{$rc}++;
   }
   close IN;

   print SUMFILE "CONTIG EXTENSION:\n".$seplines;
   print SUMFILE "\tNumber of unmapped reads used for contig extension = $counter\n";
   CounterPrint((" " x length($counter)));
}

###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE
#Works on a private copy so the caller's $_ is left untouched, and forces
#scalar context on reverse() so the string is returned in list context too.
#Only upper-case A, C, G and T are complemented, as before.
sub reverseComplement{
   my ($sequence) = @_;
   $sequence =~ tr/ATGC/TACG/;
   return scalar reverse($sequence);
}

###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
sub CounterPrint{
   my $countingMessager = shift;
   print "\r$countingMessager";
   $|++;   #switch on autoflush for the currently selected handle
}

###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE
sub printMessage{
   my $message = shift;
   print $message;
   print LOG $message;
}

###FUNCTION TO GET THE CURRENT DATE
sub getDate{
   my $date = scalar(localtime);
   return $date;
}

###FLUSHES THE SUMMARY AND LOG FILE
#Enables autoflush on SUMFILE and LOG, then on the currently selected handle.
sub FlushFiles{
   select((select(SUMFILE), $| = 1)[0]);
   select((select(LOG), $| = 1)[0]);
   $|++;
}

###CHECKS THAT THE PREVIOUS STEP LEFT ITS "process_OK" MARKER DIRECTORY
#Prints a failure message and exits when the marker is missing, otherwise
#removes the marker so the next step starts clean.
#NOTE(review): the failure path calls a bare exit (status 0) -- confirm whether callers expect a non-zero status.
sub checkStatus{
   &printMessage(("*" x 50)."\n\nProcess failed on ".getDate()."\n\n\n"), exit if(!(-d "process_OK"));
   rmtree(["process_OK"]);
}

#########END ExtendOrFormatContigs.pl

--------------------------------------------------------------------------------
/bin/PairingAndScaffolding.pl:
--------------------------------------------------------------------------------
###################################################
#Marten Boetzer 14-07-2011 #
#SSPACE perl subscript PairingAndScaffolding.pl #
#This script; #
# -reads the contig sequences in a hash #
# -stores Bowtie output in a hash #
# -pairs the contigs #
# -generates scaffolds #
###################################################


#THIS VERSION OF SCAFFOLDING FIRST ORDERS THE CONTIGS BASED ON THE NUMBER OF INGOING LINKS AND STARTS AT LOWEST LEVEL. AFTER ALL THESE CONTIGS ARE SCAFFOLDED, INGOING LINKS ARE RECALCULATED OF REMAINING CONTIGS, ITERATIVELY.
#ALSO, EACH CONTIG IS REPRESENTED ONCE IN THE SCAFFOLDS.
#METHOD OF SCAFFOLDING IS; IF MORE THAN ONE LINK, CHECK IF THOSE LINKS HAVE CONNECTION WITH EACH OTHER. IF SO, COMBINE THEM IN THE SCAFFOLD. IF NOT, ESTIMATE RATIO AND ONLY ALLOW EXTENSION OF SCAFFOLD IF IT'S BELOW THE RATIO THRESHOLD GIVEN BY THE USER.
#FUTURE: INCLUDE NUMBER OF REPEATS THAT ARE POSSIBLY PRESENT
use strict;
use Storable;
use File::Path;
use File::Basename;
my $seplines = ("-" x 60)."\n";
my $gaps = $ARGV[1];
my $contig = $ARGV[2];
my $base_name = $ARGV[3];
my $issues = $ARGV[4];
my $distribution = $ARGV[5];
my $verbose = $ARGV[6];
my $library = $ARGV[7];
my $insert_size = $ARGV[8];
my $min_allowed = $ARGV[9];
my $scaffold = $ARGV[10];
my $min_links = $ARGV[11];
my $max_link_ratio = $ARGV[12];
my $ori = $ARGV[13];
my $threads = $ARGV[14];
my $tab = $ARGV[15];
my $tabfile = $ARGV[16];
my $origctg = $ARGV[17];
my $prev_evidence = $ARGV[18];

my ($low_iz, $up_iz) = ($insert_size + $min_allowed, $insert_size - $min_allowed);
my $bowtiefile = "bowtieoutput/" . $base_name . ".$library.mapped";
my $log = $base_name . ".logfile.txt";
my $summaryfile = $base_name.".summaryfile.txt";

my ($total_for_median, $step,$ct_illogical, $ct_ok_contig, $ct_ok_pairs, $ct_problem_pairs, $ct_iz_issues, $ct_single, $ct_both, $counter)= (0,100000,0,0,0,0,0,0,0,0 );
my ($pair,$err,$track_insert, $tigOnScafHash, $tigHash);
my $pair_found = 0;

open (LOG, ">>$log") || die "Can't write to $log -- fatal\n";
open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n";
open(PET, ">$issues") || die "Can't open $issues for writing -- fatal\n";
#-------------------------------------------------READ CONTIGS INTO HASH AND STORE THEIR LENGTH. NEXT; PAIR THE CONTIGS
if($tab){
   parseEvidenceFile($prev_evidence) if($prev_evidence ne '');
   &updateContigs($origctg);
}
my ($contigstored, $tig_length) = &readFileContigHash($contig);
if(!$tab){
   my $up_iz = ($insert_size - $min_allowed);
   my $newcontig = processContig($contig, $up_iz) ;

   #collect file1, file2, ... of this library as a comma-separated list for Bowtie
   my $fname = "reads/$base_name.$library.file1.fa";
   if(-e $fname){
      my $fname2 = "reads/$base_name.$library.file2.fa";
      my $i = 2;
      while(-e $fname2){
         $fname = "$fname,$fname2";
         $i++;
         $fname2 = "reads/$base_name.$library.file$i.fa";
      }
   }
   mapReadsWithBowtie($newcontig, $fname,$gaps, $threads);
}else{
   parseTabFile($tigHash);
}
&printResultsPairing();

#-------------------------------------------------BUILDING SCAFFOLDS
&buildScaffolds($pair, $tig_length, $verbose, $scaffold, $library);
($pair, $tig_length) = ('',''); undef $pair; undef $tig_length;

close SUMFILE;
close LOG;
mkpath('process_OK');   #marker directory; its presence signals that this step finished

#-------------------------------------------------

###FUNCTION TO PARSE THE TAB FILE
#Reads the tab-delimited mapping file $tabfile (tig1, start1, end1, tig2,
#start2, end2 per line) and hands every pair whose two reads both fall inside
#a contig-edge window to pairContigs().
#  $tigHash - maps the contig header names to the internal contig numbers
#Dies when the file cannot be opened or names a contig missing from $tigHash.
sub parseTabFile{
   my ($tigHash) = @_;
   open(TAB, "$tabfile") || die "Can't open $tabfile for reading -- fatal\n";
   my $lower = ($up_iz+200);
   my $step = 1000000;
   &printMessage("\n=>".getDate().": Parsing Tab file\n");
   while(<TAB>){
      chomp;
      if(++$ct_both == $step){
         CounterPrint($ct_both);
         $step = $step + 1000000;
      }
      my ($tig1,$start1,$end1,$tig2,$start2,$end2) = split(/\t/);

      #check if the contig in the tab file is also present in the inserted contig fasta file
      if(!defined($tigHash->{$tig1})){
         die "\nERROR: could not find an header containing $tig1 at line number $ct_both. Exit...\n";
      }
      if(!defined($tigHash->{$tig2})){
         die "ERROR: could not find an header containing $tig2 at line number $ct_both. Exit...\n";
      }
      my $ctg1 = $tigHash->{$tig1};
      my $ctg2 = $tigHash->{$tig2};
      my ($track1, $track2) = ("","");

      #if multiple libraries were used, update the contig positions in the TAB File by finding its position in the scaffolds
      if($prev_evidence ne ''){
         $start1 = $start1 + $tigOnScafHash->{$ctg1}{'begin'};
         $end1 = $end1 + $tigOnScafHash->{$ctg1}{'begin'};
         $start2 = $start2 + $tigOnScafHash->{$ctg2}{'begin'};
         $end2 = $end2 + $tigOnScafHash->{$ctg2}{'begin'};
         if($tigOnScafHash->{$ctg1}{'direction'} eq "r"){
            my $tmp_start = ($tigOnScafHash->{$ctg1}{'end'}+$tigOnScafHash->{$ctg1}{'begin'}) - $start1;
            my $tmp_end = ($tigOnScafHash->{$ctg1}{'end'}+$tigOnScafHash->{$ctg1}{'begin'}) - $end1;
            $start1 = $tmp_start;
            $end1 = $tmp_end;
         }
         if($tigOnScafHash->{$ctg2}{'direction'} eq "r"){
            my $tmp_start = ($tigOnScafHash->{$ctg2}{'end'}+$tigOnScafHash->{$ctg2}{'begin'}) - $start2;
            my $tmp_end = ($tigOnScafHash->{$ctg2}{'end'}+$tigOnScafHash->{$ctg2}{'begin'}) - $end2;
            $start2 = $tmp_start;
            $end2 = $tmp_end;
         }

         #from here on the reads are addressed by the scaffold they sit on
         $ctg1 = $tigOnScafHash->{$ctg1}{'scaf'};
         $ctg2 = $tigOnScafHash->{$ctg2}{'scaf'};
      }
      #if it is the first library the positions of the TAB file are used unchanged;
      #in both cases only reads within $lower bases of a sequence end are tracked
      if($start1 < $lower || ($end1 > ($tig_length->{$ctg1}-$lower))){
         $track1 = "$ctg1"."|$start1"."|$end1";
      }
      if($start2 < $lower || ($end2 > ($tig_length->{$ctg2}-$lower))){
         $track2 = "$ctg2"."|$start2"."|$end2";
      }
      #pair the contigs based on the information provided in the TAB file
      pairContigs($track1, $track2, "seq$ct_both.1", "seq$ct_both.2") if($track1 ne "" && $track2 ne "");
   }
   close TAB;
   CounterPrint("                ");
}

###FUNCTION TO STORE ONLY THE EDGES OF THE CONTIGS. ONLY READS ARE MAPPED TO THESE EDGES, SAVING TIME FOR BUILDING THE INDEX WITH BOWTIE, AND MAPPING THE READS TO THE CONTIGS
#  $contigfile - FASTA file with the contigs
#  $max_dist   - maximum allowed pair distance; the kept edge is $max_dist+200 bases
#Writes tmp.$base_name/subset_contigs.fasta, numbering the contigs from 1 in
#file order, and returns that file name. Long contigs are written as
#<left edge>NNN<right edge>; shorter ones are written in full.
sub processContig{
   my ($contigfile, $max_dist) = @_;

   my $lower = ($max_dist+200);

   open(IN,$contigfile) || die "can't read $contigfile -- fatal\n";
   my $contigfilesub = "tmp.$base_name/subset_contigs.fasta";
   open(OUT,">$contigfilesub") || die "can't write to $contigfilesub -- fatal\n";
   my ($seq, $counter) = ('', 0);
   while(<IN>){
      chomp;
      my $line = $_;
      $seq.= uc($line) if(eof(IN));   #the last line has to be added before the final flush below
      if (/\>(\S+)/ || eof(IN)){
         if($seq ne ''){
            $counter++;
            if(length($seq) > (($lower * 2)+100)){
               my $upper = (length($seq) - ($lower));
               my $first = substr($seq, 0, $lower);
               my $second = substr($seq, $upper);
               my $newseq = $first."NNN".$second;
               print OUT ">$counter\n$newseq\n";
            }
            else{
               print OUT ">$counter\n$seq\n";
            }
         }
         $seq='';
      }else{
         $seq.=uc($line);
      }
   }
   close IN;
   close OUT;
   return $contigfilesub;
}

###FUNCTION TO PARSE THE EVIDENCE FILE, ONLY USED IF TAB FILE IS INSERTED
#Function determines the position of the contigs on the scaffolds, information is used to update the contigs of the TAB file
sub parseEvidenceFile{
   my ($file) = @_;
   my $track_tigs;
   open(IN,$file) || die "Can't open $file -- fatal\n";
   my $scaf = 0;
   my $totalsize= 0;
   while(<IN>){
      chomp;
      if(/^>/){
         $scaf++;
         $totalsize=0;
      }else{
         my ($tig, $size, $links, $gap, $merge) = split(/\|/,$_);
         if($tig ne ""){
            my ($direction, $tig2) = split(/_tig/,$tig);
            $tigOnScafHash->{$tig2}{'begin'} = $totalsize;

            my (undef, $size2) = split(/size/,$size);
            my $end = $totalsize + 
$size2;
            if($merge ne ""){
               my (undef, $merge2) = split(/merged/,$merge);
               $totalsize = $totalsize + ($size2 - $merge2);
            }elsif($gap ne ""){
               my (undef, $gap2) = split(/gaps/,$gap);
               $gap2 = 1 if($gap2 < 0);
               $totalsize = $totalsize + $size2 + $gap2;
            }else{
               $totalsize = $totalsize + $size2;
            }
            $tigOnScafHash->{$tig2}{'scaf'} = $scaf;
            $tigOnScafHash->{$tig2}{'end'} = $totalsize;
            $tigOnScafHash->{$tig2}{'direction'} = $direction;
         }
      }
   }
}

###FUNCTION TO UPDATE THE ORIGINAL CONTIG FILE INSERTED BY THE USER, SO MULTIPLE TAB FILES OF SEVERAL LIBRARIES CAN BE INSERTED
#Numbers the headers of the FASTA file $file from 1 in file order and records
#header name => number in the file-level $tigHash.
#  $file   - FASTA file with the contigs originally supplied by the user
#  $update - accepted for interface compatibility; not used here
sub updateContigs{
   my ($file, $update) = @_;

   &printMessage("\n=>".getDate().": Updating contig file\n");

   my ($countContig, $prevhead) = (0, '');
   open(IN,$file) || die "Can't open $file -- fatal\n";
   while(my $line = <IN>){
      my $is_header = ($line =~ /\>(\S+)/);
      #a contig is registered when the next header, or the last line of the file, is reached
      if ($is_header || eof(IN)){
         my $head = $is_header ? $1 : '';
         if($prevhead ne ''){
            ++$countContig;
            $tigHash->{$prevhead} = $countContig;
         }
         $prevhead = $head;
      }
   }
   close IN;
   CounterPrint("                ");
   &FlushFiles();
}

#READ THE CONTIG TO A HASH AND STORE THIS HASH
#Reads the FASTA file $file, numbers the contigs from 1 in file order and
#writes number => {name, seq} to tmp.$base_name/contigs.stored with Storable.
#Returns (path of the stored hash, hash ref of contig number => length).
sub readFileContigHash{
   my ($file) = @_;

   &printMessage("\n=>".getDate().": Reading contig file\n");

   my ($contigs, $tig_length);
   my ($countContig, $seq, $prevhead, $step) = (0, "", '', 1000);
   open(IN,$file) || die "Can't open $file -- fatal\n";
   while(<IN>){
      my $line = $_;
      chomp $line;
      $seq.= $line if(eof(IN));   #the last line has to be added before the final flush below
      if (/\>(\S+)/ || eof(IN)){
         my $head=$1;
         if($prevhead ne ''){
            if(++$countContig == $step){
               CounterPrint($countContig);
               $step = $step + 100000;
            }
            $tig_length->{$countContig} = length($seq);
            $contigs->{$countContig}{'name'} = $prevhead;
            $contigs->{$countContig}{'seq'} = $seq;
         }
         $prevhead = $head;
         $seq='';
      }else{
         $seq.=$line;
      }
   }
   close IN;
   CounterPrint("                ");
   &FlushFiles();
   my $contigstore = "tmp.$base_name/contigs.stored";
   store \%$contigs, "$contigstore";
   undef $contigs;   #the sequences now live on disk only
   return ($contigstore, $tig_length);
}

###FUNCTION THAT FILTERS OUT THE REPEATS BY FINDING CONTIGS THAT HAVE MULTIPLE LINKS WITH OTHER CONTIGS
sub determineRepeats{
   my ($tig_length, $repeathash) = @_;
   my $removeHash;
   #go through each contig
   foreach my $tig (sort {$tig_length->{$b}<=>$tig_length->{$a}} keys %$tig_length){
      for(my $i = 0; $i < 2; $i++){
         my $dtig = "r" . $tig;
         $dtig = "f" . $tig if($i);
         my $list = $pair->{$dtig}; #get contig pairs from $tig
         my ($seen_it, $matchhash);
         my $ct=0;
         #Go through each contig pair and get the number of links and gapsize
         foreach my $match (sort {$list->{$b}{'links'}<=>$list->{$a}{'links'}} keys %$list){
            my $matchnum = $1 if($match=~/[fr](\w+)/);
            print TMP "$dtig has $list->{$match}{'links'} links with $match and gap of ".int($list->{$match}{'gaps'}/$list->{$match}{'links'})." 
bases\n" if($list->{$match}{'links'} >= $min_links);
            if($list->{$match}{'links'} >= $min_links && !defined $seen_it->{$matchnum} && $ct < 2){
               $ct++;
               $matchhash->{$match}{'links'} = $list->{$match}{'links'};
               $matchhash->{$match}{'gaps'} = $list->{$match}{'gaps'};
               $matchhash->{$match}{'ratio'} = $list->{$match}{'gaps'}/$list->{$match}{'links'};
               $seen_it->{$matchnum}++;
            }
         }
         my @arraymatch;
         foreach my $ratiosort (sort {$matchhash->{$a}{'ratio'}<=>$matchhash->{$b}{'ratio'}} keys %$matchhash){
            push @arraymatch, $ratiosort;
         }
         my $repeat = 1;
         my $used;
         my $nummatch = $#arraymatch;
         #only determine if contig is a repeat if it has more than 1 link with other contigs
         if($nummatch > 0){
            my $listmatch = $pair->{$arraymatch[0]};
            #if the top two pairs of $tig have link with each other, establish their link so they are combined in scaffolding stage
            if($listmatch->{$arraymatch[1]}{'links'} >= $min_links){
               $pair = establishLink($dtig, $arraymatch[0], $pair);
               $pair = establishLink($arraymatch[0], $arraymatch[1], $pair);
            }else{ #otherwise, the contig has multiple links and is likely a repeat
               my @linkmatch;
               foreach my $linksort (sort {$matchhash->{$b}{'links'}<=>$matchhash->{$a}{'links'}} keys %$matchhash){
                  push @linkmatch, $linksort;
               }
               my ($ratio2, $first, $second) = (0,"","");

               #check for two ratio's between the two best contig pairs. One is a ratio of the links, other is the number of links per searchspace.
               #If either one of the two ratio's is above the user-defined (-a) ratio, the original contig is treated as a repeat

               #estimate the ratio of the links of the two best contig pairs (ratio 1)
               my $link1 = $matchhash->{$linkmatch[1]}{'links'};
               my $link2 = $matchhash->{$linkmatch[0]}{'links'};
               my $ratio1 = $link1 / $link2; ## relative ratio of the two most abundant contig pairs
               $ratio1 = sprintf("%.2f", $ratio1);
               $first = $linkmatch[0];
               #estimate the number of links per gap for the two best contig pairs and divide them (ratio 2)
               my $gapPerSpace1 = estimateLinksPerGap($matchhash, $linkmatch[0], $insert_size, $tig_length);
               my $gapPerSpace2 = estimateLinksPerGap($matchhash, $linkmatch[1], $insert_size, $tig_length);
               if($gapPerSpace1 > $gapPerSpace2){
                  $second = $linkmatch[0];
                  $ratio2 = $gapPerSpace2/$gapPerSpace1;
               }else{
                  $second = $linkmatch[1];
                  $ratio2 = $gapPerSpace1/$gapPerSpace2;
               }
               my $revdtig = $dtig;
               $revdtig =~ tr/fr/rf/;
               #if one of the two ratio's is above the user-defined (-a) option, contig is a repeat and all links with this contig are removed
               if($ratio2 >= $max_link_ratio || $ratio1 >= $max_link_ratio || $first ne $second){
                  foreach my $linksort (sort {$list->{$b}{'links'}<=>$list->{$a}{'links'}} keys %$list){
                     my $num = $1 if($linksort=~/[fr](\w+)/);
                     $removeHash->{$dtig}{$linksort}++;
                     my $revlinksort = $linksort;
                     $revlinksort =~ tr/fr/rf/;
                     $removeHash->{$revdtig}{$revlinksort}++;
                  }
               }
               else{ #otherwise, establish the link between the most likely contig pair
                  foreach my $linksort (sort {$list->{$b}{'links'}<=>$list->{$a}{'links'}} keys %$list){
                     if($linksort ne $first){
                        my $num = $1 if($linksort=~/[fr](\w+)/);
                        my $revlinksort = $linksort;
                        $revlinksort =~ tr/fr/rf/;
                        $removeHash->{$revdtig}{$revlinksort}++;
                        $removeHash->{$dtig}{$linksort}++;
                     }
                  }
               }
            }
         }
      }
   }
   return $removeHash;
}

###FUNCTION TO ESTABLISH A LINK BETWEEN CONTIGS SO THESE CONTIGS ARE PAIRED DURING SCAFFOLDING
#Keeps only the connection between $tig1 and $tig2 in the pair hash:
#  - every other entry below $tig1 is deleted, together with the entry stored
#    from the other side (orientation-flipped partner => flipped $tig1)
#  - the same is done for the flipped $tig2 with respect to $tig1
#Orientation is flipped by swapping the leading f/r with tr/fr/rf/.
#Returns the (modified) pair hash reference.
sub establishLink{
   my ($tig1, $tig2, $pair) = @_;
   my $list = $pair->{$tig1};
   my $revtig1 = $tig1;
   $revtig1 =~ tr/fr/rf/;
   my $revtig2 = $tig2;
   $revtig2 =~ tr/fr/rf/;
   foreach my $rep (keys %$list){
      if($rep ne $tig2 && $rep ne $revtig2){
         delete $pair->{$tig1}{$rep};
         $rep =~ tr/fr/rf/;
         delete $pair->{$rep}{$revtig1};
      }
   }
   my $list2 = $pair->{$revtig2};
   foreach my $rep2 (keys %$list2){
      if($rep2 ne $tig1 && $rep2 ne $revtig1){
         delete $pair->{$revtig2}{$rep2};
         $rep2 =~ tr/fr/rf/;
         delete $pair->{$rep2}{$tig2};
      }
   }

   return $pair;
}

###DETERMINE THE NUMBER OF LINKS PER GAP, BASED ON INSERT SIZE
#  $linkhash    - per-contig hash holding 'links' and 'ratio' (mean gap) entries
#  $tig1        - oriented contig key (f<number> or r<number>)
#  $insert_size - insert size of the library
#  $length_hash - contig number => contig length
#Returns the number of links divided by the number of bases in which a paired
#read could have landed on the contig.
sub estimateLinksPerGap{
   my ($linkhash, $tig1, $insert_size, $length_hash) = @_;
   #list assignment instead of "my ... if", whose behaviour is undefined when the condition is false
   my ($t1) = $tig1 =~ /[fr](\w+)/;
   my $space = 0;
   my $gap = int($linkhash->{$tig1}{'ratio'});
   $gap = 0 if($linkhash->{$tig1}{'ratio'} < 0);
   if(($length_hash->{$t1}+$gap) >= $insert_size){
      $space = int($insert_size - $gap);
   }else{
      $space =$length_hash->{$t1};
   }
   #a gap as large as the insert size (or an unknown contig length) leaves no
   #search space; count it as a single base instead of dividing by zero
   $space = 1 if(!$space);
   my $ratio = $linkhash->{$tig1}{'links'}/$space;
   return $ratio;
}

###FUNCTION TO BUILD THE SCAFFOLDS
sub buildScaffolds{
   my ($pair, $tig_length, $verbose, $scaffold, $lib) = @_;
   &printMessage("\n=>".getDate().": Building scaffolds file\n");

   open (SC, ">$scaffold") || die "Can't write to $scaffold -- fatal\n";
   my ($sc_ct, $keyrep, $numrepeat) = (0,0,0);
   my ($repeathash, $seen_start);

   #determine the repeats and remove any link if contig is a repeat
   #if contig has multiple links, but one considered to be the 'best', establish this contig-pair by removing 
the links with other contigs
   #determineRepeats prints the links it inspects to TMP, so TMP has to be open around the call
   open (TMP, ">intermediate_results/$base_name"."_$library.foundlinks") || die "Can't write to intermediate_results/$base_name"."_$library.foundlinks -- fatal\n";
   $repeathash = determineRepeats($tig_length, $repeathash);
   close TMP;
   open (REPEAT, ">intermediate_results/$base_name"."_$library.repeats") || die "Can't write to intermediate_results/$base_name"."_$library.repeats -- fatal\n";
   #every link listed in the repeat hash is deleted from the pair hash; edges that had
   #more than one link with at least $min_links read pairs are reported as repeats
   foreach my $rep (sort keys %$repeathash){
      my $tig = $1 if($rep=~/[fr](\w+)/);;
      my $ls = $repeathash->{$rep};
      my ($num_match,$repline) = (0,"");
      foreach my $rep2 (sort keys %$ls){
         if($pair->{$rep}{$rep2}{'links'} >= $min_links){
            $repline.="\twith $rep2 (links = $pair->{$rep}{$rep2}{'links'})\n";
            $num_match++;
         }
         delete $pair->{$rep}{$rep2};
         delete $pair->{$rep2}{$rep};
      }
      if($num_match > 1){
         $numrepeat++;
         print REPEAT "Contig $rep (size = $tig_length->{$tig}) has $num_match multiple links;\n";
         print REPEAT "$repline\n";
      }
   }
   close REPEAT;
   print SUMFILE "REPEATS: \n";
   print SUMFILE "\tNumber of repeated edges = $numrepeat\n$seplines\n";

   #go through each contig and find contig pairs left and right, forming scaffolds
   #contigs are used as seed from longest to shortest; the key list is sorted before
   #the loop starts, so the deletions from $tig_length below do not disturb the iteration
   SEED:
   foreach my $tig (sort {$tig_length->{$b}<=>$tig_length->{$a}} keys %$tig_length){
      my $ftig = "f" . $tig;
      my $rtig = "r" . $tig;

      if(! defined $seen_start->{$tig}){##should prevent re-using a contig as seed if it's already been incorporated into a scaffold
         CounterPrint(++$sc_ct);
         my $chainleft = "";
         #the seed is written as f<number>Z<length>; the right chain starts with it
         my $ori_chainright = $ftig . "Z" . $tig_length->{$tig};
         my $chainright = $ori_chainright;
         my $total = $tig_length->{$tig};
         #extend to the right from the forward seed, then to the left from the reversed seed
         ($total, $chainright, $seen_start) = &computeLayout("R", $chainright, $ftig, $pair, $tig_length, $total, $seen_start, $tig);
         ($total, $chainleft, $seen_start) = &computeLayout("L", $chainleft, $rtig, $pair, $tig_length, $total, $seen_start, $tig);

         delete $pair->{$ftig};
         delete $pair->{$rtig};
         delete $tig_length->{$tig};
         $seen_start->{$tig}++;
         my $scaffold = $chainleft . $chainright;
         #one line per scaffold: name, summed contig length, chain of contigs
         print SC "scaffold" . $sc_ct . ",$total,$scaffold\n";
      }
   }
   CounterPrint("                ");
   close SC;
   &FlushFiles();
}

# links contigs together into a chain - must satisfy user-defined criterions (-k -a)
#  $ext             - "R" or "L", passed on to getChain for every contig added
#  $chain           - chain string built so far
#  $tig             - oriented contig (f<number> or r<number>) to extend from
#  $pair            - hash of links between oriented contigs
#  $tig_length      - contig number => length
#  $total           - summed length of the scaffold so far
#  $seen_start      - contig numbers already placed in a scaffold
#  $orig_tig_number - number of the seed contig
# returns ($total, $chain, $seen_start)
sub computeLayout{
   my ($ext, $chain, $tig, $pair, $tig_length, $total, $seen_start, $orig_tig_number) = @_;
   my $orig_tig = $tig;
   my $extension = 1;
   EXTENSION:
   while($extension){
      my $tnum = $1 if($tig=~/[fr](\w+)/);
      my $tnumf = "f" . $tnum;
      my $tnumr = "r" . 
$tnum;
      #NOTE(review): the "my $ratio" declared further down shadows this variable, so getChain below always receives this 0.00 -- confirm whether that is intended
      my $ratio = 0.00;
      if(!defined $seen_start->{$tnum}){ #if already seen in scaffold, do not use it again
         $seen_start->{$tnum}++ if($tnumf ne $orig_tig);
         my $list = $pair->{$tig};
         my $matchhash;
         my ($match1,$link1,$gaps1,$match2,$link2,$gaps2,$cntloop, $countmatches)=("",0,0,"",0,0,0,0);
         my $ct=0;
         LINK:
         foreach my $match (sort {$list->{$b}{'links'}<=>$list->{$a}{'links'}} keys %$list){
            my $matchnum = $1 if($match=~/[fr](\w+)/);
            if($list->{$match}{'links'} >= $min_links && !defined $seen_start->{$matchnum} && $matchnum ne $orig_tig_number && $ct < 2){
               $ct++;
               $matchhash->{$match}{'links'} = $list->{$match}{'links'};
               $matchhash->{$match}{'gaps'} = $list->{$match}{'gaps'};
               $matchhash->{$match}{'ratio'} = $list->{$match}{'gaps'}/$list->{$match}{'links'};
               $countmatches++;
            }else{
               last LINK;
            }
         }
         my $foundlinks = 0;
         if($countmatches > 1){
            my @arraymatch;
            foreach my $ratiosort (sort {$matchhash->{$a}{'ratio'}<=>$matchhash->{$b}{'ratio'}} keys %$matchhash){
               push @arraymatch, $ratiosort;
            }
            my $nummatch = $#arraymatch;
            for(my $i=0; $i <= $nummatch && $foundlinks < 1; $i++){
               my $listmatch = $pair->{$arraymatch[$i]};
               for(my $j=$i+1; $j <= $nummatch && $foundlinks < 1; $j++){
                  my $linkmatch = $listmatch->{$arraymatch[$j]}{'links'};
                  $foundlinks = 1 if(!($linkmatch >= $min_links));
               }
            }
            my $tignum = $1 if($arraymatch[$nummatch]=~/[fr](\w+)/);
            $countmatches=0 if(!$foundlinks && defined $seen_start->{$tignum});
         }if($foundlinks && $countmatches > 1){
            my @linkmatch;
            foreach my $linksort (sort {$matchhash->{$b}{'links'}<=>$matchhash->{$a}{'links'}} keys %$matchhash){
               push @linkmatch, $linksort;
            }
            my $linkhash;
            my $link1 = $matchhash->{$linkmatch[1]}{'links'};
            my $link2 = $matchhash->{$linkmatch[0]}{'links'};
            my $ratio = $link1 / $link2; ## relative ratio of the two most abundant contig pairs
            $ratio = sprintf("%.2f", $ratio);

            if($ratio <= $max_link_ratio){
               foreach my $mat (keys %$matchhash){
                  delete $matchhash->{$mat} if($mat ne $linkmatch[0]);
               }
               $foundlinks = 0;
               $countmatches = 1;
            }
         }
         if((!$foundlinks) && $countmatches > 0){
            my $nummatch =0;
            my @chainlist;
            my @tiglist;
            foreach my $incl_matches (sort {$matchhash->{$a}{'ratio'}<=>$matchhash->{$b}{'ratio'}} keys %$matchhash){
               if($tig ne $incl_matches){
                  $nummatch++;
                  my $listmatch = $pair->{$tig};
                  my $tempnum = $1 if($incl_matches =~ /[fr](\w+)/);
                  my $link2 = $listmatch->{$incl_matches}{'links'};
                  my $mean2 = $listmatch->{$incl_matches}{'gaps'}/$link2;

                  $seen_start->{$tempnum}++if($nummatch < $countmatches);

                  ($chain, $total, $tig) = &getChain($chain, $ext, $link2, $mean2, $incl_matches, $tempnum, $ratio, $tig_length, $total);
                  delete $tig_length->{$tempnum};
               }
            }
            $extension = 1;

         }else{
            $extension = 0;
            last EXTENSION;
         }
      }else{
         $extension = 0;
         last EXTENSION;
      }
   }
   return $total, $chain, $seen_start;
}

###function to combine contigs into a scaffold
#  $chain      - chain string built so far
#  $ext        - "R" appends the contig to the chain, anything else prepends it
#  $link       - number of links supporting the connection
#  $mean       - mean gap of the connection (truncated with int in the chain)
#  $match      - oriented contig that is added (f<number> or r<number>)
#  $tempnum    - number of that contig
#  $ratio      - link ratio written after the "a" tag
#  $tig_length - contig number => length
#  $total      - summed length of the scaffold so far
#Returns (new chain, new total, contig to continue the extension from).
sub getChain{
   my ($chain, $ext, $link, $mean, $match, $tempnum, $ratio, $tig_length, $total) = @_;
   my $tig = $match;
   if($ext eq "R"){
      $chain .= "k" . $link . "a" . $ratio . "m" . int($mean) . "_" . $match . "z" . $tig_length->{$tempnum};
   }else{
      #a left extension is found from the reversed seed, so the contig is written in the opposite orientation
      my $temp_match = "";
      if($match =~ /^r(\d+)/){
         $temp_match = "f" . $1;
      }else{
         #the match above failed, so $1 would only hold a leftover capture of the caller; use the contig number that was passed in
         $temp_match = "r". $tempnum;
      }
      $chain = $temp_match . "z" . $tig_length->{$tempnum} . "k" . $link . "a" . $ratio . "m" . int($mean) . "_" . $chain;
   }

   $total += $tig_length->{$tempnum};
   return ($chain, $total, $tig);
}


###GET THE DISTANCE BETWEEN TWO PAIRED READS
#Returns $insert_size minus the part of the insert that lies on the two contigs.
sub getDistance{

   my ($insert_size, $length_i, $start_i, $start_j) = @_;

   # L  ------  --------- R
   # i    ->        <-    j
   #      ....  ......    insert_span
   #      ============    insert_size

   my $insert_span = ($length_i - $start_i) + $start_j;
   my $gap_or_overlap = $insert_size - $insert_span;

   return $gap_or_overlap;
}

###Pair contigs based on mapping of two reads
sub pairContigs{
   my ($trackA, $trackB, $read_a, $read_b) = @_;
   my ($tig_a, $A_start, $A_end) = split(/\|/, $trackA);
   my ($tig_b, $B_start, $B_end) = split(/\|/, $trackB);
   my ($ori_1,$ori_2) = split(//, $ori);
   if($ori_1 eq "R"){
      my ($tmp_A_start, $tmp_A_end) = ($A_start, $A_end);
      ($A_start, $A_end) = ($tmp_A_end, $tmp_A_start);
   }
   if($ori_2 eq "F"){
      my ($tmp_B_start,$tmp_B_end) = ($B_start,$B_end);
      ($B_start,$B_end) = ($tmp_B_end,$tmp_B_start);
   }
   my $ftig_a = "f" . $tig_a;
   my $ftig_b = "f" . $tig_b;
   my $rtig_a = "r" . $tig_a;
   my $rtig_b = "r" . 
$tig_b; 649 | my $A_length = $tig_length->{$tig_a}; 650 | my $B_length = $tig_length->{$tig_b}; 651 | if (($tig_a != $tig_b) || ($tig_a ne $tig_b)){####paired reads located on <> contigs 652 | ####Determine most likely possibility 653 | if ($A_start < $A_end){ 654 | if ($B_end < $B_start){####-> <- ::: A-> <-B / rB -> <- rA 655 | my $d = &getDistance($insert_size, $A_length, $A_start, $B_start); 656 | print "A-> <-B WITH $tig_a -> <- $tig_b GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Alen, Astart,Bstart\n" if($verbose); 657 | if($d >= $min_allowed){ 658 | $pair->{$ftig_a}{$ftig_b}{'links'}++; 659 | $pair->{$ftig_a}{$ftig_b}{'gaps'} += $d; 660 | $pair->{$rtig_b}{$rtig_a}{'links'}++; 661 | $pair->{$rtig_b}{$rtig_a}{'gaps'} += $d; 662 | $ct_ok_pairs++; 663 | }else{ 664 | my $err_pair = $ftig_a . "-". $ftig_b; 665 | $err->{$err_pair}{'links'}++; 666 | $err->{$err_pair}{'gaps'} += $d; 667 | $ct_problem_pairs++; 668 | print PET "Pairs unsatisfied in distance within a contig pair. A-> <-B WITH tig#$tig_a -> $d <- tig#$tig_b, A=$A_length nt (start:$A_start, end:$A_end) B=$B_length nt (start:$B_start, end:$B_end) CALCULATED DISTANCE APART: $d < $min_allowed\n"; 669 | } 670 | }else{#### -> -> ::: A-> <-rB / B-> <-rA 671 | my $rB_start = $B_length - $B_start; 672 | my $d = &getDistance($insert_size, $A_length, $A_start, $rB_start); 673 | print "A-> <-rB WITH $tig_a -> <- r.$tig_b GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Alen,Astart,rBstart\n"if($verbose); 674 | if($d >= $min_allowed){ 675 | $pair->{$ftig_a}{$rtig_b}{'links'}++; 676 | $pair->{$ftig_a}{$rtig_b}{'gaps'} += $d; 677 | $pair->{$ftig_b}{$rtig_a}{'links'}++; 678 | $pair->{$ftig_b}{$rtig_a}{'gaps'} += $d; 679 | $ct_ok_pairs++; 680 | }else{ 681 | my $err_pair = $ftig_a . "-". $rtig_b; 682 | $err->{$err_pair}{'links'}++; 683 | $err->{$err_pair}{'gaps'} += $d; 684 | $ct_problem_pairs++; 685 | print PET "Pairs unsatisfied in distance within a contig pair. 
A-> <-rB WITH tig#$tig_a -> $d <- tig#r.$tig_b, A=$A_length nt (start:$A_start, end:$A_end) B=$B_length nt (start:$B_start, end:$B_end) CALCULATED DISTANCE APART: $d < $min_allowed\n"; 686 | } 687 | } 688 | }else{ 689 | if ($B_end > $B_start){####<- -> ::: B-> <-A / rA -> <- rB 690 | my $d = &getDistance($insert_size, $B_length, $B_start, $A_start); 691 | print "B-> <-A WITH $tig_b -> <- $tig_a GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Blen,Bstart,Astart\n" if($verbose); 692 | if($d >= $min_allowed){ 693 | $pair->{$ftig_b}{$ftig_a}{'links'}++; 694 | $pair->{$ftig_b}{$ftig_a}{'gaps'} += $d; 695 | $pair->{$rtig_a}{$rtig_b}{'links'}++; 696 | $pair->{$rtig_a}{$rtig_b}{'gaps'} += $d; 697 | $ct_ok_pairs++; 698 | }else{ 699 | my $err_pair = $ftig_b . "-". $ftig_a; 700 | $err->{$err_pair}{'links'}++; 701 | $err->{$err_pair}{'gaps'} += $d; 702 | $ct_problem_pairs++; 703 | print PET "Pairs unsatisfied in distance within a contig pair. B-> <-A WITH tig#$tig_b -> $d <- tig#$tig_a, B=$B_length nt (start:$B_start, end:$B_end) A=$A_length nt (start:$A_start, end:$A_end) CALCULATED DISTANCE APART: $d < $min_allowed\n"; 704 | } 705 | }else{ ####<- <- ::: rB-> <-A / rA-> <-B 706 | my $rB_start = $B_length - $B_start; 707 | my $d = &getDistance($insert_size, $B_length, $rB_start, $A_start); 708 | print "rB-> <-A WITH r.$tig_b -> <- $tig_a GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Blen,rBstart,Astart\n" if($verbose); 709 | if($d >= $min_allowed){ 710 | $pair->{$rtig_b}{$ftig_a}{'links'}++; 711 | $pair->{$rtig_b}{$ftig_a}{'gaps'} += $d; 712 | $pair->{$rtig_a}{$ftig_b}{'links'}++; 713 | $pair->{$rtig_a}{$ftig_b}{'gaps'} += $d; 714 | $ct_ok_pairs++; 715 | }else{ 716 | my $err_pair = $rtig_b . "-". $ftig_a; 717 | $err->{$err_pair}{'links'}++; 718 | $err->{$err_pair}{'gaps'} += $d; 719 | $ct_problem_pairs++; 720 | print PET "Pairs unsatisfied in distance within a contig pair. 
rB-> <-A WITH tig#r.$tig_b -> $d <- tig#$tig_a, B=$B_length nt (start:$B_start, end:$B_end) A=$A_length nt (start:$A_start, end:$A_end) CALCULATED DISTANCE APART: $d < $min_allowed\n"; 721 | } 722 | } 723 | } 724 | }else{###Clone, paired reads located on the same contig -- could be used to investigate misassemblies 725 | print "Pair ($read_a and $read_b) located on same contig $tig_a ($A_length nt)\n" if ($verbose); 726 | my $pet_size = 0; 727 | 728 | if ($A_start > $B_start && ($B_start < $B_end) && ($A_start > $A_end)){ # B --> <-- A 729 | $total_for_median++; 730 | $pet_size = $A_start - $B_start; 731 | $track_insert->{$pet_size}++; 732 | if($pet_size >= $low_iz && $pet_size <= $up_iz){ 733 | $ct_ok_contig++; 734 | }else{ 735 | print PET "Pairs unsatisfied in distance within a contig. Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end CALCULATED DISTANCE APART: $pet_size\n"; 736 | $ct_iz_issues++; 737 | } 738 | }elsif($B_start > $A_start && ($B_start > $B_end) && ($A_start < $A_end)){ # A --> <-- B 739 | $total_for_median++; 740 | $pet_size = $B_start - $A_start; 741 | $track_insert->{$pet_size}++; 742 | if($pet_size >= $low_iz && $pet_size <= $up_iz){ 743 | $ct_ok_contig++; 744 | }else{ 745 | print PET "Pairs unsatisfied in distance within a contig. Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end CALCULATED DISTANCE APART: $pet_size\n"; 746 | $ct_iz_issues++; 747 | } 748 | }else{ 749 | $ct_illogical++; 750 | print PET "Pairs unsatisfied in pairing logic within a contig. 
Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end\n"; 751 | } 752 | } 753 | } 754 | 755 | ###Print read pairing results to the summary file, including estimation of mean and median insert size 756 | sub printResultsPairing{ 757 | print PET "------------- Putative issues with contig pairing - Summary ----------------\n"; 758 | foreach my $err_pair (sort {$err->{$b}{'links'}<=>$err->{$a}{'links'}} keys %$err){ 759 | my $mean_iz = 0; 760 | $mean_iz = $err->{$err_pair}{'gaps'} / $err->{$err_pair}{'links'} if ($err->{$err_pair}{'links'}); 761 | print PET "Pair $err_pair has $err->{$err_pair}{'links'} links and mean distance = $mean_iz\n"; 762 | } 763 | close PET; 764 | 765 | my $satisfied = $ct_ok_pairs + $ct_ok_contig; 766 | my $unsatisfied = $ct_problem_pairs + $ct_iz_issues + $ct_illogical; 767 | my $ct_both_reads = $ct_both * 2; 768 | 769 | #write distribution file 770 | open (CSV, ">$distribution") || die "Can't open $distribution for writing -- fatal"; 771 | my ($total_is, $overal_is,$median_ins, $stdev,$record, $sumX,$sumX2) = (0,0,0,0,0,0,0); 772 | my $median_bin = int($total_for_median/2); 773 | 774 | foreach my $is (sort {$a<=>$b} keys %$track_insert){ 775 | for(my $i=0;$i<$track_insert->{$is};$i++){ 776 | $record++; 777 | $sumX += $is; 778 | $sumX2 += ($is * $is); 779 | $median_ins = $is if($record >= $median_bin && $median_ins == 0); 780 | } 781 | $overal_is += ($is * $track_insert->{$is}); 782 | print CSV "$is,$track_insert->{$is}\n"; 783 | } 784 | my ($mean_ins,$sigma) = (0,0); 785 | if($sumX > 0 && $record > 0){ 786 | $mean_ins = int($sumX/$record); 787 | $sigma = sprintf("%.2f",sqrt($sumX2/$record - $mean_ins*$mean_ins)); 788 | } 789 | close CSV; 790 | 791 | print SUMFILE "READ PAIRS STATS:\n"; 792 | print SUMFILE "\tAssembled pairs: $ct_both ($ct_both_reads sequences)\n"; 793 | print SUMFILE "\t\tSatisfied in distance/logic within contigs (i.e. 
-> <-, distance on target: $insert_size +/$min_allowed): $ct_ok_contig\n"; 794 | print SUMFILE "\t\tUnsatisfied in distance within contigs (i.e. distance out-of-bounds): $ct_iz_issues\n"; 795 | print SUMFILE "\t\tUnsatisfied pairing logic within contigs (i.e. illogical pairing ->->, <-<- or <-->): $ct_illogical\n"; 796 | print SUMFILE "\t\t---\n"; 797 | print SUMFILE "\t\tSatisfied in distance/logic within a given contig pair (pre-scaffold): $ct_ok_pairs\n"; 798 | print SUMFILE "\t\tUnsatisfied in distance within a given contig pair (i.e. calculated distances out-of-bounds): $ct_problem_pairs\n"; 799 | print SUMFILE "\t\t---\n"; 800 | print SUMFILE "\tTotal satisfied: $satisfied\tunsatisfied: $unsatisfied\n\n"; 801 | print SUMFILE "\n\tEstimated insert size statistics (based on $total_for_median pairs): \n"; 802 | print SUMFILE "\t\tMean insert size = $mean_ins\n"; 803 | print SUMFILE "\t\tMedian insert size = $median_ins\n"; 804 | # print SUMFILE "\t\tInsert size deviation = $sigma\n$seplines\n"; 805 | 806 | &FlushFiles(); 807 | return $pair; 808 | } 809 | 810 | ###Function that maps the readfiles to the contigs 811 | sub mapReadsWithBowtie{ 812 | my ($contigFile, $singlereads, $gaps, $threads) = @_; 813 | #building Index of contig and mapping reads to Index 814 | my $bowtieout = $base_name . ".$library.bowtieIndex"; 815 | 816 | my @reads = split(/,/,$singlereads); 817 | foreach my $read (@reads){ 818 | die "Single read file ($read) not found. Exiting...\n" if(!(-e $read)); 819 | } 820 | my $procline = "bowtie -p $threads -v $gaps -m 1 bowtieoutput/$bowtieout --suppress 6,7 -f $singlereads --quiet --refidx |"; 821 | die "Contig file ($contigFile) not found. 
Exiting...\n" if(!(-e $contigFile)); 822 | &printMessage("\n=>".getDate().": Building Bowtie index for contigs\n"); 823 | system("bowtie-build $contigFile bowtieoutput/$bowtieout --quiet --noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values 824 | 825 | #Treat the output of Bowtie differently if multithreading is used or not 826 | readBowtieOneThread($procline) if($threads <= 1); 827 | readBowtieMultThread($procline) if($threads > 1); 828 | } 829 | 830 | ###Parse output of Bowtie when only one thread of Bowtie is used 831 | sub readBowtieOneThread{ 832 | my ($input) = @_; 833 | my $lower = ($up_iz+200); 834 | my $sub = ($lower * 2) + 3; 835 | my ($prevline, $line, $prevread, $counter, $step, $pair_found, $ct_pair) = ("","","",0, 1000000, 0, 0); 836 | my ($seq1, $seq2, $track1, $track2, $count); 837 | 838 | &printMessage("\n=>".getDate().": Mapping reads to contigs. Reading bowtie output and pairing contigs\n"); 839 | open(IN, "$input") || die "Can't open bowtie output -- fatal\n"; 840 | #go through mapping results 841 | while($line = ){ 842 | if(++$counter == $step){ 843 | CounterPrint($counter); 844 | $step = $step + 1000000; 845 | } 846 | my ($read) = split(/\//,$line); 847 | if($prevread eq $read){ 848 | $pair_found++; 849 | ($seq1, $track1) = StoreResults($prevline, $lower, $sub); 850 | ($seq2, $track2) = StoreResults($line, $lower, $sub); 851 | my $combined = "$seq1:$seq2"; 852 | my $revcombined = reverseComplement($combined); 853 | if(!$count->{$combined} && !$count->{$revcombined}){ 854 | $count->{$combined}++; 855 | pairContigs($track1, $track2, $seq1, $seq2); 856 | $ct_both++; 857 | } 858 | } 859 | $prevread = $read; 860 | $prevline = $line; 861 | } 862 | close IN; 863 | CounterPrint(" "); 864 | print SUMFILE "\nMAPPING READS TO CONTIGS:\n"; 865 | print SUMFILE "$seplines\tNumber of single reads found on contigs = $counter\n"; 866 | my $read_number_message = "\tNumber of pairs used for pairing contigs / total pairs = $ct_both / 
$pair_found\n"; 867 | printf SUMFILE $read_number_message.$seplines."\n"; 868 | &FlushFiles(); 869 | } 870 | 871 | ###Parse output of Bowtie when multiple threads of Bowtie are used 872 | sub readBowtieMultThread{ 873 | my ($input) = @_; 874 | my $lower = ($up_iz+200); 875 | my $sub = ($lower * 2) + 3; 876 | my ($prevline, $line, $prevread, $counter, $step, $ct, $ctHash, $pair_found) = ("","","",0, 1000000,0,1,0); 877 | my ($seq1, $seq2, $track1, $track2, $count, $readHash); 878 | 879 | &printMessage("\n=>".getDate().": Mapping reads to contigs. Reading bowtie output and pairing contigs\n"); 880 | open(IN,"$input") || die "Can't open bowtie output -- fatal\n"; 881 | while ($line = ) { 882 | if(++$ct >= ($ctHash*$step)){ 883 | CounterPrint($ct); 884 | delete $readHash->{($ctHash-1)}; 885 | $ctHash++; 886 | } 887 | my ($readname) = split(/\t/,$line); 888 | my ($read,$readnum) = split(/\//,$readname); 889 | if($readHash->{($ctHash-1)}{$read}){ 890 | $pair_found++; 891 | if($readnum == 1){ 892 | ($seq2, $track2) = StoreResults($readHash->{($ctHash-1)}{$read}, $lower, $sub); 893 | ($seq1, $track1) = StoreResults($line, $lower, $sub); 894 | }else{ 895 | ($seq1, $track1) = StoreResults($readHash->{($ctHash-1)}{$read}, $lower, $sub); 896 | ($seq2, $track2) = StoreResults($line, $lower, $sub); 897 | } 898 | my $combined = "$seq1:$seq2"; 899 | my $revcombined = reverseComplement($combined); 900 | if(!$count->{$combined} && !$count->{$revcombined}){ 901 | $count->{$combined}++; 902 | pairContigs($track1, $track2, $seq1, $seq2); 903 | $ct_both++; 904 | } 905 | }elsif(defined $readHash->{$ctHash}{$read}){ 906 | $pair_found++; 907 | if($readnum == 1){ 908 | ($seq2, $track2) = StoreResults($readHash->{($ctHash)}{$read}, $lower, $sub); 909 | ($seq1, $track1) = StoreResults($line, $lower, $sub); 910 | }else{ 911 | ($seq1, $track1) = StoreResults($readHash->{($ctHash)}{$read}, $lower, $sub); 912 | ($seq2, $track2) = StoreResults($line, $lower, $sub); 913 | } 914 | my $combined = 
"$seq1:$seq2"; 915 | my $revcombined = reverseComplement($combined); 916 | if(!$count->{$combined} && !$count->{$revcombined}){ 917 | $count->{$combined}++; 918 | pairContigs($track1, $track2, $seq1, $seq2); 919 | $ct_both++; 920 | } 921 | } 922 | $readHash->{$ctHash}{$read} = $line; 923 | } 924 | close IN; 925 | CounterPrint(" "); 926 | print SUMFILE "\nMAPPING READS TO CONTIGS:\n"; 927 | print SUMFILE "$seplines\tNumber of single reads found on contigs = ". $ct."\n"; 928 | my $read_number_message = "\tNumber of pairs used for pairing contigs / total pairs = $ct_both / $pair_found\n"; 929 | printf SUMFILE $read_number_message.$seplines."\n"; 930 | &FlushFiles(); 931 | } 932 | 933 | sub StoreResults{ 934 | my ($input, $lower, $sub) = @_; 935 | my ($read, $strand, $tig, $start, $seq) = split(/\t/,$input); 936 | my ($startval, $endval, $keyvalue) = (0,0,""); 937 | $tig++; 938 | if($start > $lower && $tig_length->{$tig} > (($lower * 2)+100)){ 939 | my $minsub = $sub - $start; 940 | $start = ($tig_length->{$tig} - $minsub); 941 | } 942 | if($strand eq "+"){ 943 | $startval = $start; 944 | $endval = $start + length($seq); 945 | } 946 | else{ 947 | $startval = $start + length($seq); 948 | $endval = $start; 949 | $seq = reverseComplement($seq); 950 | } 951 | $keyvalue = "$tig"."|$startval"."|$endval"; 952 | return $seq, $keyvalue; 953 | } 954 | 955 | ###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE 956 | sub reverseComplement{ 957 | $_ = shift; 958 | tr/ATGC/TACG/; 959 | return (reverse()); 960 | } 961 | 962 | ###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE 963 | sub CounterPrint{ 964 | my $countingMessager = shift; 965 | print "\r$countingMessager"; 966 | $|++; 967 | } 968 | 969 | ###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE 970 | sub printMessage{ 971 | my $message = shift; 972 | print $message; 973 | print LOG $message; 974 | } 975 | 976 | ###FUNCTION TO GET THE CURRENT DATE 977 | sub getDate{ 978 | my $date = scalar(localtime); 979 | 
return $date; 980 | } 981 | 982 | ###FLUSHES THE SUMMARY AND LOG FILE 983 | sub FlushFiles{ 984 | select((select(SUMFILE), $| = 1)[0]); 985 | select((select(LOG), $| = 1)[0]); 986 | $|++; 987 | } 988 | 989 | #########END PairingAndScaffolding.pl 990 | -------------------------------------------------------------------------------- /bin/readLibFiles.pl: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | #Marten Boetzer 13-06-2011 # 3 | #SSPACE perl subscript readLibFiles.pl # 4 | #This script; # 5 | # -reads, converts and filters original input sequences # 6 | ############################################################# 7 | 8 | use Storable; 9 | use File::Path; 10 | use File::Basename; 11 | use threads; 12 | 13 | my $seplines = ("-" x 60)."\n"; 14 | my $maxlen = 0; 15 | 16 | my $libraryfile = $ARGV[0]; 17 | my $base_name = $ARGV[1]; 18 | my $extending = $ARGV[2]; 19 | my $unpaired_file = $ARGV[3]; 20 | my $min_overlap = $ARGV[4]; 21 | my $thread = $ARGV[5]; 22 | my $log = $base_name . ".logfile.txt"; 23 | my $summaryfile = $base_name.".summaryfile.txt"; 24 | 25 | open (SUMFILE, ">>$summaryfile") || die "Can't open $summaryfile -- fatal\n"; 26 | open (LOG, ">>$log") || die "Can't write to $log -- fatal\n"; 27 | 28 | my $filenameOutFilt = "filtered.readpairs.fasta"; 29 | my $filenameOutExt = $base_name . 
".singlereads.fasta"; 30 | 31 | #-------------------------------------------------READ UNPAIRED FILE CONTAINING SINGLE READS 32 | &readUnpairedFile($unpaired_file) if ($unpaired_file); 33 | #-------------------------------------------------LOOP THROUGH EACH LIBRARY IN LIBRARYFILE AND STORE AND FILTER READS 34 | open(FILELIB, "< $libraryfile"); 35 | 36 | my ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse, $libResHash); 37 | my ($prevlibrary, $ctlib) = ("",0); 38 | &printMessage("\n=>".getDate().": Reading, filtering and converting input sequences of library file initiated\n"); 39 | 40 | while(){ 41 | chomp; 42 | ($library, $fileA, $fileB, $insert_size, $insert_stdev, $reverse) = split(/\s+/, $_); 43 | 44 | next if($library eq ""); 45 | $ctlib=0 if($library ne $prevlibrary && $prevlibrary ne ""); 46 | $ctlib++; 47 | 48 | my ($fileBaseName1, $dirName1, $fileExtension1) = fileparse($fileA); 49 | my ($fileBaseName2, $dirName2, $fileExtension2) = fileparse($fileB); 50 | 51 | my $fname = "reads/$base_name.$library.filtered.readpairs.singles.fasta"; 52 | my ($counter2, $Ncount2); 53 | #Process multiple files at the same time if multithreaded option is set (-T parameter larger than 1) 54 | if($fileA ne "TAB" && $thread > 1){ 55 | my $thr = threads->create(\&generateInputFiles, $library, $fileA, $fileB, $extending, $reverse, $fname, $ctlib); 56 | if(!($ctlib % $thread)){ 57 | foreach my $thr (threads->list()) { 58 | my @res = $thr->join(); 59 | ($lib,$nreads,$ncount) = split(/,/,$res[0]); 60 | $libResHash->{$lib}{'reads'}+=$nreads; 61 | $libResHash->{$lib}{'N'}+=$ncount; 62 | } 63 | } 64 | #otherwise, process only one file at a time 65 | }elsif($fileA ne "TAB" && $thread <=1){ 66 | my $out = &generateInputFiles($library, $fileA, $fileB, $extending, $reverse, $fname, $ctlib); 67 | ($lib,$nreads,$ncount) = split(/,/,$out); 68 | $libResHash->{$lib}{'reads'}+=$nreads; 69 | $libResHash->{$lib}{'N'}+=$ncount; 70 | } 71 | #if user has inserted a TAB file, calculate 
read statistics 72 | if($fileA eq "TAB"){ 73 | open FILE, "$fileB" or die $!; 74 | my ($fileBaseName2, $dirName2, $fileExtension2) = fileparse($fileB); 75 | print "Reading tabfile: $fileBaseName2...\n"; 76 | $counter2++ while(); 77 | $libResHash->{$lib}{'reads'}+=$counter2; 78 | $libResHash->{$lib}{'N'} = 0; 79 | close FILE; 80 | } 81 | $prevlibrary = $library; 82 | } 83 | #Process remaining reads 84 | if($fileA ne "TAB"){ 85 | foreach my $thr (threads->list()) { 86 | my @res = $thr->join(); 87 | ($lib,$nreads,$ncount) = split(/,/,$res[0]); 88 | $libResHash->{$lib}{'reads'}+=$nreads; 89 | $libResHash->{$lib}{'N'}+=$ncount; 90 | } 91 | } 92 | #Print read statistics to the summary file 93 | &printMessage("\n$seplines"); 94 | foreach my $libs (keys %$libResHash){ 95 | my $totcounter = $libResHash->{$libs}{'reads'}; 96 | my $totNcount = $libResHash->{$libs}{'N'}; 97 | my $filt = $totcounter-$totNcount; 98 | print SUMFILE "READING READS $libs:\n"; 99 | print SUMFILE "$seplines\tTotal inserted pairs = $totcounter \n"; 100 | print SUMFILE "\tNumber of pairs containing N's = $totNcount \n\tRemaining pairs = $filt\n$seplines\n"; 101 | } 102 | close FILELIB; 103 | close SUMFILE; 104 | close LOG; 105 | 106 | mkpath('process_OK'); #make directory, indicating that process has run OK 107 | 108 | #-------------------------------------------------- 109 | 110 | ###CONVERT INPUT SEQUENCES BY REMOVING PAIRED READS HAVING AN 'N' 111 | sub generateInputFiles{ 112 | my ($lib, $fileA, $fileB, $extension, $reverse, $fname, $libct) = @_; 113 | my ($name,$seq1,$seq2, $res1,$res2); 114 | my ($counterext, $Ncount, $countsinglet, $fastq, $step) = (0,0,0,0,1000000); 115 | open (OUTSINGLEFILE, ">reads/$base_name.$lib.file$libct.fa") || die "Can't write to single file file$fname-- fatal\n"; 116 | 117 | #check if file is fastQ or fastA 118 | open(TEST, "< $fileA"); 119 | $name = ; 120 | close TEST; 121 | $fastq = 1 if ($name =~ /^[@]/); 122 | 123 | open(FILEA, "< $fileA"); 124 | open(FILEB, "< 
$fileB"); 125 | CounterPrint("Reading read-pairs $lib.$libct @ $countsinglet "); 126 | while() { 127 | ; 128 | $seq1 = uc(), $seq1 =~ s/^\r\n/\n/; 129 | $seq2 = uc(), $seq2 =~ s/^\r\n/\n/; 130 | #FASTQ FORMAT 131 | ,,, if ($fastq); 132 | 133 | $res1 = index($seq1,"N"); 134 | $res2 = index($seq2,"N"); 135 | #if both reads contain N's, do not use them for contig extension and for scaffolding 136 | if($res1 == -1 && $res2 == -1){ 137 | print OUTSINGLEFILE ">read$countsinglet/1\n$seq1>read$countsinglet/2\n$seq2"; 138 | }else{ 139 | $Ncount++; 140 | } 141 | if(++$countsinglet == $step){ 142 | CounterPrint("Reading read-pairs $lib.$libct @ $countsinglet "); 143 | $step = $step + 1000000; 144 | } 145 | 146 | } 147 | CounterPrint("\n") if($thread <= 1); 148 | CounterPrint((" " x 40)); 149 | close OUTSINGLEFILE; 150 | close FILEB; 151 | close FILEA; 152 | return "$lib,$countsinglet,$Ncount"; 153 | } 154 | 155 | #------------------READ UNPAIRED SINGLE READS FILE WHEN -u IS SET 156 | 157 | sub readUnpairedFile{ 158 | my ($file) = @_; 159 | open(INUNPAIRED, "< $file") || die "Can't open $file -- fatal\n"; 160 | open OUTFILEExt, "> reads/$filenameOutExt"; 161 | 162 | &printMessage("\n=>".getDate().": Reading, filtering and converting unpaired input sequences initiated\n"); 163 | 164 | my ($seq1, $name); 165 | my ($counterext, $counter, $step, $fastq) = (0,0, 100000,0); 166 | 167 | open(TEST, "< $file"); 168 | $name = ; 169 | close TEST; 170 | $fastq = 1 if ($name =~ /^[@]/); 171 | while() { 172 | $seq1 = uc(); $seq1 =~ s/\r\n/\n/; chomp $seq1; 173 | 174 | #FASTQ FORMAT 175 | if ($fastq){ 176 | ; ; 177 | } 178 | # ELSE FASTA FORMAT 179 | if(index($seq1, "N") == -1){ 180 | print OUTFILEExt ">$counterext\n$seq1\n"; 181 | $counterext++; 182 | } 183 | if(++$counter == $step){ 184 | CounterPrint($counter); 185 | $step = $step + 100000; 186 | } 187 | } 188 | CounterPrint(" "); 189 | 190 | print SUMFILE "READING UNPAIRED READS:\n"; 191 | print SUMFILE "$seplines\tTotal inserted reads = 
$counter \n"; 192 | print SUMFILE "\tNumber of reads containing N's = ".($counter-$counterext)."\n\tRemaining reads = $counterext\n"; 193 | close OUTFILEext; 194 | close INUNPAIRED; 195 | } 196 | 197 | ###FUNCTION TO REVERSE COMPLEMENT A SEQUENCE 198 | sub reverseComplement{ 199 | $_ = shift; 200 | tr/ATGC/TACG/; 201 | return (reverse()); 202 | } 203 | 204 | ###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE 205 | sub CounterPrint{ 206 | my $countingMessager = shift; 207 | print "\r$countingMessager"; 208 | $|++; 209 | } 210 | 211 | ###FUNCTION TO PRINT MESSAGES TO THE SCREEN AND TO THE LOG FILE 212 | sub printMessage{ 213 | my $message = shift; 214 | print $message; 215 | print LOG $message; 216 | } 217 | 218 | ###FUNCTION TO GET THE CURRENT DATE 219 | sub getDate{ 220 | my $date = scalar(localtime); 221 | return $date; 222 | } 223 | 224 | ###FLUSHES THE SUMMARY AND LOG FILE 225 | sub FlushFiles{ 226 | select((select(SUMFILE), $| = 1)[0]); 227 | select((select(LOG), $| = 1)[0]); 228 | $|++; 229 | } 230 | 231 | #########END readLibFiles.pl -------------------------------------------------------------------------------- /dotlib/DotLib.pm: -------------------------------------------------------------------------------- 1 | # $Id: DotLib.pm,v 1.3 2003/02/24 17:33:00 mpop Exp $ 2 | # 3 | # DotLib.pm - set of procedures for generating .dot files 4 | # 5 | 6 | # Copyright @ 2002, 2003, The Institute for Genomic Research (TIGR). All 7 | # rights reserved. 8 | 9 | 10 | =head1 Name 11 | 12 | DotLib - library of routines for generating .dot files 13 | 14 | =head1 Synopsis 15 | 16 | use DotLib; 17 | 18 | =head1 Description 19 | 20 | A set of procedures used to create various .dot objects such as 21 | file headers, file tails, components, nodes, edges, etc. 

=cut

package DotLib;

use strict;


BEGIN {
    use Exporter ();
    use vars qw(@EXPORT @EXPORT_OK @ISA %EXPORT_TAGS);

    @ISA = qw(Exporter);
    @EXPORT = qw(&printHeader
                 &printFooter
                 &printNode
                 &printEdge
                 &startCluster
                 &endCluster
                 );
    %EXPORT_TAGS = ();
    @EXPORT_OK = ();
}

our $VERSION = '1.0';
our $REVISION = '$Revision: 1.3 $ ';
our $VERSION_STRING = "$VERSION ($REVISION)";

use vars @EXPORT;
use vars @EXPORT_OK;

=over 4

=item B<printHeader($file, $type)>

Prints a .dot header for the type of output specified in the $type variable.
Allowable types are "printer", "plotter". If $type is undefined or not
passed, it generates a default header. Returns 1 upon successful
completion and 'undef' otherwise.

Example:

my $err = printHeader(\*STDOUT, "plotter");

=cut

sub printHeader
{
    my $file = shift;
    my $type = shift;

    # $type is optional; an absent type simply adds no page settings.
    $type = "" unless defined $type;

    my $header = "digraph ROOT {\n"
               . " rankdir = LR\n"
               . " orientation = landscape\n"
               . " ranksep = 0.3\n"
               . " nodesep = 0.3\n"
               . " fontsize = 8\n"
               . " margin = \".2,.2\"\n";

    if ($type eq "printer"){
        $header .= " ratio = auto\n";
        $header .= " page = \"8.5,11\"\n";
    } elsif ($type eq "plotter"){
        $header .= " ratio = auto\n";
        $header .= " page = \"36,48\"\n";
    }

    $header .= "\n";

    # Honour the documented contract: undef when the write fails.
    print {$file} $header or return;

    return 1;
} # printHeader


=item B<printFooter($file)>

Prints a .dot footer (currently just a closed brace). Returns 1 upon
successful completion and 'undef' otherwise.

Example:

my $err = printFooter(\*STDOUT);

=cut

sub printFooter
{
    my $file = shift;

    print {$file} "}\n" or return;

    return 1;
} # printFooter


=item B<printNode($file, $id, $label, $ori)>

Prints a "contig" node with the specified id, label, and orientation.
If orientation is 1 then the node is a forward facing arrow, otherwise
it is a backward facing arrow. Returns 1 upon successful completion
and 'undef' otherwise.

Example:

my $err = printNode(\*STDOUT, $node_id, "$node_id ($node_len)", 1);

=cut

sub printNode
{
    my $file = shift;
    my $id = shift;
    my $label = shift;
    my $ori = shift;

    # .dot identifiers: replace every non-word character by "_".
    $id =~ s/\W/_/g;

    # A missing orientation is treated like "not 1" (backward).
    my $angle = (defined $ori && $ori == 1) ? -90 : 90;

    print {$file} " $id [ label = \"$label\" height = 0.2, fontsize = 8, shape = \"house\", orientation = $angle ]\n"
        or return;

    return 1;

} # printNode


=item B<printEdge($file, $nodeA, $nodeB, $label, $style)>

Prints an edge between two nodes with the specified label. The style can
be any of the GraphViz acceptable styles ("dotted", "solid", "dashed",
"invis") or undefined in which case the default is used. Returns 1 upon
successful completion and 'undef' otherwise.

Example:

my $err = printEdge(\*STDOUT, $nodeA, $nodeB, "A to B", "invis");

=cut

sub printEdge
{
    my $file = shift;
    my $nodeA = shift;
    my $nodeB = shift;
    my $label = shift;
    my $instyle = shift;
    my $style = "";    # stays empty when no style was requested

    $nodeA =~ s/\W/_/g;
    $nodeB =~ s/\W/_/g;

    if (defined $instyle){
        $style = "style = \"" . $instyle . "\"";
        if ($instyle eq "invis"){
            $style .= " color = \"white\" ";
        }
    }

    print {$file} " $nodeA -> $nodeB [ label =\"$label\" fontsize = 8 $style ]\n"
        or return;

    return 1;
} # printEdge

=item B<startCluster($file, $id, $label)>

Starts a cluster in the .dot output file with the given label and id.
Returns 1 upon successful completion and 'undef' otherwise.

Example:

my $err = startCluster(\*STDOUT, $clust_id, "first cluster");

=cut

sub startCluster
{
    my $file = shift;
    my $id = shift;
    my $label = shift;

    $id =~ s/\W/_/g;

    print {$file} " subgraph cluster_$id {\n" . " label = \"$label\"\n"
        or return;

    return 1;
} # startCluster

=item B<endCluster($file)>

Ends a cluster in the .dot output. Returns 1 upon successful
completion and 'undef' otherwise.

Example:

my $err = endCluster(\*STDOUT);

=cut

sub endCluster
{
    my $file = shift;

    print {$file} " }\n" or return;

    return 1;
} # endCluster


1;




--------------------------------------------------------------------------------
/example/ecoli_scaffolds_no_extension.summaryfile.txt:
--------------------------------------------------------------------------------
READING READS lib1:
------------------------------------------------------------
Total inserted pairs = 10408224
Number of pairs containing N's = 61604
Remaining pairs = 10346620
------------------------------------------------------------



LIBRARY lib1 STATS:
################################################################################

MAPPING READS TO CONTIGS:
------------------------------------------------------------
Number of single reads found on contigs = 1949086
Number of pairs used for pairing contigs / total pairs
= 666142 / 683736 17 | ------------------------------------------------------------ 18 | 19 | READ PAIRS STATS: 20 | Assembled pairs: 666142 (1332284 sequences) 21 | Satisfied in distance/logic within contigs (i.e. -> <-, distance on target: 200 +/-150): 519701 22 | Unsatisfied in distance within contigs (i.e. distance out-of-bounds): 233 23 | Unsatisfied pairing logic within contigs (i.e. illogical pairing ->->, <-<- or <-->): 5 24 | --- 25 | Satisfied in distance/logic within a given contig pair (pre-scaffold): 146175 26 | Unsatisfied in distance within a given contig pair (i.e. calculated distances out-of-bounds): 28 27 | --- 28 | Total satisfied: 665876 unsatisfied: 266 29 | 30 | 31 | Estimated insert size statistics (based on 519934 pairs): 32 | Mean insert size = 215 33 | Median insert size = 215 34 | REPEATS: 35 | Number of repeated edges = 24 36 | ------------------------------------------------------------ 37 | 38 | ################################################################################ 39 | 40 | SUMMARY: 41 | ------------------------------------------------------------ 42 | Inserted contig file; 43 | Total number of contigs = 595 44 | Sum (bp) = 4545610 45 | Total number of N's = 0 46 | Sum (bp) no N's = 4545610 47 | Max contig size = 67081 48 | Min contig size = 100 49 | Average contig size = 7639 50 | N50 = 18242 51 | 52 | After scaffolding lib1: 53 | Total number of scaffolds = 127 54 | Sum (bp) = 4545129 55 | Total number of N's = 5518 56 | Sum (bp) no N's = 4539611 57 | Max scaffold size = 268578 58 | Min scaffold size = 100 59 | Average scaffold size = 35788 60 | N50 = 94525 61 | 62 | ------------------------------------------------------------ 63 | -------------------------------------------------------------------------------- /example/libraries.txt: -------------------------------------------------------------------------------- 1 | lib1 SRR001665_1.fastq SRR001665_2.fastq 200 0.75 FR 
--------------------------------------------------------------------------------
/tools/TQS.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

__doc__ = """
TQS

Trim Quality Solexa Sequences (TQS)

SYNOPSIS
Quality trim solexa-Illumina sequence reads using user-defined thresholds
"""
__author__ = "Rene L. Warren"
__version__ = '1.0'

#LICENSE
# Copyright (c) 2007 Canada's Michael Smith Genome Science Centre. All rights reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# NOTE: this script is written for Python 2 (print statements,
# "except Exception, e", dict.has_key).

import sys, os, re, string
from datetime import datetime
from optparse import OptionParser


def main():
    """
    Command-line entry point.

    Reads the whole sequence (-f) and quality (-q) files into memory, opens
    the output fasta and log files, validates -l and -c, then calls
    parseQualFile() followed by readNTrim().
    """
    usage = "Usage: %s --help"

    parser = OptionParser()
    parser.add_option("-f", "--sequence file", dest="seqfile",
                      help="Illumina sequence file - Output format from the 1G Genome Analyzer (_seq.txt): 7 1 255 669 AACCCCCACTCCTACAACGCCATCATTCCCCTCGAC",)
    parser.add_option("-q", "--qual file", dest="qualfile",
                      help="A prb file containing all the Illumina intensities, as outputted by the 1G Genome Analyzer (_prb.txt)",)
    parser.add_option("-l", "--length", dest="mer", type="int", default=36,
                      help="Length of sequence reads (i.e. Number of sequencing cycles, default=36)",)
    parser.add_option("-t", "--threshold", dest="threshold", type="int", default=5,
                      help="Base intensity threshold value (-40 to 40, default=5)",)
    parser.add_option("-d", "--difference", dest="diff", type="int", default=5,
                      help="Base intensity difference between top intensity and second best (1 to 80, default=5)",)
    parser.add_option("-c", "--consec", dest="consec", type="int", default=20,
                      help="Minimum number of consecutive bases passing threshold values (default=20)",)
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      help="Runs in Verbose mode.",)
    (opts, args) = parser.parse_args()

    # Both input files are read completely into memory as lists of lines.
    try:
        f = open(opts.seqfile)
        seq = f.readlines()
        f.close()
    except Exception, e:
        print "ERROR: Could not read from %s: %s" % (opts.seqfile, e)
        print usage % (sys.argv[0:])
        sys.exit()

    try:
        f = open(opts.qualfile)
        qual = f.readlines()
        f.close()
    except Exception, e:
        print "ERROR: Could not read from %s: %s" % (opts.qualfile, e)
        print usage % (sys.argv[0:])
        sys.exit()


    # Output names are derived from the sequence file name and the thresholds.
    fasta = "%s_I%sD%sL%s.trim.fa" % (opts.seqfile,opts.threshold,opts.diff,opts.consec)
    log = "%s.log" % opts.seqfile


    try:
        FASTA = open(fasta, 'w')
    except:
        print "ERROR: Can not write to %s" % fasta
        sys.exit()

    try:
        LOG = open(log, 'w')
    except:
        print "ERROR: Can not write to %s" % log
        sys.exit()


    # NOTE(review): these range checks run after the output files have
    # already been created.
    if opts.mer < 15 or opts.mer > 200:
        print "ERROR: -l must be a number between 15 and 200."
        sys.exit()

    if opts.consec < 16 or opts.consec > opts.mer:
        print "ERROR: -c must be a number between 16 and -l."
        sys.exit()

    LOG.write("""
Running:
%s
-f %s
-q %s
-l %s
-c %s
-t %s
-d %s
Fasta file: %s

""" % (sys.argv[0:],opts.seqfile, opts.qualfile, opts.mer, opts.consec, opts.threshold, opts.diff, fasta))

    t0 = datetime.now()
    LOG.write("\nReading Quality File: %s\n" % str(t0)[:len('2006-10-05 23:04')])
    trim_info = parseQualFile(opts.threshold, opts.diff, opts.consec, opts.mer, qual, opts.verbose, LOG)
    t1 = datetime.now()
    LOG.write("\n\nTrimming low quality bases: %s\n" % str(t1)[:len('2006-10-05 23:04')])
    readNTrim(trim_info, seq, opts.verbose, FASTA, LOG)
    LOG.write("DNA sequences have been trimmed accordingly and placed in %s" % fasta)

    LOG.close()
    FASTA.close()
    return

#--------------------------------------------------------------------------------------
def parseQualFile(threshold, difference, consecutive, read_length, qual, verbose, LOG):
    """
    Parse a solexa-illumina intensity file

    Return a Dictionary of sequence order number, with the index value and length to extract

    Each line of qual is one read: tab-separated groups of whitespace-separated
    integers, one group per position.  A position passes when its highest
    value is >= threshold and exceeds the second highest by >= difference.
    For every read with a run of at least `consecutive` passing positions,
    trim_info[read_number] holds the 'start' and 'end' offsets of the first
    such run (read_number is 1-based).  A summary line is written to LOG.
    """
    trim_info = {}
    ok_read = 0
    read_number = 0

    if verbose:
        print "Printing trimming pattern for all reads passing the set threshold values...\n"

    for line in qual:
        read_number += 1 ### this keeps track of the read order, respected between the prb and seq files
        concat = "" ### concat builds a string of bases passing the user-defined filter
        quartets = line.split("\t") ### split quartet (4 number per position)
        for quartet in quartets: ### cycle through each quartet
            quad = (quartet.split())
            quadint = []
            for basequal in quad: ### each intensity/number for each position
                quadint.append(int(basequal))
            ### sort descending: quadint[0] is the best value, quadint[1] the runner-up
            quadint.sort()
            quadint.reverse()
            basediff = quadint[0] - quadint[1]
            #print "T=%i D=%i" % (quadint[0],basediff)

            ### "x" marks a failing position, "-" a passing one
            if quadint[0] < threshold or basediff < difference:
                concat += "x"
            else:
                concat += "-"

        ### first run of at least `consecutive` (at most read_length) passing positions
        head_match_regex = re.compile("\-{%i,%i}" % (consecutive,read_length))
        head_match = head_match_regex.search(concat)
        if head_match != None:
            ok_read += 1
            col = head_match.span()
            if not trim_info.has_key(read_number):
                trim_info[read_number] = {}

            start = int(col[0])
            end = int(col[1])

            trim_info[read_number]['start'] = start
            trim_info[read_number]['end'] = end

            if verbose:
                sub = concat[trim_info[read_number]['start']:trim_info[read_number]['end']]
                ### NOTE(review): the value printed after "length:" is the end offset, not a length
                print "passed seqs:%i line#%i %s (start trim:%i,length:%i) %s\n" % (ok_read, read_number, concat, start, end, sub)

    LOG.write("%i out of %i sequences passed your filter (I >= %i and D >= %i and L >= %i)\n" % (ok_read, read_number, threshold, difference, consecutive))

    return trim_info


#--------------------------------------------------------------------------------------
def readNTrim(trim_info, seq, verbose, FASTA, LOG):

    """
    Parse a solexa/illumina sequence file and trim DNA sequence based user-defined intensity threshold/differences
    """


    read_number = 0
    gDNAlinker_count = 0
    usable_reads = 0

    dna_sequence_field = re.compile('^[ACTG]+$')
    gDNAlinker1_field = re.compile('^ATCCCC[GA]A')
    gDNAlinker2_field = re.compile('^ATCTAACAG')

    if verbose:
        print "Printing trimmed sequences for all reads passing the set threshold values minus, excluding sequence containing linkers...\n"

    for line in seq:
        read_number += 1 ### tracks read number / will match order in prb file
        line = line.rstrip('\r\n')
        info = line.split("\t") ### split line, the seq file lists: lane tile xcoord y coord DNAseq
        dna_string = info[4]

        if
trim_info.has_key(read_number): 203 | trim_seq = dna_string[trim_info[read_number]['start']:trim_info[read_number]['end']] 204 | if re.match(dna_sequence_field, trim_seq): ### no ambiguous bases? 205 | if re.match(gDNAlinker1_field, trim_seq) or re.match(gDNAlinker2_field,trim_seq): ### matches gDNA linker? 206 | gDNAlinker_count += 1 207 | else: 208 | usable_reads += 1 209 | FASTA.write(">%s-%s-%s-%s\n%s\n" % (info[0],info[1],info[2],info[3],trim_seq)) 210 | if verbose: 211 | print "line#%i %s (start trim:%i,length:%i) %s" % (read_number,info[4],trim_info[read_number]['start'],trim_info[read_number]['end'],trim_seq) 212 | LOG.write("%i out of %i sequences appear to be usable, after filtering out sequences hard-coded in this program * %i gDNA linker sequences*\n" % (usable_reads, read_number,gDNAlinker_count)) 213 | return 214 | 215 | if __name__ == '__main__': 216 | main() 217 | import time 218 | sys.exit() 219 | -------------------------------------------------------------------------------- /tools/TQS.readme: -------------------------------------------------------------------------------- 1 | 2 | TQS 3 | 4 | Trim Quality Solexa-Illumina Sequences (TQS) 5 | 6 | SYNOPSIS 7 | Quality trim solexa-Illumina sequence reads using user-defined thresholds 8 | """ 9 | __author__ = "Rene L. Warren" 10 | __version__ = '1.0' 11 | 12 | #LICENSE 13 | # Copyright (c) 2007 Canada's Michael Smith Genome Science Centre. All rights reserved. 14 | 15 | # This program is free software; you can redistribute it and/or 16 | # modify it under the terms of the GNU General Public License 17 | # as published by the Free Software Foundation; either version 2 18 | # of the License, or (at your option) any later version. 19 | 20 | # This program is distributed in the hope that it will be useful, 21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 | # GNU General Public License for more details. 
24 | 25 | Execution example 26 | ================== 27 | python TQS.py -f test_seq.txt -q test_prb.txt -l 36 -t 5 -d 5 -c 20 28 | 29 | 30 | Options 31 | ======= 32 | python TQS.py --help 33 | 34 | Usage: TQS.py [options] 35 | 36 | Options: 37 | -h, --help show this help message and exit 38 | -f SEQFILE, --sequence file=SEQFILE 39 | Illumina sequence file - Output format from the 1G 40 | Genome Analyzer (_seq.txt): 41 | 7 1 255 669 42 | AACCCCCACTCCTACAACGCCATCATTCCCCTCGAC 43 | -q QUALFILE, --qual file=QUALFILE 44 | A prb file containing all the Illumina intensities, as 45 | outputted by the 1G Genome Analyzer (_prb.txt) 46 | -l MER, --length=MER Length of sequence reads (i.e. Number of sequencing 47 | cycles, default=36) 48 | -t THRESHOLD, --threshold=THRESHOLD 49 | Base intensity threshold value (-40 to 40, default=5) 50 | -d DIFF, --difference=DIFF 51 | Base intensity difference between top intensity and 52 | second best (1 to 80, default=5) 53 | -c CONSEC, --consec=CONSEC 54 | Minimum number of consecutive bases passing threshold 55 | values (default=20) 56 | -v, --verbose Runs in Verbose mode. 57 | 58 | 59 | Output 60 | ====== 61 | 62 | .log file: Indicates the option chosen and tracks the execution time 63 | .fa file: A single fasta file containing the sequence reads that passed the filter specified 64 | -------------------------------------------------------------------------------- /tools/TQSexport.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | __doc__ = """ 4 | TQS 5 | 6 | Trim Quality Solexa Sequences (TQS) 7 | 8 | SYNOPSIS 9 | Quality trim solexa-Illumina sequence reads using user-defined thresholds 10 | """ 11 | __author__ = "Rene L. Warren" 12 | __version__ = '1.0' 13 | 14 | #LICENSE 15 | # Copyright (c) 2007 Canada's Michael Smith Genome Science Centre. All rights reserved. 
16 | 17 | # This program is free software; you can redistribute it and/or 18 | # modify it under the terms of the GNU General Public License 19 | # as published by the Free Software Foundation; either version 2 20 | # of the License, or (at your option) any later version. 21 | 22 | # This program is distributed in the hope that it will be useful, 23 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | # GNU General Public License for more details. 26 | 27 | import sys, os, re, string, math 28 | from datetime import datetime 29 | from optparse import OptionParser 30 | 31 | 32 | def main(): 33 | usage = "Usage: %s --help" 34 | 35 | parser = OptionParser() 36 | parser.add_option("-f", "--export file", dest="exportfile", 37 | help="Illumina export file - Output format from the Genome Analyzer",) 38 | parser.add_option("-t", "--Phred quality threshold", dest="threshold", type="int", default=10, 39 | help="Base intensity threshold value (Phred quality scores 0 to 40, default=10)",) 40 | parser.add_option("-c", "--consec", dest="consec", type="int", default=20, 41 | help="Minimum number of consecutive bases passing threshold values (default=20)",) 42 | parser.add_option("-v", "--verbose", dest="verbose", action="store_true", 43 | help="Runs in Verbose mode.",) 44 | (opts, args) = parser.parse_args() 45 | 46 | try: 47 | f = open(opts.exportfile) 48 | seq = f.readlines() 49 | f.close() 50 | except Exception, e: 51 | print "ERROR: Could not read from %s: %s" % (opts.exportfile, e) 52 | print usage % (sys.argv[0:]) 53 | sys.exit() 54 | 55 | 56 | fasta = "%s_T%sC%s.trim.fa" % (opts.exportfile,opts.threshold,opts.consec) 57 | log = "%s.log" % opts.exportfile 58 | minimum_length = 15 59 | 60 | 61 | try: 62 | FASTA = open(fasta, 'w') 63 | except: 64 | print "ERROR: Can not write to %s" % fasta 65 | sys.exit() 66 | 67 | try: 68 | LOG = open(log, 'w') 69 | except: 70 | print "ERROR: Can not write 
to %s" % log 71 | sys.exit() 72 | 73 | if opts.consec < minimum_length: 74 | print "ERROR: -c must be a number larger than %i." % (minimum_length) 75 | sys.exit() 76 | 77 | LOG.write(""" 78 | Running: 79 | %s 80 | -f %s 81 | -c %s 82 | -t %s 83 | Fasta file: %s 84 | 85 | """ % (sys.argv[0:],opts.exportfile, opts.consec, opts.threshold, fasta)) 86 | 87 | t1 = datetime.now() 88 | LOG.write("\n\nTrimming low quality bases: %s\n" % str(t1)[:len('2006-10-05 23:04')]) 89 | readNtrim(seq, opts.threshold, opts.consec, opts.verbose, FASTA, LOG) 90 | LOG.write("DNA sequences have been trimmed accordingly and placed in %s" % fasta) 91 | 92 | LOG.close() 93 | FASTA.close() 94 | return 95 | 96 | #-------------------------------------------------------------------------------------- 97 | def readNtrim(export, threshold, consecutive, verbose, FASTA, LOG): 98 | """ 99 | Parse a solexa-illumina export file 100 | SOLEXA3_77_30V9CAAXX 4 1 1068 522 1 GGACAGCTGACAGCTGTTAAGAAGGACCCTATGTTAAAGGAAATGGATAC YYYYYYYYYYYJYY 101 | YYYYRYYYYYYYYYYYTTTTTOOOMOOOMMOOOOOG chr13 36311743 F 50 52 121 187 R N 102 | Return a Dictionary of sequence order number, with the index value and length to extract 103 | """ 104 | trim_info = {} 105 | ok_read = 0 106 | read_number = 0 107 | 108 | if verbose: 109 | print "Printing trimming pattern for all reads passing the set threshold values...\n" 110 | 111 | for line in export: 112 | read_number += 1 113 | concat = "" ### concat builds a string of bases passing the user-defined filter 114 | info = line.split() ### split info 115 | illumina_encoded_qual = list(info[7]) 116 | """ 117 | print "line%s\tseq:%s\tqual:%s\n" % (line,info[6],info[7]) 118 | """ 119 | pos = 0 120 | for illumina_qual in illumina_encoded_qual: 121 | pos += 1 122 | Q = 10 * math.log(1 + 10 ** ((ord(illumina_qual) - 64) / 10.0)) / math.log(10) 123 | if Q < threshold: 124 | concat += "x" 125 | else: 126 | concat += "-" 127 | """ 128 | print "base#%i. 
Illumina qual (%s) == phredQ (%i)\n" % (pos,illumina_qual,Q) 129 | """ 130 | 131 | seq_len = len(info[6]) 132 | head_match_regex = re.compile("\-{%i,%i}" % (consecutive, seq_len)) 133 | head_match = head_match_regex.search(concat) 134 | if head_match != None: 135 | ok_read += 1 136 | col = head_match.span() 137 | if not trim_info.has_key(read_number): 138 | trim_info[read_number] = {} 139 | 140 | start = int(col[0]) 141 | end = int(col[1]) 142 | 143 | pair = "" 144 | if info[5] == "1": 145 | pair = "a" 146 | elif info[5] == "2": 147 | pair = "b" 148 | 149 | trim_seq = info[6][start:end] 150 | FASTA.write(">%s-%s-%s-%s%s\n%s\n" % (info[1],info[2],info[3],info[4],pair,trim_seq)) 151 | 152 | if verbose: 153 | print "passed seqs:%i line#%i %s (start trim:%i,end trim:%i) %s\n" % (ok_read, read_number, concat, start, end, trim_seq) 154 | 155 | LOG.write("%i out of %i sequences passed your filter (-t >= %i and -c >= %i)\n" % (ok_read, read_number, threshold, consecutive)) 156 | 157 | return 158 | 159 | 160 | 161 | if __name__ == '__main__': 162 | main() 163 | import time 164 | sys.exit() 165 | -------------------------------------------------------------------------------- /tools/TQSfastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | __doc__ = """ 4 | TQS 5 | 6 | Trim Quality Sequences (TQS) 7 | 8 | SYNOPSIS 9 | Quality trim FASTQ sequence reads using user-defined thresholds 10 | """ 11 | __author__ = "Rene L. Warren" 12 | __version__ = 'fastq' 13 | 14 | #LICENSE 15 | # Copyright (c) 2007 Canada's Michael Smith Genome Science Centre. All rights reserved. 16 | 17 | # This program is free software; you can redistribute it and/or 18 | # modify it under the terms of the GNU General Public License 19 | # as published by the Free Software Foundation; either version 2 20 | # of the License, or (at your option) any later version. 
21 | 22 | # This program is distributed in the hope that it will be useful, 23 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | # GNU General Public License for more details. 26 | 27 | # Modified by Lance Parsons at Princton University's Lewis-Sigler Institute for Integrative Genomics 28 | # Adapted to trim "standard" FASTQ files (PHRED+33) 29 | 30 | import sys, os, re, string, math 31 | from datetime import datetime 32 | from optparse import OptionParser 33 | 34 | 35 | def main(): 36 | usage = "Usage: %s --help" 37 | 38 | parser = OptionParser() 39 | parser.add_option("-f", "--fastq file", dest="fastqfile", 40 | help="fastq (fq) file - standard (ASCII+33) encoded PHRED quality scores / illumina (ASCII+64) encoded PHRED quality scores",) 41 | parser.add_option("-t", "--Phred quality threshold", dest="threshold", type="int", default=10, 42 | help="Base intensity threshold value (Phred quality scores 0 to 40, default=10)",) 43 | parser.add_option("-c", "--consec", dest="consec", type="int", default=20, 44 | help="Minimum number of consecutive bases passing threshold values (default=20)",) 45 | parser.add_option("-e", "--ASCII encoding type: 33 or 64", dest="encoding", type="int", default=64, 46 | help="Type of ASCII encoding: 33 (standard) or 64 (illumina) (default=64)",) 47 | parser.add_option("-v", "--verbose", dest="verbose", action="store_true", 48 | help="Runs in Verbose mode.",) 49 | (opts, args) = parser.parse_args() 50 | 51 | try: 52 | f = open(opts.fastqfile) 53 | seq = f.readlines() 54 | f.close() 55 | except Exception, e: 56 | print "ERROR: Could not read from %s: %s" % (opts.fastqfile, e) 57 | print usage % (sys.argv[0:]) 58 | sys.exit() 59 | 60 | 61 | fasta = "%s_T%sC%sE%s.trim.fa" % (opts.fastqfile, opts.threshold, opts.consec, opts.encoding) 62 | log = "%s.log" % opts.fastqfile 63 | minimum_length = 15 64 | 65 | 66 | try: 67 | FASTA = open(fasta, 'w') 68 | 
except: 69 | print "ERROR: Can not write to %s" % fasta 70 | sys.exit() 71 | 72 | try: 73 | LOG = open(log, 'w') 74 | except: 75 | print "ERROR: Can not write to %s" % log 76 | sys.exit() 77 | 78 | if opts.consec < minimum_length: 79 | print "ERROR: -c must be a number larger than %i." % (minimum_length) 80 | sys.exit() 81 | 82 | if opts.encoding != 33 and opts.encoding != 64: 83 | print "ERROR: -e must be either 33 or 64." 84 | sys.exit() 85 | 86 | LOG.write(""" 87 | Running: 88 | %s 89 | -f %s 90 | -c %s 91 | -t %s 92 | -e %s 93 | Fasta file: %s 94 | 95 | """ % (sys.argv[0:], opts.fastqfile, opts.consec, opts.threshold, opts.encoding, fasta)) 96 | 97 | t1 = datetime.now() 98 | LOG.write("\n\nTrimming low quality bases: %s\n" % str(t1)[:len('2006-10-05 23:04')]) 99 | readNtrim(seq, opts.threshold, opts.consec, opts.encoding, opts.verbose, FASTA, LOG) 100 | LOG.write("DNA sequences have been trimmed accordingly and placed in %s" % fasta) 101 | 102 | LOG.close() 103 | FASTA.close() 104 | return 105 | 106 | #-------------------------------------------------------------------------------------- 107 | def readNtrim(fastq, threshold, consecutive, encoding, verbose, FASTA, LOG): 108 | """ 109 | Return a Dictionary of sequence order number, with the index value and length to extract 110 | """ 111 | trim_info = {} 112 | ok_read = 0 113 | read_number = 0 114 | record_line = 0 115 | 116 | if verbose: 117 | print "Printing trimming pattern for all reads passing the set threshold values...\n" 118 | 119 | for line in fastq: 120 | record_line += 1 121 | if record_line == 1: 122 | read_id = line.strip() 123 | elif record_line == 2: 124 | seq = line.strip() 125 | elif record_line == 3: 126 | qual_id = line.strip() 127 | elif record_line == 4: 128 | record_line = 0 129 | qual = line.strip() 130 | read_number += 1 131 | concat = "" ### concat builds a string of bases passing the user-defined filter 132 | """ 133 | print "line%s\tseq:%s\tqual:%s\n" % (line,info[6],info[7]) 134 | """ 
135 | pos = 0 136 | for qual_char in qual: 137 | Q = (ord(qual_char) - encoding) 138 | pos += 1 139 | if Q < threshold: 140 | concat += "x" 141 | else: 142 | concat += "-" 143 | """ 144 | print "base#%i. Illumina qual (%s) == phredQ (%i)\n" % (pos,illumina_qual,Q) 145 | """ 146 | 147 | seq_len = len(seq) 148 | head_match_regex = re.compile("\-{%i,%i}" % (consecutive, seq_len)) 149 | head_match = head_match_regex.search(concat) 150 | if head_match != None: 151 | ok_read += 1 152 | col = head_match.span() 153 | if not trim_info.has_key(read_number): 154 | trim_info[read_number] = {} 155 | 156 | start = int(col[0]) 157 | end = int(col[1]) 158 | 159 | trim_seq = seq[start:end] 160 | FASTA.write(">%s\n%s\n" % (read_id, trim_seq)) 161 | 162 | if verbose: 163 | print "%s\n%s\n%s\n passed seqs:%i line#%i %s (start trim:%i,end trim:%i) %s\n" % (read_id,seq,qual,ok_read, read_number, concat, start, end, trim_seq) 164 | 165 | LOG.write("%i out of %i sequences passed your filter (-t >= %i and -c >= %i)\n" % (ok_read, read_number, threshold, consecutive)) 166 | 167 | return 168 | 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | import time 174 | sys.exit() 175 | -------------------------------------------------------------------------------- /tools/TRIMMING_PAIRED_READS.README: -------------------------------------------------------------------------------- 1 | December 2008/February 2010 2 | Rene Warren 3 | rwarren at bcgsc dot ca 4 | warrenlr at gmail dot com 5 | 6 | #-------------------------------- 7 | To trim reads using fastq as input, run TQSfastq.py on both PE file: 8 | *Make sure you know whether your fastq file qual score were encoded ASCII+33 (standard) or ASCII+64 (illumina) 9 | 10 | for options run: 11 | ./TQSfastq.py --help 12 | 13 | Usage: TQSfastq.py [options] 14 | 15 | Options: 16 | -h, --help show this help message and exit 17 | -f FASTQFILE, --fastq file=FASTQFILE 18 | fastq (fq) file - standard (ASCII+33) encoded PHRED 19 | quality scores / 
illumina (ASCII+64) encoded PHRED 20 | quality scores 21 | -t THRESHOLD, --Phred quality threshold=THRESHOLD 22 | Base intensity threshold value (Phred quality scores 0 23 | to 40, default=10) 24 | -c CONSEC, --consec=CONSEC 25 | Minimum number of consecutive bases passing threshold 26 | values (default=20) 27 | -e ENCODING, --ASCII encoding type: 33 or 64=ENCODING 28 | Type of ASCII encoding: 33 (standard) or 64 (illumina) 29 | (default=64) 30 | -v, --verbose Runs in Verbose mode. 31 | 32 | 33 | e.g. 34 | ./qseq2fastq.pl s_3_1_0048_qseq.txt > s_3_1_0048_qseq.txt.fq 35 | ./qseq2fastq.pl s_3_2_0048_qseq.txt > s_3_2_0048_qseq.txt.fq 36 | ./TQSfastq.py -f s_3_1_0048_qseq.txt.fq -t 20 -c 36 -e 64 37 | ./TQSfastq.py -f s_3_2_0048_qseq.txt.fq -t 20 -c 36 -e 64 38 | 39 | To join both for SSAKE's paired-end input, run: 40 | ./makePairedOutput2UNEQUALfiles.pl s_3_1_0048_qseq.txt.fq_T20C36E64.trim.fa s_3_2_0048_qseq.txt.fq_T20C36E64.trim.fa 41 | 42 | This will create 2 files: paired.fa and unpaired.fa 43 | Run SSAKE: SSAKE -f paired.fa -g unpaired.fa -p 1 44 | 45 | #-------------------------------- 46 | *For those not interested in trimming their reads, but interested in joining 2 equal-record fasta files: 47 | 48 | ./qseq2fasta.pl s_3_1_0048_qseq.txt > file1.fa 49 | ./qseq2fasta.pl s_3_2_0048_qseq.txt > file2.fa 50 | ./makePairedOutput2EQUALfiles.pl file1.fa file2.fa 51 | 52 | #-------------------------------- 53 | Many of you asked me whether you could trim paired-end (PE) Illumina reads with TQS.py 54 | The answer is yes. However, I never got a chance to conjure a formal script for PE reads - But I hacked 55 | a work-around trimming paired reads: 56 | 57 | 58 | 59 | 1. run splitInput.pl (supplied in the ./tools directory) where all *_seq.txt and *_prb.txt are located: 60 | 61 | Usage: ./splitInput.pl <# Illumina cycles (read length)> 62 | 63 | 64 | 2. 
run this perl one-liner to make a shell script: 65 | 66 | ls -la | perl -ne 'if(/(s_\d+_\d+)_seq.txt.new/){print "TQS.py -f $1_seq.txt.new -q $1_prb.txt.new -l * -t * -d * -c *\n"}' > runTQS.sh 67 | *replace stars with your values 68 | 69 | 70 | 3. Run the shell script the above command created (it will run TQS.py on ALL tiles, sequentially; why not farm the job out on a compute cluster?!) 71 | 72 | chmod 755 runTQS.sh 73 | ./runTQS.sh 74 | 75 | 76 | 4. Concatenate all trimmed reads 77 | 78 | cat s*.trim.fa > all_raw.fa 79 | 80 | 81 | 5. Make the paired output (that will become the input for ssake3.2.1 -f) 82 | using the 2nd script supplied in this directory. 83 | 84 | ./makePairedOutput.pl all_raw.fa 85 | 86 | 87 | As always, feel free to contact me if you have any questions. 88 | Rene 89 | -------------------------------------------------------------------------------- /tools/estimate_insert_size.pl: -------------------------------------------------------------------------------- 1 | ################################################################################################################### 2 | #Marten Boetzer BaseClear B.v. 14-07-2011 # 3 | #SSPACE perl subscript samToTab_multi.pl # 4 | #This script; # 5 | # -Estimates median insert size by mapping paired-reads on contigs # 6 | # It goes through each contig and maps both reads, if a pair is mapped, # 7 | # the orientation and insert size is estimated. # 8 | # If sufficient pairs (given by the user) are found, the median insert size is # 9 | # estimated, as well as a file with the distribution is generated which can be # 10 | # used to visualize the insert size distribution. # 11 | # # 12 | # To run this script; # 13 | # perl estimate_insert_size.pl <contigfile> <readfile1> <readfile2> <numpairs> <orientation> # 14 | # 15 | # Output is the median insert size and a file with distribution of the insert size. Also, number of pairs for # 16 | # each found orientation (FR, RF, FF and RR) are given. 
# 17 | ###################################################################################################################

use File::Path;
use strict;
use warnings;

# Positional arguments: contig FASTA, first read file, second read file,
# number of pairs to collect (optional), expected pair orientation (optional).
die "Usage: perl $0 <contigfile> <readfile1> <readfile2> [numpairs] [orientation]\n" if(@ARGV < 3);
my ($contigfile, $fileA, $fileB, $numpairs, $orientation) = @ARGV;

die "ERROR: Can't find contig file: $contigfile -- fatal\n" if(! -e $contigfile);
die "ERROR: Can't find read file: $fileA -- fatal\n" if(! -e $fileA);
die "ERROR: Can't find read file: $fileB -- fatal\n" if(! -e $fileB);
if(!defined($numpairs) || $numpairs eq ''){
  print "WARNING: No number of pairs are given, using 10000 pairs instead\n";
  $numpairs = 10000;
}
if(!defined($orientation) || $orientation eq ''){
  print "WARNING: No orientation of the pairs is given, using orientation FR instead\n";
  $orientation = "FR";
}
# Test the pattern first so a non-numeric value never reaches a numeric comparison.
die "ERROR: You've inserted $numpairs, which does not seem to be an valid number. Exiting.\n" if($numpairs !~ /^\d+$/ || !($numpairs>0));
die "ERROR: Orientation must have length of 2 characters and should contain one of the following; FR, FF, RR or RF. You've inserted orientation of $orientation ...Exiting.\n" if($orientation !~ /^[FR][FR]$/);

print "\n";
my $paircount = 0;                          # mapped pairs seen so far (all orientations)
my ($direction, $insertsize) = ({}, {});    # orientation => count, insert size => count
mkpath('bowtieoutput');

# Scan the contig file and remember only the longest contig: it is the single
# reference the reads are mapped against.
open (my $contig_fh, '<', $contigfile) || die "Can't open contig file $contigfile\n";
my ($seq, $name) = ("", "");
my ($maxctg, $maxseq, $maxname) = (0, "", "");
my $contignum = 0;
while (my $line = <$contig_fh>) {
  chomp $line;
  if ($line =~ /^\>(\S+)/) {
    my $nextname = $1;
    if($seq ne ""){
      $contignum++;
      ($maxctg, $maxseq, $maxname) = (length($seq), $seq, $name) if(length($seq) > $maxctg);
    }
    ($name, $seq) = ($nextname, "");
  }
  else {
    $seq .= $line;
  }
}
close $contig_fh;
if($seq ne ""){     # the last contig has no following header to close it
  $contignum++;
  ($maxctg, $maxseq, $maxname) = (length($seq), $seq, $name) if(length($seq) > $maxctg);
}
die "ERROR: No contig sequences found in $contigfile -- fatal\n" if($contignum == 0);

print "now at contig $maxname = size".length($maxseq)."\n";
open (my $bowcont_fh, '>', "bowtieoutput/bowtie_input.fa") || die "Can't write to bowtieoutput/bowtie_input.fa -- fatal\n";
print $bowcont_fh ">$maxname\n$maxseq\n";
close $bowcont_fh or die "Can't write to bowtieoutput/bowtie_input.fa -- fatal\n";
($paircount) = mapWithBowtie($contignum,"bowtieoutput/bowtie_input.fa", $fileA, $fileB);

foreach my $d (keys %$direction){
  print "direction $d is found $direction->{$d} times\n";
}

# Only pairs in the requested orientation are stored in %$insertsize, so the
# median position must be derived from that count, not from every mapped pair.
my $matched = 0;
$matched += $_ for (values %$insertsize);
my $median_bin = int($matched/2);
my ($median_ins, $median_found, $record) = (0, 0, 0);
open (my $csv_fh, '>', "distribution.txt") || die "Can't open distribution.txt for writing -- fatal";
foreach my $is (sort {$a<=>$b} keys %$insertsize){
  $record += $insertsize->{$is};
  if(!$median_found && $record >= $median_bin){
    ($median_ins, $median_found) = ($is, 1);
  }
  print $csv_fh "$is\t$insertsize->{$is}\n";
}
close $csv_fh or die "Can't write to distribution.txt -- fatal";

print "\nmedian = $median_ins\n\nSee the distribution in file 'distribution.txt'\n";


###BUILDS A BOWTIE INDEX OF THE CONTIG AND MAPS THE READ PAIRS ON IT IN BATCHES
###Returns the number of mapped pairs; stops as soon as $numpairs pairs are found.
sub mapWithBowtie{
  my ($fname,$contig, $fileA, $fileB) = @_;
  my $bowtieout = "contig$fname.bowtieIndex";
  my $readfile = "bowtieoutput/bowtiein.$fname.fa";
  # List form: file names are passed as-is, never through a shell.
  system("bowtie-build", $contig, "bowtieoutput/$bowtieout", "--quiet", "--noref") == 0 || die "\nBowtie-build error; $?"; # returns exit status values

  # A first line starting with '@' means FASTQ (4 lines per read), otherwise
  # FASTA with one sequence line per read is assumed.
  open(my $test_fh, '<', $fileA) || die "Can't open read file $fileA -- fatal\n";
  my $firstline = <$test_fh>;
  close $test_fh;
  my $fastq = (defined($firstline) && $firstline =~ /^[@]/) ? 1 : 0;

  open(my $fhA, '<', $fileA) || die "Can't open read file $fileA -- fatal\n";
  open(my $fhB, '<', $fileB) || die "Can't open read file $fileB -- fatal\n";

  my $count = 0;      # running pair number, gives both mates the same read name
  my $batched = 0;    # pairs written to the current batch file
  open (my $bowin, '>', $readfile) || die "Can't write to single file $readfile-- fatal\n";
  while(defined(my $headerA = <$fhA>)) {
    my $headerB = <$fhB>;
    my $seq1 = <$fhA>;
    my $seq2 = <$fhB>;
    last if(!defined($seq1) || !defined($seq2));   # truncated file or unequal number of reads
    chomp($seq1, $seq2);
    #FASTQ FORMAT: skip the '+' and quality lines of both mates
    if($fastq){
      scalar(<$fhA>) for (1..2);
      scalar(<$fhB>) for (1..2);
    }
    $count++;
    $batched++;

    # Every header and sequence needs its own line to be valid FASTA.
    print $bowin ">read$count\n$seq1\n>read$count\n$seq2\n";
    if($batched > $numpairs){
      close $bowin or die "Can't write to single file $readfile-- fatal\n";
      _countMappedPairs($bowtieout, $readfile);
      if($paircount >= $numpairs){
        close $fhA;
        close $fhB;
        return $paircount;
      }
      # Start a fresh batch: truncate the batch file and reset the batch counter.
      open ($bowin, '>', $readfile) || die "Can't write to single file $readfile-- fatal\n";
      $batched = 0;
    }
  }
  close $bowin or die "Can't write to single file $readfile-- fatal\n";
  # Map whatever is left when the read files end before a batch is full.
  _countMappedPairs($bowtieout, $readfile) if($batched > 0);
  close $fhA;
  close $fhB;
  print "count = $paircount\n";
  return $paircount;
}

###MAPS ONE BATCH FILE WITH BOWTIE AND UPDATES $paircount, $direction AND $insertsize
###Two consecutive output lines with the same read name form a mapped pair.
sub _countMappedPairs{
  my ($bowtieout, $readfile) = @_;
  # With --suppress 6,7 the tab-separated columns used below are:
  # 0 read name, 1 strand, 3 offset on the contig, 4 read sequence.
  open(my $in, '-|', "bowtie", "-p", 1, "-v", 0, "-m", 1, "bowtieoutput/$bowtieout", "--suppress", "6,7", "-f", $readfile, "--quiet") || die "Can't open bowtie output -- fatal\n";
  my ($prevread, $prevline) = ("", "");
  while(my $line = <$in>){
    my @t1 = split(/\t/,$line);
    if($prevread eq $t1[0]){
      $paircount++;
      my @t2 = split(/\t/,$prevline);
      my ($start1, $dir1) = _readStart(@t1[1,3,4]);
      my ($start2, $dir2) = _readStart(@t2[1,3,4]);
      # Name the orientation with the leftmost read first.
      $direction->{"$dir1$dir2"}++ if($start1 < $start2);
      $direction->{"$dir2$dir1"}++ if($start2 < $start1);
      my $diff = abs($start2-$start1);
      if($orientation eq "$dir1$dir2" || $orientation eq "$dir2$dir1"){
        $insertsize->{$diff}++;
      }
      last if($paircount >= $numpairs);
    }
    $prevread = $t1[0];
    $prevline = $line;
  }
  close $in;   # may report a failure after an early 'last'; the pairs are already counted
  return;
}

###RETURNS THE START COORDINATE AND DIRECTION (F/R) OF ONE MAPPED READ
###A '+' read starts at its offset (F); any other strand starts at offset + read length (R).
sub _readStart{
  my ($strand, $offset, $readseq) = @_;
  return ($offset, "F") if($strand eq "+");
  return ($offset + length($readseq), "R");
}

###PRINTS A COUNTER ON THE SCREEN AND OVERWRITES PREVIOUS LINE
sub CounterPrint{
  my $countingMessager = shift;
  print "\r$countingMessager";
  $|++;
}

-------------------------------------------------------------------------------- /tools/fq_all2std.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl -w 2 | 3 | # Author: lh3 4 | 5 | use strict; 6 | use warnings; 7 | use Getopt::Std; 8 | use FindBin qw($Bin); 9 | 10 | my $usage = qq( 11 | Usage: fq_all2std.pl 12 | 13 | Command: scarf2std Convert SCARF format to the standard/Sanger FASTQ 14 | fqint2std Convert FASTQ-int format to the standard/Sanger FASTQ 15 | sol2std Convert Solexa/Illumina FASTQ to the standard FASTQ 16 | fa2std Convert FASTA to the standard FASTQ 17 | fq2fa Convert various FASTQ-like format to FASTA 18 | sol2scarf Convert Solexa/Illumina FASTQ to the SCARF format 19 | qseq2srf Convert Solexa/Illumina qseq format to the SRF format 20 | qseqin2srf Convert Solexa/Illumina qseq + intensity/noise format to the SRF format 21 | instruction Explanation to different format 22 | example Show examples of various formats 23 | 24 | Note: Read/quality sequences MUST be presented in one line. 
25 | \n);

die($usage) if (@ARGV < 1);

# Solexa->Sanger quality conversion table
# Index i holds the Sanger character for Solexa quality (i - 64).
my @conv_table;
for (-64..64) {
  $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499));
}

# parsing command line
my $cmd = shift;
my %cmd_hash = (scarf2std=>\&scarf2std, fqint2std=>\&fqint2std, sol2std=>\&sol2std, fa2std=>\&fa2std,
                sol2scarf=>\&sol2scarf, fq2fa=>\&fq2fa, qseq2srf=>\&qseq2srf,
                qseqin2srf=>\&qseqin2srf, example=>\&example, instruction=>\&instruction);
if (exists($cmd_hash{$cmd})) {
  if ($cmd eq 'qseq2srf' || $cmd eq 'qseqin2srf') {
    # $cmd has been shifted off already, so the file argument is now $ARGV[0].
    # Each handler is called exactly once.
    $cmd_hash{$cmd}->($ARGV[0]);
  }
  else {
    $cmd_hash{$cmd}->();
  }
} else {
  die("** Unrecognized command $cmd");
}

# FASTA -> standard FASTQ; every base gets the same quality (-q, default 25).
sub fa2std {
  my %opts = (q=>25);
  getopts('q:', \%opts);
  my $q = chr($opts{q} + 33);
  warn("-- The default quality is set to $opts{q}. Use '-q' at the command line to change the default.\n");
  while (my $header = <>) {
    next unless ($header =~ /^>(\S+)/);
    print "\@$1\n";
    my $seq = <>;
    last unless defined($seq);
    # Measure the sequence without its line ending so the quality string has
    # the right length even when the last line lacks a newline.
    chomp $seq;
    print "$seq\n+\n", $q x length($seq), "\n";
  }
}

# FASTQ-like -> FASTA; keeps the name and the sequence line, skips the other two.
sub fq2fa {
  while (<>) {
    if (/^@(\S+)/) {
      print ">$1\n";
      $_ = <>; print;
      <>; <>;
    }
  }
}

# SCARF -> standard FASTQ; fields are ':' separated, the 7th holds
# whitespace-separated integer qualities.
sub scarf2std {
  while (my $line = <>) {
    my @t = split(':', $line);
    my $name = join('_', @t[0..4]);
    print "\@$name\n$t[5]\n+\n";
    my $qual = join('', map { $conv_table[$_+64] } split(/\s/, $t[6]));
    print "$qual\n";
  }
}

# FASTQ with integer qualities -> standard FASTQ.
sub fqint2std {
  while (<>) {
    if (/^@/) {
      print;
      $_ = <>; print; $_ = <>; $_ = <>;
      my $qual = join('', map { $conv_table[$_+64] } split);
      print "+\n$qual\n";
    }
  }
}

# Solexa/Illumina FASTQ -> standard FASTQ.
sub sol2std {
  while (<>) {
    if (/^@/) {
      print;
      $_ = <>; print; $_ = <>;
      my $qline = <>;
      last unless defined($qline);
      # Strip the line ending first: otherwise the newline itself is looked up
      # in the table and appends a spurious quality character.
      chomp $qline;
      my $qual = join('', map { $conv_table[ord($_)] } split('', $qline));
      print "+\n$qual\n";
    }
  }
}

# Solexa/Illumina FASTQ -> SCARF: name, sequence and qualities joined by ':'.
sub sol2scarf {
  my $counter = 0;
  while (<>) {
    chomp;
    if (/^@/) {
      my $line = substr($_,1);
      if ($counter==0) {
        print "$line:";
        $counter+=1;
      }
      else {
        print "\n$line:";
      }
    }
    elsif (/^[+]/) {
      print ":";
    }
    else {
      print "$_";
    }
  }
  print "\n";
}

# Disabled: always dies with an explanation; the code after die is unreachable.
sub qseq2srf {
  die "This routine is currently out of order. Compatibility could only be guaranteed until GAP version 1.3
Formerly it could be called as follows: fq_all2std.pl qseq2srf s_3_*_qseq.txt\n";

  my $arg = shift;
  system("$Bin/srf-conversions/illumina2srf $arg");
}

# Disabled: always dies with an explanation; the code after die is unreachable.
sub qseqin2srf {
  die "This routine is currently out of order. Compatibility could only be guaranteed until GAP version 1.3
Formerly it could be called as follows: fq_all2std.pl qseq2srf -b s_3_*_qseq.txt

If you want to re-implement it do not forget to complete the pre-processing steps of the cif intensities and cnf noise files.
A) To generate the cif intensities files, go to the Lane folder in Intensities and type:
for ((i=001;i<=120;i++));
do /data/scripts/next-gen/convert_sequence_format/srf-conversions/cifToTxt -l -t \$i;
done
B) To generate the cnf noise files, go to the Lane folder in Intensities and type:
for ((i=001;i<=120;i++));
do /data/scripts/next-gen/convert_sequence_format/srf-conversions/cnfToTxt -l -t \$i;
done\n";
  my $arg = shift;
  system("$Bin/srf-conversions/illumina2srf -b $arg");
}

# Prints a short explanation of the FASTQ flavours.
sub instruction {

  print "
FASTQ format is first used in the Sanger Institute, and therefore
we take the Sanger specification as the standard FASTQ. Although
Solexa/Illumina reads file looks pretty much like the standard
FASTQ, they are different in that the qualities are scaled
differently. In the quality string, if you can see a character
with its ASCII code higher than 90, probably your file is in the
Solexa/Illumina format.

Sometimes we also use an integer, instead of a single character,
to explicitly show the qualities. In that case, negative
qualities indicates that Solexa/Illumina qualities are used.

";

}

183 | sub example { 184 | my $exam_scarf = ' 185 | USI-EAS50_1:4:2:710:120:GTCAAAGTAATAATAGGAGATTTGAGCTATTT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 19 23 23 23 18 23 23 23 186 | USI-EAS50_1:4:2:690:87:GTTTTTTTTTTTCTTTCCATTAATTTCCCTTT:23 23 23 23 23 23 23 23 23 23 23 23 12 23 23 23 23 23 16 23 23 9 18 23 23 23 12 23 18 23 23 23 187 | USI-EAS50_1:4:2:709:32:GAGAAGTCAAACCTGTGTTAGAAATTTTATAC:23 23 23 23 23 23 23 23 20 23 23 23 23 23 23 23 23 23 23 23 23 12 23 18 23 23 23 23 23 23 23 23 188 | USI-EAS50_1:4:2:886:890:GCTTATTTAAAAATTTACTTGGGGTTGTCTTT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 189 | USI-EAS50_1:4:2:682:91:GGGTTTCTAGACTAAAGGGATTTAACAAGTTT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 20 23 23 23 23 23 23 23 23 23 23 23 18 23 23 23 23 190 | USI-EAS50_1:4:2:663:928:GAATTTGTTTGAAGAGTGTCATGGTCAGATCT:23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 191 | '; 192 | 193 | my $exam_fqint = ' 194 | @4_1_912_360 195 | AAGGGGCTAGAGAAACACGTAATGAAGGGAGGACTC 196 | +4_1_912_360 197 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 21 40 40 40 40 40 40 40 40 40 26 40 40 14 39 40 40 198 | @4_1_54_483 199 | TAATAAATGTGCTTCCTTGATGCATGTGCTATGATT 200 | +4_1_54_483 201 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 40 40 40 28 40 40 40 40 40 40 16 40 40 5 40 40 202 | @4_1_537_334 203 
| ATTGATGATGCTGTGCACCTAGCAAGAAGTTGCATA 204 | +4_1_537_334 205 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 21 29 40 40 33 40 40 33 40 40 33 31 40 40 40 40 18 26 40 -2 206 | @4_1_920_361 207 | AACGGCACAATCCAGGTTGATGCCTACGGCGGGTAC 208 | +4_1_920_361 209 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 40 40 40 40 40 40 40 40 31 40 40 40 40 40 40 15 5 -1 3 210 | @4_1_784_155 211 | AATGCATGCTTCGAATGGCATTCTCTTCAATCACGA 212 | +4_1_784_155 213 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 31 40 40 40 40 40 214 | @4_1_595_150 215 | AAAGACGTGGCCAGATGGGTGGCCAAGTGCCCGACT 216 | +4_1_595_150 217 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 30 40 40 40 40 40 40 40 40 40 20 40 40 40 40 40 14 40 40 218 | '; 219 | 220 | my $exam_sol = ' 221 | @SLXA-B3_649_FC8437_R1_1_1_610_79 222 | GATGTGCAATACCTTTGTAGAGGAA 223 | +SLXA-B3_649_FC8437_R1_1_1_610_79 224 | YYYYYYYYYYYYYYYYYYWYWYYSU 225 | @SLXA-B3_649_FC8437_R1_1_1_397_389 226 | GGTTTGAGAAAGAGAAATGAGATAA 227 | +SLXA-B3_649_FC8437_R1_1_1_397_389 228 | YYYYYYYYYWYYYYWWYYYWYWYWW 229 | @SLXA-B3_649_FC8437_R1_1_1_850_123 230 | GAGGGTGTTGATCATGATGATGGCG 231 | +SLXA-B3_649_FC8437_R1_1_1_850_123 232 | YYYYYYYYYYYYYWYYWYYSYYYSY 233 | @SLXA-B3_649_FC8437_R1_1_1_362_549 234 | GGAAACAAAGTTTTTCTCAACATAG 235 | +SLXA-B3_649_FC8437_R1_1_1_362_549 236 | YYYYYYYYYYYYYYYYYYWWWWYWY 237 | @SLXA-B3_649_FC8437_R1_1_1_183_714 238 | GTATTATTTAATGGCATACACTCAA 239 | +SLXA-B3_649_FC8437_R1_1_1_183_714 240 | YYYYYYYYYYWYYYYWYWWUWWWQQ 241 | '; 242 | 243 | print qq( 244 | solexa 245 | ====== 246 | $exam_sol 247 | scarf 248 | ===== 249 | $exam_scarf 250 | fqint 251 | ===== 252 | $exam_fqint 253 | ); 254 | } 255 | -------------------------------------------------------------------------------- /tools/qseq2fasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | if($#ARGV<0){ 6 | die "Usage: $0 \n"; 7 | } 8 | 9 | 
# Convert an Illumina qseq file to FASTA, written next to the input as
# "<input>.fa". Each record becomes
#   >machine:lane:tile:x:y#index/read
#   SEQUENCE
# (whitespace-separated fields 0, 2-7 and 8 of the qseq line).
open(my $in, '<', $ARGV[0]) or die "Can't open $ARGV[0] for reading --fatal.\n";
my $fasta = $ARGV[0] . ".fa";
open(my $out, '>', $fasta) or die "Can't open $fasta for writing --fatal.\n";

# Read the input handle line by line (the loop condition previously read
# nothing and therefore never terminated).
while (my $line = <$in>) {
    chomp $line;
    next unless $line =~ /\S/;         # skip blank lines
    my @parts = split(/\s+/, $line);
    if (@parts < 9) {
        warn "Skipping malformed qseq line $. of $ARGV[0]\n";
        next;
    }
    my $concat = ">$parts[0]:$parts[2]:$parts[3]:$parts[4]:$parts[5]#$parts[6]/$parts[7]";
    print {$out} "$concat\n";
    print {$out} "$parts[8]\n";
}

close $in;
# Buffered write errors only surface at close time, so check it.
close $out or die "Can't close $fasta after writing --fatal.\n";

exit;
-------------------------------------------------------------------------------- /tools/qseq2fastq.pl: --------------------------------------------------------------------------------
#!/usr/bin/perl

# Convert an Illumina qseq file to FASTQ on STDOUT. Each record becomes
#   @machine:lane:tile:x:y#index/read
#   SEQUENCE
#   +
#   QUALITY
# (tab-separated fields 0, 2-7, 8 and 9 of the qseq line; the quality string
# is copied through unchanged).

use warnings;
use strict;

if (@ARGV < 1) {
    die "Usage: $0 <qseq_file>\n";
}

open(my $qseq, '<', $ARGV[0]) or die "Can't open $ARGV[0] for reading --fatal.\n";

# Read the input handle line by line (the loop condition previously read
# nothing and therefore never terminated).
while (my $line = <$qseq>) {
    chomp $line;
    next unless $line =~ /\S/;         # skip blank lines
    my @parts = split /\t/, $line;
    if (@parts < 10) {
        warn "Skipping malformed qseq line $. of $ARGV[0]\n";
        next;
    }
    print "\@$parts[0]:$parts[2]:$parts[3]:$parts[4]:$parts[5]#$parts[6]/$parts[7]\n";
    print "$parts[8]\n";
    print "+\n";
    print "$parts[9]\n";
}

close $qseq;
-------------------------------------------------------------------------------- /tools/sam_bam2tab.pl: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | #Marten Boetzer BaseClear B.v. 
26-07-2011 # 3 | #SSPACE perl sam_bam2Tab.pl # 4 | #This script; # 5 | # -converts a .sam file to a tab file containing; # 6 | # -contig of read 1 # 7 | # -start position of read 1 # 8 | # -end position of read 1 # 9 | # -contig of read 2 # 10 | # -start position of read 2 # 11 | # -end position of read 2 # 12 | # # 13 | # -Sam/Bam file should contain a read pair at consecutive # 14 | # lines where the first line contains the first read and # 15 | # second line the second read # 16 | # In order to have such a file, sort the sam file # 17 | # before using this script with SAMTools command: # 18 | # samtools view -uS | samtools sort -n - # 19 | # # 20 | # -This script requires samtools to be installed # 21 | # # 22 | # -Bam files should end with .bam extension # 23 | # # 24 | #INPUT: # 25 | # perl sam_bam2Tab.pl $outfile") || die "Can't open $outfile for writing -- fatal\n"; 53 | 54 | my $step = 100000; 55 | my ($ct, $diffct, $read, $prevread, $prevline, $line); 56 | while($line = ){ 57 | next if($line =~ /^@/); 58 | ($read, undef, $chrom) = split("\t", $line); 59 | next if($chrom eq "*"); 60 | if($read !~ /$postfix1$/ && $read !~ /$postfix2$/){ 61 | warn("read $read had no suffix '$postfix1' or '$postfix2', please insert a correct suffix (e.g. 
'/1' and '/2')\n"); 62 | } 63 | $read = substr($read,0,-(length($postfix1))); 64 | if($prevread eq $read){ 65 | $pair_found++; 66 | my ($line1, $line2) = ($prevline,$line); 67 | if($prevread =~ /$postfix2$/){ 68 | $line1 = $line; 69 | $line2 = $prevline; 70 | } 71 | my @arr1 = split("\t", $line1); 72 | my @arr2 = split("\t", $line2); 73 | 74 | my ($tig1,$start1,$end1, $tig2,$start2,$end2) = ($arr1[2], $arr1[3], ($arr1[3]+length($arr1[9])), $arr2[2],$arr2[3],($arr2[3]+length($arr2[9]))); 75 | 76 | if ($arr1[1] & 16) { 77 | $end1 = $start1; 78 | $start1 = $start1 + length($arr1[9]); 79 | } 80 | if ($arr2[1] & 16) { 81 | $end2 = $start2; 82 | $start2 = $start2 + length($arr2[9]); 83 | } 84 | print OUT "$tig1\t$start1\t$end1\t$tig2\t$start2\t$end2\n"; 85 | } 86 | $prevread = $read; 87 | $prevline = $line; 88 | if(++$ct == $step){ 89 | CounterPrint("reads = $ct pairs = $pair_found"); 90 | $step = $step + 100000; 91 | } 92 | } 93 | CounterPrint("\n"); 94 |
# Print a progress message in place on the currently selected output handle.
# The leading carriage return moves the cursor back to the start of the line,
# so each message overwrites the previous one.
#   $_[0]  text to display
sub CounterPrint {
    my ($message) = @_;
    print "\r" . $message;
    # Incrementing $| switches the selected handle to autoflush, which also
    # flushes the message just printed.
    $|++;
}
--------------------------------------------------------------------------------