├── LICENSE ├── NC_012624.fna ├── README.md ├── __pycache__ ├── constants.cpython-33.pyc ├── isPredict.cpython-33.pyc ├── is_analysis.cpython-33.pyc ├── pred.cpython-33.pyc ├── ssw_wrap.cpython-33.pyc └── tools.cpython-33.pyc ├── constants.py ├── isPredict.py ├── is_analysis.py ├── isescan.py ├── pHMMs ├── clusters.faa.hmm └── clusters.single.faa ├── pred.py ├── publication ├── SupplementaryMaterials.docx ├── SupplementaryMaterials.xlsx └── btx433.pdf ├── pyssw.py ├── ssw201507 ├── Makefile ├── __pycache__ │ └── ssw_wrap.cpython-33.pyc ├── example.c ├── kseq.h ├── license.ssw.txt ├── main.c ├── pyssw.py ├── result.sam ├── ssw.c ├── ssw.h ├── ssw.h.gch ├── ssw_wrap.py ├── test1.fna ├── test11.fna ├── test2.fna └── test22.fna ├── ssw_wrap.py └── tools.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ISEScan [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](https://bioconda.github.io/recipes/isescan/README.html) [![install with docker](https://img.shields.io/badge/install%20with-docker-important.svg?style=flat-square&logo=docker)](https://quay.io/repository/biocontainers/isescan) 2 | 3 | ## A python pipeline to identify IS (Insertion Sequence) elements in genome and metagenome 4 | - ISEScan can be used to identify/annotate full-length or non-full-length IS elements in any DNA sequence but ISEScan was only tested on prokaryotic genome including draft genome and meta-genome. 5 | - Among the existing tools identifying IS elements, ISEScan might be the only one that gives TIR (Terminal Inverted Repeat) sequences. 
6 | - The input sequence file (namely, genome or meta-genome) of ISEScan can contain one or more sequences and there is no limit on the length of each sequence, though ISEScan was only tested on complete genome with one or more sequences, draft genome with many contigs, assembled meta-genome with many contigs. 7 | - The only requirement for the input sequence file is: the sequence file must be in **FASTA** format. When ISEScan is started, it first scans the sequences in the FASTA file one by one, then identifies/annotates the IS elements in each sequence independently, and finally outputs all identified/annotated IS elements for each sequence and the statistics of identified/annotated IS elements from all sequences in the input FASTA file. 8 | - Unknown bases are allowed in the sequences, e.g. ACACGCCCGTTGTTTT**NNNNNNNNN**, GGGTCAGGTCATCAACTTTAGCGTAACGC**NNNNN**GGG. 9 | - If you just want to identify potential transposases (not FULL or partial IS elements) in your sequences and don't want to install ISEScan, you can do so by following two steps: 1) download the transposase models (clusters.faa.hmm and clusters.single.faa) from ISEScan subdirectory [pHMMs](https://github.com/xiezhq/ISEScan/tree/master/pHMMs), 2) install and use software HMMER (version 3.1b2 or later) to search transposases in your sequences. 10 | - ISEScan users asked many good questions (see [issues](https://github.com/xiezhq/ISEScan/issues)) which have been answered by the developer of ISEScan. If you didn't find the answers you want at [issues](https://github.com/xiezhq/ISEScan/issues), you can open a new issue at [issues](https://github.com/xiezhq/ISEScan/issues). 11 | - If you want to replace some (or all) of genes/proteins predicted by ISEScan (actually FragGeneScan called by ISEScan) to predict transposases and IS elements, you can try manually replacing gene boundaries and protein sequences in file `.faa` under directory `results/proteome` after you run ISEScan on your genome sequences. 
For how to do so, please check [my comments](https://github.com/xiezhq/ISEScan/issues/45) from May 2022. 12 | 13 | ## Table of Contents 14 | - [Overview](#Overview) 15 | - [Citation](#Citation) 16 | - [Contact](#Contact) 17 | - [Installation](#Installation) 18 | - [ISEScan on linux](#install-on-linux) 19 | - [ISEScan on mac](#install-on-mac) 20 | - [Automated install by Bioconda (recommended!)](#Bioconda-install) 21 | - [Manual install (install from source code)](#Manual-install) 22 | - [Upgrade ISEScan to the latest version](#Upgrade) 23 | - [Usage example](#Usage) 24 | - [Tips to run ISEScan efficiently](#Tips) 25 | - [How to run a set of genomes in a row](#lots-of-genomes) 26 | - [Re-run ISEScan without gene/protein prediction and HMMER searching](#Re-run) 27 | - [Release History](#Release) 28 | 29 | 30 | ## Overview 31 | ISEScan is a python pipeline to identify IS (Insertion Sequence) elements in genome. It includes an option to report either complete IS elements or both complete and partial IS elements. It might be a good idea to try reporting both complete and partial IS elements when it is used to identify the IS elements in the assemblies of a metagenome. ISEScan reports both complete and partial IS elements by default. 32 | 33 | ISEScan was developed using Python3. It 1) scans genome (or metagenome) in fasta format; 2) predicts/translates (using FragGeneScan) genome into proteome; 3) searches the pre-built pHMMs (profile Hidden Markov Models) of transposases (two files shipped with ISEScan; clusters.faa.hmm and clusters.single.faa) against the proteome and identifies the transposase gene in genome; 4) then extends the identified transposase gene into the complete IS (Insertion Sequence) elements based on the common characteristics shared by the known IS elements reported in the literature and databases; 5) finally reports the identified IS elements in a few result files (e.g. 
a file containing a list of IS elements, a file containing sequences of IS elements in fasta format, an annotation file in GFF3 format). 34 | 35 | 36 | ## Citation 37 | Zhiqun Xie, Haixu Tang. ISEScan: automated identification of Insertion Sequence Elements in prokaryotic genomes. *Bioinformatics*, 2017, 33(21): 3340-3347. 38 | 39 | Download: [full text](https://doi.org/10.1093/bioinformatics/btx433), [SupplementaryMaterials.docx](publication/SupplementaryMaterials.docx), [SupplementaryMaterials.xlsx](publication/SupplementaryMaterials.xlsx). 40 | 41 | 42 | ## Contact 43 | Zhiqun Xie: `xiezhq@hotmail.com` 44 | 45 | 46 | ## Installation 47 | 48 | #### ISEScan on linux 49 | ISEScan was tested on Linux only and can be installed from Bioconda packages and source code. Installing from Bioconda is recommended as it is the simplest way for inexperienced users. 50 | 51 | #### ISEScan on mac 52 | I have no idea about ISEScan on mac as I only fully tested it on Linux. If you cannot install ISEScan on mac from Bioconda, you can try installing ISEScan from source codes. For installing ISEScan from source codes, I know there was an issue compiling FragGeneScan on Mac, but I once solved it. To solve the problem of running FragGeneScan on Mac, please modify two source files in FragGeneScan source codes: 1) open util_lib.c and comment out ‘#include ’ on line3; 2) open hmm_lib.c and comment out ‘#include ’ on line6 and replace values.h with limits.h on line4. The modified FragGeneScan can run on Mac and Linux without problem according to my test result. 53 | 54 | 55 | #### Automated install by Bioconda (recommended!) 56 | The steps below will install ISEScan package via bioconda to /apps/inst/miniconda3/. You can install ISEScan to another place by changing the default miniconda3 install path in step **Install Miniconda3**. 
Visit [Bioconda recipe for ISEScan](https://bioconda.github.io/recipes/isescan/README.html) for more details (Thanks both [pbasting](https://github.com/pbasting) and [tseemann](https://github.com/tseemann) for making it available!). 57 | - Install [Bioconda](https://bioconda.github.io/user/install.html). To minimize the install time and size, we [install miniconda](https://docs.conda.io/en/latest/miniconda.html#linux-installers) 58 | - Download [Miniconda3-latest-Linux-x86_64 installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) 59 | ``` 60 | curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 61 | ``` 62 | - Install Miniconda3 63 | ``` 64 | sh Miniconda3-latest-Linux-x86_64.sh 65 | ``` 66 | - Please answer yes (see my screen shot below) for all questions of `sh Miniconda3-latest-Linux-x86_64.sh` if you have no idea about the questions. 67 | ``` 68 | Do you wish the installer to initialize Miniconda3 69 | by running conda init? [yes|no] 70 | [no] >>> yes 71 | ``` 72 | ``` 73 | rm Miniconda3-latest-Linux-x86_64.sh 74 | source ~/.bashrc 75 | ``` 76 | - Add the bioconda channel as well as the other channels bioconda depends on. It is important to add them in this order so that the priority is set correctly (that is, conda-forge is highest priority). 77 | ``` 78 | conda config --add channels defaults 79 | conda config --add channels bioconda 80 | conda config --add channels conda-forge 81 | ``` 82 | - Install and update ISEScan 83 | ``` 84 | conda install isescan 85 | ``` 86 | - Try ISEScan (You can find the available command options by running `isescan.py -h`). 87 | ``` 88 | cp /apps/inst/miniconda3/test/NC_012624.fna ./ 89 | isescan.py --seqfile NC_012624.fna --output results --nthread 2 90 | ``` 91 | Note: replace `/apps/inst/miniconda3` in commands with your conda install path. 
92 | 93 | If system reports `isescan.py: command not found...`, please add ISEScan package to your `PATH` (replace `/apps/inst/miniconda3` in the command below with your conda install path): 94 | ``` 95 | export PATH=/apps/inst/miniconda3/bin/:$PATH 96 | ``` 97 | Then, try ISEScan again: 98 | ``` 99 | isescan.py --seqfile NC_012624.fna --output results --nthread 2 100 | ``` 101 | 102 | 103 | #### Manual install (install from source code) 104 | - Install ISEScan 105 | - Download the latest ISEScan from https://github.com/xiezhq/ISEScan/releases, e.g. **Source code (tar.gz)**. 106 | 107 | - Uncompress the .zip (or .tar.gz) file. 108 | - Use unzip command to uncompress the zip file: 109 | ``` 110 | unzip v1.7.2.2.zip 111 | ``` 112 | - Use tar command to uncompress the tar.gz file: 113 | ``` 114 | tar -zvxf v1.7.2.2.tar.gz 115 | ``` 116 | This will create a ISEScan folder, e.g. ISEScan-1.7.2.2. You need to go to ISEScan folder to configure and run it. 117 | ``` 118 | cd ISEScan-1.7.2.2 119 | ``` 120 | - Install dependencies before you run ISEScan 121 | - Python 3.3.3 or later 122 | - numpy-1.8.0 or later 123 | - scipy-0.13.1 or later 124 | - fastcluster, latest version recommended, https://pypi.python.org/pypi/fastcluster 125 | - FragGeneScan1.30 or earlier, (The .faa file output by version1.31 is not compatible with ISEScan!), http://omics.informatics.indiana.edu/FragGeneScan 126 | - HMMER-3.1b2 or later, http://hmmer.org/download.html 127 | - BLAST 2.2.31 or later 128 | - SSW Library, the latest version is not tested with ISEScan and the tested version of SSW library is shipped with ISEScan, please find it at ssw201507 subdirectory. 
129 | - To use the shipped SSW library in ISEScan, please go to ssw201507 and then compile the code with gcc: 130 | ``` 131 | cd ssw201507 132 | gcc -Wall -O3 -pipe -fPIC -shared -rdynamic -o libssw.so ssw.c ssw.h 133 | ``` 134 | - And then copy libssw.so and set search path: 135 | ``` 136 | cp libssw.so ../ 137 | export LD_LIBRARY_PATH=/home/xiezhq/projects/ISEScan-1.7.2.2:$LD_LIBRARY_PATH 138 | ``` 139 | In command `export LD_LIBRARY_PATH=/home/xiezhq/projects/ISEScan-1.7.2.2:$LD_LIBRARY_PATH`, please replace `/home/xiezhq/projects/ISEScan-1.7.2.2` with the actual directory containing libssw.so on your computer! 140 | - The latest SSW library can be found at https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library. 141 | - biopython 1.62 or later (required by SSW library) 142 | 143 | - Add the required packages to your $PATH before you run ISEScan 144 | - Add to $PATH the paths pointing to run_FragGeneScan.pl, phmmer, hmmsearch, blastn, blastp, makeblastdb 145 | ``` 146 | export PATH=$PATH:/apps/inst/FragGeneScan1.30:/apps/inst/hmmer-3.3/bin:/apps/inst/ncbi-blast-2.10.0+/bin 147 | ``` 148 | In the export command above, please replace `/apps/inst/FragGeneScan1.30`, `/apps/inst/hmmer-3.3/bin` and `/apps/inst/ncbi-blast-2.10.0+/bin` with the actual paths of FragGeneScan, HMMER and BLAST on your computer! 149 | 150 | 151 | 152 | ## Upgrade ISEScan to the latest version 153 | ### Automated upgrade from Bioconda 154 | The latest version becomes available on Bioconda within a few hours or days after it is released on https://github.com/xiezhq/ISEScan. You can run the command below to upgrade the existing ISEScan if the existing ISEScan was installed by Bioconda. 155 | ``` 156 | conda update isescan 157 | ``` 158 | ### Manual upgrade from existing ISEScan 159 | By manual upgrade, you may get the latest version immediately from https://github.com/xiezhq/ISEScan. 
It is quite easy to upgrade the existing ISEScan to the latest version: copy all .py files from the latest version to the ISEScan install directory. 160 | - Locate the existing ISEScan (ISEScan install directory). If you don't know where isescan.py is installed, you can run `which isescan.py` to help find where it is on your system. 161 | ``` 162 | which isescan.py 163 | /apps/inst/miniconda3/bin/isescan.py 164 | ``` 165 | - Get the latest ISEScan source codes and then copy all the .py files to the ISEScan install directory. Please check [Manual install (install from source code)](#Manual-install) on how to get the latest ISEScan source codes. For example, suppose you downloaded the latest ISEScan, v1.7.2.2.2.tar.gz. 166 | ``` 167 | tar -zxf v1.7.2.2.2.tar.gz 168 | cd ISEScan-1.7.2.2.2/ 169 | cp *.py /apps/inst/miniconda3/bin/ 170 | ``` 171 | ### Check and test the upgraded ISEScan 172 | - Check the version of the upgraded ISEScan. 173 | ``` 174 | python3 isescan.py --version 175 | ``` 176 | or 177 | ``` 178 | isescan.py --version 179 | ``` 180 | - Test the upgraded ISEScan. 181 | ``` 182 | python3 isescan.py --seqfile /apps/inst/miniconda3/test/NC_012624.fna --output /home/xiezhq/results --nthread 2 183 | ``` 184 | 185 | 186 | ## Usage example 187 | Let's try an example, NC_012624.fna. 188 | 189 | - The command below scans NC_012624.fna (genome sequence of Sulfolobus_islandicus_Y_N_15_51, ~42 kb), and outputs all results in `results` directory: 190 | ``` 191 | cp /apps/inst/miniconda3/test/NC_012624.fna ./ 192 | isescan.py --seqfile NC_012624.fna --output results --nthread 2 193 | ``` 194 | Note: run `isescan.py -h` or `isescan.py --help` to get help. 195 | - Wait for it to finish. It may take a while (~40 seconds) as ISEScan uses HMMER to scan the genome sequences and it will use 621 profile HMM models to scan each protein sequence (predicted by FragGeneScan) in the genome sequence. 
HMMER searching is usually more sensitive but slower than the regular BLAST searching for remote homologs. The running time for larger genome will increase quickly, e.g. about 20 minutes for NC_000913.fna (genome sequence of Escherichia coli str. K-12 substr. MG1655, ~4.6 Mb) with two cpu cores on my virtual machine. 196 | 197 | - After ISEScan finish running, you can find the output files in results directory: 198 | - NC_012624.fna.sum: the summarization of IS copies for each IS family 199 | - NC_012624.fna.csv: details about IS copies in NC_012624, one copy per line, comma-separated tabular table 200 | - NC_012624.fna.tsv: details about IS copies in NC_012624, one copy per line, tab-separated tabular table 201 | - NC_012624.fna.raw: details about IS copies in NC_012624, one copy per line 202 | - NC_012624.fna.gff: listing each IS copy and its TIR, gff3 format 203 | - NC_012624.fna.is.fna: the nucleic acid sequence of each IS copy, fasta format 204 | - NC_012624.fna.orf.fna: the nucleic acid sequence of the Tpase gene in each IS copy, fasta format 205 | - NC_012624.fna.orf.faa: the amino acid sequence of the Tpase in each IS copy, fasta format 206 | 207 | - Details about NC_012624.fna.sum: 208 | - The title line starts with `#`, followed by the summarization of IS content for each sequence in NC_012624. The last line is the summarization of IS content for all sequences in NC_012624. 
209 | - Summarization of IS content for each sequence in NC_012624: 210 | - seqid: sequence identifier, extracted from head lines beginning with `>` in NC_012624.fna, usually the text between `>` and the first blank character in a head line 211 | - family: family name of IS element 212 | - nIS: number of IS copies assigned to the specific family in a sequence 213 | - %Genome: percentage of genome sequence content spanned by IS elements in a sequence, calculated by bps4IS/dnaLen (see the following columns) 214 | - bps4IS: length of sequence segments spanned by IS elements in a sequence 215 | - dnaLen: length of the specific sequence 216 | 217 | - Details about NC_012624.fna.csv (NC_012624.fna.tsv, NC_012624.fna.raw): 218 | - The first row is the header line listing column names. 219 | - The rows after the first row are the main content of NC_012624.fna.csv file, one IS copy per line. 220 | - Columns in NC_012624.fna.csv (NC_012624.fna.tsv, NC_012624.fna.raw): 221 | - seqID: sequence identifier 222 | - family: family name of IS element 223 | - cluster: Tpase cluster 224 | - isBegin and isEnd: genome coordinates of the predicted IS element 225 | - isLen: length of the predicted IS element 226 | - ncopy4is: number of predicted IS copies including full-length and partial IS copies 227 | - start1, end1, start2, end2: genome coordinates of the IRs 228 | - score: score of the IRs 229 | - irId: number of identical matches in pairwise alignment of left- and right-hand inverted repeats 230 | - irLen: length of inverted repeats 231 | - nGaps: number of gaps in IRs 232 | - orfBegin, orfEnd: genome coordinates of the predicted Tpase ORF 233 | - strand: strand where the Tpase is 234 | - orfLen: length of predicted Tpase ORF 235 | - E-value: the best E-value among all IS copies for the same IS element, the smaller the better 236 | - E-value4copy: the E-value of the reported IS copy, the smaller the better 237 | - Note: the E-value is the E-value returned by hmmer when searching profile HMMs 
against proteome translated from a genome sequence 238 | - type: type of IS element copy, 'c' for complete IS element and 'p' for partial IS element 239 | - ov: ov number returned by hmmer search 240 | - tir: terminal inverted repeat sequences 241 | 242 | 243 | ## Tips to run ISEScan efficiently: 244 | 245 | ### How to run a set of genomes in a row 246 | Sometimes, we want to run hundreds of genomes in one line of command and then wait for all computing jobs to complete. Before doing it, we assume: 247 | - You can successfully run ISEScan on one genome: 248 | - run the following commands if you installed ISEScan via Bioconda. 249 | ``` 250 | conda activate base 251 | isescan.py --seqfile NC_012624.fna --output results 252 | ``` 253 | - run the following commands if you installed ISEScan manually. 254 | ``` 255 | python3 /home/xiezhq/projects/ISEScan-1.7.2.2/isescan.py --seqfile genome1.fa --output results 256 | ``` 257 | where genome1.fa is your genome sequence file in fasta format. By default, ISEScan will use one CPU core but you can change it using command option `--nthread NTHREAD`, e.g. 258 | ``` 259 | isescan.py --seqfile genome1.fa --output results --nthread 2 260 | ``` 261 | - You are running ISEScan jobs on a Linux computer instead of a Linux cluster system. 262 | - Your Linux computer has **nproc** (nproc could be 1 or 2 or 4 or 6 or 8 or ....) CPU cores. 263 | - You want to run ISEScan on ngenome (ngenome could be 1 or 2 or 3, ...) fasta file(s) (genome) in parallel on your Linux computer. 264 | 265 | Now, let's run 200 genomes in one line of command and then wait for all computing jobs to complete (probably several days or weeks, depending on how many hours are required for each of your 200 genomes on average). 
If your computer has 8 CPU cores, you can execute the command below: 266 | ``` 267 | nohup cat test.fna.list | xargs -n 1 -P 4 -I{} isescan.py --seqfile {} --output results --nthread 2 > log.txt & 268 | ``` 269 | 270 | In the command line, 271 | - **test.fna.list** is a text file which lists the paths of 200 fasta files, one fasta file per row, for example: 272 | ``` 273 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS014235.scaffolds.fa 274 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS049959.scaffolds.fa 275 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS020233.scaffolds.fa 276 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS022609.scaffolds.fa 277 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS024132.scaffolds.fa 278 | ``` 279 | - **-n 1** tells your computer to pick only one fasta file from **test.fna.list** for each ISEScan computing job. 280 | - **-P 4** tells your computer to spawn 4 processes at the same time (run 4 ISEScan jobs in parallel, namely, run 4 genomes at the same time). When one job completes with success or exits with error, a new ISEScan job on the next fasta file (e.g. 5th fasta file) in **test.fna.list** is spawned. So, the command line will keep 4 ISEScan computing jobs (one fasta file per ISEScan job) running on your computer, and each job utilizes two CPU cores (as requested by `--nthread 2`). It means all of 8 CPU cores on your computer have been utilized by your 4 ISEScan computing jobs till the last fasta file is processed by ISEScan. 281 | - **> log.txt** tells your computer to write the screen messages output by ISEScan to the file **log.txt**. 282 | - **&** tells your computer to run jobs in the background without interrupting you on the current terminal (e.g. xterm), in order that you can work on other things on the same terminal. 283 | You can check your job status by the command `top -c -u xiezhq` (assuming your user name is **xiezhq**). 284 | 285 | It might take several days or weeks for 200 genomes to complete. It depends on how many CPU cores you have on your computer and how fast each CPU core is. 
Please do not load too many ISEScan jobs because each ISEScan job will consume part of your RAM on your computer. However, you can always test and estimate how many GB RAM and how many hours are required for one genome. 286 | 287 | 288 | ### Re-run ISEScan without gene/protein prediction and HMMER searching 289 | - ISEScan will run much faster if you run it on the same genome sequence more than once (e.g., trying different optimal parameters of near and far regions (see our paper [...] for the definitions of near and far regions) to search for IS elements in your genome). The reason is that it skips either FragGeneScan or both FragGeneScan and phmmer/hmmsearch steps, which are the most time-consuming steps in the ISEScan pipeline. 290 | - If you prefer ISEScan recalculating the results, you can simply remove the proteome file and HMMER search results which are related to your genome sequence file name. For example, you can delete NC_012624.fna.faa in `results/proteome` directory and clusters.faa.hmm.NC_012624.fna.faa and clusters.single.faa.NC_012624.fna.faa in `results/hmm` directory, and then rerun it: 291 | ``` 292 | isescan.py --seqfile NC_012624.fna --output results 293 | ``` 294 | 295 | 296 | ## Release History 297 | - 1.7.3 298 | - fix the bug reported in issue59-60 and increase version number from 1.7.2.3 to 1.7.3. (Thanks lxsteiner, adriludwig, the-reese, ChristophKnapp and SRooke for reporting the issue) 299 | - 1.7.2.3 300 | - remove the bug in pred.py, which causes the issue 'UnboundLocalError: local variable raworfhits referenced before assignment' in rare cases. 301 | - 1.7.2.2.2 302 | - add code to remove temporary files (created by tempfile.NamedTemporaryFile()) once blastn search completes in case that large amounts of temporary files consume too much space. 
(Thanks Biancamaria for the suggestion) 303 | - 1.7.2.2 304 | - ISEScan can output .csv (columns are separated by `,`) and .tsv (columns are separated by `tab`) result files, which are much easier for users to parse (Thanks oschwengers for his suggestion) 305 | - add command options `--seqfile` and `--output` to remove the positional parameters `seqfile`, `proteome` and `hmm` (Thanks oschwengers for his suggestion) 306 | - modify constants.py to remove the hard coded paths pointing to the third party dependencies and the output directory `dir4prediction` (Thanks oschwengers for his suggestion) 307 | - add tips for installing ISEScan from source codes on Mac (Thanks [Ania Gorska](https://github.com/gvalchca) for her suggestion) 308 | - 1.7.2.1 309 | - modify constants.py to remove the hard coded path pointing to the profile HMM files (clusters.single.faa and clusters.faa.hmm) 310 | - update readme to add an introduction for installing ISEScan package via bioconda (Thanks both [pbasting](https://github.com/pbasting) and [tseemann](https://github.com/tseemann) for making it available!) 311 | - 1.7.2 312 | - Add command options `--removeShortIS` and `--no-FragGeneScan`, and remove `removeShortIS` and `translateGenome` from constants.py. (Thanks EricDeveaud for his suggestion and codes) 313 | - Add command option `--nthread` to isescan.py, and remove `nthread` and `nproc` from constants.py. 314 | - Remove useless parallel testing codes from code base. 315 | - 1.7.1 316 | - fix a bug in constants.py, which fails to locate the correct path pointing to profile HMM files (clusters.single.faa and clusters.faa.hmm). Thank giuliodimaria92 for it. 317 | - 1.7 318 | - Set removeShortIS = False in constants.py for ISEScan to report both complete and partial IS elements by default. One additional column (type) was added accordingly in .raw output file to label each IS element copy as either complete (c) or partial (p) IS element. 
For details refer to the section 'Details about NC_012624.fna.raw' in Readme. 319 | - 1.6 320 | - Update Readme about the configuration of ISEScan where the paths to clusters.faa.hmm and clusters.single.faa should also be correctly specified in constants.py (Thank Ania Gorska for it). 321 | - 1.5.4.3 322 | - Fix the bug which failed to report the Tpase ORFs in multi-copy IS elements, and ISEScan now output a .raw file with one additional column E-value4copy which is the E-value of the reported IS copy while the column E-value is the best E-value among all IS copies for the same IS element. 323 | - 1.5.4.1 324 | - fix bug for batch4bacteria.py when *.sum files were created by either outputIndividual() or outputIS4multipleSeqOneFile() in pred.py 325 | - 1.5.4 326 | - Add removeFalsePositive() to remove the potentail false positive in the 'new' family: 1) single-copy hits with e-value > e-50 or no tir or nGaps > 0 or irId < 20 or irId/irLen < 0.75; 2) multi-copy hits with evalue > e-50 and (irId < 13 or (irId < 20 and ngaps > 0)) 327 | - Modify refineHits() to remove the single-copy partial IS elements: 1) if evalue > e-50 or (irId < 13 or (irId < 20 and ngaps > 0 for familys other than IS200/IS605) 328 | - Modify refineHits() to remove the multi-copy partial IS elements: 1) if evalue > e-50 for IS200/IS605 family; 2) if irId < 10 for familys other than ten familys which could have the full IS without perfect TIR (irId < 10), IS110, IS4, IS5, IS6, ISAS1, ISH3, ISNCY. 
329 | - Change irSim4singleCopy in constants.py from 0.85 to 0.75, for the use in removeFalsePositive() 330 | - 1.5.3 331 | - Fix bug in getFullIS4seqOnStream() for genome sequence with long multi-copy fregments containing the common IS element 332 | - Use 'average' instead of 'single' method in fastcluster.linkage() 333 | - Fix bug in removeOverlappedOrfhits() to correctly count single-copy IS elements for genome sequence without multi-copy IS elements 334 | - 1.5.2 335 | - Fix bug for genome sequence without multi-copy IS elements 336 | - 1.5.1 337 | - Change: changed consensusBoundaryByCutoff() to consensusBoundaryByCutoffBySeparated() 338 | - Change: added consensusBoundaryByCutoffByCombined() and getbds4opt4start(), to determine the left and right boundaries of multi-copy pro-IS element simultaneously, namely, to determine the optimal combined left and right boundaries instead of separated left and right boundaries. 339 | - 1.5 340 | - Change: add consensusBoundaryByCutoff() and ncopyByCutoff() in tools.py, to determine the optimal boundary of multi-copy pro-IS element. 341 | - 1.4 342 | - Change: recruit the IS copies without predicted Tpase when search for multi-copy IS elements 343 | - 1.3 344 | - Remove buildHMM.py from ISEScan 345 | - 1.2 346 | - CHANGE: pHMMs `clusters.faa.hmm` and `clusters.single.faa`, both files are now built upon the curated ACLAME dataset (ACLAME is a mobile genetic element database.) 
347 | - 1.1.1 348 | - Add option in `constants.py` to report either complete IS elements or both complete and partial IS elements 349 | - 1.0 350 | - The first proper release 351 | -------------------------------------------------------------------------------- /__pycache__/constants.cpython-33.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/constants.cpython-33.pyc -------------------------------------------------------------------------------- /__pycache__/isPredict.cpython-33.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/isPredict.cpython-33.pyc -------------------------------------------------------------------------------- /__pycache__/is_analysis.cpython-33.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/is_analysis.cpython-33.pyc -------------------------------------------------------------------------------- /__pycache__/pred.cpython-33.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/pred.cpython-33.pyc -------------------------------------------------------------------------------- /__pycache__/ssw_wrap.cpython-33.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/ssw_wrap.cpython-33.pyc -------------------------------------------------------------------------------- /__pycache__/tools.cpython-33.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/tools.cpython-33.pyc -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | ## Config packages 4 | # 5 | # Set the path variables pointing to the required packages in order that ISEScan can find the required packages 6 | # on your computer. 7 | # 8 | # FragGeneScan 9 | FragGeneScan = 'run_FragGeneScan.pl' 10 | # Hmmer 11 | phmmer = 'phmmer' 12 | hmmsearch = 'hmmsearch' 13 | # Blast 14 | blastn = 'blastn' 15 | blastp = 'blastp' 16 | makeblastdb = 'makeblastdb' 17 | # 18 | ## Config packages 19 | 20 | # get path where isescan.py is 21 | import sys 22 | path2isescan = os.path.dirname(sys.argv[0]) 23 | 24 | # Set the path variables pointing to the profile HMM files (clusters.single.faa and clusters.faa.hmm). 
25 | # 26 | # The peptide sequences of single-member clusters, which is used by phmmer in hmmer 27 | file4clusterSeqFile4phmmer = os.path.join(path2isescan, 'pHMMs', 'clusters.single.faa') 28 | # 29 | # The profile HMMs of multiple-member clusters, which is used by hmmsearch in hmmer 30 | file4clusterHMM = os.path.join(path2isescan, 'pHMMs', 'clusters.faa.hmm') 31 | # 32 | 33 | 34 | # for local linux machine 35 | #path2results = '' 36 | #dir4prediction = os.path.join(path2results, 'prediction') 37 | 38 | 39 | # Optimal values for SSW to find TIR in database 40 | # (gapopen, gapextend, match, mismatch) 41 | # 42 | # Optimal filter when aligning two sequences with length = maxLenIR 43 | filters4ssw4isMax = [(1, 10, 4, 5)] # giving the greatest number of matched IS elements and 44 | # the greatest number of matched best IS elements 45 | filters4ssw4trial = [(2, 6, 2, 2)] # trial filter to stop alignment from creating the consecutive gaps 46 | 47 | # minimal and maximal values of length of full-length IS element in each family 48 | minMaxLen4is = { 49 | 'IS1': (732, 4601), 50 | 'IS110': (969, 4105), 51 | 'IS1182': (1330, 1980), 52 | 'IS1380': (1474, 4160), 53 | 'IS1595': (701, 7915), 54 | # IS1016V5 (272 bp) is a deleted variant of IS1016V6: 242/711 bp. IS1016V4 (672 bp) is a deleted variant of IS1016V6: 673/711 bp. 55 | # Then ISMha1 (701 bp) is the shortest member in family IS1595. 56 | 57 | 'IS1634': (1511, 2089), 58 | 'IS200/IS605': (407, 2223), 59 | 'IS21': (1924, 3533), 60 | 'IS256': (1124, 1629), 61 | 'IS3': (435, 1814), 62 | 'IS30': (1027, 8273), 63 | 'IS4': (521, 5396), 64 | 'IS481': (553, 3451), 65 | 'IS5': (789, 5396), 66 | 'IS6': (696, 1648), 67 | 'IS607': (1415, 2607), 68 | 'IS630': (895, 2009), 69 | 'IS66': (1364, 3481), 70 | # IS867 has about 75 % homology with IS866. IS866 is 2716 bp. 71 | # Then ISMno3 (1364 bp) is the shortest member in family IS66. 
72 | 'IS701': (1016, 2207), 73 | 'IS91': (712, 2604), 74 | 'IS982': (845, 1282), 75 | 'ISAS1': (1139, 3041), 76 | 'ISAZO13': (1284, 2171), 77 | 'ISH3': (1200, 1509), 78 | 'ISKRA4': (1164, 3746), 79 | 'ISL3': (536, 9109), 80 | 'ISNCY': (786, 3989), 81 | 'new': (400, 10000), # for the novel IS families in database 82 | } 83 | 84 | # peptide and ORF lengths of tpases: 85 | # The first collumn: shortest tpase ORF (bp) 86 | # The second collumn: longest tpase ORF (bp) 87 | # The third collumn: shortest peptide ORF (bp) among all peptides in IS_PEP record for each IS element 88 | # To be added: shortest tpase (aa), longest tpase (aa), 89 | # ORF = tpase + stopcodon 90 | minMax4tpase = { 91 | 'IS1': (666, 1119, 252), 92 | 'IS110': (603, 1380, 156), 93 | 'IS1182': (822, 1731, 570), 94 | 'IS1380': (1158, 1554, 1158), 95 | 'IS1595': (576, 1158, 426), 96 | 'IS1634': (1314, 1875, 1314), 97 | 'IS200/IS605': (366, 1482, 147), 98 | 'IS21': (882, 1758, 231), 99 | 'IS256': (990, 1389, 990), 100 | 'IS3': (441, 1581, 120), 101 | 'IS30': (540, 1419, 189), 102 | 'IS4': (570, 1629, 219), 103 | 'IS481': (447, 1794, 447), 104 | 'IS5': (360, 1908, 75), 105 | 'IS6': (528, 1062, 246), 106 | 'IS607': (768, 1653, 453), 107 | 'IS630': (510, 1194, 318), 108 | 'IS66': (354, 1695, 165), 109 | 'IS701': (921, 1410, 921), 110 | 'IS91': (648, 1548, 648), 111 | 'IS982': (627, 981, 429), 112 | 'ISAS1': (594, 1329, 189), 113 | 'ISAZO13': (1203, 2094, 513), 114 | 'ISH3': (573, 1206, 549), 115 | 'ISKRA4': (1047, 1719, 114), 116 | 'ISL3': (414, 1716, 408), 117 | 'ISNCY': (573, 1815, 123), 118 | 'new': (300, 2100, 50), # for the novel IS families in database 119 | } 120 | 121 | # allowed minimal and maximal and optimal values of the length of TIR sequence for each family 122 | # Here, the optimal values are the empirical parameter based on the observations. 
123 | # The 4th collumn is marker indicating whether the family always has TIR (1) or no TIR (0), 124 | # and -1 for not determined (in the family, some members have tir but others have no tir). 125 | minMax4tir = { 126 | 'IS1': (8, 67, 14, 1), 127 | 'IS110': (2, 31, 14, -1), 128 | 'IS1182': (8, 44, 10, 1), 129 | 'IS1380': (7, 39, 10, 1), 130 | 'IS1595': (10, 43, 15, 1), 131 | 'IS1634': (11, 32, 12, 1), 132 | 'IS200/IS605': (10000, 0, 10000, 0), # prevent program from finding any tir with irLen > 0 133 | #'IS200/IS605_8': (11, 11, 11, 1), # cluster 8 (cdhit30) of IS200/IS605 has tir with irLen == 0 or irLen == 11 134 | #'IS200/IS605': (11, 11, 11, -1), # cluster 8 (cdhit30) of IS200/IS605 has tir with irLen == 0 or irLen == 11 135 | 'IS21': (8, 76, 10, 1), 136 | 'IS256': (8, 48, 15, 1), 137 | 'IS3': (7, 54, 10, -1), 138 | 'IS30': (11, 50, 12, 1), 139 | 'IS4': (8, 67, 12, 1), 140 | 'IS481': (5, 52, 10, 1), 141 | 'IS5': (7, 45, 14, 1), 142 | 'IS6': (12, 36, 14, 1), 143 | 'IS607': (12, 46, 12, -1), 144 | 'IS630': (3, 92, 11, 1), 145 | 'IS66': (11, 144, 11, 1), 146 | 'IS701': (12, 38, 12, 1), 147 | 'IS91': (11, 21, 11, -1), 148 | 'IS982': (11, 35, 11, 1), 149 | 'ISAS1': (12, 34, 12, 1), 150 | 'ISAZO13': (18, 48, 18, 1), 151 | 'ISH3': (11, 31, 15, 1), 152 | 'ISKRA4': (15, 40, 18, 1), 153 | 'ISL3': (6, 50, 11, 1), 154 | 'ISNCY': (4, 52, 13, -1), 155 | 'new': (10, 50, 20, -1), # use the popular values for the novel IS families in database 156 | } 157 | # ssw will use minMax4tir[2] as minimal length of the alignement of two tir sequences 158 | # if useOPTtir == True else minMax[0] as minimal length of the alignment of two tir sequences. 
159 | #useOPTtir = True 160 | useOPTtir = False 161 | 162 | # the minimum of rations of irId/irLen 163 | minIrIdentity = 0.4 164 | # optimal ration of irId/irLen 165 | optIrIdentity = 0.6 166 | # stringent irId/irLen, which is usually required when irLen < 5(stringentShortestIR) or irLen > 55(stringentLongestIR) 167 | stringentIrIdentity = 0.7 168 | 169 | # maximum distance (bp) between two neighboring orfs (including +/- strand) within one IS element 170 | # 764 IS elements with multiple ORFs with clear coordinates in ORF records, 171 | # 405 with distBetweenORFs >=0, 172 | # 1/405 with dist >= 1000, 6/405(1%) with dist >= 500, 14/405(3%) with dist >= 400, 173 | # 22/405(5%) with dist >= 300, 31/405(8%) with dist >= 250, 44/405(11%) with dist >= 200, 174 | # 90/405(22%) with dist >= 100, 202/405(50%) with dist >= 55, 214/405(53%) with dist >= 50 175 | # 176 | # not to merge 177 | #maxDistBetweenOrfs = -1 178 | # merge ORFs with gap = 0, ('NC_000913.3', 4518418, 4519014, '+') and ('NC_000913.3', 4519015, 4519224, '+') 179 | #maxDistBetweenOrfs = 0 180 | # merge ORFs with gap <= 100 bps 181 | maxDistBetweenOrfs = 100 182 | 183 | # In a dataset, 3891 IS elements with both lORF2TER and rORF2TER >= 0, 184 | # 36/3891(1%) with lORF2TER >= 500, 177/3891(5%) with lORF2TER >= 250, 185 | # 51/3891 with rORF2TER >= 500, 232/3891 with rORF2TER >= 250 186 | # ~99% IS elements in dataset has lORF2TER/rORF2TER less than 500 bps 187 | # ~95% IS elements in dataset has lORF2TER/rORF2TER less than 250 bps 188 | # 189 | # switch maxDist4ter2orf between 500 and 250 190 | #maxDist4ter2orf = 250 191 | maxDist4ter2orf = 500 192 | outerDist4ter2tpase = (150,500) 193 | 194 | # Minimum distance (bp) from near ends of IS element to the nearest ORF, namely, 195 | # the length of the shortest linker between TIR and the nearest ORF. 196 | minDist4ter2orf = -150 197 | #minDist4ter2orf = -50 198 | # 199 | # There is no linkder (space) between TIR and the nearest ORF. 
200 | #minDist4ter2orf = 1 201 | 202 | # The strand does not matter when extracting two terminal sequences to align, namely, 203 | # which sequence is the first sequence in pairwise alignement does not make sense. 204 | #splitAlign2orf = True 205 | splitAlign2orf = False 206 | 207 | # IS elements with identicalBases/lengthOfAlignment > sim4iso are regarded as the same IS element (isoform) 208 | # Isoforms have been defined as elements which share in the first instance more than 95% identity 209 | # at the level of their transposase protein sequence or otherwise 90% at the DNA level. 210 | #sim4iso = 0.85 211 | sim4iso = 0.9 212 | # 213 | # SIM4ISO = sim4iso * 100, used by blastn search to get copy number of hit 214 | #SIM4ISO = 85 215 | SIM4ISO = 90 216 | # 217 | # similarity cutoff for protein sequence 218 | aaSim4iso = 0.95 219 | aaSIM4ISO = 95 220 | 221 | # Two neighboring sequences with overlap >= min4overlap are deemed overlapped. 222 | min4overlap = 0.5 # 50% 223 | 224 | # Two sequences with intersect >= min4intersect are deemed intersect. 225 | #min4intersect = 100 # 100 bp, namely, 33 aa or so. 226 | min4intersect = 1 # 1 bp. 227 | 228 | # two neighoring segments with overlap > overlap2removeRedundancy are considered overlapped (redundant) 229 | overlap2removeRedundancy = 0.5 # 50% 230 | #overlap2removeRedundancy = 0.99999999999 # 100% 231 | 232 | # use min4intersect if True else overlap2removeRedundancy as the threshold to 233 | # turn on clustering and remove intersected ISs/hits except the representative in a cluster. 234 | #intersected2remove = True 235 | intersected2remove = False 236 | 237 | # hits with evalue <= min4evalue are defined as the final hits. 
238 | min4evalue = 1e-10 239 | #min4evalue = 1e-5 240 | 241 | # more strict evalue and tir are required for single copy hits 242 | evalue4singleCopy = 1e-50 243 | #irSim4singleCopy = 0.85 # irId/irLen 244 | irSim4singleCopy = 0.75 # irId/irLen 245 | 246 | # E-value cutoff for filtering hits returned by HMM search 247 | evalue2filterHMMhits = min4evalue 248 | #evalue2filterHMMhits = 10 # do not filter out any hits returned by HMM search 249 | 250 | # Paramter for removing potential falsely discovered novel IS elements (family 'new') and partial IS elements 251 | # 252 | # {excludedFamilys:(full,partial,no)}: 253 | # {'IS110':(54,19,3), 'IS4':(2,3,1), 'IS5':(2,1,1), 'IS6':(2,0,0), 'IS630':(1,1,7), 254 | # 'IS66':(1,0,2), 'IS91':(1,1,2), 'ISAS1':(2,0,0), 'ISH3':(8,1,3), 'ISNCY':(3,1,4)} 255 | # The full IS elements in the familys above might exist without perfect TIR with irId < 10. 256 | # We should hence exclude these familys when filtering out the partial IS elements without perfect TIR. 257 | #excludedFamilys = ['IS110', 'IS4', 'IS5', 'IS6', 'IS630', 'IS66', 'IS91', 'ISAS1', 'ISH3', 'ISNCY'] 258 | excludedFamilys = ['IS110', 'IS4', 'IS5', 'IS6', 'ISAS1', 'ISH3', 'ISNCY'] 259 | # 260 | # number of matches in tir alignment, 261 | # which are used for removing the potential falsely discovered IS elements (false positive) and partial IS elements without perfect TIR. 262 | # Refer to removeFalsePositive() and refineHits() in pred.py for more details. 
263 | cutoff4irId4short = 13 264 | cutoff4irId4long = 20 265 | cutoff4irId4multicopy = 10 266 | # 267 | # Paramter for removing potential falsely discovered novel IS elements (family 'new') and partial IS elements 268 | 269 | 270 | # width of line in fasta file created by us 271 | fastaLineWidth = 60 272 | 273 | # complementary table for DNA 274 | #------------------------------------------ 275 | # Code Represents Complement 276 | # A Adenine T 277 | # G Guanine C 278 | # C Cytosine G 279 | # T Thymine A 280 | # Y Pyrimidine (C or T) R 281 | # R Purine (A or G) Y 282 | # W weak (A or T) W 283 | # S strong (G or C) S 284 | # K keto (T or G) M 285 | # M amino (C or A) K 286 | # D A, G, T (not C) H 287 | # V A, C, G (not T) B 288 | # H A, C, T (not G) D 289 | # B C, G, T (not A) V 290 | # X/N any base X/N 291 | # - Gap - 292 | #------------------------------------------ 293 | #na1u = 'ATCGN' 294 | na1u = 'ATCGNRYWSKMDVHBX' 295 | #na2u = 'TAGCN' 296 | na2u = 'TAGCNYRWSMKHBDVX' 297 | #na1l = 'atcgn' 298 | na1l = 'atcgry' 299 | #na2l = 'tagcn' 300 | na2l = 'tagcyr' 301 | #na1ul = 'ATCGNatcgn' 302 | na1ul = 'ATCGRYatcgry' 303 | #na2ul = 'TAGCNtagcn' 304 | na2ul = 'TAGCYRtagcyr' 305 | 306 | # The Genetic Codes 307 | # Refer to http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes 308 | # The Bacterial, Archaeal and Plant Plastid Code (transl_table=11). 
309 | table11 = { 310 | 'starts': ('TTG', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTG'), 311 | 312 | 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 313 | 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 314 | 'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*', 315 | 'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W', 316 | 317 | 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 318 | 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 319 | 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 320 | 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 321 | 322 | 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 323 | 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 324 | 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 325 | 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 326 | 327 | 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 328 | 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 329 | 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 330 | 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', 331 | } 332 | gene2pepTable = {'11': table11} 333 | -------------------------------------------------------------------------------- /isPredict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import time, random 5 | import os 6 | import argparse 7 | import sys 8 | import datetime 9 | import operator 10 | import concurrent.futures 11 | 12 | import constants 13 | import tools 14 | import is_analysis 15 | import pred 16 | 17 | 18 | def genome2proteome(args2concurrent): 19 | print("\nBegin to translate genome into proteome.") 20 | 21 | for args in args2concurrent: 22 | outs = is_analysis.translate_genome_dna_v3(args) 23 | dna_file = args[0] 24 | if outs == 0: 25 | print('Translating genome into proteome for', dna_file, ', return ', outs) 26 | else: 27 | e = "Translating genome into proteome for {}, return error!".format(dna_file) 28 | raise RuntimeError(e) 29 | 30 | print("\nFinish translating genome into proteome.", 
def prepare4phmmer(clusterSeqFile4phmmer, proteome_files, path_to_hmmsearch_results, nthread):
    """Work out which proteomes still need a phmmer search.

    Args:
        clusterSeqFile4phmmer: query sequence file handed to phmmer; its base
            name is the prefix of every hit file name.
        proteome_files: [(faaFileName, org, update), ...]
            faaFileName: peptide sequence file output by FragGeneScan
            org: organism id which is the parent directory of DNA sequence file
            update: when true, the search is scheduled even if a hit file
                already exists
        path_to_hmmsearch_results: root directory of the hit files; each hit
            file goes into the sub-directory named after org.
        nthread: copied unchanged into every job tuple.

    Returns:
        (args2concurrent, outFiles4phmmer)
            args2concurrent: [(clusterSeqFile4phmmer, faaFileName, output_file, nthread), ...],
                one tuple per search that has to be run
            outFiles4phmmer: [output_file, ...]
                output_file: hmmer hits file with full path, e.g.
                /path/output4hmmsearch_illumina_5_cdhit30/HMASM/clusters.single.faa.SRS078176.scaffolds.fa.faa
                Listed for every non-empty proteome, scheduled or skipped.
    """
    # last line of a normal output file created by hmmer-3.1b2
    ok_marker = b'# [ok]\n'

    args2concurrent = []
    outFiles4phmmer = []
    query = os.path.basename(clusterSeqFile4phmmer)
    for faaFileName, org, update in proteome_files:
        if not os.path.isfile(faaFileName) or os.stat(faaFileName).st_size == 0:
            print('No such file or Empty file', faaFileName)
            continue
        fileName = '.'.join([query, os.path.basename(faaFileName)])
        output_file = os.path.join(path_to_hmmsearch_results, org, fileName)

        if update:
            callhmmer = True
        elif os.path.isfile(output_file) and os.stat(output_file).st_size > 0:
            # Binary mode makes the computed offset well defined, the context
            # manager closes the handle, and max() keeps the offset valid for
            # files shorter than the marker.
            with open(output_file, 'rb') as fp:
                fp.seek(0, os.SEEK_END)
                fp.seek(max(0, fp.tell() - len(ok_marker)))
                # an incomplete file is missing the marker line -> search again
                callhmmer = ok_marker not in fp.read()
        else:
            callhmmer = True

        if callhmmer:
            args2concurrent.append((clusterSeqFile4phmmer, faaFileName, output_file, nthread))
            tools.makedir(os.path.dirname(output_file))
        else:
            print('Skip phmmer {} against {}'.format(clusterSeqFile4phmmer, faaFileName))

        outFiles4phmmer.append(output_file)
    return (args2concurrent, outFiles4phmmer)
def prepare4hmmsearch(hmms_file, proteome_files, path_to_hmmsearch_results, nthread):
    """Work out which proteomes still need an hmmsearch run.

    Args:
        hmms_file: profile HMM file handed to hmmsearch; its base name is the
            prefix of every hit file name.
        proteome_files: [(faaFileName, org, update), ...]; update forces a new
            search even if a hit file already exists.
        path_to_hmmsearch_results: root directory of the hit files; each hit
            file goes into the sub-directory named after org.
        nthread: copied unchanged into every job tuple.

    Returns:
        (args2concurrent, outFiles4hmmsearch)
            args2concurrent: [(hmms_file, faaFileName, output_file, nthread), ...],
                one tuple per search that has to be run
            outFiles4hmmsearch: [output_file, ...]
                output_file: output of hmmsearch, e.g.
                clusters.faa.hmm.NC_000913.fna.faa, clusters.faa.hmm.SRS014235.scaffolds.fa.faa
                Listed for every non-empty proteome, scheduled or skipped.
    """
    # last line of a normal output file created by hmmer-3.1b2
    ok_marker = b'# [ok]\n'

    args2concurrent = []
    outFiles4hmmsearch = []
    query = os.path.basename(hmms_file)
    for faaFileName, org, update in proteome_files:
        if not os.path.isfile(faaFileName) or os.stat(faaFileName).st_size == 0:
            print('No such file or Empty file', faaFileName)
            continue
        fileName = '.'.join([query, os.path.basename(faaFileName)])
        output_file = os.path.join(path_to_hmmsearch_results, org, fileName)

        if update:
            callhmmer = True
        elif os.path.isfile(output_file) and os.stat(output_file).st_size > 0:
            # Binary mode makes the computed offset well defined, the context
            # manager closes the handle, and max() keeps the offset valid for
            # files shorter than the marker.
            with open(output_file, 'rb') as fp:
                fp.seek(0, os.SEEK_END)
                fp.seek(max(0, fp.tell() - len(ok_marker)))
                # an incomplete file is missing the marker line -> search again
                callhmmer = ok_marker not in fp.read()
        else:
            callhmmer = True

        if callhmmer:
            args2concurrent.append((hmms_file, faaFileName, output_file, nthread))
            tools.makedir(os.path.dirname(output_file))
        else:
            print('Skip hmmsearch {} against {}'.format(hmms_file, faaFileName))

        outFiles4hmmsearch.append(output_file)
    return (args2concurrent, outFiles4hmmsearch)


def hmmSearch(args2concurrent):
    """Run the profile HMM search for every job tuple, one after another.

    Args:
        args2concurrent: [(hmms_file, proteome_file, hmmHitsFile, nthread), ...];
            each tuple is passed as-is to is_analysis.is_hmmsearch_v2().

    Raises:
        RuntimeError: as soon as is_analysis.is_hmmsearch_v2() returns
            anything other than 0; the remaining jobs are not run.
    """
    print("\nBegin to profile HMM search against proteome database.", datetime.datetime.now().ctime())

    for args in args2concurrent:
        outs = is_analysis.is_hmmsearch_v2(args)
        hmms_file, proteome_file, hmmHitsFile, nthread = args
        if outs == 0:
            print('Finish Profile HMM searching', hmms_file, ' against', proteome_file, ', output', hmmHitsFile)
        else:
            e = 'Profile HMM searching ' + hmms_file + ' against ' + proteome_file + ', return error!\n'
            raise RuntimeError(e)

    print("\nFinish profile HMM searching against proteome database.", datetime.datetime.now().ctime())
def phmmerSearch(args2concurrent4phmmer):
    """Run phmmer for every job tuple in turn.

    Args:
        args2concurrent4phmmer: [(seqFile, proteome_file, hmmHitsFile, nthread), ...];
            each tuple is passed as-is to is_analysis.is_phmmer().

    Raises:
        RuntimeError: as soon as is_analysis.is_phmmer() returns anything
            other than 0; the remaining jobs are not run.
    """
    print("\nBegin to phmmer search against proteome database.", datetime.datetime.now().ctime())

    for job in args2concurrent4phmmer:
        status = is_analysis.is_phmmer(job)
        seqFile, proteome_file, hmmHitsFile, _nthread = job
        if status != 0:
            raise RuntimeError('phmmer searching ' + seqFile + ' against ' + proteome_file + ', return error!\n')
        print('Finish phmmer searching', seqFile, ' against', proteome_file, ', output', hmmHitsFile)

    print("\nFinish phmmer searching against proteome database.", datetime.datetime.now().ctime())


def translateGenomeByFGS_v2(dnaFiles, dir2proteome, nthread):
    """Make sure a proteome (.faa) file exists for each DNA file.

    Args:
        dnaFiles: [(file, org), ..., (file, org)]
        dir2proteome: root directory of the proteome files; the .faa file of a
            DNA file is <dir2proteome>/<org>/<DNA file base name>.faa
        nthread: copied unchanged into every translation job tuple.

    Returns:
        [(faaFile, org, update), ...] where update is True when the .faa file
        did not exist yet and a translation job was queued for it, and False
        when a non-empty .faa file was reused. DNA files whose existing .faa
        file is empty are left out.
    """
    # Settings placed in every job tuple for genome2proteome(). The alternatives
    # the original kept commented out were seq_type '1' with train_model
    # 'complete', and the train models sanger_5, sanger_10, 454_5, 454_10,
    # 454_30 and illumina_10.
    seq_type = '0'
    train_model = 'illumina_5'

    pending = []
    proteome_files = []
    for dna_file, org in dnaFiles:
        prefix = os.path.join(dir2proteome, org, os.path.basename(dna_file))
        faaFile = prefix + '.faa'

        if os.path.isfile(faaFile):
            if os.stat(faaFile).st_size == 0:
                print('No gene was found for', dna_file)
                continue
            print('Skip translating {} into {}'.format(dna_file, faaFile))
            update = False
        else:
            # proteome file has not been available: queue a translation job
            tools.makedir(os.path.dirname(faaFile))
            pending.append((dna_file, prefix, seq_type, train_model, nthread))
            update = True

        proteome_files.append((faaFile, org, update))

    # Translate genome into proteome.
    if pending:
        genome2proteome(pending)
    else:
        print('Skip translating genome into proteome.')
    return proteome_files
def proteinFromNCBI(dnaFiles, dir2proteome):
    """Write FragGeneScan-style protein files from GenBank annotation.

    For each DNA file the .gbk file with the same path and base name is
    converted by tools.gbk2fgs4protein() into
    <dir2proteome>/<org>/<DNA file name>.faa, e.g. NC_000913.fna and
    NC_000913.gbk give NC_000913.fna.faa.

    Args:
        dnaFiles: [(file, org), ..., (file, org)]
        dir2proteome: root directory of the proteome files.

    Returns:
        [(fgsFile, org, update), ...] with update always True.
    """
    proteome_files = []
    update = True
    for fnaFile, org in dnaFiles:
        # Strip the real extension instead of a fixed four characters, so that
        # genome1.fa or genome1.fasta also map to genome1.gbk.
        gbkFile = os.path.splitext(fnaFile)[0] + '.gbk'
        fgsFile = os.path.join(dir2proteome, org, os.path.basename(fnaFile + '.faa'))
        tools.gbk2fgs4protein(fnaFile, gbkFile, fgsFile)
        proteome_files.append((fgsFile, org, update))
    return proteome_files


def isPredict(dna_list, output, removeShortIS, translateGenome,
        nthread=1):
    """Run the pipeline: proteome, phmmer/hmmsearch, then pred.pred().

    Args:
        dna_list: file read by tools.rdDNAlist() to get the DNA files.
        output: output directory; proteomes go to <output>/proteome and
            search results to <output>/hmm.
        removeShortIS: forwarded to pred.pred(); also selects the closing
            message.
        translateGenome: True to build the proteome with
            translateGenomeByFGS_v2(), otherwise proteinFromNCBI() is used.
        nthread: forwarded to the translation, search and prediction steps.

    Raises:
        RuntimeError: if constants.file4clusterSeqFile4phmmer or
            constants.file4clusterHMM is missing or empty.
    """
    dnaFiles = tools.rdDNAlist(dna_list)
    path_to_proteome = os.path.join(output, 'proteome')
    if translateGenome == True:
        print ("predict and translate genes from genome sequence into protein database using FragGeneScan program")
        proteome_files = translateGenomeByFGS_v2(dnaFiles, path_to_proteome, nthread)
    else:
        print ("use NCBI protein database")
        proteome_files = proteinFromNCBI(dnaFiles, path_to_proteome)

    clusterSeqFile4phmmer = constants.file4clusterSeqFile4phmmer
    hmms_file = constants.file4clusterHMM

    # HMM searches against protein database
    #
    path_to_hmmsearch_results = os.path.join(output, 'hmm')
    if os.path.isfile(clusterSeqFile4phmmer) and os.stat(clusterSeqFile4phmmer).st_size > 0:
        args2concurrent4phmmer, outFiles4phmmer = prepare4phmmer(clusterSeqFile4phmmer,
                proteome_files, path_to_hmmsearch_results, nthread)
    else:  # no valid clusters.single.faa available
        e = clusterSeqFile4phmmer + ' is not found or empty!\n'
        raise RuntimeError(e)
    if len(args2concurrent4phmmer) > 0:
        phmmerSearch(args2concurrent4phmmer)

    if os.path.isfile(hmms_file) and os.stat(hmms_file).st_size > 0:
        args2concurrent4hmmsearch, outFiles4hmmsearch = prepare4hmmsearch(hmms_file,
                proteome_files, path_to_hmmsearch_results, nthread)
    else:  # no valid clusters.faa.hmm available
        e = hmms_file + ' is not found or empty!\n'
        raise RuntimeError(e)
    if len(args2concurrent4hmmsearch) > 0:
        hmmSearch(args2concurrent4hmmsearch)

    # Select significant ones (predictions) from hits returned by HMM search
    hitsFile = outFiles4phmmer + outFiles4hmmsearch
    if len(hitsFile) > 0:
        args4pred = {
            'dna_list': dna_list,
            'output': output,
            'path_to_proteome': path_to_proteome,
            'path_to_hmmsearch_results': path_to_hmmsearch_results,
            'hitsFile': hitsFile,
            'removeShortIS' : removeShortIS,
            'nthread': nthread,
            }
        pred.pred(args4pred)
        if removeShortIS is False:
            print('Both complete and partial IS elements are reported.')
        else:
            print('Only complete IS elements are reported.')
    else:
        e = 'No hit was returned by HMM search against protein database. ' + datetime.datetime.now().ctime()
        print(e)
# [repository-dump residue sharing this line range, kept verbatim] IS elements are reported.') 253 | else: 254 | e = 'No hit was returned by HMM search against protein database. ' + datetime.datetime.now().ctime() 255 | print(e) 256 | -------------------------------------------------------------------------------- /isescan.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# ISEScan version
version = '1.7.3'

import argparse
import os
import sys
import datetime

import isPredict


def isPredictSingle(args):
    """Run the ISEScan prediction pipeline on one sequence file.

    `args` is a dict with the keys 'seqfile', 'output', 'removeShortIS',
    'translateGenome' and 'nthread'.

    isPredict.isPredict() is given a list file rather than the sequence file
    itself, so a one-line list file naming `seqfile` is written to the current
    working directory and removed again afterwards.
    """
    print('ISEScan starts at', datetime.datetime.now().ctime())

    seqfile = args['seqfile']
    seqfilename = os.path.basename(seqfile)
    org = os.path.basename(os.path.dirname(seqfile))
    filelist = org + '_' + seqfilename + '.list'
    with open(filelist, 'w') as fp:
        fp.write(seqfile + '\n')

    try:
        isPredict.isPredict(filelist, args['output'], args['removeShortIS'],
                            args['translateGenome'], args['nthread'])
    finally:
        # Remove the temporary list file even when the prediction fails,
        # otherwise a crashed run leaves '<org>_<file>.list' behind.
        os.remove(filelist)
    print('ISEScan ends at', datetime.datetime.now().ctime())


if __name__ == "__main__":
    import textwrap

    # Parse command line arguments
    descriptStr = '''\
        ISEScan is a python pipeline to identify Insertion Sequence elements (both complete and incomplete IS elements) in genom. A typical invocation would be:
        python3 isescan.py --seqfile genome.fna --output results

        - If you want isescan to report only complete IS elements, you need to set command line option --removeShortIS.'''
    parser = argparse.ArgumentParser(
        prog='isescan',
        description=textwrap.dedent(descriptStr),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('--version', action='version', version='%(prog)s' + ' ' + version)

    parser.add_argument(
        '--removeShortIS',
        action='store_true',
        help="Remove incomplete (partial) IS elements which include IS element with length < 400 or single copy IS element without perfect TIR.",
        )

    # store_false: args.no_FragGeneScan is True unless the flag is given, i.e.
    # the attribute holds "translate the genome with FragGeneScan".
    parser.add_argument(
        '--no-FragGeneScan',
        action='store_false',
        help="Use the annotated protein sequences in NCBI GenBank file (.gbk which must be in the same folder with genome sequence file), instead of the protein sequences predicted/translated by FragGeneScan. (Experimental feature!)",
        )

    # Both options are required, so no default is declared: a default on a
    # required option can never take effect.
    parser.add_argument(
        '--seqfile',
        required=True,
        help="Sequence file in fasta format (required)",
        )

    parser.add_argument(
        '--output',
        required=True,
        help="Output directory (required)",
        )

    parser.add_argument(
        '--nthread',
        required=False,
        type=int,
        default=1,
        help='Number of CPU cores used for FragGeneScan and hmmer, 1 by default.')

    args = parser.parse_args()

    args4isPredictSingle = {
        'removeShortIS': args.removeShortIS,
        'translateGenome': args.no_FragGeneScan,
        'seqfile': args.seqfile.strip(),
        'output': args.output.strip(),
        'nthread': args.nthread,
        }

    isPredictSingle(args4isPredictSingle)
# [repository-dump residue sharing this line range, kept verbatim] -------------------------------------------------------------------------------- /publication/SupplementaryMaterials.docx: 
# [repository-dump residue sharing this line range, kept verbatim] -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/publication/SupplementaryMaterials.docx -------------------------------------------------------------------------------- /publication/SupplementaryMaterials.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/publication/SupplementaryMaterials.xlsx -------------------------------------------------------------------------------- /publication/btx433.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/publication/btx433.pdf -------------------------------------------------------------------------------- /pyssw.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
@package pyssw
@brief Python standalone program for ssw alignment using the C library
Complete-Striped-Smith-Waterman-Library
Biopython module is require for fastq/fastq parsing
@copyright [The MIT licence](http://opensource.org/licenses/MIT)
@author Adrien Leger - 2014
*
*
*
* [Github](https://github.com/a-slide)
* [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
"""

#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages
import optparse
import sys
from time import time
import gzip

#~~~~~~~MAIN FUNCTION~~~~~~~#

def _open_text(path):
    """
    Open a sequence file for reading as text.
    @param path Path of the file; a name ending in ".gz" is read through gzip
    @return A text-mode file object (usable as a context manager)
    """
    if path.rpartition(".")[2].lower() == "gz":
        # "rt", not "r": gzip's "r" is binary and yields bytes on Python 3
        return gzip.open(path, "rt")
    return open(path, "r")


def main (opt):
    """
    Align every query sequence against the subject and write "result.sam".
    @param opt Option object returned by optparser()
    Uses the names SeqIO and Aligner, which are imported in the top level
    instructions at the bottom of this file.
    """
    print ("Inport subject sequence")
    # Import fasta subject
    subject_handle = _open_text(opt.subject)
    try:
        subject = SeqIO.read(subject_handle, "fasta")
    finally:
        subject_handle.close()

    print ("Inport query sequences and count the number of sequences")
    # Count the queries first (needed for the progress bar)
    query_is_gz = opt.query.rpartition(".")[2].lower() == "gz"
    nseq = count_seq(opt.query, opt.qtype, query_is_gz)

    print("{} contains {} sequences to align".format(opt.query, nseq))
    # Calculate the steps (every 5%) at which progress is reported
    nseq_list = set(int(nseq*i/100.0) for i in range(5,101,5))

    print ("Initialize ssw aligner with the subject sequence")
    # Init the Aligner object with the reference value
    ssw = Aligner(
        str(subject.seq),
        match=int(opt.match),
        mismatch=int(opt.mismatch),
        gap_open=int(opt.gap_open),
        gap_extend= int(opt.gap_extend),
        report_secondary=False,
        report_cigar=True)

    query_handle = _open_text(opt.query)
    try:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        with open("result.sam", "w") as f:
            # Write the header of the SAM file
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
                opt.match,
                opt.mismatch,
                opt.gap_open,
                opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score,
                opt.min_len))

            print ("Starting alignment of queries against the subject sequence")
            start = time()
            # Align each query along the subject and write the result in the SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align (ssw, query, float(opt.min_score), int(opt.min_len))
                else:
                    al, orient = ssw.align(str(query.seq), float(opt.min_score), int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(sam_line(
                        qname=query.id,
                        flag=0 if orient else 16,
                        rname=subject.id,
                        pos=al.ref_begin+1,
                        cigar=al.cigar_string,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
                        tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(sam_line(
                        qname=query.id,
                        flag=4,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i+=1
                if i in nseq_list:
                    frac = i/float(nseq)
                    t = time()-start
                    print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))

            print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
    finally:
        query_handle.close()

#~~~~~~~HELPER FUNCTIONS~~~~~~~#


def sam_line (qname='*', flag=4, rname='*', pos=0, mapq=0, cigar='*', rnext='*', pnext=0, tlen=0, seq='*', qual='*', tags=None):
    """
    Return a minimal sam line = by default return an undetermined sam line. Check the document
    [SAM Format Specification](http://samtools.sourceforge.net/SAM1.pdf) for a full description.
    @param qname Query template NAME
    @param flag bitwise FLAG
    @param rname Reference sequence NAME of the alignment
    @param pos 1-based leftmost mapping POSition of the first matching base
    @param mapq MAPping Quality
    @param cigar CIGAR string
    @param rnext Reference sequence name of the primary alignment of the mate
    @param pnext 1-based leftmost position of the primary alignment of the mate
    @param tlen signed observed Template LENgth
    @param seq segment SEQuence
    @param qual ASCII of base QUALity plus 33
    @param tags list of optional tags
    @return A Sam alignment line, always terminated by a newline
    """
    fields = [qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual]
    if tags:
        # Optional fields are TAB-delimited like every other SAM column
        fields.extend(tags)
    # The newline is emitted in both cases; without it consecutive records
    # written for untagged (unaligned) reads end up on a single line.
    return "\t".join(str(field) for field in fields) + "\n"

def find_best_align (ssw, query, min_score, min_len):
    """
    Align a query in both orientations and keep the better alignment.
    @param ssw Aligner-like object exposing align(seq, min_score, min_len)
    @param query Record whose .seq provides reverse_complement()
    @param min_score Score filter forwarded to ssw.align
    @param min_len Length filter forwarded to ssw.align
    @return (alignment, True) for forward, (alignment, False) for reverse,
    (None, None) when neither orientation passes the filters. A score tie
    is resolved in favour of the forward orientation.
    """
    # Align reverse and forward query
    forward_al = ssw.align(str(query.seq), min_score, min_len)
    reverse_al = ssw.align(str(query.seq.reverse_complement()), min_score, min_len)

    # ssw.align gives a false value when the alignment is filtered out
    if not forward_al and not reverse_al:
        return (None, None)
    if not reverse_al:
        return (forward_al, True)
    if not forward_al:
        return (reverse_al, False)
    if forward_al.score >= reverse_al.score:
        return (forward_al, True)
    return (reverse_al, False)

def count_seq (filename, seq_type="fasta", gziped=False):
    """
    Count the number of sequences in a fasta or a fastq file
    @param filename Path to a valid readable file
    @param seq_type Should be either fasta or fastq. Default fasta
    @param gziped Boolean indicating if the file is gziped or not. Default False
    @return Number of sequences as an int
    """
    # Verify if the file is fasta or fastq type
    assert seq_type in ["fasta", "fastq"], "The file has to be either fastq or fasta format"

    # Text mode in both cases so that lines are str and can be compared to ">"
    opener = gzip.open if gziped else open
    with opener(filename, "rt") as f:
        # FASTA: every record starts with a ">" header line
        if seq_type == "fasta":
            return sum(1 for line in f if line.startswith(">"))

        # FASTQ: no motif to find, but 4 lines correspond to 1 sequence
        return sum(1 for _ in f) // 4

def optparser():
    """
    Define and parse the command line options.
    @return The optparse values object; exits with status 1 when the subject
    or the query file is missing
    """
    print("Parse command line options")
    # Usage and version strings
    program_name = "pyssw"
    program_version = 0.1
    version_string = "{}\t{}".format(program_name, program_version)
    usage_string = "{}.py -s subject.fasta -q fastq (or fasta) [Facultative options]".format(program_name)
    parser = optparse.OptionParser(usage = usage_string, version = version_string)

    # Define optparser options
    hstr = "Path of the fasta file containing the subject genome sequence. Can be gziped. [REQUIRED] "
    parser.add_option( '-s', '--subject', dest="subject", help=hstr)
    hstr = "Path of the fastq or fasta file containing the short read to be aligned. Can be gziped. [REQUIRED]"
    parser.add_option( '-q', '--query', dest="query", help=hstr)
    hstr = "Type of the query file = fastq or fasta. [default: fastq]"
    parser.add_option( '-t', '--qtype', dest="qtype", default="fastq", help=hstr)
    hstr = "Positive integer for weight match in genome sequence alignment. [default: 2]"
    parser.add_option( '-m', '--match', dest="match",default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight mismatch in genome sequence alignment. [default: 2]"
    parser.add_option( '-x', '--mismatch', dest="mismatch", default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap opening. [default: 3]"
    parser.add_option( '-o', '--gap_open', dest="gap_open", default=3, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap extension. [default: 1]"
    parser.add_option( '-e', '--gap_extend', dest="gap_extend", default=1, help=hstr)
    hstr = "Integer. Consider alignments having a score <= as not aligned. [default: 0]"
    parser.add_option( '-f', '--min_score', dest="min_score", default=0, help=hstr)
    hstr = "Integer. Consider alignments having a length <= as not aligned. [default: 0]"
    parser.add_option( '-l', '--min_len', dest="min_len", default=0, help=hstr)
    hstr = "Flag. Align query in forward and reverse orientation and choose the best alignment. [Set by default]"
    parser.add_option( '-r', '--reverse', dest="reverse", action="store_true", default=True, help=hstr)
    # -r is on by default and store_true can only switch it on, so a separate
    # switch is needed to request forward-only alignment.
    hstr = "Flag. Align query in forward orientation only [Unset by default]"
    parser.add_option( '--no-reverse', dest="reverse", action="store_false", help=hstr)
    hstr = "Flag. Write unaligned reads in sam output [Unset by default]"
    parser.add_option( '-u', '--unaligned', dest="unaligned", action="store_true", default=False, help=hstr)

    # Parse arg and return a dictionnary_like object of options
    opt, args = parser.parse_args()

    if not opt.subject:
        print ("\nERROR: a subject fasta file has to be provided (-s option)\n")
        parser.print_help()
        sys.exit(1)

    if not opt.query:
        print ("\nERROR: a query fasta or fastq file has to be provided (-q option)\n")
        parser.print_help()
        sys.exit(1)

    return opt

#~~~~~~~TOP LEVEL INSTRUCTIONS~~~~~~~#

if __name__ == '__main__':

    # try to import Third party and local packages
    try:
        from Bio import SeqIO
    except ImportError:
        print ("ERROR: Please install Biopython package")
        sys.exit(1)

    try:
        from ssw_wrap import Aligner
    except ImportError:
        print ("ERROR: Please place ssw_wrap in the current directory or add its dir to python path")
        sys.exit(1)

    # Parse command line arguments
    opt = optparser()
    # Run the main function
    main(opt)
# [repository-dump residue sharing this line range, kept verbatim] -------------------------------------------------------------------------------- /ssw201507/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CXX = g++ 3 | CFLAGS := -Wall -O3 -pipe #-pg 4 | CXXFLAGS := $(CFLAGS) 5 | LOBJS = ssw.o 6 | LCPPOBJS = ssw_cpp.o 7 | PROG = ssw_test 8 | LIB = libssw.so 9 | EXAMPLE = example_c 10 | EXAMPLE_CPP = example_cpp 11 | JAVA_JAR = ssw.jar 12 | JAVA_LIB = libsswjni.so 13 | JAVA_INLCUDES = -I"$(JAVA_HOME)/include" -I"$(JAVA_HOME)/include/linux" 14 | JAVA_OBJ = ssw/Aligner.class ssw/Alignment.class ssw/Example.class 15 | 16 | .PHONY: all default java clean 17 | 18 | default: $(PROG) $(EXAMPLE) $(EXAMPLE_CPP) $(LIB) 19 | 20 | all: default java 21 | 22 | java: $(JAVA_JAR) 
$(JAVA_LIB) 23 | 24 | $(LIB): ssw.c ssw.h 25 | $(CC) $(CFLAGS) -fPIC -shared -rdynamic -o $@ $< 26 | 27 | $(PROG): main.c kseq.h 28 | 29 | $(EXAMPLE): example.c 30 | 31 | $(PROG) $(EXAMPLE): $(LOBJS) 32 | $(CC) -o $@ $(filter-out %.h,$^) $(CFLAGS) -lm -lz 33 | 34 | $(EXAMPLE_CPP): example.cpp $(LOBJS) $(LCPPOBJS) 35 | $(CXX) -o $@ $^ $(CXXFLAGS) -lm -lz 36 | 37 | $(JAVA_LIB): sswjni.c ssw.c ssw.h 38 | $(CC) $(CFLAGS) $(JAVA_INLCUDES) -fPIC -shared -rdynamic -o $@ $< ssw.c 39 | 40 | $(JAVA_JAR): $(JAVA_OBJ) 41 | jar cvfe $@ ssw.Example $^ 42 | 43 | %.class: %.java 44 | javac $< 45 | 46 | ssw.o: ssw.c ssw.h 47 | $(CC) -c -o $@ $< $(CFLAGS) 48 | 49 | ssw_cpp.o: ssw_cpp.cpp ssw_cpp.h ssw.h 50 | $(CXX) -c -o $@ $< $(CXXFLAGS) 51 | 52 | clean: 53 | -rm -f $(LOBJS) $(LCPPOBJS) $(PROG) $(LIB) $(EXAMPLE) $(EXAMPLE_CPP) $(JAVA_LIB) $(JAVA_JAR) $(JAVA_OBJ) *~ 54 | -------------------------------------------------------------------------------- /ssw201507/__pycache__/ssw_wrap.cpython-33.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/ssw201507/__pycache__/ssw_wrap.cpython-33.pyc -------------------------------------------------------------------------------- /ssw201507/example.c: -------------------------------------------------------------------------------- 1 | /* example.c 2 | * This is a simple example to show you how to use the SSW C library. 3 | * To run this example: 4 | * 1) gcc -Wall -lz ssw.c example.c 5 | * 2) ./a.out 6 | * Created by Mengyao Zhao on 07/31/12. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include "ssw.h" 13 | 14 | // Print the BLAST like output. 
/*
 * Print an alignment result to stdout.
 *
 * Output, in order:
 *   1. the CIGAR as "<len><op>" pairs, bracketed by "hello" marker lines
 *      (NOTE(review): the "hello" lines look like leftover debug output - confirm);
 *   2. the optimal / sub-optimal scores and the 1-based begin/end positions
 *      (a begin position is printed only when the stored value is not -1);
 *   3. when a->cigar is non-NULL, a BLAST-like listing in chunks of 60 columns,
 *      each chunk made of a Target row, a match row and a Query row.
 *
 * a        alignment to print
 * ref_seq  reference (target) characters, indexed from a->ref_begin1
 * read_seq read (query) characters, indexed from a->read_begin1
 * table    maps a character to a code; two 'M' columns are shown as a match
 *          ('|') when both characters map to the same code
 *
 * NOTE(review): in this dump runs of blanks inside the format strings may have
 * been collapsed to one space - compare the literals with upstream before
 * relying on the column layout.
 */
static void ssw_write (const s_align* a,
            const char* ref_seq,
            const char* read_seq,
            const int8_t* table) {

    fprintf(stdout, "hello, %d\n", a->cigarLen);
    int i;
    fprintf(stdout, "hello\n");
    for(i = 0; i < a->cigarLen; ++i)
    {
        fprintf(stdout, "%d%c",cigar_int_to_len(a->cigar[i]), cigar_int_to_op(a->cigar[i]));
    }
    fprintf(stdout, "\nhello\n");

    fprintf(stdout, "optimal_alignment_score: %d\tsub-optimal_alignment_score: %d\t", a->score1, a->score2);
    if (a->ref_begin1 + 1) fprintf(stdout, "target_begin: %d\t", a->ref_begin1 + 1);
    fprintf(stdout, "target_end: %d\t", a->ref_end1 + 1);
    if (a->read_begin1 + 1) fprintf(stdout, "query_begin: %d\t", a->read_begin1 + 1);
    fprintf(stdout, "query_end: %d\n\n", a->read_end1 + 1);
    if (a->cigar) {
        /* c:    index of the CIGAR operation being printed
         * e:    index of the first CIGAR operation of the current chunk
         * left: columns of operation e still to print when the previous chunk
         *       ended in the middle of that operation (0 otherwise)
         * qb/pb: reference / read offset at which the current chunk starts */
        int32_t c = 0, left = 0, e = 0, qb = a->ref_begin1, pb = a->read_begin1;
        uint32_t i;    /* shadows the function-level int i declared above */
        while (e < a->cigarLen || left > 0) {
            int32_t count = 0;    /* columns printed so far in the current row */
            int32_t q = qb;
            int32_t p = pb;

            /* Row 1: reference characters, '-' for an insertion ('I') column */
            fprintf(stdout, "Target: %8d ", q + 1);
            for (c = e; c < a->cigarLen; ++c) {
                char letter = cigar_int_to_op(a->cigar[c]);
                uint32_t length = cigar_int_to_len(a->cigar[c]);
                /* only the first operation of a chunk can be a carried-over remainder */
                uint32_t l = (count == 0 && left > 0) ? left: length;
                for (i = 0; i < l; ++i) {
                    if (letter == 'I') fprintf(stdout, "-");
                    else {
                        fprintf(stdout, "%c", *(ref_seq + q));
                        ++ q;
                    }
                    ++ count;
                    if (count == 60) goto step2;
                }
            }
step2:
            /* Row 2: '|' for an 'M' column whose two characters share a code, '*' otherwise */
            fprintf(stdout, " %d\n ", q);
            q = qb;
            count = 0;
            for (c = e; c < a->cigarLen; ++c) {
                char letter = cigar_int_to_op(a->cigar[c]);
                uint32_t length = cigar_int_to_len(a->cigar[c]);
                uint32_t l = (count == 0 && left > 0) ? left: length;
                for (i = 0; i < l; ++i){
                    if (letter == 'M') {
                        if (table[(int)*(ref_seq + q)] == table[(int)*(read_seq + p)])fprintf(stdout, "|");
                        else fprintf(stdout, "*");
                        ++q;
                        ++p;
                    } else {
                        fprintf(stdout, "*");
                        if (letter == 'I') ++p;
                        else ++q;
                    }
                    ++ count;
                    if (count == 60) {
                        qb = q;    /* the next chunk's reference offset */
                        goto step3;
                    }
                }
            }
step3:
            /* Row 3: read characters, '-' for a deletion ('D') column */
            p = pb;
            fprintf(stdout, "\nQuery: %8d ", p + 1);
            count = 0;
            for (c = e; c < a->cigarLen; ++c) {
                char letter = cigar_int_to_op(a->cigar[c]);
                uint32_t length = cigar_int_to_len(a->cigar[c]);
                uint32_t l = (count == 0 && left > 0) ? left: length;
                for (i = 0; i < l; ++i) {
                    if (letter == 'D') fprintf(stdout, "-");
                    else {
                        fprintf(stdout, "%c", *(read_seq + p));
                        ++p;
                    }
                    ++ count;
                    if (count == 60) {
                        /* chunk full: remember where the next chunk resumes */
                        pb = p;
                        left = l - i - 1;
                        e = (left == 0) ? (c + 1) : c;
                        goto end;
                    }
                }
            }
            /* every operation was printed without filling the chunk: ends the while loop */
            e = c;
            left = 0;
end:
            fprintf(stdout, " %d\n\n", p);
        }
    }
}

// Align a pair of genome sequences.
114 | int main (int argc, char * const argv[]) { 115 | int32_t l, m, k, match = 2, mismatch = 2, gap_open = 3, gap_extension = 1; // default parameters for genome sequence alignment 116 | // reference sequence 117 | static const char ref_seq[40] = {'A', 'A', 'G', 'C', 'C', 'T', 'T', 'T', 'C', 'T', 'G', 'A', 'C', 'C', 'C', 'G', 'G', 'A', 'A', 'A', 'T', 118 | 'C', 'A', 'A', 'A', 'A', 'T', 'A', 'G', 'G', 'C', 'A', 'C', 'A', 'A', 'C', 'A', 'A', 'A', '\0'}; 119 | static const char read_seq[16] = {'C', 'T', 'G', 'A', 'G', 'C', 'C', 'G', 'G', 'T', 'A', 'A', 'A', 'T', 'C', '\0'}; // read sequence 120 | s_profile* profile; 121 | int8_t* num = (int8_t*)malloc(16); // the read sequence represented in numbers 122 | int8_t* ref_num = (int8_t*)malloc(64); // the read sequence represented in numbers 123 | s_align* result; 124 | 125 | /* This table is used to transform nucleotide letters into numbers. */ 126 | static const int8_t nt_table[128] = { 127 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 128 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 129 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 130 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 131 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 132 | 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 133 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 134 | 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 135 | }; 136 | 137 | // initialize scoring matrix for genome sequences 138 | // A C G T N (or other ambiguous code) 139 | // 2 -2 -2 -2 0 A 140 | // -2 2 -2 -2 0 C 141 | // -2 -2 2 -2 0 G 142 | // -2 -2 -2 2 0 T 143 | // 0 0 0 0 0 N (or other ambiguous code) 144 | int8_t* mat = (int8_t*)calloc(25, sizeof(int8_t)); 145 | for (l = k = 0; l < 4; ++l) { 146 | for (m = 0; m < 4; ++m) mat[k++] = l == m ? 
match : - mismatch; /* weight_match : -weight_mismatch */ 147 | mat[k++] = 0; // ambiguous base: no penalty 148 | } 149 | for (m = 0; m < 5; ++m) mat[k++] = 0; 150 | 151 | for (m = 0; m < 15; ++m) num[m] = nt_table[(int)read_seq[m]]; 152 | profile = ssw_init(num, 15, mat, 5, 2); 153 | for (m = 0; m < 39; ++m) ref_num[m] = nt_table[(int)ref_seq[m]]; 154 | 155 | // Only the 8 bit of the flag is setted. ssw_align will always return the best alignment beginning position and cigar. 156 | result = ssw_align (profile, ref_num, 39, gap_open, gap_extension, 1, 0, 0, 15); 157 | ssw_write(result, ref_seq, read_seq, nt_table); 158 | 159 | align_destroy(result); 160 | init_destroy(profile); 161 | free(mat); 162 | free(ref_num); 163 | free(num); 164 | return(0); 165 | } 166 | -------------------------------------------------------------------------------- /ssw201507/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* Last Modified: 12APR2009 */ 29 | 30 | #ifndef AC_KSEQ_H 31 | #define AC_KSEQ_H 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 38 | #define KS_SEP_TAB 1 // isspace() && !' ' 39 | #define KS_SEP_MAX 1 40 | 41 | #define __KS_TYPE(type_t) \ 42 | typedef struct __kstream_t { \ 43 | char *buf; \ 44 | int begin, end, is_eof; \ 45 | type_t f; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(type_t, __bufsize) \ 52 | static inline kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; \ 56 | ks->buf = (char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | static inline void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (ks) { \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } \ 65 | } 66 | 67 | #define __KS_GETC(__read, __bufsize) \ 68 | static inline int ks_getc(kstream_t *ks) \ 69 | { \ 70 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 71 | if (ks->begin >= ks->end) { \ 72 | ks->begin = 0; \ 73 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 74 | if (ks->end < __bufsize) ks->is_eof = 1; \ 75 | if (ks->end == 0) return -1; \ 76 | } \ 77 | return (int)ks->buf[ks->begin++]; \ 78 | } 79 | 80 | #ifndef KSTRING_T 81 | #define KSTRING_T kstring_t 82 | typedef struct __kstring_t { 83 | size_t l, m; 84 | char *s; 85 | } kstring_t; 86 | #endif 87 | 88 | #ifndef kroundup32 89 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, 
++(x)) 90 | #endif 91 | 92 | #define __KS_GETUNTIL(__read, __bufsize) \ 93 | static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 94 | { \ 95 | if (dret) *dret = 0; \ 96 | str->l = 0; \ 97 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 98 | for (;;) { \ 99 | int i; \ 100 | if (ks->begin >= ks->end) { \ 101 | if (!ks->is_eof) { \ 102 | ks->begin = 0; \ 103 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 104 | if (ks->end < __bufsize) ks->is_eof = 1; \ 105 | if (ks->end == 0) break; \ 106 | } else break; \ 107 | } \ 108 | if (delimiter > KS_SEP_MAX) { \ 109 | for (i = ks->begin; i < ks->end; ++i) \ 110 | if (ks->buf[i] == delimiter) break; \ 111 | } else if (delimiter == KS_SEP_SPACE) { \ 112 | for (i = ks->begin; i < ks->end; ++i) \ 113 | if (isspace(ks->buf[i])) break; \ 114 | } else if (delimiter == KS_SEP_TAB) { \ 115 | for (i = ks->begin; i < ks->end; ++i) \ 116 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 117 | } else i = 0; /* never come to here! 
*/ \ 118 | if (str->m - str->l < i - ks->begin + 1) { \ 119 | str->m = str->l + (i - ks->begin) + 1; \ 120 | kroundup32(str->m); \ 121 | str->s = (char*)realloc(str->s, str->m); \ 122 | } \ 123 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 124 | str->l = str->l + (i - ks->begin); \ 125 | ks->begin = i + 1; \ 126 | if (i < ks->end) { \ 127 | if (dret) *dret = ks->buf[i]; \ 128 | break; \ 129 | } \ 130 | } \ 131 | if (str->l == 0) { \ 132 | str->m = 1; \ 133 | str->s = (char*)calloc(1, 1); \ 134 | } \ 135 | str->s[str->l] = '\0'; \ 136 | return str->l; \ 137 | } 138 | 139 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ 140 | __KS_TYPE(type_t) \ 141 | __KS_BASIC(type_t, __bufsize) \ 142 | __KS_GETC(__read, __bufsize) \ 143 | __KS_GETUNTIL(__read, __bufsize) 144 | 145 | #define __KSEQ_BASIC(type_t) \ 146 | static inline kseq_t *kseq_init(type_t fd) \ 147 | { \ 148 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 149 | s->f = ks_init(fd); \ 150 | return s; \ 151 | } \ 152 | static inline void kseq_rewind(kseq_t *ks) \ 153 | { \ 154 | ks->last_char = 0; \ 155 | ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ 156 | } \ 157 | static inline void kseq_destroy(kseq_t *ks) \ 158 | { \ 159 | if (!ks) return; \ 160 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 161 | ks_destroy(ks->f); \ 162 | free(ks); \ 163 | } 164 | 165 | /* Return value: 166 | >=0 length of the sequence (normal) 167 | -1 end-of-file 168 | -2 truncated quality string 169 | */ 170 | #define __KSEQ_READ \ 171 | static int kseq_read(kseq_t *seq) \ 172 | { \ 173 | int c; \ 174 | kstream_t *ks = seq->f; \ 175 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 176 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 177 | if (c == -1) return -1; /* end of file */ \ 178 | seq->last_char = c; \ 179 | } /* the first header char has been read */ \ 180 | seq->comment.l = seq->seq.l = seq->qual.l = 0; \ 181 | if (ks_getuntil(ks, 0, 
&seq->name, &c) < 0) return -1; \ 182 | if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ 183 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 184 | if (isgraph(c)) { /* printable non-space character */ \ 185 | if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ 186 | seq->seq.m = seq->seq.l + 2; \ 187 | kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ 188 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 189 | } \ 190 | seq->seq.s[seq->seq.l++] = (char)c; \ 191 | } \ 192 | } \ 193 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 194 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 195 | if (c != '+') return seq->seq.l; /* FASTA */ \ 196 | if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ 197 | seq->qual.m = seq->seq.m; \ 198 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 199 | } \ 200 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 201 | if (c == -1) return -2; /* we should not stop here */ \ 202 | while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ 203 | if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ 204 | seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ 205 | seq->last_char = 0; /* we have not come to the next header line */ \ 206 | if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ 207 | return seq->seq.l; \ 208 | } 209 | 210 | #define __KSEQ_TYPE(type_t) \ 211 | typedef struct { \ 212 | kstring_t name, comment, seq, qual; \ 213 | int last_char; \ 214 | kstream_t *f; \ 215 | } kseq_t; 216 | 217 | #define KSEQ_INIT(type_t, __read) \ 218 | KSTREAM_INIT(type_t, __read, 4096) \ 219 | __KSEQ_TYPE(type_t) \ 220 | __KSEQ_BASIC(type_t) \ 221 | __KSEQ_READ 222 | 223 | #endif 224 | -------------------------------------------------------------------------------- /ssw201507/license.ssw.txt: 
-------------------------------------------------------------------------------- 1 | SSW Library: An SIMD Smith-Waterman C/C++/Python/Java Library for Use in Genomic Applications 2 | 3 | License: MIT 4 | 5 | Copyright (c) 2012-2015 Boston College 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 12 | 13 | -------------------------------------------------------------------------------- /ssw201507/main.c: -------------------------------------------------------------------------------- 1 | /* main.c 2 | * Created by Mengyao Zhao on 06/23/11. 3 | * Version 0.1.5 4 | * Last revision by Mengyao Zhao on 06/27/14. 
/*! @function
  @abstract  Write the reverse complement of a nucleotide string.
  @param  seq  NUL-terminated input sequence (A/C/G/T/U, either case).
  @param  rc   Output buffer of at least strlen(seq) + 1 bytes. It must not
               overlap seq: the input is read while the output is written.
  @discussion  A, C, G, T and U (upper or lower case) are complemented to the
               upper-case letters T, G, C, A and A. Every other byte is
               written as the value 4, the code used for an ambiguous base by
               the lookup table below. Bytes >= 0x80 are handled explicitly:
               they used to be cast to a signed 8-bit index, which read
               outside the 128-entry table.
 */
static void reverse_comple(const char* seq, char* rc) {
	static const int8_t rc_table[128] = {
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 84, 4, 71, 4,  4,  4, 67, 4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  65, 65, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 84, 4, 71, 4,  4,  4, 67, 4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  65, 65, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
	};
	/* size_t avoids truncating the length of very long sequences. */
	const size_t len = strlen(seq);
	size_t i;

	for (i = 0; i < len; ++i) {
		/* Read through unsigned char so the index can never be negative. */
		const unsigned char base = (unsigned char)seq[len - 1 - i];
		rc[i] = (base < 128) ? (char)rc_table[base] : (char)4;
	}
	rc[len] = '\0';
}
| fprintf(stdout, "target_name: %s\nquery_name: %s\noptimal_alignment_score: %d\t", ref_seq->name.s, read->name.s, a->score1); 69 | if (a->score2 > 0) fprintf(stdout, "suboptimal_alignment_score: %d\t", a->score2); 70 | if (strand == 0) fprintf(stdout, "strand: +\t"); 71 | else fprintf(stdout, "strand: -\t"); 72 | if (a->ref_begin1 + 1) fprintf(stdout, "target_begin: %d\t", a->ref_begin1 + 1); 73 | fprintf(stdout, "target_end: %d\t", a->ref_end1 + 1); 74 | if (a->read_begin1 + 1) fprintf(stdout, "query_begin: %d\t", a->read_begin1 + 1); 75 | fprintf(stdout, "query_end: %d\n\n", a->read_end1 + 1); 76 | if (a->cigar) { 77 | int32_t c = 0, left = 0, e = 0, qb = a->ref_begin1, pb = a->read_begin1; 78 | uint32_t i; 79 | while (e < a->cigarLen || left > 0) { 80 | int32_t count = 0; 81 | int32_t q = qb; 82 | int32_t p = pb; 83 | fprintf(stdout, "Target: %8d ", q + 1); 84 | for (c = e; c < a->cigarLen; ++c) { 85 | char letter = cigar_int_to_op(a->cigar[c]); 86 | uint32_t length = cigar_int_to_len(a->cigar[c]); 87 | uint32_t l = (count == 0 && left > 0) ? left: length; 88 | for (i = 0; i < l; ++i) { 89 | if (letter == 'I') fprintf(stdout, "-"); 90 | else { 91 | fprintf(stdout, "%c", *(ref_seq->seq.s + q)); 92 | ++ q; 93 | } 94 | ++ count; 95 | if (count == 60) goto step2; 96 | } 97 | } 98 | step2: 99 | fprintf(stdout, " %d\n ", q); 100 | q = qb; 101 | count = 0; 102 | for (c = e; c < a->cigarLen; ++c) { 103 | char letter = cigar_int_to_op(a->cigar[c]); 104 | uint32_t length = cigar_int_to_len(a->cigar[c]); 105 | uint32_t l = (count == 0 && left > 0) ? 
left: length; 106 | for (i = 0; i < l; ++i){ 107 | if (letter == 'M') { 108 | if (table[(int)*(ref_seq->seq.s + q)] == table[(int)*(read_seq + p)])fprintf(stdout, "|"); 109 | else fprintf(stdout, "*"); 110 | ++q; 111 | ++p; 112 | } else { 113 | fprintf(stdout, " "); 114 | if (letter == 'I') ++p; 115 | else ++q; 116 | } 117 | ++ count; 118 | if (count == 60) { 119 | qb = q; 120 | goto step3; 121 | } 122 | } 123 | } 124 | step3: 125 | p = pb; 126 | fprintf(stdout, "\nQuery: %8d ", p + 1); 127 | count = 0; 128 | for (c = e; c < a->cigarLen; ++c) { 129 | char letter = cigar_int_to_op(a->cigar[c]); 130 | uint32_t length = cigar_int_to_len(a->cigar[c]); 131 | uint32_t l = (count == 0 && left > 0) ? left: length; 132 | for (i = 0; i < l; ++i) { 133 | if (letter == 'D') fprintf(stdout, "-"); 134 | else { 135 | fprintf(stdout, "%c", *(read_seq + p)); 136 | ++p; 137 | } 138 | ++ count; 139 | if (count == 60) { 140 | pb = p; 141 | left = l - i - 1; 142 | e = (left == 0) ? (c + 1) : c; 143 | goto end; 144 | } 145 | } 146 | } 147 | e = c; 148 | left = 0; 149 | end: 150 | fprintf(stdout, " %d\n\n", p); 151 | } 152 | } 153 | }else { // Sam format output 154 | fprintf(stdout, "%s\t", read->name.s); 155 | if (a->score1 == 0) fprintf(stdout, "4\t*\t0\t255\t*\t*\t0\t0\t*\t*\n"); 156 | else { 157 | int32_t c, l = a->read_end1 - a->read_begin1 + 1, qb = a->ref_begin1, pb = a->read_begin1, p; 158 | uint32_t mapq = -4.343 * log(1 - (double)abs(a->score1 - a->score2)/(double)a->score1); 159 | mapq = (uint32_t) (mapq + 4.99); 160 | mapq = mapq < 254 ? 
mapq : 254; 161 | if (strand) fprintf(stdout, "16\t"); 162 | else fprintf(stdout, "0\t"); 163 | fprintf(stdout, "%s\t%d\t%d\t", ref_seq->name.s, a->ref_begin1 + 1, mapq); 164 | for (c = 0; c < a->cigarLen; ++c) { 165 | char letter = cigar_int_to_op(a->cigar[c]); 166 | uint32_t length = cigar_int_to_len(a->cigar[c]); 167 | fprintf(stdout, "%lu%c", (unsigned long)length, letter); 168 | } 169 | fprintf(stdout, "\t*\t0\t0\t"); 170 | for (c = a->read_begin1; c <= a->read_end1; ++c) fprintf(stdout, "%c", read_seq[c]); 171 | fprintf(stdout, "\t"); 172 | if (read->qual.s && strand) { 173 | p = a->read_end1; 174 | for (c = 0; c < l; ++c) { 175 | fprintf(stdout, "%c", read->qual.s[p]); 176 | --p; 177 | } 178 | }else if (read->qual.s){ 179 | p = a->read_begin1; 180 | for (c = 0; c < l; ++c) { 181 | fprintf(stdout, "%c", read->qual.s[p]); 182 | ++p; 183 | } 184 | } else fprintf(stdout, "*"); 185 | fprintf(stdout, "\tAS:i:%d", a->score1); 186 | mapq = 0; // counter of difference 187 | for (c = 0; c < a->cigarLen; ++c) { 188 | char letter = cigar_int_to_op(a->cigar[c]); 189 | uint32_t length = cigar_int_to_len(a->cigar[c]); 190 | if (letter == 'M') { 191 | for (p = 0; p < length; ++p){ 192 | if (table[(int)*(ref_seq->seq.s + qb)] != table[(int)*(read_seq + pb)]) ++mapq; 193 | ++qb; 194 | ++pb; 195 | } 196 | } else if (letter == 'I') { 197 | pb += length; 198 | mapq += length; 199 | } else { 200 | qb += length; 201 | mapq += length; 202 | } 203 | } 204 | fprintf(stdout,"\tNM:i:%d\t", mapq); 205 | if (a->score2 > 0) fprintf(stdout, "ZS:i:%d\n", a->score2); 206 | else fprintf(stdout, "\n"); 207 | } 208 | } 209 | } 210 | 211 | int main (int argc, char * const argv[]) { 212 | clock_t start, end; 213 | float cpu_time; 214 | gzFile read_fp, ref_fp; 215 | kseq_t *read_seq, *ref_seq; 216 | int32_t l, m, k, match = 2, mismatch = 2, gap_open = 3, gap_extension = 1, path = 0, reverse = 0, n = 5, sam = 0, protein = 0, header = 0, s1 = 67108864, s2 = 128, filter = 0; 217 | int8_t* mata = 
(int8_t*)calloc(25, sizeof(int8_t)); 218 | const int8_t* mat = mata; 219 | char mat_name[16]; 220 | mat_name[0] = '\0'; 221 | int8_t* ref_num = (int8_t*)malloc(s1); 222 | int8_t* num = (int8_t*)malloc(s2), *num_rc = 0; 223 | char* read_rc = 0; 224 | 225 | static const int8_t mat50[] = { 226 | // A R N D C Q E G H I L K M F P S T W Y V B Z X * 227 | 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, -2, -1, -1, -5, // A 228 | -2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, -3, -3, -1, -1, -3, -1, -3, -1, 0, -1, -5, // R 229 | -1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, -4, -2, 1, 0, -4, -2, -3, 5, 0, -1, -5, // N 230 | -2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, -5, -1, 0, -1, -5, -3, -4, 6, 1, -1, -5, // D 231 | -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -3, -3, -1, -5, // C 232 | -1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, -4, -1, 0, -1, -1, -1, -3, 0, 4, -1, -5, // Q 233 | -1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, -3, -1, -1, -1, -3, -2, -3, 1, 5, -1, -5, // E 234 | 0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, -4, -2, 0, -2, -3, -3, -4, -1, -2, -1, -5, // G 235 | -2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, -1, -2, -1, -2, -3, 2, -4, 0, 0, -1, -5, // H 236 | -1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 0, -3, -3, -1, -3, -1, 4, -4, -3, -1, -5, // I 237 | -2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 1, -4, -3, -1, -2, -1, 1, -4, -3, -1, -5, // L 238 | -1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, -4, -1, 0, -1, -3, -2, -3, 0, 1, -1, -5, // K 239 | -1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 0, -3, -2, -1, -1, 0, 1, -3, -1, -1, -5, // M 240 | -3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 8, -4, -3, -2, 1, 4, -1, -4, -4, -1, -5, // F 241 | -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3, -2, -1, -1, -5, // P 242 | 1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, -3, -1, 5, 2, -4, -2, -2, 0, 0, -1, -5, // S 243 | 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, 
-1, -1, -1, -2, -1, 2, 5, -3, -2, 0, 0, -1, -1, -5, // T 244 | -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 1, -4, -4, -3, 15, 2, -3, -5, -2, -1, -5, // W 245 | -2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 4, -3, -2, -2, 2, 8, -1, -3, -2, -1, -5, // Y 246 | 0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, -1, -3, -2, 0, -3, -1, 5, -3, -3, -1, -5, // V 247 | -2, -1, 5, 6, -3, 0, 1, -1, 0, -4, -4, 0, -3, -4, -2, 0, 0, -5, -3, -3, 6, 1, -1, -5, // B 248 | -1, 0, 0, 1, -3, 4, 5, -2, 0, -3, -3, 1, -1, -4, -1, 0, -1, -2, -2, -3, 1, 5, -1, -5, // Z 249 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, // X 250 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1 // * 251 | }; 252 | 253 | /* This table is used to transform amino acid letters into numbers. */ 254 | int8_t aa_table[128] = { 255 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 256 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 257 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 258 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 259 | 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23, 260 | 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23, 261 | 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23, 262 | 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23 263 | }; 264 | 265 | /* This table is used to transform nucleotide letters into numbers. 
*/ 266 | int8_t nt_table[128] = { 267 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 268 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 269 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 270 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 271 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 272 | 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 273 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 274 | 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 275 | }; 276 | 277 | int8_t* table = nt_table; 278 | 279 | // Parse command line. 280 | while ((l = getopt(argc, argv, "m:x:o:e:a:f:pcrsh")) >= 0) { 281 | switch (l) { 282 | case 'm': match = atoi(optarg); break; 283 | case 'x': mismatch = atoi(optarg); break; 284 | case 'o': gap_open = atoi(optarg); break; 285 | case 'e': gap_extension = atoi(optarg); break; 286 | case 'a': strcpy(mat_name, optarg); break; 287 | case 'f': filter = atoi(optarg); break; 288 | case 'p': protein = 1; break; 289 | case 'c': path = 1; break; 290 | case 'r': reverse = 1; break; 291 | case 's': sam = 1; break; 292 | case 'h': header = 1; break; 293 | } 294 | } 295 | if (optind + 2 > argc) { 296 | fprintf(stderr, "\n"); 297 | fprintf(stderr, "Usage: ssw_test [options] ... (or )\n"); 298 | fprintf(stderr, "Options:\n"); 299 | fprintf(stderr, "\t-m N\tN is a positive integer for weight match in genome sequence alignment. [default: 2]\n"); 300 | fprintf(stderr, "\t-x N\tN is a positive integer. -N will be used as weight mismatch in genome sequence alignment. [default: 2]\n"); 301 | fprintf(stderr, "\t-o N\tN is a positive integer. -N will be used as the weight for the gap opening. [default: 3]\n"); 302 | fprintf(stderr, "\t-e N\tN is a positive integer. -N will be used as the weight for the gap extension. [default: 1]\n"); 303 | fprintf(stderr, "\t-p\tDo protein sequence alignment. Without this option, the ssw_test will do genome sequence alignment.\n"); 304 | fprintf(stderr, "\t-a FILE\tFILE is either the Blosum or Pam weight matrix. 
[default: Blosum50]\n"); 305 | fprintf(stderr, "\t-c\tReturn the alignment path.\n"); 306 | fprintf(stderr, "\t-f N\tN is a positive integer. Only output the alignments with the Smith-Waterman score >= N.\n"); 307 | fprintf(stderr, "\t-r\tThe best alignment will be picked between the original read alignment and the reverse complement read alignment.\n"); 308 | fprintf(stderr, "\t-s\tOutput in SAM format. [default: no header]\n"); 309 | fprintf(stderr, "\t-h\tIf -s is used, include header in SAM output.\n\n"); 310 | return 1; 311 | } 312 | 313 | // initialize scoring matrix for genome sequences 314 | for (l = k = 0; LIKELY(l < 4); ++l) { 315 | for (m = 0; LIKELY(m < 4); ++m) mata[k++] = l == m ? match : -mismatch; /* weight_match : -weight_mismatch */ 316 | mata[k++] = 0; // ambiguous base 317 | } 318 | for (m = 0; LIKELY(m < 5); ++m) mata[k++] = 0; 319 | 320 | if (protein == 1 && (! strcmp(mat_name, "\0"))) { 321 | n = 24; 322 | table = aa_table; 323 | mat = mat50; 324 | } else if (strcmp(mat_name, "\0")) { 325 | 326 | // Parse score matrix. 
327 | FILE *f_mat = fopen(mat_name, "r"); 328 | char line[128]; 329 | mata = (int8_t*)realloc(mata, 1024 * sizeof(int8_t)); 330 | k = 0; 331 | m = 0; 332 | while (fgets(line, 128, f_mat)) { 333 | if (line[0] == '*' || (line[0] >= 'A' && line[0] <= 'Z')) { 334 | if (line[0] >= 'A' && line[0] <= 'Z') aa_table[(int)line[0]] = aa_table[(int)line[0] + 32] = m; 335 | char str[4], *s = str; 336 | str[0] = '\0'; 337 | l = 1; 338 | while (line[l]) { 339 | if ((line[l] >= '0' && line[l] <= '9') || line[l] == '-') *s++ = line[l]; 340 | else if (str[0] != '\0') { 341 | *s = '\0'; 342 | mata[k++] = (int8_t)atoi(str); 343 | s = str; 344 | str[0] = '\0'; 345 | } 346 | ++l; 347 | } 348 | if (str[0] != '\0') { 349 | *s = '\0'; 350 | mata[k++] = (int8_t)atoi(str); 351 | s = str; 352 | str[0] = '\0'; 353 | } 354 | ++m; 355 | } 356 | } 357 | if (k == 0) { 358 | fprintf(stderr, "Problem of reading the weight matrix file.\n"); 359 | return 1; 360 | } 361 | fclose(f_mat); 362 | n = m; 363 | table = aa_table; 364 | mat = mata; 365 | } 366 | 367 | read_fp = gzopen(argv[optind + 1], "r"); 368 | read_seq = kseq_init(read_fp); 369 | if (sam && header && path) { 370 | fprintf(stdout, "@HD\tVN:1.4\tSO:queryname\n"); 371 | ref_fp = gzopen(argv[optind], "r"); 372 | ref_seq = kseq_init(ref_fp); 373 | while ((l = kseq_read(ref_seq)) >= 0) fprintf(stdout, "@SQ\tSN:%s\tLN:%d\n", ref_seq->name.s, (int32_t)ref_seq->seq.l); 374 | kseq_destroy(ref_seq); 375 | gzclose(ref_fp); 376 | } else if (sam && !path) { 377 | fprintf(stderr, "SAM format output is only available together with option -c.\n"); 378 | sam = 0; 379 | } 380 | 381 | // alignment 382 | if (reverse == 1 && n == 5) { 383 | read_rc = (char*)malloc(s2); 384 | num_rc = (int8_t*)malloc(s2); 385 | } 386 | start = clock(); 387 | while (kseq_read(read_seq) >= 0) { 388 | s_profile* p, *p_rc = 0; 389 | int32_t readLen = read_seq->seq.l; 390 | int32_t maskLen = readLen / 2; 391 | // int32_t maskLen = 2*readLen; 392 | 393 | while (readLen >= s2) { 394 | 
++s2; 395 | kroundup32(s2); 396 | num = (int8_t*)realloc(num, s2); 397 | if (reverse == 1 && n == 5) { 398 | read_rc = (char*)realloc(read_rc, s2); 399 | num_rc = (int8_t*)realloc(num_rc, s2); 400 | } 401 | } 402 | for (m = 0; m < readLen; ++m) num[m] = table[(int)read_seq->seq.s[m]]; 403 | p = ssw_init(num, readLen, mat, n, 2); 404 | if (reverse == 1 && n == 5) { 405 | reverse_comple(read_seq->seq.s, read_rc); 406 | for (m = 0; m < readLen; ++m) num_rc[m] = table[(int)read_rc[m]]; 407 | p_rc = ssw_init(num_rc, readLen, mat, n, 2); 408 | }else if (reverse == 1 && n == 24) { 409 | fprintf (stderr, "Reverse complement alignment is not available for protein sequences. \n"); 410 | return 1; 411 | } 412 | 413 | ref_fp = gzopen(argv[optind], "r"); 414 | ref_seq = kseq_init(ref_fp); 415 | while (kseq_read(ref_seq) >= 0) { 416 | s_align* result, *result_rc = 0; 417 | int32_t refLen = ref_seq->seq.l; 418 | int8_t flag = 0; 419 | while (refLen > s1) { 420 | ++s1; 421 | kroundup32(s1); 422 | ref_num = (int8_t*)realloc(ref_num, s1); 423 | } 424 | for (m = 0; m < refLen; ++m) ref_num[m] = table[(int)ref_seq->seq.s[m]]; 425 | if (path == 1) flag = 2; 426 | result = ssw_align (p, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen); 427 | if (reverse == 1 && protein == 0) 428 | result_rc = ssw_align(p_rc, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen); 429 | if (result_rc && result_rc->score1 > result->score1 && result_rc->score1 >= filter) { 430 | if (sam) ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 1); 431 | else ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 0); 432 | }else if (result && result->score1 >= filter){ 433 | if (sam) ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 1); 434 | else ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 0); 435 | } else if (! 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
@package    pyssw
@brief      Python standalone program for ssw alignment using the C library
Complete-Striped-Smith-Waterman-Library
Biopython module is required for fasta/fastq parsing
@copyright  [The MIT licence](http://opensource.org/licenses/MIT)
@author     Adrien Leger - 2014
* [Github](https://github.com/a-slide)
* [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
"""

#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages
import gzip
import optparse
import sys
from time import time

#~~~~~~~MAIN FUNCTION~~~~~~~#
def main (opt):
    """
    Align every query sequence against the subject and write "result.sam".
    @param opt  Option object returned by optparser(). The attributes read here are
                subject, query, qtype, match, mismatch, gap_open, gap_extend,
                min_score, min_len, reverse and unaligned.
    SeqIO (Biopython) and Aligner (ssw_wrap) are looked up as module globals; they are
    imported by the top level instructions when the file is run as a script.
    """
    print ("Inport subject sequence")
    # Import fasta subject. Text mode is required: SeqIO parses str, not bytes
    with _open_text(opt.subject, _is_gz(opt.subject)) as subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print ("Inport query sequences and count the number of sequences")
    query_gziped = _is_gz(opt.query)
    nseq = count_seq(opt.query, opt.qtype, query_gziped)

    print("{} contains {} sequences to align".format(opt.query, nseq))
    # Calculate the steps (every 5%) at which the progress bar is refreshed
    progress_steps = set(int(nseq*i/100.0) for i in range(5,101,5))

    print ("Initialize ssw aligner with the subject sequence")
    # Init the an Aligner object with the reference value
    ssw = Aligner(
        str(subject.seq),
        match=int(opt.match),
        mismatch=int(opt.mismatch),
        gap_open=int(opt.gap_open),
        gap_extend= int(opt.gap_extend),
        report_secondary=False,
        report_cigar=True)

    min_score = float(opt.min_score)
    min_len = int(opt.min_len)
    with_qual = opt.qtype == "fastq"

    # Both handles are closed when the block exits, even if an alignment raises
    with _open_text(opt.query, query_gziped) as query_handle, open("result.sam", "w") as f:

        # Write the header of the SAM file
        f.write("@HD\tVN:1.0\tSO:unsorted\n")
        f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
        f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
        f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
            opt.match,
            opt.mismatch,
            opt.gap_open,
            opt.gap_extend))
        f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
            opt.min_score,
            opt.min_len))

        print ("Starting alignment of queries against the subject sequence")
        start = time()
        # Align each query along the subject an write result in a SAM file
        i = 0
        for query in SeqIO.parse(query_handle, opt.qtype):

            # Find the best alignment
            if opt.reverse:
                al, orient = find_best_align (ssw, query, min_score, min_len)
            else:
                al, orient = ssw.align(str(query.seq), min_score, min_len), True

            qual = SeqIO.QualityIO._get_sanger_quality_str(query) if with_qual else "*"

            # If valid match found
            if al:
                if orient:
                    seq = str(query.seq)
                else:
                    # Flag 16 means SEQ is reverse complemented and QUAL reversed, and
                    # the CIGAR was computed on the reverse complemented query
                    seq = str(query.seq.reverse_complement())
                    if with_qual:
                        qual = qual[::-1]
                f.write(sam_line(
                    qname=query.id,
                    flag=0 if orient else 16,
                    rname=subject.id,
                    pos=al.ref_begin+1,
                    cigar=al.cigar_string,
                    seq=seq,
                    qual=qual,
                    tags=["AS:i:{}".format(al.score)]))

            # If no valid match found and -u flag activated (report unaligned)
            elif opt.unaligned:
                f.write(sam_line(
                    qname=query.id,
                    flag=4,
                    seq=str(query.seq),
                    qual=qual))
            # Else = match unreported

            # Progress bar
            i+=1
            if i in progress_steps:
                frac = i/float(nseq)
                t = time()-start
                print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))

        print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))

#~~~~~~~HELPER FUNCTIONS~~~~~~~#

def _is_gz (filename):
    """
    Tell whether a path has a "gz" extension (case insensitive)
    """
    return filename.rpartition(".")[2].lower() == "gz"

def _open_text (filename, gziped=False):
    """
    Open a plain or gzip compressed file for reading in text mode, so that lines are str
    (gzip.open in mode "r" would yield bytes under Python 3)
    """
    if gziped:
        return gzip.open(filename, "rt")
    return open(filename, "r")

def sam_line (qname='*', flag=4, rname='*', pos=0, mapq=0, cigar='*', rnext='*', pnext=0, tlen=0, seq='*', qual='*', tags=None):
    """
    Return a minimal sam line = by default return an undetermined sam line. Check the document
    [SAM Format Specification](http://samtools.sourceforge.net/SAM1.pdf) for a full description.
    @param qname Query template NAME
    @param flag bitwise FLAG
    @param rname Reference sequence NAME of the alignment
    @param pos 1-based leftmost mapping POSition of the first matching base
    @param mapq MAPping Quality
    @param cigar CIGAR string
    @param rnext Reference sequence name of the primary alignment of the mate
    @param pnext 1-based leftmost position of the primary alignment of the mate
    @param tlen signed observed Template LENgth
    @param seq segment SEQuence
    @param qual ASCII of base QUALity plus 33
    @param tags list of optional tags
    @return A Sam alignment line, always terminated by a newline. Every field, optional
    tags included, is separated by a tab as required by the SAM format
    """
    fields = [qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual]
    if tags:
        fields.extend(tags)
    return "\t".join(str(field) for field in fields) + "\n"

def find_best_align (ssw, query, min_score, min_len):
    """
    Align a query in both orientations and keep the best scoring alignment
    @param ssw Aligner like object with an align(seq, min_score, min_len) method
    @param query Record whose seq attribute has a reverse_complement() method
    @param min_score Minimal score passed to the aligner
    @param min_len Minimal length passed to the aligner
    @return (alignment, True) for a forward hit, (alignment, False) for a reverse
    complement hit, (None, None) when neither orientation gives a result. The forward
    hit wins ties
    """
    # Align reverse and forward query
    forward_al = ssw.align(str(query.seq), min_score, min_len)
    reverse_al = ssw.align(str(query.seq.reverse_complement()), min_score, min_len)

    # ssw_wrap returns nothing for an orientation rejected by the score filtering
    if not forward_al and not reverse_al:
        return (None, None)
    if not forward_al:
        return (reverse_al, False)
    if not reverse_al or forward_al.score >= reverse_al.score:
        return (forward_al, True)
    return (reverse_al, False)

def count_seq (filename, seq_type="fasta", gziped=False):
    """
    Count the number of sequences in a fasta or a fastq file
    @param filename Path to a valid readeable file
    @param seq_type Should be either fasta or fastq. Default fasta
    @param gziped Boolean indicating if the file is gziped or not. Default False
    @return Number of sequences as an integer
    """
    # Verify if the file is fasta or fastq type
    assert seq_type in ["fasta", "fastq"], "The file has to be either fastq or fasta format"

    with _open_text(filename, gziped) as f:
        # FASTA Count the lines starting with the ">" header character
        if seq_type == "fasta":
            return sum(1 for line in f if line.startswith(">"))

        # FASTQ No motif to find, but 4 lines correspond to 1 sequence
        return sum(1 for _ in f) // 4

def optparser():
    """
    Parse the command line
    @return Option object produced by optparse. Exit with status 1 after printing the
    help when the subject or the query file is missing
    """
    print("Parse command line options")
    # Usage and version strings
    program_name = "pyssw"
    program_version = 0.1
    version_string = "{}\t{}".format(program_name, program_version)
    usage_string = "{}.py -s subject.fasta -q fastq (or fasta) [Facultative options]".format(program_name)
    parser = optparse.OptionParser(usage = usage_string, version = version_string)

    # Define optparser options
    hstr = "Path of the fasta file containing the subject genome sequence. Can be gziped. [REQUIRED] "
    parser.add_option( '-s', '--subject', dest="subject", help=hstr)
    hstr = "Path of the fastq or fasta file containing the short read to be aligned. Can be gziped. [REQUIRED]"
    parser.add_option( '-q', '--query', dest="query", help=hstr)
    hstr = "Type of the query file = fastq or fasta. [default: fastq]"
    parser.add_option( '-t', '--qtype', dest="qtype", default="fastq", help=hstr)
    hstr = "Positive integer for weight match in genome sequence alignment. [default: 2]"
    parser.add_option( '-m', '--match', dest="match",default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight mismatch in genome sequence alignment. [default: 2]"
    parser.add_option( '-x', '--mismatch', dest="mismatch", default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap opening. [default: 3]"
    parser.add_option( '-o', '--gap_open', dest="gap_open", default=3, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap extension. [default: 1]"
    parser.add_option( '-e', '--gap_extend', dest="gap_extend", default=1, help=hstr)
    hstr = "Integer. Consider alignments having a score <= as not aligned. [default: 0]"
    parser.add_option( '-f', '--min_score', dest="min_score", default=0, help=hstr)
    hstr = "Integer. Consider alignments having a length <= as not aligned. [default: 0]"
    parser.add_option( '-l', '--min_len', dest="min_len", default=0, help=hstr)
    hstr = "Flag. Align query in forward and reverse orientation and choose the best alignment. [Set by default]"
    parser.add_option( '-r', '--reverse', dest="reverse", action="store_true", default=True, help=hstr)
    # -r is set by default, so a store_false twin is the only way to switch it off
    hstr = "Flag. Align query in forward orientation only. [Unset by default]"
    parser.add_option( '--no_reverse', dest="reverse", action="store_false", help=hstr)
    hstr = "Flag. Write unaligned reads in sam output [Unset by default]"
    parser.add_option( '-u', '--unaligned', dest="unaligned", action="store_true", default=False, help=hstr)

    # Parse arg and return a dictionnary_like object of options
    opt, args = parser.parse_args()

    if not opt.subject:
        print ("\nERROR: a subject fasta file has to be provided (-s option)\n")
        parser.print_help()
        sys.exit(1)

    if not opt.query:
        print ("\nERROR: a query fasta or fastq file has to be provided (-q option)\n")
        parser.print_help()
        sys.exit(1)

    return opt

#~~~~~~~TOP LEVEL INSTRUCTIONS~~~~~~~#

if __name__ == '__main__':

    # try to import Third party and local packages
    try:
        from Bio import SeqIO
    except ImportError:
        print ("ERROR: Please install Biopython package")
        sys.exit(1)

    try:
        from ssw_wrap import Aligner
    except ImportError:
        print ("ERROR: Please place ssw_wrap in the current directory or add its dir to python path")
        sys.exit(1)

    # Parse command line arguments
    opt = optparser()
    # Run the main function
    main(opt)
College. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Mengyao Zhao */ 27 | 28 | /* 29 | * ssw.c 30 | * 31 | * Created by Mengyao Zhao on 6/22/10. 32 | * Copyright 2010 Boston College. All rights reserved. 33 | * Version 0.1.4 34 | * Last revision by Mengyao Zhao on 06/27/14. 35 | * 36 | */ 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include "ssw.h" 45 | 46 | #ifdef __GNUC__ 47 | #define LIKELY(x) __builtin_expect((x),1) 48 | #define UNLIKELY(x) __builtin_expect((x),0) 49 | #else 50 | #define LIKELY(x) (x) 51 | #define UNLIKELY(x) (x) 52 | #endif 53 | 54 | /* Convert the coordinate in the scoring matrix into the coordinate in one line of the band. */ 55 | #define set_u(u, w, i, j) { int x=(i)-(w); x=x>0?x:0; (u)=(j)-x+1; } 56 | 57 | /* Convert the coordinate in the direction matrix into the coordinate in one line of the band. 
/* Build the striped 8-bit query profile.
   For every symbol nt of the alphabet (0 .. n-1) the profile holds segLen
   128-bit vectors, where segLen = ceil(readLen / 16). Byte `lane` of vector
   `seg` is the biased weight of aligning nt against read position
   seg + lane * segLen, so the read is split into 16 segments that are
   processed in parallel. Positions past the end of the read are filled with
   the bias alone.

   read_num  read converted to numbers; each value indexes a column of mat
   mat       n * n weight matrix, stored row by row
   readLen   number of entries in read_num
   n         edge length of the square matrix mat
   bias      value added to every weight

   Returns a buffer obtained from malloc (n * segLen vectors); the allocation
   result is not checked here. */
static __m128i* qP_byte (const int8_t* read_num,
						 const int8_t* mat,
						 const int32_t readLen,
						 const int32_t n,
						 uint8_t bias) {

	const int32_t segLen = (readLen + 15) / 16;
	__m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
	int8_t* out = (int8_t*)vProfile;	/* byte cursor over the profile */
	int32_t symbol, seg, lane;

	for (symbol = 0; symbol < n; ++symbol) {
		const int8_t* row = mat + symbol * n;	/* weights of `symbol` against each read symbol */
		for (seg = 0; seg < segLen; ++seg) {
			int32_t pos = seg;	/* read position handled by the current lane */
			for (lane = 0; lane < 16; ++lane) {
				if (pos >= readLen) *out++ = bias;	/* padding past the read end */
				else *out++ = row[read_num[pos]] + bias;
				pos += segLen;
			}
		}
	}
	return vProfile;
}
118 | Return the alignment score and ending position of the best alignment, 2nd best alignment, etc. 119 | Gap begin and gap extension are different. 120 | wight_match > 0, all other weights < 0. 121 | The returned positions are 0-based. 122 | */ 123 | static alignment_end* sw_sse2_byte (const int8_t* ref, 124 | int8_t ref_dir, // 0: forward ref; 1: reverse ref 125 | int32_t refLen, 126 | int32_t readLen, 127 | const uint8_t weight_gapO, /* will be used as - */ 128 | const uint8_t weight_gapE, /* will be used as - */ 129 | const __m128i* vProfile, 130 | uint8_t terminate, /* the best alignment score: used to terminate 131 | the matrix calculation when locating the 132 | alignment beginning point. If this score 133 | is set to 0, it will not be used */ 134 | uint8_t bias, /* Shift 0 point to a positive value. */ 135 | int32_t maskLen) { 136 | 137 | #define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \ 138 | (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \ 139 | (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \ 140 | (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \ 141 | (m) = _mm_extract_epi16((vm), 0) 142 | 143 | uint8_t max = 0; /* the max alignment score */ 144 | int32_t end_read = readLen - 1; 145 | int32_t end_ref = -1; /* 0_based best alignment ending point; Initialized as isn't aligned -1. */ 146 | int32_t segLen = (readLen + 15) / 16; /* number of segment */ 147 | 148 | /* array to record the largest score of each reference position */ 149 | uint8_t* maxColumn = (uint8_t*) calloc(refLen, 1); 150 | 151 | /* array to record the alignment read ending position of the largest score of each reference position */ 152 | int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t)); 153 | 154 | /* Define 16 byte 0 vector. 
*/ 155 | __m128i vZero = _mm_set1_epi32(0); 156 | 157 | __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i)); 158 | __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i)); 159 | __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i)); 160 | __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i)); 161 | 162 | int32_t i, j; 163 | /* 16 byte insertion begin vector */ 164 | __m128i vGapO = _mm_set1_epi8(weight_gapO); 165 | 166 | /* 16 byte insertion extension vector */ 167 | __m128i vGapE = _mm_set1_epi8(weight_gapE); 168 | 169 | /* 16 byte bias vector */ 170 | __m128i vBias = _mm_set1_epi8(bias); 171 | 172 | __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */ 173 | __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */ 174 | __m128i vTemp; 175 | int32_t edge, begin = 0, end = refLen, step = 1; 176 | // int32_t distance = readLen * 2 / 3; 177 | // int32_t distance = readLen / 2; 178 | // int32_t distance = readLen; 179 | 180 | /* outer loop to process the reference sequence */ 181 | if (ref_dir == 1) { 182 | begin = refLen - 1; 183 | end = -1; 184 | step = -1; 185 | } 186 | for (i = begin; LIKELY(i != end); i += step) { 187 | int32_t cmp; 188 | __m128i e, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0. 189 | Any errors to vH values will be corrected in the Lazy_F loop. 190 | */ 191 | // max16(maxColumn[i], vMaxColumn); 192 | // fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]); 193 | 194 | __m128i vH = pvHStore[segLen - 1]; 195 | vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */ 196 | const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */ 197 | 198 | /* Swap the 2 H buffers. 
*/ 199 | __m128i* pv = pvHLoad; 200 | pvHLoad = pvHStore; 201 | pvHStore = pv; 202 | 203 | /* inner loop to process the query sequence */ 204 | for (j = 0; LIKELY(j < segLen); ++j) { 205 | vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j)); 206 | vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */ 207 | // max16(maxColumn[i], vH); 208 | // fprintf(stderr, "H[%d]: %d\n", i, maxColumn[i]); 209 | // int8_t* t; 210 | // int32_t ti; 211 | //for (t = (int8_t*)&vH, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++); 212 | 213 | /* Get max from vH, vE and vF. */ 214 | e = _mm_load_si128(pvE + j); 215 | vH = _mm_max_epu8(vH, e); 216 | vH = _mm_max_epu8(vH, vF); 217 | vMaxColumn = _mm_max_epu8(vMaxColumn, vH); 218 | 219 | // max16(maxColumn[i], vMaxColumn); 220 | // fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]); 221 | // for (t = (int8_t*)&vMaxColumn, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++); 222 | 223 | /* Save vH values. */ 224 | _mm_store_si128(pvHStore + j, vH); 225 | 226 | /* Update vE value. */ 227 | vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */ 228 | e = _mm_subs_epu8(e, vGapE); 229 | e = _mm_max_epu8(e, vH); 230 | _mm_store_si128(pvE + j, e); 231 | 232 | /* Update vF value. */ 233 | vF = _mm_subs_epu8(vF, vGapE); 234 | vF = _mm_max_epu8(vF, vH); 235 | 236 | /* Load the next vH. */ 237 | vH = _mm_load_si128(pvHLoad + j); 238 | } 239 | 240 | /* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */ 241 | /* reset pointers to the start of the saved data */ 242 | j = 0; 243 | vH = _mm_load_si128 (pvHStore + j); 244 | 245 | /* the computed vF value is for the given column. since */ 246 | /* we are at the end, we need to shift the vF value over */ 247 | /* to the next column. 
*/ 248 | vF = _mm_slli_si128 (vF, 1); 249 | vTemp = _mm_subs_epu8 (vH, vGapO); 250 | vTemp = _mm_subs_epu8 (vF, vTemp); 251 | vTemp = _mm_cmpeq_epi8 (vTemp, vZero); 252 | cmp = _mm_movemask_epi8 (vTemp); 253 | 254 | while (cmp != 0xffff) 255 | { 256 | vH = _mm_max_epu8 (vH, vF); 257 | vMaxColumn = _mm_max_epu8(vMaxColumn, vH); 258 | _mm_store_si128 (pvHStore + j, vH); 259 | vF = _mm_subs_epu8 (vF, vGapE); 260 | j++; 261 | if (j >= segLen) 262 | { 263 | j = 0; 264 | vF = _mm_slli_si128 (vF, 1); 265 | } 266 | vH = _mm_load_si128 (pvHStore + j); 267 | 268 | vTemp = _mm_subs_epu8 (vH, vGapO); 269 | vTemp = _mm_subs_epu8 (vF, vTemp); 270 | vTemp = _mm_cmpeq_epi8 (vTemp, vZero); 271 | cmp = _mm_movemask_epi8 (vTemp); 272 | } 273 | 274 | vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn); 275 | vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore); 276 | cmp = _mm_movemask_epi8(vTemp); 277 | if (cmp != 0xffff) { 278 | uint8_t temp; 279 | vMaxMark = vMaxScore; 280 | max16(temp, vMaxScore); 281 | vMaxScore = vMaxMark; 282 | 283 | if (LIKELY(temp > max)) { 284 | max = temp; 285 | if (max + bias >= 255) break; //overflow 286 | end_ref = i; 287 | 288 | /* Store the column with the highest alignment score in order to trace the alignment ending position on read. */ 289 | for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j]; 290 | } 291 | } 292 | 293 | /* Record the max score of current column. */ 294 | max16(maxColumn[i], vMaxColumn); 295 | // fprintf(stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]); 296 | if (maxColumn[i] == terminate) break; 297 | } 298 | 299 | /* Trace the alignment ending position on read. 
*/ 300 | uint8_t *t = (uint8_t*)pvHmax; 301 | int32_t column_len = segLen * 16; 302 | for (i = 0; LIKELY(i < column_len); ++i, ++t) { 303 | int32_t temp; 304 | if (*t == max) { 305 | temp = i / 16 + i % 16 * segLen; 306 | if (temp < end_read) end_read = temp; 307 | } 308 | } 309 | 310 | free(pvHmax); 311 | free(pvE); 312 | free(pvHLoad); 313 | free(pvHStore); 314 | 315 | /* Find the most possible 2nd best alignment. */ 316 | alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end)); 317 | bests[0].score = max + bias >= 255 ? 255 : max; 318 | bests[0].ref = end_ref; 319 | bests[0].read = end_read; 320 | 321 | bests[1].score = 0; 322 | bests[1].ref = 0; 323 | bests[1].read = 0; 324 | 325 | edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0; 326 | for (i = 0; i < edge; i ++) { 327 | // fprintf (stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]); 328 | if (maxColumn[i] > bests[1].score) { 329 | bests[1].score = maxColumn[i]; 330 | bests[1].ref = i; 331 | } 332 | } 333 | edge = (end_ref + maskLen) > refLen ? 
refLen : (end_ref + maskLen); 334 | for (i = edge + 1; i < refLen; i ++) { 335 | // fprintf (stderr, "refLen: %d\tmaxColumn[%d]: %d\n", refLen, i, maxColumn[i]); 336 | if (maxColumn[i] > bests[1].score) { 337 | bests[1].score = maxColumn[i]; 338 | bests[1].ref = i; 339 | } 340 | } 341 | 342 | free(maxColumn); 343 | free(end_read_column); 344 | return bests; 345 | } 346 | 347 | static __m128i* qP_word (const int8_t* read_num, 348 | const int8_t* mat, 349 | const int32_t readLen, 350 | const int32_t n) { 351 | 352 | int32_t segLen = (readLen + 7) / 8; 353 | __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i)); 354 | int16_t* t = (int16_t*)vProfile; 355 | int32_t nt, i, j; 356 | int32_t segNum; 357 | 358 | /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch */ 359 | for (nt = 0; LIKELY(nt < n); nt ++) { 360 | for (i = 0; i < segLen; i ++) { 361 | j = i; 362 | for (segNum = 0; LIKELY(segNum < 8) ; segNum ++) { 363 | *t++ = j>= readLen ? 0 : mat[nt * n + read_num[j]]; 364 | j += segLen; 365 | } 366 | } 367 | } 368 | return vProfile; 369 | } 370 | 371 | static alignment_end* sw_sse2_word (const int8_t* ref, 372 | int8_t ref_dir, // 0: forward ref; 1: reverse ref 373 | int32_t refLen, 374 | int32_t readLen, 375 | const uint8_t weight_gapO, /* will be used as - */ 376 | const uint8_t weight_gapE, /* will be used as - */ 377 | const __m128i* vProfile, 378 | uint16_t terminate, 379 | int32_t maskLen) { 380 | 381 | #define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \ 382 | (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \ 383 | (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \ 384 | (m) = _mm_extract_epi16((vm), 0) 385 | 386 | uint16_t max = 0; /* the max alignment score */ 387 | int32_t end_read = readLen - 1; 388 | int32_t end_ref = 0; /* 1_based best alignment ending point; Initialized as isn't aligned - 0. 
*/ 389 | int32_t segLen = (readLen + 7) / 8; /* number of segment */ 390 | 391 | /* array to record the largest score of each reference position */ 392 | uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2); 393 | 394 | /* array to record the alignment read ending position of the largest score of each reference position */ 395 | int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t)); 396 | 397 | /* Define 16 byte 0 vector. */ 398 | __m128i vZero = _mm_set1_epi32(0); 399 | 400 | __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i)); 401 | __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i)); 402 | __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i)); 403 | __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i)); 404 | 405 | int32_t i, j, k; 406 | /* 16 byte insertion begin vector */ 407 | __m128i vGapO = _mm_set1_epi16(weight_gapO); 408 | 409 | /* 16 byte insertion extension vector */ 410 | __m128i vGapE = _mm_set1_epi16(weight_gapE); 411 | 412 | __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */ 413 | __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */ 414 | __m128i vTemp; 415 | int32_t edge, begin = 0, end = refLen, step = 1; 416 | 417 | /* outer loop to process the reference sequence */ 418 | if (ref_dir == 1) { 419 | begin = refLen - 1; 420 | end = -1; 421 | step = -1; 422 | } 423 | for (i = begin; LIKELY(i != end); i += step) { 424 | int32_t cmp; 425 | __m128i e, vF = vZero; /* Initialize F value to 0. 426 | Any errors to vH values will be corrected in the Lazy_F loop. 427 | */ 428 | __m128i vH = pvHStore[segLen - 1]; 429 | vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */ 430 | 431 | /* Swap the 2 H buffers. */ 432 | __m128i* pv = pvHLoad; 433 | 434 | __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. 
*/ 435 | 436 | const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */ 437 | pvHLoad = pvHStore; 438 | pvHStore = pv; 439 | 440 | /* inner loop to process the query sequence */ 441 | for (j = 0; LIKELY(j < segLen); j ++) { 442 | vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j)); 443 | 444 | /* Get max from vH, vE and vF. */ 445 | e = _mm_load_si128(pvE + j); 446 | vH = _mm_max_epi16(vH, e); 447 | vH = _mm_max_epi16(vH, vF); 448 | vMaxColumn = _mm_max_epi16(vMaxColumn, vH); 449 | 450 | /* Save vH values. */ 451 | _mm_store_si128(pvHStore + j, vH); 452 | 453 | /* Update vE value. */ 454 | vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */ 455 | e = _mm_subs_epu16(e, vGapE); 456 | e = _mm_max_epi16(e, vH); 457 | _mm_store_si128(pvE + j, e); 458 | 459 | /* Update vF value. */ 460 | vF = _mm_subs_epu16(vF, vGapE); 461 | vF = _mm_max_epi16(vF, vH); 462 | 463 | /* Load the next vH. */ 464 | vH = _mm_load_si128(pvHLoad + j); 465 | } 466 | 467 | /* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */ 468 | for (k = 0; LIKELY(k < 8); ++k) { 469 | vF = _mm_slli_si128 (vF, 2); 470 | for (j = 0; LIKELY(j < segLen); ++j) { 471 | vH = _mm_load_si128(pvHStore + j); 472 | vH = _mm_max_epi16(vH, vF); 473 | _mm_store_si128(pvHStore + j, vH); 474 | vH = _mm_subs_epu16(vH, vGapO); 475 | vF = _mm_subs_epu16(vF, vGapE); 476 | if (UNLIKELY(! 
_mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end; 477 | } 478 | } 479 | 480 | end: 481 | vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn); 482 | vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore); 483 | cmp = _mm_movemask_epi8(vTemp); 484 | if (cmp != 0xffff) { 485 | uint16_t temp; 486 | vMaxMark = vMaxScore; 487 | max8(temp, vMaxScore); 488 | vMaxScore = vMaxMark; 489 | 490 | if (LIKELY(temp > max)) { 491 | max = temp; 492 | end_ref = i; 493 | for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j]; 494 | } 495 | } 496 | 497 | /* Record the max score of current column. */ 498 | max8(maxColumn[i], vMaxColumn); 499 | if (maxColumn[i] == terminate) break; 500 | } 501 | 502 | /* Trace the alignment ending position on read. */ 503 | uint16_t *t = (uint16_t*)pvHmax; 504 | int32_t column_len = segLen * 8; 505 | for (i = 0; LIKELY(i < column_len); ++i, ++t) { 506 | int32_t temp; 507 | if (*t == max) { 508 | temp = i / 8 + i % 8 * segLen; 509 | if (temp < end_read) end_read = temp; 510 | } 511 | } 512 | 513 | free(pvHmax); 514 | free(pvE); 515 | free(pvHLoad); 516 | free(pvHStore); 517 | 518 | /* Find the most possible 2nd best alignment. */ 519 | alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end)); 520 | bests[0].score = max; 521 | bests[0].ref = end_ref; 522 | bests[0].read = end_read; 523 | 524 | bests[1].score = 0; 525 | bests[1].ref = 0; 526 | bests[1].read = 0; 527 | 528 | edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0; 529 | for (i = 0; i < edge; i ++) { 530 | if (maxColumn[i] > bests[1].score) { 531 | bests[1].score = maxColumn[i]; 532 | bests[1].ref = i; 533 | } 534 | } 535 | edge = (end_ref + maskLen) > refLen ? 
refLen : (end_ref + maskLen); 536 | for (i = edge; i < refLen; i ++) { 537 | if (maxColumn[i] > bests[1].score) { 538 | bests[1].score = maxColumn[i]; 539 | bests[1].ref = i; 540 | } 541 | } 542 | 543 | free(maxColumn); 544 | free(end_read_column); 545 | return bests; 546 | } 547 | 548 | static cigar* banded_sw (const int8_t* ref, 549 | const int8_t* read, 550 | int32_t refLen, 551 | int32_t readLen, 552 | int32_t score, 553 | const uint32_t weight_gapO, /* will be used as - */ 554 | const uint32_t weight_gapE, /* will be used as - */ 555 | int32_t band_width, 556 | const int8_t* mat, /* pointer to the weight matrix */ 557 | int32_t n) { 558 | 559 | uint32_t *c = (uint32_t*)malloc(16 * sizeof(uint32_t)), *c1; 560 | int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, l, max = 0; 561 | int64_t s2 = 1024; 562 | char op, prev_op; 563 | int32_t width, width_d, *h_b, *e_b, *h_c; 564 | int8_t *direction, *direction_line; 565 | cigar* result = (cigar*)malloc(sizeof(cigar)); 566 | h_b = (int32_t*)malloc(s1 * sizeof(int32_t)); 567 | e_b = (int32_t*)malloc(s1 * sizeof(int32_t)); 568 | h_c = (int32_t*)malloc(s1 * sizeof(int32_t)); 569 | direction = (int8_t*)malloc(s2 * sizeof(int8_t)); 570 | 571 | do { 572 | width = band_width * 2 + 3, width_d = band_width * 2 + 1; 573 | while (width >= s1) { 574 | ++s1; 575 | kroundup32(s1); 576 | h_b = (int32_t*)realloc(h_b, s1 * sizeof(int32_t)); 577 | e_b = (int32_t*)realloc(e_b, s1 * sizeof(int32_t)); 578 | h_c = (int32_t*)realloc(h_c, s1 * sizeof(int32_t)); 579 | } 580 | while (width_d * readLen * 3 >= s2) { 581 | ++s2; 582 | kroundup32(s2); 583 | if (s2 < 0) { 584 | fprintf(stderr, "Alignment score and position are not consensus.\n"); 585 | exit(1); 586 | } 587 | direction = (int8_t*)realloc(direction, s2 * sizeof(int8_t)); 588 | } 589 | direction_line = direction; 590 | for (j = 1; LIKELY(j < width - 1); j ++) h_b[j] = 0; 591 | for (i = 0; LIKELY(i < readLen); i ++) { 592 | int32_t beg = 0, end = refLen - 1, u = 0, edge; 593 | j = i - 
band_width; beg = beg > j ? beg : j; // band start 594 | j = i + band_width; end = end < j ? end : j; // band end 595 | edge = end + 1 < width - 1 ? end + 1 : width - 1; 596 | f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0; 597 | direction_line = direction + width_d * i * 3; 598 | 599 | for (j = beg; LIKELY(j <= end); j ++) { 600 | int32_t b, e1, f1, d, de, df, dh; 601 | set_u(u, band_width, i, j); set_u(e, band_width, i - 1, j); 602 | set_u(b, band_width, i, j - 1); set_u(d, band_width, i - 1, j - 1); 603 | set_d(de, band_width, i, j, 0); 604 | set_d(df, band_width, i, j, 1); 605 | set_d(dh, band_width, i, j, 2); 606 | 607 | temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO; 608 | temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE; 609 | e_b[u] = temp1 > temp2 ? temp1 : temp2; 610 | direction_line[de] = temp1 > temp2 ? 3 : 2; 611 | 612 | temp1 = h_c[b] - weight_gapO; 613 | temp2 = f - weight_gapE; 614 | f = temp1 > temp2 ? temp1 : temp2; 615 | direction_line[df] = temp1 > temp2 ? 5 : 4; 616 | 617 | e1 = e_b[u] > 0 ? e_b[u] : 0; 618 | f1 = f > 0 ? f : 0; 619 | temp1 = e1 > f1 ? e1 : f1; 620 | temp2 = h_b[d] + mat[ref[j] * n + read[i]]; 621 | h_c[u] = temp1 > temp2 ? temp1 : temp2; 622 | 623 | if (h_c[u] > max) max = h_c[u]; 624 | 625 | if (temp1 <= temp2) direction_line[dh] = 1; 626 | else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df]; 627 | } 628 | for (j = 1; j <= u; j ++) h_b[j] = h_c[j]; 629 | } 630 | band_width *= 2; 631 | } while (LIKELY(max < score)); 632 | band_width /= 2; 633 | 634 | // trace back 635 | i = readLen - 1; 636 | j = refLen - 1; 637 | e = 0; // Count the number of M, D or I. 
638 | l = 0; // record length of current cigar 639 | op = prev_op = 'M'; 640 | temp2 = 2; // h 641 | while (LIKELY(i > 0)) { 642 | set_d(temp1, band_width, i, j, temp2); 643 | switch (direction_line[temp1]) { 644 | case 1: 645 | --i; 646 | --j; 647 | temp2 = 2; 648 | direction_line -= width_d * 3; 649 | op = 'M'; 650 | break; 651 | case 2: 652 | --i; 653 | temp2 = 0; // e 654 | direction_line -= width_d * 3; 655 | op = 'I'; 656 | break; 657 | case 3: 658 | --i; 659 | temp2 = 2; 660 | direction_line -= width_d * 3; 661 | op = 'I'; 662 | break; 663 | case 4: 664 | --j; 665 | temp2 = 1; 666 | op = 'D'; 667 | break; 668 | case 5: 669 | --j; 670 | temp2 = 2; 671 | op = 'D'; 672 | break; 673 | default: 674 | fprintf(stderr, "Trace back error: %d.\n", direction_line[temp1 - 1]); 675 | free(direction); 676 | free(h_c); 677 | free(e_b); 678 | free(h_b); 679 | free(c); 680 | free(result); 681 | return 0; 682 | } 683 | if (op == prev_op) ++e; 684 | else { 685 | ++l; 686 | while (l >= s) { 687 | ++s; 688 | kroundup32(s); 689 | c = (uint32_t*)realloc(c, s * sizeof(uint32_t)); 690 | } 691 | c[l - 1] = to_cigar_int(e, prev_op); 692 | prev_op = op; 693 | e = 1; 694 | } 695 | } 696 | if (op == 'M') { 697 | ++l; 698 | while (l >= s) { 699 | ++s; 700 | kroundup32(s); 701 | c = (uint32_t*)realloc(c, s * sizeof(uint32_t)); 702 | } 703 | c[l - 1] = to_cigar_int(e + 1, op); 704 | }else { 705 | l += 2; 706 | while (l >= s) { 707 | ++s; 708 | kroundup32(s); 709 | c = (uint32_t*)realloc(c, s * sizeof(uint32_t)); 710 | } 711 | c[l - 2] = to_cigar_int(e, op); 712 | c[l - 1] = to_cigar_int(1, 'M'); 713 | } 714 | 715 | // reverse cigar 716 | c1 = (uint32_t*)malloc(l * sizeof(uint32_t)); 717 | s = 0; 718 | e = l - 1; 719 | while (LIKELY(s <= e)) { 720 | c1[s] = c[e]; 721 | c1[e] = c[s]; 722 | ++ s; 723 | -- e; 724 | } 725 | result->seq = c1; 726 | result->length = l; 727 | 728 | free(direction); 729 | free(h_c); 730 | free(e_b); 731 | free(h_b); 732 | free(c); 733 | return result; 734 | } 735 | 
736 | static int8_t* seq_reverse(const int8_t* seq, int32_t end) /* end is 0-based alignment ending position */ 737 | { 738 | int8_t* reverse = (int8_t*)calloc(end + 1, sizeof(int8_t)); 739 | int32_t start = 0; 740 | while (LIKELY(start <= end)) { 741 | reverse[start] = seq[end]; 742 | reverse[end] = seq[start]; 743 | ++ start; 744 | -- end; 745 | } 746 | return reverse; 747 | } 748 | 749 | s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size) { 750 | s_profile* p = (s_profile*)calloc(1, sizeof(struct _profile)); 751 | p->profile_byte = 0; 752 | p->profile_word = 0; 753 | p->bias = 0; 754 | 755 | if (score_size == 0 || score_size == 2) { 756 | /* Find the bias to use in the substitution matrix */ 757 | int32_t bias = 0, i; 758 | for (i = 0; i < n*n; i++) if (mat[i] < bias) bias = mat[i]; 759 | bias = abs(bias); 760 | 761 | p->bias = bias; 762 | p->profile_byte = qP_byte (read, mat, readLen, n, bias); 763 | } 764 | if (score_size == 1 || score_size == 2) p->profile_word = qP_word (read, mat, readLen, n); 765 | p->read = read; 766 | p->mat = mat; 767 | p->readLen = readLen; 768 | p->n = n; 769 | return p; 770 | } 771 | 772 | void init_destroy (s_profile* p) { 773 | free(p->profile_byte); 774 | free(p->profile_word); 775 | free(p); 776 | } 777 | 778 | s_align* ssw_align (const s_profile* prof, 779 | const int8_t* ref, 780 | int32_t refLen, 781 | const uint8_t weight_gapO, 782 | const uint8_t weight_gapE, 783 | const uint8_t flag, // (from high to low) bit 5: return the best alignment beginning position; 6: if (ref_end1 - ref_begin1 <= filterd) && (read_end1 - read_begin1 <= filterd), return cigar; 7: if max score >= filters, return cigar; 8: always return cigar; if 6 & 7 are both setted, only return cigar when both filter fulfilled 784 | const uint16_t filters, 785 | const int32_t filterd, 786 | const int32_t maskLen) { 787 | 788 | alignment_end* bests = 0, *bests_reverse = 0; 789 | __m128i* vP = 
0; 790 | int32_t word = 0, band_width = 0, readLen = prof->readLen; 791 | int8_t* read_reverse = 0; 792 | cigar* path; 793 | s_align* r = (s_align*)calloc(1, sizeof(s_align)); 794 | r->ref_begin1 = -1; 795 | r->read_begin1 = -1; 796 | r->cigar = 0; 797 | r->cigarLen = 0; 798 | if (maskLen < 15) { 799 | fprintf(stderr, "When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n"); 800 | } 801 | 802 | // Find the alignment scores and ending positions 803 | if (prof->profile_byte) { 804 | bests = sw_sse2_byte(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_byte, -1, prof->bias, maskLen); 805 | if (prof->profile_word && bests[0].score == 255) { 806 | free(bests); 807 | bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen); 808 | word = 1; 809 | } else if (bests[0].score == 255) { 810 | fprintf(stderr, "Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n"); 811 | free(r); 812 | return NULL; 813 | } 814 | }else if (prof->profile_word) { 815 | bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen); 816 | word = 1; 817 | }else { 818 | fprintf(stderr, "Please call the function ssw_init before ssw_align.\n"); 819 | free(r); 820 | return NULL; 821 | } 822 | r->score1 = bests[0].score; 823 | r->ref_end1 = bests[0].ref; 824 | r->read_end1 = bests[0].read; 825 | if (maskLen >= 15) { 826 | r->score2 = bests[1].score; 827 | r->ref_end2 = bests[1].ref; 828 | } else { 829 | r->score2 = 0; 830 | r->ref_end2 = -1; 831 | } 832 | free(bests); 833 | if (flag == 0 || (flag == 2 && r->score1 < filters)) goto end; 834 | 835 | // Find the beginning position of the best alignment. 
836 | read_reverse = seq_reverse(prof->read, r->read_end1); 837 | if (word == 0) { 838 | vP = qP_byte(read_reverse, prof->mat, r->read_end1 + 1, prof->n, prof->bias); 839 | bests_reverse = sw_sse2_byte(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, prof->bias, maskLen); 840 | } else { 841 | vP = qP_word(read_reverse, prof->mat, r->read_end1 + 1, prof->n); 842 | bests_reverse = sw_sse2_word(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, maskLen); 843 | } 844 | free(vP); 845 | free(read_reverse); 846 | r->ref_begin1 = bests_reverse[0].ref; 847 | r->read_begin1 = r->read_end1 - bests_reverse[0].read; 848 | free(bests_reverse); 849 | if ((7&flag) == 0 || ((2&flag) != 0 && r->score1 < filters) || ((4&flag) != 0 && (r->ref_end1 - r->ref_begin1 > filterd || r->read_end1 - r->read_begin1 > filterd))) goto end; 850 | 851 | // Generate cigar. 852 | refLen = r->ref_end1 - r->ref_begin1 + 1; 853 | readLen = r->read_end1 - r->read_begin1 + 1; 854 | band_width = abs(refLen - readLen) + 1; 855 | path = banded_sw(ref + r->ref_begin1, prof->read + r->read_begin1, refLen, readLen, r->score1, weight_gapO, weight_gapE, band_width, prof->mat, prof->n); 856 | if (path == 0) { 857 | free(r); 858 | r = NULL; 859 | } 860 | else { 861 | r->cigar = path->seq; 862 | r->cigarLen = path->length; 863 | free(path); 864 | } 865 | 866 | end: 867 | return r; 868 | } 869 | 870 | void align_destroy (s_align* a) { 871 | free(a->cigar); 872 | free(a); 873 | } 874 | 875 | char cigar_int_to_op (uint32_t cigar_int) 876 | { 877 | uint8_t letter_code = cigar_int & 0xfU; 878 | static const char map[] = { 879 | 'M', 880 | 'I', 881 | 'D', 882 | 'N', 883 | 'S', 884 | 'H', 885 | 'P', 886 | '=', 887 | 'X', 888 | }; 889 | 890 | if (letter_code >= (sizeof(map)/sizeof(map[0]))) { 891 | return 'M'; 892 | } 893 | 894 | return map[letter_code]; 895 | } 896 | 897 | uint32_t cigar_int_to_len (uint32_t cigar_int) 898 | { 899 | uint32_t res = 
cigar_int >> 4; 900 | return res; 901 | } 902 | -------------------------------------------------------------------------------- /ssw201507/ssw.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ssw.h 3 | * 4 | * Created by Mengyao Zhao on 6/22/10. 5 | * Copyright 2010 Boston College. All rights reserved. 6 | * Version 0.1.4 7 | * Last revision by Mengyao Zhao on 01/30/13. 8 | * 9 | */ 10 | 11 | #ifndef SSW_H 12 | #define SSW_H 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif // __cplusplus 22 | 23 | 24 | /*! @typedef structure of the query profile */ 25 | struct _profile; 26 | typedef struct _profile s_profile; 27 | 28 | /*! @typedef structure of the alignment result 29 | @field score1 the best alignment score 30 | @field score2 sub-optimal alignment score 31 | @field ref_begin1 0-based best alignment beginning position on reference; ref_begin1 = -1 when the best alignment beginning 32 | position is not available 33 | @field ref_end1 0-based best alignment ending position on reference 34 | @field read_begin1 0-based best alignment beginning position on read; read_begin1 = -1 when the best alignment beginning 35 | position is not available 36 | @field read_end1 0-based best alignment ending position on read 37 | @field read_end2 0-based sub-optimal alignment ending position on read 38 | @field cigar best alignment cigar; stored the same as that in BAM format, high 28 bits: length, low 4 bits: M/I/D (0/1/2); 39 | cigar = 0 when the best alignment path is not available 40 | @field cigarLen length of the cigar string; cigarLen = 0 when the best alignment path is not available 41 | */ 42 | typedef struct { 43 | uint16_t score1; 44 | uint16_t score2; 45 | int32_t ref_begin1; 46 | int32_t ref_end1; 47 | int32_t read_begin1; 48 | int32_t read_end1; 49 | int32_t ref_end2; 50 | uint32_t* cigar; 51 | int32_t cigarLen; 52 | } s_align; 53 | 54 | /*! 
@function   Create the query profile using the query sequence.
    @param  read    pointer to the query sequence; the query sequence needs to be numbers
    @param  readLen length of the query sequence
    @param  mat pointer to the substitution matrix; mat needs to be corresponding to the read sequence
    @param  n   the square root of the number of elements in mat (mat has n*n elements)
    @param  score_size  estimated Smith-Waterman score; if your estimated best alignment score is surely < 255 please set 0; if
                        your estimated best alignment score >= 255, please set 1; if you don't know, please set 2
    @return pointer to the query profile structure
    @note   example for parameter read and mat:
            The numbers in read are used directly as row/column indices into mat, so they must lie in [0, n).
            If the query sequence is: ACGTATC (A=0, C=1, G=2, T=3), the sequence that read points to can be: 0123031
            Then if the score for match is 2 and for mismatch is -2, the substitution matrix of parameter mat will be:
            //A  C  G  T
              2 -2 -2 -2 //A
             -2  2 -2 -2 //C
             -2 -2  2 -2 //G
             -2 -2 -2  2 //T
            mat is the pointer to the array {2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2}
*/
s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size);

/*! @function   Release the memory allocated by function ssw_init.
    @param  p   pointer to the query profile structure
*/
void init_destroy (s_profile* p);

// @function    ssw alignment.
/*! @function   Do Striped Smith-Waterman alignment.
    @param  prof    pointer to the query profile structure
    @param  ref pointer to the target sequence; the target sequence needs to be numbers and corresponding to the mat parameter of
                function ssw_init
    @param  refLen  length of the target sequence
    @param  weight_gapO the absolute value of gap open penalty
    @param  weight_gapE the absolute value of gap extension penalty
    @param  flag    bitwise FLAG; (from high to low) bit 5: when set to 1, function ssw_align will return the best alignment
                    beginning position; bit 6: when set to 1, if (ref_end1 - ref_begin1 <= filterd && read_end1 - read_begin1
                    <= filterd), (whatever bit 5 is set to) the function will return the best alignment beginning position and
                    cigar; bit 7: when set to 1, if the best alignment score >= filters, (whatever bit 5 is set to) the function
                    will return the best alignment beginning position and cigar; bit 8: when set to 1, (whatever bit 5, 6 or 7 is
                    set to) the function will always return the best alignment beginning position and cigar. When flag == 0, only
                    the optimal and sub-optimal scores and the optimal alignment ending position will be returned.
    @param  filters score filter: when bit 7 of flag is set to 1 and bit 8 is set to 0, filters will be used (Please check the
                    description of the flag parameter for detailed usage.)
    @param  filterd distance filter: when bit 6 of flag is set to 1 and bit 8 is set to 0, filterd will be used (Please check
                    the description of the flag parameter for detailed usage.)
    @param  maskLen The distance between the optimal and suboptimal alignment ending position >= maskLen. We suggest to use
                    readLen/2, if you don't have special concerns. Note: maskLen has to be >= 15, otherwise this function will NOT
                    return the suboptimal alignment information. Detailed description of maskLen: After locating the optimal
                    alignment ending position, the suboptimal alignment score can be heuristically found by checking the second
                    largest score in the array that contains the maximal score of each column of the SW matrix. In order to avoid
                    picking the scores that belong to the alignments sharing the partial best alignment, SSW C library masks the
                    reference loci nearby (mask length = maskLen) the best alignment ending position and locates the second largest
                    score from the unmasked elements.
    @return pointer to the alignment result structure
    @note   Whatever the parameter flag is set to, this function will at least return the optimal and sub-optimal alignment score,
            and the optimal alignment ending positions on target and query sequences. If both bit 6 and 7 of the flag are set
            while bit 8 is not, the function will return cigar only when both criteria are fulfilled. All returned positions are
            0-based coordinate.
*/
s_align* ssw_align (const s_profile* prof,
                    const int8_t* ref,
                    int32_t refLen,
                    const uint8_t weight_gapO,
                    const uint8_t weight_gapE,
                    const uint8_t flag,
                    const uint16_t filters,
                    const int32_t filterd,
                    const int32_t maskLen);

/*! @function   Release the memory allocated by function ssw_align.
    @param  a   pointer to the alignment result structure
*/
void align_destroy (s_align* a);

/*! @function       Produce CIGAR 32-bit unsigned integer from CIGAR operation and CIGAR length
    @param  length      length of CIGAR
    @param  op_letter   CIGAR operation character ('M', 'I', etc); any unlisted character is encoded as 'M' (0)
    @return         32-bit unsigned integer, representing encoded CIGAR operation and length
                    (length in the high 28 bits, operation code in the low 4 bits)
*/
static inline uint32_t to_cigar_int (uint32_t length, char op_letter)
{
    uint32_t res;
    uint8_t op_code;

    switch (op_letter) {
        case 'M': /* alignment match (can be a sequence match or mismatch) */
        default:
            op_code = 0;
            break;
        case 'I': /* insertion to the reference */
            op_code = 1;
            break;
        case 'D': /* deletion from the reference */
            op_code = 2;
            break;
        case 'N': /* skipped region from the reference */
            op_code = 3;
            break;
        case 'S': /* soft clipping (clipped sequences present in SEQ) */
            op_code = 4;
            break;
        case 'H': /* hard clipping (clipped sequences NOT present in SEQ) */
            op_code = 5;
            break;
        case 'P': /* padding (silent deletion from padded reference) */
            op_code = 6;
            break;
        case '=': /* sequence match */
            op_code = 7;
            break;
        case 'X': /* sequence mismatch */
            op_code = 8;
            break;
    }

    res = (length << 4) | op_code;
    return res;
}

/*! @function       Extract CIGAR operation character from CIGAR 32-bit unsigned integer
    @param  cigar_int   32-bit unsigned integer, representing encoded CIGAR operation and length
    @return         CIGAR operation character ('M', 'I', etc)
*/
char cigar_int_to_op (uint32_t cigar_int);

/*! @function       Extract length of a CIGAR operation from CIGAR 32-bit unsigned integer
    @param  cigar_int   32-bit unsigned integer, representing encoded CIGAR operation and length
    @return         length of CIGAR operation
*/
uint32_t cigar_int_to_len (uint32_t cigar_int);

#ifdef __cplusplus
}
#endif  // __cplusplus

#endif  // SSW_H

--------------------------------------------------------------------------------
/ssw201507/ssw.h.gch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/ssw201507/ssw.h.gch
--------------------------------------------------------------------------------
/ssw201507/ssw_wrap.py:
--------------------------------------------------------------------------------
"""
@package    ssw_wrap
@brief      Simple python wrapper for SSW align library
To use the dynamic library libssw.so you may need to modify the LD_LIBRARY_PATH environment
variable to include the library directory (export LD_LIBRARY_PATH=$PWD) or for definitive
inclusion of the lib edit /etc/ld.so.conf and add the path or the directory containing the
library and update the cache by using /sbin/ldconfig as root
@copyright  [The MIT licence](http://opensource.org/licenses/MIT)
@author     Clement & Adrien Leger - 2014
"""

#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages
from ctypes import *

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class CAlignRes(Structure):
    """
    @class  SSWAlignRes
    @brief  ctypes Structure with s_align struct mapping returned by SSWAligner.Align func
            Correspond to the structure of the query profile
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~Ctype Structure~~~~~~~#
_fields_ = [('score', c_uint16), 27 | ('score2', c_uint16), 28 | ('ref_begin', c_int32), 29 | ('ref_end', c_int32), 30 | ('query_begin', c_int32), 31 | ('query_end', c_int32), 32 | ('ref_end2', c_int32), 33 | ('cigar', POINTER(c_uint32)), 34 | ('cigarLen', c_int32)] 35 | 36 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# 37 | class Aligner(object): 38 | """ 39 | @class SSWAligner 40 | @brief Wrapper for SSW align library 41 | """ 42 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# 43 | 44 | #~~~~~~~CLASS VARIABLES~~~~~~~# 45 | 46 | # Dictionnary to map Nucleotide to int as expected by the SSW C library 47 | base_to_int = { 'A':0, 'C':1, 'G':2, 'T':3, 'N':4, 'a':0, 'c':1, 'g':2, 't':3, 'n':4} 48 | int_to_base = { 0:'A', 1:'C', 2:'G', 3:'T', 4:'N'} 49 | 50 | # Load the ssw library using ctypes 51 | libssw = cdll.LoadLibrary('libssw.so') 52 | 53 | # Init and setup the functions pointer to map the one specified in the SSW lib 54 | # ssw_init method 55 | ssw_init = libssw.ssw_init 56 | ssw_init.restype = c_void_p 57 | ssw_init.argtypes = [POINTER(c_int8), c_int32, POINTER(c_int8), c_int32, c_int8] 58 | # init_destroy function 59 | init_destroy = libssw.init_destroy 60 | init_destroy.restype = None 61 | init_destroy.argtypes = [c_void_p] 62 | # ssw_align function 63 | ssw_align = libssw.ssw_align 64 | ssw_align.restype = POINTER(CAlignRes) 65 | ssw_align.argtypes = [c_void_p, POINTER(c_int8), c_int32, c_uint8, c_uint8, c_uint8, c_uint16, c_int32, c_int32] 66 | # align_destroy function 67 | align_destroy = libssw.align_destroy 68 | align_destroy.restype = None 69 | align_destroy.argtypes = [POINTER(CAlignRes)] 70 | 71 | #~~~~~~~FONDAMENTAL METHODS~~~~~~~# 72 | 73 | def __repr__(self): 74 | msg = self.__str__() 75 | msg += "SCORE PARAMETERS:\n" 76 | msg += " Gap Weight Open: {} Extension: {}\n".format(-self.gap_open, -self.gap_extend) 77 | msg += " Align Weight 
Match: {} Mismatch: {}\n\n".format(self.match, -self.mismatch) 78 | msg += " Match/mismatch Score matrix\n" 79 | msg += " \tA\tC\tG\tT\tN\n" 80 | msg += " A\t{}\t{}\t{}\t{}\t{}\n".format(self.match, -self.mismatch, -self.mismatch, -self.mismatch, 0) 81 | msg += " C\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, self.match, -self.mismatch, -self.mismatch, 0) 82 | msg += " G\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, -self.mismatch, self.match, -self.mismatch, 0) 83 | msg += " T\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, -self.mismatch, -self.mismatch, self.match, 0) 84 | msg += " N\t{}\t{}\t{}\t{}\t{}\n\n".format(0,0,0,0,0) 85 | msg += "RESULT PARAMETERS:\n" 86 | msg += " Report cigar {}\n".format(self.report_cigar) 87 | msg += " Report secondary match {}\n\n".format(self.report_secondary) 88 | msg += "REFERENCE SEQUENCE :\n" 89 | if self.ref_len <= 50: 90 | msg += "".join([self.int_to_base[i] for i in self.ref_seq])+"\n" 91 | else: 92 | msg += "".join([self.int_to_base[self.ref_seq[i]] for i in range(50)])+"...\n" 93 | msg += " Lenght :{} nucleotides\n".format(self.ref_len) 94 | return msg 95 | 96 | def __str__(self): 97 | return "\n\n".format(self.__class__.__name__, self.__module__) 98 | 99 | def __init__(self, 100 | ref_seq="", 101 | match=2, 102 | mismatch=2, 103 | gap_open=3, 104 | gap_extend=1, 105 | report_secondary=False, 106 | report_cigar=False): 107 | """ 108 | Initialize object by creating an interface with ssw library fonctions 109 | A reference sequence is also assigned to the object for multiple alignment against queries 110 | with the align function 111 | @param ref_seq Reference sequence as a python string (case insensitive) 112 | @param match Weight for a match 113 | @param mismatch Absolute value of mismatch penalty 114 | @param gap_open Absolute value of gap open penalty 115 | @param gap_extend Absolute value of gap extend penalty 116 | @param report_secondary Report the 2nd best alignement if true 117 | @param report_cigar Report cigar 
string if true 118 | """ 119 | 120 | # Store overall alignment parameters 121 | self.report_secondary = report_secondary 122 | self.report_cigar = report_cigar 123 | 124 | # Set gap penalties 125 | self.set_gap(gap_open, gap_extend) 126 | 127 | # Set the cost matrix 128 | self.set_mat(match, mismatch) 129 | 130 | # Set the reference sequence 131 | self.set_ref(ref_seq) 132 | 133 | #~~~~~~~SETTERS METHODS~~~~~~~# 134 | 135 | def set_gap(self, gap_open=3, gap_extend=1): 136 | """ 137 | Store gapopen and gap extension penalties 138 | """ 139 | self.gap_open = gap_open 140 | self.gap_extend = gap_extend 141 | 142 | 143 | def set_mat(self, match=2, mismatch=2): 144 | """ 145 | Store match and mismatch scores then initialize a Cost matrix and fill it with match and 146 | mismatch values. Ambiguous base: no penalty 147 | """ 148 | self.match = match 149 | self.mismatch = mismatch 150 | 151 | mat_decl = c_int8 * 25 152 | self.mat = mat_decl(match, -mismatch, -mismatch, -mismatch, 0, 153 | -mismatch, match, -mismatch, -mismatch, 0, 154 | -mismatch, -mismatch, match, -mismatch, 0, 155 | -mismatch, -mismatch, -mismatch, match, 0, 156 | 0, 0, 0, 0, 0) 157 | 158 | def set_ref(self, ref_seq): 159 | """ 160 | Determine the size of the ref sequence and cast it in a c type integer matrix 161 | """ 162 | if ref_seq: 163 | self.ref_len = len(ref_seq) 164 | self.ref_seq = self._DNA_to_int_mat (ref_seq, self.ref_len) 165 | else: 166 | self.ref_len = 0 167 | self.ref_seq = "" 168 | 169 | #~~~~~~~PUBLIC METHODS~~~~~~~# 170 | 171 | def align(self, query_seq, min_score=0, min_len=0): 172 | """ 173 | Perform the alignment of query against the object reference sequence 174 | @param query_seq Query sequence as a python string (case insensitive) 175 | @param min_score Minimal score of match. None will be return in case of filtering out 176 | @param min_len Minimal length of match. 
None will be return in case of filtering out 177 | @return A SSWAlignRes Object containing informations about the alignment. 178 | """ 179 | # Determine the size of the ref sequence and cast it in a c type integer matrix 180 | query_len = len(query_seq) 181 | query_seq = self._DNA_to_int_mat (query_seq, query_len) 182 | 183 | # Create the query profile using the query sequence 184 | profile = self.ssw_init(query_seq, # Query seq in c type integers 185 | c_int32(query_len), # Length of Queryseq in bites 186 | self.mat, # Score matrix 187 | 5, # Square root of the number of elements in mat 188 | 2) # flag = no estimation of the best alignment score 189 | 190 | # Setup the mask_len parameters = distance between the optimal and suboptimal alignment 191 | # if < 15, the function will NOT return the suboptimal alignment information 192 | 193 | if query_len > 30: 194 | #mask_len = query_len/2 # for python2.x, comment by Zhiqun Xie 195 | mask_len = int(query_len/2) # for python3.x, modified by Zhiqun Xie 196 | else: 197 | mask_len = 15 198 | 199 | c_result = self.ssw_align (profile, # Query profile 200 | self.ref_seq, # Ref seq in c type integers 201 | c_int32(self.ref_len), # Length of Refseq in bites 202 | self.gap_open, # Absolute value of gap open penalty 203 | self.gap_extend, # absolute value of gap extend penalty 204 | 1, # Bitwise FLAG for output values = return all 205 | 0, # Score filter = return all 206 | 0, # Distance filter = return all 207 | mask_len) # Distance between the optimal and suboptimal alignment 208 | 209 | # Transform the Cstructure into a python object if score and lenght match the requirements 210 | score = c_result.contents.score 211 | match_len = c_result.contents.query_end - c_result.contents.query_begin + 1 212 | 213 | if score >= min_score and match_len >= min_len: 214 | py_result = PyAlignRes(c_result, query_len, self.report_secondary, self.report_cigar) 215 | else: 216 | py_result = None 217 | 218 | # Free reserved space by ssw.init and 
ssw_init methods. 219 | self._init_destroy(profile) 220 | self._align_destroy(c_result) 221 | 222 | # Return the object 223 | return py_result 224 | 225 | #~~~~~~~PRIVATE METHODS~~~~~~~# 226 | 227 | def _DNA_to_int_mat (self, seq, len_seq): 228 | """ 229 | Cast a python DNA string into a Ctype int8 matrix 230 | """ 231 | # Declare the matrix 232 | query_num_decl = c_int8 * len_seq 233 | query_num = query_num_decl() 234 | 235 | # for each letters in ATCGN transform in integers thanks to self.base_to_int 236 | for i in range(len_seq): 237 | try: 238 | value = self.base_to_int[seq[i]] 239 | # if the base is not in the canonic DNA bases assign 4 as for N 240 | except KeyError: 241 | value = 4 242 | finally: 243 | query_num[i] = value 244 | 245 | return query_num 246 | 247 | def _init_destroy(self, profile): 248 | """ 249 | Free the space alocated for the matrix used by init 250 | """ 251 | self.init_destroy(profile) 252 | 253 | def _align_destroy(self, align): 254 | """ 255 | Free the space alocated for the matrix used by align 256 | """ 257 | self.align_destroy(align) 258 | 259 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# 260 | class PyAlignRes(object): 261 | """ 262 | @class PyAlignRes 263 | @brief Extract and verify result from a CAlignRes structure. 
A comprehensive python 264 | object is created according to user requirements (+- cigar string and secondary alignment) 265 | """ 266 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# 267 | 268 | #~~~~~~~CLASS VARIABLES~~~~~~~# 269 | 270 | # Load the ssw library using ctypes 271 | libssw = cdll.LoadLibrary('libssw.so') 272 | 273 | # Init and setup the functions pointer to map the one specified in the SSW lib 274 | # cigar_int_to_len function 275 | cigar_int_to_len = libssw.cigar_int_to_len 276 | cigar_int_to_len.restype = c_int32 277 | cigar_int_to_len.argtypes = [c_int32] 278 | # cigar_int_to_op function 279 | cigar_int_to_op = libssw.cigar_int_to_op 280 | cigar_int_to_op.restype = c_char 281 | cigar_int_to_op.argtypes = [c_int32] 282 | 283 | #~~~~~~~FONDAMENTAL METHOD~~~~~~~# 284 | 285 | def __repr__(self): 286 | msg = self.__str__() 287 | msg += "OPTIMAL MATCH\n" 288 | msg += "Score {}\n".format(self.score) 289 | msg += "Reference begin {}\n".format(self.ref_begin) 290 | msg += "Reference end {}\n".format(self.ref_end) 291 | msg += "Query begin {}\n".format(self.query_begin) 292 | msg += "Query end {}\n".format(self.query_end) 293 | 294 | if self.cigar_string: 295 | msg += "Cigar_string {}\n".format(self.cigar_string) 296 | 297 | if self.score2: 298 | msg += "SUB-OPTIMAL MATCH\n" 299 | msg += "Score 2 {}\n".format(self.score2) 300 | msg += "Ref_end2 {}\n".format(self.ref_end2) 301 | 302 | return msg 303 | 304 | def __str__(self): 305 | return "\n\n".format(self.__class__.__name__, self.__module__) 306 | 307 | 308 | def __init__ (self, Res, query_len, report_secondary=False, report_cigar=False): 309 | """ 310 | Parse CAlignRes structure and copy its values in object variables 311 | @param Res A CAlignRes structure 312 | @param query_len length of the query sequence 313 | @param report_secondary Report the 2nd best alignement if true 314 | @param report_cigar Report cigar string if true 315 | """ 316 | # Parse 
value in the C type structure pointer 317 | # Minimal mandatory parameters 318 | self.score = Res.contents.score 319 | self.ref_begin = Res.contents.ref_begin 320 | self.ref_end = Res.contents.ref_end 321 | self.query_begin = Res.contents.query_begin 322 | self.query_end = Res.contents.query_end 323 | 324 | # Information for sub-optimal match if require and available 325 | score2 = Res.contents.score2 326 | if report_secondary and score2 != 0: 327 | self.score2 = score2 328 | self.ref_end2 = Res.contents.ref_end2 329 | else: 330 | self.score2 = None 331 | self.ref_end2 = None 332 | 333 | # Cigar Information if CIGAR string if require and available 334 | cigar_len = Res.contents.cigarLen 335 | if report_cigar and cigar_len > 0: 336 | self.cigar_string = self._cigar_string (Res.contents.cigar, cigar_len, query_len) 337 | else: 338 | self.cigar_string = None 339 | 340 | #~~~~~~~PRIVATE METHODS~~~~~~~# 341 | 342 | def _cigar_string(self, cigar, cigar_len, query_len): 343 | """ 344 | Convert cigar and cigarLen into an human readable Cigar string as in SAM files 345 | """ 346 | # Empty string for iterative writing of the cigar string 347 | cigar_string = "" 348 | 349 | # If the query match do not start at its first base 350 | # = introduce a softclip at the begining 351 | if self.query_begin > 0: 352 | op_len = self.query_begin 353 | op_char = "S" 354 | cigar_string += '{}{}'.format(op_len, op_char) 355 | 356 | # Iterate over the cigar (pointer to a vector of int) 357 | for i in range(cigar_len): 358 | op_len = self.cigar_int_to_len(cigar[i]) 359 | op_char = self.cigar_int_to_op(cigar[i]) 360 | #cigar_string += '{}{}'.format(op_len, op_char) 361 | cigar_string += '{}{}'.format(op_len, op_char.decode()) # modified by Zhiqun Xie 362 | 363 | # If the lenght of bases aligned is shorter than the overall query length 364 | # = introduce a softclip at the end 365 | end_len = query_len - self.query_end - 1 366 | if end_len != 0: 367 | op_len = end_len 368 | op_char = "S" 369 | 
cigar_string += '{}{}'.format(op_len, op_char) 370 | 371 | return cigar_string 372 | -------------------------------------------------------------------------------- /ssw201507/test1.fna: -------------------------------------------------------------------------------- 1 | >IS1X3_leftend 2 | GGATAATGGTGCCAACTTACTGAT 3 | -------------------------------------------------------------------------------- /ssw201507/test11.fna: -------------------------------------------------------------------------------- 1 | >ISAcma33_left32 2 | GGTCGTGCATCAAAAAAGTGTGGGTTTGTTAA 3 | -------------------------------------------------------------------------------- /ssw201507/test2.fna: -------------------------------------------------------------------------------- 1 | >IS1X3_rightend 2 | GGTAATGACTCCAACTTACTGATA 3 | -------------------------------------------------------------------------------- /ssw201507/test22.fna: -------------------------------------------------------------------------------- 1 | >ISAcma33_right32 2 | GATTGTGCGTCAATAAAGTGTGGGATAGTTGA 3 | -------------------------------------------------------------------------------- /ssw_wrap.py: -------------------------------------------------------------------------------- 1 | """ 2 | @package ssw_wrap 3 | @brief Simple python wrapper for SSW align library 4 | To use the dynamic library libssw.so you may need to modify the LD_LIBRARY_PATH environment 5 | variable to include the library directory (export LD_LIBRARY_PATH=$PWD) or for definitive 6 | inclusion of the lib edit /etc/ld.so.conf and add the path or the directory containing the 7 | library and update the cache by using /sbin/ldconfig as root 8 | @copyright [The MIT licence](http://opensource.org/licenses/MIT) 9 | @author Clement & Adrien Leger - 2014 10 | """ 11 | 12 | #~~~~~~~GLOBAL IMPORTS~~~~~~~# 13 | # Standard library packages 14 | from ctypes import * 15 | 16 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# 17 | 
class CAlignRes(Structure):
    """
    @class  CAlignRes
    @brief  ctypes mapping of the s_align result structure returned by the ssw_align C function.
            The order and width of the fields describe the memory layout read from the C side,
            so they have to stay in sync with the declaration in ssw.h.
    """
    #~~~~~~~Ctype Structure~~~~~~~#
    _fields_ = [('score', c_uint16),               # optimal alignment score
                ('score2', c_uint16),              # sub-optimal alignment score (treated as absent when 0)
                ('ref_begin', c_int32),            # 0-based begin of the optimal alignment on the reference
                ('ref_end', c_int32),              # 0-based end of the optimal alignment on the reference
                ('query_begin', c_int32),          # 0-based begin of the optimal alignment on the query
                ('query_end', c_int32),            # 0-based end of the optimal alignment on the query
                ('ref_end2', c_int32),             # end of the sub-optimal alignment on the reference
                ('cigar', POINTER(c_uint32)),      # encoded CIGAR operations: (length << 4) | op_code
                ('cigarLen', c_int32)]             # number of entries in the cigar array


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Aligner(object):
    """
    @class  Aligner
    @brief  Wrapper for the SSW align library. An instance stores one reference sequence together
            with the scoring parameters; any number of queries can then be aligned against it
            with the align method.
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~CLASS VARIABLES~~~~~~~#

    # Dictionary mapping nucleotides to the integer codes expected by the SSW C library.
    # The codes are also the row/column indexes of the 5x5 score matrix built by set_mat.
    base_to_int = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, 'a': 0, 'c': 1, 'g': 2, 't': 3, 'n': 4}
    int_to_base = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: 'N'}

    # Load the ssw library using ctypes (raises OSError at import time if it cannot be found)
    libssw = cdll.LoadLibrary('libssw.so')

    # Init and setup the function pointers to map the ones specified in the SSW lib
    # ssw_init function
    ssw_init = libssw.ssw_init
    ssw_init.restype = c_void_p
    ssw_init.argtypes = [POINTER(c_int8), c_int32, POINTER(c_int8), c_int32, c_int8]
    # init_destroy function
    init_destroy = libssw.init_destroy
    init_destroy.restype = None
    init_destroy.argtypes = [c_void_p]
    # ssw_align function
    ssw_align = libssw.ssw_align
    ssw_align.restype = POINTER(CAlignRes)
    ssw_align.argtypes = [c_void_p, POINTER(c_int8), c_int32, c_uint8, c_uint8, c_uint8, c_uint16,
                          c_int32, c_int32]
    # align_destroy function
    align_destroy = libssw.align_destroy
    align_destroy.restype = None
    align_destroy.argtypes = [POINTER(CAlignRes)]

    #~~~~~~~FONDAMENTAL METHODS~~~~~~~#

    def __repr__(self):
        """
        Return a multi-line report: scoring parameters, score matrix, result options and the
        first 50 bases (at most) of the reference sequence.
        """
        msg = self.__str__()
        msg += "SCORE PARAMETERS:\n"
        msg += " Gap Weight Open: {} Extension: {}\n".format(-self.gap_open, -self.gap_extend)
        msg += " Align Weight Match: {} Mismatch: {}\n\n".format(self.match, -self.mismatch)
        msg += " Match/mismatch Score matrix\n"
        msg += " \tA\tC\tG\tT\tN\n"
        msg += " A\t{}\t{}\t{}\t{}\t{}\n".format(self.match, -self.mismatch, -self.mismatch, -self.mismatch, 0)
        msg += " C\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, self.match, -self.mismatch, -self.mismatch, 0)
        msg += " G\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, -self.mismatch, self.match, -self.mismatch, 0)
        msg += " T\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, -self.mismatch, -self.mismatch, self.match, 0)
        msg += " N\t{}\t{}\t{}\t{}\t{}\n\n".format(0, 0, 0, 0, 0)
        msg += "RESULT PARAMETERS:\n"
        msg += " Report cigar {}\n".format(self.report_cigar)
        msg += " Report secondary match {}\n\n".format(self.report_secondary)
        msg += "REFERENCE SEQUENCE :\n"
        # Only the first 50 bases are printed; longer references are marked with an ellipsis
        shown = min(self.ref_len, 50)
        msg += "".join([self.int_to_base[self.ref_seq[i]] for i in range(shown)])
        msg += "\n" if self.ref_len <= 50 else "...\n"
        msg += " Lenght :{} nucleotides\n".format(self.ref_len)
        return msg

    def __str__(self):
        """
        Return a short tag naming the class and the module of the instance.
        """
        # The two format arguments were previously dropped because the string had no placeholder
        return "\n<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)

    def __init__(self,
                 ref_seq="",
                 match=2,
                 mismatch=2,
                 gap_open=3,
                 gap_extend=1,
                 report_secondary=False,
                 report_cigar=False):
        """
        Initialize object by creating an interface with ssw library functions
        A reference sequence is also assigned to the object for multiple alignment against queries
        with the align function
        @param ref_seq Reference sequence as a python string (case insensitive)
        @param match Weight for a match
        @param mismatch Absolute value of mismatch penalty
        @param gap_open Absolute value of gap open penalty
        @param gap_extend Absolute value of gap extend penalty
        @param report_secondary Report the 2nd best alignment if true
        @param report_cigar Report cigar string if true
        """
        # Store overall alignment parameters
        self.report_secondary = report_secondary
        self.report_cigar = report_cigar

        # Set gap penalties
        self.set_gap(gap_open, gap_extend)

        # Set the cost matrix
        self.set_mat(match, mismatch)

        # Set the reference sequence
        self.set_ref(ref_seq)

    #~~~~~~~SETTERS METHODS~~~~~~~#

    def set_gap(self, gap_open=3, gap_extend=1):
        """
        Store gap open and gap extension penalties
        @param gap_open Absolute value of gap open penalty
        @param gap_extend Absolute value of gap extend penalty
        """
        self.gap_open = gap_open
        self.gap_extend = gap_extend

    def set_mat(self, match=2, mismatch=2):
        """
        Store match and mismatch scores then initialize a 5x5 cost matrix (A, C, G, T, N) and fill
        it with match and mismatch values. Ambiguous base: no penalty
        @param match Weight for a match
        @param mismatch Absolute value of mismatch penalty
        """
        self.match = match
        self.mismatch = mismatch

        mat_decl = c_int8 * 25
        self.mat = mat_decl(match, -mismatch, -mismatch, -mismatch, 0,
                            -mismatch, match, -mismatch, -mismatch, 0,
                            -mismatch, -mismatch, match, -mismatch, 0,
                            -mismatch, -mismatch, -mismatch, match, 0,
                            0, 0, 0, 0, 0)

    def set_ref(self, ref_seq):
        """
        Determine the size of the ref sequence and cast it in a c type integer matrix
        @param ref_seq Reference sequence as a python string; an empty value clears the reference
        """
        if ref_seq:
            self.ref_len = len(ref_seq)
            self.ref_seq = self._DNA_to_int_mat(ref_seq, self.ref_len)
        else:
            self.ref_len = 0
            self.ref_seq = ""

    #~~~~~~~PUBLIC METHODS~~~~~~~#

    def align(self, query_seq, min_score=0, min_len=0):
        """
        Perform the alignment of query against the object reference sequence
        @param query_seq Query sequence as a python string (case insensitive)
        @param min_score Minimal score of match. None will be returned in case of filtering out
        @param min_len Minimal length of match. None will be returned in case of filtering out
        @return A PyAlignRes object containing information about the alignment, or None if the
                alignment does not reach min_score or min_len
        @exception ValueError if the C library returns a NULL profile or a NULL alignment result
        """
        # Determine the size of the query sequence and cast it in a c type integer matrix
        query_len = len(query_seq)
        query_num = self._DNA_to_int_mat(query_seq, query_len)

        # Setup the mask_len parameter = distance between the optimal and suboptimal alignment
        # if < 15, the function will NOT return the suboptimal alignment information
        mask_len = query_len // 2 if query_len > 30 else 15

        # Create the query profile using the query sequence
        profile = self.ssw_init(query_num,           # Query seq in c type integers
                                c_int32(query_len),  # Length of the query sequence
                                self.mat,            # Score matrix
                                5,                   # Square root of the number of elements in mat
                                2)                   # flag = no estimation of the best alignment score

        # A c_void_p restype yields None for a NULL pointer; never hand NULL to ssw_align
        if not profile:
            raise ValueError("ssw_init returned a NULL query profile")

        c_result = None
        try:
            c_result = self.ssw_align(profile,                # Query profile
                                      self.ref_seq,           # Ref seq in c type integers
                                      c_int32(self.ref_len),  # Length of the reference sequence
                                      self.gap_open,          # Absolute value of gap open penalty
                                      self.gap_extend,        # Absolute value of gap extend penalty
                                      1,                      # Bitwise FLAG for output values = return all
                                      0,                      # Score filter = return all
                                      0,                      # Distance filter = return all
                                      mask_len)               # Distance between the optimal and suboptimal alignment

            # Same exception type as the former implicit NULL dereference, with a usable message
            if not c_result:
                raise ValueError("ssw_align returned a NULL alignment result")

            # Transform the C structure into a python object if score and length match the
            # requirements. Every value is copied before the C memory is released below.
            res = c_result.contents
            match_len = res.query_end - res.query_begin + 1
            if res.score >= min_score and match_len >= min_len:
                return PyAlignRes(c_result, query_len, self.report_secondary, self.report_cigar)
            return None
        finally:
            # Free the space reserved by ssw_init and ssw_align, even when an exception is raised
            self._init_destroy(profile)
            if c_result:
                self._align_destroy(c_result)

    #~~~~~~~PRIVATE METHODS~~~~~~~#

    def _DNA_to_int_mat(self, seq, len_seq):
        """
        Cast a python DNA string into a ctypes int8 array
        @param seq DNA sequence as a python string
        @param len_seq Number of leading characters of seq to convert
        @return ctypes c_int8 array of length len_seq
        """
        # Declare the array
        query_num = (c_int8 * len_seq)()

        # Each letter is converted with base_to_int; anything outside the canonical DNA bases
        # is assigned 4 as for N
        to_int = self.base_to_int.get
        for i in range(len_seq):
            query_num[i] = to_int(seq[i], 4)

        return query_num

    def _init_destroy(self, profile):
        """
        Free the space allocated for the query profile created by ssw_init
        """
        self.init_destroy(profile)

    def _align_destroy(self, align):
        """
        Free the space allocated for the alignment result created by ssw_align
        """
        self.align_destroy(align)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class PyAlignRes(object):
    """
    @class  PyAlignRes
    @brief  Extract and verify result from a CAlignRes structure. A comprehensive python
            object is created according to user requirements (+- cigar string and secondary
            alignment). All values are copied, so the object stays valid after the C structure
            has been released.
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~CLASS VARIABLES~~~~~~~#

    # Load the ssw library using ctypes
    libssw = cdll.LoadLibrary('libssw.so')

    # Init and setup the function pointers to map the ones specified in the SSW lib.
    # Both functions are declared with uint32_t in ssw.h, hence c_uint32 rather than c_int32.
    # cigar_int_to_len function
    cigar_int_to_len = libssw.cigar_int_to_len
    cigar_int_to_len.restype = c_uint32
    cigar_int_to_len.argtypes = [c_uint32]
    # cigar_int_to_op function
    cigar_int_to_op = libssw.cigar_int_to_op
    cigar_int_to_op.restype = c_char
    cigar_int_to_op.argtypes = [c_uint32]

    #~~~~~~~FONDAMENTAL METHOD~~~~~~~#

    def __repr__(self):
        """
        Return a multi-line report of the optimal match and, when stored, of the cigar string
        and the sub-optimal match.
        """
        msg = self.__str__()
        msg += "OPTIMAL MATCH\n"
        msg += "Score {}\n".format(self.score)
        msg += "Reference begin {}\n".format(self.ref_begin)
        msg += "Reference end {}\n".format(self.ref_end)
        msg += "Query begin {}\n".format(self.query_begin)
        msg += "Query end {}\n".format(self.query_end)

        if self.cigar_string:
            msg += "Cigar_string {}\n".format(self.cigar_string)

        if self.score2:
            msg += "SUB-OPTIMAL MATCH\n"
            msg += "Score 2 {}\n".format(self.score2)
            msg += "Ref_end2 {}\n".format(self.ref_end2)

        return msg

    def __str__(self):
        """
        Return a short tag naming the class and the module of the instance.
        """
        # The two format arguments were previously dropped because the string had no placeholder
        return "\n<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)

    def __init__(self, Res, query_len, report_secondary=False, report_cigar=False):
        """
        Parse CAlignRes structure and copy its values in object variables
        @param Res Pointer to a CAlignRes structure
        @param query_len length of the query sequence
        @param report_secondary Report the 2nd best alignment if true
        @param report_cigar Report cigar string if true
        """
        res = Res.contents

        # Minimal mandatory parameters
        self.score = res.score
        self.ref_begin = res.ref_begin
        self.ref_end = res.ref_end
        self.query_begin = res.query_begin
        self.query_end = res.query_end

        # Information for sub-optimal match if required and available
        if report_secondary and res.score2 != 0:
            self.score2 = res.score2
            self.ref_end2 = res.ref_end2
        else:
            self.score2 = None
            self.ref_end2 = None

        # CIGAR string if required and available
        if report_cigar and res.cigarLen > 0:
            self.cigar_string = self._cigar_string(res.cigar, res.cigarLen, query_len)
        else:
            self.cigar_string = None

    #~~~~~~~PRIVATE METHODS~~~~~~~#

    def _cigar_string(self, cigar, cigar_len, query_len):
        """
        Convert cigar and cigarLen into a human readable cigar string as in SAM files
        @param cigar Pointer to the array of encoded CIGAR operations
        @param cigar_len Number of entries in the cigar array
        @param query_len length of the query sequence, used to compute the trailing soft clip
        @return Cigar string, soft clips included
        """
        # Pieces are collected in a list and joined once at the end
        pieces = []

        # If the query match does not start at its first base
        # = introduce a softclip at the beginning
        if self.query_begin > 0:
            pieces.append('{}{}'.format(self.query_begin, "S"))

        # Iterate over the cigar (pointer to a vector of int)
        for i in range(cigar_len):
            op_len = self.cigar_int_to_len(cigar[i])
            # c_char results come back as bytes under python3.x and have to be decoded
            op_char = self.cigar_int_to_op(cigar[i]).decode()
            pieces.append('{}{}'.format(op_len, op_char))

        # If the aligned part ends before the last base of the query
        # = introduce a softclip at the end (never emit a negative length)
        end_len = query_len - self.query_end - 1
        if end_len > 0:
            pieces.append('{}{}'.format(end_len, "S"))

        return "".join(pieces)