├── LICENSE
├── NC_012624.fna
├── README.md
├── __pycache__
├── constants.cpython-33.pyc
├── isPredict.cpython-33.pyc
├── is_analysis.cpython-33.pyc
├── pred.cpython-33.pyc
├── ssw_wrap.cpython-33.pyc
└── tools.cpython-33.pyc
├── constants.py
├── isPredict.py
├── is_analysis.py
├── isescan.py
├── pHMMs
├── clusters.faa.hmm
└── clusters.single.faa
├── pred.py
├── publication
├── SupplementaryMaterials.docx
├── SupplementaryMaterials.xlsx
└── btx433.pdf
├── pyssw.py
├── ssw201507
├── Makefile
├── __pycache__
│ └── ssw_wrap.cpython-33.pyc
├── example.c
├── kseq.h
├── license.ssw.txt
├── main.c
├── pyssw.py
├── result.sam
├── ssw.c
├── ssw.h
├── ssw.h.gch
├── ssw_wrap.py
├── test1.fna
├── test11.fna
├── test2.fna
└── test22.fna
├── ssw_wrap.py
└── tools.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ISEScan [](https://bioconda.github.io/recipes/isescan/README.html) [](https://quay.io/repository/biocontainers/isescan)
2 |
3 | ## A python pipeline to identify IS (Insertion Sequence) elements in genome and metagenome
4 | - ISEScan can be used to identify/annotate full-length or non-full-length IS elements in any DNA sequence but ISEScan was only tested on prokaryotic genomes, including draft genomes and meta-genomes.
5 | - Among the existing tools identifying IS elements, ISEScan might be the only one that gives TIR (Terminal Inverted Repeat) sequences.
6 | - The input sequence file (namely, genome or meta-genome) of ISEScan can contain one or more sequences and there is no limit on the length of each sequence, though ISEScan was only tested on complete genome with one or more sequences, draft genome with many contigs, assembled meta-genome with many contigs.
7 | - The only requirement for the input sequence file is: the sequence file must be in **FASTA** format. When ISEScan is started, it first scans the sequences in the FASTA file one by one, then identifies/annotates the IS elements in each sequence independently, and finally outputs all identified/annotated IS elements for each sequence and the statistics of identified/annotated IS elements from all sequences in the input FASTA file.
8 | - Unknown bases are allowed in the sequences, e.g. ACACGCCCGTTGTTTT**NNNNNNNNN**, GGGTCAGGTCATCAACTTTAGCGTAACGC**NNNNN**GGG.
9 | - If you just want to identify potential transposases (not FULL or partial IS elements) in your sequences and don't want to install ISEScan, you can do so by following two steps: 1) download the transposase models (clusters.faa.hmm and clusters.single.faa) from ISEScan subdirectory [pHMMs](https://github.com/xiezhq/ISEScan/tree/master/pHMMs), 2) install and use software HMMER (version 3.1b2 or later) to search transposases in your sequences.
10 | - ISEScan users asked many good questions (see [issues](https://github.com/xiezhq/ISEScan/issues)) which have been answered by the developer of ISEScan. If you didn't find the answers you want at [issues](https://github.com/xiezhq/ISEScan/issues), you can open a new issue at [issues](https://github.com/xiezhq/ISEScan/issues).
11 | - If you want to replace some (or all) of genes/proteins predicted by ISEScan (actually FragGeneScan called by ISEScan) to predict transposases and IS elements, you can try manually replacing gene boundaries and protein sequences in file `.faa` under directory `results/proteome` after you run ISEScan on your genome sequences. For how to do so, please check [my comments](https://github.com/xiezhq/ISEScan/issues/45) on May 2022.
12 |
13 | ## Table of Contents
14 | - [Overview](#Overview)
15 | - [Citation](#Citation)
16 | - [Contact](#Contact)
17 | - [Installation](#Installation)
18 | - [ISEScan on linux](#install-on-linux)
19 | - [ISEScan on mac](#install-on-mac)
20 | - [Automated install by Bioconda (recommended!)](#Bioconda-install)
21 | - [Manual install (install from source code)](#Manual-install)
22 | - [Upgrade ISEScan to the latest version](#Upgrade)
23 | - [Usage example](#Usage)
24 | - [Tips to run ISEScan efficiently](#Tips)
25 | - [How to run a set of genomes in a row](#lots-of-genomes)
26 | - [Re-run ISEScan without gene/protein prediction and HMMER searching](#Re-run)
27 | - [Release History](#Release)
28 |
29 |
30 | ## Overview
31 | ISEScan is a python pipeline to identify IS (Insertion Sequence) elements in genome. It includes an option to report either complete IS elements or both complete and partial IS elements. It might be a good idea to try reporting both complete and partial IS elements when it is used to identify the IS elements in the assemblies of metagenome. ISEScan reports both complete and partial IS elements by default.
32 |
33 | ISEScan was developed using Python3. It 1) scans genome (or metagenome) in fasta format; 2) predicts/translates (using FragGeneScan) genome into proteome; 3) searches the pre-built pHMMs (profile Hidden Markov Models) of transposases (two files shipped with ISEScan; clusters.faa.hmm and clusters.single.faa) against the proteome and identifies the transposase gene in genome; 4) then extends the identified transposase gene into the complete IS (Insertion Sequence) elements based on the common characteristics shared by the known IS elements reported by literatures and database; 5) finally reports the identified IS elements in a few result files (e.g. a file containing a list of IS elements, a file containing sequences of IS elements in fasta format, an annotation file in GFF3 format).
34 |
35 |
36 | ## Citation
37 | Zhiqun Xie, Haixu Tang. ISEScan: automated identification of Insertion Sequence Elements in prokaryotic genomes. *Bioinformatics*, 2017, 33(21): 3340-3347.
38 |
39 | Download: [full text](https://doi.org/10.1093/bioinformatics/btx433), [SupplementaryMaterials.docx](publication/SupplementaryMaterials.docx), [SupplementaryMaterials.xlsx](publication/SupplementaryMaterials.xlsx).
40 |
41 |
42 | ## Contact
43 | Zhiqun Xie: `xiezhq@hotmail.com`
44 |
45 |
46 | ## Installation
47 |
48 | #### ISEScan on linux
49 | ISEScan was tested on Linux only and can be installed from Bioconda packages and source code. Install from Bioconda is recommended as it is the simplest way for non-experienced users.
50 |
51 | #### ISEScan on mac
52 | I have no idea about ISEScan on mac as I only fully tested it on Linux. If you cannot install ISEScan on mac from Bioconda, you can try installing ISEScan from source codes. For installing ISEScan from source codes, I knew there was an issue to compile FragGeneScan on Mac but I once solved it. To solve the problem of running FragGeneScan on Mac, please modify two source files in FragGeneScan source codes: 1) open util_lib.c and comment out ‘#include ’ on line3; 2) open hmm_lib.c and comment out ‘#include ’ on line6 and replace values.h with limits.h on line4. The modified FragGeneScan can run on Mac and Linux without problem according to my test result.
53 |
54 |
55 | #### Automated install by Bioconda (recommended!)
56 | The steps below will install ISEScan package via bioconda to /apps/inst/miniconda3/. You can install ISEScan to other place by changing the default miniconda3 install path in step **Install Miniconda3**. Visit [Bioconda recipe for ISEScan](https://bioconda.github.io/recipes/isescan/README.html) for more details (Thanks both [pbasting](https://github.com/pbasting) and [tseemann](https://github.com/tseemann) for making it available!).
57 | - Install [Bioconda](https://bioconda.github.io/user/install.html). To minimize the install time and size, we [install miniconda](https://docs.conda.io/en/latest/miniconda.html#linux-installers)
58 | - Download [Miniconda3-latest-Linux-x86_64 installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh)
59 | ```
60 | curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
61 | ```
62 | - Install Miniconda3
63 | ```
64 | sh Miniconda3-latest-Linux-x86_64.sh
65 | ```
66 | - Please answer yes (see my screen shot below) for all questions of `sh Miniconda3-latest-Linux-x86_64.sh` if you have no idea about the questions.
67 | ```
68 | Do you wish the installer to initialize Miniconda3
69 | by running conda init? [yes|no]
70 | [no] >>> yes
71 | ```
72 | ```
73 | rm Miniconda3-latest-Linux-x86_64.sh
74 | source ~/.bashrc
75 | ```
76 | - Add the bioconda channel as well as the other channels bioconda depends on. It is important to add them in this order so that the priority is set correctly (that is, conda-forge is highest priority).
77 | ```
78 | conda config --add channels defaults
79 | conda config --add channels bioconda
80 | conda config --add channels conda-forge
81 | ```
82 | - Install and update ISEScan
83 | ```
84 | conda install isescan
85 | ```
86 | - Try ISEScan (You can find the available command options by running `isescan.py -h`).
87 | ```
88 | cp /apps/inst/miniconda3/test/NC_012624.fna ./
89 | isescan.py --seqfile NC_012624.fna --output results --nthread 2
90 | ```
91 | Note: replace `/apps/inst/miniconda3` in commands with your conda install path.
92 |
93 | If system reports `isescan.py: command not found...`, please add ISEScan package to your `PATH` (replace `/apps/inst/miniconda3` in the command below with your conda install path):
94 | ```
95 | export PATH=/apps/inst/miniconda3/bin/:$PATH
96 | ```
97 | Then, try ISEScan again:
98 | ```
99 | isescan.py --seqfile NC_012624.fna --output results --nthread 2
100 | ```
101 |
102 |
103 | #### Manual install (install from source code)
104 | - Install ISEScan
105 | - Download the latest ISEScan from https://github.com/xiezhq/ISEScan/releases, e.g. **Source code (tar.gz)**.
106 |
107 | - Uncompress the .zip (or .tar.gz) file.
108 | - Use unzip command to uncompress the zip file:
109 | ```
110 | unzip v1.7.2.2.zip
111 | ```
112 | - Use tar command to uncompress the tar.gz file:
113 | ```
114 | tar -zvxf v1.7.2.2.tar.gz
115 | ```
116 | This will create a ISEScan folder, e.g. ISEScan-1.7.2.2. You need to go to ISEScan folder to configure and run it.
117 | ```
118 | cd ISEScan-1.7.2.2
119 | ```
120 | - Install dependencies before you run ISEScan
121 | - Python 3.3.3 or later
122 | - numpy-1.8.0 or later
123 | - scipy-0.13.1 or later
124 | - fastcluster, latest version recommended, https://pypi.python.org/pypi/fastcluster
125 | - FragGeneScan1.30 or earlier, (The .faa file output by version1.31 is not compatible with ISEScan!), http://omics.informatics.indiana.edu/FragGeneScan
126 | - HMMER-3.1b2 or later, http://hmmer.org/download.html
127 | - BLAST 2.2.31 or later
128 | - SSW Library, the latest version is not tested with ISEScan and the tested version of SSW library is shipped with ISEScan, please find it at ssw201507 subdirectory.
129 | - To use the shipped SSW library in ISEScan, please go to ssw201507 and then compile the codes by gcc:
130 | ```
131 | cd ssw201507
132 | gcc -Wall -O3 -pipe -fPIC -shared -rdynamic -o libssw.so ssw.c ssw.h
133 | ```
134 | - And then copy libssw.so and set search path:
135 | ```
136 | cp libssw.so ../
137 | export LD_LIBRARY_PATH=/home/xiezhq/projects/ISEScan-1.7.2.2:$LD_LIBRARY_PATH
138 | ```
139 | In command `export LD_LIBRARY_PATH=/home/xiezhq/projects/ISEScan-1.7.2.2:$LD_LIBRARY_PATH`, please replace `/home/xiezhq/projects/ISEScan-1.7.2.2` with the actual path of libssw.so on your computer!
140 | - The latest SSW library can be found at https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library.
141 | - biopython 1.62 or later (required by SSW library)
142 |
143 | - Add the required packages to your $PATH before you run ISEScan
144 | - Add to $PATH the paths pointing to run_FragGeneScan.pl, phmmer, hmmsearch, blastn, blastp, makeblastdb
145 | ```
146 | export PATH=$PATH:/apps/inst/FragGeneScan1.30:/apps/inst/hmmer-3.3/bin:/apps/inst/ncbi-blast-2.10.0+/bin
147 | ```
148 | In command export above, please replace `/apps/inst/FragGeneScan1.30`, `/apps/inst/hmmer-3.3/bin` and `/apps/inst/ncbi-blast-2.10.0+/bin` with the actual paths of FragGeneScan, HMMER and BLAST on your computer!
149 |
150 |
151 |
152 | ## Upgrade ISEScan to the latest version
153 | ### Automated upgrade from Bioconda
154 | The latest version becomes available on Bioconda a few hours or days after it is released on https://github.com/xiezhq/ISEScan. You can run the command below to upgrade the existing ISEScan if the existing ISEScan was installed by Bioconda.
155 | ```
156 | conda update isescan
157 | ```
158 | ### Manual upgrade from existing ISEScan
159 | By manual upgrade, you may get the latest version immediately from https://github.com/xiezhq/ISEScan. It is quite easy to upgrade the existing ISEScan to the latest version: copy all .py files from the latest version to the ISEScan install directory.
160 | - Locate the existing ISEscan (ISEScan install directory). If you don't know where isescan.py is installed, you can run `which isescan.py` to help find where it is on your system.
161 | ```
162 | which isescan.py
163 | /apps/inst/miniconda3/bin/isescan.py
164 | ```
165 | - Get the latest ISEScan source codes and then copy all the .py files to ISEScan install directory. Please check [Manual install (install from source code)](#Manual-install) on how to get the latest ISEScan source codes. For example, you downloaded the latest ISEScan, v1.7.2.2.2.tar.gz.
166 | ```
167 | tar -zxf v1.7.2.2.2.tar.gz
168 | cd ISEScan-1.7.2.2.2/
169 | cp *.py /apps/inst/miniconda3/bin/
170 | ```
171 | ### Check and test the upgraded ISEScan
172 | - Check the version of the upgraded ISEScan.
173 | ```
174 | python3 isescan.py --version
175 | ```
176 | or
177 | ```
178 | isescan.py --version
179 | ```
180 | - Test the upgraded ISEScan.
181 | ```
182 | python3 isescan.py --seqfile /apps/inst/miniconda3/test/NC_012624.fna --output /home/xiezhq/results --nthread 2
183 | ```
184 |
185 |
186 | ## Usage example
187 | Let's try an example, NC_012624.fna.
188 |
189 | - The command below scans NC_012624.fna (genome sequence of Sulfolobus_islandicus_Y_N_15_51, ~42 kb), and outputs all results in `results` directory:
190 | ```
191 | cp /apps/inst/miniconda3/test/NC_012624.fna ./
192 | isescan.py --seqfile NC_012624.fna --output results --nthread 2
193 | ```
194 | Note: run `isescan.py -h` or `isescan.py --help` to get help.
195 | - Wait for it to finish. It may take a while (~40 seconds) as ISEScan uses HMMER to scan the genome sequences and it will use 621 profile HMM models to scan each protein sequence (predicted by FragGeneScan) in the genome sequence. HMMER searching is usually more sensitive but slower than the regular BLAST searching for remote homologs. The running time for larger genome will increase quickly, e.g. about 20 minutes for NC_000913.fna (genome sequence of Escherichia coli str. K-12 substr. MG1655, ~4.6 Mb) with two cpu cores on my virtual machine.
196 |
197 | - After ISEScan finishes running, you can find the output files in results directory:
198 | - NC_012624.fna.sum: the summarization of IS copies for each IS family
199 | - NC_012624.fna.csv: details about IS copies in NC_012624, one copy per line, comma-separated tabular table
200 | - NC_012624.fna.tsv: details about IS copies in NC_012624, one copy per line, tab-separated tabular table
201 | - NC_012624.fna.raw: details about IS copies in NC_012624, one copy per line
202 | - NC_012624.fna.gff: listing each IS copy and its TIR, gff3 format
203 | - NC_012624.fna.is.fna: the nucleic acid sequence of each IS copy, fasta format
204 | - NC_012624.fna.orf.fna: the nucleic acid sequence of the Tpase gene in each IS copy, fasta format
205 | - NC_012624.fna.orf.faa: the amino acid sequence of the Tpase in each IS copy, fasta format
206 |
207 | - Details about NC_012624.fna.sum:
208 | - The title line starts with `#`, followed by the summarization of IS content for each sequence in NC_012624. The last line is the summarization of IS content for all sequences in NC_012624.
209 | - Summarization of IS content for each sequence in NC_012624:
210 | - seqid: sequence identifier, extracted from head lines beginning with `>` in NC_012624.fna, usually the text between `>` and the first blank character in a head line
211 | - family: family name of IS element
212 | - nIS: number of IS copies assigned to the specific family in a sequence
213 | - %Genome: percentage of genome sequence content spanned by IS elements in a sequence, calculated by bps4IS/dnaLen (see the following columns)
214 | - bps4IS: length of sequence segments spanned by IS elements in a sequence
215 | - dnaLen: length of the specific sequence
216 |
217 | - Details about NC_012624.fna.csv (NC_012624.fna.tsv, NC_012624.fna.raw):
218 | - The first row is header line listing column names.
219 | - The rows after the first row are the main content of NC_012624.fna.csv file, one IS copy per line.
220 | - Columns in NC_012624.fna.csv (NC_012624.fna.tsv, NC_012624.fna.raw):
221 | - seqID: sequence identifier
222 | - family: family name of IS element
223 | - cluster: Tpase cluster
224 | - isBegin and isEnd: genome coordinates of the predicted IS element
225 | - isLen: length of the predicted IS element
226 | - ncopy4is: number of predicted IS copies including full-length and partial IS copies
227 | - start1, end1, start2, end2: genome coordinates of the IRs
228 | - score: score of the IRs
229 | - irId: number of identical matches in pairwise alignment of left and right hand inverted repeats
230 | - irLen: length of inverted repeats
231 | - nGaps: number of gaps in IRs
232 | - orfBegin, orfEnd: genome coordinates of the predicted Tpase ORF
233 | - strand: strand where the Tpase is
234 | - orfLen: length of predicted Tpase ORF
235 | - E-value: the best E-value among all IS copies for the same IS element, the smaller the better
236 | - E-value4copy: the E-value of the reported IS copy, the smaller the better
237 | - Note: the E-value is the E-value returned by hmmer when searching profile HMMs against proteome translated from a genome sequence
238 | - type: type of IS element copy, 'c' for complete IS element and 'p' for partial IS element
239 | - ov: ov number returned by hmmer search
240 | - tir: terminal inverted repeat sequences
241 |
242 |
243 | ## Tips to run ISEScan efficiently:
244 |
245 | ### How to run a set of genomes in a row
246 | Sometimes, we want to run hundreds of genomes in one line of command and then wait for all computing jobs to complete. Before doing it, we assume:
247 | - You can successfully run ISEScan on one genome:
248 | - run commands as the following if you installed ISEScan via Bioconda.
249 | ```
250 | conda activate base
251 | isescan.py --seqfile NC_012624.fna --output results
252 | ```
253 | - run the commands as the following if you installed ISEScan manually.
254 | ```
255 | python3 /home/xiezhq/projects/ISEScan-1.7.2.2/isescan.py --seqfile genome1.fa --output results
256 | ```
257 | where genome1.fa is your genome sequence file in fasta format. By default, ISEScan will use one CPU core but you can change it using command option `--nthread NTHREAD`, e.g.
258 | ```
259 | isescan.py --seqfile genome1.fa --output results --nthread 2
260 | ```
261 | - You are running ISEScan jobs on a Linux computer instead of a Linux cluster system.
262 | - Your Linux computer has **nproc** (nproc could be 1 or 2 or 4 or 6 or 8 or ....) CPU cores.
263 | - You want to run ISEScan on ngenome (ngenome could be 1 or 2 or 3, ...) fasta file(s) (genome) in parallel on your Linux computer.
264 |
265 | Now, let's run 200 genomes in one line of command and then wait for all computing jobs to complete (probably several days or weeks, depending on how many hours are required for each of your 200 genomes on average). If your computer has 8 CPU cores, you can execute the command below:
266 | ```
267 | nohup cat test.fna.list | xargs -n 1 -P 4 -I{} isescan.py --seqfile {} --output results --nthread 2 > log.txt &
268 | ```
269 |
270 | In the command line,
271 | - **test.fna.list** is a text file which includes 200 fasta files, one fasta file per row, for example:
272 | ```
273 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS014235.scaffolds.fa
274 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS049959.scaffolds.fa
275 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS020233.scaffolds.fa
276 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS022609.scaffolds.fa
277 | /N/dc2/scratch/zhiqxie/hmp/HMASM/SRS024132.scaffolds.fa
278 | ```
279 | - **-n 1** tells your computer to pick only one fasta file from **test.fna.list** for each ISEScan computing job.
280 | - **-P 4** tells your computer to spawn 4 processes at the same time (run 4 ISEScan jobs in parallel, namely, run 4 genomes at the same time). When one job completes with success or exits with error, a new ISEScan job on the next fasta file (e.g. 5th fasta file) in **test.fna.list** is spawned. So, the command line will keep 4 ISEScan computing jobs (one fasta file per ISEScan job) running on your computer, and each job utilizes two CPU cores (as requested by `--nthread 2`). It means all of 8 CPU cores on your computer have been utilized by your 4 ISEScan computing jobs till the last fasta file is processed by ISEScan.
281 | - **> log.txt** tells your computer to write the screen messages output by ISEScan to the file **log.txt**.
282 | - **&** tells your computer to run jobs in the background without interrupting you on the current terminal (e.g. xterm), in order that you can work on other things on the same terminal.
283 | You can check your job status by the command `top -c -u xiezhq` (assuming your user name is **xiezhq**).
284 |
285 | It might take several days or weeks for 200 genomes to complete. It depends on how many CPU cores you have on your computer and how fast each CPU core is. Please do not load too many ISEScan jobs because each ISEScan job will consume part of your RAM on your computer. However, you can always test and estimate how many GB RAM and how many hours are required for one genome.
286 |
287 |
288 | ### Re-run ISEScan without gene/protein prediction and HMMER searching
289 | - ISEScan will run much faster if you run it on the same genome sequence more than once (e.g., trying different optimal parameters of near and far regions to search for IS elements in your genome; see our paper [...] for the definitions of near and far regions). The reason is that it skips either FragGeneScan or both FragGeneScan and phmmer/hmmsearch steps, which are the most time-consuming steps in the ISEScan pipeline.
290 | - If you prefer ISEScan recalculating the results, you can simply remove the proteome file and HMMER search results which are related to your genome sequence file name. For example, you can delete NC_012624.fna.faa in `results/proteome` directory and clusters.faa.hmm.NC_012624.fna.faa and clusters.single.faa.NC_012624.fna.faa in `results/hmm` directory, and then rerun it:
291 | ```
292 | isescan.py --seqfile NC_012624.fna --output results
293 | ```
294 |
295 |
296 | ## Release History
297 | - 1.7.3
298 | - fix the bug reported in issues 59-60 and increase version number from 1.7.2.3 to 1.7.3. (Thanks lxsteiner, adriludwig, the-reese, ChristophKnapp and SRooke for reporting the issue)
299 | - 1.7.2.3
300 | - remove the bug in pred.py, which causes the issue 'UnboundLocalError: local variable raworfhits referenced before assignment' in rare cases.
301 | - 1.7.2.2.2
302 | - add code to remove temporary files (created by tempfile.NamedTemporaryFile()) once blastn search completes in case that large amounts of temporary files consume too much space. (Thanks Biancamaria for the suggestion)
303 | - 1.7.2.2
304 | - ISEScan can output .csv (columns are separated by `,`) and .tsv (columns are separated by `tab`) result files, which are much easier for users to parse the results (Thanks oschwengers for his suggestion)
305 | - add command options `--seqfile` and `--output` to remove the positional parameters `seqfile`, `proteome` and `hmm` (Thanks oschwengers for his suggestion)
306 | - modify constants.py to remove the hard coded paths pointing to the third party dependencies and the output directory `dir4prediction` (Thanks oschwengers for his suggestion)
307 | - add tips for installing ISEScan from source codes on Mac (Thanks [Ania Gorska](https://github.com/gvalchca) for her suggestion)
308 | - 1.7.2.1
309 | - modify constants.py to remove the hard coded path pointing to the profile HMM files (clusters.single.faa and clusters.faa.hmm)
310 | - update readme to add an introduction for installing ISEScan package via bioconda (Thanks both [pbasting](https://github.com/pbasting) and [tseemann](https://github.com/tseemann) for making it available!)
311 | - 1.7.2.
312 | - Add command options `--removeShortIS` and `--no-FragGeneScan`, and remove `removeShortIS` and `translateGenome` from constants.py. (Thanks EricDeveaud for his suggestion and codes)
313 | - Add command option `--nthread` to isescan.py, and remove `nthread` and `nproc` from constants.py.
314 | - Remove useless parallel testing codes from code base.
315 | - 1.7.1
316 | - fix a bug in constants.py, which fails to locate the correct path pointing to profile HMM files (clusters.single.faa and clusters.faa.hmm). Thank giuliodimaria92 for it.
317 | - 1.7
318 | - Set removeShortIS = False in constants.py for ISEScan to report both complete and partial IS elements by default. One additional column (type) was added accordingly in .raw output file to label each IS element copy as either complete (c) or partial (p) IS element. For details refer to the section 'Details about NC_012624.fna.raw' in Readme.
319 | - 1.6
320 | - Update Readme about the configuration of ISEScan where the paths to clusters.faa.hmm and clusters.single.faa should also be correctly specified in constants.py (Thank Ania Gorska for it).
321 | - 1.5.4.3
322 | - Fix the bug which failed to report the Tpase ORFs in multi-copy IS elements, and ISEScan now output a .raw file with one additional column E-value4copy which is the E-value of the reported IS copy while the column E-value is the best E-value among all IS copies for the same IS element.
323 | - 1.5.4.1
324 | - fix bug for batch4bacteria.py when *.sum files were created by either outputIndividual() or outputIS4multipleSeqOneFile() in pred.py
325 | - 1.5.4
326 | - Add removeFalsePositive() to remove the potential false positive in the 'new' family: 1) single-copy hits with e-value > e-50 or no tir or nGaps > 0 or irId < 20 or irId/irLen < 0.75; 2) multi-copy hits with evalue > e-50 and (irId < 13 or (irId < 20 and ngaps > 0))
327 | - Modify refineHits() to remove the single-copy partial IS elements: 1) if evalue > e-50 or (irId < 13 or (irId < 20 and ngaps > 0)) for families other than IS200/IS605
328 | - Modify refineHits() to remove the multi-copy partial IS elements: 1) if evalue > e-50 for IS200/IS605 family; 2) if irId < 10 for families other than ten families which could have the full IS without perfect TIR (irId < 10), IS110, IS4, IS5, IS6, ISAS1, ISH3, ISNCY.
329 | - Change irSim4singleCopy in constants.py from 0.85 to 0.75, for the use in removeFalsePositive()
330 | - 1.5.3
331 | - Fix bug in getFullIS4seqOnStream() for genome sequence with long multi-copy fragments containing the common IS element
332 | - Use 'average' instead of 'single' method in fastcluster.linkage()
333 | - Fix bug in removeOverlappedOrfhits() to correctly count single-copy IS elements for genome sequence without multi-copy IS elements
334 | - 1.5.2
335 | - Fix bug for genome sequence without multi-copy IS elements
336 | - 1.5.1
337 | - Change: changed consensusBoundaryByCutoff() to consensusBoundaryByCutoffBySeparated()
338 | - Change: added consensusBoundaryByCutoffByCombined() and getbds4opt4start(), to determine the left and right boundaries of multi-copy pro-IS element simultaneously, namely, to determine the optimal combined left and right boundaries instead of separated left and right boundaries.
339 | - 1.5
340 | - Change: add consensusBoundaryByCutoff() and ncopyByCutoff() in tools.py, to determine the optimal boundary of multi-copy pro-IS element.
341 | - 1.4
342 | - Change: recruit the IS copies without predicted Tpase when search for multi-copy IS elements
343 | - 1.3
344 | - Remove buildHMM.py from ISEScan
345 | - 1.2
346 | - CHANGE: pHMMs `clusters.faa.hmm` and `clusters.single.faa`, both files are now built upon the curated ACLAME dataset (ACLAME is a mobile genetic element database.)
347 | - 1.1.1
348 | - Add option in `constants.py` to report either complete IS elements or both complete and partial IS elements
349 | - 1.0
350 | - The first proper release
351 |
--------------------------------------------------------------------------------
/__pycache__/constants.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/constants.cpython-33.pyc
--------------------------------------------------------------------------------
/__pycache__/isPredict.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/isPredict.cpython-33.pyc
--------------------------------------------------------------------------------
/__pycache__/is_analysis.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/is_analysis.cpython-33.pyc
--------------------------------------------------------------------------------
/__pycache__/pred.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/pred.cpython-33.pyc
--------------------------------------------------------------------------------
/__pycache__/ssw_wrap.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/ssw_wrap.cpython-33.pyc
--------------------------------------------------------------------------------
/__pycache__/tools.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/__pycache__/tools.cpython-33.pyc
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
## Config packages
#
# Names of the third-party programs ISEScan depends on. Edit these values so
# that ISEScan can find the required packages on your computer.
#
FragGeneScan = 'run_FragGeneScan.pl'    # FragGeneScan
phmmer, hmmsearch = 'phmmer', 'hmmsearch'    # Hmmer
blastn, blastp, makeblastdb = 'blastn', 'blastp', 'makeblastdb'    # Blast
#
## Config packages

# Directory part of sys.argv[0], i.e. the path where isescan.py is.
import sys
path2isescan = os.path.dirname(sys.argv[0])

# Paths of the profile HMM files, both located in the pHMMs directory under path2isescan:
#   file4clusterSeqFile4phmmer: peptide sequences of single-member clusters, used by phmmer in hmmer
#   file4clusterHMM: profile HMMs of multiple-member clusters, used by hmmsearch in hmmer
file4clusterSeqFile4phmmer, file4clusterHMM = (
    os.path.join(path2isescan, 'pHMMs', fileName)
    for fileName in ('clusters.single.faa', 'clusters.faa.hmm')
)
32 |
33 |
34 | # for local linux machine
35 | #path2results = ''
36 | #dir4prediction = os.path.join(path2results, 'prediction')
37 |
38 |
# Optimal values for SSW to find TIR in database.
# Each filter is a tuple of SSW scoring parameters: (gapopen, gapextend, match, mismatch)
#
# Optimal filter when aligning two sequences with length = maxLenIR
filters4ssw4isMax = [(1, 10, 4, 5)] # giving the greatest number of matched IS elements and
                                    # the greatest number of matched best IS elements
filters4ssw4trial = [(2, 6, 2, 2)] # trial filter to stop alignment from creating the consecutive gaps

# Minimal and maximal length (bp) of full-length IS element in each family:
# {family: (minimal length, maximal length)}
minMaxLen4is = {
    'IS1': (732, 4601),
    'IS110': (969, 4105),
    'IS1182': (1330, 1980),
    'IS1380': (1474, 4160),
    'IS1595': (701, 7915),
    # IS1016V5 (272 bp) is a deleted variant of IS1016V6: 242/711 bp. IS1016V4 (672 bp) is a deleted variant of IS1016V6: 673/711 bp.
    # Then ISMha1 (701 bp) is the shortest member in family IS1595.

    'IS1634': (1511, 2089),
    'IS200/IS605': (407, 2223),
    'IS21': (1924, 3533),
    'IS256': (1124, 1629),
    'IS3': (435, 1814),
    'IS30': (1027, 8273),
    'IS4': (521, 5396),
    'IS481': (553, 3451),
    'IS5': (789, 5396),
    'IS6': (696, 1648),
    'IS607': (1415, 2607),
    'IS630': (895, 2009),
    'IS66': (1364, 3481),
    # IS867 has about 75 % homology with IS866. IS866 is 2716 bp.
    # Then ISMno3 (1364 bp) is the shortest member in family IS66.
    'IS701': (1016, 2207),
    'IS91': (712, 2604),
    'IS982': (845, 1282),
    'ISAS1': (1139, 3041),
    'ISAZO13': (1284, 2171),
    'ISH3': (1200, 1509),
    'ISKRA4': (1164, 3746),
    'ISL3': (536, 9109),
    'ISNCY': (786, 3989),
    'new': (400, 10000), # for the novel IS families in database
    }
83 |
# Peptide and ORF lengths of tpases, one tuple per family:
# The first column: shortest tpase ORF (bp)
# The second column: longest tpase ORF (bp)
# The third column: shortest peptide ORF (bp) among all peptides in IS_PEP record for each IS element
# To be added: shortest tpase (aa), longest tpase (aa),
# ORF = tpase + stopcodon
minMax4tpase = {
    'IS1': (666, 1119, 252),
    'IS110': (603, 1380, 156),
    'IS1182': (822, 1731, 570),
    'IS1380': (1158, 1554, 1158),
    'IS1595': (576, 1158, 426),
    'IS1634': (1314, 1875, 1314),
    'IS200/IS605': (366, 1482, 147),
    'IS21': (882, 1758, 231),
    'IS256': (990, 1389, 990),
    'IS3': (441, 1581, 120),
    'IS30': (540, 1419, 189),
    'IS4': (570, 1629, 219),
    'IS481': (447, 1794, 447),
    'IS5': (360, 1908, 75),
    'IS6': (528, 1062, 246),
    'IS607': (768, 1653, 453),
    'IS630': (510, 1194, 318),
    'IS66': (354, 1695, 165),
    'IS701': (921, 1410, 921),
    'IS91': (648, 1548, 648),
    'IS982': (627, 981, 429),
    'ISAS1': (594, 1329, 189),
    'ISAZO13': (1203, 2094, 513),
    'ISH3': (573, 1206, 549),
    'ISKRA4': (1047, 1719, 114),
    'ISL3': (414, 1716, 408),
    'ISNCY': (573, 1815, 123),
    'new': (300, 2100, 50), # for the novel IS families in database
    }
120 |
# Allowed minimal, maximal and optimal values of the length of TIR sequence for each family:
# {family: (minimal length, maximal length, optimal length, TIR marker)}
# Here, the optimal values are the empirical parameter based on the observations.
# The 4th column is a marker indicating whether the family always has TIR (1) or no TIR (0),
# and -1 for not determined (in the family, some members have tir but others have no tir).
minMax4tir = {
    'IS1': (8, 67, 14, 1),
    'IS110': (2, 31, 14, -1),
    'IS1182': (8, 44, 10, 1),
    'IS1380': (7, 39, 10, 1),
    'IS1595': (10, 43, 15, 1),
    'IS1634': (11, 32, 12, 1),
    'IS200/IS605': (10000, 0, 10000, 0), # prevent program from finding any tir with irLen > 0
    #'IS200/IS605_8': (11, 11, 11, 1), # cluster 8 (cdhit30) of IS200/IS605 has tir with irLen == 0 or irLen == 11
    #'IS200/IS605': (11, 11, 11, -1), # cluster 8 (cdhit30) of IS200/IS605 has tir with irLen == 0 or irLen == 11
    'IS21': (8, 76, 10, 1),
    'IS256': (8, 48, 15, 1),
    'IS3': (7, 54, 10, -1),
    'IS30': (11, 50, 12, 1),
    'IS4': (8, 67, 12, 1),
    'IS481': (5, 52, 10, 1),
    'IS5': (7, 45, 14, 1),
    'IS6': (12, 36, 14, 1),
    'IS607': (12, 46, 12, -1),
    'IS630': (3, 92, 11, 1),
    'IS66': (11, 144, 11, 1),
    'IS701': (12, 38, 12, 1),
    'IS91': (11, 21, 11, -1),
    'IS982': (11, 35, 11, 1),
    'ISAS1': (12, 34, 12, 1),
    'ISAZO13': (18, 48, 18, 1),
    'ISH3': (11, 31, 15, 1),
    'ISKRA4': (15, 40, 18, 1),
    'ISL3': (6, 50, 11, 1),
    'ISNCY': (4, 52, 13, -1),
    'new': (10, 50, 20, -1), # use the popular values for the novel IS families in database
    }
# ssw will use the optimal length (third column of minMax4tir) as minimal length of the
# alignment of two tir sequences if useOPTtir == True, else the minimal length (first column).
#useOPTtir = True
useOPTtir = False

# the minimum of ratios of irId/irLen
minIrIdentity = 0.4
# optimal ratio of irId/irLen
optIrIdentity = 0.6
# stringent irId/irLen, which is usually required when irLen < 5(stringentShortestIR) or irLen > 55(stringentLongestIR)
stringentIrIdentity = 0.7
168 |
# maximum distance (bp) between two neighboring orfs (including +/- strand) within one IS element
# 764 IS elements with multiple ORFs with clear coordinates in ORF records,
# 405 with distBetweenORFs >=0,
# 1/405 with dist >= 1000, 6/405(1%) with dist >= 500, 14/405(3%) with dist >= 400,
# 22/405(5%) with dist >= 300, 31/405(8%) with dist >= 250, 44/405(11%) with dist >= 200,
# 90/405(22%) with dist >= 100, 202/405(50%) with dist >= 55, 214/405(53%) with dist >= 50
#
# not to merge
#maxDistBetweenOrfs = -1
# merge ORFs with gap = 0, ('NC_000913.3', 4518418, 4519014, '+') and ('NC_000913.3', 4519015, 4519224, '+')
#maxDistBetweenOrfs = 0
# merge ORFs with gap <= 100 bps
maxDistBetweenOrfs = 100

# In a dataset, 3891 IS elements with both lORF2TER and rORF2TER >= 0,
# 36/3891(1%) with lORF2TER >= 500, 177/3891(5%) with lORF2TER >= 250,
# 51/3891 with rORF2TER >= 500, 232/3891 with rORF2TER >= 250
# ~99% of IS elements in the dataset have lORF2TER/rORF2TER less than 500 bps
# ~95% of IS elements in the dataset have lORF2TER/rORF2TER less than 250 bps
#
# switch maxDist4ter2orf between 500 and 250
#maxDist4ter2orf = 250
maxDist4ter2orf = 500
outerDist4ter2tpase = (150,500)

# Minimum distance (bp) from near ends of IS element to the nearest ORF, namely,
# the length of the shortest linker between TIR and the nearest ORF.
minDist4ter2orf = -150
#minDist4ter2orf = -50
#
# There is no linker (space) between TIR and the nearest ORF.
#minDist4ter2orf = 1

# The strand does not matter when extracting two terminal sequences to align, namely,
# which sequence is the first sequence in pairwise alignment does not matter.
#splitAlign2orf = True
splitAlign2orf = False
206 |
# IS elements with identicalBases/lengthOfAlignment > sim4iso are regarded as the same IS element (isoform)
# Isoforms have been defined as elements which share in the first instance more than 95% identity
# at the level of their transposase protein sequence or otherwise 90% at the DNA level.
#sim4iso = 0.85
sim4iso = 0.9
#
# SIM4ISO = sim4iso * 100, used by blastn search to get copy number of hit.
# It is computed from sim4iso (and rounded to an integer percentage) so that the
# two cutoffs cannot drift apart when sim4iso is tuned.
SIM4ISO = int(round(sim4iso * 100))
#
# similarity cutoff for protein sequence, as a fraction (aaSim4iso) and as the
# matching integer percentage (aaSIM4ISO)
aaSim4iso = 0.95
aaSIM4ISO = int(round(aaSim4iso * 100))
220 |
# Two neighboring sequences with overlap >= min4overlap are deemed overlapped.
min4overlap = 0.5 # 50%

# Two sequences with intersect >= min4intersect are deemed intersect.
#min4intersect = 100 # 100 bp, namely, 33 aa or so.
min4intersect = 1 # 1 bp.

# two neighboring segments with overlap > overlap2removeRedundancy are considered overlapped (redundant)
overlap2removeRedundancy = 0.5 # 50%
#overlap2removeRedundancy = 0.99999999999 # 100%

# use min4intersect if True else overlap2removeRedundancy as the threshold to
# turn on clustering and remove intersected ISs/hits except the representative in a cluster.
#intersected2remove = True
intersected2remove = False

# hits with evalue <= min4evalue are defined as the final hits.
min4evalue = 1e-10
#min4evalue = 1e-5

# more strict evalue and tir are required for single copy hits
evalue4singleCopy = 1e-50
#irSim4singleCopy = 0.85 # irId/irLen
irSim4singleCopy = 0.75 # irId/irLen

# E-value cutoff for filtering hits returned by HMM search
evalue2filterHMMhits = min4evalue
#evalue2filterHMMhits = 10 # do not filter out any hits returned by HMM search

# Parameters for removing potential falsely discovered novel IS elements (family 'new') and partial IS elements
#
# {excludedFamilys:(full,partial,no)}:
# {'IS110':(54,19,3), 'IS4':(2,3,1), 'IS5':(2,1,1), 'IS6':(2,0,0), 'IS630':(1,1,7),
# 'IS66':(1,0,2), 'IS91':(1,1,2), 'ISAS1':(2,0,0), 'ISH3':(8,1,3), 'ISNCY':(3,1,4)}
# The full IS elements in the families above might exist without perfect TIR with irId < 10.
# We should hence exclude these families when filtering out the partial IS elements without perfect TIR.
#excludedFamilys = ['IS110', 'IS4', 'IS5', 'IS6', 'IS630', 'IS66', 'IS91', 'ISAS1', 'ISH3', 'ISNCY']
excludedFamilys = ['IS110', 'IS4', 'IS5', 'IS6', 'ISAS1', 'ISH3', 'ISNCY']
#
# number of matches in tir alignment,
# which are used for removing the potential falsely discovered IS elements (false positive) and partial IS elements without perfect TIR.
# Refer to removeFalsePositive() and refineHits() in pred.py for more details.
cutoff4irId4short = 13
cutoff4irId4long = 20
cutoff4irId4multicopy = 10
#
# Parameters for removing potential falsely discovered novel IS elements (family 'new') and partial IS elements


# width of line in fasta file created by us
fastaLineWidth = 60
272 |
# Complement tables for DNA. Each na1* string lists nucleotide codes and the
# na2* string of the same suffix lists their complements, position by position.
#------------------------------------------
# Code  Represents          Complement
# A     Adenine             T
# G     Guanine             C
# C     Cytosine            G
# T     Thymine             A
# Y     Pyrimidine (C or T) R
# R     Purine (A or G)     Y
# W     weak (A or T)       W
# S     strong (G or C)     S
# K     keto (T or G)       M
# M     amino (C or A)      K
# D     A, G, T (not C)     H
# V     A, C, G (not T)     B
# H     A, C, T (not G)     D
# B     C, G, T (not A)     V
# X/N   any base            X/N
# -     Gap                 -
#------------------------------------------
# Upper case: all codes of the table above except the gap.
na1u = 'ATCGNRYWSKMDVHBX'
na2u = 'TAGCNYRWSMKHBDVX'
# Lower case: only a, t, c, g, r and y are listed.
na1l = 'atcgry'
na2l = 'tagcyr'
# Mixed case: the upper-case form of the lower-case table followed by the lower-case table itself.
na1ul = na1l.upper() + na1l
na2ul = na2l.upper() + na2l
305 |
# The Genetic Codes
# Refer to http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes
# The Bacterial, Archaeal and Plant Plastid Code (transl_table=11).
#
# The codon table is written in compact form: codons are enumerated with each
# base running over T, C, A, G (first base slowest, third base fastest), and
# _aminoAcids4table11 gives the translation of each codon in that order
# ('*' marks a stop codon).
_bases4codon = 'TCAG'
_aminoAcids4table11 = (
    'FFLLSSSSYY**CC*W'    # TTT ... TGG
    'LLLLPPPPHHQQRRRR'    # CTT ... CGG
    'IIIMTTTTNNKKSSRR'    # ATT ... AGG
    'VVVVAAAADDEEGGGG'    # GTT ... GGG
)
# 'starts' holds the start codons; every other key is a codon mapped to its one-letter amino acid.
table11 = {'starts': ('TTG', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTG')}
table11.update(zip(
    (b1 + b2 + b3 for b1 in _bases4codon for b2 in _bases4codon for b3 in _bases4codon),
    _aminoAcids4table11))
# genetic code tables keyed by the translation table id
gene2pepTable = {'11': table11}
333 |
--------------------------------------------------------------------------------
/isPredict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import time, random
5 | import os
6 | import argparse
7 | import sys
8 | import datetime
9 | import operator
10 | import concurrent.futures
11 |
12 | import constants
13 | import tools
14 | import is_analysis
15 | import pred
16 |
17 |
def genome2proteome(args2concurrent):
    """Translate genomes into proteomes, one genome after another.

    args2concurrent: iterable of argument tuples, each passed unchanged to
        is_analysis.translate_genome_dna_v3(); the first item of a tuple is
        the DNA sequence file.

    Raises RuntimeError as soon as one translation returns a value other than 0.
    """
    print("\nBegin to translate genome into proteome.")

    for jobArgs in args2concurrent:
        status = is_analysis.translate_genome_dna_v3(jobArgs)
        dna_file = jobArgs[0]
        if status != 0:
            raise RuntimeError(
                "Translating genome into proteome for {}, return error!".format(dna_file))
        print('Translating genome into proteome for', dna_file, ', return ', status)

    print("\nFinish translating genome into proteome.", datetime.datetime.now().ctime())
31 |
32 |
# proteome_files: [(faaFileName, org, update), ...]
#     faaFileName: peptide sequence file output by FragGeneScan
#     org: organism id which is the parent directory of DNA sequence file
#     update: True if the proteome file was (re)built, so the search must be rerun
# Return (args2concurrent, outFiles4phmmer)
#     args2concurrent: [(clusterSeqFile4phmmer, faaFileName, output_file, nthread), ...],
#         one tuple per phmmer search which still has to be run
#     outFiles4phmmer: [output_file, ...]
#         output_file: file,
#             hmmer hits file with full path, e.g. /path/output4hmmsearch_illumina_5_cdhit30/HMASM/clusters.single.faa.SRS078176.scaffolds.fa.faa
def prepare4phmmer(clusterSeqFile4phmmer, proteome_files, path_to_hmmsearch_results, nthread):
    # last line of a complete hits file (as created by hmmer-3.1b2)
    okTag = b'# [ok]\n'

    args2concurrent = []
    outFiles4phmmer = []
    query = os.path.basename(clusterSeqFile4phmmer)
    for proteome_file in proteome_files:
        faaFileName, org, update = proteome_file
        if not os.path.isfile(faaFileName) or os.stat(faaFileName).st_size == 0:
            print('No such file or Empty file', faaFileName)
            continue
        fileName = '.'.join([query, os.path.basename(faaFileName)])
        output_file = os.path.join(path_to_hmmsearch_results, org, fileName)

        if update == True:
            callhmmer = True
        elif os.path.isfile(output_file) and os.stat(output_file).st_size > 0:
            # Reuse the existing hits file only if it ends with the ok tag.
            # The file is read in binary mode through a context manager so that
            # the handle is always closed, and the seek offset is clamped at 0
            # so that a file shorter than the tag is reported as incomplete
            # instead of raising ValueError (negative seek position).
            with open(output_file, 'rb') as fp:
                fp.seek(0, os.SEEK_END)
                fp.seek(max(fp.tell() - len(okTag), 0))
                # incomplete file misses the last line of the normal file
                callhmmer = okTag not in fp.read()
        else:
            callhmmer = True

        if callhmmer == True:
            args2concurrent.append((clusterSeqFile4phmmer, faaFileName, output_file, nthread))
            tools.makedir(os.path.dirname(output_file))
        else:
            print('Skip phmmer {} against {}'.format(clusterSeqFile4phmmer, faaFileName))

        outFiles4phmmer.append(output_file)
    return (args2concurrent, outFiles4phmmer)
72 |
# proteome_files: [(faaFileName, org, update), ...]
# Return (args2concurrent, outFiles4hmmsearch)
#     args2concurrent: [(hmms_file, faaFileName, output_file, nthread), ...],
#         one tuple per hmmsearch run which still has to be done
#     outFiles4hmmsearch: [output_file, ...]
#         output_file: output of hmmsearch, e.g. clusters.faa.hmm.NC_000913.fna.faa, clusters.faa.hmm.SRS014235.scaffolds.fa.faa
def prepare4hmmsearch(hmms_file, proteome_files, path_to_hmmsearch_results, nthread):
    # last line of a complete hits file (as created by hmmer-3.1b2)
    okTag = b'# [ok]\n'

    args2concurrent = []
    outFiles4hmmsearch = []
    query = os.path.basename(hmms_file)
    for proteome_file in proteome_files:
        faaFileName, org, update = proteome_file
        if not os.path.isfile(faaFileName) or os.stat(faaFileName).st_size == 0:
            print('No such file or Empty file', faaFileName)
            continue
        fileName = '.'.join([query, os.path.basename(faaFileName)])
        output_file = os.path.join(path_to_hmmsearch_results, org, fileName)

        if update == True:
            callhmmer = True
        elif os.path.isfile(output_file) and os.stat(output_file).st_size > 0:
            # Reuse the existing hits file only if it ends with the ok tag.
            # The file is read in binary mode through a context manager so that
            # the handle is always closed, and the seek offset is clamped at 0
            # so that a file shorter than the tag is reported as incomplete
            # instead of raising ValueError (negative seek position).
            with open(output_file, 'rb') as fp:
                fp.seek(0, os.SEEK_END)
                fp.seek(max(fp.tell() - len(okTag), 0))
                # incomplete file misses the last line of the normal file
                callhmmer = okTag not in fp.read()
        else:
            callhmmer = True

        if callhmmer == True:
            args2concurrent.append((hmms_file, faaFileName, output_file, nthread))
            tools.makedir(os.path.dirname(output_file))
        else:
            print('Skip hmmsearch {} against {}'.format(hmms_file, faaFileName))

        outFiles4hmmsearch.append(output_file)
    return (args2concurrent, outFiles4hmmsearch)
107 |
def hmmSearch(args2concurrent):
    """Run the profile HMM searches one after another.

    args2concurrent: iterable of (hmms_file, proteome_file, hmmHitsFile, nthread)
        tuples, each passed unchanged to is_analysis.is_hmmsearch_v2().

    Raises RuntimeError as soon as one search returns a value other than 0.
    """
    print("\nBegin to profile HMM search against proteome database.", datetime.datetime.now().ctime())

    for job in args2concurrent:
        status = is_analysis.is_hmmsearch_v2(job)
        hmms_file, proteome_file, hmmHitsFile, nthread = job
        if status != 0:
            raise RuntimeError(''.join(
                ['Profile HMM searching ', hmms_file, ' against ', proteome_file, ', return error!\n']))
        print('Finish Profile HMM searching', hmms_file, ' against', proteome_file, ', output', hmmHitsFile)

    print("\nFinish profile HMM searching against proteome database.", datetime.datetime.now().ctime())
121 |
def phmmerSearch(args2concurrent4phmmer):
    """Run the phmmer searches one after another.

    args2concurrent4phmmer: iterable of (seqFile, proteome_file, hmmHitsFile, nthread)
        tuples, each passed unchanged to is_analysis.is_phmmer().

    Raises RuntimeError as soon as one search returns a value other than 0.
    """
    print("\nBegin to phmmer search against proteome database.", datetime.datetime.now().ctime())

    for job in args2concurrent4phmmer:
        status = is_analysis.is_phmmer(job)
        seqFile, proteome_file, hmmHitsFile, nthread = job
        if status != 0:
            raise RuntimeError(''.join(
                ['phmmer searching ', seqFile, ' against ', proteome_file, ', return error!\n']))
        print('Finish phmmer searching', seqFile, ' against', proteome_file, ', output', hmmHitsFile)

    print("\nFinish phmmer searching against proteome database.", datetime.datetime.now().ctime())
135 |
136 |
# dnaFiles: [(file, org), ..., (file, org)]
# Return proteome_files: [(faaFile, org, update), ...]
#     faaFile: proteome file, dir2proteome/org/<name of DNA file>.faa
#     update: True if faaFile did not exist and has just been scheduled for translation
def translateGenomeByFGS_v2(dnaFiles, dir2proteome, nthread):
    # Settings passed on to the translation step.
    # Alternative once tried: seq_type = '1' with train_model = 'complete'.
    # Other train models: sanger_5, sanger_10, 454_5, 454_10, 454_30, illumina_10.
    seq_type = '0'
    train_model = 'illumina_5'

    proteome_files = []
    pendingJobs = []
    for dna_file, org in dnaFiles:
        output_file = os.path.join(dir2proteome, org, os.path.basename(dna_file))
        faaFile = output_file + '.faa'

        # translate genome into proteome only if the proteome file is not available yet
        needsTranslation = not os.path.isfile(faaFile)
        if needsTranslation:
            tools.makedir(os.path.dirname(faaFile))
            pendingJobs.append((dna_file, output_file, seq_type, train_model, nthread))
        elif os.stat(faaFile).st_size == 0:
            # an existing but empty proteome file: leave this genome out
            print('No gene was found for', dna_file)
            continue
        else:
            print('Skip translating {} into {}'.format(dna_file, faaFile))

        proteome_files.append((faaFile, org, needsTranslation))

    # Translate genome into proteome.
    if pendingJobs:
        genome2proteome(pendingJobs)
    else:
        print('Skip translating genome into proteome.')
    return proteome_files
179 |
# Based on the GenBank file (.gbk) next to each genome sequence file, it reads the annotated
# protein sequences from NCBI and then writes a protein sequence file same as the output of FragGeneScan.
# dnaFiles: [(file, org), ..., (file, org)]
# Return proteome_files: [(fgsFile, org, update), ...], update is always True
def proteinFromNCBI(dnaFiles, dir2proteome):
    proteome_files = []
    # Convert GenBank protein info (NC_000913.gbk)
    # into FragGeneScan protein file format (NC_000913.fna.faa)
    update = True
    for item in dnaFiles:
        fnaFile, org = item
        # The .gbk file shares the name of the sequence file without its extension.
        # os.path.splitext() is used instead of cutting the last 4 characters so that
        # extensions of any length (.fa, .fna, .fasta, ...) give the right name.
        gbkFile = os.path.splitext(fnaFile)[0] + '.gbk'
        fgsFile = os.path.join(dir2proteome, org, os.path.basename(fnaFile + '.faa'))
        tools.gbk2fgs4protein(fnaFile, gbkFile, fgsFile)
        proteome_files.append((fgsFile, org, update))
    return proteome_files
198 |
def isPredict(dna_list, output, removeShortIS, translateGenome,
        nthread=1):
    """Run the whole prediction pipeline for the genomes listed in dna_list.

    Steps: build the proteome files (FragGeneScan, or the NCBI annotation when
    translateGenome is not True), run phmmer and hmmsearch against them, then
    hand the hits files to pred.pred().

    dna_list: passed to tools.rdDNAlist() to get the DNA sequence files
    output: directory receiving the 'proteome' and 'hmm' sub-directories
    removeShortIS: forwarded to pred.pred(); only complete IS elements are
        reported unless it is False
    translateGenome: True to predict genes with FragGeneScan
    nthread: forwarded to the external programs and to pred.pred()

    Raises RuntimeError if one of the profile HMM files is missing or empty.
    """
    dnaFiles = tools.rdDNAlist(dna_list)
    path_to_proteome = os.path.join(output, 'proteome')
    if translateGenome == True:
        print ("predict and translate genes from genome sequence into protein database using FragGeneScan program")
        proteome_files = translateGenomeByFGS_v2(dnaFiles, path_to_proteome, nthread)
    else:
        print ("use NCBI protein database")
        proteome_files = proteinFromNCBI(dnaFiles, path_to_proteome)

    clusterSeqFile4phmmer = constants.file4clusterSeqFile4phmmer
    hmms_file = constants.file4clusterHMM

    # HMM searches against protein database
    #
    path_to_hmmsearch_results = os.path.join(output, 'hmm')

    # phmmer with the single-member clusters (clusters.single.faa must be valid)
    if not (os.path.isfile(clusterSeqFile4phmmer) and os.stat(clusterSeqFile4phmmer).st_size > 0):
        raise RuntimeError(clusterSeqFile4phmmer + ' is not found or empty!\n')
    jobs4phmmer, outFiles4phmmer = prepare4phmmer(clusterSeqFile4phmmer,
            proteome_files, path_to_hmmsearch_results, nthread)
    if jobs4phmmer:
        phmmerSearch(jobs4phmmer)

    # hmmsearch with the multiple-member clusters (clusters.faa.hmm must be valid)
    if not (os.path.isfile(hmms_file) and os.stat(hmms_file).st_size > 0):
        raise RuntimeError(hmms_file + ' is not found or empty!\n')
    jobs4hmmsearch, outFiles4hmmsearch = prepare4hmmsearch(hmms_file,
            proteome_files, path_to_hmmsearch_results, nthread)
    if jobs4hmmsearch:
        hmmSearch(jobs4hmmsearch)

    # Select significant ones (predictions) from hits returned by HMM search
    hitsFile = outFiles4phmmer + outFiles4hmmsearch
    if not hitsFile:
        print('No hit was returned by HMM search against protein database. ' + datetime.datetime.now().ctime())
        return

    pred.pred({
        'dna_list': dna_list,
        'output': output,
        'path_to_proteome': path_to_proteome,
        'path_to_hmmsearch_results': path_to_hmmsearch_results,
        'hitsFile': hitsFile,
        'removeShortIS' : removeShortIS,
        'nthread': nthread,
        })
    if removeShortIS is False:
        print('Both complete and partial IS elements are reported.')
    else:
        print('Only complete IS elements are reported.')
256 |
--------------------------------------------------------------------------------
/isescan.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # ISEScan version
5 | version = '1.7.3'
6 |
7 | import argparse
8 | import os
9 | import sys
10 | import datetime
11 |
12 | import isPredict
13 |
def isPredictSingle(args):
    """Run isPredict.isPredict on a single sequence file.

    A one-line list file named '<org>_<seqfile name>.list' is written to the
    current working directory, where <org> is the name of the directory
    containing the sequence file. It is passed to isPredict.isPredict and
    removed afterwards.

    args: dict with keys 'seqfile', 'output', 'removeShortIS',
          'translateGenome' and 'nthread'.
    """
    print('ISEScan starts at', datetime.datetime.now().ctime())

    seqfile = args['seqfile']
    output = args['output']
    seqfilename = os.path.basename(seqfile)
    org = os.path.basename(os.path.dirname(seqfile))
    filelist = org + '_' + seqfilename + '.list'
    with open(filelist, 'w') as fp:
        fp.write(seqfile+'\n')

    try:
        isPredict.isPredict(filelist, output, args['removeShortIS'], args['translateGenome'],
                args['nthread'])
    finally:
        # Do not leave the temporary list file behind when prediction fails.
        os.remove(filelist)
    print('ISEScan ends at', datetime.datetime.now().ctime())
29 |
if __name__ == "__main__":
    import textwrap

    # Parse command line arguments
    descriptStr = '''\
        ISEScan is a python pipeline to identify Insertion Sequence elements (both complete and incomplete IS elements) in genome. A typical invocation would be:
        python3 isescan.py --seqfile genome.fna --output results

        - If you want isescan to report only complete IS elements, you need to set command line option --removeShortIS.'''
    parser = argparse.ArgumentParser(prog='isescan', description = textwrap.dedent(descriptStr),
            formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('--version', action='version', version='%(prog)s' + ' ' + version)

    parser.add_argument(
        '--removeShortIS',
        action='store_true',
        help = "Remove incomplete (partial) IS elements which include IS element with length < 400 or single copy IS element without perfect TIR.",
        )

    # store_false: args.no_FragGeneScan is True unless the flag is given, so
    # it can be used directly as the 'translateGenome' switch below.
    parser.add_argument(
        '--no-FragGeneScan',
        action='store_false',
        help = "Use the annotated protein sequences in NCBI GenBank file (.gbk which must be in the same folder with genome sequence file), instead of the protein sequences predicted/translated by FragGeneScan. (Experimental feature!)",
        )

    parser.add_argument(
        '--seqfile',
        required = True,
        default='',
        help = "Sequence file in fasta format, '' by default",
        )

    # Optional, so that the documented default 'results' actually applies;
    # it used to be marked required, which made the default unreachable.
    parser.add_argument(
        '--output',
        required = False,
        default='results',
        help = "Output directory, 'results' by default",
        )

    parser.add_argument(
        '--nthread',
        required = False,
        type = int,
        default = 1,
        help = 'Number of CPU cores used for FragGeneScan and hmmer, 1 by default.')

    args = parser.parse_args()

    args4isPredictSingle = {
        'removeShortIS' : args.removeShortIS,
        'translateGenome' : args.no_FragGeneScan,
        'seqfile': args.seqfile.strip(),
        'output': args.output.strip(),
        'nthread': args.nthread,
        }

    isPredictSingle(args4isPredictSingle)
88 |
--------------------------------------------------------------------------------
/publication/SupplementaryMaterials.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/publication/SupplementaryMaterials.docx
--------------------------------------------------------------------------------
/publication/SupplementaryMaterials.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/publication/SupplementaryMaterials.xlsx
--------------------------------------------------------------------------------
/publication/btx433.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/publication/btx433.pdf
--------------------------------------------------------------------------------
/pyssw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | @package pyssw
6 | @brief Python standalone program for ssw alignment using the C library
7 | Complete-Striped-Smith-Waterman-Library
8 | Biopython module is required for fasta/fastq parsing
9 | @copyright [The MIT licence](http://opensource.org/licenses/MIT)
10 | @author Adrien Leger - 2014
11 | *
12 | *
13 | *
14 | * [Github](https://github.com/a-slide)
15 | * [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
16 | """
17 |
18 | #~~~~~~~GLOBAL IMPORTS~~~~~~~#
19 | # Standard library packages
20 | import optparse
21 | import sys
22 | from time import time
23 | import gzip
24 |
25 | #~~~~~~~MAIN FUNCTION~~~~~~~#
def main (opt):
    """
    Align every query sequence against the subject sequence and write the
    alignments to "result.sam" in the current directory.
    @param opt Options object as returned by optparser()
    """

    print ("Inport subject sequence")
    # Import fasta subject. gzip.open defaults to binary mode, but the
    # sequence parser needs text, hence "rt" for gzipped input.
    if opt.subject.rpartition(".")[2].lower() == "gz":
        subject_handle = gzip.open(opt.subject, "rt")
    else:
        subject_handle = open(opt.subject, "r")
    with subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print ("Inport query sequences and count the number of sequences")
    # Import fasta or fastq queries
    if opt.query.rpartition(".")[2].lower() == "gz":
        nseq = count_seq(opt.query, opt.qtype, True)
        query_handle = gzip.open(opt.query, "rt")
    else:
        nseq = count_seq(opt.query, opt.qtype, False)
        query_handle = open(opt.query, "r")

    # The query handle must stay open while query_gen is consumed in the
    # loop below; the with-block closes it afterwards, also on error.
    with query_handle:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        print("{} contains {} sequences to align".format(opt.query, nseq))
        # Calculate a step list for the progress bar (every 5 %)
        nseq_list = [int(nseq*i/100.0) for i in range(5,101,5)]

        print ("Initialize ssw aligner with the subject sequence")
        # Init the an Aligner object with the reference value
        ssw = Aligner(
            str(subject.seq),
            match=int(opt.match),
            mismatch=int(opt.mismatch),
            gap_open=int(opt.gap_open),
            gap_extend= int(opt.gap_extend),
            report_secondary=False,
            report_cigar=True)

        # Write the header of the SAM file
        with open("result.sam", "w") as f:
            f.write("@HD\tVN:1.0\tSO:unsorted\n")
            f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
            f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
            f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
                opt.match,
                opt.mismatch,
                opt.gap_open,
                opt.gap_extend))
            f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
                opt.min_score,
                opt.min_len))

            print ("Starting alignment of queries against the subject sequence")
            start = time()
            # Align each query along the subject an write result in a SAM file
            i = 0
            for query in query_gen:

                # Find the best alignment
                if opt.reverse:
                    al, orient = find_best_align (ssw, query, float(opt.min_score), int(opt.min_len))
                else:
                    al, orient = ssw.align(str(query.seq), float(opt.min_score), int(opt.min_len)), True

                # If valid match found
                if al:
                    f.write(sam_line(
                        qname=query.id,
                        flag=0 if orient else 16,
                        rname=subject.id,
                        pos=al.ref_begin+1,
                        cigar=al.cigar_string,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
                        tags=["AS:i:{}".format(al.score)]))

                # If no valid match found and -u flag activated (report unaligned)
                elif opt.unaligned:
                    f.write(sam_line(
                        qname=query.id,
                        flag=4,
                        seq=str(query.seq),
                        qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
                # Else = match unreported

                # Progress bar
                i+=1
                if i in nseq_list:
                    frac = i/float(nseq)
                    t = time()-start
                    print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))

            print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
116 |
117 | #~~~~~~~HELPER FUNCTIONS~~~~~~~#
118 |
119 |
def sam_line (qname='*', flag=4, rname='*', pos=0, mapq=0, cigar='*', rnext='*', pnext=0, tlen=0, seq='*', qual='*', tags=None):
    """
    Return a minimal sam line = by default return an undetermined sam line. Check the document
    [SAM Format Specification](http://samtools.sourceforge.net/SAM1.pdf) for a full description.
    @param qname Query template NAME
    @param flag bitwise FLAG
    @param rname Reference sequence NAME of the alignment
    @param pos 1-based leftmost mapping POSition of the first matching base
    @param mapq MAPping Quality
    @param cigar CIGAR string
    @param rnext Reference sequence name of the primary alignment of the mate
    @param pnext 1-based leftmost position of the primary alignment of the mate
    @param tlen signed observed Template LENgth
    @param seq segment SEQuence
    @param qual ASCII of base QUALity plus 33
    @param tags list of optional tags
    @return A Sam alignment line, always terminated by a newline
    """
    fields = [qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual]
    if tags:
        # Every optional field is its own TAB-separated column.
        fields.extend(tags)
    # The line is newline-terminated with or without tags; otherwise
    # consecutive tag-less records would be written onto a single line.
    return "\t".join(str(field) for field in fields) + "\n"
144 |
def find_best_align (ssw, query, min_score, min_len):
    """
    Align the query in both orientations and keep the better alignment
    @param ssw Aligner exposing align(sequence, min_score, min_len)
    @param query Record whose seq supports str() and reverse_complement()
    @param min_score Score filter forwarded to ssw.align
    @param min_len Length filter forwarded to ssw.align
    @return (alignment, True) for the forward strand, (alignment, False) for
    the reverse strand, or (None, None) when neither orientation aligned
    """
    fwd = ssw.align(str(query.seq), min_score, min_len)
    rev = ssw.align(str(query.seq.reverse_complement()), min_score, min_len)

    # ssw.align may return nothing when the result is filtered out, so each
    # orientation has to be checked before the scores are compared.
    if not fwd and not rev:
        return (None, None)
    if not fwd:
        return (rev, False)
    # Forward wins when it is the only hit or when the scores tie.
    if not rev or fwd.score >= rev.score:
        return (fwd, True)
    return (rev, False)
168 |
def count_seq (filename, seq_type="fasta", gziped=False):
    """
    Count the number of sequences in a fasta or a fastq file
    @param filename Path to a valid readable file
    @param seq_type Should be either fasta or fastq. Default fasta
    @param gziped Boolean indicating if the file is gziped or not. Default False
    @return Number of sequences, as an int
    """
    #Standard library import
    import gzip

    # Verify if the file is fasta or fastq type
    assert seq_type in ["fasta", "fastq"], "The file has to be either fastq or fasta format"

    # Text mode for both openers: gzip.open defaults to binary, where a line
    # is bytes and its first item can never compare equal to ">".
    opener = gzip.open if gziped else open
    with opener(filename, "rt") as f:
        # FASTA: every record starts with a ">" header line
        if seq_type == "fasta":
            return sum(1 for line in f if line.startswith(">"))

        # FASTQ No motif to find, but 4 lines correspond to 1 sequence.
        # Floor division keeps the count an int.
        return sum(1 for _ in f) // 4
205 |
def optparser():
    """
    Parse the command line (sys.argv) of pyssw
    @return Options object with the attributes subject, query, qtype, match,
    mismatch, gap_open, gap_extend, min_score, min_len, reverse and unaligned.
    Values given on the command line are strings; untouched defaults keep
    the type they are declared with below.
    Prints the help and exits with status 1 when -s or -q is missing.
    """

    print("Parse command line options")
    # Usage and version strings
    program_name = "pyssw"
    program_version = 0.1
    version_string = "{}\t{}".format(program_name, program_version)
    usage_string = "{}.py -s subject.fasta -q fastq (or fasta) [Facultative options]".format(program_name)
    parser = optparse.OptionParser(usage = usage_string, version = version_string)

    # Define optparser options
    hstr = "Path of the fasta file containing the subject genome sequence. Can be gziped. [REQUIRED] "
    parser.add_option( '-s', '--subject', dest="subject", help=hstr)
    hstr = "Path of the fastq or fasta file containing the short read to be aligned. Can be gziped. [REQUIRED]"
    parser.add_option( '-q', '--query', dest="query", help=hstr)
    hstr = "Type of the query file = fastq or fasta. [default: fastq]"
    parser.add_option( '-t', '--qtype', dest="qtype", default="fastq", help=hstr)
    hstr = "Positive integer for weight match in genome sequence alignment. [default: 2]"
    parser.add_option( '-m', '--match', dest="match",default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight mismatch in genome sequence alignment. [default: 2]"
    parser.add_option( '-x', '--mismatch', dest="mismatch", default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap opening. [default: 3]"
    parser.add_option( '-o', '--gap_open', dest="gap_open", default=3, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap extension. [default: 1]"
    parser.add_option( '-e', '--gap_extend', dest="gap_extend", default=1, help=hstr)
    hstr = "Integer. Consider alignments having a score <= as not aligned. [default: 0]"
    parser.add_option( '-f', '--min_score', dest="min_score", default=0, help=hstr)
    hstr = "Integer. Consider alignments having a length <= as not aligned. [default: 0]"
    parser.add_option( '-l', '--min_len', dest="min_len", default=0, help=hstr)
    hstr = "Flag. Align query in forward and reverse orientation and choose the best alignment. [Set by default]"
    parser.add_option( '-r', '--reverse', dest="reverse", action="store_true", default=True, help=hstr)
    hstr = "Flag. Write unaligned reads in sam output [Unset by default]"
    parser.add_option( '-u', '--unaligned', dest="unaligned", action="store_true", default=False, help=hstr)

    # Parse arg and return a dictionnary_like object of options
    opt, args = parser.parse_args()

    # A missing required option is an error, so exit with a non-zero status
    # that calling shells and pipelines can detect.
    if not opt.subject:
        print ("\nERROR: a subject fasta file has to be provided (-s option)\n")
        parser.print_help()
        sys.exit(1)

    if not opt.query:
        print ("\nERROR: a query fasta or fastq file has to be provided (-q option)\n")
        parser.print_help()
        sys.exit(1)

    return opt
254 |
255 | #~~~~~~~TOP LEVEL INSTRUCTIONS~~~~~~~#
256 |
if __name__ == '__main__':

    # try to import Third party and local packages; both names are used as
    # module globals by main(). A missing package is an error, so the script
    # exits with a non-zero status.
    try:
        from Bio import SeqIO
    except ImportError:
        print ("ERROR: Please install Biopython package")
        sys.exit(1)

    try:
        from ssw_wrap import Aligner
    except ImportError:
        print ("ERROR: Please place ssw_wrap in the current directory or add its dir to python path")
        sys.exit(1)

    # Parse command line arguments
    opt = optparser()
    # Run the main function
    main(opt)
276 |
--------------------------------------------------------------------------------
/ssw201507/Makefile:
--------------------------------------------------------------------------------
# Build rules for the SSW test program, the C/C++ examples, the shared
# library and the optional Java binding.
#
# NOTE(review): the default target needs example.cpp, ssw_cpp.cpp and
# ssw_cpp.h, and the java target needs sswjni.c and ssw/*.java; none of these
# appear in this directory's file listing -- confirm before relying on
# "make" or "make java". "make libssw.so" only needs ssw.c and ssw.h.
CC = gcc
CXX = g++
CFLAGS := -Wall -O3 -pipe #-pg
CXXFLAGS := $(CFLAGS)
LOBJS = ssw.o
LCPPOBJS = ssw_cpp.o
PROG = ssw_test
LIB = libssw.so
EXAMPLE = example_c
EXAMPLE_CPP = example_cpp
JAVA_JAR = ssw.jar
JAVA_LIB = libsswjni.so
# NOTE(review): "INLCUDES" is misspelled, but the name is used consistently
# below, so the build is unaffected.
JAVA_INLCUDES = -I"$(JAVA_HOME)/include" -I"$(JAVA_HOME)/include/linux"
JAVA_OBJ =  ssw/Aligner.class ssw/Alignment.class ssw/Example.class

.PHONY: all default java clean

default: $(PROG) $(EXAMPLE) $(EXAMPLE_CPP) $(LIB)

all: default java

java: $(JAVA_JAR) $(JAVA_LIB)

# Shared library; $< is ssw.c, the first prerequisite.
$(LIB): ssw.c ssw.h
	$(CC) $(CFLAGS) -fPIC -shared -rdynamic -o $@ $<

# The next two rules only add prerequisites; the shared recipe follows.
$(PROG): main.c kseq.h

$(EXAMPLE): example.c

# Link each program from all of its prerequisites except the headers.
$(PROG) $(EXAMPLE): $(LOBJS)
	$(CC) -o $@ $(filter-out %.h,$^) $(CFLAGS) -lm -lz

$(EXAMPLE_CPP): example.cpp $(LOBJS) $(LCPPOBJS)
	$(CXX) -o $@ $^ $(CXXFLAGS) -lm -lz

# JNI library: compiles sswjni.c ($<) together with ssw.c.
$(JAVA_LIB): sswjni.c ssw.c ssw.h
	$(CC) $(CFLAGS) $(JAVA_INLCUDES) -fPIC -shared -rdynamic -o $@ $< ssw.c

$(JAVA_JAR): $(JAVA_OBJ)
	jar cvfe $@ ssw.Example $^

%.class: %.java
	javac $<

ssw.o: ssw.c ssw.h
	$(CC) -c -o $@ $< $(CFLAGS)

ssw_cpp.o: ssw_cpp.cpp ssw_cpp.h ssw.h
	$(CXX) -c -o $@ $< $(CXXFLAGS)

# The leading "-" lets make continue even if rm reports an error.
clean:
	-rm -f $(LOBJS) $(LCPPOBJS) $(PROG) $(LIB) $(EXAMPLE) $(EXAMPLE_CPP) $(JAVA_LIB) $(JAVA_JAR) $(JAVA_OBJ) *~
54 |
--------------------------------------------------------------------------------
/ssw201507/__pycache__/ssw_wrap.cpython-33.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/ssw201507/__pycache__/ssw_wrap.cpython-33.pyc
--------------------------------------------------------------------------------
/ssw201507/example.c:
--------------------------------------------------------------------------------
1 | /* example.c
2 | * This is a simple example to show you how to use the SSW C library.
3 | * To run this example:
4 | * 1) gcc -Wall -lz ssw.c example.c
5 | * 2) ./a.out
6 | * Created by Mengyao Zhao on 07/31/12.
7 | */
8 |
9 | #include
10 | #include
11 | #include
12 | #include "ssw.h"
13 |
//	Print the BLAST like output.
//
//	Writes to stdout: the cigar operations of the alignment, the scores and
//	the begin/end coordinates, then (when a cigar is present) the alignment
//	itself in rows of at most 60 columns. Each row consists of a "Target"
//	line, a line of match markers and a "Query" line.
//
//	a         alignment to print
//	ref_seq   reference (target) sequence as characters
//	read_seq  read (query) sequence as characters
//	table     maps a character to its numeric code; two characters count as
//	          a match when their codes are equal
static void ssw_write (const s_align* a,
			const char* ref_seq,
			const char* read_seq,
			const int8_t* table) {

	// NOTE(review): the "hello" lines look like leftover debug output --
	// confirm whether they are meant to be part of the report.
	fprintf(stdout, "hello, %d\n", a->cigarLen);
	int i;
	fprintf(stdout, "hello\n");
	// Dump the cigar as <length><operation> pairs.
	for(i = 0; i < a->cigarLen; ++i)
	{
		fprintf(stdout, "%d%c",cigar_int_to_len(a->cigar[i]), cigar_int_to_op(a->cigar[i]));
	}
	fprintf(stdout, "\nhello\n");

	fprintf(stdout, "optimal_alignment_score: %d\tsub-optimal_alignment_score: %d\t", a->score1, a->score2);
	// The begin coordinates are printed only when they are not -1.
	if (a->ref_begin1 + 1) fprintf(stdout, "target_begin: %d\t", a->ref_begin1 + 1);
	fprintf(stdout, "target_end: %d\t", a->ref_end1 + 1);
	if (a->read_begin1 + 1) fprintf(stdout, "query_begin: %d\t", a->read_begin1 + 1);
	fprintf(stdout, "query_end: %d\n\n", a->read_end1 + 1);
	if (a->cigar) {
		// e    : index of the cigar operation at which the next row starts
		// left : columns of operation e still to print when it was split
		//        across two rows (0 otherwise)
		// qb/pb: reference / read position at the start of the current row
		int32_t c = 0, left = 0, e = 0, qb = a->ref_begin1, pb = a->read_begin1;
		uint32_t i;	// shadows the outer "int i"
		while (e < a->cigarLen || left > 0) {
			int32_t count = 0;	// columns printed in the current line
			int32_t q = qb;
			int32_t p = pb;
			// Line 1: the reference, with '-' for every inserted column.
			fprintf(stdout, "Target: %8d ", q + 1);
			for (c = e; c < a->cigarLen; ++c) {
				char letter = cigar_int_to_op(a->cigar[c]);
				uint32_t length = cigar_int_to_len(a->cigar[c]);
				// Only the first operation of a row can be a partial one.
				uint32_t l = (count == 0 && left > 0) ? left: length;
				for (i = 0; i < l; ++i) {
					if (letter == 'I') fprintf(stdout, "-");
					else {
						fprintf(stdout, "%c", *(ref_seq + q));
						++ q;
					}
					++ count;
					if (count == 60) goto step2;
				}
			}
step2:
			// Line 2: '|' where both sequences have the same code, '*' for
			// mismatches and for every gap column.
			fprintf(stdout, " %d\n ", q);
			q = qb;
			count = 0;
			for (c = e; c < a->cigarLen; ++c) {
				char letter = cigar_int_to_op(a->cigar[c]);
				uint32_t length = cigar_int_to_len(a->cigar[c]);
				uint32_t l = (count == 0 && left > 0) ? left: length;
				for (i = 0; i < l; ++i){
					if (letter == 'M') {
						if (table[(int)*(ref_seq + q)] == table[(int)*(read_seq + p)])fprintf(stdout, "|");
						else fprintf(stdout, "*");
						++q;
						++p;
					} else {
						fprintf(stdout, "*");
						if (letter == 'I') ++p;
						else ++q;
					}
					++ count;
					if (count == 60) {
						// Row is full: remember where the reference resumes.
						qb = q;
						goto step3;
					}
				}
			}
step3:
			// Line 3: the read, with '-' for every deleted column.
			p = pb;
			fprintf(stdout, "\nQuery: %8d ", p + 1);
			count = 0;
			for (c = e; c < a->cigarLen; ++c) {
				char letter = cigar_int_to_op(a->cigar[c]);
				uint32_t length = cigar_int_to_len(a->cigar[c]);
				uint32_t l = (count == 0 && left > 0) ? left: length;
				for (i = 0; i < l; ++i) {
					if (letter == 'D') fprintf(stdout, "-");
					else {
						fprintf(stdout, "%c", *(read_seq + p));
						++p;
					}
					++ count;
					if (count == 60) {
						// Row is full: record the resume state for the next
						// row. If the current operation is used up, continue
						// with the following one.
						pb = p;
						left = l - i - 1;
						e = (left == 0) ? (c + 1) : c;
						goto end;
					}
				}
			}
			// All operations were printed without filling the row; this
			// makes the while condition false.
			e = c;
			left = 0;
end:
			fprintf(stdout, " %d\n\n", p);
		}
	}
}
112 |
//	Align a pair of genome sequences.
//
//	Self-contained demo: a hard-coded 15-base read is aligned against a
//	hard-coded 39-base reference and the result is printed with ssw_write.
//	argc and argv are not used. Always returns 0.
int main (int argc, char * const argv[]) {
	int32_t l, m, k, match = 2, mismatch = 2, gap_open = 3, gap_extension = 1;	// default parameters for genome sequence alignment
	// reference sequence
	static const char ref_seq[40] = {'A', 'A', 'G', 'C', 'C', 'T', 'T', 'T', 'C', 'T', 'G', 'A', 'C', 'C', 'C', 'G', 'G', 'A', 'A', 'A', 'T',
						'C', 'A', 'A', 'A', 'A', 'T', 'A', 'G', 'G', 'C', 'A', 'C', 'A', 'A', 'C', 'A', 'A', 'A', '\0'};
	static const char read_seq[16] = {'C', 'T', 'G', 'A', 'G', 'C', 'C', 'G', 'G', 'T', 'A', 'A', 'A', 'T', 'C', '\0'};	// read sequence
	s_profile* profile;
	// NOTE(review): the malloc/calloc results below are used unchecked.
	int8_t* num = (int8_t*)malloc(16);	// the read sequence represented in numbers
	int8_t* ref_num = (int8_t*)malloc(64);	// the reference sequence represented in numbers
	s_align* result;

	/* This table is used to transform nucleotide letters into numbers.
	   It is indexed by ASCII code: A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3,
	   everything else -> 4.
	   NOTE(review): U/u (index 85/117) map to 0, the same code as A --
	   confirm that this is intended. */
	static const int8_t nt_table[128] = {
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  3, 0, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4, 4, 4,  3, 0, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
	};

	// initialize scoring matrix for genome sequences
	//  A  C  G  T	N (or other ambiguous code)
	//  2 -2 -2 -2 	0	A
	// -2  2 -2 -2 	0	C
	// -2 -2  2 -2 	0	G
	// -2 -2 -2  2 	0	T
	//	0  0  0  0  0	N (or other ambiguous code)
	int8_t* mat = (int8_t*)calloc(25, sizeof(int8_t));
	// Fill the 5x5 matrix row by row; k is the running index into mat.
	for (l = k = 0; l < 4; ++l) {
		for (m = 0; m < 4; ++m) mat[k++] = l == m ? match : - mismatch;	/* weight_match : -weight_mismatch */
		mat[k++] = 0; // ambiguous base: no penalty
	}
	for (m = 0; m < 5; ++m) mat[k++] = 0;

	// Encode the 15 read bases and the 39 reference bases; the terminating
	// '\0' of either array is not encoded.
	for (m = 0; m < 15; ++m) num[m] = nt_table[(int)read_seq[m]];
	profile = ssw_init(num, 15, mat, 5, 2);
	for (m = 0; m < 39; ++m) ref_num[m] = nt_table[(int)ref_seq[m]];

	// Only the 8 bit of the flag is setted. ssw_align will always return the best alignment beginning position and cigar.
	// NOTE(review): the flag argument passed below is 1 -- check this
	// statement against the ssw_align documentation in ssw.h.
	result = ssw_align (profile, ref_num, 39, gap_open, gap_extension, 1, 0, 0, 15);
	ssw_write(result, ref_seq, read_seq, nt_table);

	// Release everything that was allocated above.
	align_destroy(result);
	init_destroy(profile);
	free(mat);
	free(ref_num);
	free(num);
	return(0);
}
166 |
--------------------------------------------------------------------------------
/ssw201507/kseq.h:
--------------------------------------------------------------------------------
1 | /* The MIT License
2 |
3 | Copyright (c) 2008 Genome Research Ltd (GRL).
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | */
25 |
26 | /* Contact: Heng Li */
27 |
28 | /* Last Modified: 12APR2009 */
29 |
30 | #ifndef AC_KSEQ_H
31 | #define AC_KSEQ_H
32 |
33 | #include
34 | #include
35 | #include
36 |
37 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
38 | #define KS_SEP_TAB 1 // isspace() && !' '
39 | #define KS_SEP_MAX 1
40 |
/* Declares kstream_t, a buffered reader on top of a handle of type type_t.
   buf        read buffer (allocated by ks_init)
   begin,end  buf[begin..end) holds the bytes not yet consumed
   is_eof     set once the underlying read came back short
   f          the wrapped handle */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;

/* True once the end of input was seen and the buffer is drained. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Resets the stream bookkeeping; the handle itself is not touched here. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
50 |
/* Defines ks_init(), which allocates a zeroed kstream_t around handle f
   with a buffer of __bufsize bytes, and ks_destroy(), which frees both and
   accepts NULL.
   NOTE(review): the calloc/malloc results in ks_init are used unchecked. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		ks->f = f; \
		ks->buf = (char*)malloc(__bufsize); \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
66 |
/* Defines ks_getc(), which returns the next byte of the stream or -1 at end
   of input. The buffer is refilled through __read(f, buf, __bufsize) when
   it is drained; a read shorter than __bufsize marks the stream as EOF.
   NOTE(review): buf is a plain char*, so where char is signed a 0xFF byte
   is returned as -1 and cannot be told apart from EOF.
   NOTE(review): a negative (error) return from __read is not handled
   separately from a short read. */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end == 0) return -1; \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
79 |
/* Growable string: l is the current length, m the allocated size of s.
   The guard lets another header supply the same type first. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Rounds x up in place to the next power of two by smearing the highest
   set bit into all lower bits; the shifts cover 32 bits. x must be an
   lvalue because it is modified. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
91 |
/* Defines ks_getuntil(), which reads from the stream into str up to, but
   not including, the next delimiter.
   delimiter  KS_SEP_SPACE (0): stop at any isspace() character
              KS_SEP_TAB (1):   stop at any isspace() character except ' '
              any value > KS_SEP_MAX: stop at that literal character
   str        receives the token, NUL-terminated; grown as needed
   dret       if not NULL, receives the delimiter that ended the token, or
              0 when the input ended first
   Returns the token length, or -1 when the stream was already exhausted.
   NOTE(review): when the token is empty, str->s is replaced by a fresh
   1-byte buffer without freeing a previous allocation.
   NOTE(review): the realloc result is used unchecked, and isspace() gets a
   plain char, which may be negative where char is signed. */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		if (dret) *dret = 0; \
		str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, __bufsize); \
					if (ks->end < __bufsize) ks->is_eof = 1; \
					if (ks->end == 0) break; \
				} else break; \
			} \
			if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < i - ks->begin + 1) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (str->l == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} \
		str->s[str->l] = '\0'; \
		return str->l; \
	}
138 |
/* Instantiates the whole stream layer (kstream_t, ks_init, ks_destroy,
   ks_getc, ks_getuntil) for handle type type_t, read function __read and
   buffer size __bufsize. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
144 |
/* Defines the kseq_t life-cycle helpers:
   kseq_init(fd)   allocates a zeroed kseq_t reading from fd
   kseq_rewind(ks) clears the parser and stream state; the handle itself is
                   not repositioned here
   kseq_destroy(ks) frees the four strings, the stream and the record;
                   accepts NULL
   NOTE(review): the calloc result in kseq_init is used unchecked. */
#define __KSEQ_BASIC(type_t) \
	static inline kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		s->f = ks_init(fd); \
		return s; \
	} \
	static inline void kseq_rewind(kseq_t *ks) \
	{ \
		ks->last_char = 0; \
		ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
	} \
	static inline void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
164 |
/* Defines kseq_read(), which parses the next FASTA or FASTQ record from
   seq->f into seq->name, seq->comment, seq->seq and seq->qual. A record
   starts at a '>' or '@' header character; a '+' line after the sequence
   marks a FASTQ record with a quality string. seq->last_char remembers a
   header character that was already consumed while reading the previous
   record.
   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
   NOTE(review): seq->seq.s is allocated only when a sequence character is
   read, so a record with an empty sequence on a fresh kseq_t appears to
   write through a NULL pointer at "seq->seq.s[seq->seq.l] = 0" -- confirm.
   NOTE(review): the realloc results are used unchecked.
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the first header char has been read */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (isgraph(c)) { /* printable non-space character */ \
				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
					seq->seq.m = seq->seq.l + 2; \
					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
				} \
				seq->seq.s[seq->seq.l++] = (char)c; \
			} \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate enough memory */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return seq->seq.l; \
	}
209 |
/* __KSEQ_TYPE declares kseq_t, the record object filled by kseq_read():
 *   name, comment, seq, qual - kstring_t buffers for the four parts of a record
 *   last_char               - header character ('>' or '@') already consumed
 *                             while reading the previous record, or 0
 *   f                       - the kstream_t the records are read from
 * The type_t argument is not referenced by the expansion.
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
216 |
/* KSEQ_INIT(type_t, __read) instantiates the whole reader for one file type:
 * the stream layer (KSTREAM_INIT, given 4096 - presumably the buffer size,
 * confirm against KSTREAM_INIT), the kseq_t type, the basic helpers from
 * __KSEQ_BASIC, and kseq_read() from __KSEQ_READ.
 * type_t is the file handle type and __read the function used to read from it.
 */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 4096) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ
222 |
223 | #endif
224 |
--------------------------------------------------------------------------------
/ssw201507/license.ssw.txt:
--------------------------------------------------------------------------------
1 | SSW Library: An SIMD Smith-Waterman C/C++/Python/Java Library for Use in Genomic Applications
2 |
3 | License: MIT
4 |
5 | Copyright (c) 2012-2015 Boston College
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8 |
9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12 |
13 |
--------------------------------------------------------------------------------
/ssw201507/main.c:
--------------------------------------------------------------------------------
1 | /* main.c
2 | * Created by Mengyao Zhao on 06/23/11.
3 | * Version 0.1.5
4 | * Last revision by Mengyao Zhao on 06/27/14.
5 | */
6 |
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include "ssw.h"
17 | #include "kseq.h"
18 |
/* Branch-prediction hints: with GCC-compatible compilers LIKELY(x)/UNLIKELY(x)
 * tell the optimizer that x is expected to be true/false via __builtin_expect;
 * with any other compiler they expand to the plain expression.
 */
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
26 |
27 | /*! @function
28 | @abstract Round an integer to the next closest power-2 integer.
29 | @param x integer to be rounded (in place)
30 | @discussion x will be modified.
31 | */
32 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
33 |
34 | KSEQ_INIT(gzFile, gzread)
35 |
/* Return the complement of one nucleotide character.
 * A/C/G/T/U in either case map to the upper-case complement; every other
 * byte, including non-ASCII bytes (>= 0x80), maps to the ambiguous code 4.
 */
static char rc_base(char base) {
	static const int8_t rc_table[128] = {
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  4,  4,  4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 84, 4, 71, 4,  4,  4, 67, 4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  65, 65, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
		4, 84, 4, 71, 4,  4,  4, 67, 4, 4, 4, 4,  4, 4, 4, 4,
		4, 4,  4, 4,  65, 65, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
	};
	/* Index through unsigned char: a plain (signed) char >= 0x80 would
	   otherwise become a negative index and read outside the table. */
	unsigned char idx = (unsigned char)base;
	return (char)(idx < 128 ? rc_table[idx] : 4);
}

/* Write the reverse complement of the NUL-terminated string seq into rc.
 * rc must have room for strlen(seq) + 1 bytes and must not overlap seq.
 */
static void reverse_comple(const char* seq, char* rc) {
	int32_t end = strlen(seq), start = 0;
	rc[end] = '\0';
	-- end;
	/* Walk inwards from both ends, writing the complemented pair swapped. */
	while (start < end) {
		rc[start] = rc_base(seq[end]);
		rc[end] = rc_base(seq[start]);
		++ start;
		-- end;
	}
	/* Odd length: the middle base is complemented in place. */
	if (start == end) rc[start] = rc_base(seq[start]);
}
58 |
/* Print one alignment result to stdout, either as a Blast-like report
 * (sam == 0) or as one SAM record (sam != 0).
 *
 *   a        alignment to print (scores, begin/end coordinates, cigar)
 *   ref_seq  reference record; its name and sequence text are printed
 *   read     query record; its name and quality string are printed
 *   read_seq query bases in the orientation that was aligned
 *   table    letter -> number table, used to decide match vs. mismatch
 *   strand   0: forward aligned; 1: reverse complement aligned
 *   sam      0: Blast like output; 1: Sam format output
 */
static void ssw_write (const s_align* a,
			const kseq_t* ref_seq,
			const kseq_t* read,
			const char* read_seq, // strand == 0: original read; strand == 1: reverse complement read
			const int8_t* table,
			int8_t strand, // 0: forward aligned ; 1: reverse complement aligned
			int8_t sam) { // 0: Blast like output; 1: Sam format output

	if (sam == 0) { // Blast like output
		fprintf(stdout, "target_name: %s\nquery_name: %s\noptimal_alignment_score: %d\t", ref_seq->name.s, read->name.s, a->score1);
		if (a->score2 > 0) fprintf(stdout, "suboptimal_alignment_score: %d\t", a->score2);
		if (strand == 0) fprintf(stdout, "strand: +\t");
		else fprintf(stdout, "strand: -\t");
		// Begin coordinates are skipped when they equal -1 (presumably "not computed" - confirm in ssw.h).
		if (a->ref_begin1 + 1) fprintf(stdout, "target_begin: %d\t", a->ref_begin1 + 1);
		fprintf(stdout, "target_end: %d\t", a->ref_end1 + 1);
		if (a->read_begin1 + 1) fprintf(stdout, "query_begin: %d\t", a->read_begin1 + 1);
		fprintf(stdout, "query_end: %d\n\n", a->read_end1 + 1);
		if (a->cigar) {
			// The alignment is printed in chunks of 60 columns. For each chunk the cigar is
			// walked three times (target line, match line, query line), always starting at
			// cigar op index e; left is the unprinted remainder of op e from the previous chunk.
			// qb/pb are the reference/read offsets at which the current chunk starts.
			int32_t c = 0, left = 0, e = 0, qb = a->ref_begin1, pb = a->read_begin1;
			uint32_t i;
			while (e < a->cigarLen || left > 0) {
				int32_t count = 0;
				int32_t q = qb;
				int32_t p = pb;
				fprintf(stdout, "Target: %8d ", q + 1);
				// Pass 1: reference line; an insertion consumes no reference base and prints '-'.
				for (c = e; c < a->cigarLen; ++c) {
					char letter = cigar_int_to_op(a->cigar[c]);
					uint32_t length = cigar_int_to_len(a->cigar[c]);
					uint32_t l = (count == 0 && left > 0) ? left: length;
					for (i = 0; i < l; ++i) {
						if (letter == 'I') fprintf(stdout, "-");
						else {
							fprintf(stdout, "%c", *(ref_seq->seq.s + q));
							++ q;
						}
						++ count;
						if (count == 60) goto step2;
					}
				}
step2:
				fprintf(stdout, " %d\n ", q);
				q = qb;
				count = 0;
				// Pass 2: marker line; '|' for equal encoded letters, '*' for a mismatch, ' ' for a gap.
				for (c = e; c < a->cigarLen; ++c) {
					char letter = cigar_int_to_op(a->cigar[c]);
					uint32_t length = cigar_int_to_len(a->cigar[c]);
					uint32_t l = (count == 0 && left > 0) ? left: length;
					for (i = 0; i < l; ++i){
						if (letter == 'M') {
							if (table[(int)*(ref_seq->seq.s + q)] == table[(int)*(read_seq + p)])fprintf(stdout, "|");
							else fprintf(stdout, "*");
							++q;
							++p;
						} else {
							fprintf(stdout, " ");
							if (letter == 'I') ++p;
							else ++q;
						}
						++ count;
						if (count == 60) {
							qb = q; // next chunk starts here on the reference
							goto step3;
						}
					}
				}
step3:
				p = pb;
				fprintf(stdout, "\nQuery: %8d ", p + 1);
				count = 0;
				// Pass 3: query line; a deletion consumes no read base and prints '-'.
				// This pass also advances the chunk state (pb, left, e) for the next iteration.
				for (c = e; c < a->cigarLen; ++c) {
					char letter = cigar_int_to_op(a->cigar[c]);
					uint32_t length = cigar_int_to_len(a->cigar[c]);
					uint32_t l = (count == 0 && left > 0) ? left: length;
					for (i = 0; i < l; ++i) {
						if (letter == 'D') fprintf(stdout, "-");
						else {
							fprintf(stdout, "%c", *(read_seq + p));
							++p;
						}
						++ count;
						if (count == 60) {
							pb = p;
							left = l - i - 1;
							e = (left == 0) ? (c + 1) : c;
							goto end;
						}
					}
				}
				// The whole cigar was printed without filling a chunk: terminate the outer loop.
				e = c;
				left = 0;
end:
				fprintf(stdout, " %d\n\n", p);
			}
		}
	}else { // Sam format output
		fprintf(stdout, "%s\t", read->name.s);
		// A zero score is reported as an unmapped record (flag 4).
		if (a->score1 == 0) fprintf(stdout, "4\t*\t0\t255\t*\t*\t0\t0\t*\t*\n");
		else {
			int32_t c, l = a->read_end1 - a->read_begin1 + 1, qb = a->ref_begin1, pb = a->read_begin1, p;
			// NOTE(review): when score2 == 0 the log argument is 0, so the product is infinite and
			// its conversion to uint32_t is not well defined - confirm the intended MAPQ for that case.
			uint32_t mapq = -4.343 * log(1 - (double)abs(a->score1 - a->score2)/(double)a->score1);
			mapq = (uint32_t) (mapq + 4.99);
			mapq = mapq < 254 ? mapq : 254;
			if (strand) fprintf(stdout, "16\t");
			else fprintf(stdout, "0\t");
			fprintf(stdout, "%s\t%d\t%d\t", ref_seq->name.s, a->ref_begin1 + 1, mapq);
			for (c = 0; c < a->cigarLen; ++c) {
				char letter = cigar_int_to_op(a->cigar[c]);
				uint32_t length = cigar_int_to_len(a->cigar[c]);
				fprintf(stdout, "%lu%c", (unsigned long)length, letter);
			}
			fprintf(stdout, "\t*\t0\t0\t");
			// Only the aligned part of the read is written as SEQ.
			for (c = a->read_begin1; c <= a->read_end1; ++c) fprintf(stdout, "%c", read_seq[c]);
			fprintf(stdout, "\t");
			if (read->qual.s && strand) {
				// Reverse strand: qualities are emitted from read_end1 backwards.
				p = a->read_end1;
				for (c = 0; c < l; ++c) {
					fprintf(stdout, "%c", read->qual.s[p]);
					--p;
				}
			}else if (read->qual.s){
				p = a->read_begin1;
				for (c = 0; c < l; ++c) {
					fprintf(stdout, "%c", read->qual.s[p]);
					++p;
				}
			} else fprintf(stdout, "*");
			fprintf(stdout, "\tAS:i:%d", a->score1);
			// mapq is reused below as the NM counter: mismatches in M runs plus all I/D lengths.
			mapq = 0;	// counter of difference
			for (c = 0; c < a->cigarLen; ++c) {
				char letter = cigar_int_to_op(a->cigar[c]);
				uint32_t length = cigar_int_to_len(a->cigar[c]);
				if (letter == 'M') {
					for (p = 0; p < length; ++p){
						if (table[(int)*(ref_seq->seq.s + qb)] != table[(int)*(read_seq + pb)]) ++mapq;
						++qb;
						++pb;
					}
				} else if (letter == 'I') {
					pb += length;
					mapq += length;
				} else {
					qb += length;
					mapq += length;
				}
			}
			fprintf(stdout,"\tNM:i:%d\t", mapq);
			if (a->score2 > 0) fprintf(stdout, "ZS:i:%d\n", a->score2);
			else fprintf(stdout, "\n");
		}
	}
}
210 |
211 | int main (int argc, char * const argv[]) {
212 | clock_t start, end;
213 | float cpu_time;
214 | gzFile read_fp, ref_fp;
215 | kseq_t *read_seq, *ref_seq;
216 | int32_t l, m, k, match = 2, mismatch = 2, gap_open = 3, gap_extension = 1, path = 0, reverse = 0, n = 5, sam = 0, protein = 0, header = 0, s1 = 67108864, s2 = 128, filter = 0;
217 | int8_t* mata = (int8_t*)calloc(25, sizeof(int8_t));
218 | const int8_t* mat = mata;
219 | char mat_name[16];
220 | mat_name[0] = '\0';
221 | int8_t* ref_num = (int8_t*)malloc(s1);
222 | int8_t* num = (int8_t*)malloc(s2), *num_rc = 0;
223 | char* read_rc = 0;
224 |
225 | static const int8_t mat50[] = {
226 | // A R N D C Q E G H I L K M F P S T W Y V B Z X *
227 | 5, -2, -1, -2, -1, -1, -1, 0, -2, -1, -2, -1, -1, -3, -1, 1, 0, -3, -2, 0, -2, -1, -1, -5, // A
228 | -2, 7, -1, -2, -4, 1, 0, -3, 0, -4, -3, 3, -2, -3, -3, -1, -1, -3, -1, -3, -1, 0, -1, -5, // R
229 | -1, -1, 7, 2, -2, 0, 0, 0, 1, -3, -4, 0, -2, -4, -2, 1, 0, -4, -2, -3, 5, 0, -1, -5, // N
230 | -2, -2, 2, 8, -4, 0, 2, -1, -1, -4, -4, -1, -4, -5, -1, 0, -1, -5, -3, -4, 6, 1, -1, -5, // D
231 | -1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1, -1, -5, -3, -1, -3, -3, -1, -5, // C
232 | -1, 1, 0, 0, -3, 7, 2, -2, 1, -3, -2, 2, 0, -4, -1, 0, -1, -1, -1, -3, 0, 4, -1, -5, // Q
233 | -1, 0, 0, 2, -3, 2, 6, -3, 0, -4, -3, 1, -2, -3, -1, -1, -1, -3, -2, -3, 1, 5, -1, -5, // E
234 | 0, -3, 0, -1, -3, -2, -3, 8, -2, -4, -4, -2, -3, -4, -2, 0, -2, -3, -3, -4, -1, -2, -1, -5, // G
235 | -2, 0, 1, -1, -3, 1, 0, -2, 10, -4, -3, 0, -1, -1, -2, -1, -2, -3, 2, -4, 0, 0, -1, -5, // H
236 | -1, -4, -3, -4, -2, -3, -4, -4, -4, 5, 2, -3, 2, 0, -3, -3, -1, -3, -1, 4, -4, -3, -1, -5, // I
237 | -2, -3, -4, -4, -2, -2, -3, -4, -3, 2, 5, -3, 3, 1, -4, -3, -1, -2, -1, 1, -4, -3, -1, -5, // L
238 | -1, 3, 0, -1, -3, 2, 1, -2, 0, -3, -3, 6, -2, -4, -1, 0, -1, -3, -2, -3, 0, 1, -1, -5, // K
239 | -1, -2, -2, -4, -2, 0, -2, -3, -1, 2, 3, -2, 7, 0, -3, -2, -1, -1, 0, 1, -3, -1, -1, -5, // M
240 | -3, -3, -4, -5, -2, -4, -3, -4, -1, 0, 1, -4, 0, 8, -4, -3, -2, 1, 4, -1, -4, -4, -1, -5, // F
241 | -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -4, -1, -3, -4, 10, -1, -1, -4, -3, -3, -2, -1, -1, -5, // P
242 | 1, -1, 1, 0, -1, 0, -1, 0, -1, -3, -3, 0, -2, -3, -1, 5, 2, -4, -2, -2, 0, 0, -1, -5, // S
243 | 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 2, 5, -3, -2, 0, 0, -1, -1, -5, // T
244 | -3, -3, -4, -5, -5, -1, -3, -3, -3, -3, -2, -3, -1, 1, -4, -4, -3, 15, 2, -3, -5, -2, -1, -5, // W
245 | -2, -1, -2, -3, -3, -1, -2, -3, 2, -1, -1, -2, 0, 4, -3, -2, -2, 2, 8, -1, -3, -2, -1, -5, // Y
246 | 0, -3, -3, -4, -1, -3, -3, -4, -4, 4, 1, -3, 1, -1, -3, -2, 0, -3, -1, 5, -3, -3, -1, -5, // V
247 | -2, -1, 5, 6, -3, 0, 1, -1, 0, -4, -4, 0, -3, -4, -2, 0, 0, -5, -3, -3, 6, 1, -1, -5, // B
248 | -1, 0, 0, 1, -3, 4, 5, -2, 0, -3, -3, 1, -1, -4, -1, 0, -1, -2, -2, -3, 1, 5, -1, -5, // Z
249 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -5, // X
250 | -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 1 // *
251 | };
252 |
253 | /* This table is used to transform amino acid letters into numbers. */
254 | int8_t aa_table[128] = {
255 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
256 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
257 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
258 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
259 | 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23,
260 | 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23,
261 | 23, 0, 20, 4, 3, 6, 13, 7, 8, 9, 23, 11, 10, 12, 2, 23,
262 | 14, 5, 1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23
263 | };
264 |
265 | /* This table is used to transform nucleotide letters into numbers. */
266 | int8_t nt_table[128] = {
267 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
268 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
269 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
270 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
271 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
272 | 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
273 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
274 | 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
275 | };
276 |
277 | int8_t* table = nt_table;
278 |
279 | // Parse command line.
280 | while ((l = getopt(argc, argv, "m:x:o:e:a:f:pcrsh")) >= 0) {
281 | switch (l) {
282 | case 'm': match = atoi(optarg); break;
283 | case 'x': mismatch = atoi(optarg); break;
284 | case 'o': gap_open = atoi(optarg); break;
285 | case 'e': gap_extension = atoi(optarg); break;
286 | case 'a': strcpy(mat_name, optarg); break;
287 | case 'f': filter = atoi(optarg); break;
288 | case 'p': protein = 1; break;
289 | case 'c': path = 1; break;
290 | case 'r': reverse = 1; break;
291 | case 's': sam = 1; break;
292 | case 'h': header = 1; break;
293 | }
294 | }
295 | if (optind + 2 > argc) {
296 | fprintf(stderr, "\n");
297 | fprintf(stderr, "Usage: ssw_test [options] ... (or )\n");
298 | fprintf(stderr, "Options:\n");
299 | fprintf(stderr, "\t-m N\tN is a positive integer for weight match in genome sequence alignment. [default: 2]\n");
300 | fprintf(stderr, "\t-x N\tN is a positive integer. -N will be used as weight mismatch in genome sequence alignment. [default: 2]\n");
301 | fprintf(stderr, "\t-o N\tN is a positive integer. -N will be used as the weight for the gap opening. [default: 3]\n");
302 | fprintf(stderr, "\t-e N\tN is a positive integer. -N will be used as the weight for the gap extension. [default: 1]\n");
303 | fprintf(stderr, "\t-p\tDo protein sequence alignment. Without this option, the ssw_test will do genome sequence alignment.\n");
304 | fprintf(stderr, "\t-a FILE\tFILE is either the Blosum or Pam weight matrix. [default: Blosum50]\n");
305 | fprintf(stderr, "\t-c\tReturn the alignment path.\n");
306 | fprintf(stderr, "\t-f N\tN is a positive integer. Only output the alignments with the Smith-Waterman score >= N.\n");
307 | fprintf(stderr, "\t-r\tThe best alignment will be picked between the original read alignment and the reverse complement read alignment.\n");
308 | fprintf(stderr, "\t-s\tOutput in SAM format. [default: no header]\n");
309 | fprintf(stderr, "\t-h\tIf -s is used, include header in SAM output.\n\n");
310 | return 1;
311 | }
312 |
313 | // initialize scoring matrix for genome sequences
314 | for (l = k = 0; LIKELY(l < 4); ++l) {
315 | for (m = 0; LIKELY(m < 4); ++m) mata[k++] = l == m ? match : -mismatch; /* weight_match : -weight_mismatch */
316 | mata[k++] = 0; // ambiguous base
317 | }
318 | for (m = 0; LIKELY(m < 5); ++m) mata[k++] = 0;
319 |
320 | if (protein == 1 && (! strcmp(mat_name, "\0"))) {
321 | n = 24;
322 | table = aa_table;
323 | mat = mat50;
324 | } else if (strcmp(mat_name, "\0")) {
325 |
326 | // Parse score matrix.
327 | FILE *f_mat = fopen(mat_name, "r");
328 | char line[128];
329 | mata = (int8_t*)realloc(mata, 1024 * sizeof(int8_t));
330 | k = 0;
331 | m = 0;
332 | while (fgets(line, 128, f_mat)) {
333 | if (line[0] == '*' || (line[0] >= 'A' && line[0] <= 'Z')) {
334 | if (line[0] >= 'A' && line[0] <= 'Z') aa_table[(int)line[0]] = aa_table[(int)line[0] + 32] = m;
335 | char str[4], *s = str;
336 | str[0] = '\0';
337 | l = 1;
338 | while (line[l]) {
339 | if ((line[l] >= '0' && line[l] <= '9') || line[l] == '-') *s++ = line[l];
340 | else if (str[0] != '\0') {
341 | *s = '\0';
342 | mata[k++] = (int8_t)atoi(str);
343 | s = str;
344 | str[0] = '\0';
345 | }
346 | ++l;
347 | }
348 | if (str[0] != '\0') {
349 | *s = '\0';
350 | mata[k++] = (int8_t)atoi(str);
351 | s = str;
352 | str[0] = '\0';
353 | }
354 | ++m;
355 | }
356 | }
357 | if (k == 0) {
358 | fprintf(stderr, "Problem of reading the weight matrix file.\n");
359 | return 1;
360 | }
361 | fclose(f_mat);
362 | n = m;
363 | table = aa_table;
364 | mat = mata;
365 | }
366 |
367 | read_fp = gzopen(argv[optind + 1], "r");
368 | read_seq = kseq_init(read_fp);
369 | if (sam && header && path) {
370 | fprintf(stdout, "@HD\tVN:1.4\tSO:queryname\n");
371 | ref_fp = gzopen(argv[optind], "r");
372 | ref_seq = kseq_init(ref_fp);
373 | while ((l = kseq_read(ref_seq)) >= 0) fprintf(stdout, "@SQ\tSN:%s\tLN:%d\n", ref_seq->name.s, (int32_t)ref_seq->seq.l);
374 | kseq_destroy(ref_seq);
375 | gzclose(ref_fp);
376 | } else if (sam && !path) {
377 | fprintf(stderr, "SAM format output is only available together with option -c.\n");
378 | sam = 0;
379 | }
380 |
381 | // alignment
382 | if (reverse == 1 && n == 5) {
383 | read_rc = (char*)malloc(s2);
384 | num_rc = (int8_t*)malloc(s2);
385 | }
386 | start = clock();
387 | while (kseq_read(read_seq) >= 0) {
388 | s_profile* p, *p_rc = 0;
389 | int32_t readLen = read_seq->seq.l;
390 | int32_t maskLen = readLen / 2;
391 | // int32_t maskLen = 2*readLen;
392 |
393 | while (readLen >= s2) {
394 | ++s2;
395 | kroundup32(s2);
396 | num = (int8_t*)realloc(num, s2);
397 | if (reverse == 1 && n == 5) {
398 | read_rc = (char*)realloc(read_rc, s2);
399 | num_rc = (int8_t*)realloc(num_rc, s2);
400 | }
401 | }
402 | for (m = 0; m < readLen; ++m) num[m] = table[(int)read_seq->seq.s[m]];
403 | p = ssw_init(num, readLen, mat, n, 2);
404 | if (reverse == 1 && n == 5) {
405 | reverse_comple(read_seq->seq.s, read_rc);
406 | for (m = 0; m < readLen; ++m) num_rc[m] = table[(int)read_rc[m]];
407 | p_rc = ssw_init(num_rc, readLen, mat, n, 2);
408 | }else if (reverse == 1 && n == 24) {
409 | fprintf (stderr, "Reverse complement alignment is not available for protein sequences. \n");
410 | return 1;
411 | }
412 |
413 | ref_fp = gzopen(argv[optind], "r");
414 | ref_seq = kseq_init(ref_fp);
415 | while (kseq_read(ref_seq) >= 0) {
416 | s_align* result, *result_rc = 0;
417 | int32_t refLen = ref_seq->seq.l;
418 | int8_t flag = 0;
419 | while (refLen > s1) {
420 | ++s1;
421 | kroundup32(s1);
422 | ref_num = (int8_t*)realloc(ref_num, s1);
423 | }
424 | for (m = 0; m < refLen; ++m) ref_num[m] = table[(int)ref_seq->seq.s[m]];
425 | if (path == 1) flag = 2;
426 | result = ssw_align (p, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen);
427 | if (reverse == 1 && protein == 0)
428 | result_rc = ssw_align(p_rc, ref_num, refLen, gap_open, gap_extension, flag, filter, 0, maskLen);
429 | if (result_rc && result_rc->score1 > result->score1 && result_rc->score1 >= filter) {
430 | if (sam) ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 1);
431 | else ssw_write (result_rc, ref_seq, read_seq, read_rc, table, 1, 0);
432 | }else if (result && result->score1 >= filter){
433 | if (sam) ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 1);
434 | else ssw_write(result, ref_seq, read_seq, read_seq->seq.s, table, 0, 0);
435 | } else if (! result) return 1;
436 | if (result_rc) align_destroy(result_rc);
437 | align_destroy(result);
438 | }
439 |
440 | if(p_rc) init_destroy(p_rc);
441 | init_destroy(p);
442 | kseq_destroy(ref_seq);
443 | gzclose(ref_fp);
444 | }
445 | end = clock();
446 | cpu_time = ((float) (end - start)) / CLOCKS_PER_SEC;
447 | fprintf(stderr, "CPU time: %f seconds\n", cpu_time);
448 |
449 | if (num_rc) {
450 | free(num_rc);
451 | free(read_rc);
452 | }
453 | kseq_destroy(read_seq);
454 | gzclose(read_fp);
455 | free(num);
456 | free(ref_num);
457 | free(mata);
458 | return 0;
459 | }
460 |
--------------------------------------------------------------------------------
/ssw201507/pyssw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | @package pyssw
6 | @brief Python standalone program for ssw alignment using the C library
7 | Complete-Striped-Smith-Waterman-Library
8 | Biopython module is required for fasta/fastq parsing
9 | @copyright [The MIT licence](http://opensource.org/licenses/MIT)
10 | @author Adrien Leger - 2014
11 | *
12 | *
13 | *
14 | * [Github](https://github.com/a-slide)
15 | * [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
16 | """
17 |
18 | #~~~~~~~GLOBAL IMPORTS~~~~~~~#
19 | # Standard library packages
20 | import optparse
21 | import sys
22 | from time import time
23 | import gzip
24 |
25 | #~~~~~~~MAIN FUNCTION~~~~~~~#
def _open_text (path):
    """
    Open a sequence file for text reading; a ".gz" extension selects gzip.
    The gzip branch uses text mode ("rt") so the parser receives str lines.
    """
    if path.rpartition(".")[2].lower() == "gz":
        return gzip.open(path, "rt")
    return open(path, "r")

def main (opt):
    """
    Align every query sequence against the subject sequence and write the
    alignments to "result.sam" in the current directory.
    @param opt Options object returned by optparser() (subject, query, qtype,
    match, mismatch, gap_open, gap_extend, min_score, min_len, reverse, unaligned)
    """

    print ("Inport subject sequence")
    # Import fasta subject; the handle is closed as soon as the record is read
    with _open_text(opt.subject) as subject_handle:
        subject = SeqIO.read(subject_handle, "fasta")

    print ("Inport query sequences and count the number of sequences")
    # Count the queries first (separate pass over the file), then open it for parsing
    gziped = opt.query.rpartition(".")[2].lower() == "gz"
    nseq = count_seq(opt.query, opt.qtype, gziped)

    print("{} contains {} sequences to align".format(opt.query, nseq))
    # Calculate a step list for the progress bar
    nseq_list = [int(nseq*i/100.0) for i in range(5,101,5)]

    print ("Initialize ssw aligner with the subject sequence")
    # Init the an Aligner object with the reference value
    ssw = Aligner(
        str(subject.seq),
        match=int(opt.match),
        mismatch=int(opt.mismatch),
        gap_open=int(opt.gap_open),
        gap_extend= int(opt.gap_extend),
        report_secondary=False,
        report_cigar=True)

    # Both files stay open for the whole alignment loop and are closed on any exit path
    with _open_text(opt.query) as query_handle, open("result.sam", "w") as f:
        query_gen = SeqIO.parse(query_handle, opt.qtype)

        # Write the header of the SAM file
        f.write("@HD\tVN:1.0\tSO:unsorted\n")
        f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
        f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
        f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
            opt.match,
            opt.mismatch,
            opt.gap_open,
            opt.gap_extend))
        f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
            opt.min_score,
            opt.min_len))

        print ("Starting alignment of queries against the subject sequence")
        start = time()
        # Align each query along the subject an write result in a SAM file
        i = 0
        for query in query_gen:

            # Find the best alignment
            if opt.reverse:
                al, orient = find_best_align (ssw, query, float(opt.min_score), int(opt.min_len))
            else:
                al, orient = ssw.align(str(query.seq), float(opt.min_score), int(opt.min_len)), True

            # If valid match found
            if al:
                f.write(sam_line(
                    qname=query.id,
                    flag=0 if orient else 16,
                    rname=subject.id,
                    pos=al.ref_begin+1,
                    cigar=al.cigar_string,
                    seq=str(query.seq),
                    qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
                    tags=["AS:i:{}".format(al.score)]))

            # If no valid match found and -u flag activated (report unaligned)
            elif opt.unaligned:
                f.write(sam_line(
                    qname=query.id,
                    flag=4,
                    seq=str(query.seq),
                    qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
            # Else = match unreported

            # Progress bar
            i+=1
            if i in nseq_list:
                frac = i/float(nseq)
                t = time()-start
                print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))

    print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
116 |
117 | #~~~~~~~HELPER FUNCTIONS~~~~~~~#
118 |
119 |
def sam_line (qname='*', flag=4, rname='*', pos=0, mapq=0, cigar='*', rnext='*', pnext=0, tlen=0, seq='*', qual='*', tags=None):
    """
    Return a minimal sam line = by default return an undetermined sam line. Check the document
    [SAM Format Specification](http://samtools.sourceforge.net/SAM1.pdf) for a full description.
    The line is always newline-terminated and every field, including each
    optional tag, is separated by a TAB.
    @param qname Query template NAME
    @param flag bitwise FLAG
    @param rname Reference sequence NAME of the alignment
    @param pos 1-based leftmost mapping POSition of the first matching base
    @param mapq MAPping Quality
    @param cigar CIGAR string
    @param rnext Reference sequence name of the primary alignment of the mate
    @param pnext 1-based leftmost position of the primary alignment of the mate
    @param tlen signed observed Template LENgth
    @param seq segment SEQuence
    @param qual ASCII of base QUALity plus 33
    @param tags list of optional tags, each already formatted as TAG:TYPE:VALUE
    @return A Sam alignment line
    """
    # The 11 mandatory fields, in SAM column order
    fields = [qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual]
    if tags:
        fields.extend(tags)
    return "\t".join(str(field) for field in fields) + "\n"
144 |
def find_best_align (ssw, query, min_score, min_len):
    """
    Align the query in both orientations and keep the better alignment.
    @param ssw Aligner-like object exposing align(sequence, min_score, min_len)
    @param query Sequence record whose seq supports str() and reverse_complement()
    @param min_score Score filter forwarded to ssw.align
    @param min_len Length filter forwarded to ssw.align
    @return Tuple (alignment, is_forward); (None, None) when neither orientation
    gave a result. A tie in score is resolved in favour of the forward alignment.
    """
    fwd = ssw.align(str(query.seq), min_score, min_len)
    rev = ssw.align(str(query.seq.reverse_complement()), min_score, min_len)

    # Both orientations aligned: compare the scores
    if fwd and rev:
        return (fwd, True) if fwd.score >= rev.score else (rev, False)

    # At most one orientation survived the filtering done by ssw_wrap
    if fwd:
        return (fwd, True)
    if rev:
        return (rev, False)
    return (None, None)
168 |
def count_seq (filename, seq_type="fasta", gziped=False):
    """
    Count the number of sequences in a fasta or a fastq file
    @param filename Path to a valid readable file
    @param seq_type Should be either fasta or fastq. Default fasta
    @param gziped Boolean indicating if the file is gziped or not. Default False
    @return Number of sequences as an int. For fastq this is the number of
    complete 4-line records
    """
    # Standard library import
    import gzip

    # Verify if the file is fasta or fastq type
    assert seq_type in ["fasta", "fastq"], "The file has to be either fastq or fasta format"

    # Open the file in binary mode in both cases, so that lines are bytes
    # whether or not the file is compressed and no decoding is needed to count
    opener = gzip.open if gziped else open
    f = opener(filename, "rb")
    try:
        # FASTA Each header line starts with ">" = one sequence per header
        if seq_type == "fasta":
            return sum(1 for line in f if line.startswith(b">"))

        # FASTQ No motif to find, but 4 lines correspond to 1 sequence
        return sum(1 for line in f) // 4
    finally:
        f.close()
205 |
def optparser():
    """
    Parse the command line (sys.argv) and return the options object.
    Option values given on the command line are kept as strings; defaults keep
    the type they are declared with below.
    Exits with status 1, after printing the help, when -s or -q is missing.
    @return optparse Values object with attributes subject, query, qtype, match,
    mismatch, gap_open, gap_extend, min_score, min_len, reverse, unaligned
    """

    print("Parse command line options")
    # Usage and version strings
    program_name = "pyssw"
    program_version = 0.1
    version_string = "{}\t{}".format(program_name, program_version)
    usage_string = "{}.py -s subject.fasta -q fastq (or fasta) [Facultative options]".format(program_name)
    parser = optparse.OptionParser(usage = usage_string, version = version_string)

    # Define optparser options
    hstr = "Path of the fasta file containing the subject genome sequence. Can be gziped. [REQUIRED] "
    parser.add_option( '-s', '--subject', dest="subject", help=hstr)
    hstr = "Path of the fastq or fasta file containing the short read to be aligned. Can be gziped. [REQUIRED]"
    parser.add_option( '-q', '--query', dest="query", help=hstr)
    hstr = "Type of the query file = fastq or fasta. [default: fastq]"
    parser.add_option( '-t', '--qtype', dest="qtype", default="fastq", help=hstr)
    hstr = "Positive integer for weight match in genome sequence alignment. [default: 2]"
    parser.add_option( '-m', '--match', dest="match",default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight mismatch in genome sequence alignment. [default: 2]"
    parser.add_option( '-x', '--mismatch', dest="mismatch", default=2, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap opening. [default: 3]"
    parser.add_option( '-o', '--gap_open', dest="gap_open", default=3, help=hstr)
    hstr = "Positive integer. The negative value will be used as weight for the gap extension. [default: 1]"
    parser.add_option( '-e', '--gap_extend', dest="gap_extend", default=1, help=hstr)
    hstr = "Integer. Consider alignments having a score <= as not aligned. [default: 0]"
    parser.add_option( '-f', '--min_score', dest="min_score", default=0, help=hstr)
    hstr = "Integer. Consider alignments having a length <= as not aligned. [default: 0]"
    parser.add_option( '-l', '--min_len', dest="min_len", default=0, help=hstr)
    # NOTE: store_true with default=True means this flag cannot be switched off from the command line
    hstr = "Flag. Align query in forward and reverse orientation and choose the best alignment. [Set by default]"
    parser.add_option( '-r', '--reverse', dest="reverse", action="store_true", default=True, help=hstr)
    hstr = "Flag. Write unaligned reads in sam output [Unset by default]"
    parser.add_option( '-u', '--unaligned', dest="unaligned", action="store_true", default=False, help=hstr)

    # Parse arg and return a dictionnary_like object of options
    opt, args = parser.parse_args()

    # A missing required option is an error: exit with a non-zero status
    if not opt.subject:
        print ("\nERROR: a subject fasta file has to be provided (-s option)\n")
        parser.print_help()
        sys.exit(1)

    if not opt.query:
        print ("\nERROR: a query fasta or fastq file has to be provided (-q option)\n")
        parser.print_help()
        sys.exit(1)

    return opt
254 |
255 | #~~~~~~~TOP LEVEL INSTRUCTIONS~~~~~~~#
256 |
# Script entry point: the third-party and local imports are done here, not at
# the top of the file, so a missing dependency produces a readable message
# instead of a traceback. The names SeqIO and Aligner imported here are the
# module-level names used inside main().
if __name__ == '__main__':

    # try to import Third party and local packages
    try:
        from Bio import SeqIO
    except ImportError:
        print ("ERROR: Please install Biopython package")
        sys.exit()

    try:
        from ssw_wrap import Aligner
    except ImportError:
        print ("ERROR: Please place ssw_wrap in the current directory or add its dir to python path")
        sys.exit()

    # Parse command line arguments
    opt = optparser()
    # Run the main function
    main(opt)
276 |
--------------------------------------------------------------------------------
/ssw201507/result.sam:
--------------------------------------------------------------------------------
1 | @HD VN:1.0 SO:unsorted
2 | @SQ SN:ISAcma33_left32 LN:32
3 | @PG ID:Striped-Smith-Waterman PN:pyssw VN:0.1
4 | @CO Score_values = match 3, mismatch 3, gap_open 2, gap_extend 1
5 | @CO Filter Options = min_score 0, min_len 0
6 | ISAcma33_right32 0 ISAcma33_left32 3 0 3S21M1I1M2I3M1S * 0 0 GATTGTGCGTCAATAAAGTGTGGGATAGTTGA * AS:i:56
7 |
--------------------------------------------------------------------------------
/ssw201507/ssw.c:
--------------------------------------------------------------------------------
1 | /* The MIT License
2 |
3 | Copyright (c) 2012-1015 Boston College.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | */
25 |
26 | /* Contact: Mengyao Zhao */
27 |
28 | /*
29 | * ssw.c
30 | *
31 | * Created by Mengyao Zhao on 6/22/10.
32 | * Copyright 2010 Boston College. All rights reserved.
33 | * Version 0.1.4
34 | * Last revision by Mengyao Zhao on 06/27/14.
35 | *
36 | */
37 |
38 | #include
39 | #include
40 | #include
41 | #include
42 | #include
43 | #include
44 | #include "ssw.h"
45 |
/* Branch-prediction hints. With GCC-compatible compilers the condition is
   wrapped in __builtin_expect; elsewhere the macros expand to the bare
   expression, so they never change the value of the condition. */
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
53 |
/* Convert the coordinate (i, j) in the scoring matrix into the coordinate in
   one line of the band of half-width w: u = j - max(i - w, 0) + 1.
   u must be a modifiable lvalue. */
#define set_u(u, w, i, j) { int x=(i)-(w); x=x>0?x:0; (u)=(j)-x+1; }

/* Convert the coordinate (i, j) in the direction matrix into the coordinate
   in one line of the band: u = (j - max(i - w, 0)) * 3 + p, where p selects
   one of the three direction cells stored per matrix cell.
   p is parenthesised so that any expression can be passed safely. */
#define set_d(u, w, i, j, p) { int x=(i)-(w); x=x>0?x:0; x=(j)-x; (u)=x*3+(p); }
59 |
/*! @function
  @abstract     Round an integer up to the next power of two (a value that is
                already a power of two is left unchanged).
  @param  x     integer to be rounded (in place)
  @discussion   x is modified and evaluated several times, so it must be a
                plain modifiable lvalue without side effects. The bit smearing
                uses shifts of 1, 2, 4, 8 and 16, i.e. it covers 32 bits.
 */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
66 |
/* End point of one alignment as reported by the SSE2 kernels. */
typedef struct {
    uint16_t score;  /* alignment score */
    int32_t ref;     /* alignment ending position on reference, 0-based */
    int32_t read;    /* alignment ending position on read, 0-based */
} alignment_end;
72 |
/* CIGAR produced by banded_sw: an array of packed cigar integers. */
typedef struct {
    uint32_t* seq;   /* heap-allocated array of packed cigar operations */
    int32_t length;  /* number of entries in seq */
} cigar;
77 |
/* Query profile built by ssw_init and consumed by ssw_align. */
struct _profile{
    __m128i* profile_byte;  /* 8-bit striped profile; 0: none */
    __m128i* profile_word;  /* 16-bit striped profile; 0: none */
    const int8_t* read;     /* query sequence as passed to ssw_init (not copied) */
    const int8_t* mat;      /* substitution matrix as passed to ssw_init (not copied) */
    int32_t readLen;        /* length of read */
    int32_t n;              /* edge length of the square matrix mat */
    uint8_t bias;           /* offset added to the scores of the byte profile */
};
87 |
/* Build the 8-bit striped query profile.
   For every row nt of the substitution matrix the read is laid out in
   segLen vectors of 16 byte lanes; lane k of vector s holds the biased score
   of read position s + k * segLen against nt. Lanes that fall past the end
   of the read hold the bare bias.
   Returns a malloc'ed array of n * segLen vectors that the caller must free. */
static __m128i* qP_byte (const int8_t* read_num,
                         const int8_t* mat,
                         const int32_t readLen,
                         const int32_t n,   /* the edge length of the square matrix mat */
                         uint8_t bias) {

    const int32_t lanes = 16;   /* a 128-bit register holds 16 scores of 8 bits */
    const int32_t segLen = (readLen + 15) / 16;
    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
    int8_t* out = (int8_t*)vProfile;
    int32_t row, seg, lane;

    for (row = 0; row < n; ++row) {
        const int8_t* scores = mat + row * n;   /* matrix row for this residue */
        for (seg = 0; seg < segLen; ++seg) {
            for (lane = 0; lane < lanes; ++lane) {
                const int32_t pos = seg + lane * segLen;
                if (pos >= readLen) *out++ = bias;
                else *out++ = scores[read_num[pos]] + bias;
            }
        }
    }
    return vProfile;
}
115 |
116 | /* Striped Smith-Waterman
117 | Record the highest score of each reference position.
118 | Return the alignment score and ending position of the best alignment, 2nd best alignment, etc.
119 | Gap begin and gap extension are different.
120 | wight_match > 0, all other weights < 0.
121 | The returned positions are 0-based.
122 | */
123 | static alignment_end* sw_sse2_byte (const int8_t* ref,
124 | int8_t ref_dir, // 0: forward ref; 1: reverse ref
125 | int32_t refLen,
126 | int32_t readLen,
127 | const uint8_t weight_gapO, /* will be used as - */
128 | const uint8_t weight_gapE, /* will be used as - */
129 | const __m128i* vProfile,
130 | uint8_t terminate, /* the best alignment score: used to terminate
131 | the matrix calculation when locating the
132 | alignment beginning point. If this score
133 | is set to 0, it will not be used */
134 | uint8_t bias, /* Shift 0 point to a positive value. */
135 | int32_t maskLen) {
136 |
137 | #define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
138 | (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
139 | (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
140 | (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
141 | (m) = _mm_extract_epi16((vm), 0)
142 |
143 | uint8_t max = 0; /* the max alignment score */
144 | int32_t end_read = readLen - 1;
145 | int32_t end_ref = -1; /* 0_based best alignment ending point; Initialized as isn't aligned -1. */
146 | int32_t segLen = (readLen + 15) / 16; /* number of segment */
147 |
148 | /* array to record the largest score of each reference position */
149 | uint8_t* maxColumn = (uint8_t*) calloc(refLen, 1);
150 |
151 | /* array to record the alignment read ending position of the largest score of each reference position */
152 | int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
153 |
154 | /* Define 16 byte 0 vector. */
155 | __m128i vZero = _mm_set1_epi32(0);
156 |
157 | __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
158 | __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
159 | __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
160 | __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
161 |
162 | int32_t i, j;
163 | /* 16 byte insertion begin vector */
164 | __m128i vGapO = _mm_set1_epi8(weight_gapO);
165 |
166 | /* 16 byte insertion extension vector */
167 | __m128i vGapE = _mm_set1_epi8(weight_gapE);
168 |
169 | /* 16 byte bias vector */
170 | __m128i vBias = _mm_set1_epi8(bias);
171 |
172 | __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
173 | __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
174 | __m128i vTemp;
175 | int32_t edge, begin = 0, end = refLen, step = 1;
176 | // int32_t distance = readLen * 2 / 3;
177 | // int32_t distance = readLen / 2;
178 | // int32_t distance = readLen;
179 |
180 | /* outer loop to process the reference sequence */
181 | if (ref_dir == 1) {
182 | begin = refLen - 1;
183 | end = -1;
184 | step = -1;
185 | }
186 | for (i = begin; LIKELY(i != end); i += step) {
187 | int32_t cmp;
188 | __m128i e, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
189 | Any errors to vH values will be corrected in the Lazy_F loop.
190 | */
191 | // max16(maxColumn[i], vMaxColumn);
192 | // fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
193 |
194 | __m128i vH = pvHStore[segLen - 1];
195 | vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
196 | const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
197 |
198 | /* Swap the 2 H buffers. */
199 | __m128i* pv = pvHLoad;
200 | pvHLoad = pvHStore;
201 | pvHStore = pv;
202 |
203 | /* inner loop to process the query sequence */
204 | for (j = 0; LIKELY(j < segLen); ++j) {
205 | vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
206 | vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
207 | // max16(maxColumn[i], vH);
208 | // fprintf(stderr, "H[%d]: %d\n", i, maxColumn[i]);
209 | // int8_t* t;
210 | // int32_t ti;
211 | //for (t = (int8_t*)&vH, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);
212 |
213 | /* Get max from vH, vE and vF. */
214 | e = _mm_load_si128(pvE + j);
215 | vH = _mm_max_epu8(vH, e);
216 | vH = _mm_max_epu8(vH, vF);
217 | vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
218 |
219 | // max16(maxColumn[i], vMaxColumn);
220 | // fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
221 | // for (t = (int8_t*)&vMaxColumn, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);
222 |
223 | /* Save vH values. */
224 | _mm_store_si128(pvHStore + j, vH);
225 |
226 | /* Update vE value. */
227 | vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
228 | e = _mm_subs_epu8(e, vGapE);
229 | e = _mm_max_epu8(e, vH);
230 | _mm_store_si128(pvE + j, e);
231 |
232 | /* Update vF value. */
233 | vF = _mm_subs_epu8(vF, vGapE);
234 | vF = _mm_max_epu8(vF, vH);
235 |
236 | /* Load the next vH. */
237 | vH = _mm_load_si128(pvHLoad + j);
238 | }
239 |
240 | /* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
241 | /* reset pointers to the start of the saved data */
242 | j = 0;
243 | vH = _mm_load_si128 (pvHStore + j);
244 |
245 | /* the computed vF value is for the given column. since */
246 | /* we are at the end, we need to shift the vF value over */
247 | /* to the next column. */
248 | vF = _mm_slli_si128 (vF, 1);
249 | vTemp = _mm_subs_epu8 (vH, vGapO);
250 | vTemp = _mm_subs_epu8 (vF, vTemp);
251 | vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
252 | cmp = _mm_movemask_epi8 (vTemp);
253 |
254 | while (cmp != 0xffff)
255 | {
256 | vH = _mm_max_epu8 (vH, vF);
257 | vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
258 | _mm_store_si128 (pvHStore + j, vH);
259 | vF = _mm_subs_epu8 (vF, vGapE);
260 | j++;
261 | if (j >= segLen)
262 | {
263 | j = 0;
264 | vF = _mm_slli_si128 (vF, 1);
265 | }
266 | vH = _mm_load_si128 (pvHStore + j);
267 |
268 | vTemp = _mm_subs_epu8 (vH, vGapO);
269 | vTemp = _mm_subs_epu8 (vF, vTemp);
270 | vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
271 | cmp = _mm_movemask_epi8 (vTemp);
272 | }
273 |
274 | vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
275 | vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
276 | cmp = _mm_movemask_epi8(vTemp);
277 | if (cmp != 0xffff) {
278 | uint8_t temp;
279 | vMaxMark = vMaxScore;
280 | max16(temp, vMaxScore);
281 | vMaxScore = vMaxMark;
282 |
283 | if (LIKELY(temp > max)) {
284 | max = temp;
285 | if (max + bias >= 255) break; //overflow
286 | end_ref = i;
287 |
288 | /* Store the column with the highest alignment score in order to trace the alignment ending position on read. */
289 | for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
290 | }
291 | }
292 |
293 | /* Record the max score of current column. */
294 | max16(maxColumn[i], vMaxColumn);
295 | // fprintf(stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
296 | if (maxColumn[i] == terminate) break;
297 | }
298 |
299 | /* Trace the alignment ending position on read. */
300 | uint8_t *t = (uint8_t*)pvHmax;
301 | int32_t column_len = segLen * 16;
302 | for (i = 0; LIKELY(i < column_len); ++i, ++t) {
303 | int32_t temp;
304 | if (*t == max) {
305 | temp = i / 16 + i % 16 * segLen;
306 | if (temp < end_read) end_read = temp;
307 | }
308 | }
309 |
310 | free(pvHmax);
311 | free(pvE);
312 | free(pvHLoad);
313 | free(pvHStore);
314 |
315 | /* Find the most possible 2nd best alignment. */
316 | alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
317 | bests[0].score = max + bias >= 255 ? 255 : max;
318 | bests[0].ref = end_ref;
319 | bests[0].read = end_read;
320 |
321 | bests[1].score = 0;
322 | bests[1].ref = 0;
323 | bests[1].read = 0;
324 |
325 | edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
326 | for (i = 0; i < edge; i ++) {
327 | // fprintf (stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
328 | if (maxColumn[i] > bests[1].score) {
329 | bests[1].score = maxColumn[i];
330 | bests[1].ref = i;
331 | }
332 | }
333 | edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
334 | for (i = edge + 1; i < refLen; i ++) {
335 | // fprintf (stderr, "refLen: %d\tmaxColumn[%d]: %d\n", refLen, i, maxColumn[i]);
336 | if (maxColumn[i] > bests[1].score) {
337 | bests[1].score = maxColumn[i];
338 | bests[1].ref = i;
339 | }
340 | }
341 |
342 | free(maxColumn);
343 | free(end_read_column);
344 | return bests;
345 | }
346 |
/* Build the 16-bit striped query profile.
   Same layout as the byte profile but with 8 lanes of 16 bits per vector and
   no bias: lane k of vector s holds the raw score of read position
   s + k * segLen, and lanes past the end of the read hold 0.
   Returns a malloc'ed array of n * segLen vectors that the caller must free. */
static __m128i* qP_word (const int8_t* read_num,
                         const int8_t* mat,
                         const int32_t readLen,
                         const int32_t n) {

    const int32_t lanes = 8;    /* a 128-bit register holds 8 scores of 16 bits */
    const int32_t segLen = (readLen + 7) / 8;
    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
    int16_t* out = (int16_t*)vProfile;
    int32_t row, seg, lane;

    for (row = 0; row < n; ++row) {
        const int8_t* scores = mat + row * n;   /* matrix row for this residue */
        for (seg = 0; seg < segLen; ++seg) {
            for (lane = 0; lane < lanes; ++lane) {
                const int32_t pos = seg + lane * segLen;
                if (pos >= readLen) *out++ = 0;
                else *out++ = scores[read_num[pos]];
            }
        }
    }
    return vProfile;
}
370 |
371 | static alignment_end* sw_sse2_word (const int8_t* ref,
372 | int8_t ref_dir, // 0: forward ref; 1: reverse ref
373 | int32_t refLen,
374 | int32_t readLen,
375 | const uint8_t weight_gapO, /* will be used as - */
376 | const uint8_t weight_gapE, /* will be used as - */
377 | const __m128i* vProfile,
378 | uint16_t terminate,
379 | int32_t maskLen) {
380 |
381 | #define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
382 | (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
383 | (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
384 | (m) = _mm_extract_epi16((vm), 0)
385 |
386 | uint16_t max = 0; /* the max alignment score */
387 | int32_t end_read = readLen - 1;
388 | int32_t end_ref = 0; /* 1_based best alignment ending point; Initialized as isn't aligned - 0. */
389 | int32_t segLen = (readLen + 7) / 8; /* number of segment */
390 |
391 | /* array to record the largest score of each reference position */
392 | uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2);
393 |
394 | /* array to record the alignment read ending position of the largest score of each reference position */
395 | int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
396 |
397 | /* Define 16 byte 0 vector. */
398 | __m128i vZero = _mm_set1_epi32(0);
399 |
400 | __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
401 | __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
402 | __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
403 | __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
404 |
405 | int32_t i, j, k;
406 | /* 16 byte insertion begin vector */
407 | __m128i vGapO = _mm_set1_epi16(weight_gapO);
408 |
409 | /* 16 byte insertion extension vector */
410 | __m128i vGapE = _mm_set1_epi16(weight_gapE);
411 |
412 | __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
413 | __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
414 | __m128i vTemp;
415 | int32_t edge, begin = 0, end = refLen, step = 1;
416 |
417 | /* outer loop to process the reference sequence */
418 | if (ref_dir == 1) {
419 | begin = refLen - 1;
420 | end = -1;
421 | step = -1;
422 | }
423 | for (i = begin; LIKELY(i != end); i += step) {
424 | int32_t cmp;
425 | __m128i e, vF = vZero; /* Initialize F value to 0.
426 | Any errors to vH values will be corrected in the Lazy_F loop.
427 | */
428 | __m128i vH = pvHStore[segLen - 1];
429 | vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
430 |
431 | /* Swap the 2 H buffers. */
432 | __m128i* pv = pvHLoad;
433 |
434 | __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
435 |
436 | const __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
437 | pvHLoad = pvHStore;
438 | pvHStore = pv;
439 |
440 | /* inner loop to process the query sequence */
441 | for (j = 0; LIKELY(j < segLen); j ++) {
442 | vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
443 |
444 | /* Get max from vH, vE and vF. */
445 | e = _mm_load_si128(pvE + j);
446 | vH = _mm_max_epi16(vH, e);
447 | vH = _mm_max_epi16(vH, vF);
448 | vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
449 |
450 | /* Save vH values. */
451 | _mm_store_si128(pvHStore + j, vH);
452 |
453 | /* Update vE value. */
454 | vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
455 | e = _mm_subs_epu16(e, vGapE);
456 | e = _mm_max_epi16(e, vH);
457 | _mm_store_si128(pvE + j, e);
458 |
459 | /* Update vF value. */
460 | vF = _mm_subs_epu16(vF, vGapE);
461 | vF = _mm_max_epi16(vF, vH);
462 |
463 | /* Load the next vH. */
464 | vH = _mm_load_si128(pvHLoad + j);
465 | }
466 |
467 | /* Lazy_F loop: has been revised to disallow adjecent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
468 | for (k = 0; LIKELY(k < 8); ++k) {
469 | vF = _mm_slli_si128 (vF, 2);
470 | for (j = 0; LIKELY(j < segLen); ++j) {
471 | vH = _mm_load_si128(pvHStore + j);
472 | vH = _mm_max_epi16(vH, vF);
473 | _mm_store_si128(pvHStore + j, vH);
474 | vH = _mm_subs_epu16(vH, vGapO);
475 | vF = _mm_subs_epu16(vF, vGapE);
476 | if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
477 | }
478 | }
479 |
480 | end:
481 | vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
482 | vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
483 | cmp = _mm_movemask_epi8(vTemp);
484 | if (cmp != 0xffff) {
485 | uint16_t temp;
486 | vMaxMark = vMaxScore;
487 | max8(temp, vMaxScore);
488 | vMaxScore = vMaxMark;
489 |
490 | if (LIKELY(temp > max)) {
491 | max = temp;
492 | end_ref = i;
493 | for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
494 | }
495 | }
496 |
497 | /* Record the max score of current column. */
498 | max8(maxColumn[i], vMaxColumn);
499 | if (maxColumn[i] == terminate) break;
500 | }
501 |
502 | /* Trace the alignment ending position on read. */
503 | uint16_t *t = (uint16_t*)pvHmax;
504 | int32_t column_len = segLen * 8;
505 | for (i = 0; LIKELY(i < column_len); ++i, ++t) {
506 | int32_t temp;
507 | if (*t == max) {
508 | temp = i / 8 + i % 8 * segLen;
509 | if (temp < end_read) end_read = temp;
510 | }
511 | }
512 |
513 | free(pvHmax);
514 | free(pvE);
515 | free(pvHLoad);
516 | free(pvHStore);
517 |
518 | /* Find the most possible 2nd best alignment. */
519 | alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
520 | bests[0].score = max;
521 | bests[0].ref = end_ref;
522 | bests[0].read = end_read;
523 |
524 | bests[1].score = 0;
525 | bests[1].ref = 0;
526 | bests[1].read = 0;
527 |
528 | edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
529 | for (i = 0; i < edge; i ++) {
530 | if (maxColumn[i] > bests[1].score) {
531 | bests[1].score = maxColumn[i];
532 | bests[1].ref = i;
533 | }
534 | }
535 | edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
536 | for (i = edge; i < refLen; i ++) {
537 | if (maxColumn[i] > bests[1].score) {
538 | bests[1].score = maxColumn[i];
539 | bests[1].ref = i;
540 | }
541 | }
542 |
543 | free(maxColumn);
544 | free(end_read_column);
545 | return bests;
546 | }
547 |
548 | static cigar* banded_sw (const int8_t* ref,
549 | const int8_t* read,
550 | int32_t refLen,
551 | int32_t readLen,
552 | int32_t score,
553 | const uint32_t weight_gapO, /* will be used as - */
554 | const uint32_t weight_gapE, /* will be used as - */
555 | int32_t band_width,
556 | const int8_t* mat, /* pointer to the weight matrix */
557 | int32_t n) {
558 |
559 | uint32_t *c = (uint32_t*)malloc(16 * sizeof(uint32_t)), *c1;
560 | int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, l, max = 0;
561 | int64_t s2 = 1024;
562 | char op, prev_op;
563 | int32_t width, width_d, *h_b, *e_b, *h_c;
564 | int8_t *direction, *direction_line;
565 | cigar* result = (cigar*)malloc(sizeof(cigar));
566 | h_b = (int32_t*)malloc(s1 * sizeof(int32_t));
567 | e_b = (int32_t*)malloc(s1 * sizeof(int32_t));
568 | h_c = (int32_t*)malloc(s1 * sizeof(int32_t));
569 | direction = (int8_t*)malloc(s2 * sizeof(int8_t));
570 |
571 | do {
572 | width = band_width * 2 + 3, width_d = band_width * 2 + 1;
573 | while (width >= s1) {
574 | ++s1;
575 | kroundup32(s1);
576 | h_b = (int32_t*)realloc(h_b, s1 * sizeof(int32_t));
577 | e_b = (int32_t*)realloc(e_b, s1 * sizeof(int32_t));
578 | h_c = (int32_t*)realloc(h_c, s1 * sizeof(int32_t));
579 | }
580 | while (width_d * readLen * 3 >= s2) {
581 | ++s2;
582 | kroundup32(s2);
583 | if (s2 < 0) {
584 | fprintf(stderr, "Alignment score and position are not consensus.\n");
585 | exit(1);
586 | }
587 | direction = (int8_t*)realloc(direction, s2 * sizeof(int8_t));
588 | }
589 | direction_line = direction;
590 | for (j = 1; LIKELY(j < width - 1); j ++) h_b[j] = 0;
591 | for (i = 0; LIKELY(i < readLen); i ++) {
592 | int32_t beg = 0, end = refLen - 1, u = 0, edge;
593 | j = i - band_width; beg = beg > j ? beg : j; // band start
594 | j = i + band_width; end = end < j ? end : j; // band end
595 | edge = end + 1 < width - 1 ? end + 1 : width - 1;
596 | f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0;
597 | direction_line = direction + width_d * i * 3;
598 |
599 | for (j = beg; LIKELY(j <= end); j ++) {
600 | int32_t b, e1, f1, d, de, df, dh;
601 | set_u(u, band_width, i, j); set_u(e, band_width, i - 1, j);
602 | set_u(b, band_width, i, j - 1); set_u(d, band_width, i - 1, j - 1);
603 | set_d(de, band_width, i, j, 0);
604 | set_d(df, band_width, i, j, 1);
605 | set_d(dh, band_width, i, j, 2);
606 |
607 | temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO;
608 | temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE;
609 | e_b[u] = temp1 > temp2 ? temp1 : temp2;
610 | direction_line[de] = temp1 > temp2 ? 3 : 2;
611 |
612 | temp1 = h_c[b] - weight_gapO;
613 | temp2 = f - weight_gapE;
614 | f = temp1 > temp2 ? temp1 : temp2;
615 | direction_line[df] = temp1 > temp2 ? 5 : 4;
616 |
617 | e1 = e_b[u] > 0 ? e_b[u] : 0;
618 | f1 = f > 0 ? f : 0;
619 | temp1 = e1 > f1 ? e1 : f1;
620 | temp2 = h_b[d] + mat[ref[j] * n + read[i]];
621 | h_c[u] = temp1 > temp2 ? temp1 : temp2;
622 |
623 | if (h_c[u] > max) max = h_c[u];
624 |
625 | if (temp1 <= temp2) direction_line[dh] = 1;
626 | else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df];
627 | }
628 | for (j = 1; j <= u; j ++) h_b[j] = h_c[j];
629 | }
630 | band_width *= 2;
631 | } while (LIKELY(max < score));
632 | band_width /= 2;
633 |
634 | // trace back
635 | i = readLen - 1;
636 | j = refLen - 1;
637 | e = 0; // Count the number of M, D or I.
638 | l = 0; // record length of current cigar
639 | op = prev_op = 'M';
640 | temp2 = 2; // h
641 | while (LIKELY(i > 0)) {
642 | set_d(temp1, band_width, i, j, temp2);
643 | switch (direction_line[temp1]) {
644 | case 1:
645 | --i;
646 | --j;
647 | temp2 = 2;
648 | direction_line -= width_d * 3;
649 | op = 'M';
650 | break;
651 | case 2:
652 | --i;
653 | temp2 = 0; // e
654 | direction_line -= width_d * 3;
655 | op = 'I';
656 | break;
657 | case 3:
658 | --i;
659 | temp2 = 2;
660 | direction_line -= width_d * 3;
661 | op = 'I';
662 | break;
663 | case 4:
664 | --j;
665 | temp2 = 1;
666 | op = 'D';
667 | break;
668 | case 5:
669 | --j;
670 | temp2 = 2;
671 | op = 'D';
672 | break;
673 | default:
674 | fprintf(stderr, "Trace back error: %d.\n", direction_line[temp1 - 1]);
675 | free(direction);
676 | free(h_c);
677 | free(e_b);
678 | free(h_b);
679 | free(c);
680 | free(result);
681 | return 0;
682 | }
683 | if (op == prev_op) ++e;
684 | else {
685 | ++l;
686 | while (l >= s) {
687 | ++s;
688 | kroundup32(s);
689 | c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
690 | }
691 | c[l - 1] = to_cigar_int(e, prev_op);
692 | prev_op = op;
693 | e = 1;
694 | }
695 | }
696 | if (op == 'M') {
697 | ++l;
698 | while (l >= s) {
699 | ++s;
700 | kroundup32(s);
701 | c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
702 | }
703 | c[l - 1] = to_cigar_int(e + 1, op);
704 | }else {
705 | l += 2;
706 | while (l >= s) {
707 | ++s;
708 | kroundup32(s);
709 | c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
710 | }
711 | c[l - 2] = to_cigar_int(e, op);
712 | c[l - 1] = to_cigar_int(1, 'M');
713 | }
714 |
715 | // reverse cigar
716 | c1 = (uint32_t*)malloc(l * sizeof(uint32_t));
717 | s = 0;
718 | e = l - 1;
719 | while (LIKELY(s <= e)) {
720 | c1[s] = c[e];
721 | c1[e] = c[s];
722 | ++ s;
723 | -- e;
724 | }
725 | result->seq = c1;
726 | result->length = l;
727 |
728 | free(direction);
729 | free(h_c);
730 | free(e_b);
731 | free(h_b);
732 | free(c);
733 | return result;
734 | }
735 |
/* Return a newly calloc'ed copy of seq[0..end] in reverse order.
   end is the 0-based alignment ending position, so end + 1 elements are
   copied. The caller frees the result. */
static int8_t* seq_reverse(const int8_t* seq, int32_t end)
{
    int8_t* reverse = (int8_t*)calloc(end + 1, sizeof(int8_t));
    int32_t k;
    for (k = 0; k <= end; ++k) {
        reverse[k] = seq[end - k];
    }
    return reverse;
}
748 |
749 | s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size) {
750 | s_profile* p = (s_profile*)calloc(1, sizeof(struct _profile));
751 | p->profile_byte = 0;
752 | p->profile_word = 0;
753 | p->bias = 0;
754 |
755 | if (score_size == 0 || score_size == 2) {
756 | /* Find the bias to use in the substitution matrix */
757 | int32_t bias = 0, i;
758 | for (i = 0; i < n*n; i++) if (mat[i] < bias) bias = mat[i];
759 | bias = abs(bias);
760 |
761 | p->bias = bias;
762 | p->profile_byte = qP_byte (read, mat, readLen, n, bias);
763 | }
764 | if (score_size == 1 || score_size == 2) p->profile_word = qP_word (read, mat, readLen, n);
765 | p->read = read;
766 | p->mat = mat;
767 | p->readLen = readLen;
768 | p->n = n;
769 | return p;
770 | }
771 |
772 | void init_destroy (s_profile* p) {
773 | free(p->profile_byte);
774 | free(p->profile_word);
775 | free(p);
776 | }
777 |
778 | s_align* ssw_align (const s_profile* prof,
779 | const int8_t* ref,
780 | int32_t refLen,
781 | const uint8_t weight_gapO,
782 | const uint8_t weight_gapE,
783 | const uint8_t flag, // (from high to low) bit 5: return the best alignment beginning position; 6: if (ref_end1 - ref_begin1 <= filterd) && (read_end1 - read_begin1 <= filterd), return cigar; 7: if max score >= filters, return cigar; 8: always return cigar; if 6 & 7 are both setted, only return cigar when both filter fulfilled
784 | const uint16_t filters,
785 | const int32_t filterd,
786 | const int32_t maskLen) {
787 |
788 | alignment_end* bests = 0, *bests_reverse = 0;
789 | __m128i* vP = 0;
790 | int32_t word = 0, band_width = 0, readLen = prof->readLen;
791 | int8_t* read_reverse = 0;
792 | cigar* path;
793 | s_align* r = (s_align*)calloc(1, sizeof(s_align));
794 | r->ref_begin1 = -1;
795 | r->read_begin1 = -1;
796 | r->cigar = 0;
797 | r->cigarLen = 0;
798 | if (maskLen < 15) {
799 | fprintf(stderr, "When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n");
800 | }
801 |
802 | // Find the alignment scores and ending positions
803 | if (prof->profile_byte) {
804 | bests = sw_sse2_byte(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_byte, -1, prof->bias, maskLen);
805 | if (prof->profile_word && bests[0].score == 255) {
806 | free(bests);
807 | bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
808 | word = 1;
809 | } else if (bests[0].score == 255) {
810 | fprintf(stderr, "Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n");
811 | free(r);
812 | return NULL;
813 | }
814 | }else if (prof->profile_word) {
815 | bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
816 | word = 1;
817 | }else {
818 | fprintf(stderr, "Please call the function ssw_init before ssw_align.\n");
819 | free(r);
820 | return NULL;
821 | }
822 | r->score1 = bests[0].score;
823 | r->ref_end1 = bests[0].ref;
824 | r->read_end1 = bests[0].read;
825 | if (maskLen >= 15) {
826 | r->score2 = bests[1].score;
827 | r->ref_end2 = bests[1].ref;
828 | } else {
829 | r->score2 = 0;
830 | r->ref_end2 = -1;
831 | }
832 | free(bests);
833 | if (flag == 0 || (flag == 2 && r->score1 < filters)) goto end;
834 |
835 | // Find the beginning position of the best alignment.
836 | read_reverse = seq_reverse(prof->read, r->read_end1);
837 | if (word == 0) {
838 | vP = qP_byte(read_reverse, prof->mat, r->read_end1 + 1, prof->n, prof->bias);
839 | bests_reverse = sw_sse2_byte(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, prof->bias, maskLen);
840 | } else {
841 | vP = qP_word(read_reverse, prof->mat, r->read_end1 + 1, prof->n);
842 | bests_reverse = sw_sse2_word(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, maskLen);
843 | }
844 | free(vP);
845 | free(read_reverse);
846 | r->ref_begin1 = bests_reverse[0].ref;
847 | r->read_begin1 = r->read_end1 - bests_reverse[0].read;
848 | free(bests_reverse);
849 | if ((7&flag) == 0 || ((2&flag) != 0 && r->score1 < filters) || ((4&flag) != 0 && (r->ref_end1 - r->ref_begin1 > filterd || r->read_end1 - r->read_begin1 > filterd))) goto end;
850 |
851 | // Generate cigar.
852 | refLen = r->ref_end1 - r->ref_begin1 + 1;
853 | readLen = r->read_end1 - r->read_begin1 + 1;
854 | band_width = abs(refLen - readLen) + 1;
855 | path = banded_sw(ref + r->ref_begin1, prof->read + r->read_begin1, refLen, readLen, r->score1, weight_gapO, weight_gapE, band_width, prof->mat, prof->n);
856 | if (path == 0) {
857 | free(r);
858 | r = NULL;
859 | }
860 | else {
861 | r->cigar = path->seq;
862 | r->cigarLen = path->length;
863 | free(path);
864 | }
865 |
866 | end:
867 | return r;
868 | }
869 |
870 | void align_destroy (s_align* a) {
871 | free(a->cigar);
872 | free(a);
873 | }
874 |
/* Extract the operation letter from a packed cigar integer.
   The low 4 bits index the table M, I, D, N, S, H, P, =, X; codes beyond
   the table are reported as 'M'. */
char cigar_int_to_op (uint32_t cigar_int)
{
    static const char ops[] = "MIDNSHP=X";
    const uint8_t code = cigar_int & 0xfU;

    /* sizeof(ops) counts the terminating NUL, which is not an operation. */
    if (code < sizeof(ops) - 1) {
        return ops[code];
    }
    return 'M';
}
896 |
/* Extract the run length from a packed cigar integer: everything above the
   low 4 operation bits. */
uint32_t cigar_int_to_len (uint32_t cigar_int)
{
    return cigar_int >> 4;
}
902 |
--------------------------------------------------------------------------------
/ssw201507/ssw.h:
--------------------------------------------------------------------------------
1 | /*
2 | * ssw.h
3 | *
4 | * Created by Mengyao Zhao on 6/22/10.
5 | * Copyright 2010 Boston College. All rights reserved.
6 | * Version 0.1.4
7 | * Last revision by Mengyao Zhao on 01/30/13.
8 | *
9 | */
10 |
11 | #ifndef SSW_H
12 | #define SSW_H
13 |
14 | #include
15 | #include
16 | #include
17 | #include
18 |
19 | #ifdef __cplusplus
20 | extern "C" {
21 | #endif // __cplusplus
22 |
23 |
24 | /*! @typedef structure of the query profile */
25 | struct _profile;
26 | typedef struct _profile s_profile;
27 |
/*!	@typedef	structure of the alignment result
	@field	score1	the best alignment score
	@field	score2	sub-optimal alignment score
	@field	ref_begin1	0-based best alignment beginning position on reference;	ref_begin1 = -1 when the best alignment beginning
						position is not available
	@field	ref_end1	0-based best alignment ending position on reference
	@field	read_begin1	0-based best alignment beginning position on read; read_begin1 = -1 when the best alignment beginning
						position is not available
	@field	read_end1	0-based best alignment ending position on read
	@field	ref_end2	0-based sub-optimal alignment ending position on reference
	@field	cigar	best alignment cigar; stored the same as that in BAM format, high 28 bits: length, low 4 bits: M/I/D (0/1/2);
					cigar = 0 when the best alignment path is not available
	@field	cigarLen	length of the cigar string; cigarLen = 0 when the best alignment path is not available
*/
typedef struct {
	uint16_t score1;
	uint16_t score2;
	int32_t ref_begin1;
	int32_t ref_end1;
	int32_t	read_begin1;
	int32_t read_end1;
	int32_t ref_end2;
	uint32_t* cigar;
	int32_t cigarLen;
} s_align;
53 |
/*!	@function	Create the query profile using the query sequence.
	@param	read	pointer to the query sequence; the query sequence needs to be numbers
	@param	readLen	length of the query sequence
	@param	mat	pointer to the substitution matrix; mat needs to be corresponding to the read sequence
	@param	n	the square root of the number of elements in mat (mat has n*n elements)
	@param	score_size	estimated Smith-Waterman score; if your estimated best alignment score is surely < 255 please set 0; if
						your estimated best alignment score >= 255, please set 1; if you don't know, please set 2
	@return	pointer to the query profile structure; release it with init_destroy
	@note	example for parameter read and mat:
			If the query sequence is: ACGTATC, the sequence that read points to can be: 1234142
			Then if the score for a match is 2 and for a mismatch is -2, the substitution matrix of parameter mat will be:
			//A  C  G  T
			  2 -2 -2 -2 //A
			 -2  2 -2 -2 //C
			 -2 -2  2 -2 //G
			 -2 -2 -2  2 //T
			mat is the pointer to the array {2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2}
	@note	NOTE(review): the Python wrapper shipped alongside this header encodes A, C, G, T, N as 0..4 and passes
			n = 5, which suggests the numbers in read are 0-based row/column indices into mat; the 1-based
			"1234142" encoding in the example above should be confirmed against ssw.c before being relied on.
*/
s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size);
73 |
/*!	@function	Release the memory allocated by function ssw_init.
	@param	p	pointer to the query profile structure returned by ssw_init
*/
void init_destroy (s_profile* p);
78 |
/*!	@function	Do Striped Smith-Waterman alignment.
	@param	prof	pointer to the query profile structure
	@param	ref	pointer to the target sequence; the target sequence needs to be numbers and corresponding to the mat parameter of
				function ssw_init
	@param	refLen	length of the target sequence
	@param	weight_gapO	the absolute value of gap open penalty
	@param	weight_gapE	the absolute value of gap extension penalty
	@param	flag	bitwise FLAG; (from high to low) bit 5: when set to 1, function ssw_align will return the best alignment
					beginning position; bit 6: when set to 1, if (ref_end1 - ref_begin1 < filterd && read_end1 - read_begin1
					< filterd), (whatever bit 5 is set to) the function will return the best alignment beginning position and
					cigar; bit 7: when set to 1, if the best alignment score >= filters, (whatever bit 5 is set to) the function
					will return the best alignment beginning position and cigar; bit 8: when set to 1, (whatever bit 5, 6 or 7 is
					set to) the function will always return the best alignment beginning position and cigar. When flag == 0, only
					the optimal and sub-optimal scores and the optimal alignment ending position will be returned.
					NOTE(review): the Python wrapper shipped alongside this header passes flag = 1 to request "return all",
					which suggests "bit 8" above is the least-significant bit (0x01); confirm against ssw.c.
	@param	filters	score filter: when bit 7 of flag is set to 1 and bit 8 is set to 0, filters will be used (Please check the
					description of the flag parameter for detailed usage.)
	@param	filterd	distance filter: when bit 6 of flag is set to 1 and bit 8 is set to 0, filterd will be used (Please check
					the description of the flag parameter for detailed usage.)
	@param	maskLen	The distance between the optimal and suboptimal alignment ending position >= maskLen. We suggest to use
					readLen/2, if you don't have special concerns. Note: maskLen has to be >= 15, otherwise this function will NOT
					return the suboptimal alignment information. Detailed description of maskLen: After locating the optimal
					alignment ending position, the suboptimal alignment score can be heuristically found by checking the second
					largest score in the array that contains the maximal score of each column of the SW matrix. In order to avoid
					picking the scores that belong to the alignments sharing the partial best alignment, SSW C library masks the
					reference loci nearby (mask length = maskLen) the best alignment ending position and locates the second largest
					score from the unmasked elements.
	@return	pointer to the alignment result structure; release it with align_destroy
	@note	Whatever the parameter flag is set to, this function will at least return the optimal and sub-optimal alignment score,
			and the optimal alignment ending positions on target and query sequences. If both bit 6 and 7 of the flag are set
			while bit 8 is not, the function will return cigar only when both criteria are fulfilled. All returned positions are
			0-based coordinate.
*/
s_align* ssw_align (const s_profile* prof,
					const int8_t* ref,
					int32_t refLen,
					const uint8_t weight_gapO,
					const uint8_t weight_gapE,
					const uint8_t flag,
					const uint16_t filters,
					const int32_t filterd,
					const int32_t maskLen);
121 |
/*!	@function	Release the memory allocated by function ssw_align.
	@param	a	pointer to the alignment result structure; its cigar buffer is freed together with the structure
*/
void align_destroy (s_align* a);
126 |
/*!	@function	Produce CIGAR 32-bit unsigned integer from CIGAR operation and CIGAR length
	@param	length	length of CIGAR
	@param	op_letter	CIGAR operation character ('M', 'I', etc)
	@return	32-bit unsigned integer, representing encoded CIGAR operation and length
	@note	The operation code occupies the low 4 bits and the length is shifted above it.
			Any character that is not a known operation letter is encoded as 'M' (code 0).
*/
static inline uint32_t to_cigar_int (uint32_t length, char op_letter)
{
	/* Operation letters positioned at their op code:
	 *   0 'M' alignment match (can be a sequence match or mismatch)
	 *   1 'I' insertion to the reference
	 *   2 'D' deletion from the reference
	 *   3 'N' skipped region from the reference
	 *   4 'S' soft clipping (clipped sequences present in SEQ)
	 *   5 'H' hard clipping (clipped sequences NOT present in SEQ)
	 *   6 'P' padding (silent deletion from padded reference)
	 *   7 '=' sequence match
	 *   8 'X' sequence mismatch
	 */
	static const char op_letters[] = { 'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X' };
	uint8_t op_code = 0;	/* unknown letters keep the 'M' code */
	uint8_t idx;

	for (idx = 0; idx < sizeof(op_letters) / sizeof(op_letters[0]); ++idx) {
		if (op_letters[idx] == op_letter) {
			op_code = idx;
			break;
		}
	}

	return (length << 4) | op_code;
}
171 |
/*!	@function	Extract CIGAR operation character from CIGAR 32-bit unsigned integer
	@param	cigar_int	32-bit unsigned integer, representing encoded CIGAR operation and length
	@return	CIGAR operation character ('M', 'I', etc); the operation is read from the low 4 bits of cigar_int
*/
char cigar_int_to_op (uint32_t cigar_int);
177 |
/*!	@function	Extract length of a CIGAR operation from CIGAR 32-bit unsigned integer
	@param	cigar_int	32-bit unsigned integer, representing encoded CIGAR operation and length
	@return	length of CIGAR operation, i.e. the bits of cigar_int above the low 4 operation bits
*/
uint32_t cigar_int_to_len (uint32_t cigar_int);
183 |
184 | #ifdef __cplusplus
185 | }
186 | #endif // __cplusplus
187 |
188 | #endif // SSW_H
189 |
--------------------------------------------------------------------------------
/ssw201507/ssw.h.gch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiezhq/ISEScan/769bd956fb8edfe925a3fec18e75091c9803e15a/ssw201507/ssw.h.gch
--------------------------------------------------------------------------------
/ssw201507/ssw_wrap.py:
--------------------------------------------------------------------------------
1 | """
2 | @package ssw_wrap
3 | @brief Simple python wrapper for SSW align library
4 | To use the dynamic library libssw.so you may need to modify the LD_LIBRARY_PATH environment
5 | variable to include the library directory (export LD_LIBRARY_PATH=$PWD) or for definitive
6 | inclusion of the lib edit /etc/ld.so.conf and add the path or the directory containing the
7 | library and update the cache by using /sbin/ldconfig as root
8 | @copyright [The MIT licence](http://opensource.org/licenses/MIT)
9 | @author Clement & Adrien Leger - 2014
10 | """
11 |
12 | #~~~~~~~GLOBAL IMPORTS~~~~~~~#
13 | # Standard library packages
14 | from ctypes import *
15 |
16 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class CAlignRes(Structure):
    """
    @class  CAlignRes
    @brief  ctypes mirror of the s_align structure declared in ssw.h, i.e. the alignment
            result that the ssw_align C function returns a pointer to.
            Field order and C types must stay in step with the C declaration.
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~Ctype Structure~~~~~~~#
    _fields_ = [
        ('score', c_uint16),            # s_align.score1
        ('score2', c_uint16),           # s_align.score2
        ('ref_begin', c_int32),         # s_align.ref_begin1
        ('ref_end', c_int32),           # s_align.ref_end1
        ('query_begin', c_int32),       # s_align.read_begin1
        ('query_end', c_int32),         # s_align.read_end1
        ('ref_end2', c_int32),          # s_align.ref_end2
        ('cigar', POINTER(c_uint32)),   # s_align.cigar
        ('cigarLen', c_int32),          # s_align.cigarLen
    ]
35 |
36 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Aligner(object):
    """
    @class  Aligner
    @brief  Wrapper for SSW align library.
            An instance stores the scoring parameters and one reference sequence; each call
            to align() aligns a query against that reference through the C functions of
            libssw.so.
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~CLASS VARIABLES~~~~~~~#

    # Dictionary to map nucleotides to the integers expected by the SSW C library
    base_to_int = { 'A':0, 'C':1, 'G':2, 'T':3, 'N':4, 'a':0, 'c':1, 'g':2, 't':3, 'n':4}
    int_to_base = { 0:'A', 1:'C', 2:'G', 3:'T', 4:'N'}

    # Load the ssw library using ctypes (happens once, when the class body is executed)
    libssw = cdll.LoadLibrary('libssw.so')

    # Declare the C signatures of the library functions used by this class
    # ssw_init function
    ssw_init = libssw.ssw_init
    ssw_init.restype = c_void_p
    ssw_init.argtypes = [POINTER(c_int8), c_int32, POINTER(c_int8), c_int32, c_int8]
    # init_destroy function
    init_destroy = libssw.init_destroy
    init_destroy.restype = None
    init_destroy.argtypes = [c_void_p]
    # ssw_align function
    ssw_align = libssw.ssw_align
    ssw_align.restype = POINTER(CAlignRes)
    ssw_align.argtypes = [c_void_p, POINTER(c_int8), c_int32, c_uint8, c_uint8, c_uint8, c_uint16, c_int32, c_int32]
    # align_destroy function
    align_destroy = libssw.align_destroy
    align_destroy.restype = None
    align_destroy.argtypes = [POINTER(CAlignRes)]

    #~~~~~~~FUNDAMENTAL METHODS~~~~~~~#

    def __repr__(self):
        """
        Return a multi-line description of the scoring parameters, the reporting options and
        the first 50 bases of the reference sequence
        """
        match = self.match
        mis = -self.mismatch
        parts = [
            self.__str__(),
            "SCORE PARAMETERS:\n",
            " Gap Weight Open: {} Extension: {}\n".format(-self.gap_open, -self.gap_extend),
            " Align Weight Match: {} Mismatch: {}\n\n".format(match, mis),
            " Match/mismatch Score matrix\n",
            " \tA\tC\tG\tT\tN\n",
            " A\t{}\t{}\t{}\t{}\t{}\n".format(match, mis, mis, mis, 0),
            " C\t{}\t{}\t{}\t{}\t{}\n".format(mis, match, mis, mis, 0),
            " G\t{}\t{}\t{}\t{}\t{}\n".format(mis, mis, match, mis, 0),
            " T\t{}\t{}\t{}\t{}\t{}\n".format(mis, mis, mis, match, 0),
            " N\t{}\t{}\t{}\t{}\t{}\n\n".format(0,0,0,0,0),
            "RESULT PARAMETERS:\n",
            " Report cigar {}\n".format(self.report_cigar),
            " Report secondary match {}\n\n".format(self.report_secondary),
            "REFERENCE SEQUENCE :\n",
        ]
        # Show the whole reference when it is short, otherwise only its first 50 bases
        if self.ref_len <= 50:
            parts.append("".join([self.int_to_base[i] for i in self.ref_seq])+"\n")
        else:
            parts.append("".join([self.int_to_base[self.ref_seq[i]] for i in range(50)])+"...\n")
        parts.append(" Lenght :{} nucleotides\n".format(self.ref_len))
        return "".join(parts)

    def __str__(self):
        """
        Return a one-line tag naming the class and the module it comes from
        """
        # The format string previously had no placeholders, so both arguments were
        # silently discarded and the method always returned two newlines.
        return "\n<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)

    def __init__(self,
                 ref_seq="",
                 match=2,
                 mismatch=2,
                 gap_open=3,
                 gap_extend=1,
                 report_secondary=False,
                 report_cigar=False):
        """
        Initialize object by creating an interface with ssw library functions
        A reference sequence is also assigned to the object for multiple alignment against queries
        with the align function
        @param ref_seq Reference sequence as a python string (case insensitive)
        @param match Weight for a match
        @param mismatch Absolute value of mismatch penalty
        @param gap_open Absolute value of gap open penalty
        @param gap_extend Absolute value of gap extend penalty
        @param report_secondary Report the 2nd best alignment if true
        @param report_cigar Report cigar string if true
        """

        # Store overall alignment parameters
        self.report_secondary = report_secondary
        self.report_cigar = report_cigar

        # Set gap penalties
        self.set_gap(gap_open, gap_extend)

        # Set the cost matrix
        self.set_mat(match, mismatch)

        # Set the reference sequence
        self.set_ref(ref_seq)

    #~~~~~~~SETTERS METHODS~~~~~~~#

    def set_gap(self, gap_open=3, gap_extend=1):
        """
        Store gap open and gap extension penalties (absolute values)
        """
        self.gap_open = gap_open
        self.gap_extend = gap_extend


    def set_mat(self, match=2, mismatch=2):
        """
        Store match and mismatch scores then initialize a 5x5 cost matrix (A, C, G, T, N) and
        fill it with match and mismatch values. Ambiguous base: no penalty
        """
        self.match = match
        self.mismatch = mismatch

        mat_decl = c_int8 * 25
        self.mat = mat_decl(match, -mismatch, -mismatch, -mismatch, 0,
                            -mismatch, match, -mismatch, -mismatch, 0,
                            -mismatch, -mismatch, match, -mismatch, 0,
                            -mismatch, -mismatch, -mismatch, match, 0,
                            0, 0, 0, 0, 0)

    def set_ref(self, ref_seq):
        """
        Determine the size of the ref sequence and cast it in a c type integer matrix
        An empty reference is stored as an empty string with a length of 0
        """
        if ref_seq:
            self.ref_len = len(ref_seq)
            self.ref_seq = self._DNA_to_int_mat (ref_seq, self.ref_len)
        else:
            self.ref_len = 0
            self.ref_seq = ""

    #~~~~~~~PUBLIC METHODS~~~~~~~#

    def align(self, query_seq, min_score=0, min_len=0):
        """
        Perform the alignment of query against the object reference sequence
        @param query_seq Query sequence as a python string (case insensitive)
        @param min_score Minimal score of match. None will be return in case of filtering out
        @param min_len Minimal length of match. None will be return in case of filtering out
        @return A PyAlignRes object containing information about the alignment, or None when
        the alignment does not reach min_score or min_len.
        """
        # Determine the size of the query sequence and cast it in a c type integer matrix.
        # query_num is kept in a local until the end of the call so the buffer handed to
        # ssw_init stays alive while the profile is in use.
        query_len = len(query_seq)
        query_num = self._DNA_to_int_mat (query_seq, query_len)

        # Create the query profile using the query sequence
        profile = self.ssw_init(query_num, # Query seq in c type integers
                                c_int32(query_len), # Length of the query sequence
                                self.mat, # Score matrix
                                5, # Square root of the number of elements in mat
                                2) # flag = no estimation of the best alignment score

        # Setup the mask_len parameters = distance between the optimal and suboptimal alignment
        # if < 15, the function will NOT return the suboptimal alignment information
        if query_len > 30:
            mask_len = query_len // 2
        else:
            mask_len = 15

        c_result = None
        try:
            c_result = self.ssw_align(profile, # Query profile
                                      self.ref_seq, # Ref seq in c type integers
                                      c_int32(self.ref_len), # Length of the reference sequence
                                      self.gap_open, # Absolute value of gap open penalty
                                      self.gap_extend, # Absolute value of gap extend penalty
                                      1, # Bitwise FLAG for output values = return all
                                      0, # Score filter = return all
                                      0, # Distance filter = return all
                                      mask_len) # Distance between the optimal and suboptimal alignment

            # Transform the C structure into a python object if score and length match the requirements
            res = c_result.contents
            score = res.score
            match_len = res.query_end - res.query_begin + 1

            if score >= min_score and match_len >= min_len:
                py_result = PyAlignRes(c_result, query_len, self.report_secondary, self.report_cigar)
            else:
                py_result = None
        finally:
            # Free the space reserved by ssw_init and ssw_align even when an exception is
            # raised above, so a failed call does not leak the C allocations.
            self._init_destroy(profile)
            if c_result:
                self._align_destroy(c_result)

        # Return the object
        return py_result

    #~~~~~~~PRIVATE METHODS~~~~~~~#

    def _DNA_to_int_mat (self, seq, len_seq):
        """
        Cast a python DNA string into a Ctype int8 matrix
        @param seq DNA sequence as a python string
        @param len_seq Number of bases of seq to convert
        @return A c_int8 array of len_seq elements
        """
        # Declare the matrix
        query_num = (c_int8 * len_seq)()

        # Translate each base with self.base_to_int; any character that is not in the
        # table is encoded as 4, the code used for N
        lookup = self.base_to_int.get
        for i in range(len_seq):
            query_num[i] = lookup(seq[i], 4)

        return query_num

    def _init_destroy(self, profile):
        """
        Free the space allocated for the query profile created by ssw_init
        """
        self.init_destroy(profile)

    def _align_destroy(self, align):
        """
        Free the space allocated for the alignment result returned by ssw_align
        """
        self.align_destroy(align)
258 |
259 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class PyAlignRes(object):
    """
    @class  PyAlignRes
    @brief  Extract and verify result from a CAlignRes structure. A comprehensive python
            object is created according to user requirements (+- cigar string and secondary alignment)
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~CLASS VARIABLES~~~~~~~#

    # Load the ssw library using ctypes
    libssw = cdll.LoadLibrary('libssw.so')

    # Declare the C signatures of the library functions used by this class.
    # Both functions take a uint32_t in ssw.h, and cigar_int_to_len also returns one,
    # so unsigned ctypes are declared here instead of signed ones.
    # cigar_int_to_len function
    cigar_int_to_len = libssw.cigar_int_to_len
    cigar_int_to_len.restype = c_uint32
    cigar_int_to_len.argtypes = [c_uint32]
    # cigar_int_to_op function
    cigar_int_to_op = libssw.cigar_int_to_op
    cigar_int_to_op.restype = c_char
    cigar_int_to_op.argtypes = [c_uint32]

    #~~~~~~~FUNDAMENTAL METHOD~~~~~~~#

    def __repr__(self):
        """
        Return a multi-line description of the optimal match and, when they were reported,
        of the cigar string and the sub-optimal match
        """
        parts = [
            self.__str__(),
            "OPTIMAL MATCH\n",
            "Score {}\n".format(self.score),
            "Reference begin {}\n".format(self.ref_begin),
            "Reference end {}\n".format(self.ref_end),
            "Query begin {}\n".format(self.query_begin),
            "Query end {}\n".format(self.query_end),
        ]

        if self.cigar_string:
            parts.append("Cigar_string {}\n".format(self.cigar_string))

        if self.score2:
            parts.append("SUB-OPTIMAL MATCH\n")
            parts.append("Score 2 {}\n".format(self.score2))
            parts.append("Ref_end2 {}\n".format(self.ref_end2))

        return "".join(parts)

    def __str__(self):
        """
        Return a one-line tag naming the class and the module it comes from
        """
        # The format string previously had no placeholders, so both arguments were
        # silently discarded and the method always returned two newlines.
        return "\n<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)


    def __init__ (self, Res, query_len, report_secondary=False, report_cigar=False):
        """
        Parse CAlignRes structure and copy its values in object variables
        @param Res Pointer to a CAlignRes structure
        @param query_len length of the query sequence
        @param report_secondary Report the 2nd best alignment if true
        @param report_cigar Report cigar string if true
        """
        res = Res.contents

        # Minimal mandatory parameters
        self.score = res.score
        self.ref_begin = res.ref_begin
        self.ref_end = res.ref_end
        self.query_begin = res.query_begin
        self.query_end = res.query_end

        # Information for sub-optimal match if required and available
        score2 = res.score2
        if report_secondary and score2 != 0:
            self.score2 = score2
            self.ref_end2 = res.ref_end2
        else:
            self.score2 = None
            self.ref_end2 = None

        # Cigar string if required and available
        cigar_len = res.cigarLen
        if report_cigar and cigar_len > 0:
            self.cigar_string = self._cigar_string (res.cigar, cigar_len, query_len)
        else:
            self.cigar_string = None

    #~~~~~~~PRIVATE METHODS~~~~~~~#

    def _cigar_string(self, cigar, cigar_len, query_len):
        """
        Convert cigar and cigarLen into an human readable Cigar string as in SAM files
        @param cigar Pointer to the vector of encoded cigar integers
        @param cigar_len Number of elements in cigar
        @param query_len length of the query sequence
        @return The cigar string, with soft clips added for unaligned query ends
        """
        # Pieces of the cigar string, joined once at the end
        pieces = []

        # If the query match do not start at its first base
        # = introduce a softclip at the beginning
        if self.query_begin > 0:
            pieces.append('{}{}'.format(self.query_begin, "S"))

        # Iterate over the cigar (pointer to a vector of int)
        for i in range(cigar_len):
            op_len = self.cigar_int_to_len(cigar[i])
            op_char = self.cigar_int_to_op(cigar[i])
            # op_char is returned as bytes by ctypes, decode it before formatting
            pieces.append('{}{}'.format(op_len, op_char.decode()))

        # If the length of bases aligned is shorter than the overall query length
        # = introduce a softclip at the end
        end_len = query_len - self.query_end - 1
        if end_len != 0:
            pieces.append('{}{}'.format(end_len, "S"))

        return "".join(pieces)
372 |
--------------------------------------------------------------------------------
/ssw201507/test1.fna:
--------------------------------------------------------------------------------
1 | >IS1X3_leftend
2 | GGATAATGGTGCCAACTTACTGAT
3 |
--------------------------------------------------------------------------------
/ssw201507/test11.fna:
--------------------------------------------------------------------------------
1 | >ISAcma33_left32
2 | GGTCGTGCATCAAAAAAGTGTGGGTTTGTTAA
3 |
--------------------------------------------------------------------------------
/ssw201507/test2.fna:
--------------------------------------------------------------------------------
1 | >IS1X3_rightend
2 | GGTAATGACTCCAACTTACTGATA
3 |
--------------------------------------------------------------------------------
/ssw201507/test22.fna:
--------------------------------------------------------------------------------
1 | >ISAcma33_right32
2 | GATTGTGCGTCAATAAAGTGTGGGATAGTTGA
3 |
--------------------------------------------------------------------------------
/ssw_wrap.py:
--------------------------------------------------------------------------------
1 | """
2 | @package ssw_wrap
3 | @brief Simple python wrapper for SSW align library
4 | To use the dynamic library libssw.so you may need to modify the LD_LIBRARY_PATH environment
5 | variable to include the library directory (export LD_LIBRARY_PATH=$PWD) or for definitive
6 | inclusion of the lib edit /etc/ld.so.conf and add the path or the directory containing the
7 | library and update the cache by using /sbin/ldconfig as root
8 | @copyright [The MIT licence](http://opensource.org/licenses/MIT)
9 | @author Clement & Adrien Leger - 2014
10 | """
11 |
12 | #~~~~~~~GLOBAL IMPORTS~~~~~~~#
13 | # Standard library packages
14 | from ctypes import *
15 |
16 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class CAlignRes(Structure):
    """
    @class  CAlignRes
    @brief  ctypes mirror of the s_align structure declared in ssw.h, i.e. the alignment
            result that the ssw_align C function returns a pointer to.
            Field order and C types must stay in step with the C declaration.
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~Ctype Structure~~~~~~~#
    _fields_ = [
        ('score', c_uint16),            # s_align.score1
        ('score2', c_uint16),           # s_align.score2
        ('ref_begin', c_int32),         # s_align.ref_begin1
        ('ref_end', c_int32),           # s_align.ref_end1
        ('query_begin', c_int32),       # s_align.read_begin1
        ('query_end', c_int32),         # s_align.read_end1
        ('ref_end2', c_int32),          # s_align.ref_end2
        ('cigar', POINTER(c_uint32)),   # s_align.cigar
        ('cigarLen', c_int32),          # s_align.cigarLen
    ]
35 |
36 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
37 | class Aligner(object):
38 | """
39 | @class SSWAligner
40 | @brief Wrapper for SSW align library
41 | """
42 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
43 |
44 | #~~~~~~~CLASS VARIABLES~~~~~~~#
45 |
46 | # Dictionnary to map Nucleotide to int as expected by the SSW C library
47 | base_to_int = { 'A':0, 'C':1, 'G':2, 'T':3, 'N':4, 'a':0, 'c':1, 'g':2, 't':3, 'n':4}
48 | int_to_base = { 0:'A', 1:'C', 2:'G', 3:'T', 4:'N'}
49 |
50 | # Load the ssw library using ctypes
51 | libssw = cdll.LoadLibrary('libssw.so')
52 |
53 | # Init and setup the functions pointer to map the one specified in the SSW lib
54 | # ssw_init method
55 | ssw_init = libssw.ssw_init
56 | ssw_init.restype = c_void_p
57 | ssw_init.argtypes = [POINTER(c_int8), c_int32, POINTER(c_int8), c_int32, c_int8]
58 | # init_destroy function
59 | init_destroy = libssw.init_destroy
60 | init_destroy.restype = None
61 | init_destroy.argtypes = [c_void_p]
62 | # ssw_align function
63 | ssw_align = libssw.ssw_align
64 | ssw_align.restype = POINTER(CAlignRes)
65 | ssw_align.argtypes = [c_void_p, POINTER(c_int8), c_int32, c_uint8, c_uint8, c_uint8, c_uint16, c_int32, c_int32]
66 | # align_destroy function
67 | align_destroy = libssw.align_destroy
68 | align_destroy.restype = None
69 | align_destroy.argtypes = [POINTER(CAlignRes)]
70 |
71 | #~~~~~~~FONDAMENTAL METHODS~~~~~~~#
72 |
73 | def __repr__(self):
74 | msg = self.__str__()
75 | msg += "SCORE PARAMETERS:\n"
76 | msg += " Gap Weight Open: {} Extension: {}\n".format(-self.gap_open, -self.gap_extend)
77 | msg += " Align Weight Match: {} Mismatch: {}\n\n".format(self.match, -self.mismatch)
78 | msg += " Match/mismatch Score matrix\n"
79 | msg += " \tA\tC\tG\tT\tN\n"
80 | msg += " A\t{}\t{}\t{}\t{}\t{}\n".format(self.match, -self.mismatch, -self.mismatch, -self.mismatch, 0)
81 | msg += " C\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, self.match, -self.mismatch, -self.mismatch, 0)
82 | msg += " G\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, -self.mismatch, self.match, -self.mismatch, 0)
83 | msg += " T\t{}\t{}\t{}\t{}\t{}\n".format(-self.mismatch, -self.mismatch, -self.mismatch, self.match, 0)
84 | msg += " N\t{}\t{}\t{}\t{}\t{}\n\n".format(0,0,0,0,0)
85 | msg += "RESULT PARAMETERS:\n"
86 | msg += " Report cigar {}\n".format(self.report_cigar)
87 | msg += " Report secondary match {}\n\n".format(self.report_secondary)
88 | msg += "REFERENCE SEQUENCE :\n"
89 | if self.ref_len <= 50:
90 | msg += "".join([self.int_to_base[i] for i in self.ref_seq])+"\n"
91 | else:
92 | msg += "".join([self.int_to_base[self.ref_seq[i]] for i in range(50)])+"...\n"
93 | msg += " Lenght :{} nucleotides\n".format(self.ref_len)
94 | return msg
95 |
96 | def __str__(self):
97 | return "\n\n".format(self.__class__.__name__, self.__module__)
98 |
99 | def __init__(self,
100 | ref_seq="",
101 | match=2,
102 | mismatch=2,
103 | gap_open=3,
104 | gap_extend=1,
105 | report_secondary=False,
106 | report_cigar=False):
107 | """
108 | Initialize object by creating an interface with ssw library fonctions
109 | A reference sequence is also assigned to the object for multiple alignment against queries
110 | with the align function
111 | @param ref_seq Reference sequence as a python string (case insensitive)
112 | @param match Weight for a match
113 | @param mismatch Absolute value of mismatch penalty
114 | @param gap_open Absolute value of gap open penalty
115 | @param gap_extend Absolute value of gap extend penalty
116 | @param report_secondary Report the 2nd best alignement if true
117 | @param report_cigar Report cigar string if true
118 | """
119 |
120 | # Store overall alignment parameters
121 | self.report_secondary = report_secondary
122 | self.report_cigar = report_cigar
123 |
124 | # Set gap penalties
125 | self.set_gap(gap_open, gap_extend)
126 |
127 | # Set the cost matrix
128 | self.set_mat(match, mismatch)
129 |
130 | # Set the reference sequence
131 | self.set_ref(ref_seq)
132 |
133 | #~~~~~~~SETTERS METHODS~~~~~~~#
134 |
135 | def set_gap(self, gap_open=3, gap_extend=1):
136 | """
137 | Store gapopen and gap extension penalties
138 | """
139 | self.gap_open = gap_open
140 | self.gap_extend = gap_extend
141 |
142 |
143 | def set_mat(self, match=2, mismatch=2):
144 | """
145 | Store match and mismatch scores then initialize a Cost matrix and fill it with match and
146 | mismatch values. Ambiguous base: no penalty
147 | """
148 | self.match = match
149 | self.mismatch = mismatch
150 |
151 | mat_decl = c_int8 * 25
152 | self.mat = mat_decl(match, -mismatch, -mismatch, -mismatch, 0,
153 | -mismatch, match, -mismatch, -mismatch, 0,
154 | -mismatch, -mismatch, match, -mismatch, 0,
155 | -mismatch, -mismatch, -mismatch, match, 0,
156 | 0, 0, 0, 0, 0)
157 |
158 | def set_ref(self, ref_seq):
159 | """
160 | Determine the size of the ref sequence and cast it in a c type integer matrix
161 | """
162 | if ref_seq:
163 | self.ref_len = len(ref_seq)
164 | self.ref_seq = self._DNA_to_int_mat (ref_seq, self.ref_len)
165 | else:
166 | self.ref_len = 0
167 | self.ref_seq = ""
168 |
169 | #~~~~~~~PUBLIC METHODS~~~~~~~#
170 |
def align(self, query_seq, min_score=0, min_len=0):
    """
    Align a query against the reference sequence held by this object.

    @param query_seq Query sequence as a python string; it is encoded with
        self._DNA_to_int_mat before being handed to the C library
    @param min_score Minimal score of the match. None is returned when the
        alignment scores lower than this value
    @param min_len Minimal length of the aligned query span
        (query_end - query_begin + 1). None is returned when the span is shorter
    @return A PyAlignRes object describing the alignment, or None when the
        alignment is filtered out by min_score / min_len

    The query profile and the C result structure are released on every exit
    path, including when the alignment call or the result parsing raises.
    """
    # Determine the size of the query sequence and cast it in a c type integer matrix
    query_len = len(query_seq)
    query_seq = self._DNA_to_int_mat(query_seq, query_len)

    # Create the query profile using the query sequence
    profile = self.ssw_init(query_seq,          # Query seq in c type integers
                            c_int32(query_len), # Length of the query seq
                            self.mat,           # Score matrix
                            5,                  # Square root of the number of elements in mat
                            2)                  # flag = no estimation of the best alignment score

    try:
        # Setup the mask_len parameter = distance between the optimal and suboptimal alignment
        # if < 15, the function will NOT return the suboptimal alignment information
        # Floor division keeps mask_len an integer on both python2.x and python3.x
        mask_len = query_len // 2 if query_len > 30 else 15

        c_result = self.ssw_align(profile,                # Query profile
                                  self.ref_seq,           # Ref seq in c type integers
                                  c_int32(self.ref_len),  # Length of the ref seq
                                  self.gap_open,          # Absolute value of gap open penalty
                                  self.gap_extend,        # Absolute value of gap extend penalty
                                  1,                      # Bitwise FLAG for output values = return all
                                  0,                      # Score filter = return all
                                  0,                      # Distance filter = return all
                                  mask_len)               # Distance between the optimal and suboptimal alignment

        try:
            # Transform the C structure into a python object if score and length
            # match the requirements
            score = c_result.contents.score
            match_len = c_result.contents.query_end - c_result.contents.query_begin + 1

            if score >= min_score and match_len >= min_len:
                return PyAlignRes(c_result, query_len, self.report_secondary, self.report_cigar)
            return None
        finally:
            # A NULL ctypes pointer is falsy; it must never reach the C destructor
            if c_result:
                self._align_destroy(c_result)
    finally:
        # Free the space reserved by ssw_init, whatever happened above
        self._init_destroy(profile)
224 |
225 | #~~~~~~~PRIVATE METHODS~~~~~~~#
226 |
def _DNA_to_int_mat(self, seq, len_seq):
    """
    Cast a python DNA string into a ctypes c_int8 array

    @param seq DNA sequence as a python string
    @param len_seq Number of leading bases of seq to encode; must not exceed len(seq)
    @return A (c_int8 * len_seq) array holding one integer code per base
    @raise IndexError if len_seq is larger than len(seq)
    """
    # Declare the array
    encoded = (c_int8 * len_seq)()

    # Translate each base through self.base_to_int; any base missing from the
    # table is assigned 4, as for N
    base_codes = self.base_to_int
    for i in range(len_seq):
        encoded[i] = base_codes.get(seq[i], 4)

    return encoded
246 |
def _init_destroy(self, profile):
    """
    Free the space allocated for the query profile created by ssw_init

    @param profile Query profile object, as returned by self.ssw_init
    """
    # Thin wrapper: delegates to self.init_destroy (bound outside this view;
    # presumably the libssw init_destroy function -- confirm in the class header)
    self.init_destroy(profile)
252 |
def _align_destroy(self, align):
    """
    Free the space allocated for the alignment result returned by ssw_align

    @param align Alignment result object, as returned by self.ssw_align
    """
    # Thin wrapper: delegates to self.align_destroy (bound outside this view;
    # presumably the libssw align_destroy function -- confirm in the class header)
    self.align_destroy(align)
258 |
259 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class PyAlignRes(object):
    """
    @class  PyAlignRes
    @brief  Extract and verify result from a CAlignRes structure. A comprehensive python
    object is created according to user requirements (+- cigar string and secondary alignment)
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #~~~~~~~CLASS VARIABLES~~~~~~~#

    # Load the ssw library using ctypes
    libssw = cdll.LoadLibrary('libssw.so')

    # Init and setup the functions pointer to map the one specified in the SSW lib
    # cigar_int_to_len function
    cigar_int_to_len = libssw.cigar_int_to_len
    cigar_int_to_len.restype = c_int32
    cigar_int_to_len.argtypes = [c_int32]
    # cigar_int_to_op function
    cigar_int_to_op = libssw.cigar_int_to_op
    cigar_int_to_op.restype = c_char
    cigar_int_to_op.argtypes = [c_int32]

    #~~~~~~~FUNDAMENTAL METHODS~~~~~~~#

    def __repr__(self):
        """
        Return a multi-line report: instance banner, optimal match fields, then
        the cigar string and the sub-optimal match when they were collected.
        """
        parts = [
            self.__str__(),
            "OPTIMAL MATCH\n",
            "Score {}\n".format(self.score),
            "Reference begin {}\n".format(self.ref_begin),
            "Reference end {}\n".format(self.ref_end),
            "Query begin {}\n".format(self.query_begin),
            "Query end {}\n".format(self.query_end),
        ]

        if self.cigar_string:
            parts.append("Cigar_string {}\n".format(self.cigar_string))

        if self.score2:
            parts.append("SUB-OPTIMAL MATCH\n")
            parts.append("Score 2 {}\n".format(self.score2))
            parts.append("Ref_end2 {}\n".format(self.ref_end2))

        return "".join(parts)

    def __str__(self):
        """
        Return a one-line banner naming the class and the module of the instance.
        """
        # The format string previously had no placeholders, so the class and
        # module names passed to format() were silently discarded
        return "\n<Instance of {} from {} >\n".format(self.__class__.__name__, self.__module__)

    def __init__(self, Res, query_len, report_secondary=False, report_cigar=False):
        """
        Parse CAlignRes structure and copy its values in object variables
        @param Res A CAlignRes structure
        @param query_len length of the query sequence
        @param report_secondary Report the 2nd best alignment if true
        @param report_cigar Report cigar string if true
        """
        # Parse value in the C type structure pointer
        contents = Res.contents

        # Minimal mandatory parameters
        self.score = contents.score
        self.ref_begin = contents.ref_begin
        self.ref_end = contents.ref_end
        self.query_begin = contents.query_begin
        self.query_end = contents.query_end

        # Information for sub-optimal match if required and available
        score2 = contents.score2
        if report_secondary and score2 != 0:
            self.score2 = score2
            self.ref_end2 = contents.ref_end2
        else:
            self.score2 = None
            self.ref_end2 = None

        # CIGAR string if required and available
        cigar_len = contents.cigarLen
        if report_cigar and cigar_len > 0:
            self.cigar_string = self._cigar_string(contents.cigar, cigar_len, query_len)
        else:
            self.cigar_string = None

    #~~~~~~~PRIVATE METHODS~~~~~~~#

    def _cigar_string(self, cigar, cigar_len, query_len):
        """
        Convert cigar and cigarLen into a human readable Cigar string as in SAM files
        @param cigar Indexable sequence of packed cigar integers
        @param cigar_len Number of entries to read from cigar
        @param query_len length of the query sequence
        @return The cigar string, with soft clips ("S") added for unaligned query ends
        """
        # Collect the operations, then join once
        ops = []

        # If the query match does not start at its first base
        # = introduce a softclip at the beginning
        if self.query_begin > 0:
            ops.append('{}{}'.format(self.query_begin, "S"))

        # Iterate over the cigar (pointer to a vector of int)
        for i in range(cigar_len):
            op_len = self.cigar_int_to_len(cigar[i])
            # cigar_int_to_op returns a c_char, i.e. bytes on python3.x: decode it
            op_char = self.cigar_int_to_op(cigar[i]).decode()
            ops.append('{}{}'.format(op_len, op_char))

        # If the length of bases aligned is shorter than the overall query length
        # = introduce a softclip at the end
        # (strictly positive only, so a negative length is never written)
        end_len = query_len - self.query_end - 1
        if end_len > 0:
            ops.append('{}{}'.format(end_len, "S"))

        return "".join(ops)
372 |
--------------------------------------------------------------------------------