├── .github └── workflows │ ├── conda-build.yml │ ├── conda-install.yml │ ├── install_notRootLaunch.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── bin └── stringdecomposer ├── requirements.txt ├── setup.py └── stringdecomposer ├── __init__.py ├── __version__.py ├── main.py ├── models └── ont_logreg_model.txt ├── py ├── git.py └── standard_logger.py ├── src ├── edlib.cpp ├── edlib.h └── main.cpp └── test_data ├── DXZ1_star_monomers.fa ├── final_decomposition_fc89af8.tsv └── read.fa /.github/workflows/conda-build.yml: -------------------------------------------------------------------------------- 1 | name: Checking build using conda 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda install -y --file requirements.txt 29 | - name: Build 30 | run: | 31 | make 32 | - name: Run test dataset 33 | run: | 34 | make test_launch 35 | - name: Clean 36 | run: | 37 | make clean 38 | -------------------------------------------------------------------------------- /.github/workflows/conda-install.yml: -------------------------------------------------------------------------------- 1 | name: Checking install using conda 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda install -y --file requirements.txt 29 | - name: Install 30 | run: | 31 | make install 32 | - name: Run test dataset with installed StringDecomposer 33 | run: | 34 | make test_launch_install 35 | - name: Uninstall and clean 36 | run: | 37 | make uninstall 38 | make clean 39 | -------------------------------------------------------------------------------- /.github/workflows/install_notRootLaunch.yml: -------------------------------------------------------------------------------- 1 | name: Install with test launch not in root directory 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 
19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda install -y --file requirements.txt 29 | - name: Install 30 | run: | 31 | make install 32 | make uninstall 33 | - name: Run test dataset 34 | run: | 35 | mkdir test && cd test 36 | make -C .. test_launch 37 | - name: Run test dataset w/ install 38 | run: | 39 | cd test 40 | make -C .. test_launch_install 41 | make -C .. uninstall 42 | - name: Run test dataset w/ binary 43 | run: | 44 | cd test 45 | make -C .. install 46 | stringdecomposer ../stringdecomposer/test_data/read.fa ../stringdecomposer/test_data/DXZ1_star_monomers.fa -o . --second-best 47 | make -C .. uninstall 48 | - name: Run test dataset w/ binary 2 49 | run: | 50 | make install 51 | cd test 52 | stringdecomposer ../stringdecomposer/test_data/read.fa ../stringdecomposer/test_data/DXZ1_star_monomers.fa -o . --second-best 53 | make -C .. uninstall 54 | - name: Run test dataset w/ binary 3 55 | run: | 56 | make install 57 | cd .. 58 | mkdir test && cd test 59 | stringdecomposer ../stringdecomposer/stringdecomposer/test_data/read.fa ../stringdecomposer/stringdecomposer/test_data/DXZ1_star_monomers.fa -o . --second-best 60 | make -C ../stringdecomposer uninstall 61 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Pip build and install 5 | 6 | on: [push, workflow_dispatch] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install flake8 27 | pip install -r requirements.txt 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/github/gitignore/blob/master/Python.gitignore 2 | # Jul 18, 2020. 
Commit 14f8a8b 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # pytype static type analyzer 138 | .pytype/ 139 | 140 | # Cython debug symbols 141 | cython_debug/ 142 | 143 | # Vim swp 144 | *.swp 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | StringDecomposer 2 | Copyright (c) 2020 Saint Petersburg State University 3 | 4 | StringDecomposer is free software; you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License, Version 2, 6 | dated June 1991, as published by the Free Software Foundation. 7 | 8 | StringDecomposer is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 | General Public License for more details. 
12 | 13 | You should have received a copy of the GNU General Public License along 14 | with this program; if not, write to the Free Software Foundation, Inc., 15 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 16 | 17 | 18 | ------------------------------------------------------------------------------- 19 | 20 | Third-party tools used into StringDecomposer: 21 | 22 | 1. StringDecomposer uses edlib library to calculate identities at the final stage of decomposition. 23 | 24 | For more details about edlib library please refer to 25 | https://github.com/Martinsos/edlib and to the following paper: 26 | 27 | Martin Šošić, Mile Šikić; Edlib: a C/C ++ library for fast, exact sequence alignment using edit distance. 28 | Bioinformatics 2017 btw753. doi: 10.1093/bioinformatics/btw753 29 | 30 | ------------------------------------------------------------------------------- 31 | 32 | GNU GENERAL PUBLIC LICENSE 33 | Version 2, June 1991 34 | 35 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 36 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 37 | Everyone is permitted to copy and distribute verbatim copies 38 | of this license document, but changing it is not allowed. 39 | 40 | Preamble 41 | 42 | The licenses for most software are designed to take away your 43 | freedom to share and change it. By contrast, the GNU General Public 44 | License is intended to guarantee your freedom to share and change free 45 | software--to make sure the software is free for all its users. This 46 | General Public License applies to most of the Free Software 47 | Foundation's software and to any other program whose authors commit to 48 | using it. (Some other Free Software Foundation software is covered by 49 | the GNU Lesser General Public License instead.) You can apply it to 50 | your programs, too. 51 | 52 | When we speak of free software, we are referring to freedom, not 53 | price. Our General Public Licenses are designed to make sure that you 54 | have the freedom to distribute copies of free software (and charge for 55 | this service if you wish), that you receive source code or can get it 56 | if you want it, that you can change the software or use pieces of it 57 | in new free programs; and that you know you can do these things. 58 | 59 | To protect your rights, we need to make restrictions that forbid 60 | anyone to deny you these rights or to ask you to surrender the rights. 61 | These restrictions translate to certain responsibilities for you if you 62 | distribute copies of the software, or if you modify it. 63 | 64 | For example, if you distribute copies of such a program, whether 65 | gratis or for a fee, you must give the recipients all the rights that 66 | you have. You must make sure that they, too, receive or can get the 67 | source code. And you must show them these terms so they know their 68 | rights. 69 | 70 | We protect your rights with two steps: (1) copyright the software, and 71 | (2) offer you this license which gives you legal permission to copy, 72 | distribute and/or modify the software. 73 | 74 | Also, for each author's protection and ours, we want to make certain 75 | that everyone understands that there is no warranty for this free 76 | software. If the software is modified by someone else and passed on, we 77 | want its recipients to know that what they have is not the original, so 78 | that any problems introduced by others will not reflect on the original 79 | authors' reputations. 
80 | 81 | Finally, any free program is threatened constantly by software 82 | patents. We wish to avoid the danger that redistributors of a free 83 | program will individually obtain patent licenses, in effect making the 84 | program proprietary. To prevent this, we have made it clear that any 85 | patent must be licensed for everyone's free use or not licensed at all. 86 | 87 | The precise terms and conditions for copying, distribution and 88 | modification follow. 89 | 90 | GNU GENERAL PUBLIC LICENSE 91 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 92 | 93 | 0. This License applies to any program or other work which contains 94 | a notice placed by the copyright holder saying it may be distributed 95 | under the terms of this General Public License. The "Program", below, 96 | refers to any such program or work, and a "work based on the Program" 97 | means either the Program or any derivative work under copyright law: 98 | that is to say, a work containing the Program or a portion of it, 99 | either verbatim or with modifications and/or translated into another 100 | language. (Hereinafter, translation is included without limitation in 101 | the term "modification".) Each licensee is addressed as "you". 102 | 103 | Activities other than copying, distribution and modification are not 104 | covered by this License; they are outside its scope. The act of 105 | running the Program is not restricted, and the output from the Program 106 | is covered only if its contents constitute a work based on the 107 | Program (independent of having been made by running the Program). 108 | Whether that is true depends on what the Program does. 109 | 110 | 1. You may copy and distribute verbatim copies of the Program's 111 | source code as you receive it, in any medium, provided that you 112 | conspicuously and appropriately publish on each copy an appropriate 113 | copyright notice and disclaimer of warranty; keep intact all the 114 | notices that refer to this License and to the absence of any warranty; 115 | and give any other recipients of the Program a copy of this License 116 | along with the Program. 117 | 118 | You may charge a fee for the physical act of transferring a copy, and 119 | you may at your option offer warranty protection in exchange for a fee. 120 | 121 | 2. You may modify your copy or copies of the Program or any portion 122 | of it, thus forming a work based on the Program, and copy and 123 | distribute such modifications or work under the terms of Section 1 124 | above, provided that you also meet all of these conditions: 125 | 126 | a) You must cause the modified files to carry prominent notices 127 | stating that you changed the files and the date of any change. 128 | 129 | b) You must cause any work that you distribute or publish, that in 130 | whole or in part contains or is derived from the Program or any 131 | part thereof, to be licensed as a whole at no charge to all third 132 | parties under the terms of this License. 133 | 134 | c) If the modified program normally reads commands interactively 135 | when run, you must cause it, when started running for such 136 | interactive use in the most ordinary way, to print or display an 137 | announcement including an appropriate copyright notice and a 138 | notice that there is no warranty (or else, saying that you provide 139 | a warranty) and that users may redistribute the program under 140 | these conditions, and telling the user how to view a copy of this 141 | License. 
(Exception: if the Program itself is interactive but 142 | does not normally print such an announcement, your work based on 143 | the Program is not required to print an announcement.) 144 | 145 | These requirements apply to the modified work as a whole. If 146 | identifiable sections of that work are not derived from the Program, 147 | and can be reasonably considered independent and separate works in 148 | themselves, then this License, and its terms, do not apply to those 149 | sections when you distribute them as separate works. But when you 150 | distribute the same sections as part of a whole which is a work based 151 | on the Program, the distribution of the whole must be on the terms of 152 | this License, whose permissions for other licensees extend to the 153 | entire whole, and thus to each and every part regardless of who wrote it. 154 | 155 | Thus, it is not the intent of this section to claim rights or contest 156 | your rights to work written entirely by you; rather, the intent is to 157 | exercise the right to control the distribution of derivative or 158 | collective works based on the Program. 159 | 160 | In addition, mere aggregation of another work not based on the Program 161 | with the Program (or with a work based on the Program) on a volume of 162 | a storage or distribution medium does not bring the other work under 163 | the scope of this License. 164 | 165 | 3. You may copy and distribute the Program (or a work based on it, 166 | under Section 2) in object code or executable form under the terms of 167 | Sections 1 and 2 above provided that you also do one of the following: 168 | 169 | a) Accompany it with the complete corresponding machine-readable 170 | source code, which must be distributed under the terms of Sections 171 | 1 and 2 above on a medium customarily used for software interchange; or, 172 | 173 | b) Accompany it with a written offer, valid for at least three 174 | years, to give any third party, for a charge no more than your 175 | cost of physically performing source distribution, a complete 176 | machine-readable copy of the corresponding source code, to be 177 | distributed under the terms of Sections 1 and 2 above on a medium 178 | customarily used for software interchange; or, 179 | 180 | c) Accompany it with the information you received as to the offer 181 | to distribute corresponding source code. (This alternative is 182 | allowed only for noncommercial distribution and only if you 183 | received the program in object code or executable form with such 184 | an offer, in accord with Subsection b above.) 185 | 186 | The source code for a work means the preferred form of the work for 187 | making modifications to it. For an executable work, complete source 188 | code means all the source code for all modules it contains, plus any 189 | associated interface definition files, plus the scripts used to 190 | control compilation and installation of the executable. However, as a 191 | special exception, the source code distributed need not include 192 | anything that is normally distributed (in either source or binary 193 | form) with the major components (compiler, kernel, and so on) of the 194 | operating system on which the executable runs, unless that component 195 | itself accompanies the executable. 
196 | 197 | If distribution of executable or object code is made by offering 198 | access to copy from a designated place, then offering equivalent 199 | access to copy the source code from the same place counts as 200 | distribution of the source code, even though third parties are not 201 | compelled to copy the source along with the object code. 202 | 203 | 4. You may not copy, modify, sublicense, or distribute the Program 204 | except as expressly provided under this License. Any attempt 205 | otherwise to copy, modify, sublicense or distribute the Program is 206 | void, and will automatically terminate your rights under this License. 207 | However, parties who have received copies, or rights, from you under 208 | this License will not have their licenses terminated so long as such 209 | parties remain in full compliance. 210 | 211 | 5. You are not required to accept this License, since you have not 212 | signed it. However, nothing else grants you permission to modify or 213 | distribute the Program or its derivative works. These actions are 214 | prohibited by law if you do not accept this License. Therefore, by 215 | modifying or distributing the Program (or any work based on the 216 | Program), you indicate your acceptance of this License to do so, and 217 | all its terms and conditions for copying, distributing or modifying 218 | the Program or works based on it. 219 | 220 | 6. Each time you redistribute the Program (or any work based on the 221 | Program), the recipient automatically receives a license from the 222 | original licensor to copy, distribute or modify the Program subject to 223 | these terms and conditions. You may not impose any further 224 | restrictions on the recipients' exercise of the rights granted herein. 225 | You are not responsible for enforcing compliance by third parties to 226 | this License. 227 | 228 | 7. If, as a consequence of a court judgment or allegation of patent 229 | infringement or for any other reason (not limited to patent issues), 230 | conditions are imposed on you (whether by court order, agreement or 231 | otherwise) that contradict the conditions of this License, they do not 232 | excuse you from the conditions of this License. If you cannot 233 | distribute so as to satisfy simultaneously your obligations under this 234 | License and any other pertinent obligations, then as a consequence you 235 | may not distribute the Program at all. For example, if a patent 236 | license would not permit royalty-free redistribution of the Program by 237 | all those who receive copies directly or indirectly through you, then 238 | the only way you could satisfy both it and this License would be to 239 | refrain entirely from distribution of the Program. 240 | 241 | If any portion of this section is held invalid or unenforceable under 242 | any particular circumstance, the balance of the section is intended to 243 | apply and the section as a whole is intended to apply in other 244 | circumstances. 245 | 246 | It is not the purpose of this section to induce you to infringe any 247 | patents or other property right claims or to contest validity of any 248 | such claims; this section has the sole purpose of protecting the 249 | integrity of the free software distribution system, which is 250 | implemented by public license practices. 
Many people have made 251 | generous contributions to the wide range of software distributed 252 | through that system in reliance on consistent application of that 253 | system; it is up to the author/donor to decide if he or she is willing 254 | to distribute software through any other system and a licensee cannot 255 | impose that choice. 256 | 257 | This section is intended to make thoroughly clear what is believed to 258 | be a consequence of the rest of this License. 259 | 260 | 8. If the distribution and/or use of the Program is restricted in 261 | certain countries either by patents or by copyrighted interfaces, the 262 | original copyright holder who places the Program under this License 263 | may add an explicit geographical distribution limitation excluding 264 | those countries, so that distribution is permitted only in or among 265 | countries not thus excluded. In such case, this License incorporates 266 | the limitation as if written in the body of this License. 267 | 268 | 9. The Free Software Foundation may publish revised and/or new versions 269 | of the General Public License from time to time. Such new versions will 270 | be similar in spirit to the present version, but may differ in detail to 271 | address new problems or concerns. 272 | 273 | Each version is given a distinguishing version number. If the Program 274 | specifies a version number of this License which applies to it and "any 275 | later version", you have the option of following the terms and conditions 276 | either of that version or of any later version published by the Free 277 | Software Foundation. If the Program does not specify a version number of 278 | this License, you may choose any version ever published by the Free Software 279 | Foundation. 280 | 281 | 10. If you wish to incorporate parts of the Program into other free 282 | programs whose distribution conditions are different, write to the author 283 | to ask for permission. For software which is copyrighted by the Free 284 | Software Foundation, write to the Free Software Foundation; we sometimes 285 | make exceptions for this. Our decision will be guided by the two goals 286 | of preserving the free status of all derivatives of our free software and 287 | of promoting the sharing and reuse of software generally. 288 | 289 | NO WARRANTY 290 | 291 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 292 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 293 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 294 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 295 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 296 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 297 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 298 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 299 | REPAIR OR CORRECTION. 300 | 301 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 302 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 303 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 304 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 305 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 306 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 307 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 308 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 309 | POSSIBILITY OF SUCH DAMAGES. 310 | 311 | END OF TERMS AND CONDITIONS 312 | 313 | How to Apply These Terms to Your New Programs 314 | 315 | If you develop a new program, and you want it to be of the greatest 316 | possible use to the public, the best way to achieve this is to make it 317 | free software which everyone can redistribute and change under these terms. 318 | 319 | To do so, attach the following notices to the program. It is safest 320 | to attach them to the start of each source file to most effectively 321 | convey the exclusion of warranty; and each file should have at least 322 | the "copyright" line and a pointer to where the full notice is found. 323 | 324 | 325 | Copyright (C) 326 | 327 | This program is free software; you can redistribute it and/or modify 328 | it under the terms of the GNU General Public License as published by 329 | the Free Software Foundation; either version 2 of the License, or 330 | (at your option) any later version. 331 | 332 | This program is distributed in the hope that it will be useful, 333 | but WITHOUT ANY WARRANTY; without even the implied warranty of 334 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 335 | GNU General Public License for more details. 336 | 337 | You should have received a copy of the GNU General Public License along 338 | with this program; if not, write to the Free Software Foundation, Inc., 339 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 340 | 341 | Also add information on how to contact you by electronic and paper mail. 342 | 343 | If the program is interactive, make it output a short notice like this 344 | when it starts in an interactive mode: 345 | 346 | Gnomovision version 69, Copyright (C) year name of author 347 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 348 | This is free software, and you are welcome to redistribute it 349 | under certain conditions; type `show c' for details. 350 | 351 | The hypothetical commands `show w' and `show c' should show the appropriate 352 | parts of the General Public License. Of course, the commands you use may 353 | be called something other than `show w' and `show c'; they could even be 354 | mouse-clicks or menu items--whatever suits your program. 355 | 356 | You should also get your employer (if you work as a programmer) or your 357 | school, if any, to sign a "copyright disclaimer" for the program, if 358 | necessary. Here is a sample; alter the names: 359 | 360 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 361 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 362 | 363 | , 1 April 1989 364 | Ty Coon, President of Vice 365 | 366 | This General Public License does not permit incorporating your program into 367 | proprietary programs. 
If your program is a subroutine library, you may 368 | consider it more useful to permit linking proprietary applications with the 369 | library. If this is what you want to do, use the GNU Lesser General 370 | Public License instead of this License. 371 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | export ROOT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 2 | SD_DIR = $(ROOT_DIR)/stringdecomposer 3 | BUILD_DIR = $(SD_DIR)/build 4 | BIN_DIR = $(BUILD_DIR)/bin 5 | SRC_DIR = $(SD_DIR)/src 6 | 7 | TEST_QUERY = $(SD_DIR)/test_data/read.fa 8 | TEST_MONOMERS = $(SD_DIR)/test_data/DXZ1_star_monomers.fa 9 | TEST_OUTDIR = $(SD_DIR)/test_data 10 | TEST_REFERENCE = $(SD_DIR)/test_data/final_decomposition_fc89af8.tsv 11 | 12 | build: 13 | mkdir -p $(BIN_DIR) 14 | ${CXX} -o $(BIN_DIR)/dp $(SRC_DIR)/main.cpp $(SRC_DIR)/edlib.cpp -fopenmp --std=c++11 -O2 -Wall -Wextra -pedantic -Wshadow -Wfloat-equal 15 | 16 | test_launch: build 17 | bin/stringdecomposer $(TEST_QUERY) $(TEST_MONOMERS) -o $(TEST_OUTDIR) --second-best 18 | grep -q "Thank you for using StringDecomposer!" $(TEST_OUTDIR)/stringdecomposer.log 19 | diff -q $(TEST_REFERENCE) $(TEST_OUTDIR)/final_decomposition.tsv 20 | 21 | install: build 22 | python setup.py install --record install_footprint.txt 23 | 24 | test_launch_install: install 25 | stringdecomposer $(TEST_QUERY) $(TEST_MONOMERS) -o $(TEST_OUTDIR) --second-best 26 | grep -q "Thank you for using StringDecomposer!" $(TEST_OUTDIR)/stringdecomposer.log 27 | diff -q $(TEST_REFERENCE) $(TEST_OUTDIR)/final_decomposition.tsv 28 | 29 | clean: 30 | -rm -rf $(BUILD_DIR) 31 | -rm -rf $(SD_DIR)/test_data/final_decomposition_alt.tsv 32 | -rm -rf $(SD_DIR)/test_data/final_decomposition_raw.tsv 33 | -rm -rf $(SD_DIR)/test_data/final_decomposition.tsv 34 | -rm -rf $(SD_DIR)/test_data/stringdecomposer.log 35 | -rm -rf StringDecomposer.egg-info dist build 36 | 37 | uninstall: 38 | @if [ -f install_footprint.txt ]; then\ 39 | echo "removing install footprint from install_footprint.txt";\ 40 | cat install_footprint.txt | xargs rm -rf;\ 41 | rm -rf install_footprint.txt;\ 42 | fi 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/stringdecomposer/badges/installer/conda.svg)](https://anaconda.org/bioconda/stringdecomposer) 2 | 3 | # StringDecomposer 4 | 5 | ## Version 1.1.2 6 | 7 | As an input StringDecomposer algorithm takes the set of monomers (typically, alpha satellites) and a genomic segment (assembly, Oxford Nanopore or a PacBio HiFi read) that contains a tandem repeat consisting of the given monomers. 8 | StringDecomposer partitions this segment into distinct monomers, providing an accurate translation from the nucleotide alphabet into the monomer alphabet. 9 | 10 | 11 | ## Installation 12 | 13 | The recommended way to install StringDecomposer is with conda package manager: 14 | ``` 15 | conda install -c bioconda stringdecomposer 16 | ``` 17 | 18 | 19 | Alternatively, StringDecomposer can be build and installed from source. 
20 | 21 | Requirements: 22 | - Python3.5+ 23 | - [biopython](https://biopython.org/wiki/Download) 24 | - [pandas](https://pypi.org/project/pandas/) 25 | - [python-edlib](https://pypi.org/project/edlib/) 26 | - [setuptools](https://pypi.org/project/setuptools/) 27 | - g++ (version 5.3.1 or higher) 28 | 29 | The required python packages can be installed through conda using 30 | 31 | conda install --file requirements.txt 32 | 33 | Local building without installation: 34 | 35 | git clone https://github.com/ablab/stringdecomposer.git 36 | cd stringdecomposer 37 | make 38 | 39 | Then, StringDecomposer is available as 40 | 41 | bin/stringdecomposer 42 | 43 | 44 | Installing from source: 45 | 46 | git clone https://github.com/ablab/stringdecomposer.git 47 | cd stringdecomposer 48 | make install 49 | 50 | Then, StringDecomposer is available as 51 | 52 | stringdecomposer 53 | 54 | Removal of StringDecomposer installed from source: 55 | 56 | make uninstall 57 | 58 | ## Quick start 59 | The following command assumes that StringDecomposer is either installed through conda or from source. 60 | 61 | stringdecomposer ./stringdecomposer/test_data/read.fa ./stringdecomposer/test_data/DXZ1_star_monomers.fa -o ./stringdecomposer/test_data 62 | 63 | The same result can be achieved with `make test_launch` (for local build without installation) and 64 | `make test_launch_install` (for installed from source or via conda). 65 | These `make` rules ensure correctness of StringDecomposer's output on the test dataset. 66 | 67 | In case StringDecomposer is built locally, the command that achieves the same result is 68 | 69 | ./bin/stringdecomposer ./stringdecomposer/test_data/read.fa ./stringdecomposer/test_data/DXZ1_star_monomers.fa -o ./stringdecomposer/test_data 70 | 71 | Results can be found in 72 | 73 | ./stringdecomposer/test_data/final_decomposition.tsv final decomposition of sequences to monomer alphabet 74 | ./stringdecomposer/test_data/final_decomposition_alt.tsv final decomposition of sequences to monomer alphabet with alternative monomers for each position 75 | ./stringdecomposer/test_data/final_decomposition_raw.tsv raw decomposition with initial dynamic programming scores instead of identities 76 | 77 | Each line in final_decomposition.tsv file has the following form: 78 | 79 | 80 | 81 | `homo`-related columns represent statistics of the best-scoring (second-best-scoring) monomer after compression of homopolymer runs in both the monomer and the target read. 82 | Reliability is either equal to `?` (signifies unreliable alignment which can be caused by a retrotransposon insertion or a poor quality segment of a read) or `+` (if the alignment is reliable). 83 | The columns ``, ``, ``, and `_homo_`-related columns will have values `None` and `-1` unless the user supplies the argument `--second-best` (see Synopsis below). 
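Each such row of `final_decomposition.tsv` is a single tab-separated line. As a sketch of the field order, reconstructed from `print_read` in `stringdecomposer/main.py` (the labels below are descriptive names chosen for this explanation, not identifiers used by the tool itself):

    <read-name>  <best monomer>  <start>  <end>  <identity>
    <second-best monomer>  <second-best identity>
    <homo best monomer>  <homo best identity>
    <homo second-best monomer>  <homo second-best identity>
    <reliability>

Identities are printed with two decimal places, so a row looks roughly like the following (illustrative values only, not taken from the test data):

    read_1  monomer_A  0  170  98.25  monomer_B  91.10  monomer_A  98.80  monomer_B  92.00  +

Because the file has no header line, it can be loaded with explicit column names. A minimal sketch, assuming the twelve-column layout above and column names chosen here for illustration:

    import pandas as pd

    COLUMNS = ["read", "monomer", "start", "end", "identity",
               "second_best", "second_best_identity",
               "homo_best", "homo_best_identity",
               "homo_second_best", "homo_second_best_identity",
               "reliability"]

    # final_decomposition.tsv is written without a header, so pass the names explicitly.
    decomposition = pd.read_csv("final_decomposition.tsv", sep="\t", names=COLUMNS)

    # Keep only alignments flagged as reliable ('+'); '?' marks unreliable segments.
    reliable = decomposition[decomposition["reliability"] == "+"]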
84 | 85 | 86 | ## Synopsis 87 | 88 | stringdecomposer [-h] [-t THREADS] [-o OUT_FILE] [-i MIN_IDENTITY] [-s SCORING] [-b BATCH_SIZE] [--second-best] sequences monomers 89 | 90 | Required arguments: 91 | 92 | sequences fasta-file with long reads or genomic sequences (accepts multiple sequences in one file) 93 | monomers fasta-file with monomers 94 | 95 | Optional arguments: 96 | 97 | -h, --help show this help message and exit 98 | 99 | -t THREADS, --threads THREADS number of threads (by default 1) 100 | 101 | -o OUT_FILE, --out-file OUT_FILE output tsv-file (by default final_decomposition.tsv) 102 | 103 | -i MIN_IDENTITY, --min-identity MIN_IDENTITY only monomer alignments with percent identity >= MIN_IDENTITY are printed (by default MIN_IDENTITY=0%) 104 | 105 | -s SCORING, --scoring SCORING set scoring scheme for StringDecomposer in the format "insertion,deletion,mismatch,match" (by default "-1,-1,-1,1") 106 | 107 | -b BATCH_SIZE, --batch-size BATCH_SIZE set size of the batch in parallelization (by default 5000) 108 | 109 | --second-best StringDecomposer will generate , , and _homo_-related columns (not recommended when running StringDecomposer of a large number of monomers) 110 | 111 | ## Latest updates 112 | 113 | ### StringDecomposer 1.1.2 release (12 Oct 2021) 114 | 115 | * Remove building with Address Sanitizer by default 116 | 117 | ### StringDecomposer 1.1.1 release (20 July 2021) 118 | 119 | * git hash is disabled to enable execution outside of git repo 120 | 121 | ### StringDecomposer 1.1 release (28 June 2021) 122 | 123 | * CI support via github actions 124 | * improved build and installation 125 | * removal of unnecessary dependencies 126 | * py module of StringDecomposer saves commit hash and has a logger 127 | 128 | ### StringDecomposer 1.0 release (11 August 2020) 129 | * initial StringDecomposer release 130 | * conda support 131 | * results of StringDecomposer monomer annotation for available centromere assemblies and ONT and Hifi reads of cen6, cen8, and cenX can be found at [Figshare](https://doi.org/10.6084/m9.figshare.12783371) 132 | 133 | 134 | ## Citation 135 | 136 | The String Decomposition Problem and its Applications to Centromere Analysis and Assembly. *Tatiana Dvorkina, Andrey V. Bzikadze, Pavel A. 
Pevzner* Bioinformatics, Volume 36, Issue Supplement_1, July 2020, Pages i93–i101; doi: [https://doi.org/10.1093/bioinformatics/btaa454](https://doi.org/10.1093/bioinformatics/btaa454) 137 | 138 | ## Contact 139 | 140 | In case of any issues please use [issue tracker](https://github.com/ablab/stringdecomposer/issues) or email directly to [t.dvorkina@spbu.ru](mailto:t.dvorkina@spbu.ru) 141 | -------------------------------------------------------------------------------- /bin/stringdecomposer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | 6 | sd_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir) 7 | sys.path.insert(0, sd_root) 8 | 9 | from stringdecomposer.main import main 10 | sys.exit(main()) 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | pandas 3 | python-edlib 4 | setuptools 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | 6 | try: 7 | import setuptools 8 | except ImportError: 9 | sys.exit("setuptools package not found. " 10 | "Please use 'pip install setuptools' first") 11 | 12 | from setuptools import setup 13 | from distutils.command.build import build as DistutilsBuild 14 | from distutils.spawn import find_executable 15 | 16 | from stringdecomposer.__version__ import __version__ 17 | 18 | 19 | # Make sure we're running from the setup.py directory. 20 | script_dir = os.path.dirname(os.path.realpath(__file__)) 21 | if script_dir != os.getcwd(): 22 | os.chdir(script_dir) 23 | 24 | 25 | requirements_fn = os.path.join(script_dir, 'requirements.txt') 26 | requirements = [] 27 | with open(requirements_fn) as f: 28 | for line in f: 29 | line = line.strip() 30 | if line == 'python-edlib': 31 | requirements.append('edlib') 32 | else: 33 | requirements.append(line) 34 | 35 | 36 | description = \ 37 | """ 38 | StringDecomposer (SD) algorithm takes the set of monomers 39 | and a long error-prone read (or a genomic segment) 40 | and partitions this read into distinct monomers, 41 | providing an accurate translation of each read 42 | from a nucleotide alphabet into a monomer alphabet. 
43 | """ 44 | 45 | 46 | class MakeBuild(DistutilsBuild): 47 | def run(self): 48 | if not find_executable("make"): 49 | sys.exit("ERROR: 'make' command is unavailable") 50 | try: 51 | subprocess.check_call(["make"]) 52 | except subprocess.CalledProcessError as e: 53 | sys.exit("Compilation error: ", e) 54 | DistutilsBuild.run(self) 55 | 56 | 57 | setup( 58 | name="StringDecomposer", 59 | version=__version__, 60 | description=description, 61 | url='https://github.com/ablab/stringdecomposer', 62 | author='Tatiana Dvorkina', 63 | author_email='tanunia@gmail.com', 64 | license='GNU General Public License v2.0', 65 | install_requires=requirements, 66 | packages=['stringdecomposer'], 67 | package_dir={'stringdecomposer': 'stringdecomposer'}, 68 | package_data={'stringdecomposer': ['build/bin/dp', 'models/*', '*', 'py/*']}, 69 | entry_points={ 70 | 'console_scripts': ['stringdecomposer=stringdecomposer.main:main'] 71 | }, 72 | cmdclass={'build': MakeBuild} 73 | ) 74 | -------------------------------------------------------------------------------- /stringdecomposer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ablab/stringdecomposer/0a967f6bf131face88397445ca49b65fee49489c/stringdecomposer/__init__.py -------------------------------------------------------------------------------- /stringdecomposer/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.2" 2 | -------------------------------------------------------------------------------- /stringdecomposer/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import pathlib 6 | import subprocess 7 | import sys 8 | 9 | from Bio import SeqIO 10 | from Bio.SeqRecord import SeqRecord 11 | import edlib 12 | import pandas as pd 13 | import re 14 | 15 | from stringdecomposer.py.standard_logger import get_logger 16 | from stringdecomposer.py.git import get_git_revision_short_hash 17 | 18 | 19 | CUR_FILE = os.path.abspath(__file__) 20 | CUR_DIR = os.path.dirname(CUR_FILE) 21 | SD_BIN = os.path.join(CUR_DIR, 'build', 'bin', 'dp') 22 | LOGREG_FILE = os.path.join(CUR_DIR, 23 | 'models', 24 | 'ont_logreg_model.txt') 25 | with open(LOGREG_FILE) as f: 26 | LR_MODEL_COEF = list(map(float, f.readline().strip().split())) 27 | 28 | 29 | def edist(lst): 30 | if len(str(lst[0])) == 0: 31 | return -1, "" 32 | if len(str(lst[1])) == 0: 33 | return -1, "" 34 | result = edlib.align(str(lst[0]), str(lst[1]), mode="NW", task="path") 35 | return result["editDistance"], result["cigar"] 36 | 37 | 38 | def aai(ar): 39 | p1, p2 = str(ar[0]), str(ar[1]) 40 | if p1.endswith("*"): 41 | p1 = p1[:-1] 42 | if p2.endswith("*"): 43 | p2 = p2[:-1] 44 | ed, cigar = edist([str(p1), str(p2)]) 45 | if ed == -1: 46 | return 0 47 | total_length = 0 #max(len(p1), len(p2)) 48 | n = 0 49 | for c in cigar: 50 | if c.isdigit(): 51 | n = n*10 + int(c) 52 | else: 53 | total_length += n 54 | n = 0 55 | matches = re.findall(r'\d+=', cigar) 56 | aai = 0.0 57 | for m in matches: 58 | aai += int(m[:-1]) 59 | aai /= total_length 60 | return aai*100 61 | 62 | 63 | def load_fasta(filename, tp = "list"): 64 | if tp == "map": 65 | records = SeqIO.to_dict(SeqIO.parse(filename, "fasta")) 66 | for r in records: 67 | records[r] = records[r].upper() 68 | else: 69 | records = list(SeqIO.parse(filename, "fasta")) 70 | for i in range(len(records)): 71 | records[i] = 
records[i].upper() 72 | return records 73 | 74 | 75 | def make_record(seq, name, sid, d=""): 76 | return SeqRecord(seq, id=sid, name=name, description = d) 77 | 78 | 79 | def add_rc_monomers(monomers): 80 | res = [] 81 | for m in monomers: 82 | res.append(m) 83 | res.append(make_record(m.seq.reverse_complement(), m.name + "'", m.id + "'")) 84 | return res 85 | 86 | 87 | def convert_to_homo(seq): 88 | res = "" 89 | for c in seq: 90 | if len(res) == 0 or res[-1] != c: 91 | res += c 92 | return res 93 | 94 | 95 | def classify(reads_mapping): 96 | df = pd.DataFrame(reads_mapping) 97 | df["idnt_diff"] = df["score"] - df["second_best_score"] 98 | X = pd.concat([df["score"], df["idnt_diff"]], axis=1, keys = ["idnt", "idnt_diff"]) 99 | X.insert(0, 'intercept', 1) 100 | y_pred = (X.dot(LR_MODEL_COEF)) > 0 101 | for i in range(len(reads_mapping)): 102 | if y_pred[i] != 1: 103 | reads_mapping[i]["q"] = "?" 104 | return reads_mapping 105 | 106 | 107 | def convert_read(decomposition, read, monomers, light = False): 108 | res = [] 109 | for d in decomposition: 110 | monomer, start, end = d["m"], d["start"], d["end"] 111 | if light: 112 | scores = {} 113 | for m in monomers: 114 | if m.name == monomer: 115 | score = aai([read.seq[start:end + 1], m.seq]) 116 | scores[m.name] = score 117 | res.append({"m": monomer, "start": str(d["start"]), "end": str(d["end"]), "score": scores[monomer], \ 118 | "second_best": "None", "second_best_score": -1,\ 119 | "homo_best": "None", "homo_best_score": -1,\ 120 | "homo_second_best": "None", "homo_second_best_score": -1,\ 121 | "alt": {}, "q": "+"}) 122 | else: 123 | scores = {} 124 | for m in monomers: 125 | score = aai([read.seq[start:end + 1], m.seq]) 126 | scores[m.name] = score 127 | if monomer == None: 128 | for s in scores: 129 | if monomer == None or scores[s] > scores[monomer]: 130 | monomer = s 131 | secondbest, secondbest_score = None, -1 132 | for m in scores: 133 | if m != monomer: # and abs(scores[m] - scores[monomer]) < 5: 134 | if not secondbest or secondbest_score < scores[m]: 135 | secondbest, secondbest_score = m, scores[m] 136 | 137 | homo_scores = [] 138 | homo_subseq = convert_to_homo(read.seq[start:end + 1]) 139 | for m in monomers: 140 | score = aai([homo_subseq, convert_to_homo(m.seq)]) 141 | homo_scores.append([m.name, score]) 142 | homo_scores = sorted(homo_scores, key = lambda x: -x[1]) 143 | res.append({"m": monomer, "start": str(d["start"]), "end": str(d["end"]), "score": scores[monomer], \ 144 | "second_best": str(secondbest), "second_best_score": secondbest_score,\ 145 | "homo_best": homo_scores[0][0], "homo_best_score": homo_scores[0][1],\ 146 | "homo_second_best": homo_scores[1][0], "homo_second_best_score": homo_scores[1][1],\ 147 | "alt": scores, "q": "+"}) 148 | 149 | res = classify(res) 150 | return res 151 | 152 | 153 | def print_read(fout, fout_alt, dec, read, monomers, identity_th, light): 154 | dec = convert_read(dec, read, monomers, light) 155 | for d in dec: 156 | if d["score"] >= identity_th: 157 | fout.write("\t".join([read.name, d["m"], d["start"], d["end"], "{:.2f}".format(d["score"]), \ 158 | d["second_best"], "{:.2f}".format(d["second_best_score"]), \ 159 | d["homo_best"], "{:.2f}".format(d["homo_best_score"]), \ 160 | d["homo_second_best"], "{:.2f}".format(d["homo_second_best_score"]), d["q"]]) + "\n") 161 | for a in d["alt"]: 162 | star = "-" 163 | if a == d["m"]: 164 | star = "*" 165 | fout_alt.write("\t".join([read.name, a, d["start"], d["end"], "{:.2f}".format(d["alt"][a]), star]) + "\n") 166 | 167 | 168 | def 
convert_tsv(decomposition, reads, monomers, outfile, identity_th, light): 169 | with open(outfile[:-len(".tsv")] + "_alt.tsv", "w") as fout_alt: 170 | with open(outfile, "w") as fout: 171 | cur_dec = [] 172 | prev_read = None 173 | for ln in decomposition.split("\n")[:-1]: 174 | read, monomer, start, end = ln.split("\t")[:4] 175 | read = read.split()[0] 176 | monomer = monomer.split()[0] 177 | if read != prev_read and prev_read != None: 178 | print_read(fout, fout_alt, cur_dec, reads[prev_read], monomers, identity_th, light) 179 | cur_dec = [] 180 | prev_read = read 181 | start, end = int(start), int(end) 182 | cur_dec.append({"m": monomer, "start": start, "end": end}) 183 | if len(cur_dec) > 0: 184 | print_read(fout, fout_alt, cur_dec, reads[prev_read], monomers, identity_th, light) 185 | 186 | def run(sequences, monomers, num_threads, scoring, batch_size, raw_file, ed_thr, overlap, logger): 187 | ins, dels, mm, match = scoring.split(",") 188 | if not os.path.isfile(SD_BIN): 189 | logger.info('The binary of String Decomposer is not available. Did you forget to run `make`? Aborting.') 190 | sys.exit(1) 191 | 192 | logger.info(' '.join(["Run", SD_BIN, "with parameters", sequences, monomers, str(num_threads), str(batch_size), str(overlap), scoring])) 193 | with open(raw_file, 'w') as f: 194 | subprocess.run([SD_BIN, sequences, monomers, num_threads, batch_size, overlap, ins, dels, mm, match, str(ed_thr)], stdout = f, check = True) 195 | with open(raw_file, 'r') as f: 196 | raw_decomposition = "".join(f.readlines()) 197 | return raw_decomposition 198 | 199 | 200 | 201 | def main(): 202 | parser = argparse.ArgumentParser(description='Decomposes string into blocks alphabet') 203 | parser.add_argument('sequences', help='fasta-file with long reads or genomic sequences') 204 | parser.add_argument('monomers', help='fasta-file with monomers') 205 | parser.add_argument('-t', '--threads', help='number of threads (by default 1)', default="1", required=False) 206 | parser.add_argument('-o', '--out-dir', help='output directory (by default .)', default=".", required=False) 207 | parser.add_argument('--out-file', help='output tsv-file (by default "final_decomposition")', default="final_decomposition", required=False) 208 | parser.add_argument('-i', '--min-identity', \ 209 | help='only monomer alignments with percent identity >= MIN_IDENTITY are printed (by default MIN_IDENTITY=0)', type=int, default=0, required=False) 210 | parser.add_argument('-s', '--scoring', \ 211 | help='set scoring scheme for SD in the format "insertion,deletion,mismatch,match" (by default "-1,-1,-1,1")', default="-1,-1,-1,1", required=False) 212 | parser.add_argument('-b', '--batch-size', help='set size of the batch in parallelization (by default 5000)', type=str, default="5000", required=False) 213 | parser.add_argument('--second-best', dest="second_best", help='generate second best monomer and homopolymer scores', action="store_true") 214 | parser.add_argument('--ed_thr', help='align only monomers with edit distance less then ed_thr for each segment (by default align all monomers)', default=-1, 215 | type=int, required=False) 216 | parser.add_argument('-v', '--overlap', help='set size of batch overlap (by default 500)', type=str, default="500", required=False) 217 | args = parser.parse_args() 218 | pathlib.Path(args.out_dir).mkdir(parents=True, exist_ok=True) 219 | 220 | logfn = os.path.join(args.out_dir, 'stringdecomposer.log') 221 | logger = get_logger(logfn, logger_name='StringDecomposer') 222 | 223 | logger.info(f'cmd: 
{sys.argv}') 224 | # TODO get_git_revision_short_hash is commented out 225 | # since it does not work when stringdecomposer is run from outside of repo 226 | # logger.info(f'git hash: {get_git_revision_short_hash()}') 227 | 228 | raw_decomp_fn = os.path.join(args.out_dir, args.out_file + "_raw.tsv") 229 | raw_decomposition = run(args.sequences, args.monomers, args.threads, args.scoring, args.batch_size, raw_decomp_fn, args.ed_thr, args.overlap, logger) 230 | logger.info("Saved raw decomposition to " + raw_decomp_fn) 231 | 232 | reads = load_fasta(args.sequences, "map") 233 | monomers = load_fasta(args.monomers) 234 | monomers = add_rc_monomers(monomers) 235 | logger.info("Transforming raw alignments...") 236 | 237 | convert_tsv_fn = os.path.join(args.out_dir, args.out_file + ".tsv") 238 | convert_tsv(raw_decomposition, reads, monomers, convert_tsv_fn, int(args.min_identity), not args.second_best) 239 | logger.info("Transformation finished. Results can be found in " + convert_tsv_fn) 240 | 241 | logger.info("Thank you for using StringDecomposer!") 242 | 243 | 244 | if __name__ == "__main__": 245 | main() 246 | -------------------------------------------------------------------------------- /stringdecomposer/models/ont_logreg_model.txt: -------------------------------------------------------------------------------- 1 | -31.48494996 0.41784018 0.69186882 2 | -------------------------------------------------------------------------------- /stringdecomposer/py/git.py: -------------------------------------------------------------------------------- 1 | # (c) 2020 by Authors 2 | # This file is a part of centroFlye program. 3 | # Released under the BSD license (see LICENSE file) 4 | 5 | import subprocess 6 | 7 | 8 | # Disclaimer: this code can only be run from the git repo and thus should not 9 | # be used in scripts intended for installation 10 | 11 | 12 | def get_git_revision_hash(): 13 | return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip() 14 | 15 | 16 | def get_git_revision_short_hash(): 17 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() 18 | -------------------------------------------------------------------------------- /stringdecomposer/py/standard_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def get_logger(filename, 6 | logger_name='StringDecomposer', 7 | level=logging.INFO, 8 | filemode='a', 9 | stdout=True): 10 | logger = logging.getLogger(logger_name) 11 | logger.setLevel(level) 12 | 13 | # create the logging file handler 14 | fh = logging.FileHandler(filename, mode=filemode) 15 | 16 | formatter = logging.Formatter( 17 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | fh.setFormatter(formatter) 19 | 20 | # add handler to logger object 21 | logger.addHandler(fh) 22 | 23 | if stdout: 24 | sh = logging.StreamHandler(sys.stdout) 25 | sh.setFormatter(formatter) 26 | logger.addHandler(sh) 27 | 28 | return logger 29 | -------------------------------------------------------------------------------- /stringdecomposer/src/edlib.cpp: -------------------------------------------------------------------------------- 1 | #include "edlib.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | typedef uint64_t Word; 13 | static const int WORD_SIZE = sizeof(Word) * 8; // Size of Word in bits 14 | static const Word WORD_1 = static_cast(1); 15 | static 
const Word HIGH_BIT_MASK = WORD_1 << (WORD_SIZE - 1); // 100..00 16 | static const int MAX_UCHAR = 255; 17 | 18 | // Data needed to find alignment. 19 | struct AlignmentData { 20 | Word* Ps; 21 | Word* Ms; 22 | int* scores; 23 | int* firstBlocks; 24 | int* lastBlocks; 25 | 26 | AlignmentData(int maxNumBlocks, int targetLength) { 27 | // We build a complete table and mark first and last block for each column 28 | // (because algorithm is banded so only part of each columns is used). 29 | // TODO: do not build a whole table, but just enough blocks for each column. 30 | Ps = new Word[maxNumBlocks * targetLength]; 31 | Ms = new Word[maxNumBlocks * targetLength]; 32 | scores = new int[maxNumBlocks * targetLength]; 33 | firstBlocks = new int[targetLength]; 34 | lastBlocks = new int[targetLength]; 35 | } 36 | 37 | ~AlignmentData() { 38 | delete[] Ps; 39 | delete[] Ms; 40 | delete[] scores; 41 | delete[] firstBlocks; 42 | delete[] lastBlocks; 43 | } 44 | }; 45 | 46 | struct Block { 47 | Word P; // Pvin 48 | Word M; // Mvin 49 | int score; // score of last cell in block; 50 | 51 | Block() {} 52 | Block(Word p, Word m, int s) :P(p), M(m), score(s) {} 53 | }; 54 | 55 | 56 | /** 57 | * Defines equality relation on alphabet characters. 58 | * By default each character is always equal only to itself, but you can also provide additional equalities. 59 | */ 60 | class EqualityDefinition { 61 | private: 62 | bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1]; 63 | public: 64 | EqualityDefinition(const string& alphabet, 65 | const EdlibEqualityPair* additionalEqualities = NULL, 66 | const int additionalEqualitiesLength = 0) { 67 | for (int i = 0; i < static_cast(alphabet.size()); i++) { 68 | for (int j = 0; j < static_cast(alphabet.size()); j++) { 69 | matrix[i][j] = (i == j); 70 | } 71 | } 72 | if (additionalEqualities != NULL) { 73 | for (int i = 0; i < additionalEqualitiesLength; i++) { 74 | size_t firstTransformed = alphabet.find(additionalEqualities[i].first); 75 | size_t secondTransformed = alphabet.find(additionalEqualities[i].second); 76 | if (firstTransformed != string::npos && secondTransformed != string::npos) { 77 | matrix[firstTransformed][secondTransformed] = matrix[secondTransformed][firstTransformed] = true; 78 | } 79 | } 80 | } 81 | } 82 | 83 | /** 84 | * @param a Element from transformed sequence. 85 | * @param b Element from transformed sequence. 86 | * @return True if a and b are defined as equal, false otherwise. 
87 | */ 88 | bool areEqual(unsigned char a, unsigned char b) const { 89 | return matrix[a][b]; 90 | } 91 | }; 92 | 93 | static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks, 94 | int queryLength, 95 | const unsigned char* target, int targetLength, 96 | int k, EdlibAlignMode mode, 97 | int* bestScore_, int** positions_, int* numPositions_); 98 | 99 | static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks, 100 | int queryLength, 101 | const unsigned char* target, int targetLength, 102 | int k, int* bestScore_, 103 | int* position_, bool findAlignment, 104 | AlignmentData** alignData, int targetStopPosition); 105 | 106 | 107 | static int obtainAlignment( 108 | const unsigned char* query, const unsigned char* rQuery, int queryLength, 109 | const unsigned char* target, const unsigned char* rTarget, int targetLength, 110 | const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore, 111 | unsigned char** alignment, int* alignmentLength); 112 | 113 | static int obtainAlignmentHirschberg( 114 | const unsigned char* query, const unsigned char* rQuery, int queryLength, 115 | const unsigned char* target, const unsigned char* rTarget, int targetLength, 116 | const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore, 117 | unsigned char** alignment, int* alignmentLength); 118 | 119 | static int obtainAlignmentTraceback(int queryLength, int targetLength, 120 | int bestScore, const AlignmentData* alignData, 121 | unsigned char** alignment, int* alignmentLength); 122 | 123 | static string transformSequences(const char* queryOriginal, int queryLength, 124 | const char* targetOriginal, int targetLength, 125 | unsigned char** queryTransformed, 126 | unsigned char** targetTransformed); 127 | 128 | static inline int ceilDiv(int x, int y); 129 | 130 | static inline unsigned char* createReverseCopy(const unsigned char* seq, int length); 131 | 132 | static inline Word* buildPeq(const int alphabetLength, 133 | const unsigned char* query, 134 | const int queryLength, 135 | const EqualityDefinition& equalityDefinition); 136 | 137 | 138 | /** 139 | * Main edlib method. 140 | */ 141 | extern "C" EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength, 142 | const char* const targetOriginal, const int targetLength, 143 | const EdlibAlignConfig config) { 144 | EdlibAlignResult result; 145 | result.status = EDLIB_STATUS_OK; 146 | result.editDistance = -1; 147 | result.endLocations = result.startLocations = NULL; 148 | result.numLocations = 0; 149 | result.alignment = NULL; 150 | result.alignmentLength = 0; 151 | result.alphabetLength = 0; 152 | 153 | /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/ 154 | unsigned char* query, * target; 155 | string alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength, 156 | &query, &target); 157 | result.alphabetLength = static_cast(alphabet.size()); 158 | /*-------------------------------------------------------*/ 159 | 160 | // Handle special situation when at least one of the sequences has length 0. 
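// Illustrative expectations for the zero-length branch below (a sketch, not part of the
// upstream edlib sources): with an empty query, NW mode reports the full target length as
// the distance and the last target position as the end location, while SHW/HW report a
// distance of 0 with end location -1.
//
//   EdlibAlignResult r = edlibAlign("", 0, "ACGT", 4, edlibDefaultAlignConfig()); // NW mode
//   // r.editDistance == 4, r.numLocations == 1, r.endLocations[0] == 3
//   edlibFreeAlignResult(r);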
161 | if (queryLength == 0 || targetLength == 0) { 162 | if (config.mode == EDLIB_MODE_NW) { 163 | result.editDistance = std::max(queryLength, targetLength); 164 | result.endLocations = static_cast(malloc(sizeof(int) * 1)); 165 | result.endLocations[0] = targetLength - 1; 166 | result.numLocations = 1; 167 | } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) { 168 | result.editDistance = queryLength; 169 | result.endLocations = static_cast(malloc(sizeof(int) * 1)); 170 | result.endLocations[0] = -1; 171 | result.numLocations = 1; 172 | } else { 173 | result.status = EDLIB_STATUS_ERROR; 174 | } 175 | 176 | free(query); 177 | free(target); 178 | return result; 179 | } 180 | 181 | /*--------------------- INITIALIZATION ------------------*/ 182 | int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers 183 | int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks 184 | EqualityDefinition equalityDefinition(alphabet, config.additionalEqualities, config.additionalEqualitiesLength); 185 | Word* Peq = buildPeq(static_cast(alphabet.size()), query, queryLength, equalityDefinition); 186 | /*-------------------------------------------------------*/ 187 | 188 | /*------------------ MAIN CALCULATION -------------------*/ 189 | // TODO: Store alignment data only after k is determined? That could make things faster. 190 | int positionNW; // Used only when mode is NW. 191 | AlignmentData* alignData = NULL; 192 | bool dynamicK = false; 193 | int k = config.k; 194 | if (k < 0) { // If valid k is not given, auto-adjust k until solution is found. 195 | dynamicK = true; 196 | k = WORD_SIZE; // Gives better results than smaller k. 197 | } 198 | 199 | do { 200 | if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) { 201 | myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks, 202 | queryLength, target, targetLength, 203 | k, config.mode, &(result.editDistance), 204 | &(result.endLocations), &(result.numLocations)); 205 | } else { // mode == EDLIB_MODE_NW 206 | myersCalcEditDistanceNW(Peq, W, maxNumBlocks, 207 | queryLength, target, targetLength, 208 | k, &(result.editDistance), &positionNW, 209 | false, &alignData, -1); 210 | } 211 | k *= 2; 212 | } while(dynamicK && result.editDistance == -1); 213 | 214 | if (result.editDistance >= 0) { // If there is solution. 215 | // If NW mode, set end location explicitly. 216 | if (config.mode == EDLIB_MODE_NW) { 217 | result.endLocations = static_cast(malloc(sizeof(int) * 1)); 218 | result.endLocations[0] = targetLength - 1; 219 | result.numLocations = 1; 220 | } 221 | 222 | // Find starting locations. 223 | if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) { 224 | result.startLocations = static_cast(malloc(result.numLocations * sizeof(int))); 225 | if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations. 226 | const unsigned char* rTarget = createReverseCopy(target, targetLength); 227 | const unsigned char* rQuery = createReverseCopy(query, queryLength); 228 | // Peq for reversed query. 
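// Start locations are recovered with a second, reversed search: for each end location e,
// the reversed query is aligned in SHW mode against the reverse of target[0..e]; if that
// search ends at position p within the reversed prefix, the original alignment starts at
// e - p (see the loop below).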
229 | Word* rPeq = buildPeq(static_cast(alphabet.size()), rQuery, queryLength, equalityDefinition); 230 | for (int i = 0; i < result.numLocations; i++) { 231 | int endLocation = result.endLocations[i]; 232 | if (endLocation == -1) { 233 | // NOTE: Sometimes one of optimal solutions is that query starts before target, like this: 234 | // AAGG <- target 235 | // CCTT <- query 236 | // It will never be only optimal solution and it does not happen often, however it is 237 | // possible and in that case end location will be -1. What should we do with that? 238 | // Should we just skip reporting such end location, although it is a solution? 239 | // If we do report it, what is the start location? -4? -1? Nothing? 240 | // TODO: Figure this out. This has to do in general with how we think about start 241 | // and end locations. 242 | // Also, we have alignment later relying on this locations to limit the space of it's 243 | // search -> how can it do it right if these locations are negative or incorrect? 244 | result.startLocations[i] = 0; // I put 0 for now, but it does not make much sense. 245 | } else { 246 | int bestScoreSHW, numPositionsSHW; 247 | int* positionsSHW; 248 | myersCalcEditDistanceSemiGlobal( 249 | rPeq, W, maxNumBlocks, 250 | queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1, 251 | result.editDistance, EDLIB_MODE_SHW, 252 | &bestScoreSHW, &positionsSHW, &numPositionsSHW); 253 | // Taking last location as start ensures that alignment will not start with insertions 254 | // if it can start with mismatches instead. 255 | result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1]; 256 | free(positionsSHW); 257 | } 258 | } 259 | delete[] rTarget; 260 | delete[] rQuery; 261 | delete[] rPeq; 262 | } else { // If mode is SHW or NW 263 | for (int i = 0; i < result.numLocations; i++) { 264 | result.startLocations[i] = 0; 265 | } 266 | } 267 | } 268 | 269 | // Find alignment -> all comes down to finding alignment for NW. 270 | // Currently we return alignment only for first pair of locations. 271 | if (config.task == EDLIB_TASK_PATH) { 272 | int alnStartLocation = result.startLocations[0]; 273 | int alnEndLocation = result.endLocations[0]; 274 | const unsigned char* alnTarget = target + alnStartLocation; 275 | const int alnTargetLength = alnEndLocation - alnStartLocation + 1; 276 | const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength); 277 | const unsigned char* rQuery = createReverseCopy(query, queryLength); 278 | obtainAlignment(query, rQuery, queryLength, 279 | alnTarget, rAlnTarget, alnTargetLength, 280 | equalityDefinition, static_cast(alphabet.size()), result.editDistance, 281 | &(result.alignment), &(result.alignmentLength)); 282 | delete[] rAlnTarget; 283 | delete[] rQuery; 284 | } 285 | } 286 | /*-------------------------------------------------------*/ 287 | 288 | //--- Free memory ---// 289 | delete[] Peq; 290 | free(query); 291 | free(target); 292 | if (alignData) delete alignData; 293 | //-------------------// 294 | 295 | return result; 296 | } 297 | 298 | extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength, 299 | const EdlibCigarFormat cigarFormat) { 300 | if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) { 301 | return 0; 302 | } 303 | 304 | // Maps move code from alignment to char in cigar. 
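// For context, an illustrative end-to-end use of edlibAlign() with EDLIB_TASK_PATH followed
// by this function (a sketch, not part of the upstream sources; query/queryLen/target/targetLen
// are placeholder variables and error handling is omitted):
//
//   EdlibAlignResult res = edlibAlign(query, queryLen, target, targetLen,
//                                     edlibNewAlignConfig(-1, EDLIB_MODE_HW,
//                                                         EDLIB_TASK_PATH, NULL, 0));
//   if (res.status == EDLIB_STATUS_OK && res.alignment) {
//       char* cigar = edlibAlignmentToCigar(res.alignment, res.alignmentLength,
//                                           EDLIB_CIGAR_EXTENDED);
//       // ... use cigar ...
//       free(cigar);
//   }
//   edlibFreeAlignResult(res);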
305 | // 0 1 2 3 306 | char moveCodeToChar[] = {'=', 'I', 'D', 'X'}; 307 | if (cigarFormat == EDLIB_CIGAR_STANDARD) { 308 | moveCodeToChar[0] = moveCodeToChar[3] = 'M'; 309 | } 310 | 311 | vector* cigar = new vector(); 312 | char lastMove = 0; // Char of last move. 0 if there was no previous move. 313 | int numOfSameMoves = 0; 314 | for (int i = 0; i <= alignmentLength; i++) { 315 | // if new sequence of same moves started 316 | if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) { 317 | // Write number of moves to cigar string. 318 | int numDigits = 0; 319 | for (; numOfSameMoves; numOfSameMoves /= 10) { 320 | cigar->push_back('0' + numOfSameMoves % 10); 321 | numDigits++; 322 | } 323 | reverse(cigar->end() - numDigits, cigar->end()); 324 | // Write code of move to cigar string. 325 | cigar->push_back(lastMove); 326 | // If not at the end, start new sequence of moves. 327 | if (i < alignmentLength) { 328 | // Check if alignment has valid values. 329 | if (alignment[i] > 3) { 330 | delete cigar; 331 | return 0; 332 | } 333 | numOfSameMoves = 0; 334 | } 335 | } 336 | if (i < alignmentLength) { 337 | lastMove = moveCodeToChar[alignment[i]]; 338 | numOfSameMoves++; 339 | } 340 | } 341 | cigar->push_back(0); // Null character termination. 342 | char* cigar_ = static_cast(malloc(cigar->size() * sizeof(char))); 343 | memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char)); 344 | delete cigar; 345 | 346 | return cigar_; 347 | } 348 | 349 | /** 350 | * Build Peq table for given query and alphabet. 351 | * Peq is table of dimensions alphabetLength+1 x maxNumBlocks. 352 | * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0. 353 | * NOTICE: free returned array with delete[]! 354 | */ 355 | static inline Word* buildPeq(const int alphabetLength, 356 | const unsigned char* const query, 357 | const int queryLength, 358 | const EqualityDefinition& equalityDefinition) { 359 | int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 360 | // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard. 361 | Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks]; 362 | 363 | // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s 364 | for (unsigned char symbol = 0; symbol <= alphabetLength; symbol++) { 365 | for (int b = 0; b < maxNumBlocks; b++) { 366 | if (symbol < alphabetLength) { 367 | Peq[symbol * maxNumBlocks + b] = 0; 368 | for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { 369 | Peq[symbol * maxNumBlocks + b] <<= 1; 370 | // NOTE: We pretend like query is padded at the end with W wildcard symbols 371 | if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol)) 372 | Peq[symbol * maxNumBlocks + b] += 1; 373 | } 374 | } else { // Last symbol is wildcard, so it is all 1s 375 | Peq[symbol * maxNumBlocks + b] = static_cast(-1); 376 | } 377 | } 378 | } 379 | 380 | return Peq; 381 | } 382 | 383 | 384 | /** 385 | * Returns new sequence that is reverse of given sequence. 386 | * Free returned array with delete[]. 387 | */ 388 | static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) { 389 | unsigned char* rSeq = new unsigned char[length]; 390 | for (int i = 0; i < length; i++) { 391 | rSeq[i] = seq[length - i - 1]; 392 | } 393 | return rSeq; 394 | } 395 | 396 | /** 397 | * Corresponds to Advance_Block function from Myers. 
398 | * Calculates one word(block), which is part of a column. 399 | * Highest bit of word (one most to the left) is most bottom cell of block from column. 400 | * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1]. 401 | * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0. 402 | * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0. 403 | * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch. 404 | * @param [in] hin Will be +1, 0 or -1. 405 | * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0. 406 | * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0. 407 | * @param [out] hout Will be +1, 0 or -1. 408 | */ 409 | static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin, 410 | Word &PvOut, Word &MvOut) { 411 | // hin can be 1, -1 or 0. 412 | // 1 -> 00...01 413 | // 0 -> 00...00 414 | // -1 -> 11...11 (2-complement) 415 | 416 | Word hinIsNeg = static_cast(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1 417 | 418 | Word Xv = Eq | Mv; 419 | // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1; 420 | Eq |= hinIsNeg; 421 | Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; 422 | 423 | Word Ph = Mv | ~(Xh | Pv); 424 | Word Mh = Pv & Xh; 425 | 426 | int hout = 0; 427 | // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1; 428 | hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1); 429 | // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1; 430 | hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1); 431 | 432 | Ph <<= 1; 433 | Mh <<= 1; 434 | 435 | // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1; 436 | Mh |= hinIsNeg; 437 | // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1; 438 | Ph |= static_cast((hin + 1) >> 1); 439 | 440 | PvOut = Mh | ~(Xv | Ph); 441 | MvOut = Ph & Xv; 442 | 443 | return hout; 444 | } 445 | 446 | /** 447 | * Does ceiling division x / y. 448 | * Note: x and y must be non-negative and x + y must not overflow. 449 | */ 450 | static inline int ceilDiv(const int x, const int y) { 451 | return x % y ? x / y + 1 : x / y; 452 | } 453 | 454 | static inline int min(const int x, const int y) { 455 | return x < y ? x : y; 456 | } 457 | 458 | static inline int max(const int x, const int y) { 459 | return x > y ? x : y; 460 | } 461 | 462 | 463 | /** 464 | * @param [in] block 465 | * @return Values of cells in block, starting with bottom cell in block. 466 | */ 467 | static inline vector getBlockCellValues(const Block block) { 468 | vector scores(WORD_SIZE); 469 | int score = block.score; 470 | Word mask = HIGH_BIT_MASK; 471 | for (int i = 0; i < WORD_SIZE - 1; i++) { 472 | scores[i] = score; 473 | if (block.P & mask) score--; 474 | if (block.M & mask) score++; 475 | mask >>= 1; 476 | } 477 | scores[WORD_SIZE - 1] = score; 478 | return scores; 479 | } 480 | 481 | /** 482 | * Writes values of cells in block into given array, starting with first/top cell. 483 | * @param [in] block 484 | * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. 
485 | */ 486 | static inline void readBlock(const Block block, int* const dest) { 487 | int score = block.score; 488 | Word mask = HIGH_BIT_MASK; 489 | for (int i = 0; i < WORD_SIZE - 1; i++) { 490 | dest[WORD_SIZE - 1 - i] = score; 491 | if (block.P & mask) score--; 492 | if (block.M & mask) score++; 493 | mask >>= 1; 494 | } 495 | dest[0] = score; 496 | } 497 | 498 | /** 499 | * Writes values of cells in block into given array, starting with last/bottom cell. 500 | * @param [in] block 501 | * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. 502 | */ 503 | static inline void readBlockReverse(const Block block, int* const dest) { 504 | int score = block.score; 505 | Word mask = HIGH_BIT_MASK; 506 | for (int i = 0; i < WORD_SIZE - 1; i++) { 507 | dest[i] = score; 508 | if (block.P & mask) score--; 509 | if (block.M & mask) score++; 510 | mask >>= 1; 511 | } 512 | dest[WORD_SIZE - 1] = score; 513 | } 514 | 515 | /** 516 | * @param [in] block 517 | * @param [in] k 518 | * @return True if all cells in block have value larger than k, otherwise false. 519 | */ 520 | static inline bool allBlockCellsLarger(const Block block, const int k) { 521 | vector scores = getBlockCellValues(block); 522 | for (int i = 0; i < WORD_SIZE; i++) { 523 | if (scores[i] <= k) return false; 524 | } 525 | return true; 526 | } 527 | 528 | 529 | /** 530 | * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods. 531 | * @param [in] Peq Query profile. 532 | * @param [in] W Size of padding in last block. 533 | * TODO: Calculate this directly from query, instead of passing it. 534 | * @param [in] maxNumBlocks Number of blocks needed to cover the whole query. 535 | * TODO: Calculate this directly from query, instead of passing it. 536 | * @param [in] queryLength 537 | * @param [in] target 538 | * @param [in] targetLength 539 | * @param [in] k 540 | * @param [in] mode EDLIB_MODE_HW or EDLIB_MODE_SHW 541 | * @param [out] bestScore_ Edit distance. 542 | * @param [out] positions_ Array of 0-indexed positions in target at which best score was found. 543 | Make sure to free this array with free(). 544 | * @param [out] numPositions_ Number of positions in the positions_ array. 545 | * @return Status. 546 | */ 547 | static int myersCalcEditDistanceSemiGlobal( 548 | const Word* const Peq, const int W, const int maxNumBlocks, 549 | const int queryLength, 550 | const unsigned char* const target, const int targetLength, 551 | int k, const EdlibAlignMode mode, 552 | int* const bestScore_, int** const positions_, int* const numPositions_) { 553 | *positions_ = NULL; 554 | *numPositions_ = 0; 555 | 556 | // firstBlock is 0-based index of first block in Ukkonen band. 557 | // lastBlock is 0-based index of last block in Ukkonen band. 558 | int firstBlock = 0; 559 | int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers 560 | Block *bl; // Current block 561 | 562 | Block* blocks = new Block[maxNumBlocks]; 563 | 564 | // For HW, solution will never be larger then queryLength. 565 | if (mode == EDLIB_MODE_HW) { 566 | k = min(queryLength, k); 567 | } 568 | 569 | // Each STRONG_REDUCE_NUM column is reduced in more expensive way. 570 | // This gives speed up of about 2 times for small k. 
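// The "strong" reduction inspects every cell value of the boundary blocks (via
// getBlockCellValues/allBlockCellsLarger) instead of only the bottom-of-block score,
// so it can shrink the Ukkonen band further than the cheap per-column check.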
571 | const int STRONG_REDUCE_NUM = 2048; 572 | 573 | // Initialize P, M and score 574 | bl = blocks; 575 | for (int b = 0; b <= lastBlock; b++) { 576 | bl->score = (b + 1) * WORD_SIZE; 577 | bl->P = static_cast(-1); // All 1s 578 | bl->M = static_cast(0); 579 | bl++; 580 | } 581 | 582 | int bestScore = -1; 583 | vector positions; // TODO: Maybe put this on heap? 584 | const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized; 585 | const unsigned char* targetChar = target; 586 | for (int c = 0; c < targetLength; c++) { // for each column 587 | const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks; 588 | 589 | //----------------------- Calculate column -------------------------// 590 | int hout = startHout; 591 | bl = blocks + firstBlock; 592 | Peq_c += firstBlock; 593 | for (int b = firstBlock; b <= lastBlock; b++) { 594 | hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M); 595 | bl->score += hout; 596 | bl++; Peq_c++; 597 | } 598 | bl--; Peq_c--; 599 | //------------------------------------------------------------------// 600 | 601 | //---------- Adjust number of blocks according to Ukkonen ----------// 602 | if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block 603 | && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block 604 | // If score of left block is not too big, calculate one more block 605 | lastBlock++; bl++; Peq_c++; 606 | bl->P = static_cast(-1); // All 1s 607 | bl->M = static_cast(0); 608 | bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M); 609 | } else { 610 | while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) { 611 | lastBlock--; bl--; Peq_c--; 612 | } 613 | } 614 | 615 | // Every some columns, do some expensive but also more efficient block reducing. 616 | // This is important! 617 | // 618 | // Reduce the band by decreasing last block if possible. 619 | if (c % STRONG_REDUCE_NUM == 0) { 620 | while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) { 621 | lastBlock--; bl--; Peq_c--; 622 | } 623 | } 624 | // For HW, even if all cells are > k, there still may be solution in next 625 | // column because starting conditions at upper boundary are 0. 626 | // That means that first block is always candidate for solution, 627 | // and we can never end calculation before last column. 628 | if (mode == EDLIB_MODE_HW && lastBlock == -1) { 629 | lastBlock++; bl++; Peq_c++; 630 | } 631 | 632 | // Reduce band by increasing first block if possible. Not applicable to HW. 
633 | if (mode != EDLIB_MODE_HW) { 634 | while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) { 635 | firstBlock++; 636 | } 637 | if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks 638 | while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) { 639 | firstBlock++; 640 | } 641 | } 642 | } 643 | 644 | // If band stops to exist finish 645 | if (lastBlock < firstBlock) { 646 | *bestScore_ = bestScore; 647 | if (bestScore != -1) { 648 | *positions_ = static_cast(malloc(sizeof(int) * static_cast(positions.size()))); 649 | *numPositions_ = static_cast(positions.size()); 650 | copy(positions.begin(), positions.end(), *positions_); 651 | } 652 | delete[] blocks; 653 | return EDLIB_STATUS_OK; 654 | } 655 | //------------------------------------------------------------------// 656 | 657 | //------------------------- Update best score ----------------------// 658 | if (lastBlock == maxNumBlocks - 1) { 659 | int colScore = bl->score; 660 | if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k. 661 | // NOTE: Score that I find in column c is actually score from column c-W 662 | if (bestScore == -1 || colScore <= bestScore) { 663 | if (colScore != bestScore) { 664 | positions.clear(); 665 | bestScore = colScore; 666 | // Change k so we will look only for equal or better 667 | // scores then the best found so far. 668 | k = bestScore; 669 | } 670 | positions.push_back(c - W); 671 | } 672 | } 673 | } 674 | //------------------------------------------------------------------// 675 | 676 | targetChar++; 677 | } 678 | 679 | 680 | // Obtain results for last W columns from last column. 681 | if (lastBlock == maxNumBlocks - 1) { 682 | vector blockScores = getBlockCellValues(*bl); 683 | for (int i = 0; i < W; i++) { 684 | int colScore = blockScores[i + 1]; 685 | if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) { 686 | if (colScore != bestScore) { 687 | positions.clear(); 688 | k = bestScore = colScore; 689 | } 690 | positions.push_back(targetLength - W + i); 691 | } 692 | } 693 | } 694 | 695 | *bestScore_ = bestScore; 696 | if (bestScore != -1) { 697 | *positions_ = static_cast(malloc(sizeof(int) * static_cast(positions.size()))); 698 | *numPositions_ = static_cast(positions.size()); 699 | copy(positions.begin(), positions.end(), *positions_); 700 | } 701 | 702 | delete[] blocks; 703 | return EDLIB_STATUS_OK; 704 | } 705 | 706 | 707 | /** 708 | * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method. 709 | * @param [in] Peq Query profile. 710 | * @param [in] W Size of padding in last block. 711 | * TODO: Calculate this directly from query, instead of passing it. 712 | * @param [in] maxNumBlocks Number of blocks needed to cover the whole query. 713 | * TODO: Calculate this directly from query, instead of passing it. 714 | * @param [in] queryLength 715 | * @param [in] target 716 | * @param [in] targetLength 717 | * @param [in] k 718 | * @param [out] bestScore_ Edit distance. 719 | * @param [out] position_ 0-indexed position in target at which best score was found. 720 | * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned. 721 | * Quadratic amount of memory is consumed. 722 | * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment). 723 | * Set only if findAlignment is set to true, otherwise it is NULL. 724 | * Make sure to free this array using delete[]. 
725 | * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected. 726 | * If set to p, calculation is performed up to position p in target (inclusive) 727 | * and column p is returned as the only column in alignData. 728 | * @return Status. 729 | */ 730 | static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks, 731 | const int queryLength, 732 | const unsigned char* const target, const int targetLength, 733 | int k, int* const bestScore_, 734 | int* const position_, const bool findAlignment, 735 | AlignmentData** const alignData, const int targetStopPosition) { 736 | if (targetStopPosition > -1 && findAlignment) { 737 | // They can not be both set at the same time! 738 | return EDLIB_STATUS_ERROR; 739 | } 740 | 741 | // Each STRONG_REDUCE_NUM column is reduced in more expensive way. 742 | const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation 743 | 744 | if (k < abs(targetLength - queryLength)) { 745 | *bestScore_ = *position_ = -1; 746 | return EDLIB_STATUS_OK; 747 | } 748 | 749 | k = min(k, max(queryLength, targetLength)); // Upper bound for k 750 | 751 | // firstBlock is 0-based index of first block in Ukkonen band. 752 | // lastBlock is 0-based index of last block in Ukkonen band. 753 | int firstBlock = 0; 754 | // This is optimal now, by my formula. 755 | int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1; 756 | Block* bl; // Current block 757 | 758 | Block* blocks = new Block[maxNumBlocks]; 759 | 760 | // Initialize P, M and score 761 | bl = blocks; 762 | for (int b = 0; b <= lastBlock; b++) { 763 | bl->score = (b + 1) * WORD_SIZE; 764 | bl->P = static_cast(-1); // All 1s 765 | bl->M = static_cast(0); 766 | bl++; 767 | } 768 | 769 | // If we want to find alignment, we have to store needed data. 770 | if (findAlignment) 771 | *alignData = new AlignmentData(maxNumBlocks, targetLength); 772 | else if (targetStopPosition > -1) 773 | *alignData = new AlignmentData(maxNumBlocks, 1); 774 | else 775 | *alignData = NULL; 776 | 777 | const unsigned char* targetChar = target; 778 | for (int c = 0; c < targetLength; c++) { // for each column 779 | const Word* Peq_c = Peq + *targetChar * maxNumBlocks; 780 | 781 | //----------------------- Calculate column -------------------------// 782 | int hout = 1; 783 | bl = blocks + firstBlock; 784 | for (int b = firstBlock; b <= lastBlock; b++) { 785 | hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M); 786 | bl->score += hout; 787 | bl++; 788 | } 789 | bl--; 790 | //------------------------------------------------------------------// 791 | // bl now points to last block 792 | 793 | // Update k. I do it only on end of column because it would slow calculation too much otherwise. 794 | // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up. 795 | k = min(k, bl->score 796 | + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1) 797 | + (lastBlock == maxNumBlocks - 1 ? W : 0)); 798 | 799 | //---------- Adjust number of blocks according to Ukkonen ----------// 800 | //--- Adjust last block ---// 801 | // If block is not beneath band, calculate next block. Only next because others are certainly beneath band. 
802 | if (lastBlock + 1 < maxNumBlocks 803 | && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also! 804 | ((lastBlock + 1) * WORD_SIZE - 1 805 | > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) { 806 | lastBlock++; bl++; 807 | bl->P = static_cast(-1); // All 1s 808 | bl->M = static_cast(0); 809 | int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M); 810 | bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout; 811 | hout = newHout; 812 | } 813 | 814 | // While block is out of band, move one block up. 815 | // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it. 816 | // I could consider adding that max part, for optimal performance. 817 | while (lastBlock >= firstBlock 818 | && (bl->score >= k + WORD_SIZE 819 | || ((lastBlock + 1) * WORD_SIZE - 1 > 820 | // TODO: Does not work if do not put +1! Why??? 821 | k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) { 822 | lastBlock--; bl--; 823 | } 824 | //-------------------------// 825 | 826 | //--- Adjust first block ---// 827 | // While outside of band, advance block 828 | while (firstBlock <= lastBlock 829 | && (blocks[firstBlock].score >= k + WORD_SIZE 830 | || ((firstBlock + 1) * WORD_SIZE - 1 < 831 | blocks[firstBlock].score - k - targetLength + queryLength + c))) { 832 | firstBlock++; 833 | } 834 | //--------------------------/ 835 | 836 | 837 | // TODO: consider if this part is useful, it does not seem to help much 838 | if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction 839 | while (lastBlock >= firstBlock) { 840 | // If all cells outside of band, remove block 841 | vector scores = getBlockCellValues(*bl); 842 | int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; 843 | int r = lastBlock * WORD_SIZE + numCells - 1; 844 | bool reduce = true; 845 | for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { 846 | // TODO: Does not work if do not put +1! Why??? 847 | if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) { 848 | reduce = false; 849 | break; 850 | } 851 | r--; 852 | } 853 | if (!reduce) break; 854 | lastBlock--; bl--; 855 | } 856 | 857 | while (firstBlock <= lastBlock) { 858 | // If all cells outside of band, remove block 859 | vector scores = getBlockCellValues(blocks[firstBlock]); 860 | int numCells = firstBlock == maxNumBlocks - 1 ? 
WORD_SIZE - W : WORD_SIZE; 861 | int r = firstBlock * WORD_SIZE + numCells - 1; 862 | bool reduce = true; 863 | for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { 864 | if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) { 865 | reduce = false; 866 | break; 867 | } 868 | r--; 869 | } 870 | if (!reduce) break; 871 | firstBlock++; 872 | } 873 | } 874 | 875 | 876 | // If band stops to exist finish 877 | if (lastBlock < firstBlock) { 878 | *bestScore_ = *position_ = -1; 879 | delete[] blocks; 880 | return EDLIB_STATUS_OK; 881 | } 882 | //------------------------------------------------------------------// 883 | 884 | 885 | //---- Save column so it can be used for reconstruction ----// 886 | if (findAlignment && c < targetLength) { 887 | bl = blocks + firstBlock; 888 | for (int b = firstBlock; b <= lastBlock; b++) { 889 | (*alignData)->Ps[maxNumBlocks * c + b] = bl->P; 890 | (*alignData)->Ms[maxNumBlocks * c + b] = bl->M; 891 | (*alignData)->scores[maxNumBlocks * c + b] = bl->score; 892 | (*alignData)->firstBlocks[c] = firstBlock; 893 | (*alignData)->lastBlocks[c] = lastBlock; 894 | bl++; 895 | } 896 | } 897 | //----------------------------------------------------------// 898 | //---- If this is stop column, save it and finish ----// 899 | if (c == targetStopPosition) { 900 | for (int b = firstBlock; b <= lastBlock; b++) { 901 | (*alignData)->Ps[b] = (blocks + b)->P; 902 | (*alignData)->Ms[b] = (blocks + b)->M; 903 | (*alignData)->scores[b] = (blocks + b)->score; 904 | (*alignData)->firstBlocks[0] = firstBlock; 905 | (*alignData)->lastBlocks[0] = lastBlock; 906 | } 907 | *bestScore_ = -1; 908 | *position_ = targetStopPosition; 909 | delete[] blocks; 910 | return EDLIB_STATUS_OK; 911 | } 912 | //----------------------------------------------------// 913 | 914 | targetChar++; 915 | } 916 | 917 | if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated 918 | // Obtain best score from block -> it is complicated because query is padded with W cells 919 | int bestScore = getBlockCellValues(blocks[lastBlock])[W]; 920 | if (bestScore <= k) { 921 | *bestScore_ = bestScore; 922 | *position_ = targetLength - 1; 923 | delete[] blocks; 924 | return EDLIB_STATUS_OK; 925 | } 926 | } 927 | 928 | *bestScore_ = *position_ = -1; 929 | delete[] blocks; 930 | return EDLIB_STATUS_OK; 931 | } 932 | 933 | 934 | /** 935 | * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix, 936 | * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength). 937 | * @param [in] queryLength Normal length, without W. 938 | * @param [in] targetLength Normal length, without W. 939 | * @param [in] bestScore Best score. 940 | * @param [in] alignData Data obtained during finding best score that is useful for finding alignment. 941 | * @param [out] alignment Alignment. 942 | * @param [out] alignmentLength Length of alignment. 943 | * @return Status code. 
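 * Note: edit operations are collected while walking backwards from the bottom-right cell,
 * and the resulting array is reversed before the function returns.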
944 | */ 945 | static int obtainAlignmentTraceback(const int queryLength, const int targetLength, 946 | const int bestScore, const AlignmentData* const alignData, 947 | unsigned char** const alignment, int* const alignmentLength) { 948 | const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 949 | const int W = maxNumBlocks * WORD_SIZE - queryLength; 950 | 951 | *alignment = static_cast(malloc((queryLength + targetLength - 1) * sizeof(unsigned char))); 952 | *alignmentLength = 0; 953 | int c = targetLength - 1; // index of column 954 | int b = maxNumBlocks - 1; // index of block in column 955 | int currScore = bestScore; // Score of current cell 956 | int lScore = -1; // Score of left cell 957 | int uScore = -1; // Score of upper cell 958 | int ulScore = -1; // Score of upper left cell 959 | Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block 960 | Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block 961 | // True if block to left exists and is in band 962 | bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]; 963 | // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the 964 | // calculation as both lP and lM should be initialized at some moment later (but compiler can not 965 | // detect it since this initialization is guaranteed by "business" logic). 966 | Word lP = 0, lM = 0; 967 | if (thereIsLeftBlock) { 968 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left 969 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left 970 | } 971 | currP <<= W; 972 | currM <<= W; 973 | int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos 974 | 975 | // TODO(martin): refactor this whole piece of code. There are too many if-else statements, 976 | // it is too easy for a bug to hide and to hard to effectively cover all the edge-cases. 977 | // We need better separation of logic and responsibilities. 978 | while (true) { 979 | if (c == 0) { 980 | thereIsLeftBlock = true; 981 | lScore = b * WORD_SIZE + blockPos + 1; 982 | ulScore = lScore - 1; 983 | } 984 | 985 | // TODO: improvement: calculate only those cells that are needed, 986 | // for example if I calculate upper cell and can move up, 987 | // there is no need to calculate left and upper left cell 988 | //---------- Calculate scores ---------// 989 | if (lScore == -1 && thereIsLeftBlock) { 990 | lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left 991 | for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) { 992 | if (lP & HIGH_BIT_MASK) lScore--; 993 | if (lM & HIGH_BIT_MASK) lScore++; 994 | lP <<= 1; 995 | lM <<= 1; 996 | } 997 | } 998 | if (ulScore == -1) { 999 | if (lScore != -1) { 1000 | ulScore = lScore; 1001 | if (lP & HIGH_BIT_MASK) ulScore--; 1002 | if (lM & HIGH_BIT_MASK) ulScore++; 1003 | } 1004 | else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) { 1005 | // This is the case when upper left cell is last cell in block, 1006 | // and block to left is not in band so lScore is -1. 1007 | ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1]; 1008 | } 1009 | } 1010 | if (uScore == -1) { 1011 | uScore = currScore; 1012 | if (currP & HIGH_BIT_MASK) uScore--; 1013 | if (currM & HIGH_BIT_MASK) uScore++; 1014 | currP <<= 1; 1015 | currM <<= 1; 1016 | } 1017 | //-------------------------------------// 1018 | 1019 | // TODO: should I check if there is upper block? 
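// Move preference in the traceback below: an upward move (EDLIB_EDOP_INSERT) is tried first,
// then a move to the left (EDLIB_EDOP_DELETE), and finally the diagonal move, which is taken
// as a match when the upper-left score equals the current score and as a mismatch when it is
// exactly one smaller.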
1020 | 1021 | //-------------- Move --------------// 1022 | // Move up - insertion to target - deletion from query 1023 | if (uScore != -1 && uScore + 1 == currScore) { 1024 | currScore = uScore; 1025 | lScore = ulScore; 1026 | uScore = ulScore = -1; 1027 | if (blockPos == 0) { // If entering new (upper) block 1028 | if (b == 0) { // If there are no cells above (only boundary cells) 1029 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up 1030 | for (int i = 0; i < c + 1; i++) // Move left until end 1031 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; 1032 | break; 1033 | } else { 1034 | blockPos = WORD_SIZE - 1; 1035 | b--; 1036 | currP = alignData->Ps[c * maxNumBlocks + b]; 1037 | currM = alignData->Ms[c * maxNumBlocks + b]; 1038 | if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { 1039 | thereIsLeftBlock = true; 1040 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations 1041 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; 1042 | } else { 1043 | thereIsLeftBlock = false; 1044 | // TODO(martin): There may not be left block, but there can be left boundary - do we 1045 | // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this. 1046 | } 1047 | } 1048 | } else { 1049 | blockPos--; 1050 | lP <<= 1; 1051 | lM <<= 1; 1052 | } 1053 | // Mark move 1054 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; 1055 | } 1056 | // Move left - deletion from target - insertion to query 1057 | else if (lScore != -1 && lScore + 1 == currScore) { 1058 | currScore = lScore; 1059 | uScore = ulScore; 1060 | lScore = ulScore = -1; 1061 | c--; 1062 | if (c == -1) { // If there are no cells to the left (only boundary cells) 1063 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left 1064 | int numUp = b * WORD_SIZE + blockPos + 1; 1065 | for (int i = 0; i < numUp; i++) // Move up until end 1066 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; 1067 | break; 1068 | } 1069 | currP = lP; 1070 | currM = lM; 1071 | if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { 1072 | thereIsLeftBlock = true; 1073 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; 1074 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; 1075 | } else { 1076 | if (c == 0) { // If there are no cells to the left (only boundary cells) 1077 | thereIsLeftBlock = true; 1078 | lScore = b * WORD_SIZE + blockPos + 1; 1079 | ulScore = lScore - 1; 1080 | } else { 1081 | thereIsLeftBlock = false; 1082 | } 1083 | } 1084 | // Mark move 1085 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; 1086 | } 1087 | // Move up left - (mis)match 1088 | else if (ulScore != -1) { 1089 | unsigned char moveCode = ulScore == currScore ? 
EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH; 1090 | currScore = ulScore; 1091 | uScore = lScore = ulScore = -1; 1092 | c--; 1093 | if (c == -1) { // If there are no cells to the left (only boundary cells) 1094 | (*alignment)[(*alignmentLength)++] = moveCode; // Move left 1095 | int numUp = b * WORD_SIZE + blockPos; 1096 | for (int i = 0; i < numUp; i++) // Move up until end 1097 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; 1098 | break; 1099 | } 1100 | if (blockPos == 0) { // If entering upper left block 1101 | if (b == 0) { // If there are no more cells above (only boundary cells) 1102 | (*alignment)[(*alignmentLength)++] = moveCode; // Move up left 1103 | for (int i = 0; i < c + 1; i++) // Move left until end 1104 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; 1105 | break; 1106 | } 1107 | blockPos = WORD_SIZE - 1; 1108 | b--; 1109 | currP = alignData->Ps[c * maxNumBlocks + b]; 1110 | currM = alignData->Ms[c * maxNumBlocks + b]; 1111 | } else { // If entering left block 1112 | blockPos--; 1113 | currP = lP; 1114 | currM = lM; 1115 | currP <<= 1; 1116 | currM <<= 1; 1117 | } 1118 | // Set new left block 1119 | if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { 1120 | thereIsLeftBlock = true; 1121 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; 1122 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; 1123 | } else { 1124 | if (c == 0) { // If there are no cells to the left (only boundary cells) 1125 | thereIsLeftBlock = true; 1126 | lScore = b * WORD_SIZE + blockPos + 1; 1127 | ulScore = lScore - 1; 1128 | } else { 1129 | thereIsLeftBlock = false; 1130 | } 1131 | } 1132 | // Mark move 1133 | (*alignment)[(*alignmentLength)++] = moveCode; 1134 | } else { 1135 | // Reached end - finished! 1136 | break; 1137 | } 1138 | //----------------------------------// 1139 | } 1140 | 1141 | *alignment = static_cast(realloc(*alignment, (*alignmentLength) * sizeof(unsigned char))); 1142 | reverse(*alignment, *alignment + (*alignmentLength)); 1143 | return EDLIB_STATUS_OK; 1144 | } 1145 | 1146 | 1147 | /** 1148 | * Finds one possible alignment that gives optimal score (bestScore). 1149 | * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough, 1150 | * it will solve them using traceback algorithm. 1151 | * @param [in] query 1152 | * @param [in] rQuery Reversed query. 1153 | * @param [in] queryLength 1154 | * @param [in] target 1155 | * @param [in] rTarget Reversed target. 1156 | * @param [in] targetLength 1157 | * @param [in] equalityDefinition 1158 | * @param [in] alphabetLength 1159 | * @param [in] bestScore Best(optimal) score. 1160 | * @param [out] alignment Sequence of edit operations that make target equal to query. 1161 | * @param [out] alignmentLength Length of alignment. 1162 | * @return Status code. 1163 | */ 1164 | static int obtainAlignment( 1165 | const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, 1166 | const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, 1167 | const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore, 1168 | unsigned char** const alignment, int* const alignmentLength) { 1169 | 1170 | // Handle special case when one of sequences has length of 0. 
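// An empty query turns the whole target into EDLIB_EDOP_DELETE operations; an empty target
// turns the whole query into EDLIB_EDOP_INSERT operations.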
1171 | if (queryLength == 0 || targetLength == 0) { 1172 | *alignmentLength = targetLength + queryLength; 1173 | *alignment = static_cast(malloc((*alignmentLength) * sizeof(unsigned char))); 1174 | for (int i = 0; i < *alignmentLength; i++) { 1175 | (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT; 1176 | } 1177 | return EDLIB_STATUS_OK; 1178 | } 1179 | 1180 | const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 1181 | const int W = maxNumBlocks * WORD_SIZE - queryLength; 1182 | int statusCode; 1183 | 1184 | // TODO: think about reducing number of memory allocations in alignment functions, probably 1185 | // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg, 1186 | // and it could also be done for alignments - we could have one big array for alignment that would be 1187 | // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results. 1188 | 1189 | // If estimated memory consumption for traceback algorithm is smaller than 1MB use it, 1190 | // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal. 1191 | long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength 1192 | + 2ll * sizeof(int) * targetLength; 1193 | if (alignmentDataSize < 1024 * 1024) { 1194 | int score_, endLocation_; // Used only to call function. 1195 | AlignmentData* alignData = NULL; 1196 | Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); 1197 | myersCalcEditDistanceNW(Peq, W, maxNumBlocks, 1198 | queryLength, 1199 | target, targetLength, 1200 | bestScore, 1201 | &score_, &endLocation_, true, &alignData, -1); 1202 | //assert(score_ == bestScore); 1203 | //assert(endLocation_ == targetLength - 1); 1204 | 1205 | statusCode = obtainAlignmentTraceback(queryLength, targetLength, 1206 | bestScore, alignData, alignment, alignmentLength); 1207 | delete alignData; 1208 | delete[] Peq; 1209 | } else { 1210 | statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength, 1211 | target, rTarget, targetLength, 1212 | equalityDefinition, alphabetLength, bestScore, 1213 | alignment, alignmentLength); 1214 | } 1215 | return statusCode; 1216 | } 1217 | 1218 | 1219 | /** 1220 | * Finds one possible alignment that gives optimal score (bestScore). 1221 | * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together. 1222 | * @param [in] query 1223 | * @param [in] rQuery Reversed query. 1224 | * @param [in] queryLength 1225 | * @param [in] target 1226 | * @param [in] rTarget Reversed target. 1227 | * @param [in] targetLength 1228 | * @param [in] alphabetLength 1229 | * @param [in] bestScore Best(optimal) score. 1230 | * @param [out] alignment Sequence of edit operations that make target equal to query. 1231 | * @param [out] alignmentLength Length of alignment. 1232 | * @return Status code. 
1233 | */ 1234 | static int obtainAlignmentHirschberg( 1235 | const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, 1236 | const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, 1237 | const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore, 1238 | unsigned char** const alignment, int* const alignmentLength) { 1239 | 1240 | const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 1241 | const int W = maxNumBlocks * WORD_SIZE - queryLength; 1242 | 1243 | Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); 1244 | Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition); 1245 | 1246 | // Used only to call functions. 1247 | int score_, endLocation_; 1248 | 1249 | // Divide dynamic matrix into two halfs, left and right. 1250 | const int leftHalfWidth = targetLength / 2; 1251 | const int rightHalfWidth = targetLength - leftHalfWidth; 1252 | 1253 | // Calculate left half. 1254 | AlignmentData* alignDataLeftHalf = NULL; 1255 | int leftHalfCalcStatus = myersCalcEditDistanceNW( 1256 | Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore, 1257 | &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1); 1258 | 1259 | // Calculate right half. 1260 | AlignmentData* alignDataRightHalf = NULL; 1261 | int rightHalfCalcStatus = myersCalcEditDistanceNW( 1262 | rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore, 1263 | &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1); 1264 | 1265 | delete[] Peq; 1266 | delete[] rPeq; 1267 | 1268 | if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) { 1269 | if (alignDataLeftHalf) delete alignDataLeftHalf; 1270 | if (alignDataRightHalf) delete alignDataRightHalf; 1271 | return EDLIB_STATUS_ERROR; 1272 | } 1273 | 1274 | // Unwrap the left half. 1275 | int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0]; 1276 | int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0]; 1277 | // TODO: avoid this allocation by using some shared array? 1278 | // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index) 1279 | // and ending with scoresLeftEndIdx row (0-indexed). 1280 | int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE; 1281 | int* scoresLeft = new int[scoresLeftLength]; 1282 | for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) { 1283 | Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx], 1284 | alignDataLeftHalf->scores[blockIdx]); 1285 | readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE); 1286 | } 1287 | int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE; 1288 | // If last block contains padding, shorten the length of scores for the length of padding. 1289 | if (lastBlockIdxLeft == maxNumBlocks - 1) { 1290 | scoresLeftLength -= W; 1291 | } 1292 | 1293 | // Unwrap the right half (I also reverse it while unwraping). 
1294 | int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0]; 1295 | int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0]; 1296 | int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE; 1297 | int* scoresRight = new int[scoresRightLength]; 1298 | int* scoresRightOriginalStart = scoresRight; 1299 | for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) { 1300 | Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx], 1301 | alignDataRightHalf->scores[blockIdx]); 1302 | readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE); 1303 | } 1304 | int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE; 1305 | // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do), 1306 | // move pointer forward to remove the padding (that is why we remember originalStart). 1307 | if (scoresRightStartIdx < 0) { 1308 | //assert(scoresRightStartIdx == -1 * W); 1309 | scoresRight += W; 1310 | scoresRightStartIdx += W; 1311 | scoresRightLength -= W; 1312 | } 1313 | 1314 | delete alignDataLeftHalf; 1315 | delete alignDataRightHalf; 1316 | 1317 | //--------------------- Find the best move ----------------// 1318 | // Find the query/row index of cell in left column which together with its lower right neighbour 1319 | // from right column gives the best score (when summed). We also have to consider boundary cells 1320 | // (those cells at -1 indexes). 1321 | // x| 1322 | // -+- 1323 | // |x 1324 | int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1); 1325 | int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1, 1326 | scoresRightStartIdx + scoresRightLength - 2); 1327 | int leftScore = -1, rightScore = -1; 1328 | int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through. 1329 | bool queryIdxLeftAlignmentFound = false; 1330 | for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) { 1331 | leftScore = scoresLeft[queryIdx - scoresLeftStartIdx]; 1332 | rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx]; 1333 | if (leftScore + rightScore == bestScore) { 1334 | queryIdxLeftAlignment = queryIdx; 1335 | queryIdxLeftAlignmentFound = true; 1336 | break; 1337 | } 1338 | } 1339 | // Check boundary cells. 1340 | if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) { 1341 | leftScore = leftHalfWidth; 1342 | rightScore = scoresRight[0]; 1343 | if (leftScore + rightScore == bestScore) { 1344 | queryIdxLeftAlignment = -1; 1345 | queryIdxLeftAlignmentFound = true; 1346 | } 1347 | } 1348 | if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength 1349 | && scoresRightStartIdx + scoresRightLength == queryLength) { 1350 | leftScore = scoresLeft[scoresLeftLength - 1]; 1351 | rightScore = rightHalfWidth; 1352 | if (leftScore + rightScore == bestScore) { 1353 | queryIdxLeftAlignment = queryLength - 1; 1354 | queryIdxLeftAlignmentFound = true; 1355 | } 1356 | } 1357 | 1358 | delete[] scoresLeft; 1359 | delete[] scoresRightOriginalStart; 1360 | 1361 | if (queryIdxLeftAlignmentFound == false) { 1362 | // If there was no move that is part of optimal alignment, then there is no such alignment 1363 | // or given bestScore is not correct! 
1364 | return EDLIB_STATUS_ERROR; 1365 | } 1366 | //----------------------------------------------------------// 1367 | 1368 | // Calculate alignments for upper half of left half (upper left - ul) 1369 | // and lower half of right half (lower right - lr). 1370 | const int ulHeight = queryIdxLeftAlignment + 1; 1371 | const int lrHeight = queryLength - ulHeight; 1372 | const int ulWidth = leftHalfWidth; 1373 | const int lrWidth = rightHalfWidth; 1374 | unsigned char* ulAlignment = NULL; int ulAlignmentLength; 1375 | int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight, 1376 | target, rTarget + lrWidth, ulWidth, 1377 | equalityDefinition, alphabetLength, leftScore, 1378 | &ulAlignment, &ulAlignmentLength); 1379 | unsigned char* lrAlignment = NULL; int lrAlignmentLength; 1380 | int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight, 1381 | target + ulWidth, rTarget, lrWidth, 1382 | equalityDefinition, alphabetLength, rightScore, 1383 | &lrAlignment, &lrAlignmentLength); 1384 | if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) { 1385 | if (ulAlignment) free(ulAlignment); 1386 | if (lrAlignment) free(lrAlignment); 1387 | return EDLIB_STATUS_ERROR; 1388 | } 1389 | 1390 | // Build alignment by concatenating upper left alignment with lower right alignment. 1391 | *alignmentLength = ulAlignmentLength + lrAlignmentLength; 1392 | *alignment = static_cast(malloc((*alignmentLength) * sizeof(unsigned char))); 1393 | memcpy(*alignment, ulAlignment, ulAlignmentLength); 1394 | memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength); 1395 | 1396 | free(ulAlignment); 1397 | free(lrAlignment); 1398 | return EDLIB_STATUS_OK; 1399 | } 1400 | 1401 | 1402 | /** 1403 | * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences 1404 | * where elements in sequences are not any more letters of alphabet, but their index in alphabet. 1405 | * Most of internal edlib functions expect such transformed sequences. 1406 | * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done. 1407 | * Example: 1408 | * Original sequences: "ACT" and "CGT". 1409 | * Alphabet would be recognized as "ACTG". Alphabet length = 4. 1410 | * Transformed sequences: [0, 1, 2] and [1, 3, 2]. 1411 | * @param [in] queryOriginal 1412 | * @param [in] queryLength 1413 | * @param [in] targetOriginal 1414 | * @param [in] targetLength 1415 | * @param [out] queryTransformed It will contain values in range [0, alphabet length - 1]. 1416 | * @param [out] targetTransformed It will contain values in range [0, alphabet length - 1]. 1417 | * @return Alphabet as a string of unique characters, where index of each character is its value in transformed 1418 | * sequences. 1419 | */ 1420 | static string transformSequences(const char* const queryOriginal, const int queryLength, 1421 | const char* const targetOriginal, const int targetLength, 1422 | unsigned char** const queryTransformed, 1423 | unsigned char** const targetTransformed) { 1424 | // Alphabet is constructed from letters that are present in sequences. 1425 | // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1, 1426 | // and new query and target are created in which letters are replaced with their ordinal numbers. 1427 | // This query and target are used in all the calculations later. 
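// Note that the mapping is byte-exact: for example, 'A' and 'a' become two distinct alphabet
// symbols here and are only treated as equal if the caller lists them as an EdlibEqualityPair
// in the alignment config.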
1428 | *queryTransformed = static_cast<unsigned char*>(malloc(sizeof(unsigned char) * queryLength)); 1429 | *targetTransformed = static_cast<unsigned char*>(malloc(sizeof(unsigned char) * targetLength)); 1430 | 1431 | string alphabet = ""; 1432 | 1433 | // Alphabet information, it is constructed on fly while transforming sequences. 1434 | // letterIdx[c] is index of letter c in alphabet. 1435 | unsigned char letterIdx[MAX_UCHAR + 1]; 1436 | bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet 1437 | for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false; 1438 | 1439 | for (int i = 0; i < queryLength; i++) { 1440 | unsigned char c = static_cast<unsigned char>(queryOriginal[i]); 1441 | if (!inAlphabet[c]) { 1442 | inAlphabet[c] = true; 1443 | letterIdx[c] = static_cast<unsigned char>(alphabet.size()); 1444 | alphabet += queryOriginal[i]; 1445 | } 1446 | (*queryTransformed)[i] = letterIdx[c]; 1447 | } 1448 | for (int i = 0; i < targetLength; i++) { 1449 | unsigned char c = static_cast<unsigned char>(targetOriginal[i]); 1450 | if (!inAlphabet[c]) { 1451 | inAlphabet[c] = true; 1452 | letterIdx[c] = static_cast<unsigned char>(alphabet.size()); 1453 | alphabet += targetOriginal[i]; 1454 | } 1455 | (*targetTransformed)[i] = letterIdx[c]; 1456 | } 1457 | 1458 | return alphabet; 1459 | } 1460 | 1461 | 1462 | extern "C" EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task, 1463 | const EdlibEqualityPair* additionalEqualities, 1464 | int additionalEqualitiesLength) { 1465 | EdlibAlignConfig config; 1466 | config.k = k; 1467 | config.mode = mode; 1468 | config.task = task; 1469 | config.additionalEqualities = additionalEqualities; 1470 | config.additionalEqualitiesLength = additionalEqualitiesLength; 1471 | return config; 1472 | } 1473 | 1474 | extern "C" EdlibAlignConfig edlibDefaultAlignConfig(void) { 1475 | return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0); 1476 | } 1477 | 1478 | extern "C" void edlibFreeAlignResult(EdlibAlignResult result) { 1479 | if (result.endLocations) free(result.endLocations); 1480 | if (result.startLocations) free(result.startLocations); 1481 | if (result.alignment) free(result.alignment); 1482 | } -------------------------------------------------------------------------------- /stringdecomposer/src/edlib.h: -------------------------------------------------------------------------------- 1 | #ifndef EDLIB_H 2 | #define EDLIB_H 3 | 4 | /** 5 | * @file 6 | * @author Martin Sosic 7 | * @brief Main header file, containing all public functions and structures. 8 | */ 9 | 10 | // Define EDLIB_API macro to properly export symbols 11 | #ifdef EDLIB_SHARED 12 | # ifdef _WIN32 13 | # ifdef EDLIB_BUILD 14 | # define EDLIB_API __declspec(dllexport) 15 | # else 16 | # define EDLIB_API __declspec(dllimport) 17 | # endif 18 | # else 19 | # define EDLIB_API __attribute__ ((visibility ("default"))) 20 | # endif 21 | #else 22 | # define EDLIB_API 23 | #endif 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | // Status codes 30 | #define EDLIB_STATUS_OK 0 31 | #define EDLIB_STATUS_ERROR 1 32 | 33 | /** 34 | * Alignment methods - how should Edlib treat gaps before and after query? 35 | */ 36 | typedef enum { 37 | /** 38 | * Global method. This is the standard method. 39 | * Useful when you want to find out how similar is first sequence to second sequence. 40 | */ 41 | EDLIB_MODE_NW, 42 | /** 43 | * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
44 | * What that means is that deleting elements from the end of second sequence is "free"! 45 | * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end 46 | * of second sequence is "free" and does not count into total edit distance. This method is appropriate 47 | * when you want to find out how well first sequence fits at the beginning of second sequence. 48 | */ 49 | EDLIB_MODE_SHW, 50 | /** 51 | * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are 52 | * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"! 53 | * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start 54 | * and GAC from the end of second sequence is "free" and does not count into total edit distance. 55 | * This method is appropriate when you want to find out how well first sequence fits at any part of 56 | * second sequence. 57 | * For example, if your second sequence was a long text and your first sequence was a sentence from that text, 58 | * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in 59 | * that text. In bioinformatics, this method is appropriate for aligning read to a sequence. 60 | */ 61 | EDLIB_MODE_HW 62 | } EdlibAlignMode; 63 | 64 | /** 65 | * Alignment tasks - what do you want Edlib to do? 66 | */ 67 | typedef enum { 68 | EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations. 69 | EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations. 70 | EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path. 71 | } EdlibAlignTask; 72 | 73 | /** 74 | * Describes cigar format. 75 | * @see http://samtools.github.io/hts-specs/SAMv1.pdf 76 | * @see http://drive5.com/usearch/manual/cigar.html 77 | */ 78 | typedef enum { 79 | EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'. 80 | EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'. 81 | } EdlibCigarFormat; 82 | 83 | // Edit operations. 84 | #define EDLIB_EDOP_MATCH 0 //!< Match. 85 | #define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query. 86 | #define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query. 87 | #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch. 88 | 89 | /** 90 | * @brief Defines two given characters as equal. 91 | */ 92 | typedef struct { 93 | char first; 94 | char second; 95 | } EdlibEqualityPair; 96 | 97 | /** 98 | * @brief Configuration object for edlibAlign() function. 99 | */ 100 | typedef struct { 101 | /** 102 | * Set k to non-negative value to tell edlib that edit distance is not larger than k. 103 | * Smaller k can significantly improve speed of computation. 104 | * If edit distance is larger than k, edlib will set edit distance to -1. 105 | * Set k to negative value and edlib will internally auto-adjust k until score is found. 106 | */ 107 | int k; 108 | 109 | /** 110 | * Alignment method. 111 | * EDLIB_MODE_NW: global (Needleman-Wunsch) 112 | * EDLIB_MODE_SHW: prefix. Gap after query is not penalized. 113 | * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized. 114 | */ 115 | EdlibAlignMode mode; 116 | 117 | /** 118 | * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is. 119 | * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target. 
120 | * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target. 121 | * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target). 122 | */ 123 | EdlibAlignTask task; 124 | 125 | /** 126 | * List of pairs of characters, where each pair defines two characters as equal. 127 | * This way you can extend edlib's definition of equality (which is that each character is equal only 128 | * to itself). 129 | * This can be useful if you have some wildcard characters that should match multiple other characters, 130 | * or e.g. if you want edlib to be case insensitive. 131 | * Can be set to NULL if there are none. 132 | */ 133 | const EdlibEqualityPair* additionalEqualities; 134 | 135 | /** 136 | * Number of additional equalities, which is non-negative number. 137 | * 0 if there are none. 138 | */ 139 | int additionalEqualitiesLength; 140 | } EdlibAlignConfig; 141 | 142 | /** 143 | * Helper method for easy construction of configuration object. 144 | * @return Configuration object filled with given parameters. 145 | */ 146 | EDLIB_API EdlibAlignConfig edlibNewAlignConfig( 147 | int k, EdlibAlignMode mode, EdlibAlignTask task, 148 | const EdlibEqualityPair* additionalEqualities, 149 | int additionalEqualitiesLength 150 | ); 151 | 152 | /** 153 | * @return Default configuration object, with following defaults: 154 | * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities. 155 | */ 156 | EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void); 157 | 158 | 159 | /** 160 | * Container for results of alignment done by edlibAlign() function. 161 | */ 162 | typedef struct { 163 | /** 164 | * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values. 165 | */ 166 | int status; 167 | 168 | /** 169 | * -1 if k is non-negative and edit distance is larger than k. 170 | */ 171 | int editDistance; 172 | 173 | /** 174 | * Array of zero-based positions in target where optimal alignment paths end. 175 | * If gap after query is penalized, gap counts as part of query (NW), otherwise not. 176 | * Set to NULL if edit distance is larger than k. 177 | * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 178 | */ 179 | int* endLocations; 180 | 181 | /** 182 | * Array of zero-based positions in target where optimal alignment paths start, 183 | * they correspond to endLocations. 184 | * If gap before query is penalized, gap counts as part of query (NW), otherwise not. 185 | * Set to NULL if not calculated or if edit distance is larger than k. 186 | * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 187 | */ 188 | int* startLocations; 189 | 190 | /** 191 | * Number of end (and start) locations. 192 | */ 193 | int numLocations; 194 | 195 | /** 196 | * Alignment is found for first pair of start and end locations. 197 | * Set to NULL if not calculated. 198 | * Alignment is sequence of numbers: 0, 1, 2, 3. 199 | * 0 stands for match. 200 | * 1 stands for insertion to target. 201 | * 2 stands for insertion to query. 202 | * 3 stands for mismatch. 203 | * Alignment aligns query to target from begining of query till end of query. 204 | * If gaps are not penalized, they are not in alignment. 205 | * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 206 | */ 207 | unsigned char* alignment; 208 | 209 | /** 210 | * Length of alignment. 
211 | */ 212 | int alignmentLength; 213 | 214 | /** 215 | * Number of different characters in query and target together. 216 | */ 217 | int alphabetLength; 218 | } EdlibAlignResult; 219 | 220 | /** 221 | * Frees memory in EdlibAlignResult that was allocated by edlib. 222 | * If you do not use it, make sure to free needed members manually using free(). 223 | */ 224 | EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result); 225 | 226 | 227 | /** 228 | * Aligns two sequences (query and target) using edit distance (levenshtein distance). 229 | * Through config parameter, this function supports different alignment methods (global, prefix, infix), 230 | * as well as different modes of search (tasks). 231 | * It always returns edit distance and end locations of optimal alignment in target. 232 | * It optionally returns start locations of optimal alignment in target and alignment path, 233 | * if you choose appropriate tasks. 234 | * @param [in] query First sequence. 235 | * @param [in] queryLength Number of characters in first sequence. 236 | * @param [in] target Second sequence. 237 | * @param [in] targetLength Number of characters in second sequence. 238 | * @param [in] config Additional alignment parameters, like alignment method and wanted results. 239 | * @return Result of alignment, which can contain edit distance, start and end locations and alignment path. 240 | * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members. 241 | */ 242 | EDLIB_API EdlibAlignResult edlibAlign( 243 | const char* query, int queryLength, 244 | const char* target, int targetLength, 245 | const EdlibAlignConfig config 246 | ); 247 | 248 | 249 | /** 250 | * Builds cigar string from given alignment sequence. 251 | * @param [in] alignment Alignment sequence. 252 | * 0 stands for match. 253 | * 1 stands for insertion to target. 254 | * 2 stands for insertion to query. 255 | * 3 stands for mismatch. 256 | * @param [in] alignmentLength 257 | * @param [in] cigarFormat Cigar will be returned in specified format. 258 | * @return Cigar string. 259 | * I stands for insertion. 260 | * D stands for deletion. 261 | * X stands for mismatch. (used only in extended format) 262 | * = stands for match. (used only in extended format) 263 | * M stands for (mis)match. (used only in standard format) 264 | * String is null terminated. 265 | * Needed memory is allocated and given pointer is set to it. 266 | * Do not forget to free it later using free()! 
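 *
 * A minimal usage sketch of the functions declared in this header (illustrative only, reusing the
 * "ACT" / "CGACTGAC" example from the mode descriptions above; error handling is shortened):
 *
 *     EdlibAlignResult result = edlibAlign("ACT", 3, "CGACTGAC", 8,
 *                                          edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH, NULL, 0));
 *     if (result.status == EDLIB_STATUS_OK) {
 *         char* cigar = edlibAlignmentToCigar(result.alignment, result.alignmentLength, EDLIB_CIGAR_EXTENDED);
 *         printf("edit distance: %d, cigar: %s\n", result.editDistance, cigar);
 *         free(cigar);
 *     }
 *     edlibFreeAlignResult(result);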
267 | */ 268 | EDLIB_API char* edlibAlignmentToCigar( 269 | const unsigned char* alignment, int alignmentLength, 270 | EdlibCigarFormat cigarFormat 271 | ); 272 | 273 | #ifdef __cplusplus 274 | } 275 | #endif 276 | 277 | #endif // EDLIB_H -------------------------------------------------------------------------------- /stringdecomposer/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <sstream> 4 | #include <string> 5 | #include <vector> 6 | #include <set> 7 | #include <map> 8 | #include <algorithm> 9 | #include <iterator> 10 | #include <stdexcept> 11 | 12 | #include "edlib.h" 13 | 14 | using namespace std; 15 | 16 | struct ReadId { 17 | string name; 18 | int id = -1; 19 | 20 | ReadId(string name_): name(name_) {} 21 | ReadId(string name_, int id_): name(name_), id(id_) {} 22 | }; 23 | 24 | struct Seq { 25 | ReadId read_id; 26 | string seq; 27 | 28 | Seq(string name_, string seq_): read_id(ReadId(name_)), seq(seq_) { 29 | transform(seq.begin(), seq.end(), seq.begin(), ::toupper); 30 | } 31 | 32 | Seq(string name_, string seq_, int id_): read_id(ReadId(name_, id_)), seq(seq_) {} 33 | 34 | size_t size() { return seq.size();} 35 | }; 36 | 37 | struct MonomerAlignment { 38 | string monomer_name; 39 | string read_name; 40 | int start_pos; 41 | int end_pos; 42 | float identity; 43 | bool best; 44 | 45 | MonomerAlignment() {} 46 | 47 | MonomerAlignment(string monomer_name_, string read_name_, int start_pos_, int end_pos_, float identity_, bool best_) 48 | : monomer_name(monomer_name_), read_name(read_name_), start_pos(start_pos_), end_pos(end_pos_), identity(identity_), best(best_) {} 49 | }; 50 | 51 | bool sortby1(const pair<size_t, vector<MonomerAlignment>> &a 52 | , const pair<size_t, vector<MonomerAlignment>> &b) { 53 | return (a.first < b.first); 54 | } 55 | 56 | class MonomersAligner { 57 | 58 | public: 59 | MonomersAligner(vector<Seq> &monomers, int ins = -1, int del = -1, int mismatch = -1, int match = 1) 60 | : monomers_(monomers), 61 | ins_(ins), 62 | del_(del), 63 | mismatch_(mismatch), 64 | match_(match) { 65 | } 66 | 67 | void AlignReadsSet(vector<Seq> &reads, int threads, int part_size, int ed_thr, int overlap = 500) { 68 | vector<Seq> new_reads; 69 | vector<int> save_steps; 70 | for (const auto & r: reads) { 71 | int cnt = 0; 72 | //cout << r.seq.size() << endl; 73 | for (size_t i = 0; i < r.seq.size(); i += part_size) { 74 | if ((int) r.seq.size() - i >= overlap || r.seq.size() < overlap) { 75 | Seq seq = Seq(r.read_id.name, r.seq.substr(i, min(part_size + overlap, static_cast<int>(r.seq.size() - i)) ), i ); 76 | new_reads.push_back(seq); 77 | ++ cnt; 78 | } 79 | } 80 | save_steps.push_back(cnt); 81 | } 82 | cerr << "Prepared reads\n"; 83 | 84 | size_t start = 0, p = 0; 85 | int step = threads*2; 86 | vector<pair<size_t, vector<MonomerAlignment>>> subbatches; 87 | for (size_t i = 0; i < new_reads.size(); i += step) { 88 | #pragma omp parallel for num_threads(threads) 89 | for (size_t j = i; j < min(i + step, new_reads.size()); ++ j) { 90 | std::vector<MonomerAlignment> aln; 91 | if (ed_thr > -1) { 92 | std::vector<Seq> filter_monomers = FilterMonomersForRead(new_reads[j], ed_thr); 93 | aln = AlignPartClassicDP(new_reads[j], filter_monomers); 94 | } else { 95 | aln = AlignPartClassicDP(new_reads[j], monomers_); 96 | } 97 | 98 | #pragma omp critical(aligner) 99 | { 100 | subbatches.push_back(pair<size_t, vector<MonomerAlignment>> (j, aln)); 101 | } 102 | } 103 | sort(subbatches.begin() + i, subbatches.begin() + min(i + step, new_reads.size()), sortby1); 104 | while (p < save_steps.size() && start + save_steps[p] <= subbatches.size()) { 105 | vector<MonomerAlignment> batch; 106 | for (size_t j = start; j < start + save_steps[p]; ++ j) { 107 | int read_index = subbatches[j].first;
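                        // Alignments were computed on overlapping chunks of a read; read_id.id stores the chunk's offset in the original read, so chunk-local coordinates are shifted back to read coordinates below.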
108 | for (auto a: subbatches[j].second) { 109 | MonomerAlignment new_m_aln(a.monomer_name, a.read_name, 110 | new_reads[read_index].read_id.id + a.start_pos, new_reads[read_index].read_id.id + a.end_pos, 111 | a.identity, a.best); 112 | batch.push_back(new_m_aln); 113 | } 114 | } 115 | cerr << (p + 1) * 100/save_steps.size() << "%: Aligned " << batch[0].read_name << endl; 116 | batch = PostProcessing(batch); 117 | SaveBatch(batch); 118 | start += save_steps[p]; 119 | ++ p; 120 | } 121 | } 122 | } 123 | 124 | ~MonomersAligner() { 125 | } 126 | 127 | private: 128 | double MonomerEditDistance(Seq& monomer, Seq& read) { 129 | EdlibAlignResult result = edlibAlign(monomer.seq.c_str(), monomer.seq.size(), read.seq.c_str(), read.seq.size(), edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_DISTANCE, NULL, 0)); 130 | double res = result.editDistance; 131 | edlibFreeAlignResult(result); 132 | return res; 133 | } 134 | 135 | std::vector<Seq> FilterMonomersForRead(Seq& read, int ed_thr) { 136 | std::vector<Seq> monomers_for_read; 137 | std::vector<std::pair<double, size_t>> mn_edit; 138 | for (size_t i = 0; i < monomers_.size(); ++i) { 139 | mn_edit.push_back(std::make_pair(MonomerEditDistance(monomers_[i], read), i)); 140 | } 141 | std::sort(mn_edit.begin(), mn_edit.end()); 142 | monomers_for_read.push_back(monomers_[mn_edit[0].second]); 143 | for (size_t i = 1; i < mn_edit.size(); ++i) { 144 | if (mn_edit[i].first <= ed_thr) { 145 | monomers_for_read.push_back(monomers_[mn_edit[i].second]); 146 | } 147 | } 148 | return monomers_for_read; 149 | } 150 | 151 | vector<MonomerAlignment> AlignPartClassicDP(Seq &read, std::vector<Seq>& monomers) { 152 | int ins = ins_; 153 | int del = del_; 154 | int match = match_; 155 | int mismatch = mismatch_; 156 | int INF = -1000000; 157 | int monomers_num = (int) monomers.size(); 158 | vector<vector<vector<long long>>> dp(read.seq.size()); 159 | //cout << dp.size() << endl; 160 | for (size_t i = 0; i < read.seq.size(); ++ i) { 161 | for (const auto & m: monomers) { 162 | dp[i].push_back(vector<long long>(m.seq.size())); 163 | for (size_t k = 0; k < m.seq.size(); ++ k) { 164 | dp[i][dp[i].size() - 1][k] = INF; 165 | } 166 | } 167 | dp[i].push_back(vector<long long>(1)); 168 | dp[i][monomers_num][0] = INF; 169 | } 170 | 171 | for (size_t j = 0; j < monomers.size(); ++ j) { 172 | Seq m = monomers[j]; 173 | if (m.seq[0] == read.seq[0]) { 174 | dp[0][j][0] = match; 175 | } else { 176 | dp[0][j][0] = mismatch; 177 | } 178 | for (size_t k = 1; k < m.seq.size(); ++ k) { 179 | long long mm_score = monomers[j].seq[k] == read.seq[0] ? match: mismatch; 180 | dp[0][j][k] = max(dp[0][j][k-1] + del, (long long)(del*(k-1) + mm_score)); 181 | } 182 | } 183 | for (size_t i = 1; i < read.seq.size(); ++ i) { 184 | for (size_t j = 0; j < monomers.size(); ++ j) { 185 | dp[i][monomers_num][0] = max(dp[i][monomers_num][0], dp[i-1][j][monomers[j].size() - 1]); 186 | } 187 | for (size_t j = 0; j < monomers.size(); ++ j) { 188 | for (size_t k = 0; k < monomers[j].size(); ++ k) { 189 | long long score = INF; 190 | int mm_score = monomers[j].seq[k] == read.seq[i] ? match: mismatch;
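                        // Candidate transitions for dp[i][j][k]: start monomer j fresh from the sentinel row (paying k leading deletions plus this match/mismatch), or, within the monomer, extend diagonally on a match/mismatch, take an insertion, or take a deletion.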
191 | if (dp[i][monomers_num][0] > INF) { 192 | score = max(score, dp[i][monomers_num][0] + mm_score + static_cast<long long>(k*del)); 193 | } 194 | if (k > 0) { 195 | if (dp[i-1][j][k-1] > INF) { 196 | score = max(score, dp[i-1][j][k-1] + mm_score); 197 | } 198 | if (dp[i-1][j][k] > INF) { 199 | score = max(score, dp[i-1][j][k] + ins); 200 | } 201 | if (dp[i][j][k-1] > INF) { 202 | score = max(score, dp[i][j][k-1] + del); 203 | } 204 | } 205 | dp[i][j][k] = score; 206 | } 207 | } 208 | } 209 | int max_score = INF; 210 | int best_m = monomers_num; 211 | for (size_t j = 0; j < monomers.size(); ++ j) { 212 | if (max_score < dp[read.seq.size()-1][j][monomers[j].size() -1] ) { 213 | max_score = dp[read.seq.size()-1][j][monomers[j].size() -1]; 214 | best_m = j; 215 | } 216 | } 217 | vector<MonomerAlignment> ans; 218 | long long i = read.seq.size() - 1; 219 | long long j = best_m; 220 | long long k = dp[i][j].size() - 1; 221 | bool monomer_changed = true; 222 | MonomerAlignment cur_aln; 223 | while (i >= 0) { 224 | if (k == static_cast<long long>(dp[i][j].size() - 1) && j != monomers_num && monomer_changed) { 225 | cur_aln = MonomerAlignment(monomers[j].read_id.name, read.read_id.name, i, i, dp[i][j][k], true); 226 | monomer_changed = false; 227 | } 228 | if (j == monomers_num) { 229 | if (i != 0) { 230 | for (size_t p = 0; p < dp[i - 1].size(); ++ p) { 231 | if (dp[i - 1][p][dp[i - 1][p].size() - 1] == dp[i][j][k]) { 232 | -- i; 233 | j = p; 234 | k = dp[i][j].size() - 1; 235 | break; 236 | } 237 | } 238 | } else { 239 | -- i; 240 | } 241 | } else { 242 | if (k != 0 && dp[i][j][k] == dp[i][j][k-1] + del) { 243 | --k; 244 | } else { 245 | if (i != 0 && dp[i][j][k] == dp[i-1][j][k] + ins) { 246 | --i; 247 | } else{ 248 | int mm_score = monomers[j].seq[k] == read.seq[i] ? match: mismatch; 249 | if (i != 0 && k != 0 && dp[i][j][k] == dp[i-1][j][k-1] + mm_score) { 250 | --i; --k; 251 | } else { 252 | monomer_changed = true; 253 | if (i != 0 && dp[i][monomers_num][0] + k*del + mm_score == dp[i][j][k]) { 254 | cur_aln.start_pos = i; 255 | cur_aln.identity = cur_aln.identity - dp[i][monomers_num][0]; 256 | ans.push_back(cur_aln); 257 | j = monomers_num; k = 0; 258 | } else { 259 | cur_aln.start_pos = i; 260 | ans.push_back(cur_aln); 261 | --i; 262 | } 263 | } 264 | } 265 | } 266 | } 267 | } 268 | reverse(ans.begin(), ans.end()); 269 | return ans; 270 | } 271 | 272 | void SaveBatch(vector<MonomerAlignment> &batch) { 273 | int prev_end = 0; 274 | for (auto a: batch) { 275 | string s = a.read_name + "\t" 276 | + a.monomer_name + "\t" 277 | + to_string(a.start_pos) + "\t" 278 | + to_string(a.end_pos) + "\t" 279 | + to_string(a.identity) + "\t" 280 | + to_string(a.start_pos - prev_end) + "\t" 281 | + to_string(a.end_pos - a.start_pos); 282 | prev_end = a.end_pos; 283 | cout << s << "\n"; 284 | } 285 | } 286 | 287 | vector<MonomerAlignment> PostProcessing(vector<MonomerAlignment> &batch) { 288 | vector<MonomerAlignment> res; 289 | size_t i = 0; 290 | while (i < batch.size()) { 291 | for (size_t j = i + 1; j < min(i + 7, batch.size()); ++ j) { 292 | if ((batch[i].end_pos - batch[j].start_pos)*2 > (batch[j].end_pos - batch[j].start_pos)) { 293 | res.push_back(batch[i]); 294 | i = j + 1; 295 | break; 296 | } 297 | } 298 | if (i < batch.size() ) res.push_back(batch[i]); 299 | ++ i; 300 | } 301 | return res; 302 | } 303 | 304 | vector<Seq> monomers_; 305 | const int SAVE_STEP = 1; 306 | int ins_; 307 | int del_; 308 | int mismatch_; 309 | int match_; 310 | }; 311 | 312 | 313 | 314 | vector<Seq> load_fasta(string filename) { 315 | std::ifstream input_file; 316 | input_file.open(filename, std::ifstream::in);
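    // FASTA parsing: a '>' line starts a new record (only the first whitespace-separated token of the header is kept as the name); every other line is appended to the current record's sequence.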
317 | string s; 318 | vector<Seq> seqs; 319 | while (getline(input_file, s)) { 320 | if (s[0] == '>') { 321 | string header = s.substr(1, s.size() - 1); 322 | istringstream iss(header); 323 | vector<string> header_v((istream_iterator<string>(iss)), 324 | istream_iterator<string>()); 325 | seqs.push_back(Seq(header_v[0], "")); 326 | } else { 327 | seqs[seqs.size()-1].seq += s; 328 | } 329 | } 330 | set<char> nucs = {'A', 'C', 'G', 'T', 'N'}; 331 | bool hasN = false; 332 | for (const auto & seq: seqs) { 333 | for (char c: seq.seq) { 334 | if (nucs.count(c) == 0) { 335 | cerr << "ERROR: Sequence " << seq.read_id.name <<" contains undefined symbol (not ACGT): " << c << endl; 336 | exit(-1); 337 | } else if (c == 'N') { 338 | hasN = true; 339 | } 340 | } 341 | } 342 | if (hasN) { 343 | cerr << "WARNING: sequences in " << filename << " contain N symbol. It will be counted as a separate symbol in scoring!" << endl; 344 | } 345 | return seqs; 346 | } 347 | 348 | string reverse_complement(string &s){ 349 | string res = ""; 350 | map<char, char> rc = {{'A', 'T'}, {'T', 'A'}, {'G','C'}, {'C','G'}, {'N','N'}}; 351 | for (int i = (int) s.size() - 1; i >= 0; --i){ 352 | try { 353 | res += rc.at(s[i]); 354 | } 355 | catch (std::out_of_range& e) 356 | { 357 | cerr << e.what() << std::endl; 358 | exit(-1); 359 | } 360 | } 361 | return res; 362 | } 363 | 364 | void add_reverse_complement(vector<Seq> &monomers) { 365 | vector<Seq> rev_c_monomers; 366 | for (auto s: monomers) { 367 | rev_c_monomers.push_back(Seq(s.read_id.name + "'", reverse_complement(s.seq))); 368 | } 369 | monomers.insert(monomers.end(), rev_c_monomers.begin(), rev_c_monomers.end()); 370 | return; 371 | } 372 | 373 | 374 | int main(int argc, char **argv) { 375 | if (argc < 5) { 376 | cout << "Failed to process. Number of arguments < 5\n"; 377 | cout << "./decompose <reads> <monomers> <threads> <part_size> <overlap> [<ins> <del> <mismatch> <match>]\n"; 378 | return -1; 379 | } 380 | int ins = -1, del = -1, mismatch = -1, match = 1; 381 | if (argc == 10) { 382 | ins = stoi(argv[6]); 383 | del = stoi(argv[7]); 384 | mismatch = stoi(argv[8]); 385 | match = stoi(argv[9]); 386 | } 387 | 388 | int ed_thr = -1; 389 | if (argc == 11) { 390 | ed_thr = stoi(argv[10]); 391 | } 392 | 393 | cerr << "Scores: insertion=" << ins << " deletion=" << del << " mismatch=" << mismatch << " match=" << match << endl; 394 | vector<Seq> reads = load_fasta(argv[1]); 395 | vector<Seq> monomers = load_fasta(argv[2]); 396 | add_reverse_complement(monomers); 397 | MonomersAligner monomers_aligner(monomers, ins, del, mismatch, match); 398 | int num_threads = stoi(argv[3]); 399 | int part_size = stoi(argv[4]); 400 | int overlap = stoi(argv[5]); 401 | monomers_aligner.AlignReadsSet(reads, num_threads, part_size, ed_thr, overlap); 402 | } 403 | -------------------------------------------------------------------------------- /stringdecomposer/test_data/DXZ1_star_monomers.fa: -------------------------------------------------------------------------------- 1 | >A_0_DXZ1*_doubled/1978_2147/R 2 | TCCGTTTAGCTTTTAGGTGAAGATTATCCCGTTTCCAACGAAACCTTCAAAGAGGTCCAAATATCCCCTTGCGGATCCCACAGAAAGAGTGTTTCGAAACTGCTGTTTCAAAGGAATCTTCAACTCTGTGAGTTGAATGCAATCATCACAAAGAAGTTTCTGACAATGCT 3 | >B_1_DXZ1*_doubled/94_279/R 4 | TCTCTCTCGTCTTTCTGTGAAGATAAAGGAAAAGGCTTTCAGGCCTTTTCCACCACAGGCCTGAAAGCGCTCCAAATGTCCACTTGCAGATTCTGCCAAAAGAATATTTCAAAACTGCTCTATGAAAAGCAATGTTAAACTCTGTGGCTCGAACACAAACATCACAAAGCAGTTTCTGAGAATGCT 5 | >C_2_DXZ1*_doubled/280_450/R 6 | TCAGTTTAGTTTTTCTGTGGAAATATTCCCGTTTCCAAAGAAATCTTCAAAGAGGTCCACGTATCCACTTACAGATTCTACAAAAAGACAGTTTCAAAACTGCTCCATCAAAAGGAGGGTTCAACTGTGTGACTTGAATGCAATCATCACTCAGAAGTTTCTGAGAATGCT 7 | >D_3_DXZ1*_doubled/451_620/R 8 |
TCTCTTTAGTTTTTACGTGAACATATACCCGTTTCGAACGAAGGCCACCCAGTGGTCCAAATATCCACTTGCAGATTCTACAGAAAGAGTGTTTCGAACCTGAACTCTCAAAGGCAGGTTCATCTCTGCGAGTTAAATGCATTCATCATGAAGAACTTTCTCAGAGTGTT 9 | >E_4_DXZ1*_doubled/621_788/R 10 | TGTGTTTAGTTATGGGAAATTATTCCCGTTTCCAACGAAATCCTCAGAGAGCTCCAAATATCCACCTGCAGATTCTACCAAAAGTGTATTTGGAAACTGCTCCATCAAAAGGCATGTTCAGCTCTGTGAGTGAAACTCCATCATCACAAAGAATATTCTGAGAATGCT 11 | >F_5_DXZ1*_doubled/789_959/R 12 | TCCGTTTGCCTTTTATATGAAGTTCCTTCCTGTACTACCGTAGGCCTCAAAGCAGTCCAAATCTCCATTTGCAGATTCTACAAAAAGAGTGATTCCAATCTGCTCTATCAATAGGATTGTTCAACTCCATGAGTTGAATGCCATCCTCACAAAGTCGTTTCTGAGAATGCT 13 | >G_6_DXZ1*_doubled/960_1129/R 14 | TCTATCTAGTTTTTATGTGAAGATATTTCCTTTTCCACCACAGGCCTCAAAGCCCTCCAAACGTCCACTTGCAGATTCTCGAAAAAGAGTGTTTCATAGCTGCTCTTTCAAAGGAAAGTTCAACTCTGGGAGTTGAATACAAACATCACAAAGTAGTTTCCGAGAATGCT 15 | >H_7_DXZ1*_doubled/1130_1300/R 16 | TCTGTTTAGTTTTTATGTGAAGATGATCCCGTTTCCAGTGAAATCTTCAAAGAGGTCCACATATCCCCTTGCAGATTCCAAAGAAAGAGGGTTTCAAAACTGCTCCATCAGAAGGATTGTTCAACTCTGTGAGTTGAATGCAGTCATCGCAGAAAACTTTCTGAGAATGCT 17 | >I_8_DXZ1*_doubled/1301_1470/R 18 | TCTGTCTAGGTTTGATGTGAAGATATAGACGTTTCAAACGAAGGCTACAAAGTGGTCAAAATATACACTTGCAGATTCTACTACAAGGGTGTTGCAAACCTGAACTATCAAAGGAAGGTTCAACTCTGTGAGTTGAATACAAACATCACAAAGAATGTTCTGAGTTTGCT 19 | >J_9_DXZ1*_doubled/1471_1638/R 20 | TCCGTTCAGTTATGGGAAGTTGATCCCGTTTCCAACGAAATCCTCAGAGAGGTCCAAATATCCCCTTGCAGATTCTACAAAACGTGTGTTTGGAAACTGCTCCATCATAACGAATGTTCAGCTCCCTGAGTTAAACTCCATCGTCACAAAGAATTTTCTGAGAGTGCT 21 | >K_10_DXZ1*_doubled/1639_1808/R 22 | ACCGTCTGGTTTTTATATGAAGTTCTTTCCTTCACTACCACAGGCCTCAAAGCGGTCCAAATCTCCACTTGCAGATTCTACAAAAAGAGTGTTTGCAAACTGCTCTATCAAAGGAATGTTCAACTCTGGGAGTTGAATGCAATCATCACAGAGCAGTTTCTGAGAATGCT 23 | >L_11_DXZ1*_doubled/1809_1977/R 24 | TCTATGTCGTTTTTAGGAGAAGATATTTCCTTTTCCAACACAGTCCTCCAAGCCCGCTAAATAGCCACTTGCACATTGTAGAAAAAGTGTGTCAAAGCTGCGCTATCAAAGGGAAAGTTCAACTCTGTGAGGTGAATGCAAACATCCCAAAGAAGTTTCTGAGAATGCT 25 | --------------------------------------------------------------------------------