├── .github └── workflows │ ├── conda-build.yml │ ├── conda-install.yml │ ├── install_notRootLaunch.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── bin └── stringdecomposer ├── requirements.txt ├── setup.py └── stringdecomposer ├── __init__.py ├── __version__.py ├── main.py ├── models └── ont_logreg_model.txt ├── py ├── git.py └── standard_logger.py ├── src ├── edlib.cpp ├── edlib.h └── main.cpp └── test_data ├── DXZ1_star_monomers.fa ├── final_decomposition_fc89af8.tsv └── read.fa /.github/workflows/conda-build.yml: -------------------------------------------------------------------------------- 1 | name: Checking build using conda 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda install -y --file requirements.txt 29 | - name: Build 30 | run: | 31 | make 32 | - name: Run test dataset 33 | run: | 34 | make test_launch 35 | - name: Clean 36 | run: | 37 | make clean 38 | -------------------------------------------------------------------------------- /.github/workflows/conda-install.yml: -------------------------------------------------------------------------------- 1 | name: Checking install using conda 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda install -y --file requirements.txt 29 | - name: Install 30 | run: | 31 | make install 32 | - name: Run test dataset with installed StringDecomposer 33 | run: | 34 | make test_launch_install 35 | - name: Uninstall and clean 36 | run: | 37 | make uninstall 38 | make clean 39 | -------------------------------------------------------------------------------- /.github/workflows/install_notRootLaunch.yml: -------------------------------------------------------------------------------- 1 | name: Install with test launch not in root directory 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 
19 | - name: Add conda to system path 20 | run: | 21 | # $CONDA is an environment variable pointing to the root of the miniconda directory 22 | echo $CONDA/bin >> $GITHUB_PATH 23 | - name: Install dependencies 24 | run: | 25 | conda config --add channels defaults 26 | conda config --add channels bioconda 27 | conda config --add channels conda-forge 28 | conda install -y --file requirements.txt 29 | - name: Install 30 | run: | 31 | make install 32 | make uninstall 33 | - name: Run test dataset 34 | run: | 35 | mkdir test && cd test 36 | make -C .. test_launch 37 | - name: Run test dataset w/ install 38 | run: | 39 | cd test 40 | make -C .. test_launch_install 41 | make -C .. uninstall 42 | - name: Run test dataset w/ binary 43 | run: | 44 | cd test 45 | make -C .. install 46 | stringdecomposer ../stringdecomposer/test_data/read.fa ../stringdecomposer/test_data/DXZ1_star_monomers.fa -o . --second-best 47 | make -C .. uninstall 48 | - name: Run test dataset w/ binary 2 49 | run: | 50 | make install 51 | cd test 52 | stringdecomposer ../stringdecomposer/test_data/read.fa ../stringdecomposer/test_data/DXZ1_star_monomers.fa -o . --second-best 53 | make -C .. uninstall 54 | - name: Run test dataset w/ binary 3 55 | run: | 56 | make install 57 | cd .. 58 | mkdir test && cd test 59 | stringdecomposer ../stringdecomposer/stringdecomposer/test_data/read.fa ../stringdecomposer/stringdecomposer/test_data/DXZ1_star_monomers.fa -o . --second-best 60 | make -C ../stringdecomposer uninstall 61 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Pip build and install 5 | 6 | on: [push, workflow_dispatch] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: [3.5, 3.6, 3.7, 3.8, 3.9] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install flake8 27 | pip install -r requirements.txt 28 | - name: Lint with flake8 29 | run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/github/gitignore/blob/master/Python.gitignore 2 | # Jul 18, 2020. 
Commit 14f8a8b 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # pytype static type analyzer 138 | .pytype/ 139 | 140 | # Cython debug symbols 141 | cython_debug/ 142 | 143 | # Vim swp 144 | *.swp 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | StringDecomposer 2 | Copyright (c) 2020 Saint Petersburg State University 3 | 4 | StringDecomposer is free software; you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License, Version 2, 6 | dated June 1991, as published by the Free Software Foundation. 7 | 8 | StringDecomposer is distributed in the hope that it will be useful, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 | General Public License for more details. 
12 | 13 | You should have received a copy of the GNU General Public License along 14 | with this program; if not, write to the Free Software Foundation, Inc., 15 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 16 | 17 | 18 | ------------------------------------------------------------------------------- 19 | 20 | Third-party tools used into StringDecomposer: 21 | 22 | 1. StringDecomposer uses edlib library to calculate identities at the final stage of decomposition. 23 | 24 | For more details about edlib library please refer to 25 | https://github.com/Martinsos/edlib and to the following paper: 26 | 27 | Martin Šošić, Mile Šikić; Edlib: a C/C ++ library for fast, exact sequence alignment using edit distance. 28 | Bioinformatics 2017 btw753. doi: 10.1093/bioinformatics/btw753 29 | 30 | ------------------------------------------------------------------------------- 31 | 32 | GNU GENERAL PUBLIC LICENSE 33 | Version 2, June 1991 34 | 35 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 36 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 37 | Everyone is permitted to copy and distribute verbatim copies 38 | of this license document, but changing it is not allowed. 39 | 40 | Preamble 41 | 42 | The licenses for most software are designed to take away your 43 | freedom to share and change it. By contrast, the GNU General Public 44 | License is intended to guarantee your freedom to share and change free 45 | software--to make sure the software is free for all its users. This 46 | General Public License applies to most of the Free Software 47 | Foundation's software and to any other program whose authors commit to 48 | using it. (Some other Free Software Foundation software is covered by 49 | the GNU Lesser General Public License instead.) You can apply it to 50 | your programs, too. 51 | 52 | When we speak of free software, we are referring to freedom, not 53 | price. Our General Public Licenses are designed to make sure that you 54 | have the freedom to distribute copies of free software (and charge for 55 | this service if you wish), that you receive source code or can get it 56 | if you want it, that you can change the software or use pieces of it 57 | in new free programs; and that you know you can do these things. 58 | 59 | To protect your rights, we need to make restrictions that forbid 60 | anyone to deny you these rights or to ask you to surrender the rights. 61 | These restrictions translate to certain responsibilities for you if you 62 | distribute copies of the software, or if you modify it. 63 | 64 | For example, if you distribute copies of such a program, whether 65 | gratis or for a fee, you must give the recipients all the rights that 66 | you have. You must make sure that they, too, receive or can get the 67 | source code. And you must show them these terms so they know their 68 | rights. 69 | 70 | We protect your rights with two steps: (1) copyright the software, and 71 | (2) offer you this license which gives you legal permission to copy, 72 | distribute and/or modify the software. 73 | 74 | Also, for each author's protection and ours, we want to make certain 75 | that everyone understands that there is no warranty for this free 76 | software. If the software is modified by someone else and passed on, we 77 | want its recipients to know that what they have is not the original, so 78 | that any problems introduced by others will not reflect on the original 79 | authors' reputations. 
80 | 81 | Finally, any free program is threatened constantly by software 82 | patents. We wish to avoid the danger that redistributors of a free 83 | program will individually obtain patent licenses, in effect making the 84 | program proprietary. To prevent this, we have made it clear that any 85 | patent must be licensed for everyone's free use or not licensed at all. 86 | 87 | The precise terms and conditions for copying, distribution and 88 | modification follow. 89 | 90 | GNU GENERAL PUBLIC LICENSE 91 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 92 | 93 | 0. This License applies to any program or other work which contains 94 | a notice placed by the copyright holder saying it may be distributed 95 | under the terms of this General Public License. The "Program", below, 96 | refers to any such program or work, and a "work based on the Program" 97 | means either the Program or any derivative work under copyright law: 98 | that is to say, a work containing the Program or a portion of it, 99 | either verbatim or with modifications and/or translated into another 100 | language. (Hereinafter, translation is included without limitation in 101 | the term "modification".) Each licensee is addressed as "you". 102 | 103 | Activities other than copying, distribution and modification are not 104 | covered by this License; they are outside its scope. The act of 105 | running the Program is not restricted, and the output from the Program 106 | is covered only if its contents constitute a work based on the 107 | Program (independent of having been made by running the Program). 108 | Whether that is true depends on what the Program does. 109 | 110 | 1. You may copy and distribute verbatim copies of the Program's 111 | source code as you receive it, in any medium, provided that you 112 | conspicuously and appropriately publish on each copy an appropriate 113 | copyright notice and disclaimer of warranty; keep intact all the 114 | notices that refer to this License and to the absence of any warranty; 115 | and give any other recipients of the Program a copy of this License 116 | along with the Program. 117 | 118 | You may charge a fee for the physical act of transferring a copy, and 119 | you may at your option offer warranty protection in exchange for a fee. 120 | 121 | 2. You may modify your copy or copies of the Program or any portion 122 | of it, thus forming a work based on the Program, and copy and 123 | distribute such modifications or work under the terms of Section 1 124 | above, provided that you also meet all of these conditions: 125 | 126 | a) You must cause the modified files to carry prominent notices 127 | stating that you changed the files and the date of any change. 128 | 129 | b) You must cause any work that you distribute or publish, that in 130 | whole or in part contains or is derived from the Program or any 131 | part thereof, to be licensed as a whole at no charge to all third 132 | parties under the terms of this License. 133 | 134 | c) If the modified program normally reads commands interactively 135 | when run, you must cause it, when started running for such 136 | interactive use in the most ordinary way, to print or display an 137 | announcement including an appropriate copyright notice and a 138 | notice that there is no warranty (or else, saying that you provide 139 | a warranty) and that users may redistribute the program under 140 | these conditions, and telling the user how to view a copy of this 141 | License. 
(Exception: if the Program itself is interactive but 142 | does not normally print such an announcement, your work based on 143 | the Program is not required to print an announcement.) 144 | 145 | These requirements apply to the modified work as a whole. If 146 | identifiable sections of that work are not derived from the Program, 147 | and can be reasonably considered independent and separate works in 148 | themselves, then this License, and its terms, do not apply to those 149 | sections when you distribute them as separate works. But when you 150 | distribute the same sections as part of a whole which is a work based 151 | on the Program, the distribution of the whole must be on the terms of 152 | this License, whose permissions for other licensees extend to the 153 | entire whole, and thus to each and every part regardless of who wrote it. 154 | 155 | Thus, it is not the intent of this section to claim rights or contest 156 | your rights to work written entirely by you; rather, the intent is to 157 | exercise the right to control the distribution of derivative or 158 | collective works based on the Program. 159 | 160 | In addition, mere aggregation of another work not based on the Program 161 | with the Program (or with a work based on the Program) on a volume of 162 | a storage or distribution medium does not bring the other work under 163 | the scope of this License. 164 | 165 | 3. You may copy and distribute the Program (or a work based on it, 166 | under Section 2) in object code or executable form under the terms of 167 | Sections 1 and 2 above provided that you also do one of the following: 168 | 169 | a) Accompany it with the complete corresponding machine-readable 170 | source code, which must be distributed under the terms of Sections 171 | 1 and 2 above on a medium customarily used for software interchange; or, 172 | 173 | b) Accompany it with a written offer, valid for at least three 174 | years, to give any third party, for a charge no more than your 175 | cost of physically performing source distribution, a complete 176 | machine-readable copy of the corresponding source code, to be 177 | distributed under the terms of Sections 1 and 2 above on a medium 178 | customarily used for software interchange; or, 179 | 180 | c) Accompany it with the information you received as to the offer 181 | to distribute corresponding source code. (This alternative is 182 | allowed only for noncommercial distribution and only if you 183 | received the program in object code or executable form with such 184 | an offer, in accord with Subsection b above.) 185 | 186 | The source code for a work means the preferred form of the work for 187 | making modifications to it. For an executable work, complete source 188 | code means all the source code for all modules it contains, plus any 189 | associated interface definition files, plus the scripts used to 190 | control compilation and installation of the executable. However, as a 191 | special exception, the source code distributed need not include 192 | anything that is normally distributed (in either source or binary 193 | form) with the major components (compiler, kernel, and so on) of the 194 | operating system on which the executable runs, unless that component 195 | itself accompanies the executable. 
196 | 197 | If distribution of executable or object code is made by offering 198 | access to copy from a designated place, then offering equivalent 199 | access to copy the source code from the same place counts as 200 | distribution of the source code, even though third parties are not 201 | compelled to copy the source along with the object code. 202 | 203 | 4. You may not copy, modify, sublicense, or distribute the Program 204 | except as expressly provided under this License. Any attempt 205 | otherwise to copy, modify, sublicense or distribute the Program is 206 | void, and will automatically terminate your rights under this License. 207 | However, parties who have received copies, or rights, from you under 208 | this License will not have their licenses terminated so long as such 209 | parties remain in full compliance. 210 | 211 | 5. You are not required to accept this License, since you have not 212 | signed it. However, nothing else grants you permission to modify or 213 | distribute the Program or its derivative works. These actions are 214 | prohibited by law if you do not accept this License. Therefore, by 215 | modifying or distributing the Program (or any work based on the 216 | Program), you indicate your acceptance of this License to do so, and 217 | all its terms and conditions for copying, distributing or modifying 218 | the Program or works based on it. 219 | 220 | 6. Each time you redistribute the Program (or any work based on the 221 | Program), the recipient automatically receives a license from the 222 | original licensor to copy, distribute or modify the Program subject to 223 | these terms and conditions. You may not impose any further 224 | restrictions on the recipients' exercise of the rights granted herein. 225 | You are not responsible for enforcing compliance by third parties to 226 | this License. 227 | 228 | 7. If, as a consequence of a court judgment or allegation of patent 229 | infringement or for any other reason (not limited to patent issues), 230 | conditions are imposed on you (whether by court order, agreement or 231 | otherwise) that contradict the conditions of this License, they do not 232 | excuse you from the conditions of this License. If you cannot 233 | distribute so as to satisfy simultaneously your obligations under this 234 | License and any other pertinent obligations, then as a consequence you 235 | may not distribute the Program at all. For example, if a patent 236 | license would not permit royalty-free redistribution of the Program by 237 | all those who receive copies directly or indirectly through you, then 238 | the only way you could satisfy both it and this License would be to 239 | refrain entirely from distribution of the Program. 240 | 241 | If any portion of this section is held invalid or unenforceable under 242 | any particular circumstance, the balance of the section is intended to 243 | apply and the section as a whole is intended to apply in other 244 | circumstances. 245 | 246 | It is not the purpose of this section to induce you to infringe any 247 | patents or other property right claims or to contest validity of any 248 | such claims; this section has the sole purpose of protecting the 249 | integrity of the free software distribution system, which is 250 | implemented by public license practices. 
Many people have made 251 | generous contributions to the wide range of software distributed 252 | through that system in reliance on consistent application of that 253 | system; it is up to the author/donor to decide if he or she is willing 254 | to distribute software through any other system and a licensee cannot 255 | impose that choice. 256 | 257 | This section is intended to make thoroughly clear what is believed to 258 | be a consequence of the rest of this License. 259 | 260 | 8. If the distribution and/or use of the Program is restricted in 261 | certain countries either by patents or by copyrighted interfaces, the 262 | original copyright holder who places the Program under this License 263 | may add an explicit geographical distribution limitation excluding 264 | those countries, so that distribution is permitted only in or among 265 | countries not thus excluded. In such case, this License incorporates 266 | the limitation as if written in the body of this License. 267 | 268 | 9. The Free Software Foundation may publish revised and/or new versions 269 | of the General Public License from time to time. Such new versions will 270 | be similar in spirit to the present version, but may differ in detail to 271 | address new problems or concerns. 272 | 273 | Each version is given a distinguishing version number. If the Program 274 | specifies a version number of this License which applies to it and "any 275 | later version", you have the option of following the terms and conditions 276 | either of that version or of any later version published by the Free 277 | Software Foundation. If the Program does not specify a version number of 278 | this License, you may choose any version ever published by the Free Software 279 | Foundation. 280 | 281 | 10. If you wish to incorporate parts of the Program into other free 282 | programs whose distribution conditions are different, write to the author 283 | to ask for permission. For software which is copyrighted by the Free 284 | Software Foundation, write to the Free Software Foundation; we sometimes 285 | make exceptions for this. Our decision will be guided by the two goals 286 | of preserving the free status of all derivatives of our free software and 287 | of promoting the sharing and reuse of software generally. 288 | 289 | NO WARRANTY 290 | 291 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 292 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 293 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 294 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 295 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 296 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 297 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 298 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 299 | REPAIR OR CORRECTION. 300 | 301 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 302 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 303 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 304 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 305 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 306 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 307 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 308 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 309 | POSSIBILITY OF SUCH DAMAGES. 310 | 311 | END OF TERMS AND CONDITIONS 312 | 313 | How to Apply These Terms to Your New Programs 314 | 315 | If you develop a new program, and you want it to be of the greatest 316 | possible use to the public, the best way to achieve this is to make it 317 | free software which everyone can redistribute and change under these terms. 318 | 319 | To do so, attach the following notices to the program. It is safest 320 | to attach them to the start of each source file to most effectively 321 | convey the exclusion of warranty; and each file should have at least 322 | the "copyright" line and a pointer to where the full notice is found. 323 | 324 | 325 | Copyright (C) 326 | 327 | This program is free software; you can redistribute it and/or modify 328 | it under the terms of the GNU General Public License as published by 329 | the Free Software Foundation; either version 2 of the License, or 330 | (at your option) any later version. 331 | 332 | This program is distributed in the hope that it will be useful, 333 | but WITHOUT ANY WARRANTY; without even the implied warranty of 334 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 335 | GNU General Public License for more details. 336 | 337 | You should have received a copy of the GNU General Public License along 338 | with this program; if not, write to the Free Software Foundation, Inc., 339 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 340 | 341 | Also add information on how to contact you by electronic and paper mail. 342 | 343 | If the program is interactive, make it output a short notice like this 344 | when it starts in an interactive mode: 345 | 346 | Gnomovision version 69, Copyright (C) year name of author 347 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 348 | This is free software, and you are welcome to redistribute it 349 | under certain conditions; type `show c' for details. 350 | 351 | The hypothetical commands `show w' and `show c' should show the appropriate 352 | parts of the General Public License. Of course, the commands you use may 353 | be called something other than `show w' and `show c'; they could even be 354 | mouse-clicks or menu items--whatever suits your program. 355 | 356 | You should also get your employer (if you work as a programmer) or your 357 | school, if any, to sign a "copyright disclaimer" for the program, if 358 | necessary. Here is a sample; alter the names: 359 | 360 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 361 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 362 | 363 | , 1 April 1989 364 | Ty Coon, President of Vice 365 | 366 | This General Public License does not permit incorporating your program into 367 | proprietary programs. 
If your program is a subroutine library, you may 368 | consider it more useful to permit linking proprietary applications with the 369 | library. If this is what you want to do, use the GNU Lesser General 370 | Public License instead of this License. 371 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | export ROOT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 2 | SD_DIR = $(ROOT_DIR)/stringdecomposer 3 | BUILD_DIR = $(SD_DIR)/build 4 | BIN_DIR = $(BUILD_DIR)/bin 5 | SRC_DIR = $(SD_DIR)/src 6 | 7 | TEST_QUERY = $(SD_DIR)/test_data/read.fa 8 | TEST_MONOMERS = $(SD_DIR)/test_data/DXZ1_star_monomers.fa 9 | TEST_OUTDIR = $(SD_DIR)/test_data 10 | TEST_REFERENCE = $(SD_DIR)/test_data/final_decomposition_fc89af8.tsv 11 | 12 | build: 13 | mkdir -p $(BIN_DIR) 14 | ${CXX} -o $(BIN_DIR)/dp $(SRC_DIR)/main.cpp $(SRC_DIR)/edlib.cpp -fopenmp --std=c++11 -O2 -Wall -Wextra -pedantic -Wshadow -Wfloat-equal 15 | 16 | test_launch: build 17 | bin/stringdecomposer $(TEST_QUERY) $(TEST_MONOMERS) -o $(TEST_OUTDIR) --second-best 18 | grep -q "Thank you for using StringDecomposer!" $(TEST_OUTDIR)/stringdecomposer.log 19 | diff -q $(TEST_REFERENCE) $(TEST_OUTDIR)/final_decomposition.tsv 20 | 21 | install: build 22 | python setup.py install --record install_footprint.txt 23 | 24 | test_launch_install: install 25 | stringdecomposer $(TEST_QUERY) $(TEST_MONOMERS) -o $(TEST_OUTDIR) --second-best 26 | grep -q "Thank you for using StringDecomposer!" $(TEST_OUTDIR)/stringdecomposer.log 27 | diff -q $(TEST_REFERENCE) $(TEST_OUTDIR)/final_decomposition.tsv 28 | 29 | clean: 30 | -rm -rf $(BUILD_DIR) 31 | -rm -rf $(SD_DIR)/test_data/final_decomposition_alt.tsv 32 | -rm -rf $(SD_DIR)/test_data/final_decomposition_raw.tsv 33 | -rm -rf $(SD_DIR)/test_data/final_decomposition.tsv 34 | -rm -rf $(SD_DIR)/test_data/stringdecomposer.log 35 | -rm -rf StringDecomposer.egg-info dist build 36 | 37 | uninstall: 38 | @if [ -f install_footprint.txt ]; then\ 39 | echo "removing install footprint from install_footprint.txt";\ 40 | cat install_footprint.txt | xargs rm -rf;\ 41 | rm -rf install_footprint.txt;\ 42 | fi 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/stringdecomposer/badges/installer/conda.svg)](https://anaconda.org/bioconda/stringdecomposer) 2 | 3 | # StringDecomposer 4 | 5 | ## Version 1.1.2 6 | 7 | As an input StringDecomposer algorithm takes the set of monomers (typically, alpha satellites) and a genomic segment (assembly, Oxford Nanopore or a PacBio HiFi read) that contains a tandem repeat consisting of the given monomers. 8 | StringDecomposer partitions this segment into distinct monomers, providing an accurate translation from the nucleotide alphabet into the monomer alphabet. 9 | 10 | 11 | ## Installation 12 | 13 | The recommended way to install StringDecomposer is with conda package manager: 14 | ``` 15 | conda install -c bioconda stringdecomposer 16 | ``` 17 | 18 | 19 | Alternatively, StringDecomposer can be build and installed from source. 
20 | 21 | Requirements: 22 | - Python3.5+ 23 | - [biopython](https://biopython.org/wiki/Download) 24 | - [pandas](https://pypi.org/project/pandas/) 25 | - [python-edlib](https://pypi.org/project/edlib/) 26 | - [setuptools](https://pypi.org/project/setuptools/) 27 | - g++ (version 5.3.1 or higher) 28 | 29 | The required python packages can be installed through conda using 30 | 31 | conda install --file requirements.txt 32 | 33 | Local building without installation: 34 | 35 | git clone https://github.com/ablab/stringdecomposer.git 36 | cd stringdecomposer 37 | make 38 | 39 | Then, StringDecomposer is available as 40 | 41 | bin/stringdecomposer 42 | 43 | 44 | Installing from source: 45 | 46 | git clone https://github.com/ablab/stringdecomposer.git 47 | cd stringdecomposer 48 | make install 49 | 50 | Then, StringDecomposer is available as 51 | 52 | stringdecomposer 53 | 54 | Removal of StringDecomposer installed from source: 55 | 56 | make uninstall 57 | 58 | ## Quick start 59 | The following command assumes that StringDecomposer is either installed through conda or from source. 60 | 61 | stringdecomposer ./stringdecomposer/test_data/read.fa ./stringdecomposer/test_data/DXZ1_star_monomers.fa -o ./stringdecomposer/test_data 62 | 63 | The same result can be achieved with `make test_launch` (for local build without installation) and 64 | `make test_launch_install` (for installed from source or via conda). 65 | These `make` rules ensure correctness of StringDecomposer's output on the test dataset. 66 | 67 | In case StringDecomposer is built locally, the command that achieves the same result is 68 | 69 | ./bin/stringdecomposer ./stringdecomposer/test_data/read.fa ./stringdecomposer/test_data/DXZ1_star_monomers.fa -o ./stringdecomposer/test_data 70 | 71 | Results can be found in 72 | 73 | ./stringdecomposer/test_data/final_decomposition.tsv final decomposition of sequences to monomer alphabet 74 | ./stringdecomposer/test_data/final_decomposition_alt.tsv final decomposition of sequences to monomer alphabet with alternative monomers for each position 75 | ./stringdecomposer/test_data/final_decomposition_raw.tsv raw decomposition with initial dynamic programming scores instead of identities 76 | 77 | Each line in final_decomposition.tsv file has the following form: 78 | 79 | 80 | 81 | `homo`-related columns represent statistics of the best-scoring (second-best-scoring) monomer after compression of homopolymer runs in both the monomer and the target read. 82 | Reliability is either equal to `?` (signifies unreliable alignment which can be caused by a retrotransposon insertion or a poor quality segment of a read) or `+` (if the alignment is reliable). 83 | The columns ``, ``, ``, and `_homo_`-related columns will have values `None` and `-1` unless the user supplies the argument `--second-best` (see Synopsis below). 
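Each such row of `final_decomposition.tsv` is a single tab-separated line. As a sketch of the field order, reconstructed from `print_read` in `stringdecomposer/main.py` (the labels below are descriptive names chosen for this explanation, not identifiers used by the tool itself):

    <read-name>  <best monomer>  <start>  <end>  <identity>
    <second-best monomer>  <second-best identity>
    <homo best monomer>  <homo best identity>
    <homo second-best monomer>  <homo second-best identity>
    <reliability>

Identities are printed with two decimal places, so a row looks roughly like the following (illustrative values only, not taken from the test data):

    read_1  monomer_A  0  170  98.25  monomer_B  91.10  monomer_A  98.80  monomer_B  92.00  +

Because the file has no header line, it can be loaded with explicit column names. A minimal sketch, assuming the twelve-column layout above and column names chosen here for illustration:

    import pandas as pd

    COLUMNS = ["read", "monomer", "start", "end", "identity",
               "second_best", "second_best_identity",
               "homo_best", "homo_best_identity",
               "homo_second_best", "homo_second_best_identity",
               "reliability"]

    # final_decomposition.tsv is written without a header, so pass the names explicitly.
    decomposition = pd.read_csv("final_decomposition.tsv", sep="\t", names=COLUMNS)

    # Keep only alignments flagged as reliable ('+'); '?' marks unreliable segments.
    reliable = decomposition[decomposition["reliability"] == "+"]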
84 | 85 | 86 | ## Synopsis 87 | 88 | stringdecomposer [-h] [-t THREADS] [-o OUT_FILE] [-i MIN_IDENTITY] [-s SCORING] [-b BATCH_SIZE] [--second-best] sequences monomers 89 | 90 | Required arguments: 91 | 92 | sequences fasta-file with long reads or genomic sequences (accepts multiple sequences in one file) 93 | monomers fasta-file with monomers 94 | 95 | Optional arguments: 96 | 97 | -h, --help show this help message and exit 98 | 99 | -t THREADS, --threads THREADS number of threads (by default 1) 100 | 101 | -o OUT_FILE, --out-file OUT_FILE output tsv-file (by default final_decomposition.tsv) 102 | 103 | -i MIN_IDENTITY, --min-identity MIN_IDENTITY only monomer alignments with percent identity >= MIN_IDENTITY are printed (by default MIN_IDENTITY=0%) 104 | 105 | -s SCORING, --scoring SCORING set scoring scheme for StringDecomposer in the format "insertion,deletion,mismatch,match" (by default "-1,-1,-1,1") 106 | 107 | -b BATCH_SIZE, --batch-size BATCH_SIZE set size of the batch in parallelization (by default 5000) 108 | 109 | --second-best StringDecomposer will generate , , and _homo_-related columns (not recommended when running StringDecomposer of a large number of monomers) 110 | 111 | ## Latest updates 112 | 113 | ### StringDecomposer 1.1.2 release (12 Oct 2021) 114 | 115 | * Remove building with Address Sanitizer by default 116 | 117 | ### StringDecomposer 1.1.1 release (20 July 2021) 118 | 119 | * git hash is disabled to enable execution outside of git repo 120 | 121 | ### StringDecomposer 1.1 release (28 June 2021) 122 | 123 | * CI support via github actions 124 | * improved build and installation 125 | * removal of unnecessary dependencies 126 | * py module of StringDecomposer saves commit hash and has a logger 127 | 128 | ### StringDecomposer 1.0 release (11 August 2020) 129 | * initial StringDecomposer release 130 | * conda support 131 | * results of StringDecomposer monomer annotation for available centromere assemblies and ONT and Hifi reads of cen6, cen8, and cenX can be found at [Figshare](https://doi.org/10.6084/m9.figshare.12783371) 132 | 133 | 134 | ## Citation 135 | 136 | The String Decomposition Problem and its Applications to Centromere Analysis and Assembly. *Tatiana Dvorkina, Andrey V. Bzikadze, Pavel A. 
Pevzner* Bioinformatics, Volume 36, Issue Supplement_1, July 2020, Pages i93–i101; doi: [https://doi.org/10.1093/bioinformatics/btaa454](https://doi.org/10.1093/bioinformatics/btaa454) 137 | 138 | ## Contact 139 | 140 | In case of any issues please use [issue tracker](https://github.com/ablab/stringdecomposer/issues) or email directly to [t.dvorkina@spbu.ru](mailto:t.dvorkina@spbu.ru) 141 | -------------------------------------------------------------------------------- /bin/stringdecomposer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | 6 | sd_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir) 7 | sys.path.insert(0, sd_root) 8 | 9 | from stringdecomposer.main import main 10 | sys.exit(main()) 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | pandas 3 | python-edlib 4 | setuptools 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | 6 | try: 7 | import setuptools 8 | except ImportError: 9 | sys.exit("setuptools package not found. " 10 | "Please use 'pip install setuptools' first") 11 | 12 | from setuptools import setup 13 | from distutils.command.build import build as DistutilsBuild 14 | from distutils.spawn import find_executable 15 | 16 | from stringdecomposer.__version__ import __version__ 17 | 18 | 19 | # Make sure we're running from the setup.py directory. 20 | script_dir = os.path.dirname(os.path.realpath(__file__)) 21 | if script_dir != os.getcwd(): 22 | os.chdir(script_dir) 23 | 24 | 25 | requirements_fn = os.path.join(script_dir, 'requirements.txt') 26 | requirements = [] 27 | with open(requirements_fn) as f: 28 | for line in f: 29 | line = line.strip() 30 | if line == 'python-edlib': 31 | requirements.append('edlib') 32 | else: 33 | requirements.append(line) 34 | 35 | 36 | description = \ 37 | """ 38 | StringDecomposer (SD) algorithm takes the set of monomers 39 | and a long error-prone read (or a genomic segment) 40 | and partitions this read into distinct monomers, 41 | providing an accurate translation of each read 42 | from a nucleotide alphabet into a monomer alphabet. 
43 | """ 44 | 45 | 46 | class MakeBuild(DistutilsBuild): 47 | def run(self): 48 | if not find_executable("make"): 49 | sys.exit("ERROR: 'make' command is unavailable") 50 | try: 51 | subprocess.check_call(["make"]) 52 | except subprocess.CalledProcessError as e: 53 | sys.exit("Compilation error: ", e) 54 | DistutilsBuild.run(self) 55 | 56 | 57 | setup( 58 | name="StringDecomposer", 59 | version=__version__, 60 | description=description, 61 | url='https://github.com/ablab/stringdecomposer', 62 | author='Tatiana Dvorkina', 63 | author_email='tanunia@gmail.com', 64 | license='GNU General Public License v2.0', 65 | install_requires=requirements, 66 | packages=['stringdecomposer'], 67 | package_dir={'stringdecomposer': 'stringdecomposer'}, 68 | package_data={'stringdecomposer': ['build/bin/dp', 'models/*', '*', 'py/*']}, 69 | entry_points={ 70 | 'console_scripts': ['stringdecomposer=stringdecomposer.main:main'] 71 | }, 72 | cmdclass={'build': MakeBuild} 73 | ) 74 | -------------------------------------------------------------------------------- /stringdecomposer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ablab/stringdecomposer/0a967f6bf131face88397445ca49b65fee49489c/stringdecomposer/__init__.py -------------------------------------------------------------------------------- /stringdecomposer/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.2" 2 | -------------------------------------------------------------------------------- /stringdecomposer/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import pathlib 6 | import subprocess 7 | import sys 8 | 9 | from Bio import SeqIO 10 | from Bio.SeqRecord import SeqRecord 11 | import edlib 12 | import pandas as pd 13 | import re 14 | 15 | from stringdecomposer.py.standard_logger import get_logger 16 | from stringdecomposer.py.git import get_git_revision_short_hash 17 | 18 | 19 | CUR_FILE = os.path.abspath(__file__) 20 | CUR_DIR = os.path.dirname(CUR_FILE) 21 | SD_BIN = os.path.join(CUR_DIR, 'build', 'bin', 'dp') 22 | LOGREG_FILE = os.path.join(CUR_DIR, 23 | 'models', 24 | 'ont_logreg_model.txt') 25 | with open(LOGREG_FILE) as f: 26 | LR_MODEL_COEF = list(map(float, f.readline().strip().split())) 27 | 28 | 29 | def edist(lst): 30 | if len(str(lst[0])) == 0: 31 | return -1, "" 32 | if len(str(lst[1])) == 0: 33 | return -1, "" 34 | result = edlib.align(str(lst[0]), str(lst[1]), mode="NW", task="path") 35 | return result["editDistance"], result["cigar"] 36 | 37 | 38 | def aai(ar): 39 | p1, p2 = str(ar[0]), str(ar[1]) 40 | if p1.endswith("*"): 41 | p1 = p1[:-1] 42 | if p2.endswith("*"): 43 | p2 = p2[:-1] 44 | ed, cigar = edist([str(p1), str(p2)]) 45 | if ed == -1: 46 | return 0 47 | total_length = 0 #max(len(p1), len(p2)) 48 | n = 0 49 | for c in cigar: 50 | if c.isdigit(): 51 | n = n*10 + int(c) 52 | else: 53 | total_length += n 54 | n = 0 55 | matches = re.findall(r'\d+=', cigar) 56 | aai = 0.0 57 | for m in matches: 58 | aai += int(m[:-1]) 59 | aai /= total_length 60 | return aai*100 61 | 62 | 63 | def load_fasta(filename, tp = "list"): 64 | if tp == "map": 65 | records = SeqIO.to_dict(SeqIO.parse(filename, "fasta")) 66 | for r in records: 67 | records[r] = records[r].upper() 68 | else: 69 | records = list(SeqIO.parse(filename, "fasta")) 70 | for i in range(len(records)): 71 | records[i] = 
records[i].upper() 72 | return records 73 | 74 | 75 | def make_record(seq, name, sid, d=""): 76 | return SeqRecord(seq, id=sid, name=name, description = d) 77 | 78 | 79 | def add_rc_monomers(monomers): 80 | res = [] 81 | for m in monomers: 82 | res.append(m) 83 | res.append(make_record(m.seq.reverse_complement(), m.name + "'", m.id + "'")) 84 | return res 85 | 86 | 87 | def convert_to_homo(seq): 88 | res = "" 89 | for c in seq: 90 | if len(res) == 0 or res[-1] != c: 91 | res += c 92 | return res 93 | 94 | 95 | def classify(reads_mapping): 96 | df = pd.DataFrame(reads_mapping) 97 | df["idnt_diff"] = df["score"] - df["second_best_score"] 98 | X = pd.concat([df["score"], df["idnt_diff"]], axis=1, keys = ["idnt", "idnt_diff"]) 99 | X.insert(0, 'intercept', 1) 100 | y_pred = (X.dot(LR_MODEL_COEF)) > 0 101 | for i in range(len(reads_mapping)): 102 | if y_pred[i] != 1: 103 | reads_mapping[i]["q"] = "?" 104 | return reads_mapping 105 | 106 | 107 | def convert_read(decomposition, read, monomers, light = False): 108 | res = [] 109 | for d in decomposition: 110 | monomer, start, end = d["m"], d["start"], d["end"] 111 | if light: 112 | scores = {} 113 | for m in monomers: 114 | if m.name == monomer: 115 | score = aai([read.seq[start:end + 1], m.seq]) 116 | scores[m.name] = score 117 | res.append({"m": monomer, "start": str(d["start"]), "end": str(d["end"]), "score": scores[monomer], \ 118 | "second_best": "None", "second_best_score": -1,\ 119 | "homo_best": "None", "homo_best_score": -1,\ 120 | "homo_second_best": "None", "homo_second_best_score": -1,\ 121 | "alt": {}, "q": "+"}) 122 | else: 123 | scores = {} 124 | for m in monomers: 125 | score = aai([read.seq[start:end + 1], m.seq]) 126 | scores[m.name] = score 127 | if monomer == None: 128 | for s in scores: 129 | if monomer == None or scores[s] > scores[monomer]: 130 | monomer = s 131 | secondbest, secondbest_score = None, -1 132 | for m in scores: 133 | if m != monomer: # and abs(scores[m] - scores[monomer]) < 5: 134 | if not secondbest or secondbest_score < scores[m]: 135 | secondbest, secondbest_score = m, scores[m] 136 | 137 | homo_scores = [] 138 | homo_subseq = convert_to_homo(read.seq[start:end + 1]) 139 | for m in monomers: 140 | score = aai([homo_subseq, convert_to_homo(m.seq)]) 141 | homo_scores.append([m.name, score]) 142 | homo_scores = sorted(homo_scores, key = lambda x: -x[1]) 143 | res.append({"m": monomer, "start": str(d["start"]), "end": str(d["end"]), "score": scores[monomer], \ 144 | "second_best": str(secondbest), "second_best_score": secondbest_score,\ 145 | "homo_best": homo_scores[0][0], "homo_best_score": homo_scores[0][1],\ 146 | "homo_second_best": homo_scores[1][0], "homo_second_best_score": homo_scores[1][1],\ 147 | "alt": scores, "q": "+"}) 148 | 149 | res = classify(res) 150 | return res 151 | 152 | 153 | def print_read(fout, fout_alt, dec, read, monomers, identity_th, light): 154 | dec = convert_read(dec, read, monomers, light) 155 | for d in dec: 156 | if d["score"] >= identity_th: 157 | fout.write("\t".join([read.name, d["m"], d["start"], d["end"], "{:.2f}".format(d["score"]), \ 158 | d["second_best"], "{:.2f}".format(d["second_best_score"]), \ 159 | d["homo_best"], "{:.2f}".format(d["homo_best_score"]), \ 160 | d["homo_second_best"], "{:.2f}".format(d["homo_second_best_score"]), d["q"]]) + "\n") 161 | for a in d["alt"]: 162 | star = "-" 163 | if a == d["m"]: 164 | star = "*" 165 | fout_alt.write("\t".join([read.name, a, d["start"], d["end"], "{:.2f}".format(d["alt"][a]), star]) + "\n") 166 | 167 | 168 | def 
convert_tsv(decomposition, reads, monomers, outfile, identity_th, light): 169 | with open(outfile[:-len(".tsv")] + "_alt.tsv", "w") as fout_alt: 170 | with open(outfile, "w") as fout: 171 | cur_dec = [] 172 | prev_read = None 173 | for ln in decomposition.split("\n")[:-1]: 174 | read, monomer, start, end = ln.split("\t")[:4] 175 | read = read.split()[0] 176 | monomer = monomer.split()[0] 177 | if read != prev_read and prev_read != None: 178 | print_read(fout, fout_alt, cur_dec, reads[prev_read], monomers, identity_th, light) 179 | cur_dec = [] 180 | prev_read = read 181 | start, end = int(start), int(end) 182 | cur_dec.append({"m": monomer, "start": start, "end": end}) 183 | if len(cur_dec) > 0: 184 | print_read(fout, fout_alt, cur_dec, reads[prev_read], monomers, identity_th, light) 185 | 186 | def run(sequences, monomers, num_threads, scoring, batch_size, raw_file, ed_thr, overlap, logger): 187 | ins, dels, mm, match = scoring.split(",") 188 | if not os.path.isfile(SD_BIN): 189 | logger.info('The binary of String Decomposer is not available. Did you forget to run `make`? Aborting.') 190 | sys.exit(1) 191 | 192 | logger.info(' '.join(["Run", SD_BIN, "with parameters", sequences, monomers, str(num_threads), str(batch_size), str(overlap), scoring])) 193 | with open(raw_file, 'w') as f: 194 | subprocess.run([SD_BIN, sequences, monomers, num_threads, batch_size, overlap, ins, dels, mm, match, str(ed_thr)], stdout = f, check = True) 195 | with open(raw_file, 'r') as f: 196 | raw_decomposition = "".join(f.readlines()) 197 | return raw_decomposition 198 | 199 | 200 | 201 | def main(): 202 | parser = argparse.ArgumentParser(description='Decomposes string into blocks alphabet') 203 | parser.add_argument('sequences', help='fasta-file with long reads or genomic sequences') 204 | parser.add_argument('monomers', help='fasta-file with monomers') 205 | parser.add_argument('-t', '--threads', help='number of threads (by default 1)', default="1", required=False) 206 | parser.add_argument('-o', '--out-dir', help='output directory (by default .)', default=".", required=False) 207 | parser.add_argument('--out-file', help='output tsv-file (by default "final_decomposition")', default="final_decomposition", required=False) 208 | parser.add_argument('-i', '--min-identity', \ 209 | help='only monomer alignments with percent identity >= MIN_IDENTITY are printed (by default MIN_IDENTITY=0)', type=int, default=0, required=False) 210 | parser.add_argument('-s', '--scoring', \ 211 | help='set scoring scheme for SD in the format "insertion,deletion,mismatch,match" (by default "-1,-1,-1,1")', default="-1,-1,-1,1", required=False) 212 | parser.add_argument('-b', '--batch-size', help='set size of the batch in parallelization (by default 5000)', type=str, default="5000", required=False) 213 | parser.add_argument('--second-best', dest="second_best", help='generate second best monomer and homopolymer scores', action="store_true") 214 | parser.add_argument('--ed_thr', help='align only monomers with edit distance less then ed_thr for each segment (by default align all monomers)', default=-1, 215 | type=int, required=False) 216 | parser.add_argument('-v', '--overlap', help='set size of batch overlap (by default 500)', type=str, default="500", required=False) 217 | args = parser.parse_args() 218 | pathlib.Path(args.out_dir).mkdir(parents=True, exist_ok=True) 219 | 220 | logfn = os.path.join(args.out_dir, 'stringdecomposer.log') 221 | logger = get_logger(logfn, logger_name='StringDecomposer') 222 | 223 | logger.info(f'cmd: 
{sys.argv}') 224 | # TODO get_git_revision_short_hash is commented out 225 | # since it does not work when stringdecomposer is run from outside of repo 226 | # logger.info(f'git hash: {get_git_revision_short_hash()}') 227 | 228 | raw_decomp_fn = os.path.join(args.out_dir, args.out_file + "_raw.tsv") 229 | raw_decomposition = run(args.sequences, args.monomers, args.threads, args.scoring, args.batch_size, raw_decomp_fn, args.ed_thr, args.overlap, logger) 230 | logger.info("Saved raw decomposition to " + raw_decomp_fn) 231 | 232 | reads = load_fasta(args.sequences, "map") 233 | monomers = load_fasta(args.monomers) 234 | monomers = add_rc_monomers(monomers) 235 | logger.info("Transforming raw alignments...") 236 | 237 | convert_tsv_fn = os.path.join(args.out_dir, args.out_file + ".tsv") 238 | convert_tsv(raw_decomposition, reads, monomers, convert_tsv_fn, int(args.min_identity), not args.second_best) 239 | logger.info("Transformation finished. Results can be found in " + convert_tsv_fn) 240 | 241 | logger.info("Thank you for using StringDecomposer!") 242 | 243 | 244 | if __name__ == "__main__": 245 | main() 246 | -------------------------------------------------------------------------------- /stringdecomposer/models/ont_logreg_model.txt: -------------------------------------------------------------------------------- 1 | -31.48494996 0.41784018 0.69186882 2 | -------------------------------------------------------------------------------- /stringdecomposer/py/git.py: -------------------------------------------------------------------------------- 1 | # (c) 2020 by Authors 2 | # This file is a part of centroFlye program. 3 | # Released under the BSD license (see LICENSE file) 4 | 5 | import subprocess 6 | 7 | 8 | # Disclaimer: this code can only be run from the git repo and thus should not 9 | # be used in scripts intended for installation 10 | 11 | 12 | def get_git_revision_hash(): 13 | return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip() 14 | 15 | 16 | def get_git_revision_short_hash(): 17 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() 18 | -------------------------------------------------------------------------------- /stringdecomposer/py/standard_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def get_logger(filename, 6 | logger_name='StringDecomposer', 7 | level=logging.INFO, 8 | filemode='a', 9 | stdout=True): 10 | logger = logging.getLogger(logger_name) 11 | logger.setLevel(level) 12 | 13 | # create the logging file handler 14 | fh = logging.FileHandler(filename, mode=filemode) 15 | 16 | formatter = logging.Formatter( 17 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | fh.setFormatter(formatter) 19 | 20 | # add handler to logger object 21 | logger.addHandler(fh) 22 | 23 | if stdout: 24 | sh = logging.StreamHandler(sys.stdout) 25 | sh.setFormatter(formatter) 26 | logger.addHandler(sh) 27 | 28 | return logger 29 | -------------------------------------------------------------------------------- /stringdecomposer/src/edlib.cpp: -------------------------------------------------------------------------------- 1 | #include "edlib.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | typedef uint64_t Word; 13 | static const int WORD_SIZE = sizeof(Word) * 8; // Size of Word in bits 14 | static const Word WORD_1 = static_cast(1); 15 | static 
const Word HIGH_BIT_MASK = WORD_1 << (WORD_SIZE - 1); // 100..00 16 | static const int MAX_UCHAR = 255; 17 | 18 | // Data needed to find alignment. 19 | struct AlignmentData { 20 | Word* Ps; 21 | Word* Ms; 22 | int* scores; 23 | int* firstBlocks; 24 | int* lastBlocks; 25 | 26 | AlignmentData(int maxNumBlocks, int targetLength) { 27 | // We build a complete table and mark first and last block for each column 28 | // (because algorithm is banded so only part of each columns is used). 29 | // TODO: do not build a whole table, but just enough blocks for each column. 30 | Ps = new Word[maxNumBlocks * targetLength]; 31 | Ms = new Word[maxNumBlocks * targetLength]; 32 | scores = new int[maxNumBlocks * targetLength]; 33 | firstBlocks = new int[targetLength]; 34 | lastBlocks = new int[targetLength]; 35 | } 36 | 37 | ~AlignmentData() { 38 | delete[] Ps; 39 | delete[] Ms; 40 | delete[] scores; 41 | delete[] firstBlocks; 42 | delete[] lastBlocks; 43 | } 44 | }; 45 | 46 | struct Block { 47 | Word P; // Pvin 48 | Word M; // Mvin 49 | int score; // score of last cell in block; 50 | 51 | Block() {} 52 | Block(Word p, Word m, int s) :P(p), M(m), score(s) {} 53 | }; 54 | 55 | 56 | /** 57 | * Defines equality relation on alphabet characters. 58 | * By default each character is always equal only to itself, but you can also provide additional equalities. 59 | */ 60 | class EqualityDefinition { 61 | private: 62 | bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1]; 63 | public: 64 | EqualityDefinition(const string& alphabet, 65 | const EdlibEqualityPair* additionalEqualities = NULL, 66 | const int additionalEqualitiesLength = 0) { 67 | for (int i = 0; i < static_cast(alphabet.size()); i++) { 68 | for (int j = 0; j < static_cast(alphabet.size()); j++) { 69 | matrix[i][j] = (i == j); 70 | } 71 | } 72 | if (additionalEqualities != NULL) { 73 | for (int i = 0; i < additionalEqualitiesLength; i++) { 74 | size_t firstTransformed = alphabet.find(additionalEqualities[i].first); 75 | size_t secondTransformed = alphabet.find(additionalEqualities[i].second); 76 | if (firstTransformed != string::npos && secondTransformed != string::npos) { 77 | matrix[firstTransformed][secondTransformed] = matrix[secondTransformed][firstTransformed] = true; 78 | } 79 | } 80 | } 81 | } 82 | 83 | /** 84 | * @param a Element from transformed sequence. 85 | * @param b Element from transformed sequence. 86 | * @return True if a and b are defined as equal, false otherwise. 
87 | */ 88 | bool areEqual(unsigned char a, unsigned char b) const { 89 | return matrix[a][b]; 90 | } 91 | }; 92 | 93 | static int myersCalcEditDistanceSemiGlobal(const Word* Peq, int W, int maxNumBlocks, 94 | int queryLength, 95 | const unsigned char* target, int targetLength, 96 | int k, EdlibAlignMode mode, 97 | int* bestScore_, int** positions_, int* numPositions_); 98 | 99 | static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks, 100 | int queryLength, 101 | const unsigned char* target, int targetLength, 102 | int k, int* bestScore_, 103 | int* position_, bool findAlignment, 104 | AlignmentData** alignData, int targetStopPosition); 105 | 106 | 107 | static int obtainAlignment( 108 | const unsigned char* query, const unsigned char* rQuery, int queryLength, 109 | const unsigned char* target, const unsigned char* rTarget, int targetLength, 110 | const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore, 111 | unsigned char** alignment, int* alignmentLength); 112 | 113 | static int obtainAlignmentHirschberg( 114 | const unsigned char* query, const unsigned char* rQuery, int queryLength, 115 | const unsigned char* target, const unsigned char* rTarget, int targetLength, 116 | const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore, 117 | unsigned char** alignment, int* alignmentLength); 118 | 119 | static int obtainAlignmentTraceback(int queryLength, int targetLength, 120 | int bestScore, const AlignmentData* alignData, 121 | unsigned char** alignment, int* alignmentLength); 122 | 123 | static string transformSequences(const char* queryOriginal, int queryLength, 124 | const char* targetOriginal, int targetLength, 125 | unsigned char** queryTransformed, 126 | unsigned char** targetTransformed); 127 | 128 | static inline int ceilDiv(int x, int y); 129 | 130 | static inline unsigned char* createReverseCopy(const unsigned char* seq, int length); 131 | 132 | static inline Word* buildPeq(const int alphabetLength, 133 | const unsigned char* query, 134 | const int queryLength, 135 | const EqualityDefinition& equalityDefinition); 136 | 137 | 138 | /** 139 | * Main edlib method. 140 | */ 141 | extern "C" EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength, 142 | const char* const targetOriginal, const int targetLength, 143 | const EdlibAlignConfig config) { 144 | EdlibAlignResult result; 145 | result.status = EDLIB_STATUS_OK; 146 | result.editDistance = -1; 147 | result.endLocations = result.startLocations = NULL; 148 | result.numLocations = 0; 149 | result.alignment = NULL; 150 | result.alignmentLength = 0; 151 | result.alphabetLength = 0; 152 | 153 | /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/ 154 | unsigned char* query, * target; 155 | string alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength, 156 | &query, &target); 157 | result.alphabetLength = static_cast(alphabet.size()); 158 | /*-------------------------------------------------------*/ 159 | 160 | // Handle special situation when at least one of the sequences has length 0. 
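// Illustrative expectations for the zero-length branch below (a sketch, not part of the
// upstream edlib sources): with an empty query, NW mode reports the full target length as
// the distance and the last target position as the end location, while SHW/HW report a
// distance of 0 with end location -1.
//
//   EdlibAlignResult r = edlibAlign("", 0, "ACGT", 4, edlibDefaultAlignConfig()); // NW mode
//   // r.editDistance == 4, r.numLocations == 1, r.endLocations[0] == 3
//   edlibFreeAlignResult(r);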
161 | if (queryLength == 0 || targetLength == 0) { 162 | if (config.mode == EDLIB_MODE_NW) { 163 | result.editDistance = std::max(queryLength, targetLength); 164 | result.endLocations = static_cast(malloc(sizeof(int) * 1)); 165 | result.endLocations[0] = targetLength - 1; 166 | result.numLocations = 1; 167 | } else if (config.mode == EDLIB_MODE_SHW || config.mode == EDLIB_MODE_HW) { 168 | result.editDistance = queryLength; 169 | result.endLocations = static_cast(malloc(sizeof(int) * 1)); 170 | result.endLocations[0] = -1; 171 | result.numLocations = 1; 172 | } else { 173 | result.status = EDLIB_STATUS_ERROR; 174 | } 175 | 176 | free(query); 177 | free(target); 178 | return result; 179 | } 180 | 181 | /*--------------------- INITIALIZATION ------------------*/ 182 | int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers 183 | int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks 184 | EqualityDefinition equalityDefinition(alphabet, config.additionalEqualities, config.additionalEqualitiesLength); 185 | Word* Peq = buildPeq(static_cast(alphabet.size()), query, queryLength, equalityDefinition); 186 | /*-------------------------------------------------------*/ 187 | 188 | /*------------------ MAIN CALCULATION -------------------*/ 189 | // TODO: Store alignment data only after k is determined? That could make things faster. 190 | int positionNW; // Used only when mode is NW. 191 | AlignmentData* alignData = NULL; 192 | bool dynamicK = false; 193 | int k = config.k; 194 | if (k < 0) { // If valid k is not given, auto-adjust k until solution is found. 195 | dynamicK = true; 196 | k = WORD_SIZE; // Gives better results than smaller k. 197 | } 198 | 199 | do { 200 | if (config.mode == EDLIB_MODE_HW || config.mode == EDLIB_MODE_SHW) { 201 | myersCalcEditDistanceSemiGlobal(Peq, W, maxNumBlocks, 202 | queryLength, target, targetLength, 203 | k, config.mode, &(result.editDistance), 204 | &(result.endLocations), &(result.numLocations)); 205 | } else { // mode == EDLIB_MODE_NW 206 | myersCalcEditDistanceNW(Peq, W, maxNumBlocks, 207 | queryLength, target, targetLength, 208 | k, &(result.editDistance), &positionNW, 209 | false, &alignData, -1); 210 | } 211 | k *= 2; 212 | } while(dynamicK && result.editDistance == -1); 213 | 214 | if (result.editDistance >= 0) { // If there is solution. 215 | // If NW mode, set end location explicitly. 216 | if (config.mode == EDLIB_MODE_NW) { 217 | result.endLocations = static_cast(malloc(sizeof(int) * 1)); 218 | result.endLocations[0] = targetLength - 1; 219 | result.numLocations = 1; 220 | } 221 | 222 | // Find starting locations. 223 | if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) { 224 | result.startLocations = static_cast(malloc(result.numLocations * sizeof(int))); 225 | if (config.mode == EDLIB_MODE_HW) { // If HW, I need to calculate start locations. 226 | const unsigned char* rTarget = createReverseCopy(target, targetLength); 227 | const unsigned char* rQuery = createReverseCopy(query, queryLength); 228 | // Peq for reversed query. 
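// Start locations are recovered with a second, reversed search: for each end location e,
// the reversed query is aligned in SHW mode against the reverse of target[0..e]; if that
// search ends at position p within the reversed prefix, the original alignment starts at
// e - p (see the loop below).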
229 | Word* rPeq = buildPeq(static_cast(alphabet.size()), rQuery, queryLength, equalityDefinition); 230 | for (int i = 0; i < result.numLocations; i++) { 231 | int endLocation = result.endLocations[i]; 232 | if (endLocation == -1) { 233 | // NOTE: Sometimes one of optimal solutions is that query starts before target, like this: 234 | // AAGG <- target 235 | // CCTT <- query 236 | // It will never be only optimal solution and it does not happen often, however it is 237 | // possible and in that case end location will be -1. What should we do with that? 238 | // Should we just skip reporting such end location, although it is a solution? 239 | // If we do report it, what is the start location? -4? -1? Nothing? 240 | // TODO: Figure this out. This has to do in general with how we think about start 241 | // and end locations. 242 | // Also, we have alignment later relying on this locations to limit the space of it's 243 | // search -> how can it do it right if these locations are negative or incorrect? 244 | result.startLocations[i] = 0; // I put 0 for now, but it does not make much sense. 245 | } else { 246 | int bestScoreSHW, numPositionsSHW; 247 | int* positionsSHW; 248 | myersCalcEditDistanceSemiGlobal( 249 | rPeq, W, maxNumBlocks, 250 | queryLength, rTarget + targetLength - endLocation - 1, endLocation + 1, 251 | result.editDistance, EDLIB_MODE_SHW, 252 | &bestScoreSHW, &positionsSHW, &numPositionsSHW); 253 | // Taking last location as start ensures that alignment will not start with insertions 254 | // if it can start with mismatches instead. 255 | result.startLocations[i] = endLocation - positionsSHW[numPositionsSHW - 1]; 256 | free(positionsSHW); 257 | } 258 | } 259 | delete[] rTarget; 260 | delete[] rQuery; 261 | delete[] rPeq; 262 | } else { // If mode is SHW or NW 263 | for (int i = 0; i < result.numLocations; i++) { 264 | result.startLocations[i] = 0; 265 | } 266 | } 267 | } 268 | 269 | // Find alignment -> all comes down to finding alignment for NW. 270 | // Currently we return alignment only for first pair of locations. 271 | if (config.task == EDLIB_TASK_PATH) { 272 | int alnStartLocation = result.startLocations[0]; 273 | int alnEndLocation = result.endLocations[0]; 274 | const unsigned char* alnTarget = target + alnStartLocation; 275 | const int alnTargetLength = alnEndLocation - alnStartLocation + 1; 276 | const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength); 277 | const unsigned char* rQuery = createReverseCopy(query, queryLength); 278 | obtainAlignment(query, rQuery, queryLength, 279 | alnTarget, rAlnTarget, alnTargetLength, 280 | equalityDefinition, static_cast(alphabet.size()), result.editDistance, 281 | &(result.alignment), &(result.alignmentLength)); 282 | delete[] rAlnTarget; 283 | delete[] rQuery; 284 | } 285 | } 286 | /*-------------------------------------------------------*/ 287 | 288 | //--- Free memory ---// 289 | delete[] Peq; 290 | free(query); 291 | free(target); 292 | if (alignData) delete alignData; 293 | //-------------------// 294 | 295 | return result; 296 | } 297 | 298 | extern "C" char* edlibAlignmentToCigar(const unsigned char* const alignment, const int alignmentLength, 299 | const EdlibCigarFormat cigarFormat) { 300 | if (cigarFormat != EDLIB_CIGAR_EXTENDED && cigarFormat != EDLIB_CIGAR_STANDARD) { 301 | return 0; 302 | } 303 | 304 | // Maps move code from alignment to char in cigar. 
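// For context, an illustrative end-to-end use of edlibAlign() with EDLIB_TASK_PATH followed
// by this function (a sketch, not part of the upstream sources; query/queryLen/target/targetLen
// are placeholder variables and error handling is omitted):
//
//   EdlibAlignResult res = edlibAlign(query, queryLen, target, targetLen,
//                                     edlibNewAlignConfig(-1, EDLIB_MODE_HW,
//                                                         EDLIB_TASK_PATH, NULL, 0));
//   if (res.status == EDLIB_STATUS_OK && res.alignment) {
//       char* cigar = edlibAlignmentToCigar(res.alignment, res.alignmentLength,
//                                           EDLIB_CIGAR_EXTENDED);
//       // ... use cigar ...
//       free(cigar);
//   }
//   edlibFreeAlignResult(res);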
305 | // 0 1 2 3 306 | char moveCodeToChar[] = {'=', 'I', 'D', 'X'}; 307 | if (cigarFormat == EDLIB_CIGAR_STANDARD) { 308 | moveCodeToChar[0] = moveCodeToChar[3] = 'M'; 309 | } 310 | 311 | vector* cigar = new vector(); 312 | char lastMove = 0; // Char of last move. 0 if there was no previous move. 313 | int numOfSameMoves = 0; 314 | for (int i = 0; i <= alignmentLength; i++) { 315 | // if new sequence of same moves started 316 | if (i == alignmentLength || (moveCodeToChar[alignment[i]] != lastMove && lastMove != 0)) { 317 | // Write number of moves to cigar string. 318 | int numDigits = 0; 319 | for (; numOfSameMoves; numOfSameMoves /= 10) { 320 | cigar->push_back('0' + numOfSameMoves % 10); 321 | numDigits++; 322 | } 323 | reverse(cigar->end() - numDigits, cigar->end()); 324 | // Write code of move to cigar string. 325 | cigar->push_back(lastMove); 326 | // If not at the end, start new sequence of moves. 327 | if (i < alignmentLength) { 328 | // Check if alignment has valid values. 329 | if (alignment[i] > 3) { 330 | delete cigar; 331 | return 0; 332 | } 333 | numOfSameMoves = 0; 334 | } 335 | } 336 | if (i < alignmentLength) { 337 | lastMove = moveCodeToChar[alignment[i]]; 338 | numOfSameMoves++; 339 | } 340 | } 341 | cigar->push_back(0); // Null character termination. 342 | char* cigar_ = static_cast(malloc(cigar->size() * sizeof(char))); 343 | memcpy(cigar_, &(*cigar)[0], cigar->size() * sizeof(char)); 344 | delete cigar; 345 | 346 | return cigar_; 347 | } 348 | 349 | /** 350 | * Build Peq table for given query and alphabet. 351 | * Peq is table of dimensions alphabetLength+1 x maxNumBlocks. 352 | * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0. 353 | * NOTICE: free returned array with delete[]! 354 | */ 355 | static inline Word* buildPeq(const int alphabetLength, 356 | const unsigned char* const query, 357 | const int queryLength, 358 | const EqualityDefinition& equalityDefinition) { 359 | int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 360 | // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard. 361 | Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks]; 362 | 363 | // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s 364 | for (unsigned char symbol = 0; symbol <= alphabetLength; symbol++) { 365 | for (int b = 0; b < maxNumBlocks; b++) { 366 | if (symbol < alphabetLength) { 367 | Peq[symbol * maxNumBlocks + b] = 0; 368 | for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) { 369 | Peq[symbol * maxNumBlocks + b] <<= 1; 370 | // NOTE: We pretend like query is padded at the end with W wildcard symbols 371 | if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol)) 372 | Peq[symbol * maxNumBlocks + b] += 1; 373 | } 374 | } else { // Last symbol is wildcard, so it is all 1s 375 | Peq[symbol * maxNumBlocks + b] = static_cast(-1); 376 | } 377 | } 378 | } 379 | 380 | return Peq; 381 | } 382 | 383 | 384 | /** 385 | * Returns new sequence that is reverse of given sequence. 386 | * Free returned array with delete[]. 387 | */ 388 | static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) { 389 | unsigned char* rSeq = new unsigned char[length]; 390 | for (int i = 0; i < length; i++) { 391 | rSeq[i] = seq[length - i - 1]; 392 | } 393 | return rSeq; 394 | } 395 | 396 | /** 397 | * Corresponds to Advance_Block function from Myers. 
398 | * Calculates one word(block), which is part of a column. 399 | * Highest bit of word (one most to the left) is most bottom cell of block from column. 400 | * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1]. 401 | * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0. 402 | * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0. 403 | * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch. 404 | * @param [in] hin Will be +1, 0 or -1. 405 | * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0. 406 | * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0. 407 | * @param [out] hout Will be +1, 0 or -1. 408 | */ 409 | static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin, 410 | Word &PvOut, Word &MvOut) { 411 | // hin can be 1, -1 or 0. 412 | // 1 -> 00...01 413 | // 0 -> 00...00 414 | // -1 -> 11...11 (2-complement) 415 | 416 | Word hinIsNeg = static_cast(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1 417 | 418 | Word Xv = Eq | Mv; 419 | // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1; 420 | Eq |= hinIsNeg; 421 | Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; 422 | 423 | Word Ph = Mv | ~(Xh | Pv); 424 | Word Mh = Pv & Xh; 425 | 426 | int hout = 0; 427 | // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1; 428 | hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1); 429 | // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1; 430 | hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1); 431 | 432 | Ph <<= 1; 433 | Mh <<= 1; 434 | 435 | // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1; 436 | Mh |= hinIsNeg; 437 | // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1; 438 | Ph |= static_cast((hin + 1) >> 1); 439 | 440 | PvOut = Mh | ~(Xv | Ph); 441 | MvOut = Ph & Xv; 442 | 443 | return hout; 444 | } 445 | 446 | /** 447 | * Does ceiling division x / y. 448 | * Note: x and y must be non-negative and x + y must not overflow. 449 | */ 450 | static inline int ceilDiv(const int x, const int y) { 451 | return x % y ? x / y + 1 : x / y; 452 | } 453 | 454 | static inline int min(const int x, const int y) { 455 | return x < y ? x : y; 456 | } 457 | 458 | static inline int max(const int x, const int y) { 459 | return x > y ? x : y; 460 | } 461 | 462 | 463 | /** 464 | * @param [in] block 465 | * @return Values of cells in block, starting with bottom cell in block. 466 | */ 467 | static inline vector getBlockCellValues(const Block block) { 468 | vector scores(WORD_SIZE); 469 | int score = block.score; 470 | Word mask = HIGH_BIT_MASK; 471 | for (int i = 0; i < WORD_SIZE - 1; i++) { 472 | scores[i] = score; 473 | if (block.P & mask) score--; 474 | if (block.M & mask) score++; 475 | mask >>= 1; 476 | } 477 | scores[WORD_SIZE - 1] = score; 478 | return scores; 479 | } 480 | 481 | /** 482 | * Writes values of cells in block into given array, starting with first/top cell. 483 | * @param [in] block 484 | * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. 
485 | */ 486 | static inline void readBlock(const Block block, int* const dest) { 487 | int score = block.score; 488 | Word mask = HIGH_BIT_MASK; 489 | for (int i = 0; i < WORD_SIZE - 1; i++) { 490 | dest[WORD_SIZE - 1 - i] = score; 491 | if (block.P & mask) score--; 492 | if (block.M & mask) score++; 493 | mask >>= 1; 494 | } 495 | dest[0] = score; 496 | } 497 | 498 | /** 499 | * Writes values of cells in block into given array, starting with last/bottom cell. 500 | * @param [in] block 501 | * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE. 502 | */ 503 | static inline void readBlockReverse(const Block block, int* const dest) { 504 | int score = block.score; 505 | Word mask = HIGH_BIT_MASK; 506 | for (int i = 0; i < WORD_SIZE - 1; i++) { 507 | dest[i] = score; 508 | if (block.P & mask) score--; 509 | if (block.M & mask) score++; 510 | mask >>= 1; 511 | } 512 | dest[WORD_SIZE - 1] = score; 513 | } 514 | 515 | /** 516 | * @param [in] block 517 | * @param [in] k 518 | * @return True if all cells in block have value larger than k, otherwise false. 519 | */ 520 | static inline bool allBlockCellsLarger(const Block block, const int k) { 521 | vector scores = getBlockCellValues(block); 522 | for (int i = 0; i < WORD_SIZE; i++) { 523 | if (scores[i] <= k) return false; 524 | } 525 | return true; 526 | } 527 | 528 | 529 | /** 530 | * Uses Myers' bit-vector algorithm to find edit distance for one of semi-global alignment methods. 531 | * @param [in] Peq Query profile. 532 | * @param [in] W Size of padding in last block. 533 | * TODO: Calculate this directly from query, instead of passing it. 534 | * @param [in] maxNumBlocks Number of blocks needed to cover the whole query. 535 | * TODO: Calculate this directly from query, instead of passing it. 536 | * @param [in] queryLength 537 | * @param [in] target 538 | * @param [in] targetLength 539 | * @param [in] k 540 | * @param [in] mode EDLIB_MODE_HW or EDLIB_MODE_SHW 541 | * @param [out] bestScore_ Edit distance. 542 | * @param [out] positions_ Array of 0-indexed positions in target at which best score was found. 543 | Make sure to free this array with free(). 544 | * @param [out] numPositions_ Number of positions in the positions_ array. 545 | * @return Status. 546 | */ 547 | static int myersCalcEditDistanceSemiGlobal( 548 | const Word* const Peq, const int W, const int maxNumBlocks, 549 | const int queryLength, 550 | const unsigned char* const target, const int targetLength, 551 | int k, const EdlibAlignMode mode, 552 | int* const bestScore_, int** const positions_, int* const numPositions_) { 553 | *positions_ = NULL; 554 | *numPositions_ = 0; 555 | 556 | // firstBlock is 0-based index of first block in Ukkonen band. 557 | // lastBlock is 0-based index of last block in Ukkonen band. 558 | int firstBlock = 0; 559 | int lastBlock = min(ceilDiv(k + 1, WORD_SIZE), maxNumBlocks) - 1; // y in Myers 560 | Block *bl; // Current block 561 | 562 | Block* blocks = new Block[maxNumBlocks]; 563 | 564 | // For HW, solution will never be larger then queryLength. 565 | if (mode == EDLIB_MODE_HW) { 566 | k = min(queryLength, k); 567 | } 568 | 569 | // Each STRONG_REDUCE_NUM column is reduced in more expensive way. 570 | // This gives speed up of about 2 times for small k. 
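// The "strong" reduction inspects every cell value of the boundary blocks (via
// getBlockCellValues/allBlockCellsLarger) instead of only the bottom-of-block score,
// so it can shrink the Ukkonen band further than the cheap per-column check.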
571 | const int STRONG_REDUCE_NUM = 2048; 572 | 573 | // Initialize P, M and score 574 | bl = blocks; 575 | for (int b = 0; b <= lastBlock; b++) { 576 | bl->score = (b + 1) * WORD_SIZE; 577 | bl->P = static_cast(-1); // All 1s 578 | bl->M = static_cast(0); 579 | bl++; 580 | } 581 | 582 | int bestScore = -1; 583 | vector positions; // TODO: Maybe put this on heap? 584 | const int startHout = mode == EDLIB_MODE_HW ? 0 : 1; // If 0 then gap before query is not penalized; 585 | const unsigned char* targetChar = target; 586 | for (int c = 0; c < targetLength; c++) { // for each column 587 | const Word* Peq_c = Peq + (*targetChar) * maxNumBlocks; 588 | 589 | //----------------------- Calculate column -------------------------// 590 | int hout = startHout; 591 | bl = blocks + firstBlock; 592 | Peq_c += firstBlock; 593 | for (int b = firstBlock; b <= lastBlock; b++) { 594 | hout = calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M); 595 | bl->score += hout; 596 | bl++; Peq_c++; 597 | } 598 | bl--; Peq_c--; 599 | //------------------------------------------------------------------// 600 | 601 | //---------- Adjust number of blocks according to Ukkonen ----------// 602 | if ((lastBlock < maxNumBlocks - 1) && (bl->score - hout <= k) // bl is pointing to last block 603 | && ((*(Peq_c + 1) & WORD_1) || hout < 0)) { // Peq_c is pointing to last block 604 | // If score of left block is not too big, calculate one more block 605 | lastBlock++; bl++; Peq_c++; 606 | bl->P = static_cast(-1); // All 1s 607 | bl->M = static_cast(0); 608 | bl->score = (bl - 1)->score - hout + WORD_SIZE + calculateBlock(bl->P, bl->M, *Peq_c, hout, bl->P, bl->M); 609 | } else { 610 | while (lastBlock >= firstBlock && bl->score >= k + WORD_SIZE) { 611 | lastBlock--; bl--; Peq_c--; 612 | } 613 | } 614 | 615 | // Every some columns, do some expensive but also more efficient block reducing. 616 | // This is important! 617 | // 618 | // Reduce the band by decreasing last block if possible. 619 | if (c % STRONG_REDUCE_NUM == 0) { 620 | while (lastBlock >= 0 && lastBlock >= firstBlock && allBlockCellsLarger(*bl, k)) { 621 | lastBlock--; bl--; Peq_c--; 622 | } 623 | } 624 | // For HW, even if all cells are > k, there still may be solution in next 625 | // column because starting conditions at upper boundary are 0. 626 | // That means that first block is always candidate for solution, 627 | // and we can never end calculation before last column. 628 | if (mode == EDLIB_MODE_HW && lastBlock == -1) { 629 | lastBlock++; bl++; Peq_c++; 630 | } 631 | 632 | // Reduce band by increasing first block if possible. Not applicable to HW. 
633 | if (mode != EDLIB_MODE_HW) { 634 | while (firstBlock <= lastBlock && blocks[firstBlock].score >= k + WORD_SIZE) { 635 | firstBlock++; 636 | } 637 | if (c % STRONG_REDUCE_NUM == 0) { // Do strong reduction every some blocks 638 | while (firstBlock <= lastBlock && allBlockCellsLarger(blocks[firstBlock], k)) { 639 | firstBlock++; 640 | } 641 | } 642 | } 643 | 644 | // If band stops to exist finish 645 | if (lastBlock < firstBlock) { 646 | *bestScore_ = bestScore; 647 | if (bestScore != -1) { 648 | *positions_ = static_cast(malloc(sizeof(int) * static_cast(positions.size()))); 649 | *numPositions_ = static_cast(positions.size()); 650 | copy(positions.begin(), positions.end(), *positions_); 651 | } 652 | delete[] blocks; 653 | return EDLIB_STATUS_OK; 654 | } 655 | //------------------------------------------------------------------// 656 | 657 | //------------------------- Update best score ----------------------// 658 | if (lastBlock == maxNumBlocks - 1) { 659 | int colScore = bl->score; 660 | if (colScore <= k) { // Scores > k dont have correct values (so we cannot use them), but are certainly > k. 661 | // NOTE: Score that I find in column c is actually score from column c-W 662 | if (bestScore == -1 || colScore <= bestScore) { 663 | if (colScore != bestScore) { 664 | positions.clear(); 665 | bestScore = colScore; 666 | // Change k so we will look only for equal or better 667 | // scores then the best found so far. 668 | k = bestScore; 669 | } 670 | positions.push_back(c - W); 671 | } 672 | } 673 | } 674 | //------------------------------------------------------------------// 675 | 676 | targetChar++; 677 | } 678 | 679 | 680 | // Obtain results for last W columns from last column. 681 | if (lastBlock == maxNumBlocks - 1) { 682 | vector blockScores = getBlockCellValues(*bl); 683 | for (int i = 0; i < W; i++) { 684 | int colScore = blockScores[i + 1]; 685 | if (colScore <= k && (bestScore == -1 || colScore <= bestScore)) { 686 | if (colScore != bestScore) { 687 | positions.clear(); 688 | k = bestScore = colScore; 689 | } 690 | positions.push_back(targetLength - W + i); 691 | } 692 | } 693 | } 694 | 695 | *bestScore_ = bestScore; 696 | if (bestScore != -1) { 697 | *positions_ = static_cast(malloc(sizeof(int) * static_cast(positions.size()))); 698 | *numPositions_ = static_cast(positions.size()); 699 | copy(positions.begin(), positions.end(), *positions_); 700 | } 701 | 702 | delete[] blocks; 703 | return EDLIB_STATUS_OK; 704 | } 705 | 706 | 707 | /** 708 | * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method. 709 | * @param [in] Peq Query profile. 710 | * @param [in] W Size of padding in last block. 711 | * TODO: Calculate this directly from query, instead of passing it. 712 | * @param [in] maxNumBlocks Number of blocks needed to cover the whole query. 713 | * TODO: Calculate this directly from query, instead of passing it. 714 | * @param [in] queryLength 715 | * @param [in] target 716 | * @param [in] targetLength 717 | * @param [in] k 718 | * @param [out] bestScore_ Edit distance. 719 | * @param [out] position_ 0-indexed position in target at which best score was found. 720 | * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned. 721 | * Quadratic amount of memory is consumed. 722 | * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment). 723 | * Set only if findAlignment is set to true, otherwise it is NULL. 724 | * Make sure to free this array using delete[]. 
725 | * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected. 726 | * If set to p, calculation is performed up to position p in target (inclusive) 727 | * and column p is returned as the only column in alignData. 728 | * @return Status. 729 | */ 730 | static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks, 731 | const int queryLength, 732 | const unsigned char* const target, const int targetLength, 733 | int k, int* const bestScore_, 734 | int* const position_, const bool findAlignment, 735 | AlignmentData** const alignData, const int targetStopPosition) { 736 | if (targetStopPosition > -1 && findAlignment) { 737 | // They can not be both set at the same time! 738 | return EDLIB_STATUS_ERROR; 739 | } 740 | 741 | // Each STRONG_REDUCE_NUM column is reduced in more expensive way. 742 | const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation 743 | 744 | if (k < abs(targetLength - queryLength)) { 745 | *bestScore_ = *position_ = -1; 746 | return EDLIB_STATUS_OK; 747 | } 748 | 749 | k = min(k, max(queryLength, targetLength)); // Upper bound for k 750 | 751 | // firstBlock is 0-based index of first block in Ukkonen band. 752 | // lastBlock is 0-based index of last block in Ukkonen band. 753 | int firstBlock = 0; 754 | // This is optimal now, by my formula. 755 | int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1; 756 | Block* bl; // Current block 757 | 758 | Block* blocks = new Block[maxNumBlocks]; 759 | 760 | // Initialize P, M and score 761 | bl = blocks; 762 | for (int b = 0; b <= lastBlock; b++) { 763 | bl->score = (b + 1) * WORD_SIZE; 764 | bl->P = static_cast(-1); // All 1s 765 | bl->M = static_cast(0); 766 | bl++; 767 | } 768 | 769 | // If we want to find alignment, we have to store needed data. 770 | if (findAlignment) 771 | *alignData = new AlignmentData(maxNumBlocks, targetLength); 772 | else if (targetStopPosition > -1) 773 | *alignData = new AlignmentData(maxNumBlocks, 1); 774 | else 775 | *alignData = NULL; 776 | 777 | const unsigned char* targetChar = target; 778 | for (int c = 0; c < targetLength; c++) { // for each column 779 | const Word* Peq_c = Peq + *targetChar * maxNumBlocks; 780 | 781 | //----------------------- Calculate column -------------------------// 782 | int hout = 1; 783 | bl = blocks + firstBlock; 784 | for (int b = firstBlock; b <= lastBlock; b++) { 785 | hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M); 786 | bl->score += hout; 787 | bl++; 788 | } 789 | bl--; 790 | //------------------------------------------------------------------// 791 | // bl now points to last block 792 | 793 | // Update k. I do it only on end of column because it would slow calculation too much otherwise. 794 | // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up. 795 | k = min(k, bl->score 796 | + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1) 797 | + (lastBlock == maxNumBlocks - 1 ? W : 0)); 798 | 799 | //---------- Adjust number of blocks according to Ukkonen ----------// 800 | //--- Adjust last block ---// 801 | // If block is not beneath band, calculate next block. Only next because others are certainly beneath band. 
802 | if (lastBlock + 1 < maxNumBlocks 803 | && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also! 804 | ((lastBlock + 1) * WORD_SIZE - 1 805 | > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) { 806 | lastBlock++; bl++; 807 | bl->P = static_cast(-1); // All 1s 808 | bl->M = static_cast(0); 809 | int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M); 810 | bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout; 811 | hout = newHout; 812 | } 813 | 814 | // While block is out of band, move one block up. 815 | // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it. 816 | // I could consider adding that max part, for optimal performance. 817 | while (lastBlock >= firstBlock 818 | && (bl->score >= k + WORD_SIZE 819 | || ((lastBlock + 1) * WORD_SIZE - 1 > 820 | // TODO: Does not work if do not put +1! Why??? 821 | k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) { 822 | lastBlock--; bl--; 823 | } 824 | //-------------------------// 825 | 826 | //--- Adjust first block ---// 827 | // While outside of band, advance block 828 | while (firstBlock <= lastBlock 829 | && (blocks[firstBlock].score >= k + WORD_SIZE 830 | || ((firstBlock + 1) * WORD_SIZE - 1 < 831 | blocks[firstBlock].score - k - targetLength + queryLength + c))) { 832 | firstBlock++; 833 | } 834 | //--------------------------/ 835 | 836 | 837 | // TODO: consider if this part is useful, it does not seem to help much 838 | if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction 839 | while (lastBlock >= firstBlock) { 840 | // If all cells outside of band, remove block 841 | vector scores = getBlockCellValues(*bl); 842 | int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE; 843 | int r = lastBlock * WORD_SIZE + numCells - 1; 844 | bool reduce = true; 845 | for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { 846 | // TODO: Does not work if do not put +1! Why??? 847 | if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) { 848 | reduce = false; 849 | break; 850 | } 851 | r--; 852 | } 853 | if (!reduce) break; 854 | lastBlock--; bl--; 855 | } 856 | 857 | while (firstBlock <= lastBlock) { 858 | // If all cells outside of band, remove block 859 | vector scores = getBlockCellValues(blocks[firstBlock]); 860 | int numCells = firstBlock == maxNumBlocks - 1 ? 
WORD_SIZE - W : WORD_SIZE; 861 | int r = firstBlock * WORD_SIZE + numCells - 1; 862 | bool reduce = true; 863 | for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) { 864 | if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) { 865 | reduce = false; 866 | break; 867 | } 868 | r--; 869 | } 870 | if (!reduce) break; 871 | firstBlock++; 872 | } 873 | } 874 | 875 | 876 | // If band stops to exist finish 877 | if (lastBlock < firstBlock) { 878 | *bestScore_ = *position_ = -1; 879 | delete[] blocks; 880 | return EDLIB_STATUS_OK; 881 | } 882 | //------------------------------------------------------------------// 883 | 884 | 885 | //---- Save column so it can be used for reconstruction ----// 886 | if (findAlignment && c < targetLength) { 887 | bl = blocks + firstBlock; 888 | for (int b = firstBlock; b <= lastBlock; b++) { 889 | (*alignData)->Ps[maxNumBlocks * c + b] = bl->P; 890 | (*alignData)->Ms[maxNumBlocks * c + b] = bl->M; 891 | (*alignData)->scores[maxNumBlocks * c + b] = bl->score; 892 | (*alignData)->firstBlocks[c] = firstBlock; 893 | (*alignData)->lastBlocks[c] = lastBlock; 894 | bl++; 895 | } 896 | } 897 | //----------------------------------------------------------// 898 | //---- If this is stop column, save it and finish ----// 899 | if (c == targetStopPosition) { 900 | for (int b = firstBlock; b <= lastBlock; b++) { 901 | (*alignData)->Ps[b] = (blocks + b)->P; 902 | (*alignData)->Ms[b] = (blocks + b)->M; 903 | (*alignData)->scores[b] = (blocks + b)->score; 904 | (*alignData)->firstBlocks[0] = firstBlock; 905 | (*alignData)->lastBlocks[0] = lastBlock; 906 | } 907 | *bestScore_ = -1; 908 | *position_ = targetStopPosition; 909 | delete[] blocks; 910 | return EDLIB_STATUS_OK; 911 | } 912 | //----------------------------------------------------// 913 | 914 | targetChar++; 915 | } 916 | 917 | if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated 918 | // Obtain best score from block -> it is complicated because query is padded with W cells 919 | int bestScore = getBlockCellValues(blocks[lastBlock])[W]; 920 | if (bestScore <= k) { 921 | *bestScore_ = bestScore; 922 | *position_ = targetLength - 1; 923 | delete[] blocks; 924 | return EDLIB_STATUS_OK; 925 | } 926 | } 927 | 928 | *bestScore_ = *position_ = -1; 929 | delete[] blocks; 930 | return EDLIB_STATUS_OK; 931 | } 932 | 933 | 934 | /** 935 | * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix, 936 | * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength). 937 | * @param [in] queryLength Normal length, without W. 938 | * @param [in] targetLength Normal length, without W. 939 | * @param [in] bestScore Best score. 940 | * @param [in] alignData Data obtained during finding best score that is useful for finding alignment. 941 | * @param [out] alignment Alignment. 942 | * @param [out] alignmentLength Length of alignment. 943 | * @return Status code. 
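 * Note: edit operations are collected while walking backwards from the bottom-right cell,
 * and the resulting array is reversed before the function returns.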
944 | */ 945 | static int obtainAlignmentTraceback(const int queryLength, const int targetLength, 946 | const int bestScore, const AlignmentData* const alignData, 947 | unsigned char** const alignment, int* const alignmentLength) { 948 | const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 949 | const int W = maxNumBlocks * WORD_SIZE - queryLength; 950 | 951 | *alignment = static_cast(malloc((queryLength + targetLength - 1) * sizeof(unsigned char))); 952 | *alignmentLength = 0; 953 | int c = targetLength - 1; // index of column 954 | int b = maxNumBlocks - 1; // index of block in column 955 | int currScore = bestScore; // Score of current cell 956 | int lScore = -1; // Score of left cell 957 | int uScore = -1; // Score of upper cell 958 | int ulScore = -1; // Score of upper left cell 959 | Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block 960 | Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block 961 | // True if block to left exists and is in band 962 | bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]; 963 | // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the 964 | // calculation as both lP and lM should be initialized at some moment later (but compiler can not 965 | // detect it since this initialization is guaranteed by "business" logic). 966 | Word lP = 0, lM = 0; 967 | if (thereIsLeftBlock) { 968 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left 969 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left 970 | } 971 | currP <<= W; 972 | currM <<= W; 973 | int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos 974 | 975 | // TODO(martin): refactor this whole piece of code. There are too many if-else statements, 976 | // it is too easy for a bug to hide and to hard to effectively cover all the edge-cases. 977 | // We need better separation of logic and responsibilities. 978 | while (true) { 979 | if (c == 0) { 980 | thereIsLeftBlock = true; 981 | lScore = b * WORD_SIZE + blockPos + 1; 982 | ulScore = lScore - 1; 983 | } 984 | 985 | // TODO: improvement: calculate only those cells that are needed, 986 | // for example if I calculate upper cell and can move up, 987 | // there is no need to calculate left and upper left cell 988 | //---------- Calculate scores ---------// 989 | if (lScore == -1 && thereIsLeftBlock) { 990 | lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left 991 | for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) { 992 | if (lP & HIGH_BIT_MASK) lScore--; 993 | if (lM & HIGH_BIT_MASK) lScore++; 994 | lP <<= 1; 995 | lM <<= 1; 996 | } 997 | } 998 | if (ulScore == -1) { 999 | if (lScore != -1) { 1000 | ulScore = lScore; 1001 | if (lP & HIGH_BIT_MASK) ulScore--; 1002 | if (lM & HIGH_BIT_MASK) ulScore++; 1003 | } 1004 | else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) { 1005 | // This is the case when upper left cell is last cell in block, 1006 | // and block to left is not in band so lScore is -1. 1007 | ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1]; 1008 | } 1009 | } 1010 | if (uScore == -1) { 1011 | uScore = currScore; 1012 | if (currP & HIGH_BIT_MASK) uScore--; 1013 | if (currM & HIGH_BIT_MASK) uScore++; 1014 | currP <<= 1; 1015 | currM <<= 1; 1016 | } 1017 | //-------------------------------------// 1018 | 1019 | // TODO: should I check if there is upper block? 
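// Move preference in the traceback below: an upward move (EDLIB_EDOP_INSERT) is tried first,
// then a move to the left (EDLIB_EDOP_DELETE), and finally the diagonal move, which is taken
// as a match when the upper-left score equals the current score and as a mismatch when it is
// exactly one smaller.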
1020 | 1021 | //-------------- Move --------------// 1022 | // Move up - insertion to target - deletion from query 1023 | if (uScore != -1 && uScore + 1 == currScore) { 1024 | currScore = uScore; 1025 | lScore = ulScore; 1026 | uScore = ulScore = -1; 1027 | if (blockPos == 0) { // If entering new (upper) block 1028 | if (b == 0) { // If there are no cells above (only boundary cells) 1029 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up 1030 | for (int i = 0; i < c + 1; i++) // Move left until end 1031 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; 1032 | break; 1033 | } else { 1034 | blockPos = WORD_SIZE - 1; 1035 | b--; 1036 | currP = alignData->Ps[c * maxNumBlocks + b]; 1037 | currM = alignData->Ms[c * maxNumBlocks + b]; 1038 | if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { 1039 | thereIsLeftBlock = true; 1040 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations 1041 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; 1042 | } else { 1043 | thereIsLeftBlock = false; 1044 | // TODO(martin): There may not be left block, but there can be left boundary - do we 1045 | // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this. 1046 | } 1047 | } 1048 | } else { 1049 | blockPos--; 1050 | lP <<= 1; 1051 | lM <<= 1; 1052 | } 1053 | // Mark move 1054 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; 1055 | } 1056 | // Move left - deletion from target - insertion to query 1057 | else if (lScore != -1 && lScore + 1 == currScore) { 1058 | currScore = lScore; 1059 | uScore = ulScore; 1060 | lScore = ulScore = -1; 1061 | c--; 1062 | if (c == -1) { // If there are no cells to the left (only boundary cells) 1063 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left 1064 | int numUp = b * WORD_SIZE + blockPos + 1; 1065 | for (int i = 0; i < numUp; i++) // Move up until end 1066 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; 1067 | break; 1068 | } 1069 | currP = lP; 1070 | currM = lM; 1071 | if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { 1072 | thereIsLeftBlock = true; 1073 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; 1074 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; 1075 | } else { 1076 | if (c == 0) { // If there are no cells to the left (only boundary cells) 1077 | thereIsLeftBlock = true; 1078 | lScore = b * WORD_SIZE + blockPos + 1; 1079 | ulScore = lScore - 1; 1080 | } else { 1081 | thereIsLeftBlock = false; 1082 | } 1083 | } 1084 | // Mark move 1085 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; 1086 | } 1087 | // Move up left - (mis)match 1088 | else if (ulScore != -1) { 1089 | unsigned char moveCode = ulScore == currScore ? 
EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH; 1090 | currScore = ulScore; 1091 | uScore = lScore = ulScore = -1; 1092 | c--; 1093 | if (c == -1) { // If there are no cells to the left (only boundary cells) 1094 | (*alignment)[(*alignmentLength)++] = moveCode; // Move left 1095 | int numUp = b * WORD_SIZE + blockPos; 1096 | for (int i = 0; i < numUp; i++) // Move up until end 1097 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; 1098 | break; 1099 | } 1100 | if (blockPos == 0) { // If entering upper left block 1101 | if (b == 0) { // If there are no more cells above (only boundary cells) 1102 | (*alignment)[(*alignmentLength)++] = moveCode; // Move up left 1103 | for (int i = 0; i < c + 1; i++) // Move left until end 1104 | (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; 1105 | break; 1106 | } 1107 | blockPos = WORD_SIZE - 1; 1108 | b--; 1109 | currP = alignData->Ps[c * maxNumBlocks + b]; 1110 | currM = alignData->Ms[c * maxNumBlocks + b]; 1111 | } else { // If entering left block 1112 | blockPos--; 1113 | currP = lP; 1114 | currM = lM; 1115 | currP <<= 1; 1116 | currM <<= 1; 1117 | } 1118 | // Set new left block 1119 | if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) { 1120 | thereIsLeftBlock = true; 1121 | lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; 1122 | lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; 1123 | } else { 1124 | if (c == 0) { // If there are no cells to the left (only boundary cells) 1125 | thereIsLeftBlock = true; 1126 | lScore = b * WORD_SIZE + blockPos + 1; 1127 | ulScore = lScore - 1; 1128 | } else { 1129 | thereIsLeftBlock = false; 1130 | } 1131 | } 1132 | // Mark move 1133 | (*alignment)[(*alignmentLength)++] = moveCode; 1134 | } else { 1135 | // Reached end - finished! 1136 | break; 1137 | } 1138 | //----------------------------------// 1139 | } 1140 | 1141 | *alignment = static_cast(realloc(*alignment, (*alignmentLength) * sizeof(unsigned char))); 1142 | reverse(*alignment, *alignment + (*alignmentLength)); 1143 | return EDLIB_STATUS_OK; 1144 | } 1145 | 1146 | 1147 | /** 1148 | * Finds one possible alignment that gives optimal score (bestScore). 1149 | * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough, 1150 | * it will solve them using traceback algorithm. 1151 | * @param [in] query 1152 | * @param [in] rQuery Reversed query. 1153 | * @param [in] queryLength 1154 | * @param [in] target 1155 | * @param [in] rTarget Reversed target. 1156 | * @param [in] targetLength 1157 | * @param [in] equalityDefinition 1158 | * @param [in] alphabetLength 1159 | * @param [in] bestScore Best(optimal) score. 1160 | * @param [out] alignment Sequence of edit operations that make target equal to query. 1161 | * @param [out] alignmentLength Length of alignment. 1162 | * @return Status code. 1163 | */ 1164 | static int obtainAlignment( 1165 | const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, 1166 | const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, 1167 | const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore, 1168 | unsigned char** const alignment, int* const alignmentLength) { 1169 | 1170 | // Handle special case when one of sequences has length of 0. 
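// An empty query turns the whole target into EDLIB_EDOP_DELETE operations; an empty target
// turns the whole query into EDLIB_EDOP_INSERT operations.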
1171 | if (queryLength == 0 || targetLength == 0) { 1172 | *alignmentLength = targetLength + queryLength; 1173 | *alignment = static_cast(malloc((*alignmentLength) * sizeof(unsigned char))); 1174 | for (int i = 0; i < *alignmentLength; i++) { 1175 | (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT; 1176 | } 1177 | return EDLIB_STATUS_OK; 1178 | } 1179 | 1180 | const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 1181 | const int W = maxNumBlocks * WORD_SIZE - queryLength; 1182 | int statusCode; 1183 | 1184 | // TODO: think about reducing number of memory allocations in alignment functions, probably 1185 | // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg, 1186 | // and it could also be done for alignments - we could have one big array for alignment that would be 1187 | // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results. 1188 | 1189 | // If estimated memory consumption for traceback algorithm is smaller than 1MB use it, 1190 | // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal. 1191 | long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength 1192 | + 2ll * sizeof(int) * targetLength; 1193 | if (alignmentDataSize < 1024 * 1024) { 1194 | int score_, endLocation_; // Used only to call function. 1195 | AlignmentData* alignData = NULL; 1196 | Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); 1197 | myersCalcEditDistanceNW(Peq, W, maxNumBlocks, 1198 | queryLength, 1199 | target, targetLength, 1200 | bestScore, 1201 | &score_, &endLocation_, true, &alignData, -1); 1202 | //assert(score_ == bestScore); 1203 | //assert(endLocation_ == targetLength - 1); 1204 | 1205 | statusCode = obtainAlignmentTraceback(queryLength, targetLength, 1206 | bestScore, alignData, alignment, alignmentLength); 1207 | delete alignData; 1208 | delete[] Peq; 1209 | } else { 1210 | statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength, 1211 | target, rTarget, targetLength, 1212 | equalityDefinition, alphabetLength, bestScore, 1213 | alignment, alignmentLength); 1214 | } 1215 | return statusCode; 1216 | } 1217 | 1218 | 1219 | /** 1220 | * Finds one possible alignment that gives optimal score (bestScore). 1221 | * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together. 1222 | * @param [in] query 1223 | * @param [in] rQuery Reversed query. 1224 | * @param [in] queryLength 1225 | * @param [in] target 1226 | * @param [in] rTarget Reversed target. 1227 | * @param [in] targetLength 1228 | * @param [in] alphabetLength 1229 | * @param [in] bestScore Best(optimal) score. 1230 | * @param [out] alignment Sequence of edit operations that make target equal to query. 1231 | * @param [out] alignmentLength Length of alignment. 1232 | * @return Status code. 
1233 | */ 1234 | static int obtainAlignmentHirschberg( 1235 | const unsigned char* const query, const unsigned char* const rQuery, const int queryLength, 1236 | const unsigned char* const target, const unsigned char* const rTarget, const int targetLength, 1237 | const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore, 1238 | unsigned char** const alignment, int* const alignmentLength) { 1239 | 1240 | const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); 1241 | const int W = maxNumBlocks * WORD_SIZE - queryLength; 1242 | 1243 | Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition); 1244 | Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition); 1245 | 1246 | // Used only to call functions. 1247 | int score_, endLocation_; 1248 | 1249 | // Divide dynamic matrix into two halfs, left and right. 1250 | const int leftHalfWidth = targetLength / 2; 1251 | const int rightHalfWidth = targetLength - leftHalfWidth; 1252 | 1253 | // Calculate left half. 1254 | AlignmentData* alignDataLeftHalf = NULL; 1255 | int leftHalfCalcStatus = myersCalcEditDistanceNW( 1256 | Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore, 1257 | &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1); 1258 | 1259 | // Calculate right half. 1260 | AlignmentData* alignDataRightHalf = NULL; 1261 | int rightHalfCalcStatus = myersCalcEditDistanceNW( 1262 | rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore, 1263 | &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1); 1264 | 1265 | delete[] Peq; 1266 | delete[] rPeq; 1267 | 1268 | if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) { 1269 | if (alignDataLeftHalf) delete alignDataLeftHalf; 1270 | if (alignDataRightHalf) delete alignDataRightHalf; 1271 | return EDLIB_STATUS_ERROR; 1272 | } 1273 | 1274 | // Unwrap the left half. 1275 | int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0]; 1276 | int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0]; 1277 | // TODO: avoid this allocation by using some shared array? 1278 | // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index) 1279 | // and ending with scoresLeftEndIdx row (0-indexed). 1280 | int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE; 1281 | int* scoresLeft = new int[scoresLeftLength]; 1282 | for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) { 1283 | Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx], 1284 | alignDataLeftHalf->scores[blockIdx]); 1285 | readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE); 1286 | } 1287 | int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE; 1288 | // If last block contains padding, shorten the length of scores for the length of padding. 1289 | if (lastBlockIdxLeft == maxNumBlocks - 1) { 1290 | scoresLeftLength -= W; 1291 | } 1292 | 1293 | // Unwrap the right half (I also reverse it while unwraping). 
1294 | int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0]; 1295 | int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0]; 1296 | int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE; 1297 | int* scoresRight = new int[scoresRightLength]; 1298 | int* scoresRightOriginalStart = scoresRight; 1299 | for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) { 1300 | Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx], 1301 | alignDataRightHalf->scores[blockIdx]); 1302 | readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE); 1303 | } 1304 | int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE; 1305 | // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do), 1306 | // move pointer forward to remove the padding (that is why we remember originalStart). 1307 | if (scoresRightStartIdx < 0) { 1308 | //assert(scoresRightStartIdx == -1 * W); 1309 | scoresRight += W; 1310 | scoresRightStartIdx += W; 1311 | scoresRightLength -= W; 1312 | } 1313 | 1314 | delete alignDataLeftHalf; 1315 | delete alignDataRightHalf; 1316 | 1317 | //--------------------- Find the best move ----------------// 1318 | // Find the query/row index of cell in left column which together with its lower right neighbour 1319 | // from right column gives the best score (when summed). We also have to consider boundary cells 1320 | // (those cells at -1 indexes). 1321 | // x| 1322 | // -+- 1323 | // |x 1324 | int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1); 1325 | int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1, 1326 | scoresRightStartIdx + scoresRightLength - 2); 1327 | int leftScore = -1, rightScore = -1; 1328 | int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through. 1329 | bool queryIdxLeftAlignmentFound = false; 1330 | for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) { 1331 | leftScore = scoresLeft[queryIdx - scoresLeftStartIdx]; 1332 | rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx]; 1333 | if (leftScore + rightScore == bestScore) { 1334 | queryIdxLeftAlignment = queryIdx; 1335 | queryIdxLeftAlignmentFound = true; 1336 | break; 1337 | } 1338 | } 1339 | // Check boundary cells. 1340 | if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) { 1341 | leftScore = leftHalfWidth; 1342 | rightScore = scoresRight[0]; 1343 | if (leftScore + rightScore == bestScore) { 1344 | queryIdxLeftAlignment = -1; 1345 | queryIdxLeftAlignmentFound = true; 1346 | } 1347 | } 1348 | if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength 1349 | && scoresRightStartIdx + scoresRightLength == queryLength) { 1350 | leftScore = scoresLeft[scoresLeftLength - 1]; 1351 | rightScore = rightHalfWidth; 1352 | if (leftScore + rightScore == bestScore) { 1353 | queryIdxLeftAlignment = queryLength - 1; 1354 | queryIdxLeftAlignmentFound = true; 1355 | } 1356 | } 1357 | 1358 | delete[] scoresLeft; 1359 | delete[] scoresRightOriginalStart; 1360 | 1361 | if (queryIdxLeftAlignmentFound == false) { 1362 | // If there was no move that is part of optimal alignment, then there is no such alignment 1363 | // or given bestScore is not correct! 
1364 | return EDLIB_STATUS_ERROR; 1365 | } 1366 | //----------------------------------------------------------// 1367 | 1368 | // Calculate alignments for upper half of left half (upper left - ul) 1369 | // and lower half of right half (lower right - lr). 1370 | const int ulHeight = queryIdxLeftAlignment + 1; 1371 | const int lrHeight = queryLength - ulHeight; 1372 | const int ulWidth = leftHalfWidth; 1373 | const int lrWidth = rightHalfWidth; 1374 | unsigned char* ulAlignment = NULL; int ulAlignmentLength; 1375 | int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight, 1376 | target, rTarget + lrWidth, ulWidth, 1377 | equalityDefinition, alphabetLength, leftScore, 1378 | &ulAlignment, &ulAlignmentLength); 1379 | unsigned char* lrAlignment = NULL; int lrAlignmentLength; 1380 | int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight, 1381 | target + ulWidth, rTarget, lrWidth, 1382 | equalityDefinition, alphabetLength, rightScore, 1383 | &lrAlignment, &lrAlignmentLength); 1384 | if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) { 1385 | if (ulAlignment) free(ulAlignment); 1386 | if (lrAlignment) free(lrAlignment); 1387 | return EDLIB_STATUS_ERROR; 1388 | } 1389 | 1390 | // Build alignment by concatenating upper left alignment with lower right alignment. 1391 | *alignmentLength = ulAlignmentLength + lrAlignmentLength; 1392 | *alignment = static_cast(malloc((*alignmentLength) * sizeof(unsigned char))); 1393 | memcpy(*alignment, ulAlignment, ulAlignmentLength); 1394 | memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength); 1395 | 1396 | free(ulAlignment); 1397 | free(lrAlignment); 1398 | return EDLIB_STATUS_OK; 1399 | } 1400 | 1401 | 1402 | /** 1403 | * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences 1404 | * where elements in sequences are not any more letters of alphabet, but their index in alphabet. 1405 | * Most of internal edlib functions expect such transformed sequences. 1406 | * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done. 1407 | * Example: 1408 | * Original sequences: "ACT" and "CGT". 1409 | * Alphabet would be recognized as "ACTG". Alphabet length = 4. 1410 | * Transformed sequences: [0, 1, 2] and [1, 3, 2]. 1411 | * @param [in] queryOriginal 1412 | * @param [in] queryLength 1413 | * @param [in] targetOriginal 1414 | * @param [in] targetLength 1415 | * @param [out] queryTransformed It will contain values in range [0, alphabet length - 1]. 1416 | * @param [out] targetTransformed It will contain values in range [0, alphabet length - 1]. 1417 | * @return Alphabet as a string of unique characters, where index of each character is its value in transformed 1418 | * sequences. 1419 | */ 1420 | static string transformSequences(const char* const queryOriginal, const int queryLength, 1421 | const char* const targetOriginal, const int targetLength, 1422 | unsigned char** const queryTransformed, 1423 | unsigned char** const targetTransformed) { 1424 | // Alphabet is constructed from letters that are present in sequences. 1425 | // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1, 1426 | // and new query and target are created in which letters are replaced with their ordinal numbers. 1427 | // This query and target are used in all the calculations later. 
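// Note that the mapping is byte-exact: for example, 'A' and 'a' become two distinct alphabet
// symbols here and are only treated as equal if the caller lists them as an EdlibEqualityPair
// in the alignment config.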
1428 | *queryTransformed = static_cast<unsigned char*>(malloc(sizeof(unsigned char) * queryLength)); 1429 | *targetTransformed = static_cast<unsigned char*>(malloc(sizeof(unsigned char) * targetLength)); 1430 | 1431 | string alphabet = ""; 1432 | 1433 | // Alphabet information, it is constructed on fly while transforming sequences. 1434 | // letterIdx[c] is index of letter c in alphabet. 1435 | unsigned char letterIdx[MAX_UCHAR + 1]; 1436 | bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet 1437 | for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false; 1438 | 1439 | for (int i = 0; i < queryLength; i++) { 1440 | unsigned char c = static_cast<unsigned char>(queryOriginal[i]); 1441 | if (!inAlphabet[c]) { 1442 | inAlphabet[c] = true; 1443 | letterIdx[c] = static_cast<unsigned char>(alphabet.size()); 1444 | alphabet += queryOriginal[i]; 1445 | } 1446 | (*queryTransformed)[i] = letterIdx[c]; 1447 | } 1448 | for (int i = 0; i < targetLength; i++) { 1449 | unsigned char c = static_cast<unsigned char>(targetOriginal[i]); 1450 | if (!inAlphabet[c]) { 1451 | inAlphabet[c] = true; 1452 | letterIdx[c] = static_cast<unsigned char>(alphabet.size()); 1453 | alphabet += targetOriginal[i]; 1454 | } 1455 | (*targetTransformed)[i] = letterIdx[c]; 1456 | } 1457 | 1458 | return alphabet; 1459 | } 1460 | 1461 | 1462 | extern "C" EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task, 1463 | const EdlibEqualityPair* additionalEqualities, 1464 | int additionalEqualitiesLength) { 1465 | EdlibAlignConfig config; 1466 | config.k = k; 1467 | config.mode = mode; 1468 | config.task = task; 1469 | config.additionalEqualities = additionalEqualities; 1470 | config.additionalEqualitiesLength = additionalEqualitiesLength; 1471 | return config; 1472 | } 1473 | 1474 | extern "C" EdlibAlignConfig edlibDefaultAlignConfig(void) { 1475 | return edlibNewAlignConfig(-1, EDLIB_MODE_NW, EDLIB_TASK_DISTANCE, NULL, 0); 1476 | } 1477 | 1478 | extern "C" void edlibFreeAlignResult(EdlibAlignResult result) { 1479 | if (result.endLocations) free(result.endLocations); 1480 | if (result.startLocations) free(result.startLocations); 1481 | if (result.alignment) free(result.alignment); 1482 | } -------------------------------------------------------------------------------- /stringdecomposer/src/edlib.h: -------------------------------------------------------------------------------- 1 | #ifndef EDLIB_H 2 | #define EDLIB_H 3 | 4 | /** 5 | * @file 6 | * @author Martin Sosic 7 | * @brief Main header file, containing all public functions and structures. 8 | */ 9 | 10 | // Define EDLIB_API macro to properly export symbols 11 | #ifdef EDLIB_SHARED 12 | # ifdef _WIN32 13 | # ifdef EDLIB_BUILD 14 | # define EDLIB_API __declspec(dllexport) 15 | # else 16 | # define EDLIB_API __declspec(dllimport) 17 | # endif 18 | # else 19 | # define EDLIB_API __attribute__ ((visibility ("default"))) 20 | # endif 21 | #else 22 | # define EDLIB_API 23 | #endif 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | // Status codes 30 | #define EDLIB_STATUS_OK 0 31 | #define EDLIB_STATUS_ERROR 1 32 | 33 | /** 34 | * Alignment methods - how should Edlib treat gaps before and after query? 35 | */ 36 | typedef enum { 37 | /** 38 | * Global method. This is the standard method. 39 | * Useful when you want to find out how similar is first sequence to second sequence. 40 | */ 41 | EDLIB_MODE_NW, 42 | /** 43 | * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
44 | * What that means is that deleting elements from the end of second sequence is "free"! 45 | * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end 46 | * of second sequence is "free" and does not count into total edit distance. This method is appropriate 47 | * when you want to find out how well first sequence fits at the beginning of second sequence. 48 | */ 49 | EDLIB_MODE_SHW, 50 | /** 51 | * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are 52 | * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"! 53 | * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start 54 | * and GAC from the end of second sequence is "free" and does not count into total edit distance. 55 | * This method is appropriate when you want to find out how well first sequence fits at any part of 56 | * second sequence. 57 | * For example, if your second sequence was a long text and your first sequence was a sentence from that text, 58 | * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in 59 | * that text. In bioinformatics, this method is appropriate for aligning read to a sequence. 60 | */ 61 | EDLIB_MODE_HW 62 | } EdlibAlignMode; 63 | 64 | /** 65 | * Alignment tasks - what do you want Edlib to do? 66 | */ 67 | typedef enum { 68 | EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations. 69 | EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations. 70 | EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path. 71 | } EdlibAlignTask; 72 | 73 | /** 74 | * Describes cigar format. 75 | * @see http://samtools.github.io/hts-specs/SAMv1.pdf 76 | * @see http://drive5.com/usearch/manual/cigar.html 77 | */ 78 | typedef enum { 79 | EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'. 80 | EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'. 81 | } EdlibCigarFormat; 82 | 83 | // Edit operations. 84 | #define EDLIB_EDOP_MATCH 0 //!< Match. 85 | #define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query. 86 | #define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query. 87 | #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch. 88 | 89 | /** 90 | * @brief Defines two given characters as equal. 91 | */ 92 | typedef struct { 93 | char first; 94 | char second; 95 | } EdlibEqualityPair; 96 | 97 | /** 98 | * @brief Configuration object for edlibAlign() function. 99 | */ 100 | typedef struct { 101 | /** 102 | * Set k to non-negative value to tell edlib that edit distance is not larger than k. 103 | * Smaller k can significantly improve speed of computation. 104 | * If edit distance is larger than k, edlib will set edit distance to -1. 105 | * Set k to negative value and edlib will internally auto-adjust k until score is found. 106 | */ 107 | int k; 108 | 109 | /** 110 | * Alignment method. 111 | * EDLIB_MODE_NW: global (Needleman-Wunsch) 112 | * EDLIB_MODE_SHW: prefix. Gap after query is not penalized. 113 | * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized. 114 | */ 115 | EdlibAlignMode mode; 116 | 117 | /** 118 | * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is. 119 | * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target. 
120 | * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target. 121 | * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target). 122 | */ 123 | EdlibAlignTask task; 124 | 125 | /** 126 | * List of pairs of characters, where each pair defines two characters as equal. 127 | * This way you can extend edlib's definition of equality (which is that each character is equal only 128 | * to itself). 129 | * This can be useful if you have some wildcard characters that should match multiple other characters, 130 | * or e.g. if you want edlib to be case insensitive. 131 | * Can be set to NULL if there are none. 132 | */ 133 | const EdlibEqualityPair* additionalEqualities; 134 | 135 | /** 136 | * Number of additional equalities, which is non-negative number. 137 | * 0 if there are none. 138 | */ 139 | int additionalEqualitiesLength; 140 | } EdlibAlignConfig; 141 | 142 | /** 143 | * Helper method for easy construction of configuration object. 144 | * @return Configuration object filled with given parameters. 145 | */ 146 | EDLIB_API EdlibAlignConfig edlibNewAlignConfig( 147 | int k, EdlibAlignMode mode, EdlibAlignTask task, 148 | const EdlibEqualityPair* additionalEqualities, 149 | int additionalEqualitiesLength 150 | ); 151 | 152 | /** 153 | * @return Default configuration object, with following defaults: 154 | * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities. 155 | */ 156 | EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void); 157 | 158 | 159 | /** 160 | * Container for results of alignment done by edlibAlign() function. 161 | */ 162 | typedef struct { 163 | /** 164 | * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values. 165 | */ 166 | int status; 167 | 168 | /** 169 | * -1 if k is non-negative and edit distance is larger than k. 170 | */ 171 | int editDistance; 172 | 173 | /** 174 | * Array of zero-based positions in target where optimal alignment paths end. 175 | * If gap after query is penalized, gap counts as part of query (NW), otherwise not. 176 | * Set to NULL if edit distance is larger than k. 177 | * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 178 | */ 179 | int* endLocations; 180 | 181 | /** 182 | * Array of zero-based positions in target where optimal alignment paths start, 183 | * they correspond to endLocations. 184 | * If gap before query is penalized, gap counts as part of query (NW), otherwise not. 185 | * Set to NULL if not calculated or if edit distance is larger than k. 186 | * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 187 | */ 188 | int* startLocations; 189 | 190 | /** 191 | * Number of end (and start) locations. 192 | */ 193 | int numLocations; 194 | 195 | /** 196 | * Alignment is found for first pair of start and end locations. 197 | * Set to NULL if not calculated. 198 | * Alignment is sequence of numbers: 0, 1, 2, 3. 199 | * 0 stands for match. 200 | * 1 stands for insertion to target. 201 | * 2 stands for insertion to query. 202 | * 3 stands for mismatch. 203 | * Alignment aligns query to target from begining of query till end of query. 204 | * If gaps are not penalized, they are not in alignment. 205 | * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 206 | */ 207 | unsigned char* alignment; 208 | 209 | /** 210 | * Length of alignment. 
211 | */ 212 | int alignmentLength; 213 | 214 | /** 215 | * Number of different characters in query and target together. 216 | */ 217 | int alphabetLength; 218 | } EdlibAlignResult; 219 | 220 | /** 221 | * Frees memory in EdlibAlignResult that was allocated by edlib. 222 | * If you do not use it, make sure to free needed members manually using free(). 223 | */ 224 | EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result); 225 | 226 | 227 | /** 228 | * Aligns two sequences (query and target) using edit distance (levenshtein distance). 229 | * Through config parameter, this function supports different alignment methods (global, prefix, infix), 230 | * as well as different modes of search (tasks). 231 | * It always returns edit distance and end locations of optimal alignment in target. 232 | * It optionally returns start locations of optimal alignment in target and alignment path, 233 | * if you choose appropriate tasks. 234 | * @param [in] query First sequence. 235 | * @param [in] queryLength Number of characters in first sequence. 236 | * @param [in] target Second sequence. 237 | * @param [in] targetLength Number of characters in second sequence. 238 | * @param [in] config Additional alignment parameters, like alignment method and wanted results. 239 | * @return Result of alignment, which can contain edit distance, start and end locations and alignment path. 240 | * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members. 241 | */ 242 | EDLIB_API EdlibAlignResult edlibAlign( 243 | const char* query, int queryLength, 244 | const char* target, int targetLength, 245 | const EdlibAlignConfig config 246 | ); 247 | 248 | 249 | /** 250 | * Builds cigar string from given alignment sequence. 251 | * @param [in] alignment Alignment sequence. 252 | * 0 stands for match. 253 | * 1 stands for insertion to target. 254 | * 2 stands for insertion to query. 255 | * 3 stands for mismatch. 256 | * @param [in] alignmentLength 257 | * @param [in] cigarFormat Cigar will be returned in specified format. 258 | * @return Cigar string. 259 | * I stands for insertion. 260 | * D stands for deletion. 261 | * X stands for mismatch. (used only in extended format) 262 | * = stands for match. (used only in extended format) 263 | * M stands for (mis)match. (used only in standard format) 264 | * String is null terminated. 265 | * Needed memory is allocated and given pointer is set to it. 266 | * Do not forget to free it later using free()! 
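 *
 * A minimal usage sketch of the functions declared in this header (illustrative only, reusing the
 * "ACT" / "CGACTGAC" example from the mode descriptions above; error handling is shortened):
 *
 *     EdlibAlignResult result = edlibAlign("ACT", 3, "CGACTGAC", 8,
 *                                          edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH, NULL, 0));
 *     if (result.status == EDLIB_STATUS_OK) {
 *         char* cigar = edlibAlignmentToCigar(result.alignment, result.alignmentLength, EDLIB_CIGAR_EXTENDED);
 *         printf("edit distance: %d, cigar: %s\n", result.editDistance, cigar);
 *         free(cigar);
 *     }
 *     edlibFreeAlignResult(result);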
267 | */ 268 | EDLIB_API char* edlibAlignmentToCigar( 269 | const unsigned char* alignment, int alignmentLength, 270 | EdlibCigarFormat cigarFormat 271 | ); 272 | 273 | #ifdef __cplusplus 274 | } 275 | #endif 276 | 277 | #endif // EDLIB_H -------------------------------------------------------------------------------- /stringdecomposer/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <sstream> 4 | #include <string> 5 | #include <vector> 6 | #include <set> 7 | #include <map> 8 | #include <algorithm> 9 | #include <iterator> 10 | #include <stdexcept> 11 | 12 | #include "edlib.h" 13 | 14 | using namespace std; 15 | 16 | struct ReadId { 17 | string name; 18 | int id = -1; 19 | 20 | ReadId(string name_): name(name_) {} 21 | ReadId(string name_, int id_): name(name_), id(id_) {} 22 | }; 23 | 24 | struct Seq { 25 | ReadId read_id; 26 | string seq; 27 | 28 | Seq(string name_, string seq_): read_id(ReadId(name_)), seq(seq_) { 29 | transform(seq.begin(), seq.end(), seq.begin(), ::toupper); 30 | } 31 | 32 | Seq(string name_, string seq_, int id_): read_id(ReadId(name_, id_)), seq(seq_) {} 33 | 34 | size_t size() { return seq.size();} 35 | }; 36 | 37 | struct MonomerAlignment { 38 | string monomer_name; 39 | string read_name; 40 | int start_pos; 41 | int end_pos; 42 | float identity; 43 | bool best; 44 | 45 | MonomerAlignment() {} 46 | 47 | MonomerAlignment(string monomer_name_, string read_name_, int start_pos_, int end_pos_, float identity_, bool best_) 48 | : monomer_name(monomer_name_), read_name(read_name_), start_pos(start_pos_), end_pos(end_pos_), identity(identity_), best(best_) {} 49 | }; 50 | 51 | bool sortby1(const pair<size_t, vector<MonomerAlignment>> &a 52 | , const pair<size_t, vector<MonomerAlignment>> &b) { 53 | return (a.first < b.first); 54 | } 55 | 56 | class MonomersAligner { 57 | 58 | public: 59 | MonomersAligner(vector<Seq> &monomers, int ins = -1, int del = -1, int mismatch = -1, int match = 1) 60 | : monomers_(monomers), 61 | ins_(ins), 62 | del_(del), 63 | mismatch_(mismatch), 64 | match_(match) { 65 | } 66 | 67 | void AlignReadsSet(vector<Seq> &reads, int threads, int part_size, int ed_thr, int overlap = 500) { 68 | vector<Seq> new_reads; 69 | vector<int> save_steps; 70 | for (const auto & r: reads) { 71 | int cnt = 0; 72 | //cout << r.seq.size() << endl; 73 | for (size_t i = 0; i < r.seq.size(); i += part_size) { 74 | if ((int) r.seq.size() - i >= overlap || r.seq.size() < overlap) { 75 | Seq seq = Seq(r.read_id.name, r.seq.substr(i, min(part_size + overlap, static_cast<int>(r.seq.size() - i)) ), i ); 76 | new_reads.push_back(seq); 77 | ++ cnt; 78 | } 79 | } 80 | save_steps.push_back(cnt); 81 | } 82 | cerr << "Prepared reads\n"; 83 | 84 | size_t start = 0, p = 0; 85 | int step = threads*2; 86 | vector<pair<size_t, vector<MonomerAlignment>>> subbatches; 87 | for (size_t i = 0; i < new_reads.size(); i += step) { 88 | #pragma omp parallel for num_threads(threads) 89 | for (size_t j = i; j < min(i + step, new_reads.size()); ++ j) { 90 | std::vector<MonomerAlignment> aln; 91 | if (ed_thr > -1) { 92 | std::vector<Seq> filter_monomers = FilterMonomersForRead(new_reads[j], ed_thr); 93 | aln = AlignPartClassicDP(new_reads[j], filter_monomers); 94 | } else { 95 | aln = AlignPartClassicDP(new_reads[j], monomers_); 96 | } 97 | 98 | #pragma omp critical(aligner) 99 | { 100 | subbatches.push_back(pair<size_t, vector<MonomerAlignment>> (j, aln)); 101 | } 102 | } 103 | sort(subbatches.begin() + i, subbatches.begin() + min(i + step, new_reads.size()), sortby1); 104 | while (p < save_steps.size() && start + save_steps[p] <= subbatches.size()) { 105 | vector<MonomerAlignment> batch; 106 | for (size_t j = start; j < start + save_steps[p]; ++ j) { 107 | int read_index = subbatches[j].first;
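                        // Alignments were computed on overlapping chunks of a read; read_id.id stores the chunk's offset in the original read, so chunk-local coordinates are shifted back to read coordinates below.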
108 | for (auto a: subbatches[j].second) { 109 | MonomerAlignment new_m_aln(a.monomer_name, a.read_name, 110 | new_reads[read_index].read_id.id + a.start_pos, new_reads[read_index].read_id.id + a.end_pos, 111 | a.identity, a.best); 112 | batch.push_back(new_m_aln); 113 | } 114 | } 115 | cerr << (p + 1) * 100/save_steps.size() << "%: Aligned " << batch[0].read_name << endl; 116 | batch = PostProcessing(batch); 117 | SaveBatch(batch); 118 | start += save_steps[p]; 119 | ++ p; 120 | } 121 | } 122 | } 123 | 124 | ~MonomersAligner() { 125 | } 126 | 127 | private: 128 | double MonomerEditDistance(Seq& monomer, Seq& read) { 129 | EdlibAlignResult result = edlibAlign(monomer.seq.c_str(), monomer.seq.size(), read.seq.c_str(), read.seq.size(), edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_DISTANCE, NULL, 0)); 130 | double res = result.editDistance; 131 | edlibFreeAlignResult(result); 132 | return res; 133 | } 134 | 135 | std::vector<Seq> FilterMonomersForRead(Seq& read, int ed_thr) { 136 | std::vector<Seq> monomers_for_read; 137 | std::vector<std::pair<double, size_t>> mn_edit; 138 | for (size_t i = 0; i < monomers_.size(); ++i) { 139 | mn_edit.push_back(std::make_pair(MonomerEditDistance(monomers_[i], read), i)); 140 | } 141 | std::sort(mn_edit.begin(), mn_edit.end()); 142 | monomers_for_read.push_back(monomers_[mn_edit[0].second]); 143 | for (size_t i = 1; i < mn_edit.size(); ++i) { 144 | if (mn_edit[i].first <= ed_thr) { 145 | monomers_for_read.push_back(monomers_[mn_edit[i].second]); 146 | } 147 | } 148 | return monomers_for_read; 149 | } 150 | 151 | vector<MonomerAlignment> AlignPartClassicDP(Seq &read, std::vector<Seq>& monomers) { 152 | int ins = ins_; 153 | int del = del_; 154 | int match = match_; 155 | int mismatch = mismatch_; 156 | int INF = -1000000; 157 | int monomers_num = (int) monomers.size(); 158 | vector<vector<vector<long long>>> dp(read.seq.size()); 159 | //cout << dp.size() << endl; 160 | for (size_t i = 0; i < read.seq.size(); ++ i) { 161 | for (const auto & m: monomers) { 162 | dp[i].push_back(vector<long long>(m.seq.size())); 163 | for (size_t k = 0; k < m.seq.size(); ++ k) { 164 | dp[i][dp[i].size() - 1][k] = INF; 165 | } 166 | } 167 | dp[i].push_back(vector<long long>(1)); 168 | dp[i][monomers_num][0] = INF; 169 | } 170 | 171 | for (size_t j = 0; j < monomers.size(); ++ j) { 172 | Seq m = monomers[j]; 173 | if (m.seq[0] == read.seq[0]) { 174 | dp[0][j][0] = match; 175 | } else { 176 | dp[0][j][0] = mismatch; 177 | } 178 | for (size_t k = 1; k < m.seq.size(); ++ k) { 179 | long long mm_score = monomers[j].seq[k] == read.seq[0] ? match: mismatch; 180 | dp[0][j][k] = max(dp[0][j][k-1] + del, (long long)(del*(k-1) + mm_score)); 181 | } 182 | } 183 | for (size_t i = 1; i < read.seq.size(); ++ i) { 184 | for (size_t j = 0; j < monomers.size(); ++ j) { 185 | dp[i][monomers_num][0] = max(dp[i][monomers_num][0], dp[i-1][j][monomers[j].size() - 1]); 186 | } 187 | for (size_t j = 0; j < monomers.size(); ++ j) { 188 | for (size_t k = 0; k < monomers[j].size(); ++ k) { 189 | long long score = INF; 190 | int mm_score = monomers[j].seq[k] == read.seq[i] ? match: mismatch;
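                        // Candidate transitions for dp[i][j][k]: start monomer j fresh from the sentinel row (paying k leading deletions plus this match/mismatch), or, within the monomer, extend diagonally on a match/mismatch, take an insertion, or take a deletion.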
191 | if (dp[i][monomers_num][0] > INF) { 192 | score = max(score, dp[i][monomers_num][0] + mm_score + static_cast<long long>(k*del)); 193 | } 194 | if (k > 0) { 195 | if (dp[i-1][j][k-1] > INF) { 196 | score = max(score, dp[i-1][j][k-1] + mm_score); 197 | } 198 | if (dp[i-1][j][k] > INF) { 199 | score = max(score, dp[i-1][j][k] + ins); 200 | } 201 | if (dp[i][j][k-1] > INF) { 202 | score = max(score, dp[i][j][k-1] + del); 203 | } 204 | } 205 | dp[i][j][k] = score; 206 | } 207 | } 208 | } 209 | int max_score = INF; 210 | int best_m = monomers_num; 211 | for (size_t j = 0; j < monomers.size(); ++ j) { 212 | if (max_score < dp[read.seq.size()-1][j][monomers[j].size() -1] ) { 213 | max_score = dp[read.seq.size()-1][j][monomers[j].size() -1]; 214 | best_m = j; 215 | } 216 | } 217 | vector<MonomerAlignment> ans; 218 | long long i = read.seq.size() - 1; 219 | long long j = best_m; 220 | long long k = dp[i][j].size() - 1; 221 | bool monomer_changed = true; 222 | MonomerAlignment cur_aln; 223 | while (i >= 0) { 224 | if (k == static_cast<long long>(dp[i][j].size() - 1) && j != monomers_num && monomer_changed) { 225 | cur_aln = MonomerAlignment(monomers[j].read_id.name, read.read_id.name, i, i, dp[i][j][k], true); 226 | monomer_changed = false; 227 | } 228 | if (j == monomers_num) { 229 | if (i != 0) { 230 | for (size_t p = 0; p < dp[i - 1].size(); ++ p) { 231 | if (dp[i - 1][p][dp[i - 1][p].size() - 1] == dp[i][j][k]) { 232 | -- i; 233 | j = p; 234 | k = dp[i][j].size() - 1; 235 | break; 236 | } 237 | } 238 | } else { 239 | -- i; 240 | } 241 | } else { 242 | if (k != 0 && dp[i][j][k] == dp[i][j][k-1] + del) { 243 | --k; 244 | } else { 245 | if (i != 0 && dp[i][j][k] == dp[i-1][j][k] + ins) { 246 | --i; 247 | } else{ 248 | int mm_score = monomers[j].seq[k] == read.seq[i] ? match: mismatch; 249 | if (i != 0 && k != 0 && dp[i][j][k] == dp[i-1][j][k-1] + mm_score) { 250 | --i; --k; 251 | } else { 252 | monomer_changed = true; 253 | if (i != 0 && dp[i][monomers_num][0] + k*del + mm_score == dp[i][j][k]) { 254 | cur_aln.start_pos = i; 255 | cur_aln.identity = cur_aln.identity - dp[i][monomers_num][0]; 256 | ans.push_back(cur_aln); 257 | j = monomers_num; k = 0; 258 | } else { 259 | cur_aln.start_pos = i; 260 | ans.push_back(cur_aln); 261 | --i; 262 | } 263 | } 264 | } 265 | } 266 | } 267 | } 268 | reverse(ans.begin(), ans.end()); 269 | return ans; 270 | } 271 | 272 | void SaveBatch(vector<MonomerAlignment> &batch) { 273 | int prev_end = 0; 274 | for (auto a: batch) { 275 | string s = a.read_name + "\t" 276 | + a.monomer_name + "\t" 277 | + to_string(a.start_pos) + "\t" 278 | + to_string(a.end_pos) + "\t" 279 | + to_string(a.identity) + "\t" 280 | + to_string(a.start_pos - prev_end) + "\t" 281 | + to_string(a.end_pos - a.start_pos); 282 | prev_end = a.end_pos; 283 | cout << s << "\n"; 284 | } 285 | } 286 | 287 | vector<MonomerAlignment> PostProcessing(vector<MonomerAlignment> &batch) { 288 | vector<MonomerAlignment> res; 289 | size_t i = 0; 290 | while (i < batch.size()) { 291 | for (size_t j = i + 1; j < min(i + 7, batch.size()); ++ j) { 292 | if ((batch[i].end_pos - batch[j].start_pos)*2 > (batch[j].end_pos - batch[j].start_pos)) { 293 | res.push_back(batch[i]); 294 | i = j + 1; 295 | break; 296 | } 297 | } 298 | if (i < batch.size() ) res.push_back(batch[i]); 299 | ++ i; 300 | } 301 | return res; 302 | } 303 | 304 | vector<Seq> monomers_; 305 | const int SAVE_STEP = 1; 306 | int ins_; 307 | int del_; 308 | int mismatch_; 309 | int match_; 310 | }; 311 | 312 | 313 | 314 | vector<Seq> load_fasta(string filename) { 315 | std::ifstream input_file; 316 | input_file.open(filename, std::ifstream::in);
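    // FASTA parsing: a '>' line starts a new record (only the first whitespace-separated token of the header is kept as the name); every other line is appended to the current record's sequence.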
317 | string s; 318 | vector<Seq> seqs; 319 | while (getline(input_file, s)) { 320 | if (s[0] == '>') { 321 | string header = s.substr(1, s.size() - 1); 322 | istringstream iss(header); 323 | vector<string> header_v((istream_iterator<string>(iss)), 324 | istream_iterator<string>()); 325 | seqs.push_back(Seq(header_v[0], "")); 326 | } else { 327 | seqs[seqs.size()-1].seq += s; 328 | } 329 | } 330 | set<char> nucs = {'A', 'C', 'G', 'T', 'N'}; 331 | bool hasN = false; 332 | for (const auto & seq: seqs) { 333 | for (char c: seq.seq) { 334 | if (nucs.count(c) == 0) { 335 | cerr << "ERROR: Sequence " << seq.read_id.name <<" contains undefined symbol (not ACGT): " << c << endl; 336 | exit(-1); 337 | } else if (c == 'N') { 338 | hasN = true; 339 | } 340 | } 341 | } 342 | if (hasN) { 343 | cerr << "WARNING: sequences in " << filename << " contain N symbol. It will be counted as a separate symbol in scoring!" << endl; 344 | } 345 | return seqs; 346 | } 347 | 348 | string reverse_complement(string &s){ 349 | string res = ""; 350 | map<char, char> rc = {{'A', 'T'}, {'T', 'A'}, {'G','C'}, {'C','G'}, {'N','N'}}; 351 | for (int i = (int) s.size() - 1; i >= 0; --i){ 352 | try { 353 | res += rc.at(s[i]); 354 | } 355 | catch (std::out_of_range& e) 356 | { 357 | cerr << e.what() << std::endl; 358 | exit(-1); 359 | } 360 | } 361 | return res; 362 | } 363 | 364 | void add_reverse_complement(vector<Seq> &monomers) { 365 | vector<Seq> rev_c_monomers; 366 | for (auto s: monomers) { 367 | rev_c_monomers.push_back(Seq(s.read_id.name + "'", reverse_complement(s.seq))); 368 | } 369 | monomers.insert(monomers.end(), rev_c_monomers.begin(), rev_c_monomers.end()); 370 | return; 371 | } 372 | 373 | 374 | int main(int argc, char **argv) { 375 | if (argc < 5) { 376 | cout << "Failed to process. Number of arguments < 5\n"; 377 | cout << "./decompose <reads> <monomers> <threads> <part_size> <overlap> [<ins> <del> <mismatch> <match>]\n"; 378 | return -1; 379 | } 380 | int ins = -1, del = -1, mismatch = -1, match = 1; 381 | if (argc == 10) { 382 | ins = stoi(argv[6]); 383 | del = stoi(argv[7]); 384 | mismatch = stoi(argv[8]); 385 | match = stoi(argv[9]); 386 | } 387 | 388 | int ed_thr = -1; 389 | if (argc == 11) { 390 | ed_thr = stoi(argv[10]); 391 | } 392 | 393 | cerr << "Scores: insertion=" << ins << " deletion=" << del << " mismatch=" << mismatch << " match=" << match << endl; 394 | vector<Seq> reads = load_fasta(argv[1]); 395 | vector<Seq> monomers = load_fasta(argv[2]); 396 | add_reverse_complement(monomers); 397 | MonomersAligner monomers_aligner(monomers, ins, del, mismatch, match); 398 | int num_threads = stoi(argv[3]); 399 | int part_size = stoi(argv[4]); 400 | int overlap = stoi(argv[5]); 401 | monomers_aligner.AlignReadsSet(reads, num_threads, part_size, ed_thr, overlap); 402 | } 403 | -------------------------------------------------------------------------------- /stringdecomposer/test_data/DXZ1_star_monomers.fa: -------------------------------------------------------------------------------- 1 | >A_0_DXZ1*_doubled/1978_2147/R 2 | TCCGTTTAGCTTTTAGGTGAAGATTATCCCGTTTCCAACGAAACCTTCAAAGAGGTCCAAATATCCCCTTGCGGATCCCACAGAAAGAGTGTTTCGAAACTGCTGTTTCAAAGGAATCTTCAACTCTGTGAGTTGAATGCAATCATCACAAAGAAGTTTCTGACAATGCT 3 | >B_1_DXZ1*_doubled/94_279/R 4 | TCTCTCTCGTCTTTCTGTGAAGATAAAGGAAAAGGCTTTCAGGCCTTTTCCACCACAGGCCTGAAAGCGCTCCAAATGTCCACTTGCAGATTCTGCCAAAAGAATATTTCAAAACTGCTCTATGAAAAGCAATGTTAAACTCTGTGGCTCGAACACAAACATCACAAAGCAGTTTCTGAGAATGCT 5 | >C_2_DXZ1*_doubled/280_450/R 6 | TCAGTTTAGTTTTTCTGTGGAAATATTCCCGTTTCCAAAGAAATCTTCAAAGAGGTCCACGTATCCACTTACAGATTCTACAAAAAGACAGTTTCAAAACTGCTCCATCAAAAGGAGGGTTCAACTGTGTGACTTGAATGCAATCATCACTCAGAAGTTTCTGAGAATGCT 7 | >D_3_DXZ1*_doubled/451_620/R 8 |
TCTCTTTAGTTTTTACGTGAACATATACCCGTTTCGAACGAAGGCCACCCAGTGGTCCAAATATCCACTTGCAGATTCTACAGAAAGAGTGTTTCGAACCTGAACTCTCAAAGGCAGGTTCATCTCTGCGAGTTAAATGCATTCATCATGAAGAACTTTCTCAGAGTGTT 9 | >E_4_DXZ1*_doubled/621_788/R 10 | TGTGTTTAGTTATGGGAAATTATTCCCGTTTCCAACGAAATCCTCAGAGAGCTCCAAATATCCACCTGCAGATTCTACCAAAAGTGTATTTGGAAACTGCTCCATCAAAAGGCATGTTCAGCTCTGTGAGTGAAACTCCATCATCACAAAGAATATTCTGAGAATGCT 11 | >F_5_DXZ1*_doubled/789_959/R 12 | TCCGTTTGCCTTTTATATGAAGTTCCTTCCTGTACTACCGTAGGCCTCAAAGCAGTCCAAATCTCCATTTGCAGATTCTACAAAAAGAGTGATTCCAATCTGCTCTATCAATAGGATTGTTCAACTCCATGAGTTGAATGCCATCCTCACAAAGTCGTTTCTGAGAATGCT 13 | >G_6_DXZ1*_doubled/960_1129/R 14 | TCTATCTAGTTTTTATGTGAAGATATTTCCTTTTCCACCACAGGCCTCAAAGCCCTCCAAACGTCCACTTGCAGATTCTCGAAAAAGAGTGTTTCATAGCTGCTCTTTCAAAGGAAAGTTCAACTCTGGGAGTTGAATACAAACATCACAAAGTAGTTTCCGAGAATGCT 15 | >H_7_DXZ1*_doubled/1130_1300/R 16 | TCTGTTTAGTTTTTATGTGAAGATGATCCCGTTTCCAGTGAAATCTTCAAAGAGGTCCACATATCCCCTTGCAGATTCCAAAGAAAGAGGGTTTCAAAACTGCTCCATCAGAAGGATTGTTCAACTCTGTGAGTTGAATGCAGTCATCGCAGAAAACTTTCTGAGAATGCT 17 | >I_8_DXZ1*_doubled/1301_1470/R 18 | TCTGTCTAGGTTTGATGTGAAGATATAGACGTTTCAAACGAAGGCTACAAAGTGGTCAAAATATACACTTGCAGATTCTACTACAAGGGTGTTGCAAACCTGAACTATCAAAGGAAGGTTCAACTCTGTGAGTTGAATACAAACATCACAAAGAATGTTCTGAGTTTGCT 19 | >J_9_DXZ1*_doubled/1471_1638/R 20 | TCCGTTCAGTTATGGGAAGTTGATCCCGTTTCCAACGAAATCCTCAGAGAGGTCCAAATATCCCCTTGCAGATTCTACAAAACGTGTGTTTGGAAACTGCTCCATCATAACGAATGTTCAGCTCCCTGAGTTAAACTCCATCGTCACAAAGAATTTTCTGAGAGTGCT 21 | >K_10_DXZ1*_doubled/1639_1808/R 22 | ACCGTCTGGTTTTTATATGAAGTTCTTTCCTTCACTACCACAGGCCTCAAAGCGGTCCAAATCTCCACTTGCAGATTCTACAAAAAGAGTGTTTGCAAACTGCTCTATCAAAGGAATGTTCAACTCTGGGAGTTGAATGCAATCATCACAGAGCAGTTTCTGAGAATGCT 23 | >L_11_DXZ1*_doubled/1809_1977/R 24 | TCTATGTCGTTTTTAGGAGAAGATATTTCCTTTTCCAACACAGTCCTCCAAGCCCGCTAAATAGCCACTTGCACATTGTAGAAAAAGTGTGTCAAAGCTGCGCTATCAAAGGGAAAGTTCAACTCTGTGAGGTGAATGCAAACATCCCAAAGAAGTTTCTGAGAATGCT 25 | --------------------------------------------------------------------------------