├── .gitignore ├── LICENSE ├── PositionsToExplore_Vsig4.tsv ├── PositionsToExplore_example.tsv ├── README.md ├── Vsig4_example └── Backbones │ ├── 5IMKBBmove1.pdb │ ├── 5IMKBBmove10.pdb │ ├── 5IMKBBmove11.pdb │ ├── 5IMKBBmove12.pdb │ ├── 5IMKBBmove13.pdb │ ├── 5IMKBBmove14.pdb │ ├── 5IMKBBmove2.pdb │ ├── 5IMKBBmove3.pdb │ ├── 5IMKBBmove4.pdb │ ├── 5IMKBBmove5.pdb │ ├── 5IMKBBmove6.pdb │ ├── 5IMKBBmove7.pdb │ ├── 5IMKBBmove8.pdb │ ├── 5IMKBBmove9.pdb │ ├── RP5imkBA.pdb │ └── RP5imlBA.pdb ├── evolvex_config_Vsig4.yaml ├── evolvex_config_example.yaml ├── evolvex_slurm_head_example.sbatch ├── setup.py └── src └── evolvex ├── __init__.py ├── command_line_interface.py ├── dask_parallel.py ├── foldx_commands.py ├── main.py ├── model_dataclasses.py ├── model_generation.py ├── mutate_interface.py ├── search_algorithms.py ├── utils.py └── utils_bio.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # VSCode 86 | .vscode/* 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | 165 | # Project specific 166 | .ipynb -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /PositionsToExplore_Vsig4.tsv: -------------------------------------------------------------------------------- 1 | Pdb number Res1 Chain AA_Allowed MakeAla 2 | RP5imkBA 52 R N AUTO Y 3 | RP5imkBA 53 W N AUTO Y 4 | RP5imkBA 54 N N AUTO Y 5 | RP5imkBA 56 G N AUTO Y 6 | RP5imkBA 57 S N AUTO Y 7 | RP5imkBA 100 R N AUTO Y 8 | RP5imkBA 101 W N AUTO Y 9 | RP5imkBA 102 D N AUTO Y 10 | RP5imkBA 103 K N AUTO Y 11 | RP5imkBA 104 Y N AUTO Y 12 | RP5imkBA 106 S N AUTO Y 13 | RP5imkBA 107 S N AUTO Y 14 | RP5imkBA 108 F N AUTO Y 15 | RP5imkBA 110 D N AUTO Y 16 | RP5imkBA 111 E N AUTO Y 17 | RP5imkBA 112 Y N AUTO Y 18 | RP5imkBA 113 D N AUTO Y 19 | RP5imlBA 52 R N AUTO Y 20 | RP5imlBA 53 W N AUTO Y 21 | RP5imlBA 54 N N AUTO Y 22 | RP5imlBA 56 G N AUTO Y 23 | RP5imlBA 57 S N AUTO Y 24 | RP5imlBA 100 R N AUTO Y 25 | RP5imlBA 101 W N AUTO Y 26 | RP5imlBA 102 D N AUTO Y 27 | RP5imlBA 103 K N AUTO Y 28 | RP5imlBA 104 Y N AUTO Y 29 | RP5imlBA 106 S N AUTO Y 30 | RP5imlBA 107 S N AUTO Y 31 | RP5imlBA 108 F N AUTO Y 32 | RP5imlBA 110 D N AUTO Y 33 | RP5imlBA 111 E N 
AUTO Y 34 | RP5imlBA 112 Y N AUTO Y 35 | RP5imlBA 113 D N AUTO Y 36 | RP5immBA 52 R N AUTO Y 37 | RP5immBA 53 W N AUTO Y 38 | RP5immBA 54 N N AUTO Y 39 | RP5immBA 56 G N AUTO Y 40 | RP5immBA 57 S N AUTO Y 41 | RP5immBA 100 R N AUTO Y 42 | RP5immBA 101 W N AUTO Y 43 | RP5immBA 102 D N AUTO Y 44 | RP5immBA 103 K N AUTO Y 45 | RP5immBA 104 Y N AUTO Y 46 | RP5immBA 106 S N AUTO Y 47 | RP5immBA 107 S N AUTO Y 48 | RP5immBA 108 F N AUTO Y 49 | RP5immBA 110 D N AUTO Y 50 | RP5immBA 111 E N AUTO Y 51 | RP5immBA 112 Y N AUTO Y 52 | RP5immBA 113 D N AUTO Y 53 | RP5imoBA 52 R N AUTO Y 54 | RP5imoBA 53 W N AUTO Y 55 | RP5imoBA 54 N N AUTO Y 56 | RP5imoBA 56 G N AUTO Y 57 | RP5imoBA 57 S N AUTO Y 58 | RP5imoBA 100 R N AUTO Y 59 | RP5imoBA 101 W N AUTO Y 60 | RP5imoBA 102 D N AUTO Y 61 | RP5imoBA 103 K N AUTO Y 62 | RP5imoBA 104 Y N AUTO Y 63 | RP5imoBA 106 S N AUTO Y 64 | RP5imoBA 107 S N AUTO Y 65 | RP5imoBA 108 F N AUTO Y 66 | RP5imoBA 110 D N AUTO Y 67 | RP5imoBA 111 E N AUTO Y 68 | RP5imoBA 112 Y N AUTO Y 69 | RP5imoBA 113 D N AUTO Y 70 | Shake5imkBA 52 R N AUTO Y 71 | Shake5imkBA 53 W N AUTO Y 72 | Shake5imkBA 54 N N AUTO Y 73 | Shake5imkBA 56 G N AUTO Y 74 | Shake5imkBA 57 S N AUTO Y 75 | Shake5imkBA 100 R N AUTO Y 76 | Shake5imkBA 101 W N AUTO Y 77 | Shake5imkBA 102 D N AUTO Y 78 | Shake5imkBA 103 K N AUTO Y 79 | Shake5imkBA 104 Y N AUTO Y 80 | Shake5imkBA 106 S N AUTO Y 81 | Shake5imkBA 107 S N AUTO Y 82 | Shake5imkBA 108 F N AUTO Y 83 | Shake5imkBA 110 D N AUTO Y 84 | Shake5imkBA 111 E N AUTO Y 85 | Shake5imkBA 112 Y N AUTO Y 86 | Shake5imkBA 113 D N AUTO Y 87 | Shake5imlBA 52 R N AUTO Y 88 | Shake5imlBA 53 W N AUTO Y 89 | Shake5imlBA 54 N N AUTO Y 90 | Shake5imlBA 56 G N AUTO Y 91 | Shake5imlBA 57 S N AUTO Y 92 | Shake5imlBA 100 R N AUTO Y 93 | Shake5imlBA 101 W N AUTO Y 94 | Shake5imlBA 102 D N AUTO Y 95 | Shake5imlBA 103 K N AUTO Y 96 | Shake5imlBA 104 Y N AUTO Y 97 | Shake5imlBA 106 S N AUTO Y 98 | Shake5imlBA 107 S N AUTO Y 99 | Shake5imlBA 108 F N AUTO Y 100 | 
Shake5imlBA 110 D N AUTO Y 101 | Shake5imlBA 111 E N AUTO Y 102 | Shake5imlBA 112 Y N AUTO Y 103 | Shake5imlBA 113 D N AUTO Y 104 | Shake5immBA 52 R N AUTO Y 105 | Shake5immBA 53 W N AUTO Y 106 | Shake5immBA 54 N N AUTO Y 107 | Shake5immBA 56 G N AUTO Y 108 | Shake5immBA 57 S N AUTO Y 109 | Shake5immBA 100 R N AUTO Y 110 | Shake5immBA 101 W N AUTO Y 111 | Shake5immBA 102 D N AUTO Y 112 | Shake5immBA 103 K N AUTO Y 113 | Shake5immBA 104 Y N AUTO Y 114 | Shake5immBA 106 S N AUTO Y 115 | Shake5immBA 107 S N AUTO Y 116 | Shake5immBA 108 F N AUTO Y 117 | Shake5immBA 110 D N AUTO Y 118 | Shake5immBA 111 E N AUTO Y 119 | Shake5immBA 112 Y N AUTO Y 120 | Shake5immBA 113 D N AUTO Y 121 | Shake5imoBA 52 R N AUTO Y 122 | Shake5imoBA 53 W N AUTO Y 123 | Shake5imoBA 54 N N AUTO Y 124 | Shake5imoBA 56 G N AUTO Y 125 | Shake5imoBA 57 S N AUTO Y 126 | Shake5imoBA 100 R N AUTO Y 127 | Shake5imoBA 101 W N AUTO Y 128 | Shake5imoBA 102 D N AUTO Y 129 | Shake5imoBA 103 K N AUTO Y 130 | Shake5imoBA 104 Y N AUTO Y 131 | Shake5imoBA 106 S N AUTO Y 132 | Shake5imoBA 107 S N AUTO Y 133 | Shake5imoBA 108 F N AUTO Y 134 | Shake5imoBA 110 D N AUTO Y 135 | Shake5imoBA 111 E N AUTO Y 136 | Shake5imoBA 112 Y N AUTO Y 137 | Shake5imoBA 113 D N AUTO Y 138 | 5IMKBBmove1 52 R N AUTO Y 139 | 5IMKBBmove1 53 W N AUTO Y 140 | 5IMKBBmove1 54 N N AUTO Y 141 | 5IMKBBmove1 56 G N AUTO Y 142 | 5IMKBBmove1 57 S N AUTO Y 143 | 5IMKBBmove1 100 R N AUTO Y 144 | 5IMKBBmove1 101 W N AUTO Y 145 | 5IMKBBmove1 102 D N AUTO Y 146 | 5IMKBBmove1 103 K N AUTO Y 147 | 5IMKBBmove1 104 Y N AUTO Y 148 | 5IMKBBmove1 106 K N AUTO Y 149 | 5IMKBBmove1 107 F N AUTO Y 150 | 5IMKBBmove1 109 D N AUTO Y 151 | 5IMKBBmove1 110 E N AUTO Y 152 | 5IMKBBmove1 111 Y N AUTO Y 153 | 5IMKBBmove1 112 D N AUTO Y 154 | 5IMKBBmove2 52 R N AUTO Y 155 | 5IMKBBmove2 53 W N AUTO Y 156 | 5IMKBBmove2 54 N N AUTO Y 157 | 5IMKBBmove2 56 G N AUTO Y 158 | 5IMKBBmove2 57 S N AUTO Y 159 | 5IMKBBmove2 100 R N AUTO Y 160 | 5IMKBBmove2 101 W N AUTO Y 161 | 
5IMKBBmove2 102 D N AUTO Y 162 | 5IMKBBmove2 103 K N AUTO Y 163 | 5IMKBBmove2 104 Y N AUTO Y 164 | 5IMKBBmove2 106 A N AUTO Y 165 | 5IMKBBmove2 107 A N AUTO Y 166 | 5IMKBBmove2 108 F N AUTO Y 167 | 5IMKBBmove2 110 D N AUTO Y 168 | 5IMKBBmove2 111 E N AUTO Y 169 | 5IMKBBmove2 112 Y N AUTO Y 170 | 5IMKBBmove2 113 D N AUTO Y 171 | 5IMKBBmove3 52 R N AUTO Y 172 | 5IMKBBmove3 53 W N AUTO Y 173 | 5IMKBBmove3 54 N N AUTO Y 174 | 5IMKBBmove3 56 G N AUTO Y 175 | 5IMKBBmove3 57 S N AUTO Y 176 | 5IMKBBmove3 100 R N AUTO Y 177 | 5IMKBBmove3 101 W N AUTO Y 178 | 5IMKBBmove3 102 D N AUTO Y 179 | 5IMKBBmove3 103 K N AUTO Y 180 | 5IMKBBmove3 104 Y N AUTO Y 181 | 5IMKBBmove3 106 G N AUTO Y 182 | 5IMKBBmove3 107 G N AUTO Y 183 | 5IMKBBmove3 108 F N AUTO Y 184 | 5IMKBBmove3 110 D N AUTO Y 185 | 5IMKBBmove3 111 E N AUTO Y 186 | 5IMKBBmove3 112 Y N AUTO Y 187 | 5IMKBBmove3 113 D N AUTO Y 188 | 5IMKBBmove4 52 R N AUTO Y 189 | 5IMKBBmove4 53 W N AUTO Y 190 | 5IMKBBmove4 54 N N AUTO Y 191 | 5IMKBBmove4 56 G N AUTO Y 192 | 5IMKBBmove4 57 S N AUTO Y 193 | 5IMKBBmove4 100 R N AUTO Y 194 | 5IMKBBmove4 101 W N AUTO Y 195 | 5IMKBBmove4 102 D N AUTO Y 196 | 5IMKBBmove4 103 K N AUTO Y 197 | 5IMKBBmove4 104 Y N AUTO Y 198 | 5IMKBBmove4 106 G N AUTO Y 199 | 5IMKBBmove4 107 G N AUTO Y 200 | 5IMKBBmove4 108 F N AUTO Y 201 | 5IMKBBmove4 110 D N AUTO Y 202 | 5IMKBBmove4 111 E N AUTO Y 203 | 5IMKBBmove4 112 Y N AUTO Y 204 | 5IMKBBmove4 113 D N AUTO Y 205 | 5IMKBBmove5 52 R N AUTO Y 206 | 5IMKBBmove5 53 W N AUTO Y 207 | 5IMKBBmove5 54 N N AUTO Y 208 | 5IMKBBmove5 56 G N AUTO Y 209 | 5IMKBBmove5 57 S N AUTO Y 210 | 5IMKBBmove5 100 R N AUTO Y 211 | 5IMKBBmove5 101 W N AUTO Y 212 | 5IMKBBmove5 102 D N AUTO Y 213 | 5IMKBBmove5 103 K N AUTO Y 214 | 5IMKBBmove5 104 Y N AUTO Y 215 | 5IMKBBmove5 106 G N AUTO Y 216 | 5IMKBBmove5 107 G N AUTO Y 217 | 5IMKBBmove5 108 F N AUTO Y 218 | 5IMKBBmove5 110 D N AUTO Y 219 | 5IMKBBmove5 111 E N AUTO Y 220 | 5IMKBBmove5 112 Y N AUTO Y 221 | 5IMKBBmove5 113 D N AUTO Y 222 | 
5IMKBBmove6 52 R N AUTO Y 223 | 5IMKBBmove6 53 W N AUTO Y 224 | 5IMKBBmove6 54 N N AUTO Y 225 | 5IMKBBmove6 56 G N AUTO Y 226 | 5IMKBBmove6 57 S N AUTO Y 227 | 5IMKBBmove6 100 R N AUTO Y 228 | 5IMKBBmove6 101 W N AUTO Y 229 | 5IMKBBmove6 102 D N AUTO Y 230 | 5IMKBBmove6 103 K N AUTO Y 231 | 5IMKBBmove6 104 Y N AUTO Y 232 | 5IMKBBmove6 106 G N AUTO Y 233 | 5IMKBBmove6 107 G N AUTO Y 234 | 5IMKBBmove6 108 F N AUTO Y 235 | 5IMKBBmove6 110 D N AUTO Y 236 | 5IMKBBmove6 111 E N AUTO Y 237 | 5IMKBBmove6 112 Y N AUTO Y 238 | 5IMKBBmove6 113 D N AUTO Y 239 | 5IMKBBmove7 52 R N AUTO Y 240 | 5IMKBBmove7 53 W N AUTO Y 241 | 5IMKBBmove7 54 N N AUTO Y 242 | 5IMKBBmove7 56 G N AUTO Y 243 | 5IMKBBmove7 57 S N AUTO Y 244 | 5IMKBBmove7 100 R N AUTO Y 245 | 5IMKBBmove7 101 W N AUTO Y 246 | 5IMKBBmove7 102 D N AUTO Y 247 | 5IMKBBmove7 103 K N AUTO Y 248 | 5IMKBBmove7 104 Y N AUTO Y 249 | 5IMKBBmove7 106 G N AUTO Y 250 | 5IMKBBmove7 107 G N AUTO Y 251 | 5IMKBBmove7 108 F N AUTO Y 252 | 5IMKBBmove7 110 D N AUTO Y 253 | 5IMKBBmove7 111 E N AUTO Y 254 | 5IMKBBmove7 112 Y N AUTO Y 255 | 5IMKBBmove7 113 D N AUTO Y 256 | 5IMKBBmove8 52 R N AUTO Y 257 | 5IMKBBmove8 53 W N AUTO Y 258 | 5IMKBBmove8 54 N N AUTO Y 259 | 5IMKBBmove8 56 G N AUTO Y 260 | 5IMKBBmove8 57 S N AUTO Y 261 | 5IMKBBmove8 100 R N AUTO Y 262 | 5IMKBBmove8 101 W N AUTO Y 263 | 5IMKBBmove8 102 D N AUTO Y 264 | 5IMKBBmove8 103 K N AUTO Y 265 | 5IMKBBmove8 104 Y N AUTO Y 266 | 5IMKBBmove8 106 S N AUTO Y 267 | 5IMKBBmove8 107 S N AUTO Y 268 | 5IMKBBmove8 108 F N AUTO Y 269 | 5IMKBBmove8 110 D N AUTO Y 270 | 5IMKBBmove8 111 E N AUTO Y 271 | 5IMKBBmove8 112 Y N AUTO Y 272 | 5IMKBBmove8 113 D N AUTO Y 273 | 5IMKBBmove9 52 R N AUTO Y 274 | 5IMKBBmove9 53 W N AUTO Y 275 | 5IMKBBmove9 54 N N AUTO Y 276 | 5IMKBBmove9 56 G N AUTO Y 277 | 5IMKBBmove9 57 S N AUTO Y 278 | 5IMKBBmove9 100 R N AUTO Y 279 | 5IMKBBmove9 101 W N AUTO Y 280 | 5IMKBBmove9 102 D N AUTO Y 281 | 5IMKBBmove9 103 K N AUTO Y 282 | 5IMKBBmove9 104 Y N AUTO Y 283 | 
5IMKBBmove9 106 K N AUTO Y 284 | 5IMKBBmove9 107 E N AUTO Y 285 | 5IMKBBmove9 108 F N AUTO Y 286 | 5IMKBBmove9 110 D N AUTO Y 287 | 5IMKBBmove9 111 E N AUTO Y 288 | 5IMKBBmove9 112 Y N AUTO Y 289 | 5IMKBBmove9 113 D N AUTO Y 290 | 5IMKBBmove10 52 R N AUTO Y 291 | 5IMKBBmove10 53 W N AUTO Y 292 | 5IMKBBmove10 54 N N AUTO Y 293 | 5IMKBBmove10 56 G N AUTO Y 294 | 5IMKBBmove10 57 S N AUTO Y 295 | 5IMKBBmove10 100 R N AUTO Y 296 | 5IMKBBmove10 101 W N AUTO Y 297 | 5IMKBBmove10 102 D N AUTO Y 298 | 5IMKBBmove10 103 K N AUTO Y 299 | 5IMKBBmove10 104 Y N AUTO Y 300 | 5IMKBBmove10 106 A N AUTO Y 301 | 5IMKBBmove10 107 A N AUTO Y 302 | 5IMKBBmove10 108 F N AUTO Y 303 | 5IMKBBmove10 110 D N AUTO Y 304 | 5IMKBBmove10 111 E N AUTO Y 305 | 5IMKBBmove10 112 Y N AUTO Y 306 | 5IMKBBmove10 113 D N AUTO Y 307 | 5IMKBBmove11 52 R N AUTO Y 308 | 5IMKBBmove11 53 W N AUTO Y 309 | 5IMKBBmove11 54 N N AUTO Y 310 | 5IMKBBmove11 56 G N AUTO Y 311 | 5IMKBBmove11 57 S N AUTO Y 312 | 5IMKBBmove11 100 R N AUTO Y 313 | 5IMKBBmove11 101 W N AUTO Y 314 | 5IMKBBmove11 102 D N AUTO Y 315 | 5IMKBBmove11 103 K N AUTO Y 316 | 5IMKBBmove11 104 Y N AUTO Y 317 | 5IMKBBmove11 106 A N AUTO Y 318 | 5IMKBBmove11 107 A N AUTO Y 319 | 5IMKBBmove11 108 F N AUTO Y 320 | 5IMKBBmove11 110 D N AUTO Y 321 | 5IMKBBmove11 111 E N AUTO Y 322 | 5IMKBBmove11 112 Y N AUTO Y 323 | 5IMKBBmove11 113 D N AUTO Y 324 | 5IMKBBmove12 52 R N AUTO Y 325 | 5IMKBBmove12 53 W N AUTO Y 326 | 5IMKBBmove12 54 N N AUTO Y 327 | 5IMKBBmove12 56 G N AUTO Y 328 | 5IMKBBmove12 57 S N AUTO Y 329 | 5IMKBBmove12 100 R N AUTO Y 330 | 5IMKBBmove12 101 W N AUTO Y 331 | 5IMKBBmove12 102 D N AUTO Y 332 | 5IMKBBmove12 103 K N AUTO Y 333 | 5IMKBBmove12 104 Y N AUTO Y 334 | 5IMKBBmove12 106 A N AUTO Y 335 | 5IMKBBmove12 107 A N AUTO Y 336 | 5IMKBBmove12 108 F N AUTO Y 337 | 5IMKBBmove12 110 D N AUTO Y 338 | 5IMKBBmove12 111 E N AUTO Y 339 | 5IMKBBmove12 112 Y N AUTO Y 340 | 5IMKBBmove12 113 D N AUTO Y 341 | 5IMKBBmove13 52 R N AUTO Y 342 | 5IMKBBmove13 53 
W N AUTO Y 343 | 5IMKBBmove13 54 N N AUTO Y 344 | 5IMKBBmove13 56 G N AUTO Y 345 | 5IMKBBmove13 57 S N AUTO Y 346 | 5IMKBBmove13 100 R N AUTO Y 347 | 5IMKBBmove13 101 W N AUTO Y 348 | 5IMKBBmove13 102 D N AUTO Y 349 | 5IMKBBmove13 103 K N AUTO Y 350 | 5IMKBBmove13 104 Y N AUTO Y 351 | 5IMKBBmove13 106 E N AUTO Y 352 | 5IMKBBmove13 107 H N AUTO Y 353 | 5IMKBBmove13 108 F N AUTO Y 354 | 5IMKBBmove13 110 D N AUTO Y 355 | 5IMKBBmove13 111 E N AUTO Y 356 | 5IMKBBmove13 112 Y N AUTO Y 357 | 5IMKBBmove13 113 D N AUTO Y 358 | 5IMKBBmove14 52 R N AUTO Y 359 | 5IMKBBmove14 53 W N AUTO Y 360 | 5IMKBBmove14 54 N N AUTO Y 361 | 5IMKBBmove14 56 G N AUTO Y 362 | 5IMKBBmove14 57 S N AUTO Y 363 | 5IMKBBmove14 100 R N AUTO Y 364 | 5IMKBBmove14 101 W N AUTO Y 365 | 5IMKBBmove14 102 D N AUTO Y 366 | 5IMKBBmove14 103 K N AUTO Y 367 | 5IMKBBmove14 104 Y N AUTO Y 368 | 5IMKBBmove14 106 E N AUTO Y 369 | 5IMKBBmove14 107 H N AUTO Y 370 | 5IMKBBmove14 108 F N AUTO Y 371 | 5IMKBBmove14 110 D N AUTO Y 372 | 5IMKBBmove14 111 E N AUTO Y 373 | 5IMKBBmove14 112 Y N AUTO Y 374 | 5IMKBBmove14 113 D N AUTO Y -------------------------------------------------------------------------------- /PositionsToExplore_example.tsv: -------------------------------------------------------------------------------- 1 | Pdb number Res1 Chain AA_Allowed MakeAla 2 | ABCD_dock_1 90 T H AUTO Y 3 | ABCD_dock_1 92 F H AUTO Y 4 | ABCD_dock_1 94 N H AUTO Y 5 | ABCD_dock_1 96 A H AUTO Y 6 | ABCD_dock_1 98 N H AUTO Y 7 | ABCD_dock_1 100 Y H AUTO Y 8 | ABCD_dock_2 90 T H AUTO Y 9 | ABCD_dock_2 92 F H AUTO Y 10 | ABCD_dock_2 94 N H AUTO Y 11 | ABCD_dock_2 96 A H AUTO Y 12 | ABCD_dock_2 98 N H AUTO Y 13 | ABCD_dock_2 100 Y H AUTO Y -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EvolveX 2 | 3 | This repository contains the code of EvolveX, a *de novo* antibody computational design pipeline 
introduced in <[future link to paper]()>. Specifically, it corresponds to the computational pipeline that generates antibody designs given an initial set of antibody-antigen docks and a set of positions to mutate and explore, which is driven by the FoldX force field to optimize the binding affinity while maintaining the thermodynamic stability of the designed antibodies. 4 | 5 | # Installation 6 | 7 | - **Python version >= 3.9** 8 | - Strongly recommended to create a virtual environment (run "python -m venv venv", then "source venv/bin/activate") 9 | - Download and unzip the GitHub repository. 10 | - Run "pip install ." from the directory that contains the "setup.py" file. This will create an "evolvex" command that you can run from the command line. 11 | - Running EvolveX additionally requires FoldX **(version >= 5)**, which can be obtained [here](https://foldxsuite.crg.eu/licensing-and-services). 12 | 13 | The code has been tested on Linux and macOS operating systems. 14 | 15 | # How to run EvolveX 16 | 17 | The evolvex command only takes a single input, which is a YAML configuration file, for which an example can be found in "evolvex_config_example.yaml". 18 | 19 | **If you wish to run EvolveX on the human Vsig4 nanobody design example** showcased in our publication, simply set the number of CPU cores and the path to your FoldX folder in the pre-filled configuration file "evolvex_config_Vsig4.yaml" and run "evolvex evolvex_config_Vsig4.yaml". This will use the pre-generated antibody-antigen docks in the "Vsig4_example" folder. 20 | Note that the search parameters have been set to a reduced version as it only runs 10 iterations, 2 models per dock and performs recombination every 5 iterations, which should take ~3 hours to run on a personal laptop with 10 CPU cores. To run an even more reduced and faster version, remove all but 1 PDB file in the "Vsig4_example" folder, which should take < 1h. 
To run the same search as we did using 500 iterations, a population of 50 models per dock and recombination every 50 iterations, you would need to run it on a lab cluster or HPC (see the [additional details section](#additional-details) to run it on SLURM-based HPCs), otherwise it would take weeks to run on a personal laptop. 21 | 22 | EvolveX generates two main outputs in the working_dir folder: 23 | 24 | - A "generated_models_info.csv" file containing the antibody sequence designs selected at each iteration for each model. 25 | - A "model_PDB_files" folder containing the PDB files of each model in the CSV file. 26 | 27 | We then filter these designs using a number of thresholds for different characteristics which are determined based on the distribution of each characteristic in known antibody 3D structures, all of which is described in detail in our publication. 28 | 29 | For details about the configuration file and additional input files needed to run EvolveX, read below. 30 | 31 | ## Additional details 32 | The YAML configuration parameters are the following: 33 | 34 | - The "antibody_chains" and "antigen_chains" parameters are self-explanatory. The antibody can be a single chain (i.e. nanobody) or a standard double chain Fv. 35 | 36 | - In the "Required paths" section: 37 | - The working_dir is where EvolveX will write all the files it generates. If you are running on an HPC, make sure this folder points to a directory with enough space and which can perform fast writes. 38 | - The foldx_dir should contain an executable file named "foldx". 39 | - The Backbones_dir should contain PDB files corresponding to the initial set of antibody-antigen docks. 40 | - The PositionsToExplore_file_path should be a TSV-formatted file containing the positions to mutate for each PDB file in Backbones_dir (see PositionsToExplore_example.tsv for format). 
"AA_Allowed" can either be a string of pre-selected amino acids in single letter format (e.g KRHDE), or set to "AUTO" to let EvolveX test each individual mutation and determine which ones are worth trying during the GA search. "MakeAla" can be set to "Y", in which case that position will be mutated to Alanine before the GA search, or to "N", in which case the wildtype amino acid will be kept as the starting amino acid at that position before the GA search. 41 | 42 | - In the "Search algorithm settings" section, the population_size corresponds to the number of models that will be generated and explored PER DOCK. So if you have 100 PDBs and set this to 100, 10000 models will be generated and explored over the number of iterations you have selected. By default, we run 500 iterations, 50 models per dock and do a recombination step every 50 iterations. 43 | 44 | - In the "Compute settings" section: 45 | - The "compute_env" can be set to "local" (default) or "SLURM" if running on a SLURM-based HPC. 46 | - The "n_cores" sets the number of parallel CPU cores to use, both for the local or SLURM compute environments. 47 | - When running on SLURM, additional parameters are required: 48 | - The number of CPU cores are split across "max_SLURM_jobs". Most SLURM HPCs limit the number of jobs a user can have in the queue at any time, so you should set the "max_SLURM_jobs" accordingly. For example, if n_cores=250 and max_SLURM_jobs=25, then 25 jobs with 10 CPU cores each will be submitted. 49 | - The "walltime" corresponds to the maximum time the jobs will run for. 50 | - Set the "account_name", "cluster_name" and "cluster_partition" according to your HPC. 51 | - Adapt the "SLURM_job_prologue" to your HPC, making sure the Python version you load is the same as the one used to create the virtual environment used to install Evolvex. 52 | 53 | Once the YAML file is ready, run "evolvex evolvex_config.yaml". 
54 | 55 | **NOTE: When running on a SLURM environment, if a "tcp connection error" or any other similar error that suggests that the head process has lost communication with the workers arises when launching EvolveX from a login node, try launching the evolvex command through a SLURM script so that the head process runs on a compute node instead (see the "evolvex_slurm_head_example.sbatch").** -------------------------------------------------------------------------------- /evolvex_config_Vsig4.yaml: -------------------------------------------------------------------------------- 1 | # PDB information 2 | antibody_chains: "N" 3 | antigen_chains: "L" 4 | 5 | # Required paths 6 | working_dir: "./Vsig4_example" 7 | foldx_dir: "/path/to/foldx" 8 | Backbones_dir: "./Vsig4_example/Backbones" 9 | PositionsToExplore_file_path: "PositionsToExplore_Vsig4.tsv" 10 | 11 | # Search algorithm settings 12 | search_algorithm: "GA" 13 | max_iterations: 10 14 | population_size: 2 # To be understood as the population size per backbone. A warning is issued if the value is < 50. 
recombine_every_nth_iteration: 5

# Compute settings
compute_env: "local" # local | SLURM
n_cores: 10
# SLURM specific settings
# max_SLURM_jobs: 100
# walltime: 3-00
# account_name: "xxxxxx" # SLURM credit account
# cluster_name: "xxxx"
# cluster_partition: "xxxx"
# SLURM_job_prologue:
#   - "module --force purge"
#   - "module load path/to/cluster/partition"
#   - "module load Python/xxxx"
#   - "source /path/to/venv/bin/activate"

# Other settings
residues_to_ignore: "GMHC"
vdwDesign: 2 # See https://foldxsuite.crg.eu/parameter/vdwDesign
print_stdout: false # Useful for debugging
calculate_binding_dG_with_water: true
--------------------------------------------------------------------------------
/evolvex_config_example.yaml:
--------------------------------------------------------------------------------
# PDB information
antibody_chains: "HL"
antigen_chains: "A"

# Required paths
working_dir: "/path/to/working/dir"
foldx_dir: "/path/to/foldx"
Backbones_dir: "/path/to/Backbones"
PositionsToExplore_file_path: "/path/to/PositionsToExplore.txt"

# Search algorithm settings
# NOTE: must be one of "GA" or "systematic" — the config validator rejects any other value.
search_algorithm: "GA"
max_iterations: 500
population_size: 50 # To be understood as the population size per backbone. A warning is issued if the value is < 50.
15 | recombine_every_nth_iteration: 50 16 | 17 | # Compute settings 18 | compute_env: "local" # local | SLURM 19 | n_cores: 10 20 | # SLURM specific settings 21 | # max_SLURM_jobs: 100 22 | # walltime: 3-00 23 | # account_name: "xxxxxx" # SLURM credit account 24 | # cluster_name: "xxxx" 25 | # cluster_partition: "xxxx" 26 | # SLURM_job_prologue: 27 | # - "module --force purge" 28 | # - "module load path/to/cluster/partition" 29 | # - "module load Python/xxxx" 30 | # - "source /path/to/venv/bin/activate" 31 | 32 | # Other settings 33 | residues_to_ignore: "GMHC" 34 | vdwDesign: 2 # See https://foldxsuite.crg.eu/parameter/vdwDesign 35 | print_stdout: false # Useful for debugging 36 | calculate_binding_dG_with_water: true 37 | -------------------------------------------------------------------------------- /evolvex_slurm_head_example.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=evolvex_head 3 | #SBATCH --account=xxxx 4 | #SBATCH --clusters=xxxx 5 | #SBATCH --partition=xxxx 6 | #SBATCH --output=output_%a.out 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --mem=20G 10 | #SBATCH --time=3-00 11 | 12 | module --force purge 13 | 14 | module load path/to/cluster/partition 15 | module load Python/xxxx 16 | 17 | source /full/path/to/venv/bin/activate 18 | 19 | /full/path/to/evolvex_executable /full/path/to/evolvex_config.yaml # To obtain the full path of the evolvex_executable, run "which evolvex". 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='EvolveX', 5 | version='1', 6 | description='EvolveX antibody design pipeline', 7 | author='Gabriel Cia and Rob Van Der Kant - SwitchLab', 8 | packages=find_packages(where='src'), 9 | package_dir={'': 'src'}, 10 | python_requires='>=3.9', 11 | install_requires=['pyyaml', 'biopython>=1.81', 'pandas', 'dask', 'distributed', 'dask-jobqueue'], 12 | entry_points={ 13 | 'console_scripts': [ 14 | 'evolvex = evolvex.main:main', 15 | ] 16 | }, 17 | ) -------------------------------------------------------------------------------- /src/evolvex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SwitchLab-VIB/EvolveX/7dbe0dd7c70119dc4fd07ccf0c0b253a50dc4f96/src/evolvex/__init__.py -------------------------------------------------------------------------------- /src/evolvex/command_line_interface.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import argparse 5 | from types import SimpleNamespace 6 | import warnings 7 | import yaml 8 | from pathlib import Path 9 | 10 | def read_and_validate_config_file(file_path): 11 | with open(file_path, 'rt') as f: 12 | GLOBALS = yaml.safe_load(f) 13 | GLOBALS = SimpleNamespace(**GLOBALS) # SimpleNamespace enables the use of dot notation to access values instead of dict brackets 14 | 15 | GLOBALS.working_dir = Path(GLOBALS.working_dir) 16 | GLOBALS.foldx_dir = Path(GLOBALS.foldx_dir) 17 | GLOBALS.Backbones_dir = Path(GLOBALS.Backbones_dir) 18 | GLOBALS.PositionsToExplore_file_path = Path(GLOBALS.PositionsToExplore_file_path) 19 | if not (GLOBALS.working_dir.is_dir()): 20 | GLOBALS.working_dir.mkdir(parents=True) 21 | 22 | if not 
(GLOBALS.foldx_dir.is_dir() and (GLOBALS.foldx_dir / 'foldx').exists()): 23 | raise ValueError("The foldx_dir must contain an executable file named 'foldx'.") 24 | 25 | if GLOBALS.search_algorithm not in ('systematic', 'GA'): 26 | raise ValueError("The search_algorithm must be one of 'GA' or 'systematic'.") 27 | 28 | if GLOBALS.search_algorithm == 'GA': 29 | if GLOBALS.population_size % 2 != 0: 30 | raise ValueError("Population_size must be an even number.") 31 | if GLOBALS.population_size < 50: 32 | warnings.warn("The population_size for each PDB backbone is < 50, which is low.") 33 | 34 | if GLOBALS.compute_env == 'SLURM': 35 | SLURM_parameters = ('account_name', 'cluster_name', 'cluster_partition') 36 | if not all(hasattr(GLOBALS, parameter) for parameter in SLURM_parameters): 37 | raise ValueError("The SLURM compute_env requires the following parameters: {SLURM_parameters}") 38 | 39 | return GLOBALS 40 | 41 | def command_line_interface(): 42 | parser = argparse.ArgumentParser( 43 | description='EvolveX', 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 45 | ) 46 | 47 | parser.add_argument( 48 | 'config', type=Path, 49 | help="Path to YAML configuration file. See the evolvex_config_example.yaml file for the list of available parameters, and the README for an exaplanation of each parameter." 
50 | ) 51 | 52 | args = parser.parse_args() 53 | 54 | GLOBALS = read_and_validate_config_file(file_path = args.config) 55 | 56 | return GLOBALS 57 | -------------------------------------------------------------------------------- /src/evolvex/dask_parallel.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from dask.distributed import Client, LocalCluster, wait 4 | from dask_jobqueue import SLURMCluster 5 | 6 | def setup_dask_parallel_executor(GLOBALS): 7 | compute_env = GLOBALS.compute_env 8 | n_cores = GLOBALS.n_cores 9 | 10 | if compute_env == 'local': 11 | cluster = LocalCluster( 12 | n_workers = n_cores, 13 | threads_per_worker = 1, 14 | processes = True, 15 | ) 16 | 17 | elif compute_env == 'SLURM': 18 | account = GLOBALS.account_name 19 | clusters = GLOBALS.cluster_name 20 | SLURM_job_prologue = GLOBALS.SLURM_job_prologue 21 | max_SLURM_jobs = GLOBALS.max_SLURM_jobs 22 | partition = GLOBALS.cluster_partition 23 | walltime = GLOBALS.walltime 24 | 25 | # SLURMCluster determines which resources are mobilized by each worker, which are then scaled according to the number of CPUs by the cluster.adapt function. 
26 | SLURM_info_files_dir = GLOBALS.working_dir / 'slurm_info_files'; SLURM_info_files_dir.mkdir() 27 | 28 | n_cores_per_job = math.ceil(n_cores / max_SLURM_jobs) 29 | print(f'{max_SLURM_jobs} jobs, each running {n_cores_per_job} single core workers.', flush=True) 30 | 31 | cluster = SLURMCluster( 32 | processes = n_cores_per_job, 33 | cores = n_cores_per_job, 34 | memory = f'{3 * n_cores_per_job} GB', # Could turn this into a parameter for the config file 35 | job_extra_directives = [f'--{account=}', f'--{clusters=}', f'--{partition=}', f'--output={str(SLURM_info_files_dir)}/slurm-%j.out'], 36 | walltime = walltime, 37 | job_script_prologue = SLURM_job_prologue, # Used to setup each job and associated workers with the necessary modules and virtual environment 38 | death_timeout=300 39 | ) 40 | cluster.scale(jobs = max_SLURM_jobs) 41 | #cluster.adapt(minimum_jobs = 1, maximum_jobs = max_SLURM_jobs, interval = '4s') # NOTE: Adaptive scaling does not work, workers randomly die which kills the whole run. Setting minimum_jobs = max_SLURM_jobs does not solve the problem. 
42 | 43 | else: 44 | raise ValueError("compute_env must be either 'local' or 'SLURM'.") 45 | 46 | parallel_executor = Client(cluster) 47 | 48 | print(f'Dask dashboard link: ', parallel_executor.dashboard_link, flush=True) # To monitor how well the parallelization is going 49 | 50 | return parallel_executor 51 | 52 | def wait_and_remove(parallel_executor, futures): 53 | wait(futures) 54 | if futures: 55 | parallel_executor.cancel(futures) 56 | return -------------------------------------------------------------------------------- /src/evolvex/foldx_commands.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import shutil 4 | 5 | from evolvex.utils import NDIGIS_ROUNDING 6 | 7 | def get_alanine_mutant(full_residue_IDs): 8 | """ 9 | Given the list of full_residue_IDs, returns a comma separated string of mutation names to Alanine, 10 | following the FoldX naming convention (e.g 'LH4A,TH9A,KH17A'). 11 | """ 12 | mutation_names = ( 13 | f'{full_residue_ID}A' # e.g KH52A 14 | for full_residue_ID in full_residue_IDs 15 | ) 16 | return ','.join(mutation_names) 17 | 18 | def create_individual_list_foldx_mutations_file(mutant, output_dir, output_file_name='individual_list_foldx_mutations_file.txt'): 19 | """ 20 | Mutant should be a comma separated string of mutations that describe a mutant (e.g: "DH52A,LH80A,KH99A"). 21 | 22 | By default, generates a file named "individual_list_foldx_mutations_file.txt" in output_dir. 
23 | """ 24 | output_file_path = output_dir / output_file_name 25 | with open(output_file_path, 'wt') as file_handle: 26 | file_handle.write(f'{mutant};\n') 27 | 28 | return output_file_path 29 | 30 | 31 | def run_foldx_BuildModel( 32 | foldx_dir, PDB_file_dir, PDB_file_name, individual_list_foldx_mutations_file_path, move_neighbors_flag, vdwDesign, print_stdout, output_dir, output_file_tag, PDB_file_tag=None 33 | ): 34 | """ 35 | Returns the full paths of the mutant and wildtype PDB file generated by BuildModel. 36 | """ 37 | input_file = PDB_file_dir / f'{PDB_file_name}.pdb' 38 | assert input_file.exists(), f'{input_file} does not exist.' 39 | 40 | command = [ 41 | str(foldx_dir / 'foldx'), 42 | '--command', 'BuildModel', 43 | '--pdb-dir', str(PDB_file_dir), 44 | '--pdb', f'{PDB_file_name}.pdb', 45 | '--mutant-file', str(individual_list_foldx_mutations_file_path), 46 | '--pdbHydrogens', 'true', 47 | '--output-dir', str(output_dir), 48 | '--moveNeighbours', 'true' if move_neighbors_flag else 'false', 49 | '--vdwDesign', str(vdwDesign), 50 | '--screen', 'true' if print_stdout else 'false' 51 | ] 52 | if output_file_tag: 53 | command += ['--output-file', output_file_tag] 54 | 55 | subprocess.run(command, check=True, stdout=None if print_stdout else subprocess.DEVNULL) 56 | 57 | foldx_mutant_PDB_file_path = PDB_file_dir / f'{PDB_file_name}_1.pdb' 58 | foldx_wildtype_PDB_file_path = PDB_file_dir / f'WT_{PDB_file_name}_1.pdb' 59 | 60 | # A new PDB of the wildtype is not generated by FoldX when move_neighbors_flag is False, as it simply corresponds to the input PDB file, 61 | # but to keep it consistent we create a copy of the file with the expected name. 62 | if move_neighbors_flag == False: 63 | shutil.copy( 64 | src = PDB_file_dir / f'{PDB_file_name}.pdb', 65 | dst = foldx_wildtype_PDB_file_path 66 | ) 67 | 68 | # As for the other files generated by BuildModel, gives the possibility to add a tag to the generated PDB files. 
69 | if PDB_file_tag: 70 | foldx_mutant_PDB_file_path = foldx_mutant_PDB_file_path.rename(foldx_mutant_PDB_file_path.with_stem(f"{foldx_mutant_PDB_file_path.stem}_{PDB_file_tag}")) 71 | foldx_wildtype_PDB_file_path = foldx_wildtype_PDB_file_path.rename(foldx_wildtype_PDB_file_path.with_stem(f"{foldx_wildtype_PDB_file_path.stem}_{PDB_file_tag}")) 72 | 73 | return (foldx_mutant_PDB_file_path, foldx_wildtype_PDB_file_path) 74 | 75 | def run_foldx_AnalyseComplex(foldx_dir, PDB_file_dir, PDB_file_name, antibody_chains, antigen_chains, vdwDesign, print_stdout, output_dir, output_file_tag, with_predicted_waters=False): 76 | input_file = PDB_file_dir / f'{PDB_file_name}.pdb' 77 | assert input_file.exists(), f'{input_file} does not exist.' 78 | 79 | command = [ 80 | str(foldx_dir / 'foldx'), 81 | '--command', 'AnalyseComplex', 82 | '--pdb-dir', str(PDB_file_dir), 83 | '--pdb', f'{PDB_file_name}.pdb', 84 | '--analyseComplexChains', f'{antibody_chains},{antigen_chains}', 85 | '--vdwDesign', str(vdwDesign), 86 | '--output-dir', str(output_dir), 87 | '--screen', 'true' if print_stdout else 'false' 88 | ] 89 | 90 | if with_predicted_waters: 91 | command += [ 92 | '--water', '-PREDICT', 93 | '--ionStrength', '0.150', 94 | ] 95 | 96 | if output_file_tag: 97 | command += ['--output-file', output_file_tag] 98 | 99 | subprocess.run(command, check=True, stdout=None if print_stdout else subprocess.DEVNULL) 100 | return 101 | 102 | def run_foldx_Stability(foldx_dir, PDB_file_dir, PDB_file_name, vdwDesign, print_stdout, output_dir, output_file_tag): 103 | input_file = PDB_file_dir / f'{PDB_file_name}.pdb' 104 | assert input_file.exists(), f'{input_file} does not exist.' 
105 | 106 | command = [ 107 | str(foldx_dir / 'foldx'), 108 | '--command', 'Stability', 109 | '--pdb-dir', str(PDB_file_dir), 110 | '--pdb', f'{PDB_file_name}.pdb', 111 | '--vdwDesign', str(vdwDesign), 112 | '--output-dir', str(output_dir), 113 | ] 114 | if output_file_tag: 115 | command += ['--output-file', output_file_tag] 116 | 117 | subprocess.run(command, check=True, stdout=None if print_stdout else subprocess.DEVNULL) 118 | return 119 | 120 | 121 | def get_binding_dG(interaction_file_path): 122 | assert interaction_file_path.name.startswith('Interaction_'), "To obtain binding dG, provide an 'Interaction_' file generated by AnalyseComplex." 123 | 124 | with open(interaction_file_path, 'rt') as file_handle: 125 | lines = file_handle.readlines() 126 | 127 | binding_dG = float( lines[9].split('\t')[5] ) 128 | return round(binding_dG, NDIGIS_ROUNDING) 129 | 130 | def get_binding_ddG(wildtype_interaction_file_path, mutant_interaction_file_path): 131 | assert wildtype_interaction_file_path.name.startswith('Interaction_') and mutant_interaction_file_path.name.startswith('Interaction_'), "To obtain binding ddG, provide two 'Interaction_' files generated by AnalyseComplex." 
132 | 133 | wildtype_binding_dG = get_binding_dG(wildtype_interaction_file_path) 134 | mutant_binding_dG = get_binding_dG(mutant_interaction_file_path) 135 | 136 | binding_ddG = mutant_binding_dG - wildtype_binding_dG 137 | return round(binding_ddG, NDIGIS_ROUNDING) 138 | 139 | def get_chain_group_stability_dG(indiv_file_path, chain_group_name): 140 | assert indiv_file_path.name.startswith('Indiv_'), "To obtain the stability dG of a chain group, provide an 'Indiv_' file generated by AnalyseComplex" 141 | 142 | with open(indiv_file_path, 'rt') as file_handle: 143 | lines = file_handle.readlines() 144 | 145 | for line in lines[9:]: 146 | line = line.split('\t') 147 | if line[1] == chain_group_name: 148 | chain_group_stability_dG = float( line[2] ) 149 | return round(chain_group_stability_dG, NDIGIS_ROUNDING) 150 | 151 | raise ValueError(f'Could not find {chain_group_name=} in {indiv_file_path}') 152 | return 153 | 154 | def get_chain_group_stability_ddG(wildtype_indiv_file_path, mutant_indiv_file_path, chain_group_name): 155 | assert wildtype_indiv_file_path.name.startswith('Indiv_') and mutant_indiv_file_path.name.startswith('Indiv_'), "To obtain the stability ddG of a chain group, provide two 'Indiv_' files generated by AnalyseComplex." 
156 | 157 | wildtype_chain_group_stability_dG = get_chain_group_stability_dG(wildtype_indiv_file_path, chain_group_name) 158 | mutant_chain_group_stability_dG = get_chain_group_stability_dG(mutant_indiv_file_path, chain_group_name) 159 | 160 | chain_group_stability_ddG = mutant_chain_group_stability_dG - wildtype_chain_group_stability_dG 161 | return round(chain_group_stability_ddG, NDIGIS_ROUNDING) 162 | 163 | def get_complex_stability_dG(st_file_path): 164 | # NOTE: The same information could be obtained from the 'Raw_' file generated by BuildModel, so we could skip running Stability completely 165 | assert st_file_path.name.endswith('_ST.fxout'), "To obtain the stability dG of a complex, provide a '_ST.fxout' file generated by Stability." 166 | 167 | with open(st_file_path, 'rt') as file_handle: 168 | lines = file_handle.readlines() 169 | 170 | complex_stability_dG = float( lines[0].split('\t')[1] ) 171 | return round(complex_stability_dG, NDIGIS_ROUNDING) 172 | 173 | def get_complex_stability_ddG(dif_file_path): 174 | assert dif_file_path.name.startswith('Dif_'), "To obtain the stability ddG of a complex, provide a 'Dif_' file generated by BuildModel." 175 | 176 | with open(dif_file_path, 'rt') as file_handle: 177 | lines = file_handle.readlines() 178 | 179 | complex_stability_ddG = float( lines[9].split('\t')[1] ) 180 | return round(complex_stability_ddG, NDIGIS_ROUNDING) 181 | 182 | def get_chain_group_intraclash_score(interaction_file_path, chain_group_name): 183 | assert interaction_file_path.name.startswith('Interaction_'), "To obtain intraclash scores of a chain group, provide an 'Interaction_' file generated by AnalyseComplex." 
184 | 185 | with open(interaction_file_path, 'rt') as file_handle: 186 | lines = file_handle.readlines() 187 | 188 | line = lines[9].split('\t') 189 | intraclash_scores = { 190 | line[1]:float(line[3]), # e.g 'HL':2.363 191 | line[2]:float(line[4]) 192 | } 193 | return round(intraclash_scores[chain_group_name], NDIGIS_ROUNDING) 194 | 195 | def get_chain_group_delta_intraclash_score(wildtype_interaction_file_path, mutant_interaction_file_path, chain_group_name): 196 | assert wildtype_interaction_file_path.name.startswith('Interaction_') and mutant_interaction_file_path.name.startswith('Interaction_'), "To obtain a change in intraclash score of a chain group, provide two 'Interaction_' files generated by AnalyseComplex." 197 | 198 | wildtype_intraclash_score = get_chain_group_intraclash_score(wildtype_interaction_file_path, chain_group_name) 199 | mutant_intraclash_score = get_chain_group_intraclash_score(mutant_interaction_file_path, chain_group_name) 200 | 201 | chain_group_delta_intraclash_score = mutant_intraclash_score - wildtype_intraclash_score 202 | return round(chain_group_delta_intraclash_score, NDIGIS_ROUNDING) 203 | 204 | def get_all_other_interaction_file_info(interaction_file_path): 205 | assert interaction_file_path.name.startswith('Interaction_'), "To obtain all the information to the right of 'Backbone Hbond' from an interaction file, provide an 'Interaction_' file generated by AnalyseComplex." 
206 | 207 | with open(interaction_file_path, 'rt') as file_handle: 208 | lines = file_handle.readlines() 209 | 210 | column_names = lines[8].strip().split('\t') 211 | values = lines[9].strip().split('\t') 212 | 213 | return {column_names[i]:round(float(values[i]), NDIGIS_ROUNDING) for i in range(6, len(column_names))} 214 | -------------------------------------------------------------------------------- /src/evolvex/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from Bio.Data.IUPACData import protein_letters 4 | 5 | from dask.distributed import as_completed 6 | 7 | from evolvex.dask_parallel import setup_dask_parallel_executor 8 | from evolvex.mutate_interface import generate_Alanine_mutant, mutate_antibody_hotspot_position, generate_mutations_summary_file 9 | from evolvex.search_algorithms import GA_search, systematic_search 10 | from evolvex.model_generation import generate_initial_models 11 | from evolvex.dask_parallel import wait_and_remove 12 | from evolvex.command_line_interface import command_line_interface 13 | 14 | def main(): 15 | """ 16 | """ 17 | GLOBALS = command_line_interface() 18 | 19 | evolvex_working_dir = GLOBALS.working_dir / 'EvolveX'; evolvex_working_dir.mkdir(exist_ok = True) ### Should be false 20 | 21 | backbone_PDB_files_paths = list(GLOBALS.Backbones_dir.glob('*.pdb')) 22 | all_PDBs_positions_to_explore_df = pd.read_csv(GLOBALS.PositionsToExplore_file_path, header=0, sep='\t') 23 | 24 | parallel_executor = setup_dask_parallel_executor(GLOBALS) 25 | 26 | 27 | # Generate Alanine mutants of all PDB backbones 28 | futures_1 = [] 29 | for PDB_file_path in backbone_PDB_files_paths: 30 | PDB_positions_to_explore_df = all_PDBs_positions_to_explore_df[all_PDBs_positions_to_explore_df['Pdb'] == PDB_file_path.stem] 31 | 32 | future_1 = parallel_executor.submit( 33 | generate_Alanine_mutant, PDB_file_path, PDB_positions_to_explore_df, evolvex_working_dir, GLOBALS 34 | ) 35 | 
futures_1.append(future_1) 36 | 37 | # Explore all possible mutations for positions marked as 'AUTO' 38 | print('Exploring mutations at each position...', flush=True) 39 | residues_to_explore = set(protein_letters) - set(GLOBALS.residues_to_ignore) 40 | futures_2 = [] 41 | for future_1, result in as_completed(futures_1, with_results=True): 42 | foldx_Alanine_mutant_PDB_file_path, AUTO_Ala_positions_full_residue_IDs, output_dir = result 43 | future_1.release() 44 | for full_residue_ID in AUTO_Ala_positions_full_residue_IDs: 45 | hotspot_mutants_dir = output_dir / 'hotspot_mutants' / full_residue_ID 46 | hotspot_mutants_dir.mkdir(parents=True, exist_ok=True) 47 | for mutant_residue in residues_to_explore: 48 | future_2 = parallel_executor.submit( 49 | mutate_antibody_hotspot_position, foldx_Alanine_mutant_PDB_file_path, full_residue_ID, mutant_residue, hotspot_mutants_dir, GLOBALS 50 | ) 51 | futures_2.append(future_2) 52 | 53 | wait_and_remove(parallel_executor, futures_2) 54 | 55 | # Generate a summary file for each PDB backbone, which includes the ddG_binding, ddG_stability_complex and ddG_stability_antibody for all possible 56 | # mutations at each position. For positions with pre-selected mutations, the fields are artificially set to -100. 
57 | futures_3 = [] 58 | for PDB_file_path in backbone_PDB_files_paths: 59 | PDB_dir = evolvex_working_dir / PDB_file_path.stem 60 | PDB_positions_to_explore_df = all_PDBs_positions_to_explore_df[all_PDBs_positions_to_explore_df['Pdb'] == PDB_file_path.stem] 61 | 62 | future_3 = parallel_executor.submit( 63 | generate_mutations_summary_file, PDB_dir, PDB_positions_to_explore_df, GLOBALS 64 | ) 65 | futures_3.append(future_3) 66 | 67 | wait_and_remove(parallel_executor, futures_3) 68 | 69 | # Run search algorithm 70 | if GLOBALS.search_algorithm == 'systematic': 71 | systematic_search(parallel_executor, backbone_PDB_files_paths, evolvex_working_dir, GLOBALS) 72 | 73 | else: 74 | generated_models_info_file_path = GLOBALS.working_dir / 'generated_models_info.csv' 75 | model_PDB_files_dir = GLOBALS.working_dir / 'model_PDB_files'; model_PDB_files_dir.mkdir(exist_ok = True) 76 | 77 | print('Generating initial models...', flush=True) 78 | initial_models_population = generate_initial_models(parallel_executor, evolvex_working_dir, backbone_PDB_files_paths, GLOBALS) 79 | 80 | print('Running search...', flush=True) 81 | GA_search(parallel_executor, initial_models_population, generated_models_info_file_path, model_PDB_files_dir, GLOBALS) 82 | 83 | 84 | print('Finished.', flush=True) 85 | parallel_executor.close() 86 | return 87 | 88 | if __name__ == '__main__': 89 | main() -------------------------------------------------------------------------------- /src/evolvex/model_dataclasses.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from dataclasses import dataclass 3 | 4 | @dataclass 5 | class MC_Model: 6 | model_dir: Path 7 | full_residue_IDs_list: list[str] # Full residue IDs of the mutable positions 8 | 9 | # Constants needed for the MC 10 | backbone_PDB_file_name: str 11 | antibody_stability_dG_original_wildtype: float 12 | antibody_seq_map_original_wildtype: dict 13 | 
allowed_AA_mutations_per_position_map: dict 14 | 15 | @dataclass 16 | class GA_Model: 17 | model_dir: Path 18 | full_residue_IDs_list: list[str] 19 | 20 | # Parameters used to generate a Model's PDB file using BuildModel with only 1 mutation based on the parent instead of starting back from the Alanine PDB every time, which would be slow 21 | parent_model_dir: Path 22 | mutations_to_generate_PDB: list[str] -------------------------------------------------------------------------------- /src/evolvex/model_generation.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | from collections import defaultdict 4 | import statistics 5 | from pathlib import Path 6 | import random 7 | 8 | import pandas as pd 9 | 10 | from evolvex.model_dataclasses import MC_Model 11 | from evolvex.foldx_commands import create_individual_list_foldx_mutations_file, run_foldx_BuildModel, get_complex_stability_ddG, get_chain_group_stability_dG 12 | from evolvex.utils_bio import get_chain_to_sequence_map 13 | 14 | large_hydrophobic_residues = 'FILWY' 15 | 16 | def create_model(input_PDB_file_path, copy_PDB_file_to_output_dir, mutations_list, output_dir, GLOBALS, output_file_tag=None, PDB_file_tag=None): 17 | """ 18 | Creates a PDB model using BuildModel, taking as input a PDB file and a list of mutations. 
19 | """ 20 | output_dir.mkdir(exist_ok = True) 21 | 22 | if copy_PDB_file_to_output_dir: 23 | input_PDB_file_path = shutil.copy( 24 | src = input_PDB_file_path, 25 | dst = output_dir 26 | ) 27 | input_PDB_file_path = Path(input_PDB_file_path) 28 | 29 | individual_list_foldx_mutations_file_path = create_individual_list_foldx_mutations_file( 30 | mutant = ','.join(mutations_list), 31 | output_dir = output_dir 32 | ) 33 | 34 | run_foldx_BuildModel( 35 | foldx_dir=GLOBALS.foldx_dir, 36 | PDB_file_dir=input_PDB_file_path.parent, PDB_file_name=input_PDB_file_path.stem, 37 | individual_list_foldx_mutations_file_path=individual_list_foldx_mutations_file_path, 38 | move_neighbors_flag=True, 39 | vdwDesign=GLOBALS.vdwDesign, 40 | print_stdout=GLOBALS.print_stdout, 41 | output_dir=output_dir, output_file_tag=output_file_tag, PDB_file_tag=PDB_file_tag, 42 | ) 43 | 44 | return 45 | 46 | 47 | def get_acceptable_positions_mut_names_map(all_mutations_summary_df, PDB_name): 48 | acceptable_mutations_map = defaultdict(list) 49 | for position, position_df in all_mutations_summary_df.groupby('position'): 50 | binding_ddG_variance = statistics.variance(position_df.binding_ddG.values) 51 | antibody_stability_ddG_variance = statistics.variance(position_df.antibody_stability_ddG.values) 52 | mean_binding_and_stability_variance = (binding_ddG_variance + antibody_stability_ddG_variance) / 2 53 | for mut_name, row in position_df.iterrows(): 54 | mutant_residue = mut_name[-1] 55 | # This filters out mutations that are too destabilizing either in terms of binding or stability 56 | if row.complex_stability_ddG > 1 or row.binding_ddG > 1 or row.antibody_stability_ddG > 2: 57 | continue 58 | 59 | # We don't want mutations to large hydrophobics when mutations at that position don't seem to change binding or stability a lot (ddG values ~ 0) 60 | if mean_binding_and_stability_variance < 0.1 and mutant_residue in large_hydrophobic_residues: 61 | continue 62 | 63 | 
acceptable_mutations_map[position].append(mut_name) 64 | 65 | # If no mutation passes the filters, allow all mutations at that position and let the search algorithm find what's best 66 | if not position in acceptable_mutations_map: 67 | print(f'No acceptable mutations found for {position = } in {PDB_name = }. Allowing all mutations at that position during the search. Probably needs manual inspection.', flush=True) 68 | acceptable_mutations_map[position].extend(position_df.index.values) 69 | 70 | return acceptable_mutations_map 71 | 72 | def get_hotspot_positions_mut_names_map(all_mutations_summary_df): 73 | hotspot_mutations_map = defaultdict(list) 74 | for mut_name, row in all_mutations_summary_df.iterrows(): 75 | original_residue = row.original_residue 76 | mutant_residue = mut_name[-1] 77 | position = mut_name[2:-1] 78 | 79 | # Hotspot mutations need to strongly improve binding affinity and not be too destabilizing for the antibody 80 | if row.binding_ddG < -1.5 and row.antibody_stability_ddG < 1: 81 | hotspot_mutations_map[position].append(mut_name) 82 | 83 | # If mutating from Alanine to the original residue is highly stabilizing for the antibody, then we consider it as a hotspot mutation, even if it maybe 84 | # doesn't contribute to binding 85 | elif mutant_residue == original_residue and row.antibody_stability_ddG < -2: 86 | hotspot_mutations_map[position].append(mut_name) 87 | 88 | return hotspot_mutations_map 89 | 90 | def get_allowed_mutations_per_position_maps(PDB_name, all_mutations_summary_file_path): 91 | """ 92 | Returns two dictionaries: 93 | 1) A dict where keys are positions and values are mutation names (i.e {'53':['A53R', 'A53K', ...]}) 94 | 2) A dict where values are amino acid mutations (i.e {'53':['R', 'K', ...]}) 95 | """ 96 | all_mutations_summary_df = pd.read_csv(all_mutations_summary_file_path, header=0, index_col=0, dtype = {'position':str}) 97 | 98 | acceptable_positions_mut_names_map = 
def clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model):
    """
    Keep only *PDB_file_name_to_keep_as_model* inside *model_dir* and rename it to
    'model.pdb'. Every other file in the directory is deleted.
    """
    keep_name = PDB_file_name_to_keep_as_model
    for entry in model_dir.iterdir():
        if entry.name != keep_name:
            entry.unlink()

    kept_file_path = model_dir / keep_name
    if keep_name != 'model.pdb':
        kept_file_path.rename(kept_file_path.with_name('model.pdb'))
    return
def generate_random_model(
    PDB_name, foldx_Alanine_mutant_PDB_file_path, allowed_mut_names_per_position_map, allowed_AA_mutations_per_position_map, antibody_stability_dG_original_wildtype,
    antibody_seq_map_original_wildtype, model_dir, GLOBALS
):
    """
    Build one random member of the initial population.

    Random mutation lists are drawn and built with FoldX until the resulting complex is
    not too destabilized (complex_stability_ddG < 0.5); after 5 rejected attempts the
    6th attempt is accepted unconditionally. Returns an MC_Model wrapping *model_dir*,
    which is cleaned so that only 'model.pdb' remains.
    """
    model, n_tries = None, 0
    while model is None:  # 'is None' (identity test), not '== None' (PEP 8)
        random_mutations_list = get_random_mutations_list_for_initial_population(allowed_mut_names_per_position_map)

        create_model(
            input_PDB_file_path = foldx_Alanine_mutant_PDB_file_path,
            copy_PDB_file_to_output_dir = True,
            mutations_list = random_mutations_list,
            output_dir = model_dir,
            GLOBALS = GLOBALS
        )

        complex_stability_ddG = get_complex_stability_ddG(model_dir / f'Dif_{PDB_name}_1_Alanine_mutant.fxout')
        # Accept if not too destabilizing, or unconditionally on the last allowed retry
        if complex_stability_ddG < 0.5 or n_tries == 5:
            # mut_name 'AH52R' -> full residue ID 'RH52' (mutant residue + chain + position)
            full_residue_IDs_list = [f'{mut_name[-1]}{mut_name[1:-1]}' for mut_name in random_mutations_list]
            model = MC_Model(
                model_dir = model_dir,
                full_residue_IDs_list = full_residue_IDs_list,
                backbone_PDB_file_name = PDB_name,
                antibody_stability_dG_original_wildtype = antibody_stability_dG_original_wildtype,
                antibody_seq_map_original_wildtype = antibody_seq_map_original_wildtype,
                allowed_AA_mutations_per_position_map = allowed_AA_mutations_per_position_map,
            )
        else:
            shutil.rmtree(model_dir) # Empty the directory and try again with a new random list of mutations

        n_tries += 1

    clean_up_model_dir(model_dir = model_dir, PDB_file_name_to_keep_as_model = f'{PDB_name}_1_Alanine_mutant_1.pdb')
    return model
GLOBALS): 179 | futures = [] 180 | for PDB_file_path in backbone_PDB_files_paths: 181 | PDB_dir = evolvex_working_dir / PDB_file_path.stem 182 | PDB_name = PDB_dir.name 183 | 184 | foldx_Alanine_mutant_PDB_file_path = PDB_dir / f'{PDB_name}_1_Alanine_mutant.pdb' 185 | 186 | search_output_dir = PDB_dir / 'search_results'; search_output_dir.mkdir(exist_ok = True) 187 | 188 | all_mutations_summary_file_path = PDB_dir / 'hotspot_mutants' / 'all_mutations_summary.csv' 189 | if not all_mutations_summary_file_path.exists(): 190 | print(f'Could not find the all_mutations_summary.csv file for {PDB_name = }, this should not happen ! Skipping PDB backbone for search.', flush=True) 191 | continue 192 | 193 | allowed_mut_names_per_position_map, allowed_AA_mutations_per_position_map = get_allowed_mutations_per_position_maps( 194 | PDB_name = PDB_name, all_mutations_summary_file_path = all_mutations_summary_file_path 195 | ) 196 | 197 | antibody_stability_dG_original_wildtype = get_chain_group_stability_dG(indiv_file_path = PDB_dir / 'Indiv_energies_original_wildtype_AC.fxout', chain_group_name = GLOBALS.antibody_chains) 198 | antibody_seq_map_original_wildtype = get_chain_to_sequence_map(PDB_file_path = foldx_Alanine_mutant_PDB_file_path, chain_subset = GLOBALS.antibody_chains) 199 | 200 | for ith_model in range(GLOBALS.population_size): 201 | model_dir = search_output_dir / str(ith_model); model_dir.mkdir(exist_ok=True) 202 | future = parallel_executor.submit( 203 | generate_random_model, PDB_name, foldx_Alanine_mutant_PDB_file_path, allowed_mut_names_per_position_map, allowed_AA_mutations_per_position_map, 204 | antibody_stability_dG_original_wildtype, antibody_seq_map_original_wildtype, model_dir, GLOBALS 205 | ) 206 | futures.append(future) 207 | 208 | initial_models_population = parallel_executor.gather(futures) 209 | 210 | return initial_models_population -------------------------------------------------------------------------------- /src/evolvex/mutate_interface.py: 
def generate_Alanine_mutant(PDB_file_path, PDB_positions_to_explore_df, evolvex_working_dir, GLOBALS):
    """
    Mutate every MakeAla == 'Y' position of the backbone to Alanine with FoldX BuildModel.

    The backbone PDB is first copied into <evolvex_working_dir>/<PDB name>/ so the
    original file in the Backbones folder is never touched. AnalyseComplex is also run
    on the original wildtype complex, whose antibody stability is needed by the searches.

    Returns
    -------
    (foldx_Alanine_mutant_PDB_file_path, AUTO_Ala_positions_full_residue_IDs, output_dir)
        The second element lists the full residue IDs (e.g. 'KH52') of the Ala-mutated
        positions marked AA_Allowed == 'AUTO'.

    NOTE: adds a 'full_residue_ID' column to the caller's PDB_positions_to_explore_df.
    """
    PDB_file_name = PDB_file_path.stem
    output_dir = evolvex_working_dir / PDB_file_name; output_dir.mkdir(exist_ok=True)

    # PDB_file_path corresponds to the original file in the Backbones folder, which we do not want to modify. The PDB_file_path_copy is
    # located in the corresponding subfolder named after the PDB file name in evolvex_working_dir
    # (shutil.copy returns dst unchanged when dst names a file, so PDB_file_path_copy stays a Path)
    PDB_file_path_copy = shutil.copy(
        src = PDB_file_path,
        dst = output_dir / f'{PDB_file_name}.pdb'
    )

    antibody_chains, antigen_chains = GLOBALS.antibody_chains, GLOBALS.antigen_chains

    # PositionsToExplore possibilities:
    # 1) AA_Allowed=AUTO & MakeAla=Y --> Mutate to Alanine and find which mutations are worth exploring automatically
    # 2) AA_Allowed=string of AA & MakeAla=Y --> Mutate to Alanine and only allow the specified mutations
    # 3) AA_Allowed=string of AA & MakeAla=N --> Do not mutate to Alanine (i.e keep the wildtype residue) and only allow the specified mutations
    PDB_positions_to_explore_df['full_residue_ID'] = PDB_positions_to_explore_df.apply(lambda row:f'{row.Res1}{row.Chain}{row.number}', axis=1)
    positions_to_Ala_df = PDB_positions_to_explore_df[PDB_positions_to_explore_df['MakeAla'] == 'Y']
    if positions_to_Ala_df.empty:
        raise ValueError('There must be at least 1 position to mutate to Alanine.')

    positions_to_Ala_full_residue_IDs = positions_to_Ala_df.full_residue_ID.values
    Alanine_mutant = get_alanine_mutant(positions_to_Ala_full_residue_IDs)
    individual_list_foldx_mutations_file_path = create_individual_list_foldx_mutations_file(mutant = Alanine_mutant, output_dir = output_dir)

    foldx_Alanine_mutant_PDB_file_path, foldx_wildtype_PDB_file_path = run_foldx_BuildModel(
        foldx_dir=GLOBALS.foldx_dir,
        PDB_file_dir=PDB_file_path_copy.parent, PDB_file_name=PDB_file_name,
        individual_list_foldx_mutations_file_path=individual_list_foldx_mutations_file_path,
        move_neighbors_flag=False, # At this stage we do not need an optimized structure, it will get optimized later
        vdwDesign=GLOBALS.vdwDesign,
        print_stdout=GLOBALS.print_stdout,
        output_dir=output_dir, output_file_tag='Alanine_mutant', PDB_file_tag='Alanine_mutant'
    )

    # The stability of the original wildtype antibody is needed for the MC and GA searches
    run_foldx_AnalyseComplex(
        GLOBALS.foldx_dir,
        PDB_file_path_copy.parent, PDB_file_path_copy.stem,
        antibody_chains, antigen_chains,
        GLOBALS.vdwDesign,
        GLOBALS.print_stdout,
        output_dir, output_file_tag='original_wildtype'
    )

    AUTO_Ala_positions_full_residue_IDs = positions_to_Ala_df[positions_to_Ala_df['AA_Allowed'] == 'AUTO'].full_residue_ID.values
    return foldx_Alanine_mutant_PDB_file_path, AUTO_Ala_positions_full_residue_IDs, output_dir
def generate_mutations_summary_file(PDB_dir, PDB_positions_to_explore_df, GLOBALS):
    """
    Build hotspot_mutants/all_mutations_summary.csv for one PDB backbone.

    A summary DataFrame is created per explored position and all of them are
    concatenated into a single CSV (columns binding_ddG, complex_stability_ddG,
    antibody_stability_ddG, original_residue, position; indexed by mutation name,
    e.g. 'AH27C').

    Positions with an explicit AA_Allowed string get an 'artificial' summary with all
    energies set to the sentinel -100 (no FoldX runs exist for them); positions with
    AA_Allowed == 'AUTO' are parsed from the FoldX output files of each individual
    mutation folder.
    """
    PDB_name = PDB_dir.name

    # Create a summary_df per position, and then combine them all into one final df
    per_position_summary_dfs = []
    for _, row in PDB_positions_to_explore_df.iterrows():
        full_original_residue_ID = f'{row.Res1}{row.Chain}{row.number}'

        output_dir = PDB_dir / 'hotspot_mutants' / full_original_residue_ID

        binding_ddG_position, complex_stability_ddG_position, antibody_stability_ddG_position, index = [], [], [], []
        if row.AA_Allowed != 'AUTO':
            # raise (not assert, which is stripped under 'python -O') on invalid input
            if len(row.AA_Allowed) == 0:
                raise ValueError(f"Rows where AA_Allowed is not 'AUTO' must have at least one specified residue ({row = })")

            output_dir.mkdir(exist_ok=True)

            # Generate an 'artificial' summary df where all fields are set to -100 for each amino acid
            values = [-100] * len(row.AA_Allowed)
            binding_ddG_position.extend(values)
            complex_stability_ddG_position.extend(values)
            antibody_stability_ddG_position.extend(values)

            # The 'wildtype' side of the mutation name is Alanine when the position was Ala-scanned
            wildtype_AA = 'A' if row.MakeAla == 'Y' else full_original_residue_ID[0]
            index.extend(f'{wildtype_AA}{full_original_residue_ID[1:]}{mutant_AA}' for mutant_AA in row.AA_Allowed)

        else:
            for mut_name_dir in output_dir.iterdir(): # Iterate over each individual mutation folder that was generated (e.g AH27C, AH27D, ...)
                if not mut_name_dir.is_dir():
                    continue

                binding_ddG = get_binding_ddG(
                    wildtype_interaction_file_path = mut_name_dir / 'Interaction_hotspot_wildtype_AC.fxout', # Wildtype = alanine mutant
                    mutant_interaction_file_path = mut_name_dir / 'Interaction_hotspot_mutant_AC.fxout'
                )
                complex_stability_ddG = get_complex_stability_ddG(
                    dif_file_path = mut_name_dir / f'Dif_hotspot_mutant_{PDB_name}_1_Alanine_mutant.fxout'
                )
                antibody_stability_ddG = get_chain_group_stability_ddG(
                    wildtype_indiv_file_path = mut_name_dir / 'Indiv_energies_hotspot_wildtype_AC.fxout',
                    mutant_indiv_file_path = mut_name_dir / 'Indiv_energies_hotspot_mutant_AC.fxout',
                    chain_group_name = GLOBALS.antibody_chains
                )

                binding_ddG_position.append(binding_ddG)
                complex_stability_ddG_position.append(complex_stability_ddG)
                antibody_stability_ddG_position.append(antibody_stability_ddG)

                index.append(mut_name_dir.name)


        summary_df = pd.DataFrame(
            data = {
                'binding_ddG':binding_ddG_position,
                'complex_stability_ddG':complex_stability_ddG_position,
                'antibody_stability_ddG':antibody_stability_ddG_position
            },
            index = index
        ).sort_index()
        summary_df['original_residue'] = full_original_residue_ID[0]
        summary_df['position'] = full_original_residue_ID[2:]

        per_position_summary_dfs.append(summary_df)

    if len(per_position_summary_dfs) == 0:
        # Bug fix: previously execution fell through to pd.concat([]), which raises an
        # opaque ValueError. Report the problem and write no summary file instead.
        print(f'Something went wrong when generating the mutations summary file for {PDB_name = }.')
        return

    pd.concat(per_position_summary_dfs, axis=0).round(decimals = 6).to_csv(PDB_dir / 'hotspot_mutants' / 'all_mutations_summary.csv')
    return
def all_hotspot_and_acceptable_mutations_combinations_generator(all_mutations_summary_file_path):
    """
    Generator that yields all possible combinations of the hotspot and acceptable mutations.

    Each yielded value is a tuple of mutation names, one per position: every hotspot
    combination is completed with every acceptable combination of the remaining
    (non-hotspot) positions. Combinations are visited in random order.
    """
    # dtype str keeps the 'position' groupby keys consistent with the string keys of the
    # hotspot map (sliced from mutation names), matching get_allowed_mutations_per_position_maps.
    # Without it, the hotspot-position removal below never matches (int vs str keys).
    all_mutations_summary_df = pd.read_csv(all_mutations_summary_file_path, header=0, index_col=0, dtype = {'position':str})

    # The summary file lives at <PDB_dir>/hotspot_mutants/all_mutations_summary.csv, so the
    # PDB name (required second argument, used only in warning messages) is two levels up.
    PDB_name = all_mutations_summary_file_path.parent.parent.name
    acceptable_positions_mut_names_map = get_acceptable_positions_mut_names_map(all_mutations_summary_df, PDB_name)
    hotspot_positions_mut_names_map = get_hotspot_positions_mut_names_map(all_mutations_summary_df)

    # Remove hotspot positions from acceptable positions (pop with default: 'del' would
    # raise KeyError for a hotspot position absent from the acceptable map)
    for position in hotspot_positions_mut_names_map.keys():
        acceptable_positions_mut_names_map.pop(position, None)

    ### NOTE: This only explores combinations of mutations in one order, do we want to get mutation order variability for BuildModel ? Would need to additionally use itertools.combinations.
    all_hotspot_mutations_combinations = list(itertools.product(*hotspot_positions_mut_names_map.values())) # hotspot_positions_mut_names_map.values() returns a list of sublists, where each sublist are the mutations for a given position
    if len(all_hotspot_mutations_combinations) == 0:
        all_hotspot_mutations_combinations = [()]  # empty tuple (not list) so '+' below always concatenates tuples
    random.shuffle(all_hotspot_mutations_combinations)

    all_acceptable_mutations_combinations = list(itertools.product(*acceptable_positions_mut_names_map.values()))
    if len(all_acceptable_mutations_combinations) == 0:
        all_acceptable_mutations_combinations = [()]
    random.shuffle(all_acceptable_mutations_combinations)

    for hotspot_mutations_list in all_hotspot_mutations_combinations:
        for acceptable_mutations_list in all_acceptable_mutations_combinations:
            mutations_list = hotspot_mutations_list + acceptable_mutations_list

            yield mutations_list

    return
def systematic_search(parallel_executor, backbone_PDB_files_paths, evolvex_working_dir, GLOBALS):
    """
    Exhaustively build one model per combination of hotspot/acceptable mutations, for
    every backbone PDB. Models are built in parallel via *parallel_executor*; each
    combination gets its own numbered sub-directory under <PDB_dir>/search_results.
    """
    for PDB_file_path in backbone_PDB_files_paths:
        PDB_dir = evolvex_working_dir / PDB_file_path.stem
        PDB_name = PDB_dir.name

        search_output_dir = PDB_dir / 'search_results'; search_output_dir.mkdir(exist_ok = True) ### Should be False
        all_mutations_summary_file_path = PDB_dir / 'hotspot_mutants' / 'all_mutations_summary.csv'

        # Loop-invariant: the same Alanine-mutant backbone is used for every combination
        foldx_Alanine_mutant_PDB_file_path = PDB_dir / f'{PDB_name}_1_Alanine_mutant.pdb'

        futures = []
        for nth_model, mutations_list in enumerate(all_hotspot_and_acceptable_mutations_combinations_generator(all_mutations_summary_file_path)):
            output_dir = search_output_dir / str(nth_model)

            future = parallel_executor.submit(
                create_model, foldx_Alanine_mutant_PDB_file_path, True, mutations_list, output_dir, GLOBALS,
            )
            futures.append(future)

        wait_and_remove(parallel_executor, futures)

    return
def metropolis_criterion(energies):
    """
    Metropolis acceptance test over a collection of ddG values.

    Any favourable energy (< 0) is accepted outright. Otherwise a single uniform
    draw is shared by all energies, and the move is accepted if any Boltzmann
    probability exp(-E / 0.5919) reaches it (0.5919 kcal/mol ~ kT at room temperature).
    """
    for delta in energies:
        if delta < 0:
            return True

    threshold = random.random()
    return any(math.exp(-(delta / 0.5919)) >= threshold for delta in energies)
get_binding_dG(interaction_file_path = model_dir / f'Interaction_model_1_AC.fxout') 176 | binding_ddG = round(mutant_binding_dG - wildtype_binding_dG, NDIGIS_ROUNDING) 177 | 178 | if GLOBALS.calculate_binding_dG_with_water: 179 | wildtype_binding_dG_with_waters = get_binding_dG(interaction_file_path = model_dir / f'Interaction_wildtype_with_waters_AC.fxout') 180 | mutant_binding_dG_with_waters = get_binding_dG(interaction_file_path = model_dir / f'Interaction_mutant_with_waters_AC.fxout') 181 | binding_ddG_with_waters = round(mutant_binding_dG_with_waters - wildtype_binding_dG_with_waters, NDIGIS_ROUNDING) 182 | 183 | wildtype_antibody_intraclash_score = get_chain_group_intraclash_score(interaction_file_path = model_dir / f'Interaction_WT_model_1_AC.fxout', chain_group_name = antibody_chains) 184 | mutant_antibody_intraclash_score = get_chain_group_intraclash_score(interaction_file_path = model_dir / f'Interaction_model_1_AC.fxout', chain_group_name = antibody_chains) 185 | antibody_delta_intraclash_score = round(mutant_antibody_intraclash_score - wildtype_antibody_intraclash_score, NDIGIS_ROUNDING) 186 | 187 | # All columns from interactions file, starting from Backbone Hbond 188 | wildtype_other_info_map = get_all_other_interaction_file_info(interaction_file_path = model_dir / f'Interaction_WT_model_1_AC.fxout') 189 | mutant_other_info_map = get_all_other_interaction_file_info(interaction_file_path = model_dir / f'Interaction_model_1_AC.fxout') 190 | 191 | 192 | # Make decision 193 | if antibody_stability_ddG > 0.5 or mutant_antibody_stability_dG > (antibody_stability_dG_original_wildtype + 2): 194 | keep_mutant = False 195 | 196 | elif antibody_delta_intraclash_score > 0.5 or mutant_antibody_intraclash_score > 10: 197 | keep_mutant = False 198 | 199 | else: 200 | energies = (binding_ddG, binding_ddG_with_waters) if GLOBALS.calculate_binding_dG_with_water else (binding_ddG,) 201 | keep_mutant = metropolis_criterion(energies) 202 | 203 | # Log info of the selected 
def update_full_residue_IDs_list(model, mut_names_list):
    """
    Apply each accepted mutation to the model's residue-ID bookkeeping, in place.

    A mut_name like 'KH52R' means residue H52 changed from K to R: the stale entry
    'KH52' is removed from model.full_residue_IDs_list and 'RH52' appended.
    """
    for mut_name in mut_names_list:
        residue_ID = mut_name[1:-1]
        stale_entry = mut_name[0] + residue_ID
        fresh_entry = mut_name[-1] + residue_ID

        model.full_residue_IDs_list.remove(stale_entry)
        model.full_residue_IDs_list.append(fresh_entry)

    return
antigen_chains = GLOBALS.antibody_chains, GLOBALS.antigen_chains 237 | 238 | generated_models_info = defaultdict(list) 239 | generated_models_info['backbone_PDB_file_name'] = [backbone_PDB_file_name] * n_MC_steps 240 | generated_models_info['nth_model'] = [model.model_dir.name] * n_MC_steps 241 | generated_models_info['step'] = ['MC'] * n_MC_steps 242 | 243 | current_nth_MC_iteration = nth_loop * (n_MC_steps + 1) # +1 to account for the recombination steps 244 | for i in range(n_MC_steps): 245 | nth_iteration = current_nth_MC_iteration + i + 1 246 | generated_models_info['nth_iteration'].append(nth_iteration) 247 | 248 | full_residue_IDs_list = model.full_residue_IDs_list 249 | mut_name = get_random_mut_name(full_residue_IDs_list, allowed_AA_mutations_per_position_map) 250 | 251 | # Create the mutant with BuildModel, which will be called "model_1" 252 | model_dir = model.model_dir 253 | create_model( 254 | input_PDB_file_path = model_dir / 'model.pdb', copy_PDB_file_to_output_dir = False, mutations_list = [mut_name], output_dir = model_dir, GLOBALS = GLOBALS, 255 | ) 256 | 257 | run_foldx_commands( 258 | mutant_PDB_file_path = model_dir / 'model_1.pdb', wildtype_PDB_file_path = model_dir / 'WT_model_1.pdb', 259 | antibody_chains = antibody_chains, antigen_chains = antigen_chains, 260 | output_dir = model_dir, GLOBALS = GLOBALS 261 | ) 262 | 263 | keep_mutant = keep_mutant_decision(model_dir, antibody_chains, antigen_chains, antibody_stability_dG_original_wildtype, iteration_fraction, generated_models_info, GLOBALS) 264 | if keep_mutant: 265 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model_1.pdb') 266 | update_full_residue_IDs_list(model, mut_names_list = [mut_name]) 267 | 268 | else: 269 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model.pdb') 270 | 271 | save_compressed_PDB_file( 272 | PDB_file_path = model_dir / 'model.pdb', 273 | output_name = f'{backbone_PDB_file_name}_{model.model_dir.name}_{nth_iteration}.pdb', 274 
def random_model_pairing_generator(models_population):
    """
    Yield random disjoint pairs of models, pairing only models built on the same
    backbone PDB.

    Models are grouped by backbone, each group is shuffled, and consecutive shuffled
    models are paired. If a group has an odd number of models, the last (unpaired)
    model is not yielded.
    """
    backbone_PDB_grouped_models_map = defaultdict(list)
    for model in models_population:
        backbone_PDB_file_name = model.backbone_PDB_file_name
        backbone_PDB_grouped_models_map[backbone_PDB_file_name].append(model)

    for _, backbone_models_list in backbone_PDB_grouped_models_map.items():
        random.shuffle(backbone_models_list)

        # Bug fix: stop at len - 1 so an odd-sized group no longer raises IndexError on
        # backbone_models_list[i+1] for the final, unpaired model.
        for i in range(0, len(backbone_models_list) - 1, 2):
            yield (backbone_models_list[i], backbone_models_list[i+1])

    return
model_2.full_residue_IDs_list} 314 | 315 | shared_residue_IDs = set.intersection(set(residue_ID_to_AA_map_1.keys()), set(residue_ID_to_AA_map_2.keys())) 316 | sorted_shared_residue_IDs = sorted(shared_residue_IDs, key = lambda residue_ID:(residue_ID[0], int(residue_ID[1:]))) 317 | if len(sorted_shared_residue_IDs) < 2: 318 | raise ValueError(f'Cannot perform recombination between {model_1.model_dir} and {model_2.model_dir} because {shared_residue_IDs = }, and at least 2 shared residue IDs are needed.') 319 | 320 | recombination_location = random.randint(1, len(sorted_shared_residue_IDs) - 1) 321 | mut_names_1, mut_names_2 = [], [] 322 | for residue_ID in sorted_shared_residue_IDs[:recombination_location]: 323 | AA_1 = residue_ID_to_AA_map_1[residue_ID] 324 | AA_2 = residue_ID_to_AA_map_2[residue_ID] 325 | 326 | mut_names_1.append(f'{AA_1}{residue_ID}{AA_2}') # e.g 'KH52R' 327 | mut_names_2.append(f'{AA_2}{residue_ID}{AA_1}') 328 | 329 | return (mut_names_1, mut_names_2) 330 | 331 | def make_recombination_step(model_1, model_2, nth_iteration, iteration_fraction, model_PDB_files_dir, GLOBALS): 332 | # These variables are the same for both model 1 and 2 333 | backbone_PDB_file_name = model_1.backbone_PDB_file_name 334 | antibody_stability_dG_original_wildtype = model_1.antibody_stability_dG_original_wildtype 335 | antibody_seq_map_original_wildtype = model_1.antibody_seq_map_original_wildtype 336 | antibody_chains, antigen_chains = GLOBALS.antibody_chains, GLOBALS.antigen_chains 337 | 338 | generated_models_info = defaultdict(list) 339 | generated_models_info['backbone_PDB_file_name'] = [backbone_PDB_file_name] * 2 340 | generated_models_info['nth_model'] = [model_1.model_dir.name, model_2.model_dir.name] 341 | generated_models_info['step'] = ['recombination'] * 2 342 | generated_models_info['nth_iteration'] = [nth_iteration] * 2 343 | 344 | mut_names_1, mut_names_2 = get_recombination_mut_names(model_1, model_2) 345 | for mut_names, model in [(mut_names_1, model_1), 
(mut_names_2, model_2)]: 346 | model_dir = model.model_dir 347 | full_residue_IDs_list = model.full_residue_IDs_list 348 | 349 | create_model( 350 | input_PDB_file_path = model_dir / 'model.pdb', copy_PDB_file_to_output_dir = False, mutations_list = mut_names, output_dir = model_dir, GLOBALS = GLOBALS 351 | ) 352 | 353 | run_foldx_commands( 354 | mutant_PDB_file_path = model_dir / 'model_1.pdb', wildtype_PDB_file_path = model_dir / 'WT_model_1.pdb', 355 | antibody_chains = antibody_chains, antigen_chains = antigen_chains, 356 | output_dir = model_dir, GLOBALS = GLOBALS 357 | ) 358 | 359 | keep_mutant = keep_mutant_decision(model_dir, antibody_chains, antigen_chains, antibody_stability_dG_original_wildtype, iteration_fraction, generated_models_info, GLOBALS) 360 | if keep_mutant: 361 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model_1.pdb') 362 | update_full_residue_IDs_list(model, mut_names_list = mut_names) 363 | 364 | else: 365 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model.pdb') 366 | 367 | save_compressed_PDB_file( 368 | PDB_file_path = model_dir / 'model.pdb', 369 | output_name = f'{backbone_PDB_file_name}_{model.model_dir.name}_{nth_iteration}.pdb', 370 | output_dir = model_PDB_files_dir 371 | ) 372 | 373 | generated_models_info['residue_IDs'].append(';'.join(full_residue_IDs_list)) 374 | generated_models_info['from_mut_name'].append(';'.join(mut_names)) 375 | generated_models_info['mutation_accepted'].append(keep_mutant) 376 | 377 | return model_1, model_2, generated_models_info 378 | 379 | def GA_search(parallel_executor, initial_models_population, generated_models_info_file_path, model_PDB_files_dir, GLOBALS): 380 | recombine_every_nth_iteration = GLOBALS.recombine_every_nth_iteration 381 | max_iterations = GLOBALS.max_iterations 382 | 383 | # Each loop = n MC steps + 1 recombination step 384 | n_MC_and_recombination_loops = max_iterations // recombine_every_nth_iteration 385 | n_MC_steps_per_loop = 
recombine_every_nth_iteration - 1 386 | 387 | generated_models_info_file_handle = open(generated_models_info_file_path, 'a') 388 | 389 | models_population = initial_models_population 390 | for nth_loop in range(n_MC_and_recombination_loops): 391 | iteration_fraction = nth_loop / n_MC_and_recombination_loops 392 | 393 | # MC steps 394 | futures = [] 395 | for model in models_population: 396 | future = parallel_executor.submit(make_MC_steps, model, n_MC_steps_per_loop, nth_loop, iteration_fraction, model_PDB_files_dir, GLOBALS) 397 | futures.append(future) 398 | 399 | models_population = [] 400 | for _, (model, generated_models_info) in as_completed(futures, with_results=True): 401 | models_population.append(model) 402 | write_generated_models_info(generated_models_info, generated_models_info_file_handle) 403 | parallel_executor.cancel(futures) 404 | 405 | # Recombination step. The models are recombined with a model from the same backbone. 406 | nth_iteration = (nth_loop + 1) * recombine_every_nth_iteration 407 | futures = [] 408 | for model_1, model_2 in random_model_pairing_generator(models_population): 409 | future = parallel_executor.submit(make_recombination_step, model_1, model_2, nth_iteration, iteration_fraction, model_PDB_files_dir, GLOBALS) 410 | futures.append(future) 411 | 412 | models_population = [] 413 | for _, (model_1, model_2, generated_models_info) in as_completed(futures, with_results=True): 414 | models_population.append(model_1) 415 | models_population.append(model_2) 416 | write_generated_models_info(generated_models_info, generated_models_info_file_handle) 417 | parallel_executor.cancel(futures) 418 | 419 | generated_models_info_file_handle.close() 420 | return -------------------------------------------------------------------------------- /src/evolvex/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import tarfile 3 | 4 | NDIGIS_ROUNDING = 4 5 | 6 | def save_compressed_PDB_file(PDB_file_path, 
output_name, output_dir): 7 | with tarfile.open(output_dir / f'{output_name}.tar.gz', 'w:gz') as tar_file_handle: 8 | tar_file_handle.add(PDB_file_path, arcname=output_name) 9 | return -------------------------------------------------------------------------------- /src/evolvex/utils_bio.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from Bio import SeqIO 4 | from Bio.PDB import PDBParser 5 | from Bio.Data.IUPACData import protein_letters_3to1 6 | 7 | from Bio import BiopythonParserWarning 8 | from Bio.PDB.PDBExceptions import PDBConstructionWarning 9 | 10 | warnings.simplefilter('ignore', PDBConstructionWarning) 11 | warnings.simplefilter('ignore', BiopythonParserWarning) 12 | 13 | def get_residue_ID_to_residue_name_map(PDB_file_path): 14 | """ 15 | Returns a dict where keys are residue IDs (e.g A52, for residue 52 in chain A) and keys are single letter residues (e.g G for Glycine) 16 | """ 17 | parser = PDBParser(QUIET=True) 18 | structure = parser.get_structure(id='', file=PDB_file_path) 19 | 20 | residue_ID_to_residue_name_map = {} 21 | for residue in structure.get_residues(): 22 | _, _, chain, (_, number, _) = residue.full_id 23 | residue_ID = f'{chain}{number}' 24 | residue_name = protein_letters_3to1[residue.resname.title()] 25 | 26 | residue_ID_to_residue_name_map[residue_ID] = residue_name 27 | 28 | return residue_ID_to_residue_name_map 29 | 30 | def get_chain_to_sequence_map(PDB_file_path, chain_subset): 31 | chain_to_sequence_map = { 32 | record.annotations['chain']:str(record.seq).replace('X', '') # When extracting the sequences from ATOMS lines, SeqIO.parse introduces an 'X' when there are consecutive residues with a number difference > 1 (e.g: A1,Y2,P5 => AYXXP), but we don't want that. 
33 | for record in SeqIO.parse(PDB_file_path, format = 'pdb-atom') 34 | if record.annotations['chain'] in chain_subset 35 | } 36 | return chain_to_sequence_map --------------------------------------------------------------------------------