├── .gitignore ├── LICENSE ├── PositionsToExplore_Vsig4.tsv ├── PositionsToExplore_example.tsv ├── README.md ├── Vsig4_example └── Backbones │ ├── 5IMKBBmove1.pdb │ ├── 5IMKBBmove10.pdb │ ├── 5IMKBBmove11.pdb │ ├── 5IMKBBmove12.pdb │ ├── 5IMKBBmove13.pdb │ ├── 5IMKBBmove14.pdb │ ├── 5IMKBBmove2.pdb │ ├── 5IMKBBmove3.pdb │ ├── 5IMKBBmove4.pdb │ ├── 5IMKBBmove5.pdb │ ├── 5IMKBBmove6.pdb │ ├── 5IMKBBmove7.pdb │ ├── 5IMKBBmove8.pdb │ ├── 5IMKBBmove9.pdb │ ├── RP5imkBA.pdb │ └── RP5imlBA.pdb ├── evolvex_config_Vsig4.yaml ├── evolvex_config_example.yaml ├── evolvex_slurm_head_example.sbatch ├── setup.py └── src └── evolvex ├── __init__.py ├── command_line_interface.py ├── dask_parallel.py ├── foldx_commands.py ├── main.py ├── model_dataclasses.py ├── model_generation.py ├── mutate_interface.py ├── search_algorithms.py ├── utils.py └── utils_bio.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # VSCode 86 | .vscode/* 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | 165 | # Project specific 166 | .ipynb -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /PositionsToExplore_Vsig4.tsv: -------------------------------------------------------------------------------- 1 | Pdb number Res1 Chain AA_Allowed MakeAla 2 | RP5imkBA 52 R N AUTO Y 3 | RP5imkBA 53 W N AUTO Y 4 | RP5imkBA 54 N N AUTO Y 5 | RP5imkBA 56 G N AUTO Y 6 | RP5imkBA 57 S N AUTO Y 7 | RP5imkBA 100 R N AUTO Y 8 | RP5imkBA 101 W N AUTO Y 9 | RP5imkBA 102 D N AUTO Y 10 | RP5imkBA 103 K N AUTO Y 11 | RP5imkBA 104 Y N AUTO Y 12 | RP5imkBA 106 S N AUTO Y 13 | RP5imkBA 107 S N AUTO Y 14 | RP5imkBA 108 F N AUTO Y 15 | RP5imkBA 110 D N AUTO Y 16 | RP5imkBA 111 E N AUTO Y 17 | RP5imkBA 112 Y N AUTO Y 18 | RP5imkBA 113 D N AUTO Y 19 | RP5imlBA 52 R N AUTO Y 20 | RP5imlBA 53 W N AUTO Y 21 | RP5imlBA 54 N N AUTO Y 22 | RP5imlBA 56 G N AUTO Y 23 | RP5imlBA 57 S N AUTO Y 24 | RP5imlBA 100 R N AUTO Y 25 | RP5imlBA 101 W N AUTO Y 26 | RP5imlBA 102 D N AUTO Y 27 | RP5imlBA 103 K N AUTO Y 28 | RP5imlBA 104 Y N AUTO Y 29 | RP5imlBA 106 S N AUTO Y 30 | RP5imlBA 107 S N AUTO Y 31 | RP5imlBA 108 F N AUTO Y 32 | RP5imlBA 110 D N AUTO Y 33 | RP5imlBA 111 E N 
AUTO Y 34 | RP5imlBA 112 Y N AUTO Y 35 | RP5imlBA 113 D N AUTO Y 36 | RP5immBA 52 R N AUTO Y 37 | RP5immBA 53 W N AUTO Y 38 | RP5immBA 54 N N AUTO Y 39 | RP5immBA 56 G N AUTO Y 40 | RP5immBA 57 S N AUTO Y 41 | RP5immBA 100 R N AUTO Y 42 | RP5immBA 101 W N AUTO Y 43 | RP5immBA 102 D N AUTO Y 44 | RP5immBA 103 K N AUTO Y 45 | RP5immBA 104 Y N AUTO Y 46 | RP5immBA 106 S N AUTO Y 47 | RP5immBA 107 S N AUTO Y 48 | RP5immBA 108 F N AUTO Y 49 | RP5immBA 110 D N AUTO Y 50 | RP5immBA 111 E N AUTO Y 51 | RP5immBA 112 Y N AUTO Y 52 | RP5immBA 113 D N AUTO Y 53 | RP5imoBA 52 R N AUTO Y 54 | RP5imoBA 53 W N AUTO Y 55 | RP5imoBA 54 N N AUTO Y 56 | RP5imoBA 56 G N AUTO Y 57 | RP5imoBA 57 S N AUTO Y 58 | RP5imoBA 100 R N AUTO Y 59 | RP5imoBA 101 W N AUTO Y 60 | RP5imoBA 102 D N AUTO Y 61 | RP5imoBA 103 K N AUTO Y 62 | RP5imoBA 104 Y N AUTO Y 63 | RP5imoBA 106 S N AUTO Y 64 | RP5imoBA 107 S N AUTO Y 65 | RP5imoBA 108 F N AUTO Y 66 | RP5imoBA 110 D N AUTO Y 67 | RP5imoBA 111 E N AUTO Y 68 | RP5imoBA 112 Y N AUTO Y 69 | RP5imoBA 113 D N AUTO Y 70 | Shake5imkBA 52 R N AUTO Y 71 | Shake5imkBA 53 W N AUTO Y 72 | Shake5imkBA 54 N N AUTO Y 73 | Shake5imkBA 56 G N AUTO Y 74 | Shake5imkBA 57 S N AUTO Y 75 | Shake5imkBA 100 R N AUTO Y 76 | Shake5imkBA 101 W N AUTO Y 77 | Shake5imkBA 102 D N AUTO Y 78 | Shake5imkBA 103 K N AUTO Y 79 | Shake5imkBA 104 Y N AUTO Y 80 | Shake5imkBA 106 S N AUTO Y 81 | Shake5imkBA 107 S N AUTO Y 82 | Shake5imkBA 108 F N AUTO Y 83 | Shake5imkBA 110 D N AUTO Y 84 | Shake5imkBA 111 E N AUTO Y 85 | Shake5imkBA 112 Y N AUTO Y 86 | Shake5imkBA 113 D N AUTO Y 87 | Shake5imlBA 52 R N AUTO Y 88 | Shake5imlBA 53 W N AUTO Y 89 | Shake5imlBA 54 N N AUTO Y 90 | Shake5imlBA 56 G N AUTO Y 91 | Shake5imlBA 57 S N AUTO Y 92 | Shake5imlBA 100 R N AUTO Y 93 | Shake5imlBA 101 W N AUTO Y 94 | Shake5imlBA 102 D N AUTO Y 95 | Shake5imlBA 103 K N AUTO Y 96 | Shake5imlBA 104 Y N AUTO Y 97 | Shake5imlBA 106 S N AUTO Y 98 | Shake5imlBA 107 S N AUTO Y 99 | Shake5imlBA 108 F N AUTO Y 100 | 
Shake5imlBA 110 D N AUTO Y 101 | Shake5imlBA 111 E N AUTO Y 102 | Shake5imlBA 112 Y N AUTO Y 103 | Shake5imlBA 113 D N AUTO Y 104 | Shake5immBA 52 R N AUTO Y 105 | Shake5immBA 53 W N AUTO Y 106 | Shake5immBA 54 N N AUTO Y 107 | Shake5immBA 56 G N AUTO Y 108 | Shake5immBA 57 S N AUTO Y 109 | Shake5immBA 100 R N AUTO Y 110 | Shake5immBA 101 W N AUTO Y 111 | Shake5immBA 102 D N AUTO Y 112 | Shake5immBA 103 K N AUTO Y 113 | Shake5immBA 104 Y N AUTO Y 114 | Shake5immBA 106 S N AUTO Y 115 | Shake5immBA 107 S N AUTO Y 116 | Shake5immBA 108 F N AUTO Y 117 | Shake5immBA 110 D N AUTO Y 118 | Shake5immBA 111 E N AUTO Y 119 | Shake5immBA 112 Y N AUTO Y 120 | Shake5immBA 113 D N AUTO Y 121 | Shake5imoBA 52 R N AUTO Y 122 | Shake5imoBA 53 W N AUTO Y 123 | Shake5imoBA 54 N N AUTO Y 124 | Shake5imoBA 56 G N AUTO Y 125 | Shake5imoBA 57 S N AUTO Y 126 | Shake5imoBA 100 R N AUTO Y 127 | Shake5imoBA 101 W N AUTO Y 128 | Shake5imoBA 102 D N AUTO Y 129 | Shake5imoBA 103 K N AUTO Y 130 | Shake5imoBA 104 Y N AUTO Y 131 | Shake5imoBA 106 S N AUTO Y 132 | Shake5imoBA 107 S N AUTO Y 133 | Shake5imoBA 108 F N AUTO Y 134 | Shake5imoBA 110 D N AUTO Y 135 | Shake5imoBA 111 E N AUTO Y 136 | Shake5imoBA 112 Y N AUTO Y 137 | Shake5imoBA 113 D N AUTO Y 138 | 5IMKBBmove1 52 R N AUTO Y 139 | 5IMKBBmove1 53 W N AUTO Y 140 | 5IMKBBmove1 54 N N AUTO Y 141 | 5IMKBBmove1 56 G N AUTO Y 142 | 5IMKBBmove1 57 S N AUTO Y 143 | 5IMKBBmove1 100 R N AUTO Y 144 | 5IMKBBmove1 101 W N AUTO Y 145 | 5IMKBBmove1 102 D N AUTO Y 146 | 5IMKBBmove1 103 K N AUTO Y 147 | 5IMKBBmove1 104 Y N AUTO Y 148 | 5IMKBBmove1 106 K N AUTO Y 149 | 5IMKBBmove1 107 F N AUTO Y 150 | 5IMKBBmove1 109 D N AUTO Y 151 | 5IMKBBmove1 110 E N AUTO Y 152 | 5IMKBBmove1 111 Y N AUTO Y 153 | 5IMKBBmove1 112 D N AUTO Y 154 | 5IMKBBmove2 52 R N AUTO Y 155 | 5IMKBBmove2 53 W N AUTO Y 156 | 5IMKBBmove2 54 N N AUTO Y 157 | 5IMKBBmove2 56 G N AUTO Y 158 | 5IMKBBmove2 57 S N AUTO Y 159 | 5IMKBBmove2 100 R N AUTO Y 160 | 5IMKBBmove2 101 W N AUTO Y 161 | 
5IMKBBmove2 102 D N AUTO Y 162 | 5IMKBBmove2 103 K N AUTO Y 163 | 5IMKBBmove2 104 Y N AUTO Y 164 | 5IMKBBmove2 106 A N AUTO Y 165 | 5IMKBBmove2 107 A N AUTO Y 166 | 5IMKBBmove2 108 F N AUTO Y 167 | 5IMKBBmove2 110 D N AUTO Y 168 | 5IMKBBmove2 111 E N AUTO Y 169 | 5IMKBBmove2 112 Y N AUTO Y 170 | 5IMKBBmove2 113 D N AUTO Y 171 | 5IMKBBmove3 52 R N AUTO Y 172 | 5IMKBBmove3 53 W N AUTO Y 173 | 5IMKBBmove3 54 N N AUTO Y 174 | 5IMKBBmove3 56 G N AUTO Y 175 | 5IMKBBmove3 57 S N AUTO Y 176 | 5IMKBBmove3 100 R N AUTO Y 177 | 5IMKBBmove3 101 W N AUTO Y 178 | 5IMKBBmove3 102 D N AUTO Y 179 | 5IMKBBmove3 103 K N AUTO Y 180 | 5IMKBBmove3 104 Y N AUTO Y 181 | 5IMKBBmove3 106 G N AUTO Y 182 | 5IMKBBmove3 107 G N AUTO Y 183 | 5IMKBBmove3 108 F N AUTO Y 184 | 5IMKBBmove3 110 D N AUTO Y 185 | 5IMKBBmove3 111 E N AUTO Y 186 | 5IMKBBmove3 112 Y N AUTO Y 187 | 5IMKBBmove3 113 D N AUTO Y 188 | 5IMKBBmove4 52 R N AUTO Y 189 | 5IMKBBmove4 53 W N AUTO Y 190 | 5IMKBBmove4 54 N N AUTO Y 191 | 5IMKBBmove4 56 G N AUTO Y 192 | 5IMKBBmove4 57 S N AUTO Y 193 | 5IMKBBmove4 100 R N AUTO Y 194 | 5IMKBBmove4 101 W N AUTO Y 195 | 5IMKBBmove4 102 D N AUTO Y 196 | 5IMKBBmove4 103 K N AUTO Y 197 | 5IMKBBmove4 104 Y N AUTO Y 198 | 5IMKBBmove4 106 G N AUTO Y 199 | 5IMKBBmove4 107 G N AUTO Y 200 | 5IMKBBmove4 108 F N AUTO Y 201 | 5IMKBBmove4 110 D N AUTO Y 202 | 5IMKBBmove4 111 E N AUTO Y 203 | 5IMKBBmove4 112 Y N AUTO Y 204 | 5IMKBBmove4 113 D N AUTO Y 205 | 5IMKBBmove5 52 R N AUTO Y 206 | 5IMKBBmove5 53 W N AUTO Y 207 | 5IMKBBmove5 54 N N AUTO Y 208 | 5IMKBBmove5 56 G N AUTO Y 209 | 5IMKBBmove5 57 S N AUTO Y 210 | 5IMKBBmove5 100 R N AUTO Y 211 | 5IMKBBmove5 101 W N AUTO Y 212 | 5IMKBBmove5 102 D N AUTO Y 213 | 5IMKBBmove5 103 K N AUTO Y 214 | 5IMKBBmove5 104 Y N AUTO Y 215 | 5IMKBBmove5 106 G N AUTO Y 216 | 5IMKBBmove5 107 G N AUTO Y 217 | 5IMKBBmove5 108 F N AUTO Y 218 | 5IMKBBmove5 110 D N AUTO Y 219 | 5IMKBBmove5 111 E N AUTO Y 220 | 5IMKBBmove5 112 Y N AUTO Y 221 | 5IMKBBmove5 113 D N AUTO Y 222 | 
5IMKBBmove6 52 R N AUTO Y 223 | 5IMKBBmove6 53 W N AUTO Y 224 | 5IMKBBmove6 54 N N AUTO Y 225 | 5IMKBBmove6 56 G N AUTO Y 226 | 5IMKBBmove6 57 S N AUTO Y 227 | 5IMKBBmove6 100 R N AUTO Y 228 | 5IMKBBmove6 101 W N AUTO Y 229 | 5IMKBBmove6 102 D N AUTO Y 230 | 5IMKBBmove6 103 K N AUTO Y 231 | 5IMKBBmove6 104 Y N AUTO Y 232 | 5IMKBBmove6 106 G N AUTO Y 233 | 5IMKBBmove6 107 G N AUTO Y 234 | 5IMKBBmove6 108 F N AUTO Y 235 | 5IMKBBmove6 110 D N AUTO Y 236 | 5IMKBBmove6 111 E N AUTO Y 237 | 5IMKBBmove6 112 Y N AUTO Y 238 | 5IMKBBmove6 113 D N AUTO Y 239 | 5IMKBBmove7 52 R N AUTO Y 240 | 5IMKBBmove7 53 W N AUTO Y 241 | 5IMKBBmove7 54 N N AUTO Y 242 | 5IMKBBmove7 56 G N AUTO Y 243 | 5IMKBBmove7 57 S N AUTO Y 244 | 5IMKBBmove7 100 R N AUTO Y 245 | 5IMKBBmove7 101 W N AUTO Y 246 | 5IMKBBmove7 102 D N AUTO Y 247 | 5IMKBBmove7 103 K N AUTO Y 248 | 5IMKBBmove7 104 Y N AUTO Y 249 | 5IMKBBmove7 106 G N AUTO Y 250 | 5IMKBBmove7 107 G N AUTO Y 251 | 5IMKBBmove7 108 F N AUTO Y 252 | 5IMKBBmove7 110 D N AUTO Y 253 | 5IMKBBmove7 111 E N AUTO Y 254 | 5IMKBBmove7 112 Y N AUTO Y 255 | 5IMKBBmove7 113 D N AUTO Y 256 | 5IMKBBmove8 52 R N AUTO Y 257 | 5IMKBBmove8 53 W N AUTO Y 258 | 5IMKBBmove8 54 N N AUTO Y 259 | 5IMKBBmove8 56 G N AUTO Y 260 | 5IMKBBmove8 57 S N AUTO Y 261 | 5IMKBBmove8 100 R N AUTO Y 262 | 5IMKBBmove8 101 W N AUTO Y 263 | 5IMKBBmove8 102 D N AUTO Y 264 | 5IMKBBmove8 103 K N AUTO Y 265 | 5IMKBBmove8 104 Y N AUTO Y 266 | 5IMKBBmove8 106 S N AUTO Y 267 | 5IMKBBmove8 107 S N AUTO Y 268 | 5IMKBBmove8 108 F N AUTO Y 269 | 5IMKBBmove8 110 D N AUTO Y 270 | 5IMKBBmove8 111 E N AUTO Y 271 | 5IMKBBmove8 112 Y N AUTO Y 272 | 5IMKBBmove8 113 D N AUTO Y 273 | 5IMKBBmove9 52 R N AUTO Y 274 | 5IMKBBmove9 53 W N AUTO Y 275 | 5IMKBBmove9 54 N N AUTO Y 276 | 5IMKBBmove9 56 G N AUTO Y 277 | 5IMKBBmove9 57 S N AUTO Y 278 | 5IMKBBmove9 100 R N AUTO Y 279 | 5IMKBBmove9 101 W N AUTO Y 280 | 5IMKBBmove9 102 D N AUTO Y 281 | 5IMKBBmove9 103 K N AUTO Y 282 | 5IMKBBmove9 104 Y N AUTO Y 283 | 
5IMKBBmove9 106 K N AUTO Y 284 | 5IMKBBmove9 107 E N AUTO Y 285 | 5IMKBBmove9 108 F N AUTO Y 286 | 5IMKBBmove9 110 D N AUTO Y 287 | 5IMKBBmove9 111 E N AUTO Y 288 | 5IMKBBmove9 112 Y N AUTO Y 289 | 5IMKBBmove9 113 D N AUTO Y 290 | 5IMKBBmove10 52 R N AUTO Y 291 | 5IMKBBmove10 53 W N AUTO Y 292 | 5IMKBBmove10 54 N N AUTO Y 293 | 5IMKBBmove10 56 G N AUTO Y 294 | 5IMKBBmove10 57 S N AUTO Y 295 | 5IMKBBmove10 100 R N AUTO Y 296 | 5IMKBBmove10 101 W N AUTO Y 297 | 5IMKBBmove10 102 D N AUTO Y 298 | 5IMKBBmove10 103 K N AUTO Y 299 | 5IMKBBmove10 104 Y N AUTO Y 300 | 5IMKBBmove10 106 A N AUTO Y 301 | 5IMKBBmove10 107 A N AUTO Y 302 | 5IMKBBmove10 108 F N AUTO Y 303 | 5IMKBBmove10 110 D N AUTO Y 304 | 5IMKBBmove10 111 E N AUTO Y 305 | 5IMKBBmove10 112 Y N AUTO Y 306 | 5IMKBBmove10 113 D N AUTO Y 307 | 5IMKBBmove11 52 R N AUTO Y 308 | 5IMKBBmove11 53 W N AUTO Y 309 | 5IMKBBmove11 54 N N AUTO Y 310 | 5IMKBBmove11 56 G N AUTO Y 311 | 5IMKBBmove11 57 S N AUTO Y 312 | 5IMKBBmove11 100 R N AUTO Y 313 | 5IMKBBmove11 101 W N AUTO Y 314 | 5IMKBBmove11 102 D N AUTO Y 315 | 5IMKBBmove11 103 K N AUTO Y 316 | 5IMKBBmove11 104 Y N AUTO Y 317 | 5IMKBBmove11 106 A N AUTO Y 318 | 5IMKBBmove11 107 A N AUTO Y 319 | 5IMKBBmove11 108 F N AUTO Y 320 | 5IMKBBmove11 110 D N AUTO Y 321 | 5IMKBBmove11 111 E N AUTO Y 322 | 5IMKBBmove11 112 Y N AUTO Y 323 | 5IMKBBmove11 113 D N AUTO Y 324 | 5IMKBBmove12 52 R N AUTO Y 325 | 5IMKBBmove12 53 W N AUTO Y 326 | 5IMKBBmove12 54 N N AUTO Y 327 | 5IMKBBmove12 56 G N AUTO Y 328 | 5IMKBBmove12 57 S N AUTO Y 329 | 5IMKBBmove12 100 R N AUTO Y 330 | 5IMKBBmove12 101 W N AUTO Y 331 | 5IMKBBmove12 102 D N AUTO Y 332 | 5IMKBBmove12 103 K N AUTO Y 333 | 5IMKBBmove12 104 Y N AUTO Y 334 | 5IMKBBmove12 106 A N AUTO Y 335 | 5IMKBBmove12 107 A N AUTO Y 336 | 5IMKBBmove12 108 F N AUTO Y 337 | 5IMKBBmove12 110 D N AUTO Y 338 | 5IMKBBmove12 111 E N AUTO Y 339 | 5IMKBBmove12 112 Y N AUTO Y 340 | 5IMKBBmove12 113 D N AUTO Y 341 | 5IMKBBmove13 52 R N AUTO Y 342 | 5IMKBBmove13 53 
W N AUTO Y 343 | 5IMKBBmove13 54 N N AUTO Y 344 | 5IMKBBmove13 56 G N AUTO Y 345 | 5IMKBBmove13 57 S N AUTO Y 346 | 5IMKBBmove13 100 R N AUTO Y 347 | 5IMKBBmove13 101 W N AUTO Y 348 | 5IMKBBmove13 102 D N AUTO Y 349 | 5IMKBBmove13 103 K N AUTO Y 350 | 5IMKBBmove13 104 Y N AUTO Y 351 | 5IMKBBmove13 106 E N AUTO Y 352 | 5IMKBBmove13 107 H N AUTO Y 353 | 5IMKBBmove13 108 F N AUTO Y 354 | 5IMKBBmove13 110 D N AUTO Y 355 | 5IMKBBmove13 111 E N AUTO Y 356 | 5IMKBBmove13 112 Y N AUTO Y 357 | 5IMKBBmove13 113 D N AUTO Y 358 | 5IMKBBmove14 52 R N AUTO Y 359 | 5IMKBBmove14 53 W N AUTO Y 360 | 5IMKBBmove14 54 N N AUTO Y 361 | 5IMKBBmove14 56 G N AUTO Y 362 | 5IMKBBmove14 57 S N AUTO Y 363 | 5IMKBBmove14 100 R N AUTO Y 364 | 5IMKBBmove14 101 W N AUTO Y 365 | 5IMKBBmove14 102 D N AUTO Y 366 | 5IMKBBmove14 103 K N AUTO Y 367 | 5IMKBBmove14 104 Y N AUTO Y 368 | 5IMKBBmove14 106 E N AUTO Y 369 | 5IMKBBmove14 107 H N AUTO Y 370 | 5IMKBBmove14 108 F N AUTO Y 371 | 5IMKBBmove14 110 D N AUTO Y 372 | 5IMKBBmove14 111 E N AUTO Y 373 | 5IMKBBmove14 112 Y N AUTO Y 374 | 5IMKBBmove14 113 D N AUTO Y -------------------------------------------------------------------------------- /PositionsToExplore_example.tsv: -------------------------------------------------------------------------------- 1 | Pdb number Res1 Chain AA_Allowed MakeAla 2 | ABCD_dock_1 90 T H AUTO Y 3 | ABCD_dock_1 92 F H AUTO Y 4 | ABCD_dock_1 94 N H AUTO Y 5 | ABCD_dock_1 96 A H AUTO Y 6 | ABCD_dock_1 98 N H AUTO Y 7 | ABCD_dock_1 100 Y H AUTO Y 8 | ABCD_dock_2 90 T H AUTO Y 9 | ABCD_dock_2 92 F H AUTO Y 10 | ABCD_dock_2 94 N H AUTO Y 11 | ABCD_dock_2 96 A H AUTO Y 12 | ABCD_dock_2 98 N H AUTO Y 13 | ABCD_dock_2 100 Y H AUTO Y -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EvolveX 2 | 3 | This repository contains the code of EvolveX, a *de novo* antibody computational design pipeline 
introduced in <[future link to paper]()>. Specifically, it corresponds to the computational pipeline that generates antibody designs given an initial set of antibody-antigen docks and a set of positions to mutate and explore, which is driven by the FoldX force field to optimize the binding affinity while maintaining the thermodynamic stability of the designed antibodies. 4 | 5 | # Installation 6 | 7 | - **Python version >= 3.9** 8 | - Strongly recommended to create a virtual environment (run "python -m venv venv", then "source venv/bin/activate") 9 | - Download and unzip the GitHub repository. 10 | - Run "pip install ." from the directory that contains the "setup.py" file. This will create an "evolvex" command that you can run from the command line. 11 | - Running EvolveX additionally requires FoldX **(version >= 5)**, which can be obtained [here](https://foldxsuite.crg.eu/licensing-and-services). 12 | 13 | The code has been tested on Linux and macOS operating systems. 14 | 15 | # How to run EvolveX 16 | 17 | The evolvex command only takes a single input, which is a YAML configuration file, for which an example can be found in "evolvex_config_example.yaml". 18 | 19 | **If you wish to run EvolveX on the human Vsig4 nanobody design example** showcased in our publication, simply set the number of CPU cores and the path to your FoldX folder in the pre-filled configuration file "evolvex_config_Vsig4.yaml" and run "evolvex evolvex_config_Vsig4.yaml". This will use the pre-generated antibody-antigen docks in the "Vsig4_example" folder. 20 | Note that the search parameters have been set to a reduced version as it only runs 10 iterations, 2 models per dock and performs recombination every 5 iterations, which should take ~3 hours to run on a personal laptop with 10 CPU cores. To run an even more reduced and faster version, remove all but 1 PDB file in the "Vsig4_example" folder, which should take < 1h. 
To run the same search as we did using 500 iterations, a population of 50 models per dock and recombination every 50 iterations, you would need to run it on a lab cluster or HPC (see the [additional details section](#additional-details) to run it on SLURM-based HPCs), otherwise it would take weeks to run on a personal laptop. 21 | 22 | EvolveX generates two main outputs in the working_dir folder: 23 | 24 | - A "generated_models_info.csv" file containing the antibody sequence designs selected at each iteration for each model. 25 | - A "model_PDB_files" folder containing the PDB files of each model in the CSV file. 26 | 27 | We then filter these designs using a number of thresholds for different characteristics which are determined based on the distribution of each characteristic in known antibody 3D structures, all of which is described in detail in our publication. 28 | 29 | For details about the configuration file and additional input files needed to run EvolveX, read below. 30 | 31 | ## Additional details 32 | The YAML configuration parameters are the following: 33 | 34 | - The "antibody_chains" and "antigen_chains" parameters are self-explanatory. The antibody can be a single chain (i.e. nanobody) or a standard double chain Fv. 35 | 36 | - In the "Required paths" section: 37 | - The working_dir is where EvolveX will write all the files it generates. If you are running on an HPC, make sure this folder points to a directory with enough space and which can perform fast writes. 38 | - The foldx_dir should contain an executable file named "foldx". 39 | - The Backbones_dir should contain PDB files corresponding to the initial set of antibody-antigen docks. 40 | - The PositionsToExplore_file_path should be a TSV-formatted file containing the positions to mutate for each PDB file in Backbones_dir (see PositionsToExplore_example.tsv for format). 
"AA_Allowed" can either be a string of pre-selected amino acids in single letter format (e.g KRHDE), or set to "AUTO" to let EvolveX test each individual mutation and determine which ones are worth trying during the GA search. "MakeAla" can be set to "Y", in which case that position will be mutated to Alanine before the GA search, or to "N", in which case the wildtype amino acid will be kept as the starting amino acid at that position before the GA search. 41 | 42 | - In the "Search algorithm settings" section, the population_size corresponds to the number of models that will be generated and explored PER DOCK. So if you have 100 PDBs and set this to 100, 10000 models will be generated and explored over the number of iterations you have selected. By default, we run 500 iterations, 50 models per dock and do a recombination step every 50 iterations. 43 | 44 | - In the "Compute settings" section: 45 | - The "compute_env" can be set to "local" (default) or "SLURM" if running on a SLURM-based HPC. 46 | - The "n_cores" sets the number of parallel CPU cores to use, both for the local or SLURM compute environments. 47 | - When running on SLURM, additional parameters are required: 48 | - The number of CPU cores are split across "max_SLURM_jobs". Most SLURM HPCs limit the number of jobs a user can have in the queue at any time, so you should set the "max_SLURM_jobs" accordingly. For example, if n_cores=250 and max_SLURM_jobs=25, then 25 jobs with 10 CPU cores each will be submitted. 49 | - The "walltime" corresponds to the maximum time the jobs will run for. 50 | - Set the "account_name", "cluster_name" and "cluster_partition" according to your HPC. 51 | - Adapt the "SLURM_job_prologue" to your HPC, making sure the Python version you load is the same as the one used to create the virtual environment used to install Evolvex. 52 | 53 | Once the YAML file is ready, run "evolvex evolvex_config.yaml". 
54 | 55 | **NOTE: When running on a SLURM environment, if a "tcp connection error" or any other similar error that suggests that the head process has lost communication with the workers arises when launching EvolveX from a login node, try launching the evolvex command through a SLURM script so that the head process runs on a compute node instead (see the "evolvex_slurm_head_example.sbatch").** -------------------------------------------------------------------------------- /evolvex_config_Vsig4.yaml: -------------------------------------------------------------------------------- 1 | # PDB information 2 | antibody_chains: "N" 3 | antigen_chains: "L" 4 | 5 | # Required paths 6 | working_dir: "./Vsig4_example" 7 | foldx_dir: "/path/to/foldx" 8 | Backbones_dir: "./Vsig4_example/Backbones" 9 | PositionsToExplore_file_path: "PositionsToExplore_Vsig4.tsv" 10 | 11 | # Search algorithm settings 12 | search_algorithm: "GA" 13 | max_iterations: 10 14 | population_size: 2 # To be understood as the population size per backbone. A warning is issued if the value is < 50. 
recombine_every_nth_iteration: 5

# Compute settings
compute_env: "local" # local | SLURM
n_cores: 10
# SLURM specific settings
# max_SLURM_jobs: 100
# walltime: 3-00
# account_name: "xxxxxx" # SLURM credit account
# cluster_name: "xxxx"
# cluster_partition: "xxxx"
# SLURM_job_prologue:
#   - "module --force purge"
#   - "module load path/to/cluster/partition"
#   - "module load Python/xxxx"
#   - "source /path/to/venv/bin/activate"

# Other settings
residues_to_ignore: "GMHC"
vdwDesign: 2 # See https://foldxsuite.crg.eu/parameter/vdwDesign
print_stdout: false # Useful for debugging
calculate_binding_dG_with_water: true
--------------------------------------------------------------------------------
/evolvex_config_example.yaml:
--------------------------------------------------------------------------------
# PDB information
antibody_chains: "HL"
antigen_chains: "A"

# Required paths
working_dir: "/path/to/working/dir"
foldx_dir: "/path/to/foldx"
Backbones_dir: "/path/to/Backbones"
PositionsToExplore_file_path: "/path/to/PositionsToExplore.txt"

# Search algorithm settings
# NOTE: must be one of "GA" or "systematic" — the config validator rejects any other value.
search_algorithm: "GA"
max_iterations: 500
population_size: 50 # To be understood as the population size per backbone. A warning is issued if the value is < 50.
15 | recombine_every_nth_iteration: 50 16 | 17 | # Compute settings 18 | compute_env: "local" # local | SLURM 19 | n_cores: 10 20 | # SLURM specific settings 21 | # max_SLURM_jobs: 100 22 | # walltime: 3-00 23 | # account_name: "xxxxxx" # SLURM credit account 24 | # cluster_name: "xxxx" 25 | # cluster_partition: "xxxx" 26 | # SLURM_job_prologue: 27 | # - "module --force purge" 28 | # - "module load path/to/cluster/partition" 29 | # - "module load Python/xxxx" 30 | # - "source /path/to/venv/bin/activate" 31 | 32 | # Other settings 33 | residues_to_ignore: "GMHC" 34 | vdwDesign: 2 # See https://foldxsuite.crg.eu/parameter/vdwDesign 35 | print_stdout: false # Useful for debugging 36 | calculate_binding_dG_with_water: true 37 | -------------------------------------------------------------------------------- /evolvex_slurm_head_example.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=evolvex_head 3 | #SBATCH --account=xxxx 4 | #SBATCH --clusters=xxxx 5 | #SBATCH --partition=xxxx 6 | #SBATCH --output=output_%a.out 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --mem=20G 10 | #SBATCH --time=3-00 11 | 12 | module --force purge 13 | 14 | module load path/to/cluster/partition 15 | module load Python/xxxx 16 | 17 | source /full/path/to/venv/bin/activate 18 | 19 | /full/path/to/evolvex_executable /full/path/to/evolvex_config.yaml # To obtain the full path of the evolvex_executable, run "which evolvex". 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='EvolveX', 5 | version='1', 6 | description='EvolveX antibody design pipeline', 7 | author='Gabriel Cia and Rob Van Der Kant - SwitchLab', 8 | packages=find_packages(where='src'), 9 | package_dir={'': 'src'}, 10 | python_requires='>=3.9', 11 | install_requires=['pyyaml', 'biopython>=1.81', 'pandas', 'dask', 'distributed', 'dask-jobqueue'], 12 | entry_points={ 13 | 'console_scripts': [ 14 | 'evolvex = evolvex.main:main', 15 | ] 16 | }, 17 | ) -------------------------------------------------------------------------------- /src/evolvex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SwitchLab-VIB/EvolveX/7dbe0dd7c70119dc4fd07ccf0c0b253a50dc4f96/src/evolvex/__init__.py -------------------------------------------------------------------------------- /src/evolvex/command_line_interface.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import argparse 5 | from types import SimpleNamespace 6 | import warnings 7 | import yaml 8 | from pathlib import Path 9 | 10 | def read_and_validate_config_file(file_path): 11 | with open(file_path, 'rt') as f: 12 | GLOBALS = yaml.safe_load(f) 13 | GLOBALS = SimpleNamespace(**GLOBALS) # SimpleNamespace enables the use of dot notation to access values instead of dict brackets 14 | 15 | GLOBALS.working_dir = Path(GLOBALS.working_dir) 16 | GLOBALS.foldx_dir = Path(GLOBALS.foldx_dir) 17 | GLOBALS.Backbones_dir = Path(GLOBALS.Backbones_dir) 18 | GLOBALS.PositionsToExplore_file_path = Path(GLOBALS.PositionsToExplore_file_path) 19 | if not (GLOBALS.working_dir.is_dir()): 20 | GLOBALS.working_dir.mkdir(parents=True) 21 | 22 | if not 
(GLOBALS.foldx_dir.is_dir() and (GLOBALS.foldx_dir / 'foldx').exists()): 23 | raise ValueError("The foldx_dir must contain an executable file named 'foldx'.") 24 | 25 | if GLOBALS.search_algorithm not in ('systematic', 'GA'): 26 | raise ValueError("The search_algorithm must be one of 'GA' or 'systematic'.") 27 | 28 | if GLOBALS.search_algorithm == 'GA': 29 | if GLOBALS.population_size % 2 != 0: 30 | raise ValueError("Population_size must be an even number.") 31 | if GLOBALS.population_size < 50: 32 | warnings.warn("The population_size for each PDB backbone is < 50, which is low.") 33 | 34 | if GLOBALS.compute_env == 'SLURM': 35 | SLURM_parameters = ('account_name', 'cluster_name', 'cluster_partition') 36 | if not all(hasattr(GLOBALS, parameter) for parameter in SLURM_parameters): 37 | raise ValueError("The SLURM compute_env requires the following parameters: {SLURM_parameters}") 38 | 39 | return GLOBALS 40 | 41 | def command_line_interface(): 42 | parser = argparse.ArgumentParser( 43 | description='EvolveX', 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 45 | ) 46 | 47 | parser.add_argument( 48 | 'config', type=Path, 49 | help="Path to YAML configuration file. See the evolvex_config_example.yaml file for the list of available parameters, and the README for an exaplanation of each parameter." 
50 | ) 51 | 52 | args = parser.parse_args() 53 | 54 | GLOBALS = read_and_validate_config_file(file_path = args.config) 55 | 56 | return GLOBALS 57 | -------------------------------------------------------------------------------- /src/evolvex/dask_parallel.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from dask.distributed import Client, LocalCluster, wait 4 | from dask_jobqueue import SLURMCluster 5 | 6 | def setup_dask_parallel_executor(GLOBALS): 7 | compute_env = GLOBALS.compute_env 8 | n_cores = GLOBALS.n_cores 9 | 10 | if compute_env == 'local': 11 | cluster = LocalCluster( 12 | n_workers = n_cores, 13 | threads_per_worker = 1, 14 | processes = True, 15 | ) 16 | 17 | elif compute_env == 'SLURM': 18 | account = GLOBALS.account_name 19 | clusters = GLOBALS.cluster_name 20 | SLURM_job_prologue = GLOBALS.SLURM_job_prologue 21 | max_SLURM_jobs = GLOBALS.max_SLURM_jobs 22 | partition = GLOBALS.cluster_partition 23 | walltime = GLOBALS.walltime 24 | 25 | # SLURMCluster determines which resources are mobilized by each worker, which are then scaled according to the number of CPUs by the cluster.adapt function. 
26 | SLURM_info_files_dir = GLOBALS.working_dir / 'slurm_info_files'; SLURM_info_files_dir.mkdir() 27 | 28 | n_cores_per_job = math.ceil(n_cores / max_SLURM_jobs) 29 | print(f'{max_SLURM_jobs} jobs, each running {n_cores_per_job} single core workers.', flush=True) 30 | 31 | cluster = SLURMCluster( 32 | processes = n_cores_per_job, 33 | cores = n_cores_per_job, 34 | memory = f'{3 * n_cores_per_job} GB', # Could turn this into a parameter for the config file 35 | job_extra_directives = [f'--{account=}', f'--{clusters=}', f'--{partition=}', f'--output={str(SLURM_info_files_dir)}/slurm-%j.out'], 36 | walltime = walltime, 37 | job_script_prologue = SLURM_job_prologue, # Used to setup each job and associated workers with the necessary modules and virtual environment 38 | death_timeout=300 39 | ) 40 | cluster.scale(jobs = max_SLURM_jobs) 41 | #cluster.adapt(minimum_jobs = 1, maximum_jobs = max_SLURM_jobs, interval = '4s') # NOTE: Adaptive scaling does not work, workers randomly die which kills the whole run. Setting minimum_jobs = max_SLURM_jobs does not solve the problem. 
42 | 43 | else: 44 | raise ValueError("compute_env must be either 'local' or 'SLURM'.") 45 | 46 | parallel_executor = Client(cluster) 47 | 48 | print(f'Dask dashboard link: ', parallel_executor.dashboard_link, flush=True) # To monitor how well the parallelization is going 49 | 50 | return parallel_executor 51 | 52 | def wait_and_remove(parallel_executor, futures): 53 | wait(futures) 54 | if futures: 55 | parallel_executor.cancel(futures) 56 | return -------------------------------------------------------------------------------- /src/evolvex/foldx_commands.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import shutil 4 | 5 | from evolvex.utils import NDIGIS_ROUNDING 6 | 7 | def get_alanine_mutant(full_residue_IDs): 8 | """ 9 | Given the list of full_residue_IDs, returns a comma separated string of mutation names to Alanine, 10 | following the FoldX naming convention (e.g 'LH4A,TH9A,KH17A'). 11 | """ 12 | mutation_names = ( 13 | f'{full_residue_ID}A' # e.g KH52A 14 | for full_residue_ID in full_residue_IDs 15 | ) 16 | return ','.join(mutation_names) 17 | 18 | def create_individual_list_foldx_mutations_file(mutant, output_dir, output_file_name='individual_list_foldx_mutations_file.txt'): 19 | """ 20 | Mutant should be a comma separated string of mutations that describe a mutant (e.g: "DH52A,LH80A,KH99A"). 21 | 22 | By default, generates a file named "individual_list_foldx_mutations_file.txt" in output_dir. 
23 | """ 24 | output_file_path = output_dir / output_file_name 25 | with open(output_file_path, 'wt') as file_handle: 26 | file_handle.write(f'{mutant};\n') 27 | 28 | return output_file_path 29 | 30 | 31 | def run_foldx_BuildModel( 32 | foldx_dir, PDB_file_dir, PDB_file_name, individual_list_foldx_mutations_file_path, move_neighbors_flag, vdwDesign, print_stdout, output_dir, output_file_tag, PDB_file_tag=None 33 | ): 34 | """ 35 | Returns the full paths of the mutant and wildtype PDB file generated by BuildModel. 36 | """ 37 | input_file = PDB_file_dir / f'{PDB_file_name}.pdb' 38 | assert input_file.exists(), f'{input_file} does not exist.' 39 | 40 | command = [ 41 | str(foldx_dir / 'foldx'), 42 | '--command', 'BuildModel', 43 | '--pdb-dir', str(PDB_file_dir), 44 | '--pdb', f'{PDB_file_name}.pdb', 45 | '--mutant-file', str(individual_list_foldx_mutations_file_path), 46 | '--pdbHydrogens', 'true', 47 | '--output-dir', str(output_dir), 48 | '--moveNeighbours', 'true' if move_neighbors_flag else 'false', 49 | '--vdwDesign', str(vdwDesign), 50 | '--screen', 'true' if print_stdout else 'false' 51 | ] 52 | if output_file_tag: 53 | command += ['--output-file', output_file_tag] 54 | 55 | subprocess.run(command, check=True, stdout=None if print_stdout else subprocess.DEVNULL) 56 | 57 | foldx_mutant_PDB_file_path = PDB_file_dir / f'{PDB_file_name}_1.pdb' 58 | foldx_wildtype_PDB_file_path = PDB_file_dir / f'WT_{PDB_file_name}_1.pdb' 59 | 60 | # A new PDB of the wildtype is not generated by FoldX when move_neighbors_flag is False, as it simply corresponds to the input PDB file, 61 | # but to keep it consistent we create a copy of the file with the expected name. 62 | if move_neighbors_flag == False: 63 | shutil.copy( 64 | src = PDB_file_dir / f'{PDB_file_name}.pdb', 65 | dst = foldx_wildtype_PDB_file_path 66 | ) 67 | 68 | # As for the other files generated by BuildModel, gives the possibility to add a tag to the generated PDB files. 
69 | if PDB_file_tag: 70 | foldx_mutant_PDB_file_path = foldx_mutant_PDB_file_path.rename(foldx_mutant_PDB_file_path.with_stem(f"{foldx_mutant_PDB_file_path.stem}_{PDB_file_tag}")) 71 | foldx_wildtype_PDB_file_path = foldx_wildtype_PDB_file_path.rename(foldx_wildtype_PDB_file_path.with_stem(f"{foldx_wildtype_PDB_file_path.stem}_{PDB_file_tag}")) 72 | 73 | return (foldx_mutant_PDB_file_path, foldx_wildtype_PDB_file_path) 74 | 75 | def run_foldx_AnalyseComplex(foldx_dir, PDB_file_dir, PDB_file_name, antibody_chains, antigen_chains, vdwDesign, print_stdout, output_dir, output_file_tag, with_predicted_waters=False): 76 | input_file = PDB_file_dir / f'{PDB_file_name}.pdb' 77 | assert input_file.exists(), f'{input_file} does not exist.' 78 | 79 | command = [ 80 | str(foldx_dir / 'foldx'), 81 | '--command', 'AnalyseComplex', 82 | '--pdb-dir', str(PDB_file_dir), 83 | '--pdb', f'{PDB_file_name}.pdb', 84 | '--analyseComplexChains', f'{antibody_chains},{antigen_chains}', 85 | '--vdwDesign', str(vdwDesign), 86 | '--output-dir', str(output_dir), 87 | '--screen', 'true' if print_stdout else 'false' 88 | ] 89 | 90 | if with_predicted_waters: 91 | command += [ 92 | '--water', '-PREDICT', 93 | '--ionStrength', '0.150', 94 | ] 95 | 96 | if output_file_tag: 97 | command += ['--output-file', output_file_tag] 98 | 99 | subprocess.run(command, check=True, stdout=None if print_stdout else subprocess.DEVNULL) 100 | return 101 | 102 | def run_foldx_Stability(foldx_dir, PDB_file_dir, PDB_file_name, vdwDesign, print_stdout, output_dir, output_file_tag): 103 | input_file = PDB_file_dir / f'{PDB_file_name}.pdb' 104 | assert input_file.exists(), f'{input_file} does not exist.' 
105 | 106 | command = [ 107 | str(foldx_dir / 'foldx'), 108 | '--command', 'Stability', 109 | '--pdb-dir', str(PDB_file_dir), 110 | '--pdb', f'{PDB_file_name}.pdb', 111 | '--vdwDesign', str(vdwDesign), 112 | '--output-dir', str(output_dir), 113 | ] 114 | if output_file_tag: 115 | command += ['--output-file', output_file_tag] 116 | 117 | subprocess.run(command, check=True, stdout=None if print_stdout else subprocess.DEVNULL) 118 | return 119 | 120 | 121 | def get_binding_dG(interaction_file_path): 122 | assert interaction_file_path.name.startswith('Interaction_'), "To obtain binding dG, provide an 'Interaction_' file generated by AnalyseComplex." 123 | 124 | with open(interaction_file_path, 'rt') as file_handle: 125 | lines = file_handle.readlines() 126 | 127 | binding_dG = float( lines[9].split('\t')[5] ) 128 | return round(binding_dG, NDIGIS_ROUNDING) 129 | 130 | def get_binding_ddG(wildtype_interaction_file_path, mutant_interaction_file_path): 131 | assert wildtype_interaction_file_path.name.startswith('Interaction_') and mutant_interaction_file_path.name.startswith('Interaction_'), "To obtain binding ddG, provide two 'Interaction_' files generated by AnalyseComplex." 
132 | 133 | wildtype_binding_dG = get_binding_dG(wildtype_interaction_file_path) 134 | mutant_binding_dG = get_binding_dG(mutant_interaction_file_path) 135 | 136 | binding_ddG = mutant_binding_dG - wildtype_binding_dG 137 | return round(binding_ddG, NDIGIS_ROUNDING) 138 | 139 | def get_chain_group_stability_dG(indiv_file_path, chain_group_name): 140 | assert indiv_file_path.name.startswith('Indiv_'), "To obtain the stability dG of a chain group, provide an 'Indiv_' file generated by AnalyseComplex" 141 | 142 | with open(indiv_file_path, 'rt') as file_handle: 143 | lines = file_handle.readlines() 144 | 145 | for line in lines[9:]: 146 | line = line.split('\t') 147 | if line[1] == chain_group_name: 148 | chain_group_stability_dG = float( line[2] ) 149 | return round(chain_group_stability_dG, NDIGIS_ROUNDING) 150 | 151 | raise ValueError(f'Could not find {chain_group_name=} in {indiv_file_path}') 152 | return 153 | 154 | def get_chain_group_stability_ddG(wildtype_indiv_file_path, mutant_indiv_file_path, chain_group_name): 155 | assert wildtype_indiv_file_path.name.startswith('Indiv_') and mutant_indiv_file_path.name.startswith('Indiv_'), "To obtain the stability ddG of a chain group, provide two 'Indiv_' files generated by AnalyseComplex." 
156 | 157 | wildtype_chain_group_stability_dG = get_chain_group_stability_dG(wildtype_indiv_file_path, chain_group_name) 158 | mutant_chain_group_stability_dG = get_chain_group_stability_dG(mutant_indiv_file_path, chain_group_name) 159 | 160 | chain_group_stability_ddG = mutant_chain_group_stability_dG - wildtype_chain_group_stability_dG 161 | return round(chain_group_stability_ddG, NDIGIS_ROUNDING) 162 | 163 | def get_complex_stability_dG(st_file_path): 164 | # NOTE: The same information could be obtained from the 'Raw_' file generated by BuildModel, so we could skip running Stability completely 165 | assert st_file_path.name.endswith('_ST.fxout'), "To obtain the stability dG of a complex, provide a '_ST.fxout' file generated by Stability." 166 | 167 | with open(st_file_path, 'rt') as file_handle: 168 | lines = file_handle.readlines() 169 | 170 | complex_stability_dG = float( lines[0].split('\t')[1] ) 171 | return round(complex_stability_dG, NDIGIS_ROUNDING) 172 | 173 | def get_complex_stability_ddG(dif_file_path): 174 | assert dif_file_path.name.startswith('Dif_'), "To obtain the stability ddG of a complex, provide a 'Dif_' file generated by BuildModel." 175 | 176 | with open(dif_file_path, 'rt') as file_handle: 177 | lines = file_handle.readlines() 178 | 179 | complex_stability_ddG = float( lines[9].split('\t')[1] ) 180 | return round(complex_stability_ddG, NDIGIS_ROUNDING) 181 | 182 | def get_chain_group_intraclash_score(interaction_file_path, chain_group_name): 183 | assert interaction_file_path.name.startswith('Interaction_'), "To obtain intraclash scores of a chain group, provide an 'Interaction_' file generated by AnalyseComplex." 
184 | 185 | with open(interaction_file_path, 'rt') as file_handle: 186 | lines = file_handle.readlines() 187 | 188 | line = lines[9].split('\t') 189 | intraclash_scores = { 190 | line[1]:float(line[3]), # e.g 'HL':2.363 191 | line[2]:float(line[4]) 192 | } 193 | return round(intraclash_scores[chain_group_name], NDIGIS_ROUNDING) 194 | 195 | def get_chain_group_delta_intraclash_score(wildtype_interaction_file_path, mutant_interaction_file_path, chain_group_name): 196 | assert wildtype_interaction_file_path.name.startswith('Interaction_') and mutant_interaction_file_path.name.startswith('Interaction_'), "To obtain a change in intraclash score of a chain group, provide two 'Interaction_' files generated by AnalyseComplex." 197 | 198 | wildtype_intraclash_score = get_chain_group_intraclash_score(wildtype_interaction_file_path, chain_group_name) 199 | mutant_intraclash_score = get_chain_group_intraclash_score(mutant_interaction_file_path, chain_group_name) 200 | 201 | chain_group_delta_intraclash_score = mutant_intraclash_score - wildtype_intraclash_score 202 | return round(chain_group_delta_intraclash_score, NDIGIS_ROUNDING) 203 | 204 | def get_all_other_interaction_file_info(interaction_file_path): 205 | assert interaction_file_path.name.startswith('Interaction_'), "To obtain all the information to the right of 'Backbone Hbond' from an interaction file, provide an 'Interaction_' file generated by AnalyseComplex." 
206 | 207 | with open(interaction_file_path, 'rt') as file_handle: 208 | lines = file_handle.readlines() 209 | 210 | column_names = lines[8].strip().split('\t') 211 | values = lines[9].strip().split('\t') 212 | 213 | return {column_names[i]:round(float(values[i]), NDIGIS_ROUNDING) for i in range(6, len(column_names))} 214 | -------------------------------------------------------------------------------- /src/evolvex/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from Bio.Data.IUPACData import protein_letters 4 | 5 | from dask.distributed import as_completed 6 | 7 | from evolvex.dask_parallel import setup_dask_parallel_executor 8 | from evolvex.mutate_interface import generate_Alanine_mutant, mutate_antibody_hotspot_position, generate_mutations_summary_file 9 | from evolvex.search_algorithms import GA_search, systematic_search 10 | from evolvex.model_generation import generate_initial_models 11 | from evolvex.dask_parallel import wait_and_remove 12 | from evolvex.command_line_interface import command_line_interface 13 | 14 | def main(): 15 | """ 16 | """ 17 | GLOBALS = command_line_interface() 18 | 19 | evolvex_working_dir = GLOBALS.working_dir / 'EvolveX'; evolvex_working_dir.mkdir(exist_ok = True) ### Should be false 20 | 21 | backbone_PDB_files_paths = list(GLOBALS.Backbones_dir.glob('*.pdb')) 22 | all_PDBs_positions_to_explore_df = pd.read_csv(GLOBALS.PositionsToExplore_file_path, header=0, sep='\t') 23 | 24 | parallel_executor = setup_dask_parallel_executor(GLOBALS) 25 | 26 | 27 | # Generate Alanine mutants of all PDB backbones 28 | futures_1 = [] 29 | for PDB_file_path in backbone_PDB_files_paths: 30 | PDB_positions_to_explore_df = all_PDBs_positions_to_explore_df[all_PDBs_positions_to_explore_df['Pdb'] == PDB_file_path.stem] 31 | 32 | future_1 = parallel_executor.submit( 33 | generate_Alanine_mutant, PDB_file_path, PDB_positions_to_explore_df, evolvex_working_dir, GLOBALS 34 | ) 35 | 
futures_1.append(future_1) 36 | 37 | # Explore all possible mutations for positions marked as 'AUTO' 38 | print('Exploring mutations at each position...', flush=True) 39 | residues_to_explore = set(protein_letters) - set(GLOBALS.residues_to_ignore) 40 | futures_2 = [] 41 | for future_1, result in as_completed(futures_1, with_results=True): 42 | foldx_Alanine_mutant_PDB_file_path, AUTO_Ala_positions_full_residue_IDs, output_dir = result 43 | future_1.release() 44 | for full_residue_ID in AUTO_Ala_positions_full_residue_IDs: 45 | hotspot_mutants_dir = output_dir / 'hotspot_mutants' / full_residue_ID 46 | hotspot_mutants_dir.mkdir(parents=True, exist_ok=True) 47 | for mutant_residue in residues_to_explore: 48 | future_2 = parallel_executor.submit( 49 | mutate_antibody_hotspot_position, foldx_Alanine_mutant_PDB_file_path, full_residue_ID, mutant_residue, hotspot_mutants_dir, GLOBALS 50 | ) 51 | futures_2.append(future_2) 52 | 53 | wait_and_remove(parallel_executor, futures_2) 54 | 55 | # Generate a summary file for each PDB backbone, which includes the ddG_binding, ddG_stability_complex and ddG_stability_antibody for all possible 56 | # mutations at each position. For positions with pre-selected mutations, the fields are artificially set to -100. 
57 | futures_3 = [] 58 | for PDB_file_path in backbone_PDB_files_paths: 59 | PDB_dir = evolvex_working_dir / PDB_file_path.stem 60 | PDB_positions_to_explore_df = all_PDBs_positions_to_explore_df[all_PDBs_positions_to_explore_df['Pdb'] == PDB_file_path.stem] 61 | 62 | future_3 = parallel_executor.submit( 63 | generate_mutations_summary_file, PDB_dir, PDB_positions_to_explore_df, GLOBALS 64 | ) 65 | futures_3.append(future_3) 66 | 67 | wait_and_remove(parallel_executor, futures_3) 68 | 69 | # Run search algorithm 70 | if GLOBALS.search_algorithm == 'systematic': 71 | systematic_search(parallel_executor, backbone_PDB_files_paths, evolvex_working_dir, GLOBALS) 72 | 73 | else: 74 | generated_models_info_file_path = GLOBALS.working_dir / 'generated_models_info.csv' 75 | model_PDB_files_dir = GLOBALS.working_dir / 'model_PDB_files'; model_PDB_files_dir.mkdir(exist_ok = True) 76 | 77 | print('Generating initial models...', flush=True) 78 | initial_models_population = generate_initial_models(parallel_executor, evolvex_working_dir, backbone_PDB_files_paths, GLOBALS) 79 | 80 | print('Running search...', flush=True) 81 | GA_search(parallel_executor, initial_models_population, generated_models_info_file_path, model_PDB_files_dir, GLOBALS) 82 | 83 | 84 | print('Finished.', flush=True) 85 | parallel_executor.close() 86 | return 87 | 88 | if __name__ == '__main__': 89 | main() -------------------------------------------------------------------------------- /src/evolvex/model_dataclasses.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from dataclasses import dataclass 3 | 4 | @dataclass 5 | class MC_Model: 6 | model_dir: Path 7 | full_residue_IDs_list: list[str] # Full residue IDs of the mutable positions 8 | 9 | # Constants needed for the MC 10 | backbone_PDB_file_name: str 11 | antibody_stability_dG_original_wildtype: float 12 | antibody_seq_map_original_wildtype: dict 13 | 
allowed_AA_mutations_per_position_map: dict 14 | 15 | @dataclass 16 | class GA_Model: 17 | model_dir: Path 18 | full_residue_IDs_list: list[str] 19 | 20 | # Parameters used to generate a Model's PDB file using BuildModel with only 1 mutation based on the parent instead of starting back from the Alanine PDB every time, which would be slow 21 | parent_model_dir: Path 22 | mutations_to_generate_PDB: list[str] -------------------------------------------------------------------------------- /src/evolvex/model_generation.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | from collections import defaultdict 4 | import statistics 5 | from pathlib import Path 6 | import random 7 | 8 | import pandas as pd 9 | 10 | from evolvex.model_dataclasses import MC_Model 11 | from evolvex.foldx_commands import create_individual_list_foldx_mutations_file, run_foldx_BuildModel, get_complex_stability_ddG, get_chain_group_stability_dG 12 | from evolvex.utils_bio import get_chain_to_sequence_map 13 | 14 | large_hydrophobic_residues = 'FILWY' 15 | 16 | def create_model(input_PDB_file_path, copy_PDB_file_to_output_dir, mutations_list, output_dir, GLOBALS, output_file_tag=None, PDB_file_tag=None): 17 | """ 18 | Creates a PDB model using BuildModel, taking as input a PDB file and a list of mutations. 
19 | """ 20 | output_dir.mkdir(exist_ok = True) 21 | 22 | if copy_PDB_file_to_output_dir: 23 | input_PDB_file_path = shutil.copy( 24 | src = input_PDB_file_path, 25 | dst = output_dir 26 | ) 27 | input_PDB_file_path = Path(input_PDB_file_path) 28 | 29 | individual_list_foldx_mutations_file_path = create_individual_list_foldx_mutations_file( 30 | mutant = ','.join(mutations_list), 31 | output_dir = output_dir 32 | ) 33 | 34 | run_foldx_BuildModel( 35 | foldx_dir=GLOBALS.foldx_dir, 36 | PDB_file_dir=input_PDB_file_path.parent, PDB_file_name=input_PDB_file_path.stem, 37 | individual_list_foldx_mutations_file_path=individual_list_foldx_mutations_file_path, 38 | move_neighbors_flag=True, 39 | vdwDesign=GLOBALS.vdwDesign, 40 | print_stdout=GLOBALS.print_stdout, 41 | output_dir=output_dir, output_file_tag=output_file_tag, PDB_file_tag=PDB_file_tag, 42 | ) 43 | 44 | return 45 | 46 | 47 | def get_acceptable_positions_mut_names_map(all_mutations_summary_df, PDB_name): 48 | acceptable_mutations_map = defaultdict(list) 49 | for position, position_df in all_mutations_summary_df.groupby('position'): 50 | binding_ddG_variance = statistics.variance(position_df.binding_ddG.values) 51 | antibody_stability_ddG_variance = statistics.variance(position_df.antibody_stability_ddG.values) 52 | mean_binding_and_stability_variance = (binding_ddG_variance + antibody_stability_ddG_variance) / 2 53 | for mut_name, row in position_df.iterrows(): 54 | mutant_residue = mut_name[-1] 55 | # This filters out mutations that are too destabilizing either in terms of binding or stability 56 | if row.complex_stability_ddG > 1 or row.binding_ddG > 1 or row.antibody_stability_ddG > 2: 57 | continue 58 | 59 | # We don't want mutations to large hydrophobics when mutations at that position don't seem to change binding or stability a lot (ddG values ~ 0) 60 | if mean_binding_and_stability_variance < 0.1 and mutant_residue in large_hydrophobic_residues: 61 | continue 62 | 63 | 
acceptable_mutations_map[position].append(mut_name) 64 | 65 | # If no mutation passes the filters, allow all mutations at that position and let the search algorithm find what's best 66 | if not position in acceptable_mutations_map: 67 | print(f'No acceptable mutations found for {position = } in {PDB_name = }. Allowing all mutations at that position during the search. Probably needs manual inspection.', flush=True) 68 | acceptable_mutations_map[position].extend(position_df.index.values) 69 | 70 | return acceptable_mutations_map 71 | 72 | def get_hotspot_positions_mut_names_map(all_mutations_summary_df): 73 | hotspot_mutations_map = defaultdict(list) 74 | for mut_name, row in all_mutations_summary_df.iterrows(): 75 | original_residue = row.original_residue 76 | mutant_residue = mut_name[-1] 77 | position = mut_name[2:-1] 78 | 79 | # Hotspot mutations need to strongly improve binding affinity and not be too destabilizing for the antibody 80 | if row.binding_ddG < -1.5 and row.antibody_stability_ddG < 1: 81 | hotspot_mutations_map[position].append(mut_name) 82 | 83 | # If mutating from Alanine to the original residue is highly stabilizing for the antibody, then we consider it as a hotspot mutation, even if it maybe 84 | # doesn't contribute to binding 85 | elif mutant_residue == original_residue and row.antibody_stability_ddG < -2: 86 | hotspot_mutations_map[position].append(mut_name) 87 | 88 | return hotspot_mutations_map 89 | 90 | def get_allowed_mutations_per_position_maps(PDB_name, all_mutations_summary_file_path): 91 | """ 92 | Returns two dictionaries: 93 | 1) A dict where keys are positions and values are mutation names (i.e {'53':['A53R', 'A53K', ...]}) 94 | 2) A dict where values are amino acid mutations (i.e {'53':['R', 'K', ...]}) 95 | """ 96 | all_mutations_summary_df = pd.read_csv(all_mutations_summary_file_path, header=0, index_col=0, dtype = {'position':str}) 97 | 98 | acceptable_positions_mut_names_map = 
def clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model):
    """
    Keep only *PDB_file_name_to_keep_as_model* inside *model_dir* and rename it to
    'model.pdb'. Every other file in the directory is deleted.
    """
    keep_name = PDB_file_name_to_keep_as_model
    for entry in model_dir.iterdir():
        if entry.name != keep_name:
            entry.unlink()

    kept_file_path = model_dir / keep_name
    if keep_name != 'model.pdb':
        kept_file_path.rename(kept_file_path.with_name('model.pdb'))
    return
def generate_random_model(
    PDB_name, foldx_Alanine_mutant_PDB_file_path, allowed_mut_names_per_position_map, allowed_AA_mutations_per_position_map, antibody_stability_dG_original_wildtype,
    antibody_seq_map_original_wildtype, model_dir, GLOBALS
):
    """
    Build one random member of the initial population.

    Random mutation lists are drawn and built with FoldX until the resulting complex is
    not too destabilized (complex_stability_ddG < 0.5); after 5 rejected attempts the
    6th attempt is accepted unconditionally. Returns an MC_Model wrapping *model_dir*,
    which is cleaned so that only 'model.pdb' remains.
    """
    model, n_tries = None, 0
    while model is None:  # 'is None' (identity test), not '== None' (PEP 8)
        random_mutations_list = get_random_mutations_list_for_initial_population(allowed_mut_names_per_position_map)

        create_model(
            input_PDB_file_path = foldx_Alanine_mutant_PDB_file_path,
            copy_PDB_file_to_output_dir = True,
            mutations_list = random_mutations_list,
            output_dir = model_dir,
            GLOBALS = GLOBALS
        )

        complex_stability_ddG = get_complex_stability_ddG(model_dir / f'Dif_{PDB_name}_1_Alanine_mutant.fxout')
        # Accept if not too destabilizing, or unconditionally on the last allowed retry
        if complex_stability_ddG < 0.5 or n_tries == 5:
            # mut_name 'AH52R' -> full residue ID 'RH52' (mutant residue + chain + position)
            full_residue_IDs_list = [f'{mut_name[-1]}{mut_name[1:-1]}' for mut_name in random_mutations_list]
            model = MC_Model(
                model_dir = model_dir,
                full_residue_IDs_list = full_residue_IDs_list,
                backbone_PDB_file_name = PDB_name,
                antibody_stability_dG_original_wildtype = antibody_stability_dG_original_wildtype,
                antibody_seq_map_original_wildtype = antibody_seq_map_original_wildtype,
                allowed_AA_mutations_per_position_map = allowed_AA_mutations_per_position_map,
            )
        else:
            shutil.rmtree(model_dir) # Empty the directory and try again with a new random list of mutations

        n_tries += 1

    clean_up_model_dir(model_dir = model_dir, PDB_file_name_to_keep_as_model = f'{PDB_name}_1_Alanine_mutant_1.pdb')
    return model
GLOBALS): 179 | futures = [] 180 | for PDB_file_path in backbone_PDB_files_paths: 181 | PDB_dir = evolvex_working_dir / PDB_file_path.stem 182 | PDB_name = PDB_dir.name 183 | 184 | foldx_Alanine_mutant_PDB_file_path = PDB_dir / f'{PDB_name}_1_Alanine_mutant.pdb' 185 | 186 | search_output_dir = PDB_dir / 'search_results'; search_output_dir.mkdir(exist_ok = True) 187 | 188 | all_mutations_summary_file_path = PDB_dir / 'hotspot_mutants' / 'all_mutations_summary.csv' 189 | if not all_mutations_summary_file_path.exists(): 190 | print(f'Could not find the all_mutations_summary.csv file for {PDB_name = }, this should not happen ! Skipping PDB backbone for search.', flush=True) 191 | continue 192 | 193 | allowed_mut_names_per_position_map, allowed_AA_mutations_per_position_map = get_allowed_mutations_per_position_maps( 194 | PDB_name = PDB_name, all_mutations_summary_file_path = all_mutations_summary_file_path 195 | ) 196 | 197 | antibody_stability_dG_original_wildtype = get_chain_group_stability_dG(indiv_file_path = PDB_dir / 'Indiv_energies_original_wildtype_AC.fxout', chain_group_name = GLOBALS.antibody_chains) 198 | antibody_seq_map_original_wildtype = get_chain_to_sequence_map(PDB_file_path = foldx_Alanine_mutant_PDB_file_path, chain_subset = GLOBALS.antibody_chains) 199 | 200 | for ith_model in range(GLOBALS.population_size): 201 | model_dir = search_output_dir / str(ith_model); model_dir.mkdir(exist_ok=True) 202 | future = parallel_executor.submit( 203 | generate_random_model, PDB_name, foldx_Alanine_mutant_PDB_file_path, allowed_mut_names_per_position_map, allowed_AA_mutations_per_position_map, 204 | antibody_stability_dG_original_wildtype, antibody_seq_map_original_wildtype, model_dir, GLOBALS 205 | ) 206 | futures.append(future) 207 | 208 | initial_models_population = parallel_executor.gather(futures) 209 | 210 | return initial_models_population -------------------------------------------------------------------------------- /src/evolvex/mutate_interface.py: 
def generate_Alanine_mutant(PDB_file_path, PDB_positions_to_explore_df, evolvex_working_dir, GLOBALS):
    """
    Mutate every MakeAla == 'Y' position of the backbone to Alanine with FoldX BuildModel.

    The backbone PDB is first copied into <evolvex_working_dir>/<PDB name>/ so the
    original file in the Backbones folder is never touched. AnalyseComplex is also run
    on the original wildtype complex, whose antibody stability is needed by the searches.

    Returns
    -------
    (foldx_Alanine_mutant_PDB_file_path, AUTO_Ala_positions_full_residue_IDs, output_dir)
        The second element lists the full residue IDs (e.g. 'KH52') of the Ala-mutated
        positions marked AA_Allowed == 'AUTO'.

    NOTE: adds a 'full_residue_ID' column to the caller's PDB_positions_to_explore_df.
    """
    PDB_file_name = PDB_file_path.stem
    output_dir = evolvex_working_dir / PDB_file_name; output_dir.mkdir(exist_ok=True)

    # PDB_file_path corresponds to the original file in the Backbones folder, which we do not want to modify. The PDB_file_path_copy is
    # located in the corresponding subfolder named after the PDB file name in evolvex_working_dir
    # (shutil.copy returns dst unchanged when dst names a file, so PDB_file_path_copy stays a Path)
    PDB_file_path_copy = shutil.copy(
        src = PDB_file_path,
        dst = output_dir / f'{PDB_file_name}.pdb'
    )

    antibody_chains, antigen_chains = GLOBALS.antibody_chains, GLOBALS.antigen_chains

    # PositionsToExplore possibilities:
    # 1) AA_Allowed=AUTO & MakeAla=Y --> Mutate to Alanine and find which mutations are worth exploring automatically
    # 2) AA_Allowed=string of AA & MakeAla=Y --> Mutate to Alanine and only allow the specified mutations
    # 3) AA_Allowed=string of AA & MakeAla=N --> Do not mutate to Alanine (i.e keep the wildtype residue) and only allow the specified mutations
    PDB_positions_to_explore_df['full_residue_ID'] = PDB_positions_to_explore_df.apply(lambda row:f'{row.Res1}{row.Chain}{row.number}', axis=1)
    positions_to_Ala_df = PDB_positions_to_explore_df[PDB_positions_to_explore_df['MakeAla'] == 'Y']
    if positions_to_Ala_df.empty:
        raise ValueError('There must be at least 1 position to mutate to Alanine.')

    positions_to_Ala_full_residue_IDs = positions_to_Ala_df.full_residue_ID.values
    Alanine_mutant = get_alanine_mutant(positions_to_Ala_full_residue_IDs)
    individual_list_foldx_mutations_file_path = create_individual_list_foldx_mutations_file(mutant = Alanine_mutant, output_dir = output_dir)

    foldx_Alanine_mutant_PDB_file_path, foldx_wildtype_PDB_file_path = run_foldx_BuildModel(
        foldx_dir=GLOBALS.foldx_dir,
        PDB_file_dir=PDB_file_path_copy.parent, PDB_file_name=PDB_file_name,
        individual_list_foldx_mutations_file_path=individual_list_foldx_mutations_file_path,
        move_neighbors_flag=False, # At this stage we do not need an optimized structure, it will get optimized later
        vdwDesign=GLOBALS.vdwDesign,
        print_stdout=GLOBALS.print_stdout,
        output_dir=output_dir, output_file_tag='Alanine_mutant', PDB_file_tag='Alanine_mutant'
    )

    # The stability of the original wildtype antibody is needed for the MC and GA searches
    run_foldx_AnalyseComplex(
        GLOBALS.foldx_dir,
        PDB_file_path_copy.parent, PDB_file_path_copy.stem,
        antibody_chains, antigen_chains,
        GLOBALS.vdwDesign,
        GLOBALS.print_stdout,
        output_dir, output_file_tag='original_wildtype'
    )

    AUTO_Ala_positions_full_residue_IDs = positions_to_Ala_df[positions_to_Ala_df['AA_Allowed'] == 'AUTO'].full_residue_ID.values
    return foldx_Alanine_mutant_PDB_file_path, AUTO_Ala_positions_full_residue_IDs, output_dir
def generate_mutations_summary_file(PDB_dir, PDB_positions_to_explore_df, GLOBALS):
    """
    Build hotspot_mutants/all_mutations_summary.csv for one PDB backbone.

    A summary DataFrame is created per explored position and all of them are
    concatenated into a single CSV (columns binding_ddG, complex_stability_ddG,
    antibody_stability_ddG, original_residue, position; indexed by mutation name,
    e.g. 'AH27C').

    Positions with an explicit AA_Allowed string get an 'artificial' summary with all
    energies set to the sentinel -100 (no FoldX runs exist for them); positions with
    AA_Allowed == 'AUTO' are parsed from the FoldX output files of each individual
    mutation folder.
    """
    PDB_name = PDB_dir.name

    # Create a summary_df per position, and then combine them all into one final df
    per_position_summary_dfs = []
    for _, row in PDB_positions_to_explore_df.iterrows():
        full_original_residue_ID = f'{row.Res1}{row.Chain}{row.number}'

        output_dir = PDB_dir / 'hotspot_mutants' / full_original_residue_ID

        binding_ddG_position, complex_stability_ddG_position, antibody_stability_ddG_position, index = [], [], [], []
        if row.AA_Allowed != 'AUTO':
            # raise (not assert, which is stripped under 'python -O') on invalid input
            if len(row.AA_Allowed) == 0:
                raise ValueError(f"Rows where AA_Allowed is not 'AUTO' must have at least one specified residue ({row = })")

            output_dir.mkdir(exist_ok=True)

            # Generate an 'artificial' summary df where all fields are set to -100 for each amino acid
            values = [-100] * len(row.AA_Allowed)
            binding_ddG_position.extend(values)
            complex_stability_ddG_position.extend(values)
            antibody_stability_ddG_position.extend(values)

            # The 'wildtype' side of the mutation name is Alanine when the position was Ala-scanned
            wildtype_AA = 'A' if row.MakeAla == 'Y' else full_original_residue_ID[0]
            index.extend(f'{wildtype_AA}{full_original_residue_ID[1:]}{mutant_AA}' for mutant_AA in row.AA_Allowed)

        else:
            for mut_name_dir in output_dir.iterdir(): # Iterate over each individual mutation folder that was generated (e.g AH27C, AH27D, ...)
                if not mut_name_dir.is_dir():
                    continue

                binding_ddG = get_binding_ddG(
                    wildtype_interaction_file_path = mut_name_dir / 'Interaction_hotspot_wildtype_AC.fxout', # Wildtype = alanine mutant
                    mutant_interaction_file_path = mut_name_dir / 'Interaction_hotspot_mutant_AC.fxout'
                )
                complex_stability_ddG = get_complex_stability_ddG(
                    dif_file_path = mut_name_dir / f'Dif_hotspot_mutant_{PDB_name}_1_Alanine_mutant.fxout'
                )
                antibody_stability_ddG = get_chain_group_stability_ddG(
                    wildtype_indiv_file_path = mut_name_dir / 'Indiv_energies_hotspot_wildtype_AC.fxout',
                    mutant_indiv_file_path = mut_name_dir / 'Indiv_energies_hotspot_mutant_AC.fxout',
                    chain_group_name = GLOBALS.antibody_chains
                )

                binding_ddG_position.append(binding_ddG)
                complex_stability_ddG_position.append(complex_stability_ddG)
                antibody_stability_ddG_position.append(antibody_stability_ddG)

                index.append(mut_name_dir.name)


        summary_df = pd.DataFrame(
            data = {
                'binding_ddG':binding_ddG_position,
                'complex_stability_ddG':complex_stability_ddG_position,
                'antibody_stability_ddG':antibody_stability_ddG_position
            },
            index = index
        ).sort_index()
        summary_df['original_residue'] = full_original_residue_ID[0]
        summary_df['position'] = full_original_residue_ID[2:]

        per_position_summary_dfs.append(summary_df)

    if len(per_position_summary_dfs) == 0:
        # Bug fix: previously execution fell through to pd.concat([]), which raises an
        # opaque ValueError. Report the problem and write no summary file instead.
        print(f'Something went wrong when generating the mutations summary file for {PDB_name = }.')
        return

    pd.concat(per_position_summary_dfs, axis=0).round(decimals = 6).to_csv(PDB_dir / 'hotspot_mutants' / 'all_mutations_summary.csv')
    return
def all_hotspot_and_acceptable_mutations_combinations_generator(all_mutations_summary_file_path):
    """
    Generator that yields all possible combinations of the hotspot and acceptable mutations.

    Each yielded value is a tuple of mutation names, one per position: every hotspot
    combination is completed with every acceptable combination of the remaining
    (non-hotspot) positions. Combinations are visited in random order.
    """
    # dtype str keeps the 'position' groupby keys consistent with the string keys of the
    # hotspot map (sliced from mutation names), matching get_allowed_mutations_per_position_maps.
    # Without it, the hotspot-position removal below never matches (int vs str keys).
    all_mutations_summary_df = pd.read_csv(all_mutations_summary_file_path, header=0, index_col=0, dtype = {'position':str})

    # The summary file lives at <PDB_dir>/hotspot_mutants/all_mutations_summary.csv, so the
    # PDB name (required second argument, used only in warning messages) is two levels up.
    PDB_name = all_mutations_summary_file_path.parent.parent.name
    acceptable_positions_mut_names_map = get_acceptable_positions_mut_names_map(all_mutations_summary_df, PDB_name)
    hotspot_positions_mut_names_map = get_hotspot_positions_mut_names_map(all_mutations_summary_df)

    # Remove hotspot positions from acceptable positions (pop with default: 'del' would
    # raise KeyError for a hotspot position absent from the acceptable map)
    for position in hotspot_positions_mut_names_map.keys():
        acceptable_positions_mut_names_map.pop(position, None)

    ### NOTE: This only explores combinations of mutations in one order, do we want to get mutation order variability for BuildModel ? Would need to additionally use itertools.combinations.
    all_hotspot_mutations_combinations = list(itertools.product(*hotspot_positions_mut_names_map.values())) # hotspot_positions_mut_names_map.values() returns a list of sublists, where each sublist are the mutations for a given position
    if len(all_hotspot_mutations_combinations) == 0:
        all_hotspot_mutations_combinations = [()]  # empty tuple (not list) so '+' below always concatenates tuples
    random.shuffle(all_hotspot_mutations_combinations)

    all_acceptable_mutations_combinations = list(itertools.product(*acceptable_positions_mut_names_map.values()))
    if len(all_acceptable_mutations_combinations) == 0:
        all_acceptable_mutations_combinations = [()]
    random.shuffle(all_acceptable_mutations_combinations)

    for hotspot_mutations_list in all_hotspot_mutations_combinations:
        for acceptable_mutations_list in all_acceptable_mutations_combinations:
            mutations_list = hotspot_mutations_list + acceptable_mutations_list

            yield mutations_list

    return
def systematic_search(parallel_executor, backbone_PDB_files_paths, evolvex_working_dir, GLOBALS):
    """
    Exhaustively build one model per combination of hotspot/acceptable mutations, for
    every backbone PDB. Models are built in parallel via *parallel_executor*; each
    combination gets its own numbered sub-directory under <PDB_dir>/search_results.
    """
    for PDB_file_path in backbone_PDB_files_paths:
        PDB_dir = evolvex_working_dir / PDB_file_path.stem
        PDB_name = PDB_dir.name

        search_output_dir = PDB_dir / 'search_results'; search_output_dir.mkdir(exist_ok = True) ### Should be False
        all_mutations_summary_file_path = PDB_dir / 'hotspot_mutants' / 'all_mutations_summary.csv'

        # Loop-invariant: the same Alanine-mutant backbone is used for every combination
        foldx_Alanine_mutant_PDB_file_path = PDB_dir / f'{PDB_name}_1_Alanine_mutant.pdb'

        futures = []
        for nth_model, mutations_list in enumerate(all_hotspot_and_acceptable_mutations_combinations_generator(all_mutations_summary_file_path)):
            output_dir = search_output_dir / str(nth_model)

            future = parallel_executor.submit(
                create_model, foldx_Alanine_mutant_PDB_file_path, True, mutations_list, output_dir, GLOBALS,
            )
            futures.append(future)

        wait_and_remove(parallel_executor, futures)

    return
def metropolis_criterion(energies):
    """
    Metropolis acceptance test over a collection of ddG values.

    Any favourable energy (< 0) is accepted outright. Otherwise a single uniform
    draw is shared by all energies, and the move is accepted if any Boltzmann
    probability exp(-E / 0.5919) reaches it (0.5919 kcal/mol ~ kT at room temperature).
    """
    for delta in energies:
        if delta < 0:
            return True

    threshold = random.random()
    return any(math.exp(-(delta / 0.5919)) >= threshold for delta in energies)
get_binding_dG(interaction_file_path = model_dir / f'Interaction_model_1_AC.fxout') 176 | binding_ddG = round(mutant_binding_dG - wildtype_binding_dG, NDIGIS_ROUNDING) 177 | 178 | if GLOBALS.calculate_binding_dG_with_water: 179 | wildtype_binding_dG_with_waters = get_binding_dG(interaction_file_path = model_dir / f'Interaction_wildtype_with_waters_AC.fxout') 180 | mutant_binding_dG_with_waters = get_binding_dG(interaction_file_path = model_dir / f'Interaction_mutant_with_waters_AC.fxout') 181 | binding_ddG_with_waters = round(mutant_binding_dG_with_waters - wildtype_binding_dG_with_waters, NDIGIS_ROUNDING) 182 | 183 | wildtype_antibody_intraclash_score = get_chain_group_intraclash_score(interaction_file_path = model_dir / f'Interaction_WT_model_1_AC.fxout', chain_group_name = antibody_chains) 184 | mutant_antibody_intraclash_score = get_chain_group_intraclash_score(interaction_file_path = model_dir / f'Interaction_model_1_AC.fxout', chain_group_name = antibody_chains) 185 | antibody_delta_intraclash_score = round(mutant_antibody_intraclash_score - wildtype_antibody_intraclash_score, NDIGIS_ROUNDING) 186 | 187 | # All columns from interactions file, starting from Backbone Hbond 188 | wildtype_other_info_map = get_all_other_interaction_file_info(interaction_file_path = model_dir / f'Interaction_WT_model_1_AC.fxout') 189 | mutant_other_info_map = get_all_other_interaction_file_info(interaction_file_path = model_dir / f'Interaction_model_1_AC.fxout') 190 | 191 | 192 | # Make decision 193 | if antibody_stability_ddG > 0.5 or mutant_antibody_stability_dG > (antibody_stability_dG_original_wildtype + 2): 194 | keep_mutant = False 195 | 196 | elif antibody_delta_intraclash_score > 0.5 or mutant_antibody_intraclash_score > 10: 197 | keep_mutant = False 198 | 199 | else: 200 | energies = (binding_ddG, binding_ddG_with_waters) if GLOBALS.calculate_binding_dG_with_water else (binding_ddG,) 201 | keep_mutant = metropolis_criterion(energies) 202 | 203 | # Log info of the selected 
def update_full_residue_IDs_list(model, mut_names_list):
    """
    Apply each accepted mutation to the model's residue-ID bookkeeping, in place.

    A mut_name like 'KH52R' means residue H52 changed from K to R: the stale entry
    'KH52' is removed from model.full_residue_IDs_list and 'RH52' appended.
    """
    for mut_name in mut_names_list:
        residue_ID = mut_name[1:-1]
        stale_entry = mut_name[0] + residue_ID
        fresh_entry = mut_name[-1] + residue_ID

        model.full_residue_IDs_list.remove(stale_entry)
        model.full_residue_IDs_list.append(fresh_entry)

    return
antigen_chains = GLOBALS.antibody_chains, GLOBALS.antigen_chains 237 | 238 | generated_models_info = defaultdict(list) 239 | generated_models_info['backbone_PDB_file_name'] = [backbone_PDB_file_name] * n_MC_steps 240 | generated_models_info['nth_model'] = [model.model_dir.name] * n_MC_steps 241 | generated_models_info['step'] = ['MC'] * n_MC_steps 242 | 243 | current_nth_MC_iteration = nth_loop * (n_MC_steps + 1) # +1 to account for the recombination steps 244 | for i in range(n_MC_steps): 245 | nth_iteration = current_nth_MC_iteration + i + 1 246 | generated_models_info['nth_iteration'].append(nth_iteration) 247 | 248 | full_residue_IDs_list = model.full_residue_IDs_list 249 | mut_name = get_random_mut_name(full_residue_IDs_list, allowed_AA_mutations_per_position_map) 250 | 251 | # Create the mutant with BuildModel, which will be called "model_1" 252 | model_dir = model.model_dir 253 | create_model( 254 | input_PDB_file_path = model_dir / 'model.pdb', copy_PDB_file_to_output_dir = False, mutations_list = [mut_name], output_dir = model_dir, GLOBALS = GLOBALS, 255 | ) 256 | 257 | run_foldx_commands( 258 | mutant_PDB_file_path = model_dir / 'model_1.pdb', wildtype_PDB_file_path = model_dir / 'WT_model_1.pdb', 259 | antibody_chains = antibody_chains, antigen_chains = antigen_chains, 260 | output_dir = model_dir, GLOBALS = GLOBALS 261 | ) 262 | 263 | keep_mutant = keep_mutant_decision(model_dir, antibody_chains, antigen_chains, antibody_stability_dG_original_wildtype, iteration_fraction, generated_models_info, GLOBALS) 264 | if keep_mutant: 265 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model_1.pdb') 266 | update_full_residue_IDs_list(model, mut_names_list = [mut_name]) 267 | 268 | else: 269 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model.pdb') 270 | 271 | save_compressed_PDB_file( 272 | PDB_file_path = model_dir / 'model.pdb', 273 | output_name = f'{backbone_PDB_file_name}_{model.model_dir.name}_{nth_iteration}.pdb', 274 
def random_model_pairing_generator(models_population):
    """
    Yield random disjoint pairs of models, pairing only models built on the same
    backbone PDB.

    Models are grouped by backbone, each group is shuffled, and consecutive shuffled
    models are paired. If a group has an odd number of models, the last (unpaired)
    model is not yielded.
    """
    backbone_PDB_grouped_models_map = defaultdict(list)
    for model in models_population:
        backbone_PDB_file_name = model.backbone_PDB_file_name
        backbone_PDB_grouped_models_map[backbone_PDB_file_name].append(model)

    for _, backbone_models_list in backbone_PDB_grouped_models_map.items():
        random.shuffle(backbone_models_list)

        # Bug fix: stop at len - 1 so an odd-sized group no longer raises IndexError on
        # backbone_models_list[i+1] for the final, unpaired model.
        for i in range(0, len(backbone_models_list) - 1, 2):
            yield (backbone_models_list[i], backbone_models_list[i+1])

    return
model_2.full_residue_IDs_list} 314 | 315 | shared_residue_IDs = set.intersection(set(residue_ID_to_AA_map_1.keys()), set(residue_ID_to_AA_map_2.keys())) 316 | sorted_shared_residue_IDs = sorted(shared_residue_IDs, key = lambda residue_ID:(residue_ID[0], int(residue_ID[1:]))) 317 | if len(sorted_shared_residue_IDs) < 2: 318 | raise ValueError(f'Cannot perform recombination between {model_1.model_dir} and {model_2.model_dir} because {shared_residue_IDs = }, and at least 2 shared residue IDs are needed.') 319 | 320 | recombination_location = random.randint(1, len(sorted_shared_residue_IDs) - 1) 321 | mut_names_1, mut_names_2 = [], [] 322 | for residue_ID in sorted_shared_residue_IDs[:recombination_location]: 323 | AA_1 = residue_ID_to_AA_map_1[residue_ID] 324 | AA_2 = residue_ID_to_AA_map_2[residue_ID] 325 | 326 | mut_names_1.append(f'{AA_1}{residue_ID}{AA_2}') # e.g 'KH52R' 327 | mut_names_2.append(f'{AA_2}{residue_ID}{AA_1}') 328 | 329 | return (mut_names_1, mut_names_2) 330 | 331 | def make_recombination_step(model_1, model_2, nth_iteration, iteration_fraction, model_PDB_files_dir, GLOBALS): 332 | # These variables are the same for both model 1 and 2 333 | backbone_PDB_file_name = model_1.backbone_PDB_file_name 334 | antibody_stability_dG_original_wildtype = model_1.antibody_stability_dG_original_wildtype 335 | antibody_seq_map_original_wildtype = model_1.antibody_seq_map_original_wildtype 336 | antibody_chains, antigen_chains = GLOBALS.antibody_chains, GLOBALS.antigen_chains 337 | 338 | generated_models_info = defaultdict(list) 339 | generated_models_info['backbone_PDB_file_name'] = [backbone_PDB_file_name] * 2 340 | generated_models_info['nth_model'] = [model_1.model_dir.name, model_2.model_dir.name] 341 | generated_models_info['step'] = ['recombination'] * 2 342 | generated_models_info['nth_iteration'] = [nth_iteration] * 2 343 | 344 | mut_names_1, mut_names_2 = get_recombination_mut_names(model_1, model_2) 345 | for mut_names, model in [(mut_names_1, model_1), 
(mut_names_2, model_2)]: 346 | model_dir = model.model_dir 347 | full_residue_IDs_list = model.full_residue_IDs_list 348 | 349 | create_model( 350 | input_PDB_file_path = model_dir / 'model.pdb', copy_PDB_file_to_output_dir = False, mutations_list = mut_names, output_dir = model_dir, GLOBALS = GLOBALS 351 | ) 352 | 353 | run_foldx_commands( 354 | mutant_PDB_file_path = model_dir / 'model_1.pdb', wildtype_PDB_file_path = model_dir / 'WT_model_1.pdb', 355 | antibody_chains = antibody_chains, antigen_chains = antigen_chains, 356 | output_dir = model_dir, GLOBALS = GLOBALS 357 | ) 358 | 359 | keep_mutant = keep_mutant_decision(model_dir, antibody_chains, antigen_chains, antibody_stability_dG_original_wildtype, iteration_fraction, generated_models_info, GLOBALS) 360 | if keep_mutant: 361 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model_1.pdb') 362 | update_full_residue_IDs_list(model, mut_names_list = mut_names) 363 | 364 | else: 365 | clean_up_model_dir(model_dir, PDB_file_name_to_keep_as_model = 'model.pdb') 366 | 367 | save_compressed_PDB_file( 368 | PDB_file_path = model_dir / 'model.pdb', 369 | output_name = f'{backbone_PDB_file_name}_{model.model_dir.name}_{nth_iteration}.pdb', 370 | output_dir = model_PDB_files_dir 371 | ) 372 | 373 | generated_models_info['residue_IDs'].append(';'.join(full_residue_IDs_list)) 374 | generated_models_info['from_mut_name'].append(';'.join(mut_names)) 375 | generated_models_info['mutation_accepted'].append(keep_mutant) 376 | 377 | return model_1, model_2, generated_models_info 378 | 379 | def GA_search(parallel_executor, initial_models_population, generated_models_info_file_path, model_PDB_files_dir, GLOBALS): 380 | recombine_every_nth_iteration = GLOBALS.recombine_every_nth_iteration 381 | max_iterations = GLOBALS.max_iterations 382 | 383 | # Each loop = n MC steps + 1 recombination step 384 | n_MC_and_recombination_loops = max_iterations // recombine_every_nth_iteration 385 | n_MC_steps_per_loop = 
recombine_every_nth_iteration - 1 386 | 387 | generated_models_info_file_handle = open(generated_models_info_file_path, 'a') 388 | 389 | models_population = initial_models_population 390 | for nth_loop in range(n_MC_and_recombination_loops): 391 | iteration_fraction = nth_loop / n_MC_and_recombination_loops 392 | 393 | # MC steps 394 | futures = [] 395 | for model in models_population: 396 | future = parallel_executor.submit(make_MC_steps, model, n_MC_steps_per_loop, nth_loop, iteration_fraction, model_PDB_files_dir, GLOBALS) 397 | futures.append(future) 398 | 399 | models_population = [] 400 | for _, (model, generated_models_info) in as_completed(futures, with_results=True): 401 | models_population.append(model) 402 | write_generated_models_info(generated_models_info, generated_models_info_file_handle) 403 | parallel_executor.cancel(futures) 404 | 405 | # Recombination step. The models are recombined with a model from the same backbone. 406 | nth_iteration = (nth_loop + 1) * recombine_every_nth_iteration 407 | futures = [] 408 | for model_1, model_2 in random_model_pairing_generator(models_population): 409 | future = parallel_executor.submit(make_recombination_step, model_1, model_2, nth_iteration, iteration_fraction, model_PDB_files_dir, GLOBALS) 410 | futures.append(future) 411 | 412 | models_population = [] 413 | for _, (model_1, model_2, generated_models_info) in as_completed(futures, with_results=True): 414 | models_population.append(model_1) 415 | models_population.append(model_2) 416 | write_generated_models_info(generated_models_info, generated_models_info_file_handle) 417 | parallel_executor.cancel(futures) 418 | 419 | generated_models_info_file_handle.close() 420 | return -------------------------------------------------------------------------------- /src/evolvex/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import tarfile 3 | 4 | NDIGIS_ROUNDING = 4 5 | 6 | def save_compressed_PDB_file(PDB_file_path, 
output_name, output_dir): 7 | with tarfile.open(output_dir / f'{output_name}.tar.gz', 'w:gz') as tar_file_handle: 8 | tar_file_handle.add(PDB_file_path, arcname=output_name) 9 | return -------------------------------------------------------------------------------- /src/evolvex/utils_bio.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from Bio import SeqIO 4 | from Bio.PDB import PDBParser 5 | from Bio.Data.IUPACData import protein_letters_3to1 6 | 7 | from Bio import BiopythonParserWarning 8 | from Bio.PDB.PDBExceptions import PDBConstructionWarning 9 | 10 | warnings.simplefilter('ignore', PDBConstructionWarning) 11 | warnings.simplefilter('ignore', BiopythonParserWarning) 12 | 13 | def get_residue_ID_to_residue_name_map(PDB_file_path): 14 | """ 15 | Returns a dict where keys are residue IDs (e.g A52, for residue 52 in chain A) and keys are single letter residues (e.g G for Glycine) 16 | """ 17 | parser = PDBParser(QUIET=True) 18 | structure = parser.get_structure(id='', file=PDB_file_path) 19 | 20 | residue_ID_to_residue_name_map = {} 21 | for residue in structure.get_residues(): 22 | _, _, chain, (_, number, _) = residue.full_id 23 | residue_ID = f'{chain}{number}' 24 | residue_name = protein_letters_3to1[residue.resname.title()] 25 | 26 | residue_ID_to_residue_name_map[residue_ID] = residue_name 27 | 28 | return residue_ID_to_residue_name_map 29 | 30 | def get_chain_to_sequence_map(PDB_file_path, chain_subset): 31 | chain_to_sequence_map = { 32 | record.annotations['chain']:str(record.seq).replace('X', '') # When extracting the sequences from ATOMS lines, SeqIO.parse introduces an 'X' when there are consecutive residues with a number difference > 1 (e.g: A1,Y2,P5 => AYXXP), but we don't want that. 
33 | for record in SeqIO.parse(PDB_file_path, format = 'pdb-atom') 34 | if record.annotations['chain'] in chain_subset 35 | } 36 | return chain_to_sequence_map --------------------------------------------------------------------------------