├── LICENSE
├── README.md
├── data
│   ├── Ab8.csv
│   ├── PDGF38_heavy.csv
│   ├── PDGF38_light.csv
│   └── PDGF38_raw.csv
├── images
│   └── PfAbNet-viscosity_workflow.png
├── notebooks
│   ├── 1_preprocess.ipynb
│   ├── 2_build_hm.ipynb
│   ├── 3_validation.ipynb
│   ├── 4_attributions.ipynb
│   ├── 5_sensitivity.ipynb
│   └── __init__.py
├── pfabnet
│   ├── __init__.py
│   ├── base.py
│   ├── dataset.py
│   ├── esp_generator.py
│   ├── generate_attributions.py
│   ├── generate_testset_attributions.py
│   ├── model.py
│   ├── predict.py
│   ├── sbatch_tmpl.sh
│   ├── train.py
│   ├── trainer.py
│   └── utils.py
├── pfabnet_eisenberg
│   ├── __init__.py
│   ├── base.py
│   ├── dataset.py
│   ├── eisenberg_generator.py
│   ├── model.py
│   ├── predict.py
│   ├── train.py
│   ├── trainer.py
│   └── utils.py
└── requirements.txt
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PfAbNet-viscosity
2 | This repository accompanies the manuscript "Low-Data Interpretable Deep Learning Prediction of Antibody Viscosity using a Biophysically Meaningful Representation." The code and notebooks in this repository can be used to train PfAbNet-viscosity models, generate test set predictions, and reproduce all analyses reported in the manuscript.
3 | 
4 |
5 | This workflow requires licenses for the following software/toolkits:
6 | 1. BioLuminate (Schrödinger, LLC)
7 | 2. oechem, oespicoli, and oezap toolkits (OpenEye Scientific Software)
8 |
9 | Run the Jupyter notebooks in the following order to reproduce the analyses presented in the manuscript:
10 | 1. ```1_preprocess.ipynb```: Retrieve and process the raw data (measured viscosity and antibody sequences)
11 | 2. ```2_build_hm.ipynb```: Build homology models. Analyze and plot dataset diversity.
12 | 3. ```3_validation.ipynb```: Train PfAbNet-PDGF and PfAbNet-Ab21 models. Generate test set predictions and performance plots.
13 | 4. ```4_attributions.ipynb```: Perform attribution analysis.
14 | 5. ```5_sensitivity.ipynb```: Perform sensitivity analysis.
15 |
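As an aside on the data layout: `data/PDGF38_heavy.csv` and `data/PDGF38_light.csv` encode each variant as differences from the parent AB-001, with `-` meaning "same residue as the parent at this position". A minimal sketch of expanding that dash notation into full sequences (the helper name is ours, not part of the repository):

```python
def expand_variant(parent: str, variant: str) -> str:
    """Expand dash notation: '-' keeps the parent residue,
    any letter is a substitution at that position."""
    assert len(parent) == len(variant), "regions must be aligned"
    return "".join(p if v == "-" else v for p, v in zip(parent, variant))

# FW3 of the heavy chain: 4QCI differs from AB-001 only at the
# last position (K -> R), written in the CSV as 31 dashes then 'R'.
parent_fw3 = "RFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK"
variant_fw3 = "-" * 31 + "R"
print(expand_variant(parent_fw3, variant_fw3))
# -> RFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR
```

Applying the same expansion column by column (FW1, CDR1, ..., FW4) and concatenating yields each variant's full variable-domain sequence.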
16 | ## Training
17 | The following command can be used to train PfAbNet models from the command line once the required input files have been created (see the Jupyter notebooks for how to specify the input arguments).
18 |
19 | For example, the following trains models on the PDGF38 dataset.
20 |
21 | ```
22 | python pfabnet/train.py --training_data_file data/PDGF.csv \
23 | --homology_model_dir data/hm \
24 | --output_model_prefix PfAbNet-PDGF38 \
25 | --output_model_dir models/pdgf38
26 | ```
27 |
28 | ## Inference
29 | The following command can be used to generate predictions for a test antibody from a .mol2 file with charges (see the Jupyter notebooks for how to specify the input arguments).
30 |
31 | ```
32 | python pfabnet/predict.py --structure_file data/hm/mAb1.mol2 \
33 | --PfAbNet_model_dir models/pdgf38 \
34 | --PfAbNet_model_prefix PfAbNet-PDGF38 \
35 |                        --output_file models/pdgf38/mAb1.csv
36 | ```
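Predictions written this way can then be joined with the measured values for comparison. A sketch under stated assumptions: the measured rows below are copied from `data/PDGF38_raw.csv`, while the prediction column name and numbers are placeholders (the actual output format of `predict.py` may differ):

```python
import io
import pandas as pd

# Measured values copied from data/PDGF38_raw.csv.
measured = pd.read_csv(io.StringIO(
    "Entity,Viscosity_at_150,SCM\n"
    "R2-001,37,-1503\n"
    "R2-004,54,-1480\n"
))

# Placeholder predictions -- illustrative values only, not model output.
predicted = pd.DataFrame({
    "Entity": ["R2-001", "R2-004"],
    "Predicted_viscosity": [41.0, 50.5],
})

# Join on antibody name to compare measured vs. predicted side by side.
merged = measured.merge(predicted, on="Entity")
print(merged[["Entity", "Viscosity_at_150", "Predicted_viscosity"]])
```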
37 |
38 |
--------------------------------------------------------------------------------
/data/Ab8.csv:
--------------------------------------------------------------------------------
1 | Entity,Viscosity_at_150,SCM
2 | TGN1412,16.42,844.6
3 | Basiliximab,25.05,640.8
4 | Natalizumab,13.67,815.5
5 | Tremelimumab,8.8,704.2
6 | Ipilimumab,8.6,754.0
7 | Atezolizumab,11.56,759.6
8 | Ganitumab,10.1,806.5
9 | Vesencumab,23.57,661.3
10 |
--------------------------------------------------------------------------------
/data/PDGF38_heavy.csv:
--------------------------------------------------------------------------------
1 | Name FW1 CDR1 FW2 CDR2 FW3 CDR3 FW4
2 | AB-001 EVQLLESGGGLVQPGGSLRLSCAAS GFTFSSYAMS WVRQAPGKGLEWVS YISDDGSLKYYADSVKG RFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK HPYWYGGQLDL WGQGTLVTVSS
3 | 4QCI ----V-------------------- ---------- -------------- ----------------- -------------------------------R ----------- -----------
4 | R1-002 -----Q------------------- ---------- -------------- ----------------- -------------------------------R ----------- -----------
5 | R1-003 ------------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
6 | R1-004 ----------------------R-- ---------- -------------- ----------------- -------------------------------R ----------- -----------
7 | R1-005 ------------------------- ---------- -------------- ---N------------- -------------------------------R ----------- -----------
8 | R1-006 ------------------------- ---------- -------------- ----------------- -------------------------------R ----------- --R--------
9 | R1-007 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
10 | R1-008 ------------------------- ---------- -------------- ----------------- -------------------------------- ----------- -----------
11 | R1-009 ------------------------- ---------- -------------- ----------------- -------------------------------- ----------- -----------
12 | R1-010 ------------------------- ---------- -------------- ----------------- -------------------------------- ----------- -----------
13 | R1-011 ------------------------- ---------- -------------- ----------------- -------------------------------- ----------- -----------
14 | R1-012 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
15 | R1-013 -----Q------------------- ---------- -------------- ----------------- -------------------------------R ----------- -----------
16 | R1-014 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
17 | R1-015 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
18 | R1-016 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
19 | R1-017 ------------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
20 | R1-018 ------------------------- ---------- -------------- ----------------- -------------------------------R ----------- --R--------
21 | R2-001 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
22 | R2-002 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
23 | R2-003 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
24 | R2-004 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
25 | R2-005 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
26 | R2-006 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
27 | R2-007 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
28 | R2-008 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
29 | R2-009 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
30 | R2-010 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R --H-------- -----------
31 | R2-011 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R -------K--- -----------
32 | R2-012 -----Q------K------------ ---------- -------------- ---K------------- -------------------------------R ----------- -----------
33 | R2-013 -----Q------K------------ ---------- -------------- ---N------------- -------------------------------R ----------- -----------
34 | R2-014 -----Q------K------------ ---------- -------------- ----Q------------ -------------------------------R ----------- -----------
35 | R2-015 -----Q------K------------ ---------- -------------- ------------N---- -------------------------------R ----------- -----------
36 | R2-016 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ---------N- -----------
37 | R2-017 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ---------Y- -----------
38 | R2-018 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
39 | R2-019 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
40 | R2-020 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
41 | R2-021 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
42 | R2-022 -----Q------K------------ ---------- -------------- ----------------- -------------------------------R ----------- -----------
43 |
44 |
--------------------------------------------------------------------------------
/data/PDGF38_light.csv:
--------------------------------------------------------------------------------
1 | Name FW1 CDR1 FW2 CDR2 FW3 CDR3 FW4
2 | AB-001 SYELTQPPSVSVSPGQTASITC SGDSLGSYFVH WYQQKPGQSPVLVIY DDSNRPS GIPERFSGSNSGNTATLTISGTQAMDEADYYC SAFTHNSDV FGGGTKLTVL
3 | 4QCI ------------A-----R-S- ----------- --------A------ ------- ------------------------E------- --------- ----------
4 | R1-002 ---------------------- ----------- --------------- ------- -------------------------------- --------- ----------
5 | R1-003 ---------------------- ----------- --------------- ------- -------------------------------- --------- ----------
6 | R1-004 ---------------------- ----------- --------------- ------- -------------------------------- --------- ----------
7 | R1-005 ---------------------- ----------- --------------- ------- -------------------------------- --------- ----------
8 | R1-006 ---------------------- ----------- --------------- ------- -------------------------------- --------- ----------
9 | R1-007 ---------------------- ----------- --------------- ------- -------------------------------- --------- ----------
10 | R1-008 --V------------------- ----------- --------------- ------- -------------------------------- --------- ----------
11 | R1-009 ----------------R----- ----------- --------------- ------- -------------------------------- --------- ----------
12 | R1-010 ---------------------- ----------- --------------- ---K--- -------------------------------- --------- ----------
13 | R1-011 ---------------------- ----------- --------------- ------- -------------------------------- -------N- ----------
14 | R1-012 --V------------------- ----------- --------------- ------- -------------------------------- --------- ----------
15 | R1-013 ----------------R----- ----------- --------------- ------- -------------------------------- --------- ----------
16 | R1-014 ----------------R----- ----------- --------------- ------- -------------------------------- --------- ----------
17 | R1-015 ---------------------- ----------- --------------- ------- -------------------------------- -------N- ----------
18 | R1-016 ---------------------- ----------- --------------- ---K--- -------------------------------- --------- ----------
19 | R1-017 ----------------R----- ----------- --------------- ------- -------------------------------- --------- ----------
20 | R1-018 ----------------R----- ----------- --------------- ------- -------------------------------- --------- ----------
21 | R2-001 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
22 | R2-002 --V---------A--K--R--- ---K------- --------------- ---K--- -------------------------------- --------- ----------
23 | R2-003 --V---------A--K--R--- ----------K --------------- ---K--- -------------------------------- --------- ----------
24 | R2-004 --V---------A--K--R--- ----------- --------------H ---K--- -------------------------------- --------- ----------
25 | R2-005 --V---------A--K--R--- ----------- --------------R ---K--- -------------------------------- --------- ----------
26 | R2-006 --V---------A--K--R--- ----------- --------------- --KK--- -------------------------------- --------- ----------
27 | R2-007 --V---------A--K--R--- ----------- --------------- ---K--- --------K----------------------- --------- ----------
28 | R2-008 --V---------A--K--R--- ----------- --------------- ---K--- ----------K--------------------- --------- ----------
29 | R2-009 --V---------A--K--R--- ----------- --------------- ---K--- -----------K-------------------- --------- ----------
30 | R2-010 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
31 | R2-011 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
32 | R2-012 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
33 | R2-013 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
34 | R2-014 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
35 | R2-015 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
36 | R2-016 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
37 | R2-017 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- --------- ----------
38 | R2-018 --V---------A--K--R--- --N-------- --------------- ---K--- -------------------------------- --------- ----------
39 | R2-019 --V---------A--K--R--- ----------- --------------- L--K--- -------------------------------- --------- ----------
40 | R2-020 --V---------A--K--R--- ----------- --------------- -N-K--- -------------------------------- --------- ----------
41 | R2-021 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- -------K- ----------
42 | R2-022 --V---------A--K--R--- ----------- --------------- ---K--- -------------------------------- -------N- ----------
43 |
44 |
--------------------------------------------------------------------------------
/data/PDGF38_raw.csv:
--------------------------------------------------------------------------------
1 | Entity,Viscosity_at_150,SCM
2 | AB-001,440,-2213
3 | R1-002,288,-2008
4 | R1-003,523,-1985
5 | R1-004,310,-1961
6 | R1-005,190,-1838
7 | R1-006,314,-1941
8 | R1-007,233,-1988
9 | R1-008,567,-2085
10 | R1-009,430,-2180
11 | R1-010,99,-1898
12 | R1-011,519,-2035
13 | R1-012,471,-1861
14 | R1-013,414,-1972
15 | R1-014,415,-1983
16 | R1-015,452,-1817
17 | R1-016,73,-1706
18 | R1-017,1534,-1949
19 | R1-018,416,-1914
20 | R2-001,37,-1503
21 | R2-004,54,-1480
22 | R2-005,37,-1456
23 | R2-006,13,-1470
24 | R2-007,21,-1433
25 | R2-008,23,-1460
26 | R2-009,19,-1469
27 | R2-010,35,-1656
28 | R2-011,26,-1500
29 | R2-012,39,-1273
30 | R2-013,26,-1357
31 | R2-014,51,-1378
32 | R2-015,83,-1446
33 | R2-016,67,-1416
34 | R2-017,84,-1564
35 | R2-018,20,-1346
36 | R2-019,60,-1503
37 | R2-020,10,-1363
38 | R2-021,119,-1197
39 | R2-022,135,-1311
40 |
--------------------------------------------------------------------------------
/images/PfAbNet-viscosity_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pfizer-opensource/pfabnet-viscosity/60970a752a3e74cc336db13576a9c3a21448fe2e/images/PfAbNet-viscosity_workflow.png
--------------------------------------------------------------------------------
/notebooks/1_preprocess.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "735f2307-49ee-42de-99bd-d3acd30711d4",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import os\n",
11 | "import copy\n",
12 | "import numpy as np\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "id": "5cc62605-e842-49a0-a545-6c373969999f",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "try:\n",
24 | "    from pfabnet import base, utils\n",
25 | "    from pfabnet.base import ENTITY_KEY, VISCOSITY_KEY\n",
26 | "except ModuleNotFoundError:\n",
27 | "    os.chdir(os.getcwd() + '/../')\n",
28 | "    from pfabnet import base, utils\n",
29 | "    from pfabnet.base import ENTITY_KEY, VISCOSITY_KEY"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "id": "b8daec3b-7a25-4f51-9563-8672e2bdda92",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "BASE_DIR = os.path.dirname(base.get_file_path()) + '/../'\n",
40 | "DATA_DIR = os.path.join(BASE_DIR, 'data')\n",
41 | "RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')\n",
42 | "FASTA_DIR = os.path.join(DATA_DIR, 'fasta')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "id": "df9be626-a510-4969-b9c3-1fad57a016fd",
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# create data directory\n",
53 | "os.makedirs(DATA_DIR, exist_ok=True)\n",
54 | "os.makedirs(RAW_DATA_DIR, exist_ok=True)\n",
55 | "os.makedirs(FASTA_DIR, exist_ok=True)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "1502e6f4-6f1c-4cc8-b035-b92bb0500192",
61 | "metadata": {},
62 | "source": [
63 | "## Process Ab21 dataset"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 11,
69 | "id": "8a1a39e8-ceb7-4c8f-beb1-048ba7937c07",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "# Extract supplementary data from Lai et al. Mol. Pharmaceutics 2021, 18, 3, 1167–1175\n",
74 | "# https://pubs.acs.org/doi/suppl/10.1021/acs.molpharmaceut.0c01073/suppl_file/mp0c01073_si_001.zip\n",
75 | "\n",
76 | "# 1. copy mp0c01073_si_001.zip to data\n",
28 | "# 2. unzip mp0c01073_si_001.zip - this will create a subdirectory SI"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 19,
83 | "id": "70cfc7f3-9d72-49ad-ba5b-a3db5ffd679e",
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "Number of antibodies in Ab21 set: 21\n"
91 | ]
92 | },
93 | {
94 | "data": {
263 | "text/plain": [
264 | " Entity Viscosity_at_150 Isotype N_hydrophobic Fv N_hydrophobic mAb \\\n",
265 | "0 mAb1 14.4 IgG1 83 492 \n",
266 | "1 mAb2 20.9 IgG1 85 498 \n",
267 | "2 mAb3 14.9 IgG1 80 486 \n",
268 | "3 mAb4 93.4 IgG1 78 482 \n",
269 | "4 mAb5 8.6 IgG1 89 504 \n",
270 | "\n",
271 | " N_hydrophilic Fv N_hydrophilic mAb HI_Fv HI_mAb SASA_phobic_Fv \\\n",
272 | "0 120 712 1.098922 1.002246 3760.633301 \n",
273 | "1 110 690 1.364037 1.078138 4469.273438 \n",
274 | "2 122 720 1.317641 1.057277 4007.478271 \n",
275 | "3 122 714 1.195039 1.024895 3754.267578 \n",
276 | "4 112 700 1.273285 1.052817 5683.704590 \n",
277 | "\n",
278 | " ... net charges mAb FvCSP mAbCSP Fv_pI mAb_pI SAP Fv SAP mAb \\\n",
279 | "0 ... 26 -10 40 8.88 8.96 134.8 526.3 \n",
280 | "1 ... 22 0 10 8.02 8.75 161.4 573.4 \n",
281 | "2 ... 26 0 12 7.67 8.71 149.5 552.1 \n",
282 | "3 ... 20 -2 -24 8.19 8.83 161.9 539.7 \n",
283 | "4 ... 26 0 22 7.86 8.85 213.3 598.2 \n",
284 | "\n",
285 | " SCM Fv SCM mAb classifier \n",
286 | "0 2522.9 6979.0 0 \n",
287 | "1 1687.7 5731.8 0 \n",
288 | "2 2170.0 6075.0 0 \n",
289 | "3 2406.3 7008.6 1 \n",
290 | "4 1636.9 5795.4 0 \n",
291 | "\n",
292 | "[5 rows x 24 columns]"
293 | ]
294 | },
295 | "execution_count": 19,
296 | "metadata": {},
297 | "output_type": "execute_result"
298 | }
299 | ],
300 | "source": [
301 | "df_Ab21 = pd.read_csv(os.path.join(DATA_DIR, 'SI', 'features_values_SI.csv'))\n",
302 | "df_Ab21 = df_Ab21.loc[df_Ab21.Isotype == 'IgG1']\n",
303 | "df_Ab21.rename({'mabs':ENTITY_KEY}, inplace=True, axis=1)\n",
304 | "df_Ab21.reset_index(drop=True, inplace=True)\n",
305 | "\n",
306 | "df_Ab21_2 = pd.read_csv(os.path.join(DATA_DIR, 'Ab21_raw.csv'))\n",
307 | "df_Ab21_merged = df_Ab21_2.merge(df_Ab21, on=ENTITY_KEY)\n",
308 | "\n",
309 | "df_Ab21_merged.to_csv(os.path.join(DATA_DIR, 'Ab21.csv'), index=False)\n",
310 | "print('Number of antibodies in Ab21 set: %d' % len(df_Ab21_merged))\n",
311 | "df_Ab21_merged.head()"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 20,
317 | "id": "ffc22546-f7ff-4f2a-83ed-f68de958afeb",
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "name": "stdout",
322 | "output_type": "stream",
323 | "text": [
324 | "21 fasta files were saved in FASTA_DIR\n"
325 | ]
326 | }
327 | ],
328 | "source": [
329 | "# Extract each antibody's entry from the SI fasta file and save it to a separate fasta file\n",
330 | "\n",
331 | "from Bio.SeqIO.FastaIO import FastaIterator\n",
332 | "\n",
333 | "# extract light and heavy chain sequences from fasta file\n",
334 | "light_chains = {}\n",
335 | "heavy_chains = {}\n",
336 | "with open(os.path.join(DATA_DIR, 'SI', 'seq_vis_SI.fasta'), 'r') as handle:\n",
337 | " for record in FastaIterator(handle):\n",
338 | " id_fields = record.id.split('_')\n",
339 | " title = id_fields[0]\n",
340 | " if title == 'mAB27': # handle the inconsistent naming in the SI file\n",
341 | " title = 'mAb27'\n",
342 | " chain_type = id_fields[1]\n",
343 | " if chain_type == 'light':\n",
344 | " light_chains[title] = str(record.seq)\n",
345 | " else:\n",
346 | " heavy_chains[title] = str(record.seq)\n",
347 | "\n",
348 | "fasta_files = []\n",
349 | "for k, v in light_chains.items():\n",
350 | " if k in df_Ab21_merged[ENTITY_KEY].values:\n",
351 | " fasta_file = os.path.join(FASTA_DIR, k + '.fasta')\n",
352 | " fasta_files.append(fasta_file)\n",
353 | " with open(fasta_file, 'w') as fptr:\n",
354 | " fptr.write('>' + k + '_VH\\n')\n",
355 | " fptr.write(heavy_chains[k] + '\\n')\n",
356 | " fptr.write('>' + k + '_VL\\n')\n",
357 | " fptr.write(v + '\\n')\n",
358 | "\n",
359 | "print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 22,
365 | "id": "4fae4778-49f5-4d48-848e-6f05ee696716",
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "data": {
538 | "text/plain": [
539 | " Entity Viscosity_at_150 Isotype N_hydrophobic Fv N_hydrophobic mAb \\\n",
540 | "0 mAb1 14.4 IgG1 83 492 \n",
541 | "1 mAb2 20.9 IgG1 85 498 \n",
542 | "2 mAb3 14.9 IgG1 80 486 \n",
543 | "3 mAb4 93.4 IgG1 78 482 \n",
544 | "4 mAb5 8.6 IgG1 89 504 \n",
545 | "\n",
546 | " N_hydrophilic Fv N_hydrophilic mAb HI_Fv HI_mAb SASA_phobic_Fv \\\n",
547 | "0 120 712 1.098922 1.002246 3760.633301 \n",
548 | "1 110 690 1.364037 1.078138 4469.273438 \n",
549 | "2 122 720 1.317641 1.057277 4007.478271 \n",
550 | "3 122 714 1.195039 1.024895 3754.267578 \n",
551 | "4 112 700 1.273285 1.052817 5683.704590 \n",
552 | "\n",
553 | " ... mAbCSP Fv_pI mAb_pI SAP Fv SAP mAb SCM Fv SCM mAb classifier \\\n",
554 | "0 ... 40 8.88 8.96 134.8 526.3 2522.9 6979.0 0 \n",
555 | "1 ... 10 8.02 8.75 161.4 573.4 1687.7 5731.8 0 \n",
556 | "2 ... 12 7.67 8.71 149.5 552.1 2170.0 6075.0 0 \n",
557 | "3 ... -24 8.19 8.83 161.9 539.7 2406.3 7008.6 1 \n",
558 | "4 ... 22 7.86 8.85 213.3 598.2 1636.9 5795.4 0 \n",
559 | "\n",
560 | " LC \\\n",
561 | "0 DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... \n",
562 | "1 DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL... \n",
563 | "2 DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKV... \n",
564 | "3 DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL... \n",
565 | "4 EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL... \n",
566 | "\n",
567 | " HC \n",
568 | "0 EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE... \n",
569 | "1 EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE... \n",
570 | "2 EVQLVESGGGLVQPGGSLRLSCAASGYTFTNYGMNWVRQAPGKGLE... \n",
571 | "3 QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE... \n",
572 | "4 EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE... \n",
573 | "\n",
574 | "[5 rows x 26 columns]"
575 | ]
576 | },
577 | "execution_count": 22,
578 | "metadata": {},
579 | "output_type": "execute_result"
580 | }
581 | ],
582 | "source": [
583 | "Ab21_entity_list = []; Ab21_LC_list = []; Ab21_HC_list = []\n",
584 | "for k, v in light_chains.items():\n",
585 | " Ab21_entity_list.append(k)\n",
586 | " Ab21_LC_list.append(v)\n",
587 | " Ab21_HC_list.append(heavy_chains[k])\n",
588 | " \n",
589 | "df_tmp = pd.DataFrame({ENTITY_KEY:Ab21_entity_list, 'LC':Ab21_LC_list, 'HC':Ab21_HC_list})\n",
590 | "\n",
591 | "df_Ab21 = df_Ab21_merged.merge(df_tmp, on=ENTITY_KEY)\n",
592 | "df_Ab21.head()"
593 | ]
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "id": "023937c8-1c65-44b9-9592-35f140677fc7",
598 | "metadata": {},
599 | "source": [
600 | "## Process PDGF38 dataset"
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": 23,
606 | "id": "5f49968c-43e2-43a7-94cc-b5ffd6ce3626",
607 | "metadata": {},
608 | "outputs": [],
609 | "source": [
610 | "def get_plos_seq_data(df_seq_plos):\n",
611 | "    # Join the FW1..FW4 columns into a full sequence; '-' marks positions to fill from the reference\n",
612 | "    df_tmp = df_seq_plos.loc[:,'FW1':'FW4']\n",
613 | "    df_seq_plos['seq'] = df_tmp.apply(''.join, axis=1)\n",
614 | "    entity_to_sequence = {}\n",
615 | " for _, row in df_seq_plos.iterrows():\n",
616 | " ref_sequence = list(row['seq'])\n",
617 | " entity_to_sequence[row['Name']] = ref_sequence\n",
618 | " for _, row2 in df_seq_plos.iterrows():\n",
619 | " if row2['Name'] == row['Name']: \n",
620 | " continue\n",
621 | " sequence2 = list(row2['seq'])\n",
622 | " sequence2_mod = copy.copy(ref_sequence)\n",
623 | " for idx, (aa1, aa2) in enumerate(zip(ref_sequence, sequence2)):\n",
624 | " if aa2 != '-':\n",
625 | " sequence2_mod[idx] = aa2\n",
626 | " entity_to_sequence[row2['Name']] = sequence2_mod\n",
627 | "\n",
628 | " return entity_to_sequence"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 25,
634 | "id": "c59733e5-0c59-473b-b8fe-88cd80c46134",
635 | "metadata": {},
636 | "outputs": [
637 | {
638 | "data": {
710 | "text/plain": [
711 | " Entity Viscosity_at_150 SCM \\\n",
712 | "0 AB-001 440 -2213 \n",
713 | "1 R1-002 288 -2008 \n",
714 | "2 R1-003 523 -1985 \n",
715 | "3 R1-004 310 -1961 \n",
716 | "4 R1-005 190 -1838 \n",
717 | "\n",
718 | " HC \\\n",
719 | "0 EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n",
720 | "1 EVQLLQSGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n",
721 | "2 EVQLLESGGGLVKPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n",
722 | "3 EVQLLESGGGLVQPGGSLRLSCRASGFTFSSYAMSWVRQAPGKGLE... \n",
723 | "4 EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n",
724 | "\n",
725 | " LC \n",
726 | "0 SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV... \n",
727 | "1 SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV... \n",
728 | "2 SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV... \n",
729 | "3 SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV... \n",
730 | "4 SYELTQPPSVSVSPGQTASITCSGDSLGSYFVHWYQQKPGQSPVLV... "
731 | ]
732 | },
733 | "execution_count": 25,
734 | "metadata": {},
735 | "output_type": "execute_result"
736 | }
737 | ],
738 | "source": [
739 | "# Extract PDGF sequences and viscosity values from Lai SI: SI/mutants_SI.xlsx\n",
740 | "\n",
741 | "# Extract sequences from PLOS SI\n",
742 | "df_plos_lc = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_light.csv'), sep='\\t')\n",
743 | "df_plos_hc = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_heavy.csv'), sep='\\t')\n",
744 | "\n",
745 | "entity_to_sequence_hc = get_plos_seq_data(df_plos_hc)\n",
746 | "entity_to_sequence_lc = get_plos_seq_data(df_plos_lc)\n",
747 | "plos_data = [(k, ''.join(entity_to_sequence_hc[k]), ''.join(v)) for k, v in entity_to_sequence_lc.items()]\n",
748 | "df_plos = pd.DataFrame(plos_data, columns=[ENTITY_KEY, 'HC', 'LC'])\n",
749 | "\n",
750 | "# Lai SI\n",
751 | "df_PDGF38_raw = pd.read_csv(os.path.join(DATA_DIR, 'PDGF38_raw.csv'))\n",
752 | "\n",
753 | "# pd.read_excel accepts a path directly, which avoids leaving a file handle open\n",
754 | "df_PDGF38_sheet3 = pd.read_excel(os.path.join(DATA_DIR, 'SI', 'mutants_SI.xlsx'), sheet_name='result')\n",
755 | "df_PDGF38_sheet3.rename({'Unnamed: 0':ENTITY_KEY}, inplace=True, axis=1)\n",
756 | "\n",
757 | "df_PDGF38 = df_PDGF38_raw.merge(df_plos, on=ENTITY_KEY)\n",
758 | "\n",
759 | "df_PDGF38 = df_PDGF38[[ENTITY_KEY, VISCOSITY_KEY, 'SCM', 'HC', 'LC']]\n",
760 | "df_PDGF38.to_csv(os.path.join(DATA_DIR, 'PDGF38.csv'), index=False)\n",
761 | "\n",
762 | "df_PDGF38.head()"
763 | ]
764 | },
765 | {
766 | "cell_type": "code",
767 | "execution_count": 26,
768 | "id": "b2da622c-9513-4155-b110-d9e3718f7319",
769 | "metadata": {},
770 | "outputs": [
771 | {
772 | "name": "stdout",
773 | "output_type": "stream",
774 | "text": [
775 | "38 fasta files were saved in FASTA_DIR\n"
776 | ]
777 | }
778 | ],
779 | "source": [
780 | "fasta_files = []\n",
781 | "for entity, hc, lc in zip(df_PDGF38[ENTITY_KEY].values, df_PDGF38['HC'].values, df_PDGF38['LC'].values):\n",
784 | " fasta_file = os.path.join(FASTA_DIR, entity + '.fasta')\n",
785 | " fasta_files.append(fasta_file)\n",
786 | " with open(fasta_file, 'w') as fptr:\n",
787 | " fptr.write('>' + entity + '_VH\\n')\n",
788 | " fptr.write(hc + '\\n')\n",
789 | " fptr.write('>' + entity + '_VL\\n')\n",
790 | " fptr.write(lc + '\\n')\n",
791 | "\n",
792 | "print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))\n"
793 | ]
794 | },
795 | {
796 | "cell_type": "markdown",
797 | "id": "c5db5ded-b3f2-479d-ac02-1812473ccc5b",
798 | "metadata": {},
799 | "source": [
800 | "## Prepare Ab8 dataset"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": 27,
806 | "id": "32c95ecb-b93e-4796-8485-580e708f3ae9",
807 | "metadata": {},
808 | "outputs": [],
809 | "source": [
810 | "# Extract supplementary data from Lai et al. MABS 2021, VOL. 13, NO. 1, e1991256 (19 pages) \n",
811 | "# https://www.tandfonline.com/doi/suppl/10.1080/19420862.2021.1991256/suppl_file/kmab_a_1991256_sm4057.zip\n",
812 | "#\n",
813 | "# 1. copy kmab_a_1991256_sm4057.zip to data\n",
814 | "# 2. unzip kmab_a_1991256_sm4057.zip - this extracts the files into the data directory"
815 | ]
816 | },
817 | {
818 | "cell_type": "code",
819 | "execution_count": 107,
820 | "id": "a06a8a16-966a-406f-aa9e-cb41ffcc8293",
821 | "metadata": {},
822 | "outputs": [
823 | {
824 | "data": {
993 | "text/plain": [
994 | " Clone Name Entity ISOTYPE \\\n",
995 | "0 TGN1412 analog TGN1412 IgG1 / Kappa \n",
996 | "1 Avastin analog Bevacizumab IgG1 / Kappa \n",
997 | "2 Herceptin analog Trastuzumab IgG1 / Kappa \n",
998 | "3 Basiliximab analog Basiliximab IgG1 / Kappa \n",
999 | "4 Natalizumab analog Natalizumab IgG1 / Kappa \n",
1000 | "\n",
1001 | " HC \\\n",
1002 | "0 QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE... \n",
1003 | "1 EVQLVESGGGLVQPGGSLRLSCAASGYTFTNYGMNWVRQAPGKGLE... \n",
1004 | "2 EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE... \n",
1005 | "3 QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI... \n",
1006 | "4 QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE... \n",
1007 | "\n",
1008 | " LC Unnamed: 5 \\\n",
1009 | "0 DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... NaN \n",
1010 | "1 DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKV... NaN \n",
1011 | "2 DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL... NaN \n",
1012 | "3 QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... NaN \n",
1013 | "4 DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... NaN \n",
1014 | "\n",
1015 | " Variable Domain Source Source Details HC Class HFR1 \\\n",
1016 | "0 PDB 1YJD IgG1 QVQLVQSGAEVKKPGASVKVSCKAS \n",
1017 | "1 PDB 1BJ1 IgG1 EVQLVESGGGLVQPGGSLRLSCAAS \n",
1018 | "2 PDB 1N8Z IgG1 EVQLVESGGGLVQPGGSLRLSCAAS \n",
1019 | "3 PDB 1MIM IgG1 QLQQSGTVLARPGASVKMSCKAS \n",
1020 | "4 US Patent US5840299A IgG1 QVQLVQSGAEVKKPGASVKVSCKAS \n",
1021 | "\n",
1022 | " ... VH LC Class \\\n",
1023 | "0 ... QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE... Kappa \n",
1024 | "1 ... EVQLVESGGGLVQPGGSLRLSCAASGYTFTNYGMNWVRQAPGKGLE... Kappa \n",
1025 | "2 ... EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE... Kappa \n",
1026 | "3 ... QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI... Kappa \n",
1027 | "4 ... QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE... Kappa \n",
1028 | "\n",
1029 | " LFR1 CDRL1 LFR2 CDRL2 \\\n",
1030 | "0 DIQMTQSPSSLSASVGDRVTITC HASQNIYVWLN WYQQKPGKAPKLLIY KASNLHT \n",
1031 | "1 DIQMTQSPSSLSASVGDRVTITC SASQDISNYLN WYQQKPGKAPKVLIY FTSSLHS \n",
1032 | "2 DIQMTQSPSSLSASVGDRVTITC RASQDVNTAVA WYQQKPGKAPKLLIY SASFLYS \n",
1033 | "3 QIVSTQSPAIMSASPGEKVTMTC SASSSRSYMQ WYQQKPGTSPKRWIY DTSKLAS \n",
1034 | "4 DIQMTQSPSSLSASVGDRVTITC KTSQDINKYMA WYQQTPGKAPRLLIH YTSALQP \n",
1035 | "\n",
1036 | " LFR3 CDRL3 LFR4 \\\n",
1037 | "0 GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC QQGQTYPYT FGGGTKVEIK \n",
1038 | "1 GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC QQYSTVPWT FGQGTKVEIK \n",
1039 | "2 GVPSRFSGSRSGTDFTLTISSLQPEDFATYYC QQHYTTPPT FGQGTKVEIK \n",
1040 | "3 GVPARFSGSGSGTSYSLTISSMEAEDAATYYC HQRSSYT FGGGTKLEIK \n",
1041 | "4 GIPSRFSGSGSGRDYTFTISSLQPEDIATYYC LQYDNLWT FGQGTKVEIK \n",
1042 | "\n",
1043 | " VL \n",
1044 | "0 DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... \n",
1045 | "1 DIQMTQSPSSLSASVGDRVTITCSASQDISNYLNWYQQKPGKAPKV... \n",
1046 | "2 DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL... \n",
1047 | "3 QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... \n",
1048 | "4 DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... \n",
1049 | "\n",
1050 | "[5 rows x 26 columns]"
1051 | ]
1052 | },
1053 | "execution_count": 107,
1054 | "metadata": {},
1055 | "output_type": "execute_result"
1056 | }
1057 | ],
1058 | "source": [
1059 | "df_Ab14 = pd.read_excel(os.path.join(DATA_DIR, 'supplemental Table.xlsx'), sheet_name='Sequence Listing')\n",
1060 | "df_Ab14 = df_Ab14.loc[df_Ab14.ISOTYPE == 'IgG1 / Kappa']\n",
1061 | "df_Ab14.reset_index(drop=True, inplace=True)\n",
1062 | "\n",
1063 | "df_Ab14_VH = pd.read_excel(os.path.join(DATA_DIR, 'supplemental Table.xlsx'), sheet_name='VHs')\n",
1064 | "df_Ab14_VH = df_Ab14_VH.loc[df_Ab14_VH['HC Class'] == 'IgG1']\n",
1065 | "df_Ab14_VH['VH'] = df_Ab14_VH.apply(lambda x: x['HFR1'] + x['CDRH1'] + x['HFR2'] + x['CDRH2'] + x['HFR3'] + x['CDRH3'] + x['HFR4'], axis=1)\n",
1066 | "\n",
1067 | "df_Ab14_VL = pd.read_excel(os.path.join(DATA_DIR, 'supplemental Table.xlsx'), sheet_name='VLs')\n",
1068 | "df_Ab14_VL['VL'] = df_Ab14_VL.apply(lambda x: x['LFR1'] + x['CDRL1'] + x['LFR2'] + x['CDRL2'] + x['LFR3'] + x['CDRL3'] + x['LFR4'], axis=1)\n",
1069 | "df_Ab14_VL.drop_duplicates(inplace=True)\n",
1071 | "\n",
1072 | "df_Ab14 = df_Ab14.merge(df_Ab14_VH, on='mAb')\n",
1073 | "df_Ab14 = df_Ab14.merge(df_Ab14_VL, on='mAb')\n",
1074 | "\n",
1075 | "df_Ab14.rename({'mAb':ENTITY_KEY, 'Amino Acids, Mature Heavy Chain':'HC', \n",
1076 | " 'Amino Acids, Mature Light Chain':'LC'}, inplace=True, axis=1)\n",
1077 | "\n",
1078 | "df_Ab14.head()"
1079 | ]
1080 | },
1081 | {
1082 | "cell_type": "code",
1083 | "execution_count": 108,
1084 | "id": "52f3df41-875f-4bef-8eee-60f886703bcf",
1085 | "metadata": {
1086 | "tags": []
1087 | },
1088 | "outputs": [
1089 | {
1090 | "data": {
1259 | "text/plain": [
1260 | " Clone Name Entity ISOTYPE \\\n",
1261 | "0 TGN1412 analog TGN1412 IgG1 / Kappa \n",
1262 | "1 Basiliximab analog Basiliximab IgG1 / Kappa \n",
1263 | "2 Natalizumab analog Natalizumab IgG1 / Kappa \n",
1264 | "3 Tremelimumab analog Tremelimumab IgG1 / Kappa \n",
1265 | "4 Ipilimumab analog Ipilimumab IgG1 / Kappa \n",
1266 | "\n",
1267 | " HC \\\n",
1268 | "0 QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE... \n",
1269 | "1 QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI... \n",
1270 | "2 QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE... \n",
1271 | "3 QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE... \n",
1272 | "4 QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYTMHWVRQAPGKGLE... \n",
1273 | "\n",
1274 | " LC Unnamed: 5 \\\n",
1275 | "0 DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... NaN \n",
1276 | "1 QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... NaN \n",
1277 | "2 DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... NaN \n",
1278 | "3 DIQMTQSPSSLSASVGDRVTITCRASQSINSYLDWYQQKPGKAPKL... NaN \n",
1279 | "4 EIVLTQSPGTLSLSPGERATLSCRASQSVGSSYLAWYQQKPGQAPR... NaN \n",
1280 | "\n",
1281 | " Variable Domain Source Source Details HC Class HFR1 \\\n",
1282 | "0 PDB 1YJD IgG1 QVQLVQSGAEVKKPGASVKVSCKAS \n",
1283 | "1 PDB 1MIM IgG1 QLQQSGTVLARPGASVKMSCKAS \n",
1284 | "2 US Patent US5840299A IgG1 QVQLVQSGAEVKKPGASVKVSCKAS \n",
1285 | "3 US Patent US6682736 IgG1 QVQLVESGGGVVQPGRSLRLSCAAS \n",
1286 | "4 US Patent US6984720 IgG1 QVQLVESGGGVVQPGRSLRLSCAAS \n",
1287 | "\n",
1288 | " ... LC Class LFR1 CDRL1 LFR2 \\\n",
1289 | "0 ... Kappa DIQMTQSPSSLSASVGDRVTITC HASQNIYVWLN WYQQKPGKAPKLLIY \n",
1290 | "1 ... Kappa QIVSTQSPAIMSASPGEKVTMTC SASSSRSYMQ WYQQKPGTSPKRWIY \n",
1291 | "2 ... Kappa DIQMTQSPSSLSASVGDRVTITC KTSQDINKYMA WYQQTPGKAPRLLIH \n",
1292 | "3 ... Kappa DIQMTQSPSSLSASVGDRVTITC RASQSINSYLD WYQQKPGKAPKLLIY \n",
1293 | "4 ... Kappa EIVLTQSPGTLSLSPGERATLSC RASQSVGSSYLA WYQQKPGQAPRLLIY \n",
1294 | "\n",
1295 | " CDRL2 LFR3 CDRL3 LFR4 \\\n",
1296 | "0 KASNLHT GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC QQGQTYPYT FGGGTKVEIK \n",
1297 | "1 DTSKLAS GVPARFSGSGSGTSYSLTISSMEAEDAATYYC HQRSSYT FGGGTKLEIK \n",
1298 | "2 YTSALQP GIPSRFSGSGSGRDYTFTISSLQPEDIATYYC LQYDNLWT FGQGTKVEIK \n",
1299 | "3 AASSLQS GVPSRFSGSGSGTDFTLTISSLQPEDFATYYC QQYYSTPFT FGPGTKVEIK \n",
1300 | "4 GAFSRAT GIPDRFSGSGSGTDFTLTISRLEPEDFAVYYC QQYGSSPWT FGQGTKVEIK \n",
1301 | "\n",
1302 | " VL match \n",
1303 | "0 DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... False \n",
1304 | "1 QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... False \n",
1305 | "2 DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... False \n",
1306 | "3 DIQMTQSPSSLSASVGDRVTITCRASQSINSYLDWYQQKPGKAPKL... False \n",
1307 | "4 EIVLTQSPGTLSLSPGERATLSCRASQSVGSSYLAWYQQKPGQAPR... False \n",
1308 | "\n",
1309 | "[5 rows x 27 columns]"
1310 | ]
1311 | },
1312 | "execution_count": 108,
1313 | "metadata": {},
1314 | "output_type": "execute_result"
1315 | }
1316 | ],
1317 | "source": [
1318 | "def vl_vh_match_in_Ab21(x):\n",
1319 | " for vl, vh in zip(df_Ab21.LC.values, df_Ab21.HC.values):\n",
1320 | " if x['VL'] in vl and x['VH'] in vh:\n",
1321 | " return True\n",
1322 | " return False\n",
1323 | "\n",
1324 | "df_Ab14['match'] = df_Ab14.apply(vl_vh_match_in_Ab21, axis=1)\n",
1325 | "df_Ab8 = df_Ab14[~df_Ab14.match].copy()\n",
1326 | "df_Ab8.reset_index(drop=True, inplace=True)\n",
1327 | "df_Ab8.head()"
1328 | ]
1329 | },
1330 | {
1331 | "cell_type": "code",
1332 | "execution_count": 109,
1333 | "id": "624e9d02-b403-4a7b-b316-eec252fbc93b",
1334 | "metadata": {},
1335 | "outputs": [
1336 | {
1337 | "name": "stdout",
1338 | "output_type": "stream",
1339 | "text": [
1340 | "8 fasta files were saved in FASTA_DIR\n"
1341 | ]
1342 | }
1343 | ],
1344 | "source": [
1345 | "fasta_files = []\n",
1346 | "for entity, hc, lc in zip(df_Ab8[ENTITY_KEY].values, df_Ab8['HC'].values, df_Ab8['LC'].values):\n",
1347 | " fasta_file = os.path.join(FASTA_DIR, entity + '.fasta')\n",
1348 | " fasta_files.append(fasta_file)\n",
1349 | " with open(fasta_file, 'w') as fptr:\n",
1350 | " fptr.write('>' + entity + '_VH\\n')\n",
1351 | " fptr.write(hc + '\\n')\n",
1352 | " fptr.write('>' + entity + '_VL\\n')\n",
1353 | " fptr.write(lc + '\\n')\n",
1354 | "\n",
1355 | "print('%d fasta files were saved in FASTA_DIR' % len(fasta_files))"
1356 | ]
1357 | },
1358 | {
1359 | "cell_type": "code",
1360 | "execution_count": 111,
1361 | "id": "14146155-5238-44fd-8b5a-a45f34c16c8e",
1362 | "metadata": {},
1363 | "outputs": [
1364 | {
1365 | "name": "stdout",
1366 | "output_type": "stream",
1367 | "text": [
1368 | "Number of antibodies in Ab8 set: 8\n"
1369 | ]
1370 | },
1371 | {
1372 | "data": {
1373 | "text/html": [
1374 |        "<!-- HTML table output stripped; the dataframe appears in the text/plain representation below -->"
1431 | ],
1432 | "text/plain": [
1433 | " Entity Viscosity_at_150 SCM\n",
1434 | "0 TGN1412 16.42 844.6\n",
1435 | "1 Basiliximab 25.05 640.8\n",
1436 | "2 Natalizumab 13.67 815.5\n",
1437 | "3 Tremelimumab 8.80 704.2\n",
1438 | "4 Ipilimumab 8.60 754.0"
1439 | ]
1440 | },
1441 | "execution_count": 111,
1442 | "metadata": {},
1443 | "output_type": "execute_result"
1444 | }
1445 | ],
1446 | "source": [
1447 | "# Save viscosity and other computed properties\n",
1448 | "Ab8_entities = ['TGN1412', 'Basiliximab', 'Natalizumab', 'Tremelimumab', 'Ipilimumab', 'Atezolizumab', 'Ganitumab', 'Vesencumab']\n",
1449 | "Ab8_visc = [16.42, 25.05, 13.67, 8.8, 8.6, 11.56, 10.1, 23.57]\n",
1450 | "Ab8_SCM = [844.6, 640.8, 815.5, 704.2, 754, 759.6, 806.5, 661.3]\n",
1451 | "df_Ab8_visc = pd.DataFrame({ENTITY_KEY: Ab8_entities, VISCOSITY_KEY: Ab8_visc, 'SCM': Ab8_SCM})\n",
1452 | "\n",
1453 | "df_Ab8_visc.to_csv(os.path.join(DATA_DIR, 'Ab8.csv'), index=False)\n",
1454 | "print('Number of antibodies in Ab8 set: %d' % len(df_Ab8_visc))\n",
1455 | "df_Ab8_visc.head()"
1456 | ]
1457 | },
1458 | {
1459 | "cell_type": "code",
1460 | "execution_count": null,
1461 | "id": "cfe0c281-5a01-48a5-9f28-93bcf517ab9c",
1462 | "metadata": {},
1463 | "outputs": [],
1464 | "source": []
1465 | }
1466 | ],
1467 | "metadata": {
1468 | "kernelspec": {
1469 | "display_name": "Python 3 (ipykernel)",
1470 | "language": "python",
1471 | "name": "python3"
1472 | },
1473 | "language_info": {
1474 | "codemirror_mode": {
1475 | "name": "ipython",
1476 | "version": 3
1477 | },
1478 | "file_extension": ".py",
1479 | "mimetype": "text/x-python",
1480 | "name": "python",
1481 | "nbconvert_exporter": "python",
1482 | "pygments_lexer": "ipython3",
1483 | "version": "3.9.12"
1484 | }
1485 | },
1486 | "nbformat": 4,
1487 | "nbformat_minor": 5
1488 | }
1489 |
--------------------------------------------------------------------------------
/notebooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pfizer-opensource/pfabnet-viscosity/60970a752a3e74cc336db13576a9c3a21448fe2e/notebooks/__init__.py
--------------------------------------------------------------------------------
/pfabnet/__init__.py:
--------------------------------------------------------------------------------
1 | from .dataset import ViscosityDataset
2 |
3 |
--------------------------------------------------------------------------------
/pfabnet/base.py:
--------------------------------------------------------------------------------
1 | ENTITY_KEY = 'Entity'
2 | VISCOSITY_KEY = 'Viscosity_at_150'
3 | SCHRODINGER_BASE = '/localscratch/software/schrodinger/adv-2021-2'
4 |
5 |
6 | def get_file_path():
7 | return __file__
8 |
9 |
--------------------------------------------------------------------------------
/pfabnet/dataset.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import torch
3 |
4 | class ViscosityDataset(Dataset):
5 | def __init__(self, X, y):
6 | self.X = X
7 | self.y = y
8 |
9 | def __getitem__(self, index):
10 | return torch.Tensor(self.X[index]), torch.Tensor([self.y[index]])
11 |
12 | def __len__(self):
13 | return len(self.y)
14 |
15 |
--------------------------------------------------------------------------------
/pfabnet/esp_generator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import argparse
4 | import pickle
5 |
6 | import numpy as np
7 | from openeye import oechem
8 | from utils import generate_esp_grids
9 |
10 |
11 |
12 | if __name__ == '__main__':
13 | parser = argparse.ArgumentParser(
14 | description='Generate PfAbNet ESP grid input')
15 | parser.add_argument('--input_mols_dir', type=str, default='./',
16 | help='directory containing antibody structures/models')
17 | parser.add_argument('--esp_output_dir', type=str, default='./',
18 | help='directory to save the generated ESP grid files')
19 | parser.add_argument('--grid_dim', type=int, default=96,
20 | help='number of grid points along each axis (default = 96)')
21 | parser.add_argument('--grid_spacing', type=float, default=0.75,
22 | help='spacing between grid points (default = 0.75 Angstrom)')
23 | parser.add_argument('--shell_width', type=float, default=2.0,
24 | help='thickness of the surface shell (default 2.0 Angstrom)')
25 | parser.add_argument('--NX', type=int, default=10,
26 | help='augmentation level (default 10x)')
27 | parser.add_argument('--processors', type=int, default=10,
28 | help='Number of CPUs for ESP grid calculation (default 10)')
29 | parser.add_argument('--seed', type=int, default=42,
30 | help='random seed (default 42)')
31 |
32 | parser.add_argument('-v', '--verbose', action='count', default=0)
33 | in_args = parser.parse_args()
34 |
35 | input_mols_dir = in_args.input_mols_dir
36 | esp_dir = in_args.esp_output_dir
37 | seed = in_args.seed
38 |
39 | args = in_args.__dict__
40 | np.random.seed(seed)
41 |
42 |     # create the ESP output directory; exist_ok avoids swallowing unrelated errors
43 |     os.makedirs(esp_dir, exist_ok=True)
44 | 
45 | 
46 |
47 | mol_files = glob.glob(input_mols_dir + '/*.mol2')
48 | for mol_file in mol_files:
49 | print(mol_file)
50 | output = generate_esp_grids(args, mol_file)
51 | for idx, (esp_grid, output_mol) in enumerate(output):
52 | output_dir = os.path.join(esp_dir, 'rotation_%d' % (idx + 1))
53 |         # create the per-rotation output directory if needed
54 |         os.makedirs(output_dir, exist_ok=True)
55 | 
56 | 
57 |
58 | base_mol_file = os.path.basename(mol_file).split('.mol2')[0]
59 | esp_file = os.path.join(output_dir, base_mol_file + '.pyb')
60 | with open(esp_file, 'wb') as fptr:
61 | pickle.dump(esp_grid, fptr)
62 |
63 | output_mol_file = os.path.join(output_dir, os.path.basename(mol_file))
64 |
65 | ofs = oechem.oemolostream(output_mol_file)
66 | oechem.OEWriteConstMolecule(ofs, output_mol)
67 | ofs.close()
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/pfabnet/generate_attributions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import glob
4 | import argparse
5 |
6 | import numpy as np
7 | import torch
8 |
9 | from openeye import oechem
10 | from openeye import oegrid
11 |
12 | from model import ViscosityNet
13 | from utils import seed_everything
14 | from utils import GRID_DIM_KEY, GRID_SPACING_KEY, ESP_DIR_KEY, HOMOLOGY_MODEL_DIR_KEY
15 | from utils import get_molecule, calculate_attribution_grid
16 | from utils import get_esp_grids, generate_esp_grids
17 | from base import ENTITY_KEY
18 |
19 |
20 |
21 | device = 'cpu'
22 | if torch.cuda.is_available():
23 | device = torch.cuda.current_device()
24 |
25 | def get_cnn_models(args, model_files):
26 | models = []
27 | for model_file in model_files:
28 | model = ViscosityNet(args['grid_dim'])
29 | if os.path.exists(model_file):
30 | print('loading %s...' % model_file)
31 | model.load_state_dict(torch.load(model_file))
32 | model.eval()
33 |
34 | model = torch.nn.DataParallel(model).to(device)
35 | models.append(model)
36 |
37 | return models
38 |
39 |
40 | def overlay(reference_mol, fit_mol, attribution_mol=None):
41 | alignment = oechem.OEGetAlignment(reference_mol, fit_mol)
42 | rot = oechem.OEDoubleArray(9)
43 | trans = oechem.OEDoubleArray(3)
44 | oechem.OERMSD(reference_mol, fit_mol, alignment, True, True, rot, trans)
45 | oechem.OERotate(fit_mol, rot)
46 | oechem.OETranslate(fit_mol, trans)
47 |
48 | if attribution_mol is not None:
49 | oechem.OERotate(attribution_mol, rot)
50 | oechem.OETranslate(attribution_mol, trans)
51 |
52 |
53 | def get_attribution_mol(args, attribution_grid):
54 | attribution_mol = oechem.OEGraphMol()
55 | grid_dim, grid_spacing = args[GRID_DIM_KEY], args[GRID_SPACING_KEY]
56 | significant_thres = args['significant_attribution_threshold']
57 | grid = oegrid.OEScalarGrid(grid_dim, grid_dim, grid_dim, 0.0, 0.0, 0.0, grid_spacing)
58 | for i in range(grid_dim):
59 | for j in range(grid_dim):
60 | for k in range(grid_dim):
61 | gradient = attribution_grid[0][0][i][j][k]
62 |
63 |                     if np.abs(gradient) > significant_thres:
64 |                         x, y, z = grid.GridIdxToSpatialCoord(i, j, k)
65 |                         if gradient > 0.0:
66 |                             atom = attribution_mol.NewAtom(oechem.OEElemNo_O)
67 |                         else:
68 |                             atom = attribution_mol.NewAtom(oechem.OEElemNo_N)
69 |                         atom.SetPartialCharge(gradient)
70 | attribution_mol.SetCoords(atom, oechem.OEFloatArray([x, y, z]))
71 |
72 | return attribution_mol
73 |
74 |
75 | def generate_attributions(args, models):
76 | def save_molecule(f, mol):
77 | ofs = oechem.oemolostream(f)
78 | oechem.OEWriteMolecule(ofs, mol)
79 | ofs.close()
80 |
81 | reference_mol = get_molecule(args['reference_structure_file'], perceive_residue=True, center_mol=False)
82 |
83 | df = pd.read_csv(args['test_data_file'])
84 |
85 | hm_model_dir = args[HOMOLOGY_MODEL_DIR_KEY]
86 | output_attribution_dir = args['output_attribution_dir']
87 | for row_idx, row in df.iterrows():
88 | if args['process_structure_index'] >= 0 and args['process_structure_index'] != row_idx:
89 | continue
90 | mol_file = os.path.join(hm_model_dir, row[ENTITY_KEY] + '.mol2')
91 | if len(args[ESP_DIR_KEY]) > 0:
92 | esp_grids = get_esp_grids(args, mol_file)
93 | else:
94 | esp_grids = generate_esp_grids(args, mol_file)
95 |
96 | for grid_idx, (esp_grid, mol) in enumerate(esp_grids):
97 | for model_idx, model in enumerate(models):
98 | print('processing... row_idx: %d grid_idx: %d, model_idx: %d'
99 | % (row_idx, grid_idx, model_idx))
100 | mol2 = oechem.OEGraphMol(mol)
101 | oechem.OEPerceiveResidues(mol2)
102 | attribution_grid, _ = calculate_attribution_grid(model, esp_grid, device)
103 | attribution_mol = get_attribution_mol(args, attribution_grid)
104 | overlay(reference_mol, mol2, attribution_mol)
105 | outfile_base = os.path.join(output_attribution_dir,
106 | '%s_%d_%d' % (row[ENTITY_KEY], grid_idx, model_idx))
107 | save_molecule(outfile_base + '.mol2', mol2)
108 | save_molecule(outfile_base + '.oeb.gz', attribution_mol)
109 | if len(args[ESP_DIR_KEY]) > 0:
110 | pdb_file = mol_file.split('.mol2')[0] + '.pdb'
111 | pdb_mol = get_molecule(pdb_file, perceive_residue=False, center_mol=False)
112 | oechem.OEPerceiveResidues(mol2)
113 | overlay(mol2, pdb_mol)
114 | save_molecule(outfile_base + '.pdb', pdb_mol)
115 |
116 |
117 |
118 |
119 |
120 | if __name__ == "__main__":
121 | parser = argparse.ArgumentParser(
122 | description='Generate attributions using PfAbNet models')
123 | parser.add_argument('--test_data_file', type=str, help='test set csv files with entity names')
124 | parser.add_argument('--reference_structure_file', type=str,
125 | help='align each generated attribution molecule to the reference molecule (.mol2)')
126 | parser.add_argument('--homology_model_dir', type=str, help='homology model directory')
127 | parser.add_argument('--PfAbNet_model_prefix', type=str, default='PfAbNet', help='PfAbNet model prefix')
128 | parser.add_argument('--PfAbNet_model_dir', type=str, help='PfAbNet model directory')
129 | parser.add_argument('--grid_dim', type=int, default=96,
130 | help='number of grid points along each axis (default = 96)')
131 | parser.add_argument('--grid_spacing', type=float, default=0.75,
132 | help='spacing between grid points (default = 0.75 Angstrom)')
133 | parser.add_argument('--shell_width', type=float, default=2.0,
134 | help='thickness of the surface shell (default 2.0 Angstrom)')
135 | parser.add_argument('--NX', type=int, default=10,
136 | help='number of rotated structures for each input structure (default 10)')
137 | parser.add_argument('--processors', type=int, default=5,
138 | help='Number of CPUs for ESP grid calculation (default 5)')
139 | parser.add_argument('--esp_dir', type=str, default='', help='directory with precomputed ESP grids')
140 | parser.add_argument('--significant_attribution_threshold', type=float,
141 | help='significant attribution threshold')
142 | parser.add_argument('--process_structure_index', type=int, default=-1,
143 |                         help='process structure index (default: -1, process all)')
144 | parser.add_argument('--output_attribution_dir', type=str, help='directory to save attribution outputs')
145 | parser.add_argument('-v', '--verbose', action='count', default=0)
146 | args = parser.parse_args()
147 |
148 | seed_everything(42)
149 |
150 | model_files_prefix = os.path.join(args.PfAbNet_model_dir, args.PfAbNet_model_prefix)
151 | model_files = glob.glob('%s*.pt' % model_files_prefix)
152 |
153 | args = vars(args)
154 | cnn_models = get_cnn_models(args, model_files)
155 | generate_attributions(args, cnn_models)
156 |
157 |
158 |
159 |
--------------------------------------------------------------------------------
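get_attribution_mol above walks the 3D attribution grid and keeps only voxels whose gradient magnitude exceeds the significance threshold, encoding the sign as a pseudo-atom element (oxygen for positive contributions, nitrogen for negative). A pure-Python sketch of just that selection logic, with element letters standing in for the OpenEye atom types:

```python
def significant_voxels(grid, threshold):
    """Return (i, j, k, element, gradient) for voxels with |gradient| > threshold.

    grid: nested list grid[i][j][k] of attribution gradients.
    'O' marks positive contributions and 'N' negative ones, mirroring the
    OEElemNo_O / OEElemNo_N choice in get_attribution_mol.
    """
    hits = []
    for i, plane in enumerate(grid):
        for j, row in enumerate(plane):
            for k, g in enumerate(row):
                if abs(g) > threshold:
                    hits.append((i, j, k, 'O' if g > 0.0 else 'N', g))
    return hits

toy_grid = [[[0.02, -0.4], [0.9, 0.0]]]  # hypothetical 1x2x2 attribution grid
print(significant_voxels(toy_grid, 0.1))
# [(0, 0, 1, 'N', -0.4), (0, 1, 0, 'O', 0.9)]
```

The real function additionally maps each surviving grid index to spatial coordinates and stores the gradient as a partial charge, so the result can be visualized as a molecule.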
/pfabnet/generate_testset_attributions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import glob
4 | import argparse
5 |
6 | import numpy as np
7 | import torch
8 |
9 |
10 |
11 | from model import ViscosityNet
12 | from utils import prepare_test_input
13 | from utils import calculate_attribution_grid
14 | from utils import seed_everything
15 |
16 |
17 | device = 'cpu'
18 | if torch.cuda.is_available():
19 | device = torch.cuda.current_device()
20 |
21 |
22 | def get_cnn_models(args, model_files):
23 | models = []
24 | for model_file in model_files:
25 | model = ViscosityNet(args['grid_dim'])
26 | if os.path.exists(model_file):
27 | print('loading %s...' % model_file)
28 | model.load_state_dict(torch.load(model_file))
29 | model.eval()
30 |
31 | model = torch.nn.DataParallel(model).to(device)
32 | models.append(model)
33 |
34 | return models
35 |
36 |
37 | def calculate_test_set_attribution_scores(args, models):
38 | df = pd.read_csv(args.test_data_file)
39 |
40 | args = vars(args)
41 | X, _ = prepare_test_input(df, args)
42 |
43 | attribution_scores = []
44 | for model in models:
45 | for i in range(len(X)):
46 | attribution_grid, esp_grid = calculate_attribution_grid(model, X[i], device)
47 | attribution_grid = attribution_grid[np.abs(esp_grid) > 1e-5]
48 | attribution_scores.extend(attribution_grid.flatten())
49 |
50 | return np.array(attribution_scores)
51 |
52 |
53 |
54 | if __name__ == "__main__":
55 | parser = argparse.ArgumentParser(
56 | description='Generate attributions using PfAbNet models')
57 | parser.add_argument('--test_data_file', type=str, help='test set csv files with entity names')
58 | parser.add_argument('--homology_model_dir', type=str, help='homology model directory')
59 | parser.add_argument('--PfAbNet_model_prefix', type=str, default='PfAbNet', help='PfAbNet model prefix')
60 | parser.add_argument('--PfAbNet_model_dir', type=str, help='PfAbNet model directory')
61 | parser.add_argument('--grid_dim', type=int, default=96,
62 | help='number of grid points along each axis (default = 96)')
63 | parser.add_argument('--grid_spacing', type=float, default=0.75,
64 | help='spacing between grid points (default = 0.75 Angstrom)')
65 | parser.add_argument('--shell_width', type=float, default=2.0,
66 | help='thickness of the surface shell (default 2.0 Angstrom)')
67 | parser.add_argument('--NX', type=int, default=1,
68 | help='number of rotated structures for each input structure (default 1)')
69 | parser.add_argument('--processors', type=int, default=1,
70 | help='Number of CPUs for ESP grid calculation (default 1)')
71 | parser.add_argument('--esp_dir', type=str, default='', help='directory with precomputed ESP grids')
72 | parser.add_argument('--output_attribution_scores', type=str, help='file to save attribution scores')
73 | parser.add_argument('--output_attribution_threshold', type=str, help='file to save attribution threshold')
74 | parser.add_argument('-v', '--verbose', action='count', default=0)
75 | args = parser.parse_args()
76 |
77 | seed_everything(42)
78 |
79 | model_files_prefix = os.path.join(args.PfAbNet_model_dir, args.PfAbNet_model_prefix)
80 | model_files = glob.glob('%s*.pt' % model_files_prefix)
81 | cnn_models = get_cnn_models(vars(args), model_files)
82 |
83 | attribution_scores = calculate_test_set_attribution_scores(args, cnn_models)
84 | np.save(args.output_attribution_scores, attribution_scores)
85 | np.save(args.output_attribution_threshold, np.std(attribution_scores))
86 |
87 |
--------------------------------------------------------------------------------
/pfabnet/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class ViscosityNet(nn.Module):
6 | def __init__(self, grid_dim=96):
7 | super(ViscosityNet, self).__init__()
8 | nfilt = 2
9 | ks = 3
10 |
11 | dilation = 1
12 | if grid_dim >= 64:
13 | self.convnet = nn.Sequential(nn.Conv3d(1, nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
14 | nn.MaxPool3d(2),
15 | nn.Conv3d(nfilt, 2*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
16 | nn.MaxPool3d(2),
17 | nn.Conv3d(2*nfilt, 4*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
18 | nn.MaxPool3d(2),
19 | nn.Conv3d(4*nfilt, 8*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
20 | nn.MaxPool3d(2),
21 | nn.Conv3d(8*nfilt, 16*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
22 | nn.MaxPool3d(2),
23 | nn.Conv3d(16*nfilt, 32*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
24 | nn.MaxPool3d(2),
25 | nn.Conv3d(32*nfilt, 512*nfilt, ks, padding='same', dilation=dilation), nn.ReLU()
26 | )
27 | else:
28 | self.convnet = nn.Sequential(nn.Conv3d(1, nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
29 | nn.MaxPool3d(2),
30 | nn.Conv3d(nfilt, 2*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
31 | nn.MaxPool3d(2),
32 | nn.Conv3d(2*nfilt, 4*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
33 | nn.MaxPool3d(2),
34 | nn.Conv3d(4*nfilt, 8*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
35 | nn.MaxPool3d(2),
36 | nn.Conv3d(8*nfilt, 16*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
37 | nn.MaxPool3d(2),
38 | nn.Conv3d(16*nfilt, 32*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
39 | nn.Conv3d(32*nfilt, 512*nfilt, ks, padding='same', dilation=dilation), nn.ReLU()
40 | )
41 |
42 |
43 | self.fc = nn.Sequential(nn.Linear(512*nfilt, 1), nn.ReLU())
44 |
45 | self.drop_out = nn.Dropout(0.05)
46 |
47 |
48 | def forward(self, x, y=None):
49 | x = self.convnet(x)
50 |
51 | emb = torch.flatten(x, 1)
52 |
53 | x = self.drop_out(emb)
54 | x = self.fc(x)
55 |
56 | if y is not None:
57 |             loss = nn.functional.huber_loss(x, y, reduction='mean')  # (input, target) order; Huber is symmetric in the two
58 | return x, loss
59 | else:
60 | return x
61 |
62 |
--------------------------------------------------------------------------------
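The fc head in ViscosityNet expects a flat 512*nfilt vector, which only works out because the six MaxPool3d(2) stages in the grid_dim >= 64 branch shrink a 96-cubed grid to a single voxel (PyTorch pooling uses floor division by default). A stdlib-only sketch of that size arithmetic:

```python
# Trace the spatial edge length of a cubic grid through successive
# MaxPool3d(2) stages; each stage halves the edge with floor division.
def pooled_sizes(grid_dim, n_pools):
    sizes = [grid_dim]
    for _ in range(n_pools):
        sizes.append(sizes[-1] // 2)
    return sizes

# grid_dim=96 branch has 6 pooling stages:
print(pooled_sizes(96, 6))  # [96, 48, 24, 12, 6, 3, 1]
```

With a final spatial size of 1x1x1 and 512*nfilt = 1024 channels in the last conv, flattening yields exactly the 1024 features that nn.Linear(512*nfilt, 1) consumes; the grid_dim < 64 branch drops one pooling stage for the same reason.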
/pfabnet/predict.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import torch
3 | from dataset import ViscosityDataset
4 | from torch.utils.data.dataloader import DataLoader
5 |
6 | import argparse
7 | import glob
8 | import os
9 |
10 | import numpy as np
11 |
12 | from model import ViscosityNet
13 | from utils import seed_everything
14 | from utils import generate_esp_grids, get_esp_grids
15 | from utils import DEFAULT_GRID_PARAMS, ESP_DIR_KEY
16 | from base import ENTITY_KEY
17 |
18 |
19 | device = 'cpu'
20 | if torch.cuda.is_available():
21 | device = torch.cuda.current_device()
22 |
23 | def get_cnn_models(args, model_files):
24 | models = []
25 | for model_file in model_files:
26 | model = ViscosityNet(args.grid_dim)
27 | if os.path.exists(model_file):
28 | print('loading %s...' % model_file)
29 | model.load_state_dict(torch.load(model_file))
30 | model.eval()
31 |
32 | model = model.to(device)
33 | models.append(model)
34 |
35 | return models
36 |
37 |
38 | def predict(cnn_models, mol_file, args = DEFAULT_GRID_PARAMS):
39 | if len(args[ESP_DIR_KEY]) > 0:
40 | esp_grids = get_esp_grids(args, mol_file)
41 | else:
42 | esp_grids = generate_esp_grids(args, mol_file)
43 |
44 | esp_grids = [esp_array for esp_array, _ in esp_grids]
45 | 
46 | dummy_y = [0.0]*len(esp_grids)
47 |
48 | test_dataset = ViscosityDataset(esp_grids, dummy_y)
49 |
50 | loader = DataLoader(test_dataset, shuffle=False, pin_memory=True,
51 | batch_size=1, num_workers=0)
52 |
53 | y_preds = []
54 | for it, d_it in enumerate(loader):
55 | x, y = d_it
56 |
57 | # place data on the correct device
58 | x = x.to(device)
59 |
60 | for model in cnn_models:
61 | # forward the model
62 |             with torch.no_grad():
63 | output = model(x)
64 |
65 | y1 = output.detach().cpu().squeeze(1).numpy()
66 | y_preds.extend(y1)
67 |
68 |
69 | return np.power(10, np.mean(np.array(y_preds)))
70 |
71 |
72 |
73 |
74 | if __name__ == "__main__":
75 | parser = argparse.ArgumentParser(
76 | description='Generate predictions using PfAbNet models')
77 | parser.add_argument('--structure_file', type=str, help='Input Fv structure')
78 |     parser.add_argument('--PfAbNet_model_prefix', type=str, default='PfAbNet', help='trained model file prefix')
79 |     parser.add_argument('--PfAbNet_model_dir', type=str, help='directory containing trained PfAbNet models')
80 | parser.add_argument('--grid_dim', type=int, default=96,
81 | help='number of grid points along each axis (default = 96)')
82 | parser.add_argument('--grid_spacing', type=float, default=0.75,
83 | help='spacing between grid points (default = 0.75 Angstrom)')
84 | parser.add_argument('--shell_width', type=float, default=2.0,
85 | help='thickness of the surface shell (default 2.0 Angstrom)')
86 | parser.add_argument('--NX', type=int, default=10,
87 | help='augmentation level (default 10x)')
88 | parser.add_argument('--processors', type=int, default=5,
89 | help='Number of CPUs for ESP grid calculation (default 5)')
90 | parser.add_argument('--esp_dir', type=str, default='', help='directory with precomputed ESP grids')
91 | parser.add_argument('--output_file', type=str, help='Output file with prediction')
92 | parser.add_argument('-v', '--verbose', action='count', default=0)
93 | args = parser.parse_args()
94 |
95 | seed_everything(42)
96 |
97 | model_files_prefix = os.path.join(args.PfAbNet_model_dir, args.PfAbNet_model_prefix)
98 | model_files = glob.glob('%s*.pt' % model_files_prefix)
99 | cnn_models = get_cnn_models(args, model_files)
100 |
101 | output = []
102 | ypred = predict(cnn_models, args.structure_file, args.__dict__)
103 | output.append({ENTITY_KEY:os.path.basename(args.structure_file).split('.mol2')[0], 'VISCOSITY_PRED':ypred})
104 | print(args.structure_file, ypred)
105 |
106 | df = pd.DataFrame(output)
107 | df.to_csv(args.output_file, index=False)
108 |
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
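predict() averages the ensemble outputs in log10 space and then computes np.power(10, mean), so the reported viscosity is the geometric mean of the per-model, per-rotation predictions rather than their arithmetic mean. A small stdlib illustration of that equivalence (the prediction values are made up):

```python
import math

# Hypothetical per-rotation viscosity predictions in cP.
preds_cp = [8.0, 12.5, 20.0]

# Average in log10 space, then invert -- what predict() does.
log_mean = sum(math.log10(p) for p in preds_cp) / len(preds_cp)
ensemble = 10 ** log_mean

# Identical to the geometric mean of the raw predictions.
geometric_mean = math.prod(preds_cp) ** (1 / len(preds_cp))
assert math.isclose(ensemble, geometric_mean)
print(round(ensemble, 2))  # 12.6
```

The geometric mean damps the influence of a single high outlier prediction, which is a sensible choice for a quantity that the model learns on a log scale.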
/pfabnet/sbatch_tmpl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -l
2 | #SBATCH -e %j.err
3 | #SBATCH -o %j.out
4 | #SBATCH --nodes=1
5 | #SBATCH --gres=gpu:v100:1
6 | #SBATCH --mem=32gb
7 | #SBATCH --wait
8 |
--------------------------------------------------------------------------------
/pfabnet/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import torch
4 | import torch.nn as nn
5 |
6 | import argparse
7 | from sklearn.model_selection import KFold
8 |
9 | from dataset import ViscosityDataset
10 | from model import ViscosityNet
11 | from trainer import Trainer, TrainerConfig
12 | from utils import seed_everything, prepare_training_input
13 | from base import VISCOSITY_KEY
14 |
15 |
16 | def train(args):
17 | seed_everything(42)
18 |
19 | training_data_files = args.training_data_file.split(',')
20 | df_list = []
21 | for training_data_file in training_data_files:
22 | if training_data_file.endswith('.csv'):
23 | df = pd.read_csv(training_data_file)
24 | else:
25 | df = pd.read_pickle(training_data_file)
26 |
27 | df_list.append(df)
28 |
29 | df = pd.concat(df_list)
30 | df.loc[df[VISCOSITY_KEY] > 1000, VISCOSITY_KEY] = 1000
31 |
32 | X, y = prepare_training_input(df, args.__dict__)
33 |
34 |     kf = KFold(n_splits=10, shuffle=True)  # shuffle order is reproducible via seed_everything(42) above
35 | train_index, val_index = list(kf.split(y))[args.fold_idx]
36 |
37 | X_train, y_train = X[train_index], y[train_index]
38 | X_val, y_val = X[val_index], y[val_index]
39 | print('Number of datapoints; train: %d, val: %d' % (len(y_train), len(y_val)))
40 |
41 | train_dataset = ViscosityDataset(X_train, y_train)
42 | val_dataset = ViscosityDataset(X_val, y_val)
43 |
44 | # save model path
45 | ckpt_file = '%s_%d.pt' % (args.output_model_prefix, args.fold_idx)
46 | ckpt_path = os.path.join(args.output_model_dir, ckpt_file)
47 | print('PyTorch model will be saved in ', ckpt_path)
48 |
49 | def weights_init(m):
50 | if isinstance(m, nn.Conv3d) or isinstance(m, nn.Linear):
51 | torch.nn.init.kaiming_normal_(m.weight)
52 | torch.nn.init.zeros_(m.bias)
53 |
54 | model = ViscosityNet(args.grid_dim)
55 | model.apply(weights_init)
56 | if os.path.exists(ckpt_path):
57 | print('loading saved model...')
58 | model.load_state_dict(torch.load(ckpt_path))
59 | model.eval()
60 |
61 | print(sum(p.numel() for p in model.parameters() if p.requires_grad), 'model parameters')
62 |
63 | bs = 1
64 |
65 | history_file = '%s_hist_%d.pkl' % (args.output_model_prefix, args.fold_idx)
66 | history_path = os.path.join(args.output_model_dir, history_file)
67 | tconf = TrainerConfig(max_epochs=2000, batch_size=bs, learning_rate=1e-5,
68 | num_workers=0, ckpt_path=ckpt_path, history_path=history_path)
69 |
70 | trainer = Trainer(model, train_dataset, val_dataset, tconf)
71 | trainer.train()
72 |
73 |
74 |
75 | if __name__ == "__main__":
76 | parser = argparse.ArgumentParser(
77 | description='train PfAbNet model')
78 | parser.add_argument('--training_data_file', type=str, help='training data file')
79 | parser.add_argument('--homology_model_dir', type=str, help='homology model directory')
80 | parser.add_argument('--output_model_prefix', type=str, default='PfAbNet', help='output model prefix')
81 | parser.add_argument('--output_model_dir', type=str, help='output model directory')
82 | parser.add_argument('--grid_dim', type=int, default=96,
83 | help='number of grid points along each axis (default = 96)')
84 | parser.add_argument('--grid_spacing', type=float, default=0.75,
85 |                         help='spacing between grid points (default = 0.75 Angstrom)')
86 | parser.add_argument('--shell_width', type=float, default=2.0,
87 | help='thickness of the surface shell (default 2.0 Angstrom)')
88 | parser.add_argument('--NX', type=int, default=10,
89 | help='augmentation level (default 10x)')
90 | parser.add_argument('--processors', type=int, default=5,
91 | help='Number of CPUs for ESP grid calculation (default 5)')
92 | parser.add_argument('--esp_dir', type=str, default='', help='directory with precomputed ESP grids')
93 | parser.add_argument('--fold_idx', default=0, type=int,
94 | help='index of the k-fold split (default = 0)')
95 | parser.add_argument('-v', '--verbose', action='count', default=0)
96 | args = parser.parse_args()
97 |
98 | os.makedirs(args.output_model_dir, exist_ok=True)
99 |
100 | train(args)
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/pfabnet/trainer.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import numpy as np
3 | import torch
4 | from torch.utils.data.dataloader import DataLoader
5 | import pickle
6 |
7 |
8 | class TrainerConfig:
9 | # optimization parameters
10 | betas = (0.9, 0.999)
11 | grad_norm_clip = 1.0
12 | ckpt_path = None
13 | history_path = None
14 | num_workers = 0 # for DataLoader
15 |
16 | def __init__(self, **kwargs):
17 | for k,v in kwargs.items():
18 | setattr(self, k, v)
19 |
20 | class Trainer:
21 |
22 | def __init__(self, model, train_dataset, val_dataset, config):
23 | self.model = model
24 | self.train_dataset = train_dataset
25 | self.val_dataset = val_dataset
26 | self.config = config
27 |
28 | self.device = 'cpu'
29 | if torch.cuda.is_available():
30 | self.device = torch.cuda.current_device()
31 | self.model = torch.nn.DataParallel(self.model).to(self.device)
32 |
33 | def save_checkpoint(self):
34 | raw_model = self.model.module if hasattr(self.model, "module") else self.model
35 | torch.save(raw_model.state_dict(), self.config.ckpt_path)
36 |
37 |
38 | def train(self):
39 | model, config = self.model, self.config
40 | optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate,
41 | betas=config.betas, weight_decay=0.005)
42 |
43 | def run_epoch(split):
44 | is_train = split == 'train'
45 | model.train(is_train)
46 |
47 | if is_train:
48 | data = self.train_dataset
49 | batch_size = config.batch_size
50 | else:
51 | data = self.val_dataset
52 | batch_size = config.batch_size
53 |
54 | shuffle = False
55 | if is_train:
56 | shuffle = True
57 |
58 | loader = DataLoader(data, shuffle=shuffle, pin_memory=True,
59 | batch_size=batch_size,
60 | num_workers=config.num_workers)
61 |
62 | losses = []
63 | pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
64 | for it, d_it in pbar:
65 | x, y = d_it
66 |
67 | x = x.to(self.device)
68 | y = y.to(self.device)
69 |
70 | with torch.set_grad_enabled(is_train):
71 | output, loss = model(x, y)
72 | loss = loss.mean()
73 | losses.append(loss.item())
74 |
75 | if is_train:
76 | model.zero_grad()
77 | loss.backward()
78 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
79 | optimizer.step()
80 |
81 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. "
82 | f"lr {config.learning_rate:e}")
83 |
84 | return float(np.mean(losses))
85 |
86 | best_loss = float('inf')
87 | try:
88 | with open(self.config.history_path, 'rb') as fptr:
89 | history = pickle.load(fptr)
90 | start_epoch = len(np.array(history['val_loss']))
91 | history = {'train_loss':history['train_loss'][:start_epoch], 'val_loss':history['val_loss'][:start_epoch]}
92 | except Exception as e:
93 | history = {'train_loss': [], 'val_loss': []}
94 | start_epoch = 0
95 |
96 | for epoch in range(start_epoch, config.max_epochs):
97 | train_loss = run_epoch('train')
98 | val_loss = run_epoch('val')
99 | history['train_loss'].append(train_loss)
100 | history['val_loss'].append(val_loss)
101 |
102 | with open(self.config.history_path, 'wb') as fptr:
103 | pickle.dump(history, fptr)
104 |
105 |             if epoch < 1950:  # checkpoint every early epoch; select on best val loss only over the final epochs
106 | self.save_checkpoint()
107 | continue
108 |
109 | good_model = val_loss < best_loss
110 | if self.config.ckpt_path is not None and good_model:
111 | best_loss = val_loss
112 | self.save_checkpoint()
113 |
114 |
115 |
--------------------------------------------------------------------------------
/pfabnet/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import multiprocessing
4 | import pickle
5 |
6 | import numpy as np
7 |
8 | import torch
9 | from captum.attr import IntegratedGradients
10 |
11 | from openeye import oechem
12 | from openeye import oegrid
13 | from openeye import oezap
14 | from openeye import oespicoli
15 |
16 | try:
17 | from base import VISCOSITY_KEY, ENTITY_KEY
18 | except Exception as e:
19 | from .base import VISCOSITY_KEY, ENTITY_KEY
20 |
21 | ESP_GRID_KEY = 'ESP_GRID'
22 |
23 | INPUT_MOL_KEY = 'INPUT_MOL'
24 | ROT_X_KEY = 'rot_x'
25 | ROT_Y_KEY = 'rot_y'
26 | ROT_Z_KEY = 'rot_z'
27 | GRID_SPACING_KEY = 'grid_spacing'
28 | GRID_DIM_KEY = 'grid_dim'
29 | SHELL_WIDTH_KEY = 'shell_width'
30 | NX_KEY = 'NX' # augmentation level
31 | PROCESSORS_KEY = 'processors'
32 | HOMOLOGY_MODEL_DIR_KEY = 'homology_model_dir'
33 | ESP_DIR_KEY = 'esp_dir'
34 |
35 | DEFAULT_GRID_PARAMS = {GRID_DIM_KEY: 96, GRID_SPACING_KEY: 0.75, SHELL_WIDTH_KEY: 2.0,
36 |                        NX_KEY: 10, PROCESSORS_KEY: 5, ESP_DIR_KEY: ''}
37 |
38 | def get_molecule(input_file, perceive_residue=True, center_mol=True):
39 | ifs = oechem.oemolistream(input_file)
40 | mol = oechem.OEGraphMol()
41 | oechem.OEReadMolecule(ifs, mol)
42 | ifs.close()
43 |
44 | if perceive_residue:
45 | oechem.OEPerceiveResidues(mol)
46 | if center_mol:
47 | oechem.OECenter(mol)
48 |
49 | return mol
50 |
51 |
52 | def get_esp_array(params):
53 | mol = params[INPUT_MOL_KEY]
54 | theta_x = params[ROT_X_KEY]
55 | theta_y = params[ROT_Y_KEY]
56 | theta_z = params[ROT_Z_KEY]
57 | grid_spacing = params[GRID_SPACING_KEY]
58 | grid_dim = params[GRID_DIM_KEY]
59 | shell_width = params[SHELL_WIDTH_KEY]
60 |
61 | oechem.OEEulerRotate(mol, oechem.OEDoubleArray([theta_x, theta_y, theta_z]))
62 |
63 | oechem.OEAssignBondiVdWRadii(mol)
64 |
65 | zap = oezap.OEZap()
66 | zap.SetInnerDielectric(2.0)
67 | zap.SetGridSpacing(grid_spacing)
68 | zap.SetMolecule(mol)
69 |
70 | grid = oegrid.OEScalarGrid(grid_dim, grid_dim, grid_dim,
71 | 0.0, 0.0, 0.0, grid_spacing)
72 | zap.SetOuterDielectric(80)
73 | zap.CalcPotentialGrid(grid)
74 |
75 | surf = oespicoli.OESurface()
76 | oespicoli.OEMakeMolecularSurface(surf, mol)
77 |
78 | surf_grid = oegrid.OEScalarGrid(grid_dim, grid_dim, grid_dim, 0.0, 0.0, 0.0, grid_spacing)
79 | oespicoli.OEMakeGridFromSurface(surf_grid, surf)
80 |
81 | grid_size = grid.GetSize()
82 | arr = np.zeros(grid_size)
83 | idx = 0
84 | count = 0
85 | for i in range(0, grid_dim):
86 | for j in range(0, grid_dim):
87 | for k in range(0, grid_dim):
88 | v = surf_grid.GetValue(i, j, k)
89 | if 0 <= v < shell_width:
90 | val = grid.GetValue(i, j, k)
91 | arr[idx] = val
92 |
93 | count += 1
94 | idx += 1
95 |
96 | arr3d_esp = np.reshape(arr, (grid_dim, grid_dim, grid_dim, 1))
97 |
98 | return arr3d_esp, mol
99 |
100 |
101 |
102 | def prepare_cnn_input(df, args, train=True):
103 | hm_model_dir = args[HOMOLOGY_MODEL_DIR_KEY]
104 | if hm_model_dir is None:
105 |         raise ValueError('Homology model directory not specified')
106 |
107 | X = []
108 | y = []
109 | for row_idx, row in df.iterrows():
110 | entity = row[ENTITY_KEY]
111 |
112 | mol_file = os.path.join(hm_model_dir, entity + '.mol2')
113 | if len(args[ESP_DIR_KEY]) > 0:
114 | esp_grids = get_esp_grids(args, mol_file)
115 | else:
116 | esp_grids = generate_esp_grids(args, mol_file)
117 |
118 | esp_grids = [esp_array for esp_array, _ in esp_grids]
119 |
120 | X.extend(esp_grids)
121 | if train:
122 | log_visc = np.log10(row[VISCOSITY_KEY])
123 | y.extend([log_visc] * args[NX_KEY])
124 | else:
125 | y.extend([0.0] * args[NX_KEY])
126 |
127 | return np.array(X), np.array(y)
128 |
129 |
130 | def get_esp_grids(args, mol_file):
131 | esp_dir = args[ESP_DIR_KEY]
132 | esp_array_output = []
133 | for i in range(args[NX_KEY]):
134 | with open('%s/rotation_%d/%s.pyb' % (esp_dir, i + 1,
135 | os.path.basename(mol_file).split('.mol2')[0]), 'rb') as fptr:
136 |
137 |             mol = get_molecule(os.path.join(esp_dir, 'rotation_%d' % (i + 1), os.path.basename(mol_file)))
138 | esp_array_output.append((pickle.load(fptr), mol))
139 |
140 | return esp_array_output
141 |
142 |
143 | def generate_esp_grids(args, mol_file):
144 | mol = get_molecule(mol_file)
145 |
146 | params = []
147 | for i in range(args[NX_KEY]):
148 | rot_x = np.random.uniform(0, 180)
149 | rot_y = np.random.uniform(0, 180)
150 | rot_z = np.random.uniform(0, 180)
151 |
152 | params.append({INPUT_MOL_KEY: oechem.OEGraphMol(mol), ROT_X_KEY: rot_x,
153 | ROT_Y_KEY: rot_y, ROT_Z_KEY: rot_z,
154 | GRID_DIM_KEY: args[GRID_DIM_KEY],
155 | GRID_SPACING_KEY: args[GRID_SPACING_KEY],
156 | SHELL_WIDTH_KEY: args[SHELL_WIDTH_KEY]})
157 | if multiprocessing.cpu_count() >= args[PROCESSORS_KEY]:
158 | processors = args[PROCESSORS_KEY]
159 | else:
160 | processors = multiprocessing.cpu_count()
161 | p = multiprocessing.Pool(processes=processors)
162 | esp_array_output = p.map(get_esp_array, params)
163 | p.close()
164 |
165 | output = [(np.moveaxis(esp_array, 3, 0), output_mol) for esp_array, output_mol in esp_array_output]
166 | return output
167 |
168 |
169 | def prepare_training_input(df, args):
170 | return prepare_cnn_input(df, args, train=True)
171 |
172 |
173 | def prepare_test_input(df, args):
174 | return prepare_cnn_input(df, args, train=False)
175 |
176 |
177 | def calculate_attribution_grid(model, esp_grid_in, device='cpu'):
178 | esp_grid = torch.Tensor(esp_grid_in)
179 | baseline = torch.zeros(esp_grid.shape)
180 | esp_grid = esp_grid.unsqueeze(0)
181 | esp_grid2 = esp_grid.to(device)
182 |
183 | baseline = torch.unsqueeze(baseline, 0)
184 | baseline = baseline.to(device)
185 |
186 | ig = IntegratedGradients(model)
187 | attributions, delta = ig.attribute(esp_grid2, baseline, target=0, return_convergence_delta=True)
188 | attributions = attributions.detach().cpu().numpy()
189 | esp_grid2 = esp_grid2.detach().cpu().numpy()
190 |
191 | return attributions, esp_grid2
192 |
193 |
194 | def seed_everything(seed):
195 | random.seed(seed)
196 | np.random.seed(seed)
197 | torch.manual_seed(seed)
198 | torch.cuda.manual_seed_all(seed)
199 |
--------------------------------------------------------------------------------
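For orientation, the voxel loop in `get_esp_array` keeps ESP values only where the surface grid reads between 0 and `shell_width`. Once the two grids are exported to numpy arrays, the same selection can be sketched as a vectorized mask (illustrative only: `esp` and `surf` below are stand-ins for the OEZap and Spicoli grid contents, not OpenEye calls):

```python
import numpy as np

def shell_masked_esp(esp, surf, shell_width=2.0):
    """Mirror the `0 <= v < shell_width` test in get_esp_array, vectorized.

    esp, surf: float arrays of shape (D, D, D); returns (D, D, D, 1) with
    ESP values kept inside the surface shell and zeros elsewhere.
    """
    mask = (surf >= 0.0) & (surf < shell_width)
    return np.where(mask, esp, 0.0)[..., np.newaxis]

rng = np.random.default_rng(0)
esp = rng.normal(size=(4, 4, 4))
surf = rng.uniform(-1.0, 3.0, size=(4, 4, 4))
arr = shell_masked_esp(esp, surf)
assert arr.shape == (4, 4, 4, 1)
```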
/pfabnet_eisenberg/__init__.py:
--------------------------------------------------------------------------------
1 | from .dataset import ViscosityDataset
2 |
3 |
--------------------------------------------------------------------------------
/pfabnet_eisenberg/base.py:
--------------------------------------------------------------------------------
1 | ENTITY_KEY = 'Entity'
2 | VISCOSITY_KEY = 'Viscosity_at_150'
3 | SCHRODINGER_BASE = '/localscratch/software/schrodinger/adv-2021-2'
4 |
5 |
6 | def get_file_path():
7 | return __file__
8 |
9 |
--------------------------------------------------------------------------------
/pfabnet_eisenberg/dataset.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import torch
3 |
4 | class ViscosityDataset(Dataset):
5 | def __init__(self, X, y):
6 | self.X = X
7 | self.y = y
8 |
9 | def __getitem__(self, index):
10 | return torch.Tensor(self.X[index]), torch.Tensor([self.y[index]])
11 |
12 | def __len__(self):
13 | return len(self.y)
14 |
15 |
--------------------------------------------------------------------------------
/pfabnet_eisenberg/eisenberg_generator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import argparse
4 | import pickle
5 |
6 | import numpy as np
7 | from openeye import oechem
8 | from utils import generate_eisenberg_grids
9 |
10 |
11 |
12 | if __name__ == '__main__':
13 | parser = argparse.ArgumentParser(
14 |         description='Generate PfAbNet ESP and Eisenberg grid input')
15 | parser.add_argument('--input_mols_dir', type=str, default='./',
16 | help='directory containing antibody structures/models')
17 | parser.add_argument('--eisenberg_output_dir', type=str, default='./',
18 | help='directory to save the generated Eisenberg grid files')
19 | parser.add_argument('--grid_dim', type=int, default=96,
20 | help='number of grid points along each axis (default = 96)')
21 | parser.add_argument('--grid_spacing', type=float, default=0.75,
22 | help='spacing between grid points (default = 0.75 Angstrom)')
23 | parser.add_argument('--shell_width', type=float, default=2.0,
24 | help='thickness of the surface shell (default 2.0 Angstrom)')
25 | parser.add_argument('--NX', type=int, default=10,
26 | help='augmentation level (default 10x)')
27 | parser.add_argument('--processors', type=int, default=10,
28 | help='Number of CPUs for ESP grid calculation (default 10)')
29 | parser.add_argument('--seed', type=int, default=42,
30 | help='random seed (default 42)')
31 |
32 | parser.add_argument('-v', '--verbose', action='count', default=0)
33 | in_args = parser.parse_args()
34 |
35 | input_mols_dir = in_args.input_mols_dir
36 | eisenberg_dir = in_args.eisenberg_output_dir
37 | seed = in_args.seed
38 |
39 | args = in_args.__dict__
40 | np.random.seed(seed)
41 |
42 | try:
43 | os.mkdir(eisenberg_dir)
44 | except Exception as e:
45 | pass
46 |
47 | mol_files = glob.glob(input_mols_dir + '/*.mol2')
48 | for mol_file in mol_files:
49 | print(mol_file)
50 | output = generate_eisenberg_grids(args, mol_file)
51 | for idx, (esp_grid, phobic_grid, philic_grid, output_mol) in enumerate(output):
52 | output_dir = os.path.join(eisenberg_dir, 'rotation_%d' % (idx + 1))
53 | try:
54 | os.mkdir(output_dir)
55 | except Exception as e:
56 | pass
57 |
58 | base_mol_file = os.path.basename(mol_file).split('.mol2')[0]
59 | esp_file = os.path.join(output_dir, base_mol_file + '.pyb')
60 | with open(esp_file, 'wb') as fptr:
61 | pickle.dump([esp_grid, phobic_grid, philic_grid], fptr)
62 |
63 | output_mol_file = os.path.join(output_dir, os.path.basename(mol_file))
64 |
65 | ofs = oechem.oemolostream(output_mol_file)
66 | oechem.OEWriteConstMolecule(ofs, output_mol)
67 | ofs.close()
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/pfabnet_eisenberg/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class ViscosityNet(nn.Module):
6 | def __init__(self, num_channels=2):
7 | super(ViscosityNet, self).__init__()
8 | nfilt = num_channels
9 | ks = 3
10 |
11 | dilation = 1
12 | if num_channels == 2:
13 | self.convnet = nn.Sequential(nn.Conv3d(num_channels, nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
14 | nn.MaxPool3d(2),
15 | nn.Conv3d(nfilt, 2*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
16 | nn.MaxPool3d(2),
17 | nn.Conv3d(2*nfilt, 4*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
18 | nn.MaxPool3d(2),
19 | nn.Conv3d(4*nfilt, 8*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
20 | nn.MaxPool3d(2),
21 | nn.Conv3d(8*nfilt, 16*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
22 | nn.MaxPool3d(2),
23 | nn.Conv3d(16*nfilt, 32*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
24 | nn.MaxPool3d(2),
25 | nn.Conv3d(32*nfilt, 1024, ks, padding='same', dilation=dilation), nn.ReLU()
26 | )
27 | elif num_channels == 3:
28 | self.convnet = nn.Sequential(nn.Conv3d(num_channels, nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
29 | nn.MaxPool3d(2),
30 | nn.Conv3d(nfilt, 2*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
31 | nn.MaxPool3d(2),
32 | nn.Conv3d(2*nfilt, 4*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
33 | nn.MaxPool3d(2),
34 | nn.Conv3d(4*nfilt, 8*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
35 | nn.MaxPool3d(2),
36 | nn.Conv3d(8*nfilt, 16*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
37 | nn.MaxPool3d(2),
38 | nn.Conv3d(16*nfilt, 32*nfilt, ks, padding='same', dilation=dilation), nn.ReLU(),
39 | nn.MaxPool3d(2),
40 | nn.Conv3d(32*nfilt, 1024, ks, padding='same', dilation=dilation), nn.ReLU()
41 | )
42 | else:
43 |             raise ValueError('number of input channels must be either 2 or 3')
44 |
45 |
46 | self.fc = nn.Sequential(nn.Linear(1024, 1), nn.ReLU())
47 |
48 | self.drop_out = nn.Dropout(0.05)
49 |
50 |
51 | def forward(self, x, y=None):
52 | x = self.convnet(x)
53 |
54 | emb = torch.flatten(x, 1)
55 |
56 | x = self.drop_out(emb)
57 | x = self.fc(x)
58 |
59 | if y is not None:
60 |             loss = nn.functional.huber_loss(x, y, reduction='mean')
61 | return x, loss
62 | else:
63 | return x, emb
64 |
65 |
--------------------------------------------------------------------------------
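A quick sanity check on the architecture in `model.py`: six `MaxPool3d(2)` stages shrink the default 96-point grid to a single voxel before `torch.flatten`, which is why the head is `nn.Linear(1024, 1)` (1024 channels x 1^3 voxels). A minimal sketch of the arithmetic, with no torch dependency:

```python
def pooled_dim(d, n_pools=6):
    """Spatial extent after n_pools MaxPool3d(kernel=2) stages (floor halving each time)."""
    for _ in range(n_pools):
        d //= 2
    return d

# 96 -> 48 -> 24 -> 12 -> 6 -> 3 -> 1
assert pooled_dim(96) == 1
# flatten therefore yields 1024 * pooled_dim(96)**3 == 1024 features,
# matching the nn.Linear(1024, 1) head
assert 1024 * pooled_dim(96) ** 3 == 1024
```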
/pfabnet_eisenberg/predict.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import torch
3 | from dataset import ViscosityDataset
4 | from torch.utils.data.dataloader import DataLoader
5 |
6 | import argparse
7 | import glob
8 | import os
9 |
10 | import numpy as np
11 |
12 | from model import ViscosityNet
13 | from utils import seed_everything
14 | from utils import generate_eisenberg_grids, get_eisenberg_grids
15 | from utils import DEFAULT_GRID_PARAMS, EISENBERG_DIR_KEY
16 | from base import ENTITY_KEY
17 |
18 |
19 | device = 'cpu'
20 | if torch.cuda.is_available():
21 | device = torch.cuda.current_device()
22 |
23 | def get_cnn_models(args, model_files):
24 | models = []
25 | for model_file in model_files:
26 | model = ViscosityNet(args.num_channels)
27 | if os.path.exists(model_file):
28 | print('loading %s...' % model_file)
29 | model.load_state_dict(torch.load(model_file))
30 | model.eval()
31 |
32 | model = model.to(device)
33 | models.append(model)
34 |
35 | return models
36 |
37 |
38 | def predict(cnn_models, mol_file, args = DEFAULT_GRID_PARAMS):
39 | if len(args[EISENBERG_DIR_KEY]) > 0:
40 | esp_grids = get_eisenberg_grids(args, mol_file)
41 | else:
42 | esp_grids = generate_eisenberg_grids(args, mol_file)
43 |
44 | if args['num_channels'] == 3:
45 | combined_grids = [np.concatenate([esp_arr, phobic_arr, philic_arr], axis=0)
46 | for esp_arr, phobic_arr, philic_arr, _ in esp_grids]
47 | else:
48 | combined_grids = [np.concatenate([phobic_arr, philic_arr], axis=0)
49 | for _, phobic_arr, philic_arr, _ in esp_grids]
50 |
51 | dummy_y = [0.0]*len(combined_grids)
52 |
53 | test_dataset = ViscosityDataset(combined_grids, dummy_y)
54 |
55 | loader = DataLoader(test_dataset, shuffle=False, pin_memory=True,
56 | batch_size=1, num_workers=0)
57 |
58 | y_preds = []
59 | for it, d_it in enumerate(loader):
60 | x, y = d_it
61 |
62 | # place data on the correct device
63 | x = x.to(device)
64 |
65 | for model in cnn_models:
66 | # forward the model
67 | with torch.set_grad_enabled(False):
68 | output, _ = model(x)
69 |
70 | y1 = output.detach().cpu().squeeze(1).numpy()
71 | y_preds.extend(y1)
72 |
73 |
74 | return np.power(10, np.mean(np.array(y_preds)))
75 |
76 |
77 |
78 |
79 | if __name__ == "__main__":
80 | parser = argparse.ArgumentParser(
81 | description='Generate predictions using PfAbNet models')
82 | parser.add_argument('--structure_file', type=str, help='Input Fv structure')
83 | parser.add_argument('--PfAbNet_model_prefix', type=str, default='PfAbNet', help='output model prefix')
84 | parser.add_argument('--PfAbNet_model_dir', type=str, help='output model directory')
85 | parser.add_argument('--grid_dim', type=int, default=96,
86 | help='number of grid points along each axis (default = 96)')
87 | parser.add_argument('--grid_spacing', type=float, default=0.75,
88 | help='spacing between grid points (default = 0.75 Angstrom)')
89 | parser.add_argument('--shell_width', type=float, default=2.0,
90 | help='thickness of the surface shell (default 2.0 Angstrom)')
91 | parser.add_argument('--NX', type=int, default=10,
92 | help='augmentation level (default 10x)')
93 | parser.add_argument('--num_channels', type=int, default=2,
94 | help='number of input channels (2 for eisenberg '
95 | 'phobic + philic or 3 for esp_eisenberg (default 2))')
96 | parser.add_argument('--processors', type=int, default=5,
97 | help='Number of CPUs for ESP grid calculation (default 5)')
98 | parser.add_argument('--eisenberg_dir', type=str, default='', help='directory with precomputed density grids')
99 | parser.add_argument('--output_file', type=str, help='Output file with prediction')
100 | parser.add_argument('-v', '--verbose', action='count', default=0)
101 | args = parser.parse_args()
102 |
103 | seed_everything(42)
104 |
105 | model_files_prefix = os.path.join(args.PfAbNet_model_dir, args.PfAbNet_model_prefix)
106 | model_files = glob.glob('%s*.pt' % model_files_prefix)
107 | cnn_models = get_cnn_models(args, model_files)
108 |
109 | output = []
110 | ypred = predict(cnn_models, args.structure_file, args.__dict__)
111 | output.append({ENTITY_KEY:os.path.basename(args.structure_file).split('.mol2')[0], 'VISCOSITY_PRED':ypred})
112 | print(args.structure_file, ypred)
113 |
114 | df = pd.DataFrame(output)
115 | df.to_csv(args.output_file, index=False)
116 |
117 |
118 |
119 |
120 |
--------------------------------------------------------------------------------
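Note that `predict` in the file above averages the per-rotation, per-model predictions in log10 space and then exponentiates, i.e. it reports the geometric mean of the ensemble's viscosity predictions. A small sketch of that back-transform:

```python
import numpy as np

def ensemble_viscosity(log10_preds):
    """Average in log10 space, then back-transform, as in predict()."""
    return float(np.power(10, np.mean(np.asarray(log10_preds))))

# two rotations predicting 10 cP and 1000 cP average to their geometric
# mean sqrt(10 * 1000) = 100 cP, not the arithmetic mean 505 cP
v = ensemble_viscosity(np.log10([10.0, 1000.0]))
assert abs(v - 100.0) < 1e-9
```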
/pfabnet_eisenberg/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import torch
4 | import torch.nn as nn
5 |
6 | import argparse
7 | from sklearn.model_selection import KFold
8 |
9 | from dataset import ViscosityDataset
10 | from model import ViscosityNet
11 | from trainer import Trainer, TrainerConfig
12 | from utils import seed_everything, prepare_training_input
13 | from base import VISCOSITY_KEY
14 |
15 |
16 | def train(args):
17 | seed_everything(42)
18 |
19 | training_data_files = args.training_data_file.split(',')
20 | df_list = []
21 | for training_data_file in training_data_files:
22 | if training_data_file.endswith('.csv'):
23 | df = pd.read_csv(training_data_file)
24 | else:
25 | df = pd.read_pickle(training_data_file)
26 |
27 | df_list.append(df)
28 |
29 | df = pd.concat(df_list)
30 | df.loc[df[VISCOSITY_KEY] > 1000, VISCOSITY_KEY] = 1000
31 |
32 | X, y = prepare_training_input(df, args.__dict__)
33 |
34 | kf = KFold(n_splits=10, shuffle=True)
35 | train_index, val_index = list(kf.split(y))[args.fold_idx]
36 |
37 | X_train, y_train = X[train_index], y[train_index]
38 | X_val, y_val = X[val_index], y[val_index]
39 |     print('Number of datapoints: train = %d, val = %d' % (len(y_train), len(y_val)))
40 |
41 | train_dataset = ViscosityDataset(X_train, y_train)
42 | val_dataset = ViscosityDataset(X_val, y_val)
43 |
44 | # save model path
45 | ckpt_file = '%s_%d.pt' % (args.output_model_prefix, args.fold_idx)
46 | ckpt_path = os.path.join(args.output_model_dir, ckpt_file)
47 |     print('PyTorch model will be saved to', ckpt_path)
48 |
49 | def weights_init(m):
50 | if isinstance(m, nn.Conv3d) or isinstance(m, nn.Linear):
51 | torch.nn.init.kaiming_normal_(m.weight)
52 | torch.nn.init.zeros_(m.bias)
53 |
54 | model = ViscosityNet(args.num_channels)
55 | model.apply(weights_init)
56 | if os.path.exists(ckpt_path):
57 | print('loading saved model...')
58 | model.load_state_dict(torch.load(ckpt_path))
59 | model.eval()
60 |
61 | print(sum(p.numel() for p in model.parameters() if p.requires_grad), 'model parameters')
62 |
63 | bs = 1
64 |
65 | history_file = '%s_hist_%d.pkl' % (args.output_model_prefix, args.fold_idx)
66 | history_path = os.path.join(args.output_model_dir, history_file)
67 | tconf = TrainerConfig(max_epochs=2000, batch_size=bs, learning_rate=1e-5,
68 | num_workers=0, ckpt_path=ckpt_path, history_path=history_path)
69 |
70 | trainer = Trainer(model, train_dataset, val_dataset, tconf)
71 | trainer.train()
72 |
73 |
74 |
75 | if __name__ == "__main__":
76 | parser = argparse.ArgumentParser(
77 | description='train PfAbNet model')
78 | parser.add_argument('--training_data_file', type=str, help='training data file')
79 | parser.add_argument('--homology_model_dir', type=str, help='homology model directory')
80 | parser.add_argument('--output_model_prefix', type=str, default='PfAbNet', help='output model prefix')
81 | parser.add_argument('--output_model_dir', type=str, help='output model directory')
82 | parser.add_argument('--grid_dim', type=int, default=96,
83 | help='number of grid points along each axis (default = 96)')
84 | parser.add_argument('--grid_spacing', type=float, default=0.75,
85 | help='spacing between grid points (default = 0.75 Angstrom)')
86 | parser.add_argument('--shell_width', type=float, default=2.0,
87 | help='thickness of the surface shell (default 2.0 Angstrom)')
88 | parser.add_argument('--NX', type=int, default=10,
89 | help='augmentation level (default 10x)')
90 | parser.add_argument('--processors', type=int, default=5,
91 | help='Number of CPUs for ESP grid calculation (default 5)')
92 | parser.add_argument('--num_channels', type=int, default=2,
93 | help='number of input channels (2 for eisenberg phobic + philic or 3 for esp_eisenberg (default 2))')
94 | parser.add_argument('--eisenberg_dir', type=str, default='', help='directory with precomputed Eisenberg grids')
95 | parser.add_argument('--fold_idx', default=0, type=int,
96 | help='index of the k-fold split (default = 0)')
97 | parser.add_argument('-v', '--verbose', action='count', default=0)
98 | args = parser.parse_args()
99 |
100 | os.makedirs(args.output_model_dir, exist_ok=True)
101 |
102 | train(args)
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
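`train` in the file above takes fold `fold_idx` out of a 10-way `KFold(shuffle=True)` split; since no `random_state` is passed, fold membership comes from numpy's global RNG, which `seed_everything(42)` seeds, so the folds are reproducible across restarts. The splitting itself can be sketched without sklearn (hypothetical `kfold_indices` helper, not part of the package):

```python
import numpy as np

def kfold_indices(n, n_splits=10, seed=42):
    """Minimal k-fold split: shuffled indices cut into n_splits chunks.

    Illustrative stand-in for sklearn's KFold(shuffle=True); like the real
    class, the first n % n_splits folds get one extra sample.
    """
    rng = np.random.default_rng(seed)
    idx = rng.permutation(n)
    fold_sizes = np.full(n_splits, n // n_splits)
    fold_sizes[: n % n_splits] += 1
    splits, start = [], 0
    for size in fold_sizes:
        val = idx[start:start + size]
        train = np.concatenate([idx[:start], idx[start + size:]])
        splits.append((train, val))
        start += size
    return splits

splits = kfold_indices(25, n_splits=5)
train_idx, val_idx = splits[0]
assert len(train_idx) == 20 and len(val_idx) == 5
# folds partition the full index set
assert set(train_idx.tolist()) | set(val_idx.tolist()) == set(range(25))
```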
/pfabnet_eisenberg/trainer.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import numpy as np
3 | import torch
4 | from torch.utils.data.dataloader import DataLoader
5 | import pickle
6 |
7 |
8 | class TrainerConfig:
9 | # optimization parameters
10 | betas = (0.9, 0.999)
11 | grad_norm_clip = 1.0
12 | ckpt_path = None
13 | history_path = None
14 | num_workers = 0 # for DataLoader
15 |
16 | def __init__(self, **kwargs):
17 | for k,v in kwargs.items():
18 | setattr(self, k, v)
19 |
20 | class Trainer:
21 |
22 | def __init__(self, model, train_dataset, val_dataset, config):
23 | self.model = model
24 | self.train_dataset = train_dataset
25 | self.val_dataset = val_dataset
26 | self.config = config
27 |
28 | self.device = 'cpu'
29 | if torch.cuda.is_available():
30 | self.device = torch.cuda.current_device()
31 | self.model = torch.nn.DataParallel(self.model).to(self.device)
32 |
33 | def save_checkpoint(self):
34 | raw_model = self.model.module if hasattr(self.model, "module") else self.model
35 | torch.save(raw_model.state_dict(), self.config.ckpt_path)
36 |
37 |
38 | def train(self):
39 | model, config = self.model, self.config
40 | optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate,
41 | betas=config.betas, weight_decay=0.005)
42 |
43 | def run_epoch(split):
44 | is_train = split == 'train'
45 | model.train(is_train)
46 |
47 | if is_train:
48 | data = self.train_dataset
49 | batch_size = config.batch_size
50 | else:
51 | data = self.val_dataset
52 | batch_size = config.batch_size
53 |
54 | shuffle = False
55 | if is_train:
56 | shuffle = True
57 |
58 | loader = DataLoader(data, shuffle=shuffle, pin_memory=True,
59 | batch_size=batch_size,
60 | num_workers=config.num_workers)
61 |
62 | losses = []
63 | pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
64 | for it, d_it in pbar:
65 | x, y = d_it
66 |
67 | x = x.to(self.device)
68 | y = y.to(self.device)
69 |
70 | with torch.set_grad_enabled(is_train):
71 | output, loss = model(x, y)
72 | loss = loss.mean()
73 | losses.append(loss.item())
74 |
75 | if is_train:
76 | model.zero_grad()
77 | loss.backward()
78 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
79 | optimizer.step()
80 |
81 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. "
82 | f"lr {config.learning_rate:e}")
83 |
84 | return float(np.mean(losses))
85 |
86 | best_loss = float('inf')
87 | try:
88 | with open(self.config.history_path, 'rb') as fptr:
89 | history = pickle.load(fptr)
90 | start_epoch = len(np.array(history['val_loss']))
91 | history = {'train_loss':history['train_loss'][:start_epoch], 'val_loss':history['val_loss'][:start_epoch]}
92 | except Exception as e:
93 | history = {'train_loss': [], 'val_loss': []}
94 | start_epoch = 0
95 |
96 | for epoch in range(start_epoch, config.max_epochs):
97 | train_loss = run_epoch('train')
98 | val_loss = run_epoch('val')
99 | history['train_loss'].append(train_loss)
100 | history['val_loss'].append(val_loss)
101 |
102 | with open(self.config.history_path, 'wb') as fptr:
103 | pickle.dump(history, fptr)
104 |
105 |             if epoch < 1950:  # checkpoint every early epoch; select on best val loss only over the final epochs
106 | self.save_checkpoint()
107 | continue
108 |
109 | good_model = val_loss < best_loss
110 | if self.config.ckpt_path is not None and good_model:
111 | best_loss = val_loss
112 | self.save_checkpoint()
113 |
114 |
115 |
--------------------------------------------------------------------------------
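One detail of `Trainer.train` worth calling out: a run resumes by reading the pickled history and using the number of recorded validation losses as `start_epoch`, falling back to a fresh history if the file is missing or unreadable. The bookkeeping in isolation (hypothetical `load_history` helper mirroring the try/except in `train`):

```python
import os
import pickle
import tempfile

def load_history(history_path):
    """Resume bookkeeping as in Trainer.train(): start_epoch is the number
    of validation losses already recorded, or 0 for a fresh run."""
    try:
        with open(history_path, 'rb') as fptr:
            history = pickle.load(fptr)
        start_epoch = len(history['val_loss'])
    except Exception:
        history = {'train_loss': [], 'val_loss': []}
        start_epoch = 0
    return history, start_epoch

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, 'hist.pkl')
    h, s = load_history(path)              # no file yet -> fresh run
    assert s == 0
    with open(path, 'wb') as f:
        pickle.dump({'train_loss': [0.9, 0.8], 'val_loss': [1.0, 0.95]}, f)
    h, s = load_history(path)              # resumes after 2 recorded epochs
    assert s == 2
```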
/pfabnet_eisenberg/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import multiprocessing
4 | import pickle
5 | import collections
6 |
7 | import numpy as np
8 |
9 | import torch
10 | from openeye import oechem
11 | from openeye import oegrid
12 | from openeye import oezap
13 | from openeye import oespicoli
14 |
15 | try:
16 | from base import VISCOSITY_KEY, ENTITY_KEY
17 | except Exception as e:
18 | from .base import VISCOSITY_KEY, ENTITY_KEY
19 |
20 | EISENBERG_GRID_KEY = 'EISENBERG_GRID'
21 |
22 | INPUT_MOL_KEY = 'INPUT_MOL'
23 | ROT_X_KEY = 'rot_x'
24 | ROT_Y_KEY = 'rot_y'
25 | ROT_Z_KEY = 'rot_z'
26 | GRID_SPACING_KEY = 'grid_spacing'
27 | GRID_DIM_KEY = 'grid_dim'
28 | SHELL_WIDTH_KEY = 'shell_width'
29 | NX_KEY = 'NX' # augmentation level
30 | PROCESSORS_KEY = 'processors'
31 | HOMOLOGY_MODEL_DIR_KEY = 'homology_model_dir'
32 | EISENBERG_DIR_KEY = 'eisenberg_dir'
33 |
34 | DEFAULT_GRID_PARAMS = {GRID_DIM_KEY: 96, GRID_SPACING_KEY: 0.75, SHELL_WIDTH_KEY: 2.0,
35 |                        NX_KEY: 10, PROCESSORS_KEY: 5, EISENBERG_DIR_KEY: '', 'num_channels': 2}
36 |
37 | def get_molecule(input_file):
38 | ifs = oechem.oemolistream(input_file)
39 | mol = oechem.OEGraphMol()
40 | oechem.OEReadMolecule(ifs, mol)
41 | ifs.close()
42 |
43 | oechem.OEPerceiveResidues(mol)
44 | oechem.OECenter(mol)
45 |
46 | return mol
47 |
48 | def get_eisenberg_grid(params, mol, grid_type='PHOBIC'):
49 |     # Eisenberg hydrophobicity scale (positive = hydrophobic)
50 |     eisenberg_scale = collections.defaultdict(float, {
51 |         'ALA': 0.25, 'CYS': 0.04, 'PHE': 0.61, 'ILE': 0.73, 'LEU': 0.53,
52 |         'PRO': -0.07, 'VAL': 0.54, 'TRP': 0.37, 'TYR': 0.02, 'GLY': 0.16,
53 |         'MET': 0.26, 'ASP': -0.72, 'GLU': -0.62, 'HIS': -0.40, 'LYS': -1.1,
54 |         'ASN': -0.64, 'GLN': -0.69, 'ARG': -1.8, 'SER': -0.26, 'THR': -0.18,
55 |     })
56 |
57 |
58 | mol_copy = oechem.OEGraphMol(mol)
59 | for atom in mol_copy.GetAtoms():
60 | res = oechem.OEAtomGetResidue(atom)
61 | aa = res.GetName()
62 | if grid_type == 'PHOBIC' and eisenberg_scale[aa] < 0.0:
63 | mol_copy.DeleteAtom(atom)
64 | continue
65 | if grid_type == 'PHILIC' and eisenberg_scale[aa] > 0.0:
66 | mol_copy.DeleteAtom(atom)
67 | continue
68 |
69 | atom.SetRadius(3*np.abs(eisenberg_scale[aa]))
70 |
71 | mol_copy.Sweep()
72 | print(grid_type, mol_copy.NumAtoms(), oechem.OECount(mol_copy, oechem.OEIsHydrogen()))
73 | grid_spacing = params[GRID_SPACING_KEY]
74 | grid_dim = params[GRID_DIM_KEY]
75 | oe_grid = oegrid.OEScalarGrid(grid_dim, grid_dim, grid_dim, 0.0, 0.0, 0.0, grid_spacing)
76 | oegrid.OEMakeMolecularGaussianGrid(oe_grid, mol_copy)
77 |
78 | return oe_grid
79 |
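Because the scale table is a `defaultdict(float)`, any residue name missing from the Eisenberg table (waters, ions, nonstandard residues) silently maps to 0.0, so it is deleted from neither the PHOBIC nor the PHILIC grid and gets a zero radius. A minimal illustration of that lookup behavior (the two-entry `scale` dict is a stand-in, not the full table):

```python
import collections

# two-entry stand-in for the full Eisenberg scale
scale = collections.defaultdict(float, {'ALA': 0.25, 'ARG': -1.8})

known = scale['ALA']    # tabulated value, 0.25
unknown = scale['HOH']  # default for names missing from the table, 0.0
```

Note that accessing a missing key on a `defaultdict` also inserts it, which is harmless here since only the value is used.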
80 |
81 | def gen_eisenberg_array(params):
82 | mol = params[INPUT_MOL_KEY]
83 | theta_x = params[ROT_X_KEY]
84 | theta_y = params[ROT_Y_KEY]
85 | theta_z = params[ROT_Z_KEY]
86 | grid_spacing = params[GRID_SPACING_KEY]
87 | grid_dim = params[GRID_DIM_KEY]
88 | shell_width = params[SHELL_WIDTH_KEY]
89 |
90 | oechem.OEEulerRotate(mol, oechem.OEDoubleArray([theta_x, theta_y, theta_z]))
91 |
92 | oechem.OEAssignBondiVdWRadii(mol)
93 |
94 | zap = oezap.OEZap()
95 | zap.SetInnerDielectric(2.0)
96 | zap.SetGridSpacing(grid_spacing)
97 | zap.SetMolecule(mol)
98 |
99 | grid = oegrid.OEScalarGrid(grid_dim, grid_dim, grid_dim,
100 | 0.0, 0.0, 0.0, grid_spacing)
101 | zap.SetOuterDielectric(80)
102 | zap.CalcPotentialGrid(grid)
103 |
104 | surf = oespicoli.OESurface()
105 | oespicoli.OEMakeMolecularSurface(surf, mol)
106 |
107 | surf_grid = oegrid.OEScalarGrid(grid_dim, grid_dim, grid_dim, 0.0, 0.0, 0.0, grid_spacing)
108 | oespicoli.OEMakeGridFromSurface(surf_grid, surf)
109 |
110 | phobic_grid = get_eisenberg_grid(params, mol, 'PHOBIC')
111 | philic_grid = get_eisenberg_grid(params, mol, 'PHILIC')
112 |
113 | grid_size = grid.GetSize()
114 | arr = np.zeros(grid_size)
115 | phobic_arr = np.zeros(grid_size)
116 | philic_arr = np.zeros(grid_size)
117 | idx = 0
118 | for i in range(0, grid_dim):
119 | for j in range(0, grid_dim):
120 | for k in range(0, grid_dim):
121 | v = surf_grid.GetValue(i, j, k)
122 | if 0 <= v < shell_width:
123 | val = grid.GetValue(i, j, k)
124 | arr[idx] = val
125 | val = phobic_grid.GetValue(i, j, k)
126 | phobic_arr[idx] = val
127 | val = philic_grid.GetValue(i, j, k)
128 | philic_arr[idx] = val
129 |
130 | idx += 1
131 |
132 | arr3d_esp = np.reshape(arr, (grid_dim, grid_dim, grid_dim, 1))
133 | arr3d_phobic = np.reshape(phobic_arr, (grid_dim, grid_dim, grid_dim, 1))
134 | arr3d_philic = np.reshape(philic_arr, (grid_dim, grid_dim, grid_dim, 1))
135 |
136 | return arr3d_esp, arr3d_phobic, arr3d_philic, mol
137 |
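The voxel loop in `gen_eisenberg_array` walks the grid in row-major order and keeps a value only where the surface-distance grid falls inside the `[0, shell_width)` shell, zeroing everything else before the final reshape. With the grids held as NumPy arrays, the same masking can be written vectorized; `shell_mask` below is a hypothetical helper sketching that equivalence, not part of this module:

```python
import numpy as np

def shell_mask(value_grid, surf_grid, shell_width):
    """Zero out voxels outside the [0, shell_width) surface shell and
    append a trailing channel axis, mirroring the loop's reshape."""
    keep = (surf_grid >= 0.0) & (surf_grid < shell_width)
    return np.where(keep, value_grid, 0.0).reshape(*value_grid.shape, 1)

# toy 2x2x2 grids: surface distances and a constant potential of 1.0
surf = np.array([[[0.5, 3.0], [1.9, -1.0]],
                 [[0.0, 2.0], [2.1, 1.0]]])
masked = shell_mask(np.ones((2, 2, 2)), surf, shell_width=2.0)
```

The half-open interval matters: a voxel at exactly `shell_width` from the surface is excluded, matching the `0 <= v < shell_width` test in the loop.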
138 |
139 | def prepare_cnn_input(df, args, train=True):
140 | hm_model_dir = args[HOMOLOGY_MODEL_DIR_KEY]
141 | if hm_model_dir is None:
142 |         raise ValueError('Homology model directory not specified')
143 |
144 | X = []
145 | y = []
146 | for row_idx, row in df.iterrows():
147 | entity = row[ENTITY_KEY]
148 |
149 | mol_file = os.path.join(hm_model_dir, entity + '.mol2')
150 | if len(args[EISENBERG_DIR_KEY]) > 0:
151 | esp_grids = get_eisenberg_grids(args, mol_file)
152 | else:
153 | esp_grids = generate_eisenberg_grids(args, mol_file)
154 |
155 | if args['num_channels'] == 3:
156 | combined_grid = [np.concatenate([esp_arr, phobic_arr, philic_arr], axis=0)
157 | for esp_arr, phobic_arr, philic_arr, _ in esp_grids]
158 | else:
159 | combined_grid = [np.concatenate([phobic_arr, philic_arr], axis=0)
160 | for _, phobic_arr, philic_arr, _ in esp_grids]
161 |
162 | X.extend(combined_grid)
163 | if train:
164 | log_visc = np.log10(row[VISCOSITY_KEY])
165 | y.extend([log_visc] * args[NX_KEY])
166 | else:
167 | y.extend([0.0] * args[NX_KEY])
168 |
169 | return np.array(X), np.array(y)
170 |
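`prepare_cnn_input` concatenates the per-rotation arrays along axis 0 because `generate_eisenberg_grids` moves the trailing channel axis to the front with `np.moveaxis(arr, 3, 0)`, giving channel-first tensors as a 3D CNN expects. A small shape check of that layout, using a 4-voxel grid as a stand-in for the default `grid_dim` of 96:

```python
import numpy as np

grid_dim = 4  # stand-in for the default grid_dim of 96
esp = np.zeros((grid_dim, grid_dim, grid_dim, 1))    # (D, D, D, channel)
chan_first = np.moveaxis(esp, 3, 0)                  # (1, D, D, D)
combined = np.concatenate([chan_first] * 3, axis=0)  # (3, D, D, D)
```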
171 |
172 | def get_eisenberg_grids(args, mol_file):
173 | eisenberg_dir = args[EISENBERG_DIR_KEY]
174 | eisenberg_array_output = []
175 | for i in range(args[NX_KEY]):
176 |         pyb_name = os.path.basename(mol_file).split('.mol2')[0] + '.pyb'
177 |         with open(os.path.join(eisenberg_dir, 'rotation_%d' % (i + 1), pyb_name), 'rb') as fptr:
178 |
179 |             mol = get_molecule(os.path.join(eisenberg_dir, 'rotation_%d' % (i + 1), os.path.basename(mol_file)))
180 | esp_arr, phobic_arr, philic_arr = pickle.load(fptr)
181 | eisenberg_array_output.append((esp_arr, phobic_arr, philic_arr, mol))
182 |
183 | return eisenberg_array_output
184 |
185 |
186 | def generate_eisenberg_grids(args, mol_file):
187 | mol = get_molecule(mol_file)
188 |
189 | params = []
190 | for i in range(args[NX_KEY]):
191 | rot_x = np.random.uniform(0, 180)
192 | rot_y = np.random.uniform(0, 180)
193 | rot_z = np.random.uniform(0, 180)
194 |
195 | params.append({INPUT_MOL_KEY: oechem.OEGraphMol(mol), ROT_X_KEY: rot_x,
196 | ROT_Y_KEY: rot_y, ROT_Z_KEY: rot_z,
197 | GRID_DIM_KEY: args[GRID_DIM_KEY],
198 | GRID_SPACING_KEY: args[GRID_SPACING_KEY],
199 | SHELL_WIDTH_KEY: args[SHELL_WIDTH_KEY]})
200 |     # use the requested worker count, capped at the number of
201 |     # available cores so the pool is never oversubscribed
202 |     processors = min(args[PROCESSORS_KEY], multiprocessing.cpu_count())
203 |
204 |     p = multiprocessing.Pool(processes=processors)
205 | eisenberg_array_output = p.map(gen_eisenberg_array, params)
206 | p.close()
207 |
208 | output = [(np.moveaxis(esp_array, 3, 0), np.moveaxis(phobic_array, 3, 0), np.moveaxis(philic_array, 3, 0), output_mol)
209 | for esp_array, phobic_array, philic_array, output_mol in eisenberg_array_output]
210 | return output
211 |
212 |
213 | def prepare_training_input(df, args):
214 | return prepare_cnn_input(df, args, train=True)
215 |
216 |
217 | def prepare_test_input(df, args):
218 | return prepare_cnn_input(df, args, train=False)
219 |
220 |
221 | def seed_everything(seed):
222 | random.seed(seed)
223 | np.random.seed(seed)
224 | torch.manual_seed(seed)
225 | torch.cuda.manual_seed_all(seed)
226 |
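`seed_everything` seeds Python's `random`, NumPy, and PyTorch (CPU and all GPUs) so training runs are repeatable. A NumPy-only sketch of the reproducibility this provides (`torch` omitted so the snippet stands alone):

```python
import random

import numpy as np

def seed_basic(seed):
    # subset of seed_everything: Python and NumPy RNGs only
    random.seed(seed)
    np.random.seed(seed)

seed_basic(42)
first = np.random.rand(3)
seed_basic(42)
second = np.random.rand(3)  # identical draw after re-seeding
```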
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # packages in environment at /home/X/.conda/envs/X-env:
2 | #
3 | # Name Version Build Channel
4 | _libgcc_mutex 0.1 main
5 | anyio 3.3.4 py39hf3d152e_0 conda-forge
6 | argon2-cffi 20.1.0 py39h27cfd23_1
7 | async_generator 1.10 py_0 conda-forge
8 | attrs 21.2.0 pyhd8ed1ab_0 conda-forge
9 | babel 2.9.1 pyh44b312d_0 conda-forge
10 | backcall 0.2.0 pyh9f0ad1d_0 conda-forge
11 | backports 1.0 py_2 conda-forge
12 | backports.functools_lru_cache 1.6.4 pyhd8ed1ab_0 conda-forge
13 | bcbio-gff 0.6.7 pypi_0 pypi
14 | biopython 1.79 pypi_0 pypi
15 | blas 1.0 mkl
16 | bleach 4.1.0 pyhd8ed1ab_0 conda-forge
17 | bottleneck 1.3.2 py39hdd57654_1
18 | brotli 1.0.9 he6710b0_2
19 | brotlipy 0.7.0 py39h27cfd23_1003
20 | bzip2 1.0.8 h7b6447c_0
21 | ca-certificates 2021.10.26 h06a4308_2
22 | captum 0.4.1 pypi_0 pypi
23 | certifi 2021.10.8 py39h06a4308_0
24 | cffi 1.14.6 py39h400218f_0
25 | charset-normalizer 2.0.4 pyhd3eb1b0_0
26 | click 8.1.3 pypi_0 pypi
27 | cloudpickle 2.0.0 pypi_0 pypi
28 | conda 4.10.3 py39h06a4308_0
29 | conda-package-handling 1.7.3 py39h27cfd23_1
30 | cryptography 3.4.8 py39hd23ed53_0
31 | cudatoolkit 10.1.243 h6bb024c_0
32 | cx-oracle 8.3.0 pypi_0 pypi
33 | cycler 0.10.0 py39h06a4308_0
34 | dask 2021.11.1 pypi_0 pypi
35 | dbus 1.13.18 hb2f20db_0
36 | decorator 5.1.0 pyhd8ed1ab_0 conda-forge
37 | defusedxml 0.7.1 pyhd8ed1ab_0 conda-forge
38 | entrypoints 0.3 pyhd8ed1ab_1003 conda-forge
39 | et-xmlfile 1.1.0 pypi_0 pypi
40 | expat 2.4.1 h2531618_2
41 | ffmpeg 4.3 hf484d3e_0 pytorch
42 | flask 2.2.1 pypi_0 pypi
43 | fontconfig 2.13.1 h6c09931_0
44 | fonttools 4.25.0 pyhd3eb1b0_0
45 | freetype 2.10.4 h5ab3b9f_0
46 | fsspec 2021.11.0 pypi_0 pypi
47 | giflib 5.2.1 h7b6447c_0
48 | glib 2.69.1 h5202010_0
49 | gmp 6.2.1 h2531618_2
50 | gnutls 3.6.15 he1e5248_0
51 | gpytorch 1.6.0 pypi_0 pypi
52 | gst-plugins-base 1.14.0 h8213a91_2
53 | gstreamer 1.14.0 h28cd5cc_2
54 | icu 58.2 he6710b0_3
55 | idna 3.2 pyhd3eb1b0_0
56 | importlib-metadata 4.8.1 py39hf3d152e_0 conda-forge
57 | iniconfig 1.1.1 pyhd3eb1b0_0
58 | intel-openmp 2021.3.0 h06a4308_3350
59 | ipykernel 5.5.5 py39hef51801_0 conda-forge
60 | ipython 7.28.0 py39hef51801_0 conda-forge
61 | ipython_genutils 0.2.0 py_1 conda-forge
62 | ipywidgets 7.6.5 pypi_0 pypi
63 | itsdangerous 2.1.2 pypi_0 pypi
64 | jedi 0.18.0 py39hf3d152e_2 conda-forge
65 | jinja2 3.1.2 pypi_0 pypi
66 | joblib 1.0.1 pyhd3eb1b0_0
67 | jpeg 9d h7f8727e_0
68 | json5 0.9.5 pyh9f0ad1d_0 conda-forge
69 | jsonschema 4.1.2 pyhd8ed1ab_0 conda-forge
70 | jupyter_client 7.0.6 pyhd8ed1ab_0 conda-forge
71 | jupyter_core 4.8.1 py39hf3d152e_0 conda-forge
72 | jupyter_server 1.11.1 pyhd8ed1ab_0 conda-forge
73 | jupyterlab 3.2.1 pyhd8ed1ab_0 conda-forge
74 | jupyterlab-widgets 1.0.2 pypi_0 pypi
75 | jupyterlab_pygments 0.1.2 pyh9f0ad1d_0 conda-forge
76 | jupyterlab_server 2.8.2 pyhd8ed1ab_0 conda-forge
77 | kiwisolver 1.3.1 py39h2531618_0
78 | lame 3.100 h7b6447c_0
79 | lcms2 2.12 h3be6417_0
80 | ld_impl_linux-64 2.35.1 h7274673_9
81 | libffi 3.3 he6710b0_2
82 | libgcc-ng 9.1.0 hdf63c60_0
83 | libgfortran-ng 7.3.0 hdf63c60_0
84 | libiconv 1.15 h63c8f33_5
85 | libidn2 2.3.2 h7f8727e_0
86 | libpng 1.6.37 hbc83047_0
87 | libsodium 1.0.18 h36c2ea0_1 conda-forge
88 | libstdcxx-ng 9.1.0 hdf63c60_0
89 | libtasn1 4.16.0 h27cfd23_0
90 | libtiff 4.2.0 h85742a9_0
91 | libunistring 0.9.10 h27cfd23_0
92 | libuuid 1.0.3 h7f8727e_2
93 | libuv 1.40.0 h7b6447c_0
94 | libwebp 1.2.0 h89dd481_0
95 | libwebp-base 1.2.0 h27cfd23_0
96 | libxcb 1.14 h7b6447c_0
97 | libxml2 2.9.10 hb55368b_3
98 | locket 0.2.1 pypi_0 pypi
99 | lxml 4.6.4 pypi_0 pypi
100 | lz4-c 1.9.3 h295c915_1
101 | markupsafe 2.1.1 pypi_0 pypi
102 | matplotlib 3.4.3 py39h06a4308_0
103 | matplotlib-base 3.4.3 py39hbbc1b5f_0
104 | matplotlib-inline 0.1.3 pyhd8ed1ab_0 conda-forge
105 | mistune 0.8.4 py39hbd71b63_1002 conda-forge
106 | mkl 2021.3.0 h06a4308_520
107 | mkl-service 2.4.0 py39h7f8727e_0
108 | mkl_fft 1.3.1 py39hd3c417c_0
109 | mkl_random 1.2.2 py39h51133e4_0
110 | more-itertools 8.8.0 pyhd3eb1b0_0
111 | munkres 1.1.4 py_0
112 | nbclassic 0.3.2 pyhd8ed1ab_0 conda-forge
113 | nbclient 0.5.4 pyhd8ed1ab_0 conda-forge
114 | nbconvert 6.2.0 py39hf3d152e_0 conda-forge
115 | nbformat 5.1.3 pyhd8ed1ab_0 conda-forge
116 | ncurses 6.2 he6710b0_1
117 | nest-asyncio 1.5.1 pyhd8ed1ab_0 conda-forge
118 | nettle 3.7.3 hbbd107a_1
119 | ninja 1.10.2 hff7bd54_1
120 | notebook 6.4.5 pyha770c72_0 conda-forge
121 | numexpr 2.7.3 py39h22e1b3c_1
122 | numpy 1.21.2 py39h20f2e39_0
123 | numpy-base 1.21.2 py39h79a1101_0
124 | olefile 0.46 pyhd3eb1b0_0
125 | openeye-toolkits 2021.1.1 py39_0 openeye
126 | openh264 2.1.0 hd408876_0
127 | openpyxl 3.0.9 pypi_0 pypi
128 | openssl 1.1.1l h7f8727e_0
129 | packaging 21.0 pyhd8ed1ab_0 conda-forge
130 | pandas 1.3.3 py39h8c16a72_0
131 | pandoc 2.14.2 h7f98852_0 conda-forge
132 | pandocfilters 1.5.0 pyhd8ed1ab_0 conda-forge
133 | parso 0.8.2 pyhd8ed1ab_0 conda-forge
134 | partd 1.2.0 pypi_0 pypi
135 | pcre 8.45 h295c915_0
136 | pexpect 4.8.0 pyh9f0ad1d_2 conda-forge
137 | pickleshare 0.7.5 py_1003 conda-forge
138 | pillow 8.4.0 py39h5aabda8_0
139 | pip 21.2.4 py39h06a4308_0
140 | pluggy 0.13.1 py39h06a4308_0
141 | prometheus_client 0.11.0 pyhd8ed1ab_0 conda-forge
142 | prompt-toolkit 3.0.21 pyha770c72_0 conda-forge
143 | psutil 5.8.0 pypi_0 pypi
144 | ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge
145 | py 1.10.0 pyhd3eb1b0_0
146 | pycosat 0.6.3 py39h27cfd23_0
147 | pycparser 2.20 py_2
148 | pygments 2.10.0 pyhd8ed1ab_0 conda-forge
149 | pyopenssl 20.0.1 pyhd3eb1b0_1
150 | pyparsing 2.4.7 pyhd3eb1b0_0
151 | pyqt 5.9.2 py39h2531618_6
152 | pyrsistent 0.17.3 py39hbd71b63_1 conda-forge
153 | pysocks 1.7.1 py39h06a4308_0
154 | pytest 6.2.4 py39h06a4308_2
155 | python 3.9.7 h12debd9_1
156 | python-dateutil 2.8.2 pyhd3eb1b0_0
157 | python-docx 0.8.11 pypi_0 pypi
158 | python_abi 3.9 2_cp39 conda-forge
159 | pytorch-mutex 1.0 cuda pytorch
160 | pytz 2021.3 pyhd3eb1b0_0
161 | pyyaml 6.0 pypi_0 pypi
162 | pyzmq 19.0.2 py39hb69f2a1_2 conda-forge
163 | qt 5.9.7 h5867ecd_1
164 | readline 8.1 h27cfd23_0
165 | requests 2.26.0 pyhd3eb1b0_0
166 | requests-unixsocket 0.2.0 py_0 conda-forge
167 | ruamel_yaml 0.15.100 py39h27cfd23_0
168 | scikit-learn 0.24.2 py39ha9443f7_0
169 | scipy 1.6.2 py39had2a1c9_1
170 | selfies 2.0.0 pypi_0 pypi
171 | send2trash 1.8.0 pyhd8ed1ab_0 conda-forge
172 | setuptools 58.0.4 py39h06a4308_0
173 | sip 4.19.13 py39h2531618_0
174 | six 1.16.0 pyhd3eb1b0_0
175 | sniffio 1.2.0 py39hf3d152e_1 conda-forge
176 | sqlite 3.36.0 hc218d9a_0
177 | swifter 1.0.9 pypi_0 pypi
178 | terminado 0.12.1 py39hf3d152e_0 conda-forge
179 | testpath 0.5.0 pyhd8ed1ab_0 conda-forge
180 | threadpoolctl 2.2.0 pyh0d69192_0
181 | tk 8.6.11 h1ccaba5_0
182 | toml 0.10.2 pyhd3eb1b0_0
183 | toolz 0.11.2 pypi_0 pypi
184 | torch 1.10.0 pypi_0 pypi
185 | torchvision 0.2.2 py_3 pytorch
186 | tornado 6.1 py39h27cfd23_0
187 | tqdm 4.62.2 pyhd3eb1b0_1
188 | traitlets 5.1.0 pyhd8ed1ab_0 conda-forge
189 | typing_extensions 3.10.0.2 pyh06a4308_0
190 | tzdata 2021a h5d7bf9c_0
191 | urllib3 1.26.7 pyhd3eb1b0_0
192 | viennarna 2.3.3 hfc679d8_2 bioconda
193 | wcwidth 0.2.5 pyh9f0ad1d_2 conda-forge
194 | webencodings 0.5.1 py_1 conda-forge
195 | websocket-client 0.57.0 py39hf3d152e_4 conda-forge
196 | werkzeug 2.2.1 pypi_0 pypi
197 | wheel 0.37.0 pyhd3eb1b0_1
198 | widgetsnbextension 3.5.2 pypi_0 pypi
199 | xz 5.2.5 h7b6447c_0
200 | yaml 0.2.5 h7b6447c_0
201 | zeromq 4.3.4 h2531618_0
202 | zipp 3.6.0 pyhd8ed1ab_0 conda-forge
203 | zlib 1.2.11 h7b6447c_3
204 | zstd 1.4.9 haebb681_0
205 |
--------------------------------------------------------------------------------