├── .ipynb_checkpoints
└── DeepSP_predictor-checkpoint.ipynb
├── Conv1D_regressionSAPpos.json
├── Conv1D_regressionSCMneg.json
├── Conv1D_regressionSCMpos.json
├── Conv1D_regression_SAPpos.h5
├── Conv1D_regression_SCMneg.h5
├── Conv1D_regression_SCMpos.h5
├── DeepSP-app.py
├── DeepSP_input.csv
├── DeepSP_model_train.py
├── DeepSP_predictor.ipynb
├── LICENSE
├── README.md
├── data
├── Deep_SAPpos_data.txt
├── Deep_SCMneg_data.txt
├── Deep_SCMpos_data.txt
├── ERR4082227_1482_rank1_imgt_scheme.pdb
└── ERR4082243_2914_rank1_imgt_scheme.pdb
├── deepsp_predictor.py
└── environment.yml
/.ipynb_checkpoints/DeepSP_predictor-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "CqYhZfJnSvsH"
7 | },
8 | "source": [
9 | "Install and Import Necessary Libraries"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "colab": {
17 | "base_uri": "https://localhost:8080/"
18 | },
19 | "id": "icCMgpaXYyFN",
20 | "outputId": "4fe01a84-464c-4248-8e1a-c1b671051b4c"
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
28 | "\u001b[0m✨🍰✨ Everything looks OK!\n",
29 | "Channels:\n",
30 | " - bioconda\n",
31 | " - conda-forge\n",
32 | "Platform: linux-64\n",
33 | "Collecting package metadata (repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n",
34 | "Solving environment: - \b\b\\ \b\b| \b\bdone\n",
35 | "\n",
36 | "\n",
37 | "==> WARNING: A newer version of conda exists. <==\n",
38 | " current version: 23.11.0\n",
39 | " latest version: 24.3.0\n",
40 | "\n",
41 | "Please update conda by running\n",
42 | "\n",
43 | " $ conda update -n base -c conda-forge conda\n",
44 | "\n",
45 | "\n",
46 | "\n",
47 | "## Package Plan ##\n",
48 | "\n",
49 | " environment location: /usr/local\n",
50 | "\n",
51 | " added / updated specs:\n",
52 | " - anarci\n",
53 | "\n",
54 | "\n",
55 | "The following packages will be downloaded:\n",
56 | "\n",
57 | " package | build\n",
58 | " ---------------------------|-----------------\n",
59 | " anarci-2021.02.04 | pyhdfd78af_0 1.1 MB bioconda\n",
60 | " biopython-1.83 | py310h2372a71_0 2.6 MB conda-forge\n",
61 | " ca-certificates-2024.2.2 | hbcca054_0 152 KB conda-forge\n",
62 | " certifi-2024.2.2 | pyhd8ed1ab_0 157 KB conda-forge\n",
63 | " hmmer-3.4 | hdbdd923_1 11.1 MB bioconda\n",
64 | " libblas-3.9.0 |22_linux64_openblas 14 KB conda-forge\n",
65 | " libcblas-3.9.0 |22_linux64_openblas 14 KB conda-forge\n",
66 | " libgfortran-ng-13.2.0 | h69a702a_6 24 KB conda-forge\n",
67 | " libgfortran5-13.2.0 | h43f5ff8_6 1.4 MB conda-forge\n",
68 | " liblapack-3.9.0 |22_linux64_openblas 14 KB conda-forge\n",
69 | " libopenblas-0.3.27 |pthreads_h413a1c8_0 5.3 MB conda-forge\n",
70 | " numpy-1.26.4 | py310hb13e2d6_0 6.7 MB conda-forge\n",
71 | " openssl-3.3.0 | hd590300_0 2.8 MB conda-forge\n",
72 | " ------------------------------------------------------------\n",
73 | " Total: 31.3 MB\n",
74 | "\n",
75 | "The following NEW packages will be INSTALLED:\n",
76 | "\n",
77 | " anarci bioconda/noarch::anarci-2021.02.04-pyhdfd78af_0 \n",
78 | " biopython conda-forge/linux-64::biopython-1.83-py310h2372a71_0 \n",
79 | " hmmer bioconda/linux-64::hmmer-3.4-hdbdd923_1 \n",
80 | " libblas conda-forge/linux-64::libblas-3.9.0-22_linux64_openblas \n",
81 | " libcblas conda-forge/linux-64::libcblas-3.9.0-22_linux64_openblas \n",
82 | " libgfortran-ng conda-forge/linux-64::libgfortran-ng-13.2.0-h69a702a_6 \n",
83 | " libgfortran5 conda-forge/linux-64::libgfortran5-13.2.0-h43f5ff8_6 \n",
84 | " liblapack conda-forge/linux-64::liblapack-3.9.0-22_linux64_openblas \n",
85 | " libopenblas conda-forge/linux-64::libopenblas-0.3.27-pthreads_h413a1c8_0 \n",
86 | " numpy conda-forge/linux-64::numpy-1.26.4-py310hb13e2d6_0 \n",
87 | "\n",
88 | "The following packages will be UPDATED:\n",
89 | "\n",
90 | " ca-certificates 2023.11.17-hbcca054_0 --> 2024.2.2-hbcca054_0 \n",
91 | " certifi 2023.11.17-pyhd8ed1ab_0 --> 2024.2.2-pyhd8ed1ab_0 \n",
92 | " openssl 3.2.0-hd590300_1 --> 3.3.0-hd590300_0 \n",
93 | "\n",
94 | "\n",
95 | "\n",
96 | "Downloading and Extracting Packages:\n",
97 | "hmmer-3.4 | 11.1 MB | : 0% 0/1 [00:00, ?it/s]\n",
98 | "numpy-1.26.4 | 6.7 MB | : 0% 0/1 [00:00, ?it/s]\u001b[A\n",
99 | "\n",
100 | "libopenblas-0.3.27 | 5.3 MB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\n",
101 | "\n",
102 | "\n",
103 | "openssl-3.3.0 | 2.8 MB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\n",
104 | "\n",
105 | "\n",
106 | "\n",
107 | "biopython-1.83 | 2.6 MB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\n",
108 | "\n",
109 | "\n",
110 | "\n",
111 | "\n",
112 | "libgfortran5-13.2.0 | 1.4 MB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
113 | "\n",
114 | "\n",
115 | "\n",
116 | "\n",
117 | "\n",
118 | "anarci-2021.02.04 | 1.1 MB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
119 | "\n",
120 | "\n",
121 | "\n",
122 | "\n",
123 | "\n",
124 | "\n",
125 | "certifi-2024.2.2 | 157 KB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
126 | "\n",
127 | "\n",
128 | "\n",
129 | "\n",
130 | "\n",
131 | "\n",
132 | "\n",
133 | "ca-certificates-2024 | 152 KB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
134 | "\n",
135 | "\n",
136 | "\n",
137 | "\n",
138 | "\n",
139 | "\n",
140 | "\n",
141 | "\n",
142 | "libgfortran-ng-13.2. | 24 KB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
143 | "\n",
144 | "\n",
145 | "\n",
146 | "\n",
147 | "\n",
148 | "\n",
149 | "\n",
150 | "\n",
151 | "\n",
152 | "libblas-3.9.0 | 14 KB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
153 | "\n",
154 | "\n",
155 | "\n",
156 | "\n",
157 | "\n",
158 | "\n",
159 | "\n",
160 | "\n",
161 | "\n",
162 | "\n",
163 | "liblapack-3.9.0 | 14 KB | : 0% 0/1 [00:00, ?it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
164 | "\n",
165 | "\n",
166 | "\n",
167 | "\n",
168 | "\n",
169 | "\n",
170 | "\n",
171 | "\n",
172 | "\n",
173 | "\n",
174 | "\n",
175 | "hmmer-3.4 | 11.1 MB | : 0% 0.00140836123365361/1 [00:00<01:20, 80.97s/it]\n",
176 | "numpy-1.26.4 | 6.7 MB | : 0% 0.002337542641177788/1 [00:00<00:59, 60.03s/it]\u001b[A\n",
177 | "\n",
178 | "libopenblas-0.3.27 | 5.3 MB | : 0% 0.0029263690607916376/1 [00:00<00:52, 52.42s/it]\u001b[A\u001b[A\n",
179 | "\n",
180 | "\n",
181 | "\n",
182 | "biopython-1.83 | 2.6 MB | : 1% 0.00597188359638319/1 [00:00<00:26, 26.37s/it]\u001b[A\u001b[A\u001b[A\u001b[A\n",
183 | "\n",
184 | "\n",
185 | "hmmer-3.4 | 11.1 MB | : 50% 0.5013765991806851/1 [00:00<00:00, 2.78it/s] \n",
186 | "numpy-1.26.4 | 6.7 MB | : 40% 0.40439487692375736/1 [00:00<00:00, 2.02it/s] \u001b[A\n",
187 | "\n",
188 | "libopenblas-0.3.27 | 5.3 MB | : 46% 0.45943994254428716/1 [00:00<00:00, 2.21it/s] \u001b[A\u001b[A\n",
189 | "\n",
190 | "\n",
191 | "\n",
192 | "biopython-1.83 | 2.6 MB | : 75% 0.752457333144282/1 [00:00<00:00, 3.57it/s] \u001b[A\u001b[A\u001b[A\u001b[A\n",
193 | "\n",
194 | "\n",
195 | "hmmer-3.4 | 11.1 MB | : 80% 0.7971324582479433/1 [00:00<00:00, 2.71it/s]\n",
196 | "numpy-1.26.4 | 6.7 MB | : 86% 0.8625532345946039/1 [00:00<00:00, 3.03it/s] \u001b[A\n",
197 | "\n",
198 | "\n",
199 | "\n",
200 | "\n",
201 | "libgfortran5-13.2.0 | 1.4 MB | : 1% 0.011357082649394437/1 [00:00<00:32, 33.16s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
202 | "\n",
203 | "\n",
204 | "\n",
205 | "\n",
206 | "\n",
207 | "anarci-2021.02.04 | 1.1 MB | : 1% 0.014093787688236882/1 [00:00<00:27, 28.15s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
208 | "\n",
209 | "\n",
210 | "\n",
211 | "\n",
212 | "\n",
213 | "\n",
214 | "certifi-2024.2.2 | 157 KB | : 10% 0.10204348557228184/1 [00:00<00:04, 5.03s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
215 | "\n",
216 | "\n",
217 | "\n",
218 | "\n",
219 | "\n",
220 | "\n",
221 | "\n",
222 | "ca-certificates-2024 | 152 KB | : 11% 0.10540943949765814/1 [00:00<00:04, 4.96s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
223 | "\n",
224 | "\n",
225 | "\n",
226 | "\n",
227 | "\n",
228 | "\n",
229 | "\n",
230 | "\n",
231 | "libgfortran-ng-13.2. | 24 KB | : 68% 0.6775007236488443/1 [00:00<00:00, 1.28it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
232 | "\n",
233 | "\n",
234 | "\n",
235 | "\n",
236 | "\n",
237 | "\n",
238 | "\n",
239 | "\n",
240 | "\n",
241 | "libblas-3.9.0 | 14 KB | : 100% 1.0/1 [00:00<00:00, 1.78it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
242 | "\n",
243 | "\n",
244 | "\n",
245 | "\n",
246 | "\n",
247 | "\n",
248 | "\n",
249 | "\n",
250 | "\n",
251 | "\n",
252 | "liblapack-3.9.0 | 14 KB | : 100% 1.0/1 [00:00<00:00, 1.76it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
253 | "\n",
254 | "\n",
255 | "\n",
256 | "\n",
257 | "\n",
258 | "\n",
259 | "\n",
260 | "\n",
261 | "\n",
262 | "\n",
263 | "\n",
264 | "libcblas-3.9.0 | 14 KB | : 100% 1.0/1 [00:00<00:00, 1.75it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
265 | "\n",
266 | "\n",
267 | "openssl-3.3.0 | 2.8 MB | : 100% 1.0/1 [00:00<00:00, 3.88it/s] \u001b[A\u001b[A\u001b[A\n",
268 | "\n",
269 | "libopenblas-0.3.27 | 5.3 MB | : 100% 1.0/1 [00:01<00:00, 1.54s/it] \u001b[A\u001b[A\n",
270 | "\n",
271 | "libopenblas-0.3.27 | 5.3 MB | : 100% 1.0/1 [00:01<00:00, 1.54s/it]\u001b[A\u001b[A\n",
272 | "\n",
273 | "\n",
274 | "\n",
275 | "biopython-1.83 | 2.6 MB | : 100% 1.0/1 [00:01<00:00, 3.57it/s] \u001b[A\u001b[A\u001b[A\u001b[A\n",
276 | "\n",
277 | "\n",
278 | "\n",
279 | "\n",
280 | "libgfortran5-13.2.0 | 1.4 MB | : 100% 1.0/1 [00:01<00:00, 1.44s/it] \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
281 | "\n",
282 | "\n",
283 | "\n",
284 | "\n",
285 | "libgfortran5-13.2.0 | 1.4 MB | : 100% 1.0/1 [00:01<00:00, 1.44s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
286 | "\n",
287 | "\n",
288 | "\n",
289 | "\n",
290 | "\n",
291 | "\n",
292 | "certifi-2024.2.2 | 157 KB | : 100% 1.0/1 [00:01<00:00, 1.48s/it] \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
293 | "\n",
294 | "\n",
295 | "\n",
296 | "\n",
297 | "\n",
298 | "\n",
299 | "certifi-2024.2.2 | 157 KB | : 100% 1.0/1 [00:01<00:00, 1.48s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
300 | "\n",
301 | "\n",
302 | "\n",
303 | "\n",
304 | "\n",
305 | "\n",
306 | "\n",
307 | "ca-certificates-2024 | 152 KB | : 100% 1.0/1 [00:01<00:00, 1.52s/it] \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
308 | "\n",
309 | "\n",
310 | "\n",
311 | "\n",
312 | "\n",
313 | "\n",
314 | "\n",
315 | "ca-certificates-2024 | 152 KB | : 100% 1.0/1 [00:01<00:00, 1.52s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
316 | "\n",
317 | "\n",
318 | "\n",
319 | "\n",
320 | "\n",
321 | "\n",
322 | "\n",
323 | "\n",
324 | "libgfortran-ng-13.2. | 24 KB | : 100% 1.0/1 [00:01<00:00, 1.94s/it] \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
325 | "\n",
326 | "\n",
327 | "\n",
328 | "\n",
329 | "\n",
330 | "\n",
331 | "\n",
332 | "\n",
333 | "libgfortran-ng-13.2. | 24 KB | : 100% 1.0/1 [00:01<00:00, 1.94s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
334 | "\n",
335 | "\n",
336 | "\n",
337 | "\n",
338 | "\n",
339 | "\n",
340 | "\n",
341 | "\n",
342 | "\n",
343 | "libblas-3.9.0 | 14 KB | : 100% 1.0/1 [00:01<00:00, 1.78it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
344 | "\n",
345 | "\n",
346 | "\n",
347 | "\n",
348 | "\n",
349 | "anarci-2021.02.04 | 1.1 MB | : 100% 1.0/1 [00:02<00:00, 2.44s/it] \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
350 | "\n",
351 | "\n",
352 | "\n",
353 | "\n",
354 | "\n",
355 | "anarci-2021.02.04 | 1.1 MB | : 100% 1.0/1 [00:02<00:00, 2.44s/it]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
356 | "\n",
357 | "\n",
358 | "\n",
359 | "\n",
360 | "\n",
361 | "\n",
362 | "\n",
363 | "\n",
364 | "\n",
365 | "\n",
366 | "liblapack-3.9.0 | 14 KB | : 100% 1.0/1 [00:02<00:00, 1.76it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
367 | "\n",
368 | "\n",
369 | "\n",
370 | "\n",
371 | "\n",
372 | "\n",
373 | "\n",
374 | "\n",
375 | "\n",
376 | "\n",
377 | "\n",
378 | "libcblas-3.9.0 | 14 KB | : 100% 1.0/1 [00:02<00:00, 1.75it/s]\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
379 | " \n",
380 | " \u001b[A\n",
381 | "\n",
382 | " \u001b[A\u001b[A\n",
383 | "\n",
384 | "\n",
385 | " \u001b[A\u001b[A\u001b[A\n",
386 | "\n",
387 | "\n",
388 | "\n",
389 | " \u001b[A\u001b[A\u001b[A\u001b[A\n",
390 | "\n",
391 | "\n",
392 | "\n",
393 | "\n",
394 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
395 | "\n",
396 | "\n",
397 | "\n",
398 | "\n",
399 | "\n",
400 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
401 | "\n",
402 | "\n",
403 | "\n",
404 | "\n",
405 | "\n",
406 | "\n",
407 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
408 | "\n",
409 | "\n",
410 | "\n",
411 | "\n",
412 | "\n",
413 | "\n",
414 | "\n",
415 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
416 | "\n",
417 | "\n",
418 | "\n",
419 | "\n",
420 | "\n",
421 | "\n",
422 | "\n",
423 | "\n",
424 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
425 | "\n",
426 | "\n",
427 | "\n",
428 | "\n",
429 | "\n",
430 | "\n",
431 | "\n",
432 | "\n",
433 | "\n",
434 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
435 | "\n",
436 | "\n",
437 | "\n",
438 | "\n",
439 | "\n",
440 | "\n",
441 | "\n",
442 | "\n",
443 | "\n",
444 | "\n",
445 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
446 | "\n",
447 | "\n",
448 | "\n",
449 | "\n",
450 | "\n",
451 | "\n",
452 | "\n",
453 | "\n",
454 | "\n",
455 | "\n",
456 | "\n",
457 | " \u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\n",
458 | "Preparing transaction: - \b\b\\ \b\b| \b\bdone\n",
459 | "Verifying transaction: - \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n",
460 | "Executing transaction: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n"
461 | ]
462 | }
463 | ],
464 | "source": [
465 | "%pip install -q condacolab\n",
466 | "import condacolab\n",
467 | "condacolab.install() # Colab only; NOTE(review): condacolab is documented to restart the kernel after a first-time install, so this cell may need a second run - confirm\n",
468 | "\n",
469 | "!conda install -y -c bioconda anarci"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 1,
475 | "metadata": {
476 | "id": "SOqM10kcxHb9"
477 | },
478 | "outputs": [],
479 | "source": [
480 | "# Import libraries\n",
481 | "import numpy as np\n",
482 | "import pandas as pd\n",
483 | "import random\n",
484 | "import matplotlib.pyplot as plt\n",
485 | "\n",
486 | "from numpy.random import seed\n",
487 | "\n",
488 | "# Import machine learning libraries\n",
489 | "import tensorflow as tf\n",
490 | "from tensorflow.keras.models import model_from_json\n",
491 | "from tensorflow.keras.utils import plot_model\n",
492 | "\n",
493 | "import keras\n",
494 | "from keras.models import model_from_json\n",
495 | "from keras.callbacks import ModelCheckpoint\n",
496 | "from keras.optimizers import Adam\n",
497 | "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
498 | "\n",
499 | "from Bio import SeqIO\n",
500 | "from Bio.Seq import Seq\n",
501 | "from Bio.SeqRecord import SeqRecord"
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "metadata": {
507 | "id": "nifzJMA-TkHp"
508 | },
509 | "source": [
510 | "Import Dataset"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 4,
516 | "metadata": {
517 | "colab": {
518 | "base_uri": "https://localhost:8080/",
519 | "height": 551
520 | },
521 | "id": "gp2-sVDPSTXh",
522 | "outputId": "977ba12a-5f3b-4228-b978-30d9785f1296"
523 | },
524 | "outputs": [
525 | {
526 | "data": {
527 | "application/vnd.google.colaboratory.intrinsic+json": {
528 | "summary": "{\n \"name\": \"dataset\",\n \"rows\": 16,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16,\n \"samples\": [\n \"mAb1\",\n \"mAb2\",\n \"mAb6\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Heavy_Chain\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16,\n \"samples\": [\n \"EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSAITWNSGHIDYADSVEGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAKVSYLSTASSLDYWGQGTLVTVSS\",\n \"EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLEWVAWISPYGGSTYYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARRHWPGGFDYWGQGTLVTVSA\",\n \"QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLEWVAFMSYDGSNKKYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDRGIAAGGNYYYYGMDVWGQGTTVTVSS\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Light_Chain\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16,\n \"samples\": [\n \"DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQRYNRAPYTFGQGTKVEIK\",\n \"DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLYHPATFGQGTKVEIK\",\n \"EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPPFTFGPGTKVDIK\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
529 | "type": "dataframe",
530 | "variable_name": "dataset"
531 | },
532 | "text/html": [
533 | "\n",
534 | "
\n",
535 | "
\n",
536 | "\n",
549 | "
\n",
550 | " \n",
551 | " \n",
552 | " | \n",
553 | " Name | \n",
554 | " Heavy_Chain | \n",
555 | " Light_Chain | \n",
556 | "
\n",
557 | " \n",
558 | " \n",
559 | " \n",
560 | " 0 | \n",
561 | " mAb1 | \n",
562 | " EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE... | \n",
563 | " DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... | \n",
564 | "
\n",
565 | " \n",
566 | " 1 | \n",
567 | " mAb2 | \n",
568 | " EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE... | \n",
569 | " DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL... | \n",
570 | "
\n",
571 | " \n",
572 | " 2 | \n",
573 | " mAb3 | \n",
574 | " QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE... | \n",
575 | " DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL... | \n",
576 | "
\n",
577 | " \n",
578 | " 3 | \n",
579 | " mAb4 | \n",
580 | " EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE... | \n",
581 | " EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL... | \n",
582 | "
\n",
583 | " \n",
584 | " 4 | \n",
585 | " mAb5 | \n",
586 | " EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... | \n",
587 | " EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPR... | \n",
588 | "
\n",
589 | " \n",
590 | " 5 | \n",
591 | " mAb6 | \n",
592 | " QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLE... | \n",
593 | " EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRL... | \n",
594 | "
\n",
595 | " \n",
596 | " 6 | \n",
597 | " mAb7 | \n",
598 | " EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE... | \n",
599 | " DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL... | \n",
600 | "
\n",
601 | " \n",
602 | " 7 | \n",
603 | " mAb8 | \n",
604 | " EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGL... | \n",
605 | " DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGK... | \n",
606 | "
\n",
607 | " \n",
608 | " 8 | \n",
609 | " mAb9 | \n",
610 | " QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKG... | \n",
611 | " DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL... | \n",
612 | "
\n",
613 | " \n",
614 | " 9 | \n",
615 | " mAb10 | \n",
616 | " EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE... | \n",
617 | " DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKL... | \n",
618 | "
\n",
619 | " \n",
620 | " 10 | \n",
621 | " mAb11 | \n",
622 | " QVQLQESGPGLVRPSQTLSLTCTVSGYSITSDHAWSWVRQPPGRGL... | \n",
623 | " DIQMTQSPSSLSASVGDRVTITCRASQDISSYLNWYQQKPGKAPKL... | \n",
624 | "
\n",
625 | " \n",
626 | " 11 | \n",
627 | " mAb12 | \n",
628 | " QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLE... | \n",
629 | " DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPG... | \n",
630 | "
\n",
631 | " \n",
632 | " 12 | \n",
633 | " mAb13 | \n",
634 | " QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI... | \n",
635 | " QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... | \n",
636 | "
\n",
637 | " \n",
638 | " 13 | \n",
639 | " mAb14 | \n",
640 | " QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE... | \n",
641 | " DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... | \n",
642 | "
\n",
643 | " \n",
644 | " 14 | \n",
645 | " mAb15 | \n",
646 | " QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE... | \n",
647 | " DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... | \n",
648 | "
\n",
649 | " \n",
650 | " 15 | \n",
651 | " mAb16 | \n",
652 | " EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... | \n",
653 | " DIQMTQSPSSLSASVGDRVTITCRASQYFSSYLAWYQQKPGKAPKL... | \n",
654 | "
\n",
655 | " \n",
656 | "
\n",
657 | "
\n",
658 | "
\n",
865 | "
\n"
866 | ],
867 | "text/plain": [
868 | " Name Heavy_Chain \\\n",
869 | "0 mAb1 EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE... \n",
870 | "1 mAb2 EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE... \n",
871 | "2 mAb3 QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE... \n",
872 | "3 mAb4 EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE... \n",
873 | "4 mAb5 EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n",
874 | "5 mAb6 QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLE... \n",
875 | "6 mAb7 EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE... \n",
876 | "7 mAb8 EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGL... \n",
877 | "8 mAb9 QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKG... \n",
878 | "9 mAb10 EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE... \n",
879 | "10 mAb11 QVQLQESGPGLVRPSQTLSLTCTVSGYSITSDHAWSWVRQPPGRGL... \n",
880 | "11 mAb12 QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLE... \n",
881 | "12 mAb13 QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI... \n",
882 | "13 mAb14 QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE... \n",
883 | "14 mAb15 QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE... \n",
884 | "15 mAb16 EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n",
885 | "\n",
886 | " Light_Chain \n",
887 | "0 DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... \n",
888 | "1 DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL... \n",
889 | "2 DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL... \n",
890 | "3 EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL... \n",
891 | "4 EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPR... \n",
892 | "5 EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRL... \n",
893 | "6 DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL... \n",
894 | "7 DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGK... \n",
895 | "8 DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL... \n",
896 | "9 DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKL... \n",
897 | "10 DIQMTQSPSSLSASVGDRVTITCRASQDISSYLNWYQQKPGKAPKL... \n",
898 | "11 DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPG... \n",
899 | "12 QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... \n",
900 | "13 DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... \n",
901 | "14 DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... \n",
902 | "15 DIQMTQSPSSLSASVGDRVTITCRASQYFSSYLAWYQQKPGKAPKL... "
903 | ]
904 | },
905 | "execution_count": 4,
906 | "metadata": {},
907 | "output_type": "execute_result"
908 | }
909 | ],
910 | "source": [
911 | "dataset = pd.read_csv('DeepSP_input.csv') # replace with your own CSV file; it needs the Name, Heavy_Chain and Light_Chain columns that the next cell reads (see DeepSP_input.csv for the format)\n",
912 | "dataset"
913 | ]
914 | },
915 | {
916 | "cell_type": "code",
917 | "execution_count": 5,
918 | "metadata": {
919 | "id": "XWbYjNz-SoTA"
920 | },
921 | "outputs": [],
922 | "source": [
923 | "name = dataset['Name'].to_list() # antibody names; used as the FASTA record ids below\n",
924 | "Heavy_seq = dataset['Heavy_Chain'].to_list() # Heavy_Chain column, one entry per row of dataset\n",
925 | "Light_seq = dataset['Light_Chain'].to_list() # Light_Chain column, same row order as name and Heavy_seq"
926 | ]
927 | },
928 | {
929 | "cell_type": "markdown",
930 | "metadata": {
931 | "id": "V-qttNLlTuT4"
932 | },
933 | "source": [
934 | "Convert to FASTA File"
935 | ]
936 | },
937 | {
938 | "cell_type": "code",
939 | "execution_count": 6,
940 | "metadata": {
941 | "id": "Pt5KeAfZy8gF"
942 | },
943 | "outputs": [],
944 | "source": [
945 | "# Write one FASTA file per chain; each record id is the antibody name and\n",
946 | "# the name/description fields are left empty.\n",
947 | "for file_out, chain_seqs in (('seq_H.fasta', Heavy_seq), ('seq_L.fasta', Light_seq)):\n",
948 | "    with open(file_out, \"w\") as output_handle:\n",
949 | "        for seq_name, seq in zip(name, chain_seqs):\n",
950 | "            record = SeqRecord(Seq(seq), id=seq_name, name=\"\", description=\"\")\n",
951 | "            SeqIO.write(record, output_handle, \"fasta\")"
972 | ]
973 | },
974 | {
975 | "cell_type": "markdown",
976 | "metadata": {
977 | "id": "QugBcnYeT1ci"
978 | },
979 | "source": [
980 | "Sequence Alignment with ANARCI"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 7,
986 | "metadata": {
987 | "id": "h7DCE-fo16qG"
988 | },
989 | "outputs": [],
990 | "source": [
991 | "!ANARCI -i seq_H.fasta -o seq_aligned -s imgt -r heavy --csv\n",
992 | "!ANARCI -i seq_L.fasta -o seq_aligned -s imgt -r light --csv"
993 | ]
994 | },
995 | {
996 | "cell_type": "code",
997 | "execution_count": 8,
998 | "metadata": {
999 | "id": "uFOunTDOUUhZ"
1000 | },
1001 | "outputs": [],
1002 | "source": [
1003 | "H_aligned = pd.read_csv('seq_aligned_H.csv') # heavy-chain alignment; presumably written by the ANARCI cell above - TODO confirm the output file naming\n",
1004 | "L_aligned = pd.read_csv('seq_aligned_KL.csv') # light-chain alignment; presumably written by the same cell. NOTE: seq_preprocessing() reads both files again itself"
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "code",
1009 | "execution_count": 9,
1010 | "metadata": {
1011 | "id": "Vn4-q554Udy_"
1012 | },
1013 | "outputs": [],
1014 | "source": [
1015 | "# https://github.com/Lailabcode/DeepSCM/blob/main/deepscm-master/seq_preprocessing.py\n",
1016 | "\n",
1017 | "def seq_preprocessing():\n",
1018 | "    \"\"\"Merge the ANARCI heavy- and light-chain alignments into one text file.\n",
1019 | "\n",
1020 | "    Reads 'seq_aligned_H.csv' and 'seq_aligned_KL.csv' from the working\n",
1021 | "    directory and writes 'seq_aligned_HL.txt' with one line per row:\n",
1022 | "    '<Id> <145 heavy-chain characters><127 light-chain characters>'.\n",
1023 | "    A position that has no matching column in the CSV is written as '-'.\n",
1024 | "\n",
1025 | "    Returns:\n",
1026 | "        None. The result is the file written to disk.\n",
1027 | "\n",
1028 | "    Raises:\n",
1029 | "        ValueError: if the two CSVs do not list the same Ids in the same\n",
1030 | "            order. Rows are paired by position, so a sequence present in\n",
1031 | "            only one file would otherwise join the heavy chain of one\n",
1032 | "            antibody to the light chain of another.\n",
1033 | "    \"\"\"\n",
1034 | "    infile_H = pd.read_csv('seq_aligned_H.csv')\n",
1035 | "    infile_L = pd.read_csv('seq_aligned_KL.csv')\n",
1036 | "\n",
1037 | "    # Column labels to keep, in output order. The labels follow the numbering\n",
1038 | "    # scheme requested from ANARCI (-s imgt): heavy-chain insertion labels\n",
1039 | "    # 111A..111H come after 111 and 112I..112A come before 112.\n",
1040 | "    H_inclusion_list = (\n",
1041 | "        [str(pos) for pos in range(1, 112)]\n",
1042 | "        + ['111' + letter for letter in 'ABCDEFGH']\n",
1043 | "        + ['112' + letter for letter in 'IHGFEDCBA']\n",
1044 | "        + [str(pos) for pos in range(112, 129)]\n",
1045 | "    )\n",
1046 | "    L_inclusion_list = [str(pos) for pos in range(1, 128)]\n",
1047 | "\n",
1048 | "    # Fail loudly instead of silently pairing rows of different antibodies.\n",
1049 | "    heavy_ids = infile_H[\"Id\"].tolist()\n",
1050 | "    light_ids = infile_L[\"Id\"].tolist()\n",
1051 | "    if heavy_ids != light_ids:\n",
1052 | "        raise ValueError(\n",
1053 | "            'seq_aligned_H.csv and seq_aligned_KL.csv do not contain the same '\n",
1054 | "            'Ids in the same order; heavy and light chains cannot be paired. '\n",
1055 | "            'Check which sequences are missing from the ANARCI output.'\n",
1056 | "        )\n",
1057 | "\n",
1058 | "    # reindex selects the wanted columns in the wanted order and creates any\n",
1059 | "    # missing one filled with '-', giving 145 heavy and 127 light columns.\n",
1060 | "    heavy = infile_H.reindex(columns=H_inclusion_list, fill_value='-')\n",
1061 | "    light = infile_L.reindex(columns=L_inclusion_list, fill_value='-')\n",
1062 | "\n",
1063 | "    # The context manager closes the file even if writing a row fails.\n",
1064 | "    with open('seq_aligned_HL.txt', \"w\") as outfile:\n",
1065 | "        rows = zip(infile_H.iloc[:, 0], heavy.to_numpy(), light.to_numpy())\n",
1066 | "        for seq_id, heavy_row, light_row in rows:\n",
1067 | "            aa_string = ''.join(heavy_row) + ''.join(light_row)\n",
1068 | "            outfile.write(seq_id + \" \" + aa_string)\n",
1069 | "            outfile.write(\"\\n\")\n",
1070 | "\n",
1071 | "    return\n",
1103 | "\n",
1104 | "seq_preprocessing()"
1105 | ]
1106 | },
1107 | {
1108 | "cell_type": "markdown",
1109 | "metadata": {
1110 | "id": "s0SoIZ19Un54"
1111 | },
1112 | "source": [
1113 | "Read Aligned Sequence"
1114 | ]
1115 | },
1116 | {
1117 | "cell_type": "code",
1118 | "execution_count": 2,
1119 | "metadata": {
1120 | "id": "8maPu9TsUnU0"
1121 | },
1122 | "outputs": [],
1123 | "source": [
1124 | "def load_input_data(filename):\n",
1125 | "    \"\"\"Read '<name> <sequence>' lines from `filename`; return (name_list, seq_list) in file order.\"\"\"\n",
1126 | "    name_list, seq_list = [], []\n",
1127 | "    with open(filename) as datafile:\n",
1128 | "        for fields in (raw_line.split() for raw_line in datafile):\n",
1129 | "            if fields:  # skip blank lines (e.g. a trailing empty line) instead of raising IndexError\n",
1130 | "                name_list.append(fields[0])\n",
1131 | "                seq_list.append(fields[1])\n",
1132 | "    return name_list, seq_list"
1133 | ]
1134 | },
1135 | {
1136 | "cell_type": "code",
1137 | "execution_count": 11,
1138 | "metadata": {
1139 | "id": "vjQU_ae6Usqq"
1140 | },
1141 | "outputs": [],
1142 | "source": [
1143 | "name_list, seq_list = load_input_data('seq_aligned_HL.txt')  # one '<name> <sequence>' record per line\n",
1144 | "X = seq_list  # X holds the sequence strings returned by load_input_data"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "markdown",
1149 | "metadata": {
1150 | "id": "KVJcM7emVAqS"
1151 | },
1152 | "source": [
1153 | "One Hot Encoding of Aligned Sequence"
1154 | ]
1155 | },
1156 | {
1157 | "cell_type": "code",
1158 | "execution_count": 12,
1159 | "metadata": {
1160 | "id": "QTUshyyHVFbI"
1161 | },
1162 | "outputs": [],
1163 | "source": [
1164 | "def one_hot_encoder(s):\n",
1165 | "    \"\"\"Return a (21, len(s)) array of 0/1: column j has a 1 in the row of residue s[j] (20 amino acids + '-').\"\"\"\n",
1166 | "    row_of = {residue: row for row, residue in enumerate('ACDEFGHIKLMNPQRSTVWY-')}\n",
1167 | "    rows = [row_of[residue] for residue in s]  # KeyError on any character outside the alphabet\n",
1168 | "    encoded = np.zeros((len(row_of), len(s)))\n",
1169 | "    encoded[rows, np.arange(len(s))] = 1\n",
1170 | "    return encoded"
1171 | ]
1172 | },
1173 | {
1174 | "cell_type": "code",
1175 | "execution_count": 13,
1176 | "metadata": {
1177 | "id": "v-JnQUxuVPr9"
1178 | },
1179 | "outputs": [],
1180 | "source": [
1181 | "# Encode from seq_list rather than X so that re-running this cell is idempotent.\n",
1182 | "X = np.asarray([one_hot_encoder(s=seq) for seq in seq_list])  # (n_sequences, 21, seq_len)\n",
1183 | "X = np.transpose(X, (0, 2, 1))  # -> (n_sequences, seq_len, 21): positions first, residue channels last"
1184 | ]
1185 | },
1186 | {
1187 | "cell_type": "markdown",
1188 | "metadata": {
1189 | "id": "QrdyGSKQWv0V"
1190 | },
1191 | "source": [
1192 | "Predict DeepSP Descriptors (SAP_pos, SCM_neg, SCM_pos)"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": 14,
1198 | "metadata": {
1199 | "colab": {
1200 | "base_uri": "https://localhost:8080/"
1201 | },
1202 | "id": "3Xyrqu5dXwxq",
1203 | "outputId": "84dede34-3550-4fa5-d28e-c8bc27c8e2e1"
1204 | },
1205 | "outputs": [
1206 | {
1207 | "name": "stdout",
1208 | "output_type": "stream",
1209 | "text": [
1210 | "1/1 [==============================] - 0s 424ms/step\n",
1211 | "1/1 [==============================] - 0s 178ms/step\n",
1212 | "1/1 [==============================] - 0s 181ms/step\n"
1213 | ]
1214 | }
1215 | ],
1216 | "source": [
1217 | "def _predict_with_saved_model(architecture_json, weights_h5, inputs):\n",
1218 | "    \"\"\"Rebuild a Keras model from a JSON architecture file and an HDF5 weights file, then predict.\n",
1219 | "\n",
1220 | "    architecture_json and weights_h5 are file paths; inputs is passed unchanged to model.predict().\n",
1221 | "    Returns whatever model.predict(inputs) returns.\n",
1222 | "    \"\"\"\n",
1223 | "    with open(architecture_json, 'r') as json_file:  # closed even if read() raises\n",
1224 | "        loaded_model_json = json_file.read()\n",
1225 | "    loaded_model = model_from_json(loaded_model_json)\n",
1226 | "    loaded_model.load_weights(weights_h5)\n",
1227 | "    # compile() is kept with the same optimizer/loss/metrics as the original workflow\n",
1228 | "    loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae'])\n",
1229 | "    return loaded_model.predict(inputs)\n",
1230 | "\n",
1231 | "# One saved model per property; all three consume the same encoded input X.\n",
1232 | "sap_pos = _predict_with_saved_model('Conv1D_regressionSAPpos.json', 'Conv1D_regression_SAPpos.h5', X)\n",
1233 | "scm_neg = _predict_with_saved_model('Conv1D_regressionSCMneg.json', 'Conv1D_regression_SCMneg.h5', X)\n",
1234 | "scm_pos = _predict_with_saved_model('Conv1D_regressionSCMpos.json', 'Conv1D_regression_SCMpos.h5', X)"
1246 | ]
1247 | },
1248 | {
1249 | "cell_type": "code",
1250 | "execution_count": 15,
1251 | "metadata": {
1252 | "colab": {
1253 | "base_uri": "https://localhost:8080/",
1254 | "height": 600
1255 | },
1256 | "id": "a-FfSETHegbu",
1257 | "outputId": "5e5c0891-7fe1-46d2-b5b2-2ff93195f0af"
1258 | },
1259 | "outputs": [
1260 | {
1261 | "data": {
1262 | "application/vnd.google.colaboratory.intrinsic+json": {
1263 | "type": "dataframe",
1264 | "variable_name": "df"
1265 | },
1266 | "text/html": [
1267 | "\n",
1268 | " \n",
1269 | "
\n",
1270 | "\n",
1283 | "
\n",
1284 | " \n",
1285 | " \n",
1286 | " | \n",
1287 | " Name | \n",
1288 | " SAP_pos_CDRH1 | \n",
1289 | " SAP_pos_CDRH2 | \n",
1290 | " SAP_pos_CDRH3 | \n",
1291 | " SAP_pos_CDRL1 | \n",
1292 | " SAP_pos_CDRL2 | \n",
1293 | " SAP_pos_CDRL3 | \n",
1294 | " SAP_pos_CDR | \n",
1295 | " SAP_pos_Hv | \n",
1296 | " SAP_pos_Lv | \n",
1297 | " ... | \n",
1298 | " SCM_pos_CDRH1 | \n",
1299 | " SCM_pos_CDRH2 | \n",
1300 | " SCM_pos_CDRH3 | \n",
1301 | " SCM_pos_CDRL1 | \n",
1302 | " SCM_pos_CDRL2 | \n",
1303 | " SCM_pos_CDRL3 | \n",
1304 | " SCM_pos_CDR | \n",
1305 | " SCM_pos_Hv | \n",
1306 | " SCM_pos_Lv | \n",
1307 | " SCM_pos_Fv | \n",
1308 | "
\n",
1309 | " \n",
1310 | " \n",
1311 | " \n",
1312 | " 0 | \n",
1313 | " mAb1 | \n",
1314 | " 2.134783 | \n",
1315 | " 2.524245 | \n",
1316 | " 14.445071 | \n",
1317 | " 1.904740 | \n",
1318 | " 3.589351 | \n",
1319 | " 3.172182 | \n",
1320 | " 27.496794 | \n",
1321 | " 58.417648 | \n",
1322 | " 30.517651 | \n",
1323 | " ... | \n",
1324 | " 3.183182 | \n",
1325 | " 19.583357 | \n",
1326 | " 29.513483 | \n",
1327 | " 116.769501 | \n",
1328 | " 41.759361 | \n",
1329 | " 55.548267 | \n",
1330 | " 263.838928 | \n",
1331 | " 907.113037 | \n",
1332 | " 1219.444458 | \n",
1333 | " 2109.088623 | \n",
1334 | "
\n",
1335 | " \n",
1336 | " 1 | \n",
1337 | " mAb2 | \n",
1338 | " 1.844576 | \n",
1339 | " 4.339117 | \n",
1340 | " 8.942592 | \n",
1341 | " 1.613968 | \n",
1342 | " 10.417940 | \n",
1343 | " 9.945271 | \n",
1344 | " 38.409042 | \n",
1345 | " 58.486862 | \n",
1346 | " 44.568775 | \n",
1347 | " ... | \n",
1348 | " 27.889265 | \n",
1349 | " 22.455563 | \n",
1350 | " 154.704300 | \n",
1351 | " 23.503654 | \n",
1352 | " 37.771931 | \n",
1353 | " 90.647194 | \n",
1354 | " 360.761566 | \n",
1355 | " 1224.561279 | \n",
1356 | " 1132.755249 | \n",
1357 | " 2335.686523 | \n",
1358 | "
\n",
1359 | " \n",
1360 | " 2 | \n",
1361 | " mAb3 | \n",
1362 | " 2.809425 | \n",
1363 | " 1.746096 | \n",
1364 | " 20.808647 | \n",
1365 | " 0.617971 | \n",
1366 | " 3.460975 | \n",
1367 | " 5.031782 | \n",
1368 | " 34.877113 | \n",
1369 | " 63.382252 | \n",
1370 | " 44.274189 | \n",
1371 | " ... | \n",
1372 | " 71.399979 | \n",
1373 | " 29.845194 | \n",
1374 | " 41.844185 | \n",
1375 | " 37.094913 | \n",
1376 | " 39.442978 | \n",
1377 | " 22.183475 | \n",
1378 | " 246.880508 | \n",
1379 | " 1165.597656 | \n",
1380 | " 830.064209 | \n",
1381 | " 1963.784912 | \n",
1382 | "
\n",
1383 | " \n",
1384 | " 3 | \n",
1385 | " mAb4 | \n",
1386 | " 3.139667 | \n",
1387 | " 0.300687 | \n",
1388 | " 26.260990 | \n",
1389 | " 1.863188 | \n",
1390 | " 0.188002 | \n",
1391 | " 6.295355 | \n",
1392 | " 38.342857 | \n",
1393 | " 73.466133 | \n",
1394 | " 38.451836 | \n",
1395 | " ... | \n",
1396 | " 27.552292 | \n",
1397 | " 11.260792 | \n",
1398 | " 56.283066 | \n",
1399 | " 13.800642 | \n",
1400 | " 66.964668 | \n",
1401 | " 50.259052 | \n",
1402 | " 225.270020 | \n",
1403 | " 993.915222 | \n",
1404 | " 1097.456543 | \n",
1405 | " 2073.500244 | \n",
1406 | "
\n",
1407 | " \n",
1408 | " 4 | \n",
1409 | " mAb5 | \n",
1410 | " 2.489059 | \n",
1411 | " 0.111882 | \n",
1412 | " 15.967413 | \n",
1413 | " 2.553848 | \n",
1414 | " 0.564827 | \n",
1415 | " 1.848455 | \n",
1416 | " 23.765207 | \n",
1417 | " 61.603027 | \n",
1418 | " 33.180843 | \n",
1419 | " ... | \n",
1420 | " 42.208309 | \n",
1421 | " 10.151648 | \n",
1422 | " 77.493065 | \n",
1423 | " 142.884079 | \n",
1424 | " 81.866493 | \n",
1425 | " 61.183517 | \n",
1426 | " 415.960480 | \n",
1427 | " 1010.628784 | \n",
1428 | " 1221.234863 | \n",
1429 | " 2209.069336 | \n",
1430 | "
\n",
1431 | " \n",
1432 | " 5 | \n",
1433 | " mAb6 | \n",
1434 | " 9.645024 | \n",
1435 | " 2.265561 | \n",
1436 | " 23.064241 | \n",
1437 | " 6.464199 | \n",
1438 | " 0.987525 | \n",
1439 | " 8.052277 | \n",
1440 | " 49.345421 | \n",
1441 | " 61.487507 | \n",
1442 | " 48.577274 | \n",
1443 | " ... | \n",
1444 | " 45.501659 | \n",
1445 | " 18.139217 | \n",
1446 | " 136.593094 | \n",
1447 | " 18.756721 | \n",
1448 | " 71.840881 | \n",
1449 | " 66.386726 | \n",
1450 | " 359.891602 | \n",
1451 | " 1329.421021 | \n",
1452 | " 1113.381714 | \n",
1453 | " 2426.393555 | \n",
1454 | "
\n",
1455 | " \n",
1456 | " 6 | \n",
1457 | " mAb7 | \n",
1458 | " 8.222817 | \n",
1459 | " 2.838049 | \n",
1460 | " 11.414651 | \n",
1461 | " 4.315411 | \n",
1462 | " 2.673587 | \n",
1463 | " 5.908565 | \n",
1464 | " 36.578148 | \n",
1465 | " 50.778774 | \n",
1466 | " 44.486637 | \n",
1467 | " ... | \n",
1468 | " 52.311821 | \n",
1469 | " 74.923622 | \n",
1470 | " 82.053070 | \n",
1471 | " 72.280479 | \n",
1472 | " 45.879509 | \n",
1473 | " 88.023949 | \n",
1474 | " 428.916351 | \n",
1475 | " 1007.111145 | \n",
1476 | " 848.104492 | \n",
1477 | " 1808.066040 | \n",
1478 | "
\n",
1479 | " \n",
1480 | " 7 | \n",
1481 | " mAb8 | \n",
1482 | " 4.424239 | \n",
1483 | " 2.095972 | \n",
1484 | " 16.019022 | \n",
1485 | " 2.523247 | \n",
1486 | " 7.355127 | \n",
1487 | " 1.797327 | \n",
1488 | " 34.046429 | \n",
1489 | " 60.678223 | \n",
1490 | " 34.210297 | \n",
1491 | " ... | \n",
1492 | " 51.687237 | \n",
1493 | " -3.795643 | \n",
1494 | " 91.028336 | \n",
1495 | " 14.377379 | \n",
1496 | " 23.035267 | \n",
1497 | " 32.922684 | \n",
1498 | " 198.840347 | \n",
1499 | " 1157.940308 | \n",
1500 | " 944.083252 | \n",
1501 | " 2085.386963 | \n",
1502 | "
\n",
1503 | " \n",
1504 | " 8 | \n",
1505 | " mAb9 | \n",
1506 | " 1.428219 | \n",
1507 | " 3.449205 | \n",
1508 | " 5.459098 | \n",
1509 | " 2.133244 | \n",
1510 | " 2.771093 | \n",
1511 | " 8.270716 | \n",
1512 | " 23.198622 | \n",
1513 | " 45.440292 | \n",
1514 | " 37.316959 | \n",
1515 | " ... | \n",
1516 | " 15.407248 | \n",
1517 | " 8.218230 | \n",
1518 | " 46.540298 | \n",
1519 | " 8.149660 | \n",
1520 | " 17.474276 | \n",
1521 | " 35.439575 | \n",
1522 | " 130.615524 | \n",
1523 | " 1107.602173 | \n",
1524 | " 954.662292 | \n",
1525 | " 2039.475220 | \n",
1526 | "
\n",
1527 | " \n",
1528 | " 9 | \n",
1529 | " mAb10 | \n",
1530 | " 2.341985 | \n",
1531 | " 0.196968 | \n",
1532 | " 10.534814 | \n",
1533 | " 4.143441 | \n",
1534 | " 6.528968 | \n",
1535 | " 15.280798 | \n",
1536 | " 40.839794 | \n",
1537 | " 55.972923 | \n",
1538 | " 49.084797 | \n",
1539 | " ... | \n",
1540 | " 25.994398 | \n",
1541 | " 14.516958 | \n",
1542 | " 26.370356 | \n",
1543 | " 19.893885 | \n",
1544 | " 69.955307 | \n",
1545 | " 26.277246 | \n",
1546 | " 174.261017 | \n",
1547 | " 1044.724976 | \n",
1548 | " 1139.791870 | \n",
1549 | " 2159.286621 | \n",
1550 | "
\n",
1551 | " \n",
1552 | " 10 | \n",
1553 | " mAb11 | \n",
1554 | " 3.050937 | \n",
1555 | " 9.065768 | \n",
1556 | " 5.594949 | \n",
1557 | " 1.300072 | \n",
1558 | " 4.010072 | \n",
1559 | " 6.862015 | \n",
1560 | " 30.163914 | \n",
1561 | " 55.924561 | \n",
1562 | " 34.403553 | \n",
1563 | " ... | \n",
1564 | " 46.803642 | \n",
1565 | " 44.563057 | \n",
1566 | " 90.800636 | \n",
1567 | " 29.547958 | \n",
1568 | " 120.159859 | \n",
1569 | " 28.398201 | \n",
1570 | " 362.141327 | \n",
1571 | " 1422.787842 | \n",
1572 | " 1272.123779 | \n",
1573 | " 2652.440430 | \n",
1574 | "
\n",
1575 | " \n",
1576 | " 11 | \n",
1577 | " mAb12 | \n",
1578 | " 2.903481 | \n",
1579 | " 0.377317 | \n",
1580 | " 11.468390 | \n",
1581 | " 6.613725 | \n",
1582 | " 5.520360 | \n",
1583 | " 1.477862 | \n",
1584 | " 28.122194 | \n",
1585 | " 51.992001 | \n",
1586 | " 53.377674 | \n",
1587 | " ... | \n",
1588 | " 35.554497 | \n",
1589 | " -3.779891 | \n",
1590 | " -0.063098 | \n",
1591 | " 125.557152 | \n",
1592 | " 43.826054 | \n",
1593 | " 73.312820 | \n",
1594 | " 267.234039 | \n",
1595 | " 969.922485 | \n",
1596 | " 1093.797974 | \n",
1597 | " 2053.877930 | \n",
1598 | "
\n",
1599 | " \n",
1600 | " 12 | \n",
1601 | " mAb13 | \n",
1602 | " 2.835168 | \n",
1603 | " 1.458057 | \n",
1604 | " 9.994059 | \n",
1605 | " 2.109535 | \n",
1606 | " 3.517281 | \n",
1607 | " 3.287737 | \n",
1608 | " 23.887489 | \n",
1609 | " 46.273491 | \n",
1610 | " 35.819031 | \n",
1611 | " ... | \n",
1612 | " 114.275505 | \n",
1613 | " 21.517704 | \n",
1614 | " 52.248070 | \n",
1615 | " 111.769119 | \n",
1616 | " 84.824715 | \n",
1617 | " 86.854416 | \n",
1618 | " 469.482086 | \n",
1619 | " 1137.821533 | \n",
1620 | " 1275.564941 | \n",
1621 | " 2417.019531 | \n",
1622 | "
\n",
1623 | " \n",
1624 | " 13 | \n",
1625 | " mAb14 | \n",
1626 | " 2.589987 | \n",
1627 | " 4.791502 | \n",
1628 | " 20.667749 | \n",
1629 | " 2.308463 | \n",
1630 | " 6.206800 | \n",
1631 | " 5.448440 | \n",
1632 | " 43.316048 | \n",
1633 | " 63.523815 | \n",
1634 | " 38.004623 | \n",
1635 | " ... | \n",
1636 | " 45.412704 | \n",
1637 | " 26.941366 | \n",
1638 | " 64.773582 | \n",
1639 | " 62.668564 | \n",
1640 | " 109.243866 | \n",
1641 | " 33.384670 | \n",
1642 | " 342.168793 | \n",
1643 | " 1134.156250 | \n",
1644 | " 1143.960449 | \n",
1645 | " 2254.151367 | \n",
1646 | "
\n",
1647 | " \n",
1648 | " 14 | \n",
1649 | " mAb15 | \n",
1650 | " 2.769219 | \n",
1651 | " 3.568675 | \n",
1652 | " 14.486679 | \n",
1653 | " 9.599416 | \n",
1654 | " 2.980665 | \n",
1655 | " 5.314492 | \n",
1656 | " 38.965103 | \n",
1657 | " 48.768085 | \n",
1658 | " 39.433372 | \n",
1659 | " ... | \n",
1660 | " 52.217686 | \n",
1661 | " 33.024086 | \n",
1662 | " 68.119850 | \n",
1663 | " 48.751259 | \n",
1664 | " 103.548256 | \n",
1665 | " 25.522581 | \n",
1666 | " 326.298615 | \n",
1667 | " 1186.510864 | \n",
1668 | " 1266.150513 | \n",
1669 | " 2435.450439 | \n",
1670 | "
\n",
1671 | " \n",
1672 | " 15 | \n",
1673 | " mAb16 | \n",
1674 | " 2.071724 | \n",
1675 | " 7.665909 | \n",
1676 | " 19.876446 | \n",
1677 | " 4.267005 | \n",
1678 | " 3.184840 | \n",
1679 | " 6.350702 | \n",
1680 | " 44.845722 | \n",
1681 | " 68.781349 | \n",
1682 | " 37.064419 | \n",
1683 | " ... | \n",
1684 | " 33.740913 | \n",
1685 | " 22.686165 | \n",
1686 | " 151.189835 | \n",
1687 | " 41.309410 | \n",
1688 | " 68.023109 | \n",
1689 | " 32.488457 | \n",
1690 | " 351.102264 | \n",
1691 | " 1164.186035 | \n",
1692 | " 1198.958496 | \n",
1693 | " 2334.452393 | \n",
1694 | "
\n",
1695 | " \n",
1696 | "
\n",
1697 | "
16 rows × 31 columns
\n",
1698 | "
\n",
1699 | "
\n",
1906 | "
\n"
1907 | ],
1908 | "text/plain": [
1909 | " Name SAP_pos_CDRH1 SAP_pos_CDRH2 SAP_pos_CDRH3 SAP_pos_CDRL1 \\\n",
1910 | "0 mAb1 2.134783 2.524245 14.445071 1.904740 \n",
1911 | "1 mAb2 1.844576 4.339117 8.942592 1.613968 \n",
1912 | "2 mAb3 2.809425 1.746096 20.808647 0.617971 \n",
1913 | "3 mAb4 3.139667 0.300687 26.260990 1.863188 \n",
1914 | "4 mAb5 2.489059 0.111882 15.967413 2.553848 \n",
1915 | "5 mAb6 9.645024 2.265561 23.064241 6.464199 \n",
1916 | "6 mAb7 8.222817 2.838049 11.414651 4.315411 \n",
1917 | "7 mAb8 4.424239 2.095972 16.019022 2.523247 \n",
1918 | "8 mAb9 1.428219 3.449205 5.459098 2.133244 \n",
1919 | "9 mAb10 2.341985 0.196968 10.534814 4.143441 \n",
1920 | "10 mAb11 3.050937 9.065768 5.594949 1.300072 \n",
1921 | "11 mAb12 2.903481 0.377317 11.468390 6.613725 \n",
1922 | "12 mAb13 2.835168 1.458057 9.994059 2.109535 \n",
1923 | "13 mAb14 2.589987 4.791502 20.667749 2.308463 \n",
1924 | "14 mAb15 2.769219 3.568675 14.486679 9.599416 \n",
1925 | "15 mAb16 2.071724 7.665909 19.876446 4.267005 \n",
1926 | "\n",
1927 | " SAP_pos_CDRL2 SAP_pos_CDRL3 SAP_pos_CDR SAP_pos_Hv SAP_pos_Lv ... \\\n",
1928 | "0 3.589351 3.172182 27.496794 58.417648 30.517651 ... \n",
1929 | "1 10.417940 9.945271 38.409042 58.486862 44.568775 ... \n",
1930 | "2 3.460975 5.031782 34.877113 63.382252 44.274189 ... \n",
1931 | "3 0.188002 6.295355 38.342857 73.466133 38.451836 ... \n",
1932 | "4 0.564827 1.848455 23.765207 61.603027 33.180843 ... \n",
1933 | "5 0.987525 8.052277 49.345421 61.487507 48.577274 ... \n",
1934 | "6 2.673587 5.908565 36.578148 50.778774 44.486637 ... \n",
1935 | "7 7.355127 1.797327 34.046429 60.678223 34.210297 ... \n",
1936 | "8 2.771093 8.270716 23.198622 45.440292 37.316959 ... \n",
1937 | "9 6.528968 15.280798 40.839794 55.972923 49.084797 ... \n",
1938 | "10 4.010072 6.862015 30.163914 55.924561 34.403553 ... \n",
1939 | "11 5.520360 1.477862 28.122194 51.992001 53.377674 ... \n",
1940 | "12 3.517281 3.287737 23.887489 46.273491 35.819031 ... \n",
1941 | "13 6.206800 5.448440 43.316048 63.523815 38.004623 ... \n",
1942 | "14 2.980665 5.314492 38.965103 48.768085 39.433372 ... \n",
1943 | "15 3.184840 6.350702 44.845722 68.781349 37.064419 ... \n",
1944 | "\n",
1945 | " SCM_pos_CDRH1 SCM_pos_CDRH2 SCM_pos_CDRH3 SCM_pos_CDRL1 SCM_pos_CDRL2 \\\n",
1946 | "0 3.183182 19.583357 29.513483 116.769501 41.759361 \n",
1947 | "1 27.889265 22.455563 154.704300 23.503654 37.771931 \n",
1948 | "2 71.399979 29.845194 41.844185 37.094913 39.442978 \n",
1949 | "3 27.552292 11.260792 56.283066 13.800642 66.964668 \n",
1950 | "4 42.208309 10.151648 77.493065 142.884079 81.866493 \n",
1951 | "5 45.501659 18.139217 136.593094 18.756721 71.840881 \n",
1952 | "6 52.311821 74.923622 82.053070 72.280479 45.879509 \n",
1953 | "7 51.687237 -3.795643 91.028336 14.377379 23.035267 \n",
1954 | "8 15.407248 8.218230 46.540298 8.149660 17.474276 \n",
1955 | "9 25.994398 14.516958 26.370356 19.893885 69.955307 \n",
1956 | "10 46.803642 44.563057 90.800636 29.547958 120.159859 \n",
1957 | "11 35.554497 -3.779891 -0.063098 125.557152 43.826054 \n",
1958 | "12 114.275505 21.517704 52.248070 111.769119 84.824715 \n",
1959 | "13 45.412704 26.941366 64.773582 62.668564 109.243866 \n",
1960 | "14 52.217686 33.024086 68.119850 48.751259 103.548256 \n",
1961 | "15 33.740913 22.686165 151.189835 41.309410 68.023109 \n",
1962 | "\n",
1963 | " SCM_pos_CDRL3 SCM_pos_CDR SCM_pos_Hv SCM_pos_Lv SCM_pos_Fv \n",
1964 | "0 55.548267 263.838928 907.113037 1219.444458 2109.088623 \n",
1965 | "1 90.647194 360.761566 1224.561279 1132.755249 2335.686523 \n",
1966 | "2 22.183475 246.880508 1165.597656 830.064209 1963.784912 \n",
1967 | "3 50.259052 225.270020 993.915222 1097.456543 2073.500244 \n",
1968 | "4 61.183517 415.960480 1010.628784 1221.234863 2209.069336 \n",
1969 | "5 66.386726 359.891602 1329.421021 1113.381714 2426.393555 \n",
1970 | "6 88.023949 428.916351 1007.111145 848.104492 1808.066040 \n",
1971 | "7 32.922684 198.840347 1157.940308 944.083252 2085.386963 \n",
1972 | "8 35.439575 130.615524 1107.602173 954.662292 2039.475220 \n",
1973 | "9 26.277246 174.261017 1044.724976 1139.791870 2159.286621 \n",
1974 | "10 28.398201 362.141327 1422.787842 1272.123779 2652.440430 \n",
1975 | "11 73.312820 267.234039 969.922485 1093.797974 2053.877930 \n",
1976 | "12 86.854416 469.482086 1137.821533 1275.564941 2417.019531 \n",
1977 | "13 33.384670 342.168793 1134.156250 1143.960449 2254.151367 \n",
1978 | "14 25.522581 326.298615 1186.510864 1266.150513 2435.450439 \n",
1979 | "15 32.488457 351.102264 1164.186035 1198.958496 2334.452393 \n",
1980 | "\n",
1981 | "[16 rows x 31 columns]"
1982 | ]
1983 | },
1984 | "execution_count": 15,
1985 | "metadata": {},
1986 | "output_type": "execute_result"
1987 | }
1988 | ],
1989 | "source": [
1990 | "features = ['Name'] + [f'{prop}_{region}' for prop in ('SAP_pos', 'SCM_neg', 'SCM_pos')\n",
1991 | "                       for region in ('CDRH1','CDRH2','CDRH3','CDRL1','CDRL2','CDRL3','CDR','Hv','Lv','Fv')]\n",
1992 | "df = pd.concat([pd.DataFrame(part) for part in (name_list, sap_pos, scm_neg, scm_pos)], axis=1, ignore_index=True)\n",
1993 | "df.columns = features  # 1 name column + 3 properties x 10 regions = 31 columns\n",
1994 | "df.to_csv('DeepSP_descriptors.csv', index=False)\n",
1995 | "df"
1996 | ]
1997 | },
1998 | {
1999 | "cell_type": "code",
2000 | "execution_count": 15,
2001 | "metadata": {
2002 | "id": "R0Gi5_po05Ct"
2003 | },
2004 | "outputs": [],
2005 | "source": []
2006 | }
2007 | ],
2008 | "metadata": {
2009 | "colab": {
2010 | "provenance": []
2011 | },
2012 | "kernelspec": {
2013 | "display_name": "Python 3 (ipykernel)",
2014 | "language": "python",
2015 | "name": "python3"
2016 | },
2017 | "language_info": {
2018 | "codemirror_mode": {
2019 | "name": "ipython",
2020 | "version": 3
2021 | },
2022 | "file_extension": ".py",
2023 | "mimetype": "text/x-python",
2024 | "name": "python",
2025 | "nbconvert_exporter": "python",
2026 | "pygments_lexer": "ipython3",
2027 | "version": "3.9.13"
2028 | }
2029 | },
2030 | "nbformat": 4,
2031 | "nbformat_minor": 1
2032 | }
2033 |
--------------------------------------------------------------------------------
/Conv1D_regressionSAPpos.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": {"name": "model_conv1D", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 272, 21], "dtype": "float32", "sparse": false, "ragged": false, "name": "input_1"}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_1", "trainable": true, "dtype": "float32", "filters": 128, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout", "trainable": true, "dtype": "float32", "rate": 0.3, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_2", "trainable": true, "dtype": "float32", "filters": 96, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, 
"kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_3", "trainable": true, "dtype": "float32", "filters": 32, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_2", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "MaxPooling1D", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}}, {"class_name": "Flatten", "config": {"name": "flatten", "trainable": true, "dtype": 
"float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "Dense_1", "trainable": true, "dtype": "float32", "units": 112, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_2", "trainable": true, "dtype": "float32", "units": 48, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_3", "trainable": true, "dtype": "float32", "units": 10, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.11.0", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/Conv1D_regressionSCMneg.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": {"name": "model_conv1D", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 272, 21], "dtype": "float32", "sparse": false, "ragged": false, "name": "input_3"}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_1", "trainable": true, "dtype": "float32", "filters": 128, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_6", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_4", "trainable": true, "dtype": "float32", "rate": 0.1, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_2", "trainable": true, "dtype": "float32", "filters": 112, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, 
"kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_7", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_3", "trainable": true, "dtype": "float32", "filters": 64, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_8", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "MaxPooling1D", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}}, {"class_name": "Flatten", "config": {"name": "flatten_2", "trainable": true, 
"dtype": "float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "Dense_1", "trainable": true, "dtype": "float32", "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_2", "trainable": true, "dtype": "float32", "units": 10, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.11.0", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/Conv1D_regressionSCMpos.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": {"name": "model_conv1D", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 272, 21], "dtype": "float32", "sparse": false, "ragged": false, "name": "input_2"}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_1", "trainable": true, "dtype": "float32", "filters": 128, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_3", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "dtype": "float32", "rate": 0.4, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_2", "trainable": true, "dtype": "float32", "filters": 112, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, 
"kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_4", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "dtype": "float32", "rate": 0.4, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_3", "trainable": true, "dtype": "float32", "filters": 144, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_5", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "dtype": "float32", "rate": 0.0, "noise_shape": 
null, "seed": null}}, {"class_name": "MaxPooling1D", "config": {"name": "MaxPooling1D", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "dtype": "float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "Dense_1", "trainable": true, "dtype": "float32", "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_2", "trainable": true, "dtype": "float32", "units": 10, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.11.0", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/Conv1D_regression_SAPpos.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lailabcode/DeepSP/4ba0118fa8e95873daf4a4dcef52beca08b67ab5/Conv1D_regression_SAPpos.h5
--------------------------------------------------------------------------------
/Conv1D_regression_SCMneg.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lailabcode/DeepSP/4ba0118fa8e95873daf4a4dcef52beca08b67ab5/Conv1D_regression_SCMneg.h5
--------------------------------------------------------------------------------
/Conv1D_regression_SCMpos.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lailabcode/DeepSP/4ba0118fa8e95873daf4a4dcef52beca08b67ab5/Conv1D_regression_SCMpos.h5
--------------------------------------------------------------------------------
/DeepSP-app.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Aug 22 17:32:38 2023
4 |
5 | @author: plai3
6 | """
7 |
8 | import streamlit as st
9 | import numpy as np
10 | import pandas as pd
11 |
12 | from keras.models import model_from_json
13 |
14 | from Bio import SeqIO
15 | from io import StringIO
16 | from anarci import anarci
17 |
def one_hot_encoder(s):
    """One-hot encode an aligned amino-acid string.

    Args:
        s: Sequence made of the 20 standard residue letters and '-' (gap).

    Returns:
        A float array of shape (21, len(s)); column j holds a single 1 in the
        row assigned to character s[j].

    Raises:
        KeyError: If s contains a character outside the 21-symbol alphabet.
    """
    # Row order: the 20 residues alphabetically by one-letter code, then the gap.
    alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    row_of = {symbol: row for row, symbol in enumerate(alphabet)}

    encoded = np.zeros((len(row_of), len(s)))
    rows = [row_of[symbol] for symbol in s]
    encoded[rows, range(len(s))] = 1

    return encoded
25 |
# IMGT column labels kept for the heavy chain, in alignment order: positions
# 1-111, the insertions 111A-111H, the insertions 112I-112A (listed in
# descending letter order), then 112-128. 145 columns in total.
H_inclusion_list = (
    [str(position) for position in range(1, 112)]
    + ['111' + letter for letter in 'ABCDEFGH']
    + ['112' + letter for letter in 'IHGFEDCBA']
    + [str(position) for position in range(112, 129)]
)

# IMGT column labels kept for the light chain: positions 1-127, no insertions.
L_inclusion_list = [str(position) for position in range(1, 128)]

# Heavy-chain label -> 0-based column index (same order as H_inclusion_list).
H_dict = {label: column for column, label in enumerate(H_inclusion_list)}

# Light-chain label -> 0-based column index. Covers '1'..'128', i.e. one more
# label ('128' -> 127) than L_inclusion_list contains.
L_dict = {str(position): position - 1 for position in range(1, 129)}
85 |
# Page-level configuration, followed by the static title and usage text.
st.set_page_config(
    page_title="DeepSP App",
    layout="centered",
)

st.title('DeepSP')
st.header('Deep learning-based antibody structural properties')
st.subheader('The FASTA file format is H_seq/L_seq (variable regions)')

# The backslash is doubled so Python sees a valid escape sequence; the rendered
# text is unchanged (a literal backslash followed by '>', which stops Markdown
# from treating the FASTA header as a block quote).
st.markdown('''
### EXAMPLE:
\\>6p8n
QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMNWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARGKNSDYNWDFQHWGQGTLVTVSS/DIVMSQSPSSLAVSVGEKVTMSCKSSQSLLYSSNQKNYLAWYQQKPGQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTISSVKAEDLAVYYCQQYEMFGGGTKLEIK
''')
100 |
def _load_model(json_path, weights_path):
    """Rebuild a Keras model from its JSON architecture file and load its weights.

    Args:
        json_path: Path of the JSON file describing the architecture.
        weights_path: Path of the HDF5 file holding the trained weights.

    Returns:
        The compiled model.
    """
    # `with` guarantees the file is closed even if model_from_json raises.
    with open(json_path, 'r') as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(weights_path)
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
    return model


def _align_to_columns(domain_numbering, column_of, n_columns):
    """Place numbered residues into a fixed-width, gap-padded column list.

    Args:
        domain_numbering: Iterable of ((position, insertion_code), residue)
            entries as produced by ANARCI for one domain.
        column_of: Mapping from a position label such as '111A' to its column.
        n_columns: Width of the alignment.

    Returns:
        A list of n_columns one-character strings; unused columns hold '-'.

    Raises:
        KeyError: If a position label has no entry in column_of.
        IndexError: If a mapped column lies outside the alignment width.
    """
    aligned = n_columns * ['-']
    for (position, insertion), residue in domain_numbering:
        # ANARCI uses ' ' as the insertion code of un-inserted positions.
        label = (str(position) + insertion).replace(" ", "")
        aligned[column_of[label]] = residue
    return aligned


seq_file = st.file_uploader("#### Upload your FASTA file", type=['fasta'])
if seq_file is not None:
    stringio = StringIO(seq_file.getvalue().decode("utf-8"))
    sequences_H = []
    sequences_L = []
    for record in SeqIO.parse(stringio, 'fasta'):
        name = str(record.id)
        chains = str(record.seq).split('/')
        # Each record must be exactly "heavy/light"; skip anything else
        # instead of aborting the whole upload.
        if len(chains) != 2 or not all(chains):
            st.warning('Skipping ' + name + ': expected the format H_seq/L_seq')
            continue
        sequences_H.append((name, chains[0]))
        sequences_L.append((name, chains[1]))

    # Names and aligned sequences are collected together so that every row of
    # the prediction table is labelled with the sequence it was computed from.
    name_list = []
    seq_list = []
    if sequences_H:
        results_H = anarci(sequences_H, scheme="imgt", output=False)
        results_L = anarci(sequences_L, scheme="imgt", output=False)
        numbering_H, alignment_details_H, hit_tables_H = results_H
        numbering_L, alignment_details_L, hit_tables_L = results_L

        # Iterate over the sequences
        for i in range(len(sequences_H)):
            name = sequences_H[i][0]
            # Either chain may fail to be numbered; both are required.
            if numbering_H[i] is None or numbering_L[i] is None:
                st.warning('ANARCI did not number ' + name)
                continue
            domain_numbering_H, start_index_H, end_index_H = numbering_H[i][0]
            domain_numbering_L, start_index_L, end_index_L = numbering_L[i][0]
            try:
                # 145 heavy + 127 light columns = the 272 positions the models expect.
                H_tmp = _align_to_columns(domain_numbering_H, H_dict, 145)
                L_tmp = _align_to_columns(domain_numbering_L, L_dict, 127)
            except (KeyError, IndexError):
                st.warning('Skipping ' + name + ': it has an IMGT position outside the supported alignment columns')
                continue
            name_list.append(name)
            seq_list.append(''.join(H_tmp + L_tmp))

    if not seq_list:
        st.error('No sequence could be processed, so there is nothing to predict.')
    else:
        # (n_sequences, 21, 272) -> (n_sequences, 272, 21)
        X = np.transpose(np.asarray([one_hot_encoder(s=x) for x in seq_list]), (0, 2, 1))

        regions = ['CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3', 'CDR', 'Hv', 'Lv', 'Fv']
        # (column prefix, architecture file, weights file), in output order.
        model_files = [
            ('SAP_pos', 'Conv1D_regressionSAPpos.json', 'Conv1D_regression_SAPpos.h5'),
            ('SCM_neg', 'Conv1D_regressionSCMneg.json', 'Conv1D_regression_SCMneg.h5'),
            ('SCM_pos', 'Conv1D_regressionSCMpos.json', 'Conv1D_regression_SCMpos.h5'),
        ]

        frames = [pd.DataFrame(name_list, columns=['ID'])]
        for prefix, json_path, weights_path in model_files:
            y_pred = _load_model(json_path, weights_path).predict(X)
            columns = [prefix + '_' + region for region in regions]
            frames.append(pd.DataFrame(y_pred, columns=columns))

        df_DeepSP = pd.concat(frames, axis=1)
        st.dataframe(data = df_DeepSP, use_container_width=True, hide_index=True)
201 |
202 |
--------------------------------------------------------------------------------
/DeepSP_input.csv:
--------------------------------------------------------------------------------
1 | Name,Heavy_Chain,Light_Chain
2 | mAb1,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSAITWNSGHIDYADSVEGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAKVSYLSTASSLDYWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQRYNRAPYTFGQGTKVEIK
3 | mAb2,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLEWVAWISPYGGSTYYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARRHWPGGFDYWGQGTLVTVSA,DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLYHPATFGQGTKVEIK
4 | mAb3,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLSINKDNSKSQVFFKMNSLQSNDTAIYYCARALTYYDYEFAYWGQGTLVTVSA,DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRLLIKYASESISGIPSRFSGSGSGTDFTLSINSVESEDIADYYCQQNNNWPTTFGAGTKLELK
5 | mAb4,EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLEWVSAISGSGGGTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYFCAKDKILWFGEPVFDYWGQGTLVTVSS,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPPTFGQGTKVEIK
6 | mAb5,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSGITGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDPGTTVIMSWFDPWGQGTLVTVSS,EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVFYCQQYGSSPRTFGQGTKVEIK
7 | mAb6,QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLEWVAFMSYDGSNKKYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDRGIAAGGNYYYYGMDVWGQGTTVTVSS,EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPPFTFGPGTKVDIK
8 | mAb7,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLEWVAEIRSKSINSATHYAESVKGRFTISRDDSKSAVYLQMTDLRTEDTGVYYCSRNYYGSTYDYWGQGTTLTVSS,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRLLIKYASESMSGIPSRFSGSGSGTDFTLSINTVESEDIADYYCQQSHSWPFTFGSGTNLEVK
9 | mAb8,EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGLEWVASITYDGSTNYNPSVKGRITISRDDSKNTFYLQMNSLRAEDTAVYYCARGSHYFGHWHFAVWGQGTLVTVSS,DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGKAPKLLIYAASYLESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSHEDPYTFGQGTKVEIK
10 | mAb9,QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKGLEWIGHIYYSGNTNYNPSLKSRLTISIDTSKTQFSLKLSSVTAADTAIYYCVRDRVTGAFDIWGQGTMVTVSS,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYFCQHFDHLPLAFGGGTKVEIK
11 | mAb10,EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLEWVADVNPNSGGSIYNQRFKGRFTLSVDRSKNTLYLQMNSLRAEDTAVYYCARNLGPSFYFDYWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKLLIYSASYRYTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYYIYPYTFGQGTKVEIK
12 | mAb11,QVQLQESGPGLVRPSQTLSLTCTVSGYSITSDHAWSWVRQPPGRGLEWIGYISYSGITTYNPSLKSRVTMLRDTSKNQFSLRLSSVTAADTAVYYCARSLARTTAMDYWGQGSLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQDISSYLNWYQQKPGKAPKLLIYYTSRLHSGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQGNTLPYTFGQGTKVEIK
13 | mAb12,QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLEWIGEIDPSESNTNYNQKFKGRVTLTVDISASTAYMELSSLRSEDTAVYYCARGGYDGWDYAIDYWGQGTLVTVSS,DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPGQSPQLLIYGISNRFSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCLQGTHQPYTFGQGTKVEIK
14 | mAb13,QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWIGAIYPGNSDTSYNQKFEGKAKLTAVTSASTAYMELSSLTHEDSAVYYCSRDYGYYFDFWGQGTTLTVSS,QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRWIYDTSKLASGVPARFSGSGSGTSYSLTISSMEAEDAATYYCHQRSSYTFGGGTKLEIK
15 | mAb14,QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLEWMGRIDPANGYTKYDPKFQGRVTITADTSASTAYMELSSLRSEDEAVYYCAREGYYGNYGVYAMDYWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRLLIHYTSALQPGIPSRFSGSGSGRDYTFTISSLQPEDIATYYCLQYDNLWTFGQGTKVEIK
16 | mAb15,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLEWIGCIYPGNVNTNYNEKFKDRATLTVDTSISTAYMELSRLRSDDTAVYFCTRSHYGLDWNFDVWGQGTTVTVSS,DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKLLIYKASNLHTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQGQTYPYTFGGGTKVEIK
17 | mAb16,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSQISPAGGYTNYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARGELPYYRMSKVMDVWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQYFSSYLAWYQQKPGKAPKLLIYGASSRASGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLGSPPTFGQGTKVEIK
18 |
--------------------------------------------------------------------------------
/DeepSP_model_train.py:
--------------------------------------------------------------------------------
1 | # Import libraries
2 | import numpy as np
3 | import pandas as pd
4 | import random
5 |
6 |
7 | from numpy.random import seed
8 |
9 | # Import machine learning libraries
10 | import tensorflow as tf
11 | import keras
12 | from keras.models import model_from_json
13 | from keras.layers import BatchNormalization
14 | from keras.callbacks import ModelCheckpoint
15 | from keras.optimizers import Adam
16 | import keras_tuner as kt
17 | from sklearn.model_selection import train_test_split
18 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
19 |
20 |
21 | np.random.seed(0)
22 | random.seed(0)
23 | tf.random.set_seed(0)
24 |
25 |
def load_input_data(filename):
    """Read a whitespace-delimited data file of names, sequences and scores.

    Each non-blank line is expected to hold: a name, a sequence, then zero or
    more numeric scores. Blank or whitespace-only lines (for example a
    trailing newline at the end of the file) are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple (name_list, seq_list, score_list) of equal-length lists;
        score_list[i] is the list of floats parsed from line i.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If a score field cannot be parsed as a float.
    """
    name_list = []
    seq_list = []
    score_list = []

    with open(filename) as datafile:
        for line in datafile:
            fields = line.split()
            if not fields:
                # Nothing on this line; previously this raised IndexError.
                continue
            name_list.append(fields[0])
            seq_list.append(fields[1])
            score_list.append([float(value) for value in fields[2:]])
    return name_list, seq_list, score_list
42 |
def one_hot_encoder(s):
    """Turn an aligned amino-acid string into a one-hot matrix.

    Args:
        s: String over the 20 standard residue letters plus '-' for gaps.

    Returns:
        A (21, len(s)) float array with exactly one 1 per column, in the row
        that corresponds to the character at that position.

    Raises:
        KeyError: If s contains a character that is not in the alphabet.
    """
    # Residues in alphabetical one-letter order, gap symbol last.
    d = dict(zip('ACDEFGHIKLMNPQRSTVWY-', range(21)))

    n_positions = len(s)
    x = np.zeros((len(d), n_positions))
    x[[d[c] for c in s], range(n_positions)] = 1

    return x
50 |
51 |
def best_model_SAPpos():
    """Assemble the uncompiled Conv1D regression network for the SAPpos data.

    Returns:
        A keras.Sequential named "model_conv1D" that takes inputs of shape
        (272, 21) and ends in a 10-unit Dense layer.
    """
    layers = keras.layers
    best_model = keras.Sequential(name="model_conv1D")
    add = best_model.add

    add(layers.Input(shape=(272, 21)))

    # Convolution block 1 (with dropout)
    add(layers.Conv1D(filters=128, kernel_size=5, activation='relu', name="Conv1D_1"))
    add(BatchNormalization())
    add(layers.Dropout(0.3))

    # Convolution block 2
    add(layers.Conv1D(filters=96, kernel_size=4, activation='relu', name="Conv1D_2"))
    add(BatchNormalization())

    # Convolution block 3
    add(layers.Conv1D(filters=32, kernel_size=5, activation='relu', name="Conv1D_3"))
    add(BatchNormalization())

    # Down-sample, then flatten for the dense layers
    add(layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"))
    add(layers.Flatten())

    # Two hidden dense layers followed by the 10-value output layer
    add(layers.Dense(units=112, activation='relu', name="Dense_1"))
    add(layers.Dense(units=48, activation='relu', name="Dense_2"))
    add(layers.Dense(10, name="Dense_3"))

    return best_model
77 |
78 |
def best_model_SCMpos():
    """Assemble the uncompiled Conv1D regression network for the SCMpos data.

    Returns:
        A keras.Sequential named "model_conv1D" that takes inputs of shape
        (272, 21) and ends in a 10-unit Dense layer.
    """
    layers = keras.layers
    best_model = keras.Sequential(name="model_conv1D")
    add = best_model.add

    add(layers.Input(shape=(272, 21)))

    # Convolution block 1
    add(layers.Conv1D(filters=128, kernel_size=4, activation='relu', name="Conv1D_1"))
    add(BatchNormalization())
    add(layers.Dropout(0.4))

    # Convolution block 2
    add(layers.Conv1D(filters=112, kernel_size=4, activation='relu', name="Conv1D_2"))
    add(BatchNormalization())
    add(layers.Dropout(0.4))

    # Convolution block 3 (its dropout rate is 0.0)
    add(layers.Conv1D(filters=144, kernel_size=5, activation='relu', name="Conv1D_3"))
    add(BatchNormalization())
    add(layers.Dropout(0.0))

    # Down-sample, then flatten for the dense layers
    add(layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"))
    add(layers.Flatten())

    # One hidden dense layer followed by the 10-value output layer
    add(layers.Dense(units=128, activation='relu', name="Dense_1"))
    add(layers.Dense(10, name="Dense_2"))

    return best_model
104 |
105 |
def best_model_SCMneg():
    """Assemble the uncompiled Conv1D regression network for the SCMneg data.

    Returns:
        A keras.Sequential named "model_conv1D" that takes inputs of shape
        (272, 21) and ends in a 10-unit Dense layer.
    """
    layers = keras.layers
    best_model = keras.Sequential(name="model_conv1D")
    add = best_model.add

    add(layers.Input(shape=(272, 21)))

    # Convolution block 1 (with dropout)
    add(layers.Conv1D(filters=128, kernel_size=5, activation='relu', name="Conv1D_1"))
    add(BatchNormalization())
    add(layers.Dropout(0.1))

    # Convolution block 2
    add(layers.Conv1D(filters=112, kernel_size=4, activation='relu', name="Conv1D_2"))
    add(BatchNormalization())

    # Convolution block 3
    add(layers.Conv1D(filters=64, kernel_size=4, activation='relu', name="Conv1D_3"))
    add(BatchNormalization())

    # Down-sample, then flatten for the dense layers
    add(layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"))
    add(layers.Flatten())

    # One hidden dense layer followed by the 10-value output layer
    add(layers.Dense(units=128, activation='relu', name="Dense_1"))
    add(layers.Dense(10, name="Dense_2"))

    return best_model
129 |
130 |
def _encode_sequences(sequences):
    """One-hot encode sequences into an array ordered (sequence, position, symbol)."""
    encoded = np.asarray([one_hot_encoder(s=x) for x in sequences])
    return np.transpose(encoded, (0, 2, 1))


#ts = 0.2; bs = 64
filenames = ['Deep_SAPpos_data.txt', 'Deep_SCMpos_data.txt', 'Deep_SCMneg_data.txt']
models = [best_model_SAPpos(), best_model_SCMpos(), best_model_SCMneg()]
l_rates = [0.0001, 0.005, 0.0001]

# Labels of the 10 regression targets, in column order.
reg = ['CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3', 'CDR', 'Hv', 'Lv', 'Fv']

for file, model, l_rate in zip(filenames, models, l_rates):
    # 'Deep_SAPpos_data.txt' -> 'SAPpos'
    prop = file.split('_')[1]

    name_list, seq_list, score_list = load_input_data("data/" + file)

    # Hold out 20% for testing, then 25% of the remainder (20% overall) for validation.
    X_rest, X_test, y_rest, y_test = train_test_split(seq_list, score_list, test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25, random_state=0)

    X_train = _encode_sequences(X_train)
    X_test = _encode_sequences(X_test)
    X_val = _encode_sequences(X_val)

    y_train = np.asarray(y_train).reshape((-1, 10))
    y_test = np.asarray(y_test).reshape((-1, 10))
    y_val = np.asarray(y_val).reshape((-1, 10))

    # Compile the architecture with its tuned learning rate.
    optimizer = Adam(learning_rate=l_rate)
    best_model = model
    best_model.compile(optimizer=optimizer, loss='mae', metrics=None)

    # Keep only the weights of the epoch with the lowest validation loss.
    filepath = 'Conv1D_regression_' + prop + '.h5'
    checkpoint = ModelCheckpoint(
        filepath=filepath,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode='min',
    )

    history = best_model.fit(
        x=X_train,
        y=y_train,
        shuffle=True,
        validation_data=(X_val, y_val),
        epochs=50,
        callbacks=[checkpoint],
        batch_size=64,
        verbose=2,
    )

    # Persist the architecture next to the checkpointed weights.
    Conv1D_regression_json = best_model.to_json()
    with open("Conv1D_regression" + prop + ".json", "w") as json_file:
        json_file.write(Conv1D_regression_json)

    # Rebuild the network from JSON and evaluate it with the best checkpoint.
    pred_model = model_from_json(Conv1D_regression_json)
    pred_model.load_weights(filepath)
    pred_model.compile(optimizer=optimizer, metrics=['mae'])
    y_pred = pred_model.predict(X_test)

    best_val_loss = min(history.history['val_loss'])

    target_columns = range(y_test.shape[1])

    # Per-target reference values: the test-set mean, and the MAE obtained by
    # always predicting that mean.
    mean_score_list = [np.mean(y_test[:, t]) for t in target_columns]
    baseline_mae_list = [
        mean_absolute_error(y_test[:, t], np.full_like(y_test[:, t], np.mean(y_test[:, t])))
        for t in target_columns
    ]

    # Per-target model performance: MAE and correlation coefficient.
    mae_list = [mean_absolute_error(y_test[:, t], y_pred[:, t]) for t in target_columns]
    corr_list = [np.corrcoef(y_test[:, t], y_pred[:, t])[0, 1] for t in target_columns]

    rows = list(zip(reg, mean_score_list, baseline_mae_list, mae_list, corr_list))
    result_df = pd.DataFrame({
        "prop": [prop + region for region, _, _, _, _ in rows],
        "Mean_score": [mean_score for _, mean_score, _, _, _ in rows],
        "Baseline_MAE": [baseline for _, _, baseline, _, _ in rows],
        "Val_loss": [best_val_loss for _ in rows],
        "MAE": [mae for _, _, _, mae, _ in rows],
        "R": [corr for _, _, _, _, corr in rows],
    })
    result_df.to_csv("hyp_metric_" + prop + ".csv", index=False)

    # Training history (one row per epoch).
    his_df = pd.DataFrame(history.history)
    his_df.to_csv("his" + prop + ".csv", index=False)

# Merge the per-property metric tables into a single summary file.
data_frames = [pd.read_csv("hyp_metric_" + file.split('_')[1] + ".csv") for file in filenames]
concatenated_df = pd.concat(data_frames, ignore_index=True)
concatenated_df.to_csv("Final_model_metric.csv", index=False)
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Lailabcode
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepSP
2 | DeepSP is an antibody-specific surrogate model that generates 30 spatial properties of an antibody based solely on its sequence.
3 |
4 | # How to generate descriptors (features) using DeepSP
5 |
6 | ## Option 1 - Google colab notebook
7 | - Run
8 | 1. Prepare your input file according to the format DeepSP_input.csv
9 | 2. Run the notebook file DeepSP_predictor.ipynb
10 | 3. The DeepSP structural properties for the input sequences will be computed and saved to a CSV file - 'DeepSP_descriptor.csv'.
11 |
12 | ## Option 2 - Linux environment
13 | - Set up (bash) - create an environment and install the necessary packages
14 | 1. conda create -n deepSP python=3.9.13
15 | 2. source activate deepSP
16 | 3. conda install -c bioconda anarci
17 | 4. pip install keras==2.11.0 tensorflow-cpu==2.11.0 scikit-learn==1.0.2 pandas numpy==1.26.4
18 | - Run
19 | 1. Prepare your input file according to the format DeepSP_input.csv
20 | 2. Run the python file deepsp_predictor.py - 'python deepsp_predictor.py'
21 | 3. The DeepSP structural properties for the input sequences will be computed and saved to a CSV file - 'DeepSP_descriptors.csv'.
22 |
23 |
24 | # Citation
25 |
26 | Kalejaye, L.; Wu, I.-E.; Terry, T.; Lai, P.-K. DeepSP: Deep Learning-Based Spatial Properties to Predict Monoclonal Antibody Stability. *Comput. Struct. Biotechnol. J.* 2024, 23, 2220–2229 (https://doi.org/10.1016/j.csbj.2024.05.029)
--------------------------------------------------------------------------------
/deepsp_predictor.py:
--------------------------------------------------------------------------------
1 | # Import libraries
2 | import os
3 | os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
5 | import numpy as np
6 | import pandas as pd
7 | import random
8 | from numpy.random import seed
9 |
10 | # Import machine learning libraries
11 | import tensorflow as tf
12 | from tensorflow.keras.models import model_from_json
13 |
14 | import keras
15 | from keras.models import model_from_json
16 |
17 | from Bio import SeqIO
18 | from Bio.Seq import Seq
19 | from Bio.SeqRecord import SeqRecord
20 |
21 |
# Import dataset
dataset = pd.read_csv('DeepSP_input.csv')  # replace with your csv file, see format in DeepSP_input.csv file
# Cast names to str: pandas parses numeric-looking names as numbers, while FASTA
# record ids are text (Biopython's SeqRecord expects a str id).
name = dataset['Name'].astype(str).to_list()
Heavy_seq = dataset['Heavy_Chain'].to_list()
Light_seq = dataset['Light_Chain'].to_list()


def _write_fasta(fasta_path, seq_names, sequences):
    """Write one FASTA record per (name, sequence) pair to ``fasta_path``.

    Records are written in input order with empty name/description fields,
    so each header line carries only the sequence id.
    """
    records = (
        SeqRecord(Seq(sequence), id=seq_name, name="", description="")
        for seq_name, sequence in zip(seq_names, sequences)
    )
    with open(fasta_path, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")


def _run_anarci(command):
    """Run an ANARCI command line and raise if it exits with a non-zero status.

    Without this check a failed run would go unnoticed and the CSV reads
    below would either fail with an unrelated error or pick up files left
    over from a previous run.
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            "ANARCI failed (exit status %s) while running: %s" % (status, command)
        )


# Convert to Fasta File
_write_fasta('seq_H.fasta', name, Heavy_seq)
_write_fasta('seq_L.fasta', name, Light_seq)

# sequence alignment with ANARCI (IMGT scheme, CSV output with prefix 'seq_aligned')
_run_anarci('ANARCI -i seq_H.fasta -o seq_aligned -s imgt -r heavy --csv')
_run_anarci('ANARCI -i seq_L.fasta -o seq_aligned -s imgt -r light --csv')

# Reading the two CSVs here fails early if ANARCI did not produce them.
# seq_preprocessing() re-reads the same files itself.
H_aligned = pd.read_csv('seq_aligned_H.csv')
L_aligned = pd.read_csv('seq_aligned_KL.csv')
61 |
#sequence preprocessing: source - https://github.com/Lailabcode/DeepSCM/blob/main/deepscm-master/seq_preprocessing.py
def seq_preprocessing(heavy_csv='seq_aligned_H.csv', light_csv='seq_aligned_KL.csv',
                      out_path='seq_aligned_HL.txt'):
    """Merge the heavy- and light-chain alignment CSVs into fixed-width strings.

    For every row, the residues found in the numbered position columns are
    placed into a 145-slot heavy-chain buffer and a 127-slot light-chain
    buffer (unfilled slots stay '-'). The two buffers are concatenated and
    written to ``out_path`` as ``"<id> <272-character string>"``, one line
    per row. The id is taken from the first column of the heavy-chain CSV.

    Parameters
    ----------
    heavy_csv, light_csv : str
        Paths of the heavy- and light-chain CSV files. Both must contain an
        ``Id`` column; columns whose names are not in the inclusion lists
        below are ignored.
    out_path : str
        Path of the text file to write.

    Raises
    ------
    ValueError
        If the ``Id`` columns of the two CSVs differ. Rows are paired by
        position, so a missing or reordered row in either file would
        otherwise pair a heavy chain with the wrong light chain.
    """
    # Heavy-chain slot order: 1..111, insertions 111A..111H, then 112I..112A
    # (descending), then 112..128 -> 145 slots in total.
    H_inclusion_list = (
        [str(pos) for pos in range(1, 112)]
        + ['111' + letter for letter in 'ABCDEFGH']
        + ['112' + letter for letter in 'IHGFEDCBA']
        + [str(pos) for pos in range(112, 129)]
    )
    # Light-chain slot order: 1..127, no insertion columns -> 127 slots.
    L_inclusion_list = [str(pos) for pos in range(1, 128)]

    # Read Id as text so numeric-looking ids keep their exact spelling.
    infile_H = pd.read_csv(heavy_csv, dtype={'Id': str})
    infile_L = pd.read_csv(light_csv, dtype={'Id': str})

    if not infile_H['Id'].equals(infile_L['Id']):
        raise ValueError(
            "Heavy- and light-chain alignments do not list the same Ids in the "
            "same order (%d heavy rows, %d light rows); cannot pair chains."
            % (len(infile_H), len(infile_L))
        )

    heavy_rows = _aligned_rows(infile_H, H_inclusion_list)
    light_rows = _aligned_rows(infile_L, L_inclusion_list)

    with open(out_path, "w") as outfile:
        for seq_id, h_row, l_row in zip(infile_H.iloc[:, 0], heavy_rows, light_rows):
            outfile.write(str(seq_id) + " " + ''.join(h_row + l_row))
            outfile.write("\n")
    return


def _aligned_rows(frame, inclusion_list):
    """Yield, per row of ``frame``, a list with one residue per inclusion slot.

    A slot's index is its position in ``inclusion_list``; slots whose column
    is absent from ``frame`` keep the gap character '-'.
    """
    slot_of = {position: index for index, position in enumerate(inclusion_list)}
    kept_columns = [col for col in frame.columns if col in slot_of]
    slots = [slot_of[col] for col in kept_columns]
    # Select the relevant columns once instead of indexing cell by cell.
    for values in frame[kept_columns].to_numpy():
        row = len(inclusion_list) * ['-']
        for slot, residue in zip(slots, values):
            row[slot] = residue
        yield row
149 |
150 | seq_preprocessing()  # reads seq_aligned_H.csv / seq_aligned_KL.csv and writes seq_aligned_HL.txt
151 |
# Read Aligned Sequence
def load_input_data(filename):
    """Read ``"<name> <sequence>"`` lines and return the names and sequences.

    The sequence is taken as the last whitespace-separated token of a line
    and everything before it as the name, so names that contain spaces are
    kept intact. Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    (list of str, list of str)
        ``name_list`` and ``seq_list``, in file order.

    Raises
    ------
    ValueError
        If a non-blank line does not contain both a name and a sequence.
    """
    name_list = []
    seq_list = []
    with open(filename) as datafile:
        for line_no, line in enumerate(datafile, start=1):
            line = line.strip()
            if not line:
                continue
            # Split once from the right: the sequence itself has no spaces.
            fields = line.rsplit(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    "%s, line %d: expected '<name> <sequence>', got %r"
                    % (filename, line_no, line)
                )
            name_list.append(fields[0])
            seq_list.append(fields[1])
    return name_list, seq_list
162 |
163 | name_list, seq_list = load_input_data('seq_aligned_HL.txt')  # names and aligned sequences, in file order
164 | X = seq_list  # raw aligned strings for now; X is re-bound to the one-hot encoded array below
165 |
# One Hot Encoding of Aligned Sequence
def one_hot_encoder(s):
    """Return the one-hot matrix of sequence ``s`` with shape (21, len(s)).

    Rows follow the order of the 20 amino-acid letters ``ACDEFGHIKLMNPQRSTVWY``
    with the gap character '-' as the last row; column ``j`` has a single 1 in
    the row of ``s[j]``. A character outside that alphabet raises ``KeyError``.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    row_of = {residue: row for row, residue in enumerate(alphabet)}

    encoded = np.zeros((len(alphabet), len(s)))
    for column, residue in enumerate(s):
        encoded[row_of[residue], column] = 1

    return encoded
# Fail with a clear message instead of a transpose error when nothing was loaded.
if len(X) == 0:
    raise ValueError("No aligned sequences were loaded from 'seq_aligned_HL.txt'; nothing to predict.")

# Stack the per-sequence (21, length) encodings and swap the last two axes,
# giving an array of shape (n_sequences, length, 21).
X = np.transpose(np.asarray([one_hot_encoder(s=x) for x in X]), (0, 2, 1))


# Predict DeepSP Descriptors
def _predict_property(json_path, weights_path, encoded):
    """Rebuild a model from its JSON architecture and weight file, then predict.

    Parameters
    ----------
    json_path : str
        Path of the model architecture JSON.
    weights_path : str
        Path of the matching weights file.
    encoded : numpy.ndarray
        One-hot encoded input passed to ``model.predict``.

    Returns
    -------
    The value returned by ``model.predict(encoded)``.
    """
    with open(json_path, 'r') as json_file:
        model = model_from_json(json_file.read())
    # load weights into model
    model.load_weights(weights_path)
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
    return model.predict(encoded)


# sappos
sap_pos = _predict_property('Conv1D_regressionSAPpos.json', 'Conv1D_regression_SAPpos.h5', X)

# scmneg
scm_neg = _predict_property('Conv1D_regressionSCMneg.json', 'Conv1D_regression_SCMneg.h5', X)

# scmpos
scm_pos = _predict_property('Conv1D_regressionSCMpos.json', 'Conv1D_regression_SCMpos.h5', X)

# Column names: 'Name' followed by the 10 regions for each of the three
# properties, in the same order as the prediction blocks concatenated below.
regions = ['CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3', 'CDR', 'Hv', 'Lv', 'Fv']
features = ['Name'] + [
    prop + '_' + region
    for prop in ('SAP_pos', 'SCM_neg', 'SCM_pos')
    for region in regions
]

df = pd.concat(
    [pd.DataFrame(name_list), pd.DataFrame(sap_pos), pd.DataFrame(scm_neg), pd.DataFrame(scm_pos)],
    ignore_index=True,
    axis=1,
)
df.columns = features
df.to_csv('DeepSP_descriptors.csv', index=False)
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: web2
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - defaults
6 | dependencies:
7 | - anarci=2021.02.04
8 | - biopython=1.78
9 | - hmmer=3.3.2
10 | - keras=2.12.0
11 | - keras-preprocessing=1.1.2
12 | - numpy=1.23.5
13 | - numpy-base=1.23.5
14 | - pandas=2.0.3
15 | - python=3.11.5
16 | - tensorflow=2.12.0
17 | - tensorflow-base=2.12.0
18 | - tensorflow-estimator=2.12.0
19 | prefix: /home/pklai/anaconda3/envs/web2
20 |
--------------------------------------------------------------------------------