├── 1. Analyse UniRef50 and AFDB darkness.ipynb
├── 2. Analyse AFDB90 graph.ipynb
├── 3. UniProt name assessement.ipynb
├── 4. Outlier_Analysis.ipynb
├── LICENSE
├── README.md
├── plots
│   ├── AFDB90v4_component_darkness_histogram.pdf
│   ├── AFDB90v4_component_darkness_histogram.png
│   ├── AFDB90v4_histogram_dark_content.pdf
│   ├── AFDB90v4_histogram_dark_content.png
│   ├── AFDBv4_uniref50_histogram_components_word_diversity.pdf
│   ├── AFDBv4_uniref50_histogram_components_word_diversity.png
│   ├── AFDBv4_uniref50_histogram_dark_content.pdf
│   ├── AFDBv4_uniref50_histogram_dark_content.png
│   ├── community_cosmograph_layout_darkness.png
│   ├── component159_community_cosmograph_layout_darkness.png
│   └── outliers.pdf
└── scripts
    ├── AFDBv4_DUF_analysis_dark.py
    ├── AFDBv4_pLDDT_analysis.py
    ├── get_communities_summary.py
    ├── get_connected_components.py
    ├── get_uniprot_taxonomy.py
    ├── make_communities_map.py
    ├── make_shapemers.py
    ├── sbatch_community_summary.sh
    ├── sbatch_connect_component_collection.sh
    ├── sbatch_make_communities_graph.sh
    └── sbatch_mmseqs_AFDB90_all-against-all.sh
/1. Analyse UniRef50 and AFDB darkness.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "strange-jewelry",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import os\n",
11 | "import pandas as pd\n",
12 | "import json\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import seaborn as sns\n",
15 | "import numpy as np\n",
16 | "import scipy"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "written-andrew",
22 | "metadata": {},
23 | "source": [
24 | "# 1. Analyse the darkness content of UniRef50 and AlphaFold DB (v4)"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "solar-joining",
30 | "metadata": {},
31 | "source": [
32 | "### 1.1. Load data\n",
33 | "First, run:\n",
34 | "\n",
35 | "`python3 scripts/AFDBv4_pLDDT_analysis.py UniRef50`\n",
36 | "\n",
37 | "This will generate the file `data_generated/AFDBv4_pLDDT_diggestion.csv`. The corresponding for the AFDB90v4 paper, is `data_generated/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv`, which we load in the next cell.\n",
38 | "\n",
39 | "This file contains the per UniRef50 cluster data on functional darkness, cluster size, median evidence score for the proteins contained, and pLDDT distributions in the AlphaFold database if there is at least one member of the cluster in AFDB. This table refers to ALL Uniref50 clusters and not only those with members in AFDB. \n",
40 | "\n",
41 | "The column corresponding to functional darkness is \"FULL_noDUF\", which reads \"full annotation coverage, excluding DUFs and related terms\"."
42 | ]
43 | },
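{
"cell_type": "markdown",
"id": "added-dtype-note",
"metadata": {},
"source": [
"Note that the next cell emits a pandas `DtypeWarning` because columns 2 and 4 contain mixed types. A minimal sketch of how to avoid it, assuming the same CSV path as below (`low_memory=False` makes pandas parse the file in one pass, so dtype inference sees all rows at once):\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"indata = pd.read_csv(\n",
"    'data_generated_v2/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv',\n",
"    low_memory=False,  # one-pass parsing avoids chunked dtype inference\n",
")\n",
"```"
]
},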
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "id": "fatal-beijing",
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "name": "stderr",
52 | "output_type": "stream",
53 | "text": [
54 | "/scicore/home/schwede/soares0000/projects/dark_protein_universe/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3072: DtypeWarning: Columns (2,4) have mixed types.Specify dtype option on import or set low_memory=False.\n",
55 | " interactivity=interactivity, compiler=compiler, result=result)\n"
56 | ]
57 | },
58 | {
59 | "data": {
60 | "text/html": [
61 | "
\n",
62 | "\n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " | \n",
79 | " AF2_REP_best | \n",
80 | " AF2_REP_best_len | \n",
81 | " AF2_REP_worst | \n",
82 | " AF2_REP_worst_len | \n",
83 | " AF2_longest_best70 | \n",
84 | " AF2_longest_best70_len | \n",
85 | " AF2_longest_best70_pLDDT | \n",
86 | " FULL_noDUF | \n",
87 | " REP | \n",
88 | " SP | \n",
89 | " TM | \n",
90 | " delta_pLDDT | \n",
91 | " max_pLDDT | \n",
92 | " median_Evidence | \n",
93 | " median_pLDDT | \n",
94 | " min_pLDDT | \n",
95 | " nACCs | \n",
96 | " nAF2 | \n",
97 | " nUniRef100 | \n",
98 | " nUniRef90 | \n",
99 | "
\n",
100 | " \n",
101 | " | unirefID | \n",
102 | " | \n",
103 | " | \n",
104 | " | \n",
105 | " | \n",
106 | " | \n",
107 | " | \n",
108 | " | \n",
109 | " | \n",
110 | " | \n",
111 | " | \n",
112 | " | \n",
113 | " | \n",
114 | " | \n",
115 | " | \n",
116 | " | \n",
117 | " | \n",
118 | " | \n",
119 | " | \n",
120 | " | \n",
121 | " | \n",
122 | "
\n",
123 | " \n",
124 | " \n",
125 | " \n",
126 | " | UniRef50_A0A007 | \n",
127 | " A0A007 | \n",
128 | " 407.0 | \n",
129 | " NaN | \n",
130 | " NaN | \n",
131 | " A0A007 | \n",
132 | " 407.0 | \n",
133 | " 88.248698 | \n",
134 | " 96.81 | \n",
135 | " A0A007 | \n",
136 | " 0 | \n",
137 | " 0 | \n",
138 | " 0.000000 | \n",
139 | " 88.248698 | \n",
140 | " 4.0 | \n",
141 | " 88.248698 | \n",
142 | " 88.248698 | \n",
143 | " 1 | \n",
144 | " 1 | \n",
145 | " 1 | \n",
146 | " 1 | \n",
147 | "
\n",
148 | " \n",
149 | " | UniRef50_A0A009DWD5 | \n",
150 | " A0A009DWD5 | \n",
151 | " 39.0 | \n",
152 | " NaN | \n",
153 | " NaN | \n",
154 | " A0A009DWD5 | \n",
155 | " 39.0 | \n",
156 | " 71.991282 | \n",
157 | " 0.00 | \n",
158 | " NaN | \n",
159 | " 0 | \n",
160 | " 0 | \n",
161 | " 0.000000 | \n",
162 | " 71.991282 | \n",
163 | " 4.0 | \n",
164 | " 71.991282 | \n",
165 | " 71.991282 | \n",
166 | " 1 | \n",
167 | " 1 | \n",
168 | " 1 | \n",
169 | " 1 | \n",
170 | "
\n",
171 | " \n",
172 | " | UniRef50_A0A009DWJ5 | \n",
173 | " A0A009DWJ5 | \n",
174 | " 47.0 | \n",
175 | " NaN | \n",
176 | " NaN | \n",
177 | " A0A009DWJ5 | \n",
178 | " 47.0 | \n",
179 | " 80.863404 | \n",
180 | " 80.85 | \n",
181 | " A0A009DWJ5 | \n",
182 | " 0 | \n",
183 | " 0 | \n",
184 | " 0.000000 | \n",
185 | " 80.863404 | \n",
186 | " 4.0 | \n",
187 | " 80.863404 | \n",
188 | " 80.863404 | \n",
189 | " 1 | \n",
190 | " 1 | \n",
191 | " 1 | \n",
192 | " 1 | \n",
193 | "
\n",
194 | " \n",
195 | " | UniRef50_A0A009DWL0 | \n",
196 | " A0A009DWL0 | \n",
197 | " 76.0 | \n",
198 | " NaN | \n",
199 | " NaN | \n",
200 | " NaN | \n",
201 | " NaN | \n",
202 | " NaN | \n",
203 | " 98.67 | \n",
204 | " UPI0018888667 | \n",
205 | " 0 | \n",
206 | " 0 | \n",
207 | " 0.000000 | \n",
208 | " 52.397763 | \n",
209 | " 4.0 | \n",
210 | " 52.397763 | \n",
211 | " 52.397763 | \n",
212 | " 3 | \n",
213 | " 1 | \n",
214 | " 3 | \n",
215 | " 3 | \n",
216 | "
\n",
217 | " \n",
218 | " | UniRef50_A0A009DY31 | \n",
219 | " A0A2D5KFP4 | \n",
220 | " 46.0 | \n",
221 | " A0A0K6IRR6 | \n",
222 | " 57.0 | \n",
223 | " A0A009DY31 | \n",
224 | " 48.0 | \n",
225 | " 70.708333 | \n",
226 | " 93.75 | \n",
227 | " A0A009DY31 | \n",
228 | " 0 | \n",
229 | " 0 | \n",
230 | " -23.334962 | \n",
231 | " 83.109348 | \n",
232 | " 4.0 | \n",
233 | " 76.051401 | \n",
234 | " 59.774386 | \n",
235 | " 4 | \n",
236 | " 4 | \n",
237 | " 4 | \n",
238 | " 4 | \n",
239 | "
\n",
240 | " \n",
241 | " | ... | \n",
242 | " ... | \n",
243 | " ... | \n",
244 | " ... | \n",
245 | " ... | \n",
246 | " ... | \n",
247 | " ... | \n",
248 | " ... | \n",
249 | " ... | \n",
250 | " ... | \n",
251 | " ... | \n",
252 | " ... | \n",
253 | " ... | \n",
254 | " ... | \n",
255 | " ... | \n",
256 | " ... | \n",
257 | " ... | \n",
258 | " ... | \n",
259 | " ... | \n",
260 | " ... | \n",
261 | " ... | \n",
262 | "
\n",
263 | " \n",
264 | " | UniRef50_Z9JYV3 | \n",
265 | " Z9JYV3 | \n",
266 | " 151.0 | \n",
267 | " NaN | \n",
268 | " NaN | \n",
269 | " NaN | \n",
270 | " NaN | \n",
271 | " NaN | \n",
272 | " 0.00 | \n",
273 | " NaN | \n",
274 | " 0 | \n",
275 | " 0 | \n",
276 | " 0.000000 | \n",
277 | " 64.583444 | \n",
278 | " 4.0 | \n",
279 | " 64.583444 | \n",
280 | " 64.583444 | \n",
281 | " 1 | \n",
282 | " 1 | \n",
283 | " 1 | \n",
284 | " 1 | \n",
285 | "
\n",
286 | " \n",
287 | " | UniRef50_Z9JYV5 | \n",
288 | " Z9JYV5 | \n",
289 | " 211.0 | \n",
290 | " NaN | \n",
291 | " NaN | \n",
292 | " NaN | \n",
293 | " NaN | \n",
294 | " NaN | \n",
295 | " 99.53 | \n",
296 | " Z9JYV5 | \n",
297 | " 0 | \n",
298 | " 0 | \n",
299 | " 0.000000 | \n",
300 | " 45.124692 | \n",
301 | " 4.0 | \n",
302 | " 45.124692 | \n",
303 | " 45.124692 | \n",
304 | " 1 | \n",
305 | " 1 | \n",
306 | " 1 | \n",
307 | " 1 | \n",
308 | "
\n",
309 | " \n",
310 | " | UniRef50_Z9JYW2 | \n",
311 | " Z9JYW2 | \n",
312 | " 261.0 | \n",
313 | " NaN | \n",
314 | " NaN | \n",
315 | " Z9JYW2 | \n",
316 | " 261.0 | \n",
317 | " 73.070268 | \n",
318 | " 99.62 | \n",
319 | " Z9JYW2 | \n",
320 | " 0 | \n",
321 | " 0 | \n",
322 | " 0.000000 | \n",
323 | " 73.070268 | \n",
324 | " 4.0 | \n",
325 | " 73.070268 | \n",
326 | " 73.070268 | \n",
327 | " 1 | \n",
328 | " 1 | \n",
329 | " 1 | \n",
330 | " 1 | \n",
331 | "
\n",
332 | " \n",
333 | " | UniRef50_Z9JYW9 | \n",
334 | " Z9JYW9 | \n",
335 | " 171.0 | \n",
336 | " NaN | \n",
337 | " NaN | \n",
338 | " NaN | \n",
339 | " NaN | \n",
340 | " NaN | \n",
341 | " 78.36 | \n",
342 | " Z9JYW9 | \n",
343 | " 0 | \n",
344 | " 0 | \n",
345 | " 0.000000 | \n",
346 | " 34.725965 | \n",
347 | " 4.0 | \n",
348 | " 34.725965 | \n",
349 | " 34.725965 | \n",
350 | " 1 | \n",
351 | " 1 | \n",
352 | " 1 | \n",
353 | " 1 | \n",
354 | "
\n",
355 | " \n",
356 | " | UniRef50_Z9JZ05 | \n",
357 | " A0A7X9C7W4 | \n",
358 | " 297.0 | \n",
359 | " A0A1B0ZIR3 | \n",
360 | " 340.0 | \n",
361 | " A0A2N6U4Z5 | \n",
362 | " 341.0 | \n",
363 | " 86.651672 | \n",
364 | " 98.58 | \n",
365 | " UPI0015607FF1 | \n",
366 | " 0 | \n",
367 | " 0 | \n",
368 | " -4.939340 | \n",
369 | " 89.525017 | \n",
370 | " 4.0 | \n",
371 | " 86.824971 | \n",
372 | " 84.585676 | \n",
373 | " 58 | \n",
374 | " 28 | \n",
375 | " 44 | \n",
376 | " 22 | \n",
377 | "
\n",
378 | " \n",
379 | "
\n",
380 | "
53625854 rows × 20 columns
\n",
381 | "
"
382 | ],
383 | "text/plain": [
384 | " AF2_REP_best AF2_REP_best_len AF2_REP_worst \\\n",
385 | "unirefID \n",
386 | "UniRef50_A0A007 A0A007 407.0 NaN \n",
387 | "UniRef50_A0A009DWD5 A0A009DWD5 39.0 NaN \n",
388 | "UniRef50_A0A009DWJ5 A0A009DWJ5 47.0 NaN \n",
389 | "UniRef50_A0A009DWL0 A0A009DWL0 76.0 NaN \n",
390 | "UniRef50_A0A009DY31 A0A2D5KFP4 46.0 A0A0K6IRR6 \n",
391 | "... ... ... ... \n",
392 | "UniRef50_Z9JYV3 Z9JYV3 151.0 NaN \n",
393 | "UniRef50_Z9JYV5 Z9JYV5 211.0 NaN \n",
394 | "UniRef50_Z9JYW2 Z9JYW2 261.0 NaN \n",
395 | "UniRef50_Z9JYW9 Z9JYW9 171.0 NaN \n",
396 | "UniRef50_Z9JZ05 A0A7X9C7W4 297.0 A0A1B0ZIR3 \n",
397 | "\n",
398 | " AF2_REP_worst_len AF2_longest_best70 \\\n",
399 | "unirefID \n",
400 | "UniRef50_A0A007 NaN A0A007 \n",
401 | "UniRef50_A0A009DWD5 NaN A0A009DWD5 \n",
402 | "UniRef50_A0A009DWJ5 NaN A0A009DWJ5 \n",
403 | "UniRef50_A0A009DWL0 NaN NaN \n",
404 | "UniRef50_A0A009DY31 57.0 A0A009DY31 \n",
405 | "... ... ... \n",
406 | "UniRef50_Z9JYV3 NaN NaN \n",
407 | "UniRef50_Z9JYV5 NaN NaN \n",
408 | "UniRef50_Z9JYW2 NaN Z9JYW2 \n",
409 | "UniRef50_Z9JYW9 NaN NaN \n",
410 | "UniRef50_Z9JZ05 340.0 A0A2N6U4Z5 \n",
411 | "\n",
412 | " AF2_longest_best70_len AF2_longest_best70_pLDDT \\\n",
413 | "unirefID \n",
414 | "UniRef50_A0A007 407.0 88.248698 \n",
415 | "UniRef50_A0A009DWD5 39.0 71.991282 \n",
416 | "UniRef50_A0A009DWJ5 47.0 80.863404 \n",
417 | "UniRef50_A0A009DWL0 NaN NaN \n",
418 | "UniRef50_A0A009DY31 48.0 70.708333 \n",
419 | "... ... ... \n",
420 | "UniRef50_Z9JYV3 NaN NaN \n",
421 | "UniRef50_Z9JYV5 NaN NaN \n",
422 | "UniRef50_Z9JYW2 261.0 73.070268 \n",
423 | "UniRef50_Z9JYW9 NaN NaN \n",
424 | "UniRef50_Z9JZ05 341.0 86.651672 \n",
425 | "\n",
426 | " FULL_noDUF REP SP TM delta_pLDDT \\\n",
427 | "unirefID \n",
428 | "UniRef50_A0A007 96.81 A0A007 0 0 0.000000 \n",
429 | "UniRef50_A0A009DWD5 0.00 NaN 0 0 0.000000 \n",
430 | "UniRef50_A0A009DWJ5 80.85 A0A009DWJ5 0 0 0.000000 \n",
431 | "UniRef50_A0A009DWL0 98.67 UPI0018888667 0 0 0.000000 \n",
432 | "UniRef50_A0A009DY31 93.75 A0A009DY31 0 0 -23.334962 \n",
433 | "... ... ... .. .. ... \n",
434 | "UniRef50_Z9JYV3 0.00 NaN 0 0 0.000000 \n",
435 | "UniRef50_Z9JYV5 99.53 Z9JYV5 0 0 0.000000 \n",
436 | "UniRef50_Z9JYW2 99.62 Z9JYW2 0 0 0.000000 \n",
437 | "UniRef50_Z9JYW9 78.36 Z9JYW9 0 0 0.000000 \n",
438 | "UniRef50_Z9JZ05 98.58 UPI0015607FF1 0 0 -4.939340 \n",
439 | "\n",
440 | " max_pLDDT median_Evidence median_pLDDT min_pLDDT \\\n",
441 | "unirefID \n",
442 | "UniRef50_A0A007 88.248698 4.0 88.248698 88.248698 \n",
443 | "UniRef50_A0A009DWD5 71.991282 4.0 71.991282 71.991282 \n",
444 | "UniRef50_A0A009DWJ5 80.863404 4.0 80.863404 80.863404 \n",
445 | "UniRef50_A0A009DWL0 52.397763 4.0 52.397763 52.397763 \n",
446 | "UniRef50_A0A009DY31 83.109348 4.0 76.051401 59.774386 \n",
447 | "... ... ... ... ... \n",
448 | "UniRef50_Z9JYV3 64.583444 4.0 64.583444 64.583444 \n",
449 | "UniRef50_Z9JYV5 45.124692 4.0 45.124692 45.124692 \n",
450 | "UniRef50_Z9JYW2 73.070268 4.0 73.070268 73.070268 \n",
451 | "UniRef50_Z9JYW9 34.725965 4.0 34.725965 34.725965 \n",
452 | "UniRef50_Z9JZ05 89.525017 4.0 86.824971 84.585676 \n",
453 | "\n",
454 | " nACCs nAF2 nUniRef100 nUniRef90 \n",
455 | "unirefID \n",
456 | "UniRef50_A0A007 1 1 1 1 \n",
457 | "UniRef50_A0A009DWD5 1 1 1 1 \n",
458 | "UniRef50_A0A009DWJ5 1 1 1 1 \n",
459 | "UniRef50_A0A009DWL0 3 1 3 3 \n",
460 | "UniRef50_A0A009DY31 4 4 4 4 \n",
461 | "... ... ... ... ... \n",
462 | "UniRef50_Z9JYV3 1 1 1 1 \n",
463 | "UniRef50_Z9JYV5 1 1 1 1 \n",
464 | "UniRef50_Z9JYW2 1 1 1 1 \n",
465 | "UniRef50_Z9JYW9 1 1 1 1 \n",
466 | "UniRef50_Z9JZ05 58 28 44 22 \n",
467 | "\n",
468 | "[53625854 rows x 20 columns]"
469 | ]
470 | },
471 | "execution_count": 2,
472 | "metadata": {},
473 | "output_type": "execute_result"
474 | }
475 | ],
476 | "source": [
477 | "indata = 'data_generated_v2/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv'\n",
478 | "indata = pd.read_csv(indata)\n",
479 | "indata = indata.sort_values(by='unirefID')\n",
480 | "indata = indata.set_index(\"unirefID\")\n",
481 | "indata = indata[:-1]\n",
482 | "indata"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": 7,
488 | "id": "norman-brunei",
489 | "metadata": {},
490 | "outputs": [
491 | {
492 | "data": {
493 | "text/html": [
494 | "\n",
495 | "\n",
508 | "
\n",
509 | " \n",
510 | " \n",
511 | " | \n",
512 | " AF2_REP_best | \n",
513 | " AF2_REP_best_len | \n",
514 | " AF2_REP_worst | \n",
515 | " AF2_REP_worst_len | \n",
516 | " AF2_longest_best70 | \n",
517 | " AF2_longest_best70_len | \n",
518 | " AF2_longest_best70_pLDDT | \n",
519 | " FULL_noDUF | \n",
520 | " REP | \n",
521 | " SP | \n",
522 | " ... | \n",
523 | " delta_pLDDT | \n",
524 | " max_pLDDT | \n",
525 | " median_Evidence | \n",
526 | " median_pLDDT | \n",
527 | " min_pLDDT | \n",
528 | " nACCs | \n",
529 | " nAF2 | \n",
530 | " nUniRef100 | \n",
531 | " nUniRef90 | \n",
532 | " darkness_bins | \n",
533 | "
\n",
534 | " \n",
535 | " | unirefID | \n",
536 | " | \n",
537 | " | \n",
538 | " | \n",
539 | " | \n",
540 | " | \n",
541 | " | \n",
542 | " | \n",
543 | " | \n",
544 | " | \n",
545 | " | \n",
546 | " | \n",
547 | " | \n",
548 | " | \n",
549 | " | \n",
550 | " | \n",
551 | " | \n",
552 | " | \n",
553 | " | \n",
554 | " | \n",
555 | " | \n",
556 | " | \n",
557 | "
\n",
558 | " \n",
559 | " \n",
560 | " \n",
561 | " | UniRef50_A0A007 | \n",
562 | " A0A007 | \n",
563 | " 407.0 | \n",
564 | " NaN | \n",
565 | " NaN | \n",
566 | " A0A007 | \n",
567 | " 407.0 | \n",
568 | " 88.248698 | \n",
569 | " 96.81 | \n",
570 | " A0A007 | \n",
571 | " 0 | \n",
572 | " ... | \n",
573 | " 0.000000 | \n",
574 | " 88.248698 | \n",
575 | " 4.0 | \n",
576 | " 88.248698 | \n",
577 | " 88.248698 | \n",
578 | " 1 | \n",
579 | " 1 | \n",
580 | " 1 | \n",
581 | " 1 | \n",
582 | " (95.0, 100.0] | \n",
583 | "
\n",
584 | " \n",
585 | " | UniRef50_A0A009DWD5 | \n",
586 | " A0A009DWD5 | \n",
587 | " 39.0 | \n",
588 | " NaN | \n",
589 | " NaN | \n",
590 | " A0A009DWD5 | \n",
591 | " 39.0 | \n",
592 | " 71.991282 | \n",
593 | " 0.00 | \n",
594 | " NaN | \n",
595 | " 0 | \n",
596 | " ... | \n",
597 | " 0.000000 | \n",
598 | " 71.991282 | \n",
599 | " 4.0 | \n",
600 | " 71.991282 | \n",
601 | " 71.991282 | \n",
602 | " 1 | \n",
603 | " 1 | \n",
604 | " 1 | \n",
605 | " 1 | \n",
606 | " (-0.001, 5.0] | \n",
607 | "
\n",
608 | " \n",
609 | " | UniRef50_A0A009DWJ5 | \n",
610 | " A0A009DWJ5 | \n",
611 | " 47.0 | \n",
612 | " NaN | \n",
613 | " NaN | \n",
614 | " A0A009DWJ5 | \n",
615 | " 47.0 | \n",
616 | " 80.863404 | \n",
617 | " 80.85 | \n",
618 | " A0A009DWJ5 | \n",
619 | " 0 | \n",
620 | " ... | \n",
621 | " 0.000000 | \n",
622 | " 80.863404 | \n",
623 | " 4.0 | \n",
624 | " 80.863404 | \n",
625 | " 80.863404 | \n",
626 | " 1 | \n",
627 | " 1 | \n",
628 | " 1 | \n",
629 | " 1 | \n",
630 | " (80.0, 85.0] | \n",
631 | "
\n",
632 | " \n",
633 | " | UniRef50_A0A009DWL0 | \n",
634 | " A0A009DWL0 | \n",
635 | " 76.0 | \n",
636 | " NaN | \n",
637 | " NaN | \n",
638 | " NaN | \n",
639 | " NaN | \n",
640 | " NaN | \n",
641 | " 98.67 | \n",
642 | " UPI0018888667 | \n",
643 | " 0 | \n",
644 | " ... | \n",
645 | " 0.000000 | \n",
646 | " 52.397763 | \n",
647 | " 4.0 | \n",
648 | " 52.397763 | \n",
649 | " 52.397763 | \n",
650 | " 3 | \n",
651 | " 1 | \n",
652 | " 3 | \n",
653 | " 3 | \n",
654 | " (95.0, 100.0] | \n",
655 | "
\n",
656 | " \n",
657 | " | UniRef50_A0A009DY31 | \n",
658 | " A0A2D5KFP4 | \n",
659 | " 46.0 | \n",
660 | " A0A0K6IRR6 | \n",
661 | " 57.0 | \n",
662 | " A0A009DY31 | \n",
663 | " 48.0 | \n",
664 | " 70.708333 | \n",
665 | " 93.75 | \n",
666 | " A0A009DY31 | \n",
667 | " 0 | \n",
668 | " ... | \n",
669 | " -23.334962 | \n",
670 | " 83.109348 | \n",
671 | " 4.0 | \n",
672 | " 76.051401 | \n",
673 | " 59.774386 | \n",
674 | " 4 | \n",
675 | " 4 | \n",
676 | " 4 | \n",
677 | " 4 | \n",
678 | " (90.0, 95.0] | \n",
679 | "
\n",
680 | " \n",
681 | " | ... | \n",
682 | " ... | \n",
683 | " ... | \n",
684 | " ... | \n",
685 | " ... | \n",
686 | " ... | \n",
687 | " ... | \n",
688 | " ... | \n",
689 | " ... | \n",
690 | " ... | \n",
691 | " ... | \n",
692 | " ... | \n",
693 | " ... | \n",
694 | " ... | \n",
695 | " ... | \n",
696 | " ... | \n",
697 | " ... | \n",
698 | " ... | \n",
699 | " ... | \n",
700 | " ... | \n",
701 | " ... | \n",
702 | " ... | \n",
703 | "
\n",
704 | " \n",
705 | " | UniRef50_Z9JYV3 | \n",
706 | " Z9JYV3 | \n",
707 | " 151.0 | \n",
708 | " NaN | \n",
709 | " NaN | \n",
710 | " NaN | \n",
711 | " NaN | \n",
712 | " NaN | \n",
713 | " 0.00 | \n",
714 | " NaN | \n",
715 | " 0 | \n",
716 | " ... | \n",
717 | " 0.000000 | \n",
718 | " 64.583444 | \n",
719 | " 4.0 | \n",
720 | " 64.583444 | \n",
721 | " 64.583444 | \n",
722 | " 1 | \n",
723 | " 1 | \n",
724 | " 1 | \n",
725 | " 1 | \n",
726 | " (-0.001, 5.0] | \n",
727 | "
\n",
728 | " \n",
729 | " | UniRef50_Z9JYV5 | \n",
730 | " Z9JYV5 | \n",
731 | " 211.0 | \n",
732 | " NaN | \n",
733 | " NaN | \n",
734 | " NaN | \n",
735 | " NaN | \n",
736 | " NaN | \n",
737 | " 99.53 | \n",
738 | " Z9JYV5 | \n",
739 | " 0 | \n",
740 | " ... | \n",
741 | " 0.000000 | \n",
742 | " 45.124692 | \n",
743 | " 4.0 | \n",
744 | " 45.124692 | \n",
745 | " 45.124692 | \n",
746 | " 1 | \n",
747 | " 1 | \n",
748 | " 1 | \n",
749 | " 1 | \n",
750 | " (95.0, 100.0] | \n",
751 | "
\n",
752 | " \n",
753 | " | UniRef50_Z9JYW2 | \n",
754 | " Z9JYW2 | \n",
755 | " 261.0 | \n",
756 | " NaN | \n",
757 | " NaN | \n",
758 | " Z9JYW2 | \n",
759 | " 261.0 | \n",
760 | " 73.070268 | \n",
761 | " 99.62 | \n",
762 | " Z9JYW2 | \n",
763 | " 0 | \n",
764 | " ... | \n",
765 | " 0.000000 | \n",
766 | " 73.070268 | \n",
767 | " 4.0 | \n",
768 | " 73.070268 | \n",
769 | " 73.070268 | \n",
770 | " 1 | \n",
771 | " 1 | \n",
772 | " 1 | \n",
773 | " 1 | \n",
774 | " (95.0, 100.0] | \n",
775 | "
\n",
776 | " \n",
777 | " | UniRef50_Z9JYW9 | \n",
778 | " Z9JYW9 | \n",
779 | " 171.0 | \n",
780 | " NaN | \n",
781 | " NaN | \n",
782 | " NaN | \n",
783 | " NaN | \n",
784 | " NaN | \n",
785 | " 78.36 | \n",
786 | " Z9JYW9 | \n",
787 | " 0 | \n",
788 | " ... | \n",
789 | " 0.000000 | \n",
790 | " 34.725965 | \n",
791 | " 4.0 | \n",
792 | " 34.725965 | \n",
793 | " 34.725965 | \n",
794 | " 1 | \n",
795 | " 1 | \n",
796 | " 1 | \n",
797 | " 1 | \n",
798 | " (75.0, 80.0] | \n",
799 | "
\n",
800 | " \n",
801 | " | UniRef50_Z9JZ05 | \n",
802 | " A0A7X9C7W4 | \n",
803 | " 297.0 | \n",
804 | " A0A1B0ZIR3 | \n",
805 | " 340.0 | \n",
806 | " A0A2N6U4Z5 | \n",
807 | " 341.0 | \n",
808 | " 86.651672 | \n",
809 | " 98.58 | \n",
810 | " UPI0015607FF1 | \n",
811 | " 0 | \n",
812 | " ... | \n",
813 | " -4.939340 | \n",
814 | " 89.525017 | \n",
815 | " 4.0 | \n",
816 | " 86.824971 | \n",
817 | " 84.585676 | \n",
818 | " 58 | \n",
819 | " 28 | \n",
820 | " 44 | \n",
821 | " 22 | \n",
822 | " (95.0, 100.0] | \n",
823 | "
\n",
824 | " \n",
825 | "
\n",
826 | "
53625854 rows × 21 columns
\n",
827 | "
"
828 | ],
829 | "text/plain": [
830 | " AF2_REP_best AF2_REP_best_len AF2_REP_worst \\\n",
831 | "unirefID \n",
832 | "UniRef50_A0A007 A0A007 407.0 NaN \n",
833 | "UniRef50_A0A009DWD5 A0A009DWD5 39.0 NaN \n",
834 | "UniRef50_A0A009DWJ5 A0A009DWJ5 47.0 NaN \n",
835 | "UniRef50_A0A009DWL0 A0A009DWL0 76.0 NaN \n",
836 | "UniRef50_A0A009DY31 A0A2D5KFP4 46.0 A0A0K6IRR6 \n",
837 | "... ... ... ... \n",
838 | "UniRef50_Z9JYV3 Z9JYV3 151.0 NaN \n",
839 | "UniRef50_Z9JYV5 Z9JYV5 211.0 NaN \n",
840 | "UniRef50_Z9JYW2 Z9JYW2 261.0 NaN \n",
841 | "UniRef50_Z9JYW9 Z9JYW9 171.0 NaN \n",
842 | "UniRef50_Z9JZ05 A0A7X9C7W4 297.0 A0A1B0ZIR3 \n",
843 | "\n",
844 | " AF2_REP_worst_len AF2_longest_best70 \\\n",
845 | "unirefID \n",
846 | "UniRef50_A0A007 NaN A0A007 \n",
847 | "UniRef50_A0A009DWD5 NaN A0A009DWD5 \n",
848 | "UniRef50_A0A009DWJ5 NaN A0A009DWJ5 \n",
849 | "UniRef50_A0A009DWL0 NaN NaN \n",
850 | "UniRef50_A0A009DY31 57.0 A0A009DY31 \n",
851 | "... ... ... \n",
852 | "UniRef50_Z9JYV3 NaN NaN \n",
853 | "UniRef50_Z9JYV5 NaN NaN \n",
854 | "UniRef50_Z9JYW2 NaN Z9JYW2 \n",
855 | "UniRef50_Z9JYW9 NaN NaN \n",
856 | "UniRef50_Z9JZ05 340.0 A0A2N6U4Z5 \n",
857 | "\n",
858 | " AF2_longest_best70_len AF2_longest_best70_pLDDT \\\n",
859 | "unirefID \n",
860 | "UniRef50_A0A007 407.0 88.248698 \n",
861 | "UniRef50_A0A009DWD5 39.0 71.991282 \n",
862 | "UniRef50_A0A009DWJ5 47.0 80.863404 \n",
863 | "UniRef50_A0A009DWL0 NaN NaN \n",
864 | "UniRef50_A0A009DY31 48.0 70.708333 \n",
865 | "... ... ... \n",
866 | "UniRef50_Z9JYV3 NaN NaN \n",
867 | "UniRef50_Z9JYV5 NaN NaN \n",
868 | "UniRef50_Z9JYW2 261.0 73.070268 \n",
869 | "UniRef50_Z9JYW9 NaN NaN \n",
870 | "UniRef50_Z9JZ05 341.0 86.651672 \n",
871 | "\n",
872 | " FULL_noDUF REP SP ... delta_pLDDT \\\n",
873 | "unirefID ... \n",
874 | "UniRef50_A0A007 96.81 A0A007 0 ... 0.000000 \n",
875 | "UniRef50_A0A009DWD5 0.00 NaN 0 ... 0.000000 \n",
876 | "UniRef50_A0A009DWJ5 80.85 A0A009DWJ5 0 ... 0.000000 \n",
877 | "UniRef50_A0A009DWL0 98.67 UPI0018888667 0 ... 0.000000 \n",
878 | "UniRef50_A0A009DY31 93.75 A0A009DY31 0 ... -23.334962 \n",
879 | "... ... ... .. ... ... \n",
880 | "UniRef50_Z9JYV3 0.00 NaN 0 ... 0.000000 \n",
881 | "UniRef50_Z9JYV5 99.53 Z9JYV5 0 ... 0.000000 \n",
882 | "UniRef50_Z9JYW2 99.62 Z9JYW2 0 ... 0.000000 \n",
883 | "UniRef50_Z9JYW9 78.36 Z9JYW9 0 ... 0.000000 \n",
884 | "UniRef50_Z9JZ05 98.58 UPI0015607FF1 0 ... -4.939340 \n",
885 | "\n",
886 | " max_pLDDT median_Evidence median_pLDDT min_pLDDT \\\n",
887 | "unirefID \n",
888 | "UniRef50_A0A007 88.248698 4.0 88.248698 88.248698 \n",
889 | "UniRef50_A0A009DWD5 71.991282 4.0 71.991282 71.991282 \n",
890 | "UniRef50_A0A009DWJ5 80.863404 4.0 80.863404 80.863404 \n",
891 | "UniRef50_A0A009DWL0 52.397763 4.0 52.397763 52.397763 \n",
892 | "UniRef50_A0A009DY31 83.109348 4.0 76.051401 59.774386 \n",
893 | "... ... ... ... ... \n",
894 | "UniRef50_Z9JYV3 64.583444 4.0 64.583444 64.583444 \n",
895 | "UniRef50_Z9JYV5 45.124692 4.0 45.124692 45.124692 \n",
896 | "UniRef50_Z9JYW2 73.070268 4.0 73.070268 73.070268 \n",
897 | "UniRef50_Z9JYW9 34.725965 4.0 34.725965 34.725965 \n",
898 | "UniRef50_Z9JZ05 89.525017 4.0 86.824971 84.585676 \n",
899 | "\n",
900 | " nACCs nAF2 nUniRef100 nUniRef90 darkness_bins \n",
901 | "unirefID \n",
902 | "UniRef50_A0A007 1 1 1 1 (95.0, 100.0] \n",
903 | "UniRef50_A0A009DWD5 1 1 1 1 (-0.001, 5.0] \n",
904 | "UniRef50_A0A009DWJ5 1 1 1 1 (80.0, 85.0] \n",
905 | "UniRef50_A0A009DWL0 3 1 3 3 (95.0, 100.0] \n",
906 | "UniRef50_A0A009DY31 4 4 4 4 (90.0, 95.0] \n",
907 | "... ... ... ... ... ... \n",
908 | "UniRef50_Z9JYV3 1 1 1 1 (-0.001, 5.0] \n",
909 | "UniRef50_Z9JYV5 1 1 1 1 (95.0, 100.0] \n",
910 | "UniRef50_Z9JYW2 1 1 1 1 (95.0, 100.0] \n",
911 | "UniRef50_Z9JYW9 1 1 1 1 (75.0, 80.0] \n",
912 | "UniRef50_Z9JZ05 58 28 44 22 (95.0, 100.0] \n",
913 | "\n",
914 | "[53625854 rows x 21 columns]"
915 | ]
916 | },
917 | "execution_count": 7,
918 | "metadata": {},
919 | "output_type": "execute_result"
920 | }
921 | ],
922 | "source": [
923 | "indata['darkness_bins'] = pd.cut(indata['FULL_noDUF'].astype(float), bins=[i for i in range(0, 105, 5)], include_lowest=True)\n",
924 | "indata['median_Evidence'] = indata['median_Evidence'].fillna(0)\n",
925 | "indata"
926 | ]
927 | },
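{
"cell_type": "markdown",
"id": "added-binning-note",
"metadata": {},
"source": [
"`pd.cut` above builds right-closed, 5%-wide intervals; `include_lowest=True` stretches the first bin slightly so that a darkness value of exactly 0 is kept, which is why it prints as `(-0.001, 5.0]`. A toy illustration, using values from the table above:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"pd.cut(pd.Series([0.00, 80.85, 96.81]),\n",
"       bins=list(range(0, 105, 5)), include_lowest=True)\n",
"# -> (-0.001, 5.0], (80.0, 85.0], (95.0, 100.0]\n",
"```"
]
},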
928 | {
929 | "cell_type": "markdown",
930 | "id": "promising-importance",
931 | "metadata": {},
932 | "source": [
933 | "To add DUF counts into the dataframe, run `python3 scripts/AFDBv4_DUF_analysis_dark.py UniRef50`, which will generate the `generated_data/AFDBv4_DUF_dark_diggestion_UniRef50.csv`\n",
934 | "\n",
935 | "For the AFDB90v4 paper, the precomupted file is `data_generated/AFDBv4_DUF_dark_diggestion_UniRef50_2023-02-06.csv`\n",
936 | "\n",
937 | "This table list each UniRef50 cluster and states whether there are proteins annotated for DUFs in it. We want to merge this information into the dataframe above."
938 | ]
939 | },
940 | {
941 | "cell_type": "code",
942 | "execution_count": 4,
943 | "id": "varying-network",
944 | "metadata": {},
945 | "outputs": [
946 | {
947 | "data": {
948 | "text/html": [
949 | "\n",
950 | "\n",
963 | "
\n",
964 | " \n",
965 | " \n",
966 | " | \n",
967 | " AF2_REP_best | \n",
968 | " AF2_REP_best_len | \n",
969 | " AF2_REP_worst | \n",
970 | " AF2_REP_worst_len | \n",
971 | " AF2_longest_best70 | \n",
972 | " AF2_longest_best70_len | \n",
973 | " AF2_longest_best70_pLDDT | \n",
974 | " FULL_noDUF | \n",
975 | " REP | \n",
976 | " SP | \n",
977 | " ... | \n",
978 | " max_pLDDT | \n",
979 | " median_Evidence | \n",
980 | " median_pLDDT | \n",
981 | " min_pLDDT | \n",
982 | " nACCs | \n",
983 | " nAF2 | \n",
984 | " nUniRef100 | \n",
985 | " nUniRef90 | \n",
986 | " darkness_bins | \n",
987 | " Has_duf | \n",
988 | "
\n",
989 | " \n",
990 | " \n",
991 | " \n",
992 | " | UniRef50_A0A007 | \n",
993 | " A0A007 | \n",
994 | " 407.0 | \n",
995 | " NaN | \n",
996 | " NaN | \n",
997 | " A0A007 | \n",
998 | " 407.0 | \n",
999 | " 88.248698 | \n",
1000 | " 96.81 | \n",
1001 | " A0A007 | \n",
1002 | " 0 | \n",
1003 | " ... | \n",
1004 | " 88.248698 | \n",
1005 | " 4.0 | \n",
1006 | " 88.248698 | \n",
1007 | " 88.248698 | \n",
1008 | " 1 | \n",
1009 | " 1 | \n",
1010 | " 1 | \n",
1011 | " 1 | \n",
1012 | " (95.0, 100.0] | \n",
1013 | " NaN | \n",
1014 | "
\n",
1015 | " \n",
1016 | " | UniRef50_A0A009DWD5 | \n",
1017 | " A0A009DWD5 | \n",
1018 | " 39.0 | \n",
1019 | " NaN | \n",
1020 | " NaN | \n",
1021 | " A0A009DWD5 | \n",
1022 | " 39.0 | \n",
1023 | " 71.991282 | \n",
1024 | " 0.00 | \n",
1025 | " NaN | \n",
1026 | " 0 | \n",
1027 | " ... | \n",
1028 | " 71.991282 | \n",
1029 | " 4.0 | \n",
1030 | " 71.991282 | \n",
1031 | " 71.991282 | \n",
1032 | " 1 | \n",
1033 | " 1 | \n",
1034 | " 1 | \n",
1035 | " 1 | \n",
1036 | " (-0.001, 5.0] | \n",
1037 | " 0.0 | \n",
1038 | "
\n",
1039 | " \n",
1040 | " | UniRef50_A0A009DWJ5 | \n",
1041 | " A0A009DWJ5 | \n",
1042 | " 47.0 | \n",
1043 | " NaN | \n",
1044 | " NaN | \n",
1045 | " A0A009DWJ5 | \n",
1046 | " 47.0 | \n",
1047 | " 80.863404 | \n",
1048 | " 80.85 | \n",
1049 | " A0A009DWJ5 | \n",
1050 | " 0 | \n",
1051 | " ... | \n",
1052 | " 80.863404 | \n",
1053 | " 4.0 | \n",
1054 | " 80.863404 | \n",
1055 | " 80.863404 | \n",
1056 | " 1 | \n",
1057 | " 1 | \n",
1058 | " 1 | \n",
1059 | " 1 | \n",
1060 | " (80.0, 85.0] | \n",
1061 | " NaN | \n",
1062 | "
\n",
1063 | " \n",
1064 | " | UniRef50_A0A009DWL0 | \n",
1065 | " A0A009DWL0 | \n",
1066 | " 76.0 | \n",
1067 | " NaN | \n",
1068 | " NaN | \n",
1069 | " NaN | \n",
1070 | " NaN | \n",
1071 | " NaN | \n",
1072 | " 98.67 | \n",
1073 | " UPI0018888667 | \n",
1074 | " 0 | \n",
1075 | " ... | \n",
1076 | " 52.397763 | \n",
1077 | " 4.0 | \n",
1078 | " 52.397763 | \n",
1079 | " 52.397763 | \n",
1080 | " 3 | \n",
1081 | " 1 | \n",
1082 | " 3 | \n",
1083 | " 3 | \n",
1084 | " (95.0, 100.0] | \n",
1085 | " NaN | \n",
1086 | "
\n",
1087 | " \n",
1088 | " | UniRef50_A0A009DY31 | \n",
1089 | " A0A2D5KFP4 | \n",
1090 | " 46.0 | \n",
1091 | " A0A0K6IRR6 | \n",
1092 | " 57.0 | \n",
1093 | " A0A009DY31 | \n",
1094 | " 48.0 | \n",
1095 | " 70.708333 | \n",
1096 | " 93.75 | \n",
1097 | " A0A009DY31 | \n",
1098 | " 0 | \n",
1099 | " ... | \n",
1100 | " 83.109348 | \n",
1101 | " 4.0 | \n",
1102 | " 76.051401 | \n",
1103 | " 59.774386 | \n",
1104 | " 4 | \n",
1105 | " 4 | \n",
1106 | " 4 | \n",
1107 | " 4 | \n",
1108 | " (90.0, 95.0] | \n",
1109 | " NaN | \n",
1110 | "
\n",
1111 | " \n",
1112 | " | ... | \n",
1113 | " ... | \n",
1114 | " ... | \n",
1115 | " ... | \n",
1116 | " ... | \n",
1117 | " ... | \n",
1118 | " ... | \n",
1119 | " ... | \n",
1120 | " ... | \n",
1121 | " ... | \n",
1122 | " ... | \n",
1123 | " ... | \n",
1124 | " ... | \n",
1125 | " ... | \n",
1126 | " ... | \n",
1127 | " ... | \n",
1128 | " ... | \n",
1129 | " ... | \n",
1130 | " ... | \n",
1131 | " ... | \n",
1132 | " ... | \n",
1133 | " ... | \n",
1134 | "
\n",
1135 | " \n",
1136 | " | UniRef50_Z9JYV3 | \n",
1137 | " Z9JYV3 | \n",
1138 | " 151.0 | \n",
1139 | " NaN | \n",
1140 | " NaN | \n",
1141 | " NaN | \n",
1142 | " NaN | \n",
1143 | " NaN | \n",
1144 | " 0.00 | \n",
1145 | " NaN | \n",
1146 | " 0 | \n",
1147 | " ... | \n",
1148 | " 64.583444 | \n",
1149 | " 4.0 | \n",
1150 | " 64.583444 | \n",
1151 | " 64.583444 | \n",
1152 | " 1 | \n",
1153 | " 1 | \n",
1154 | " 1 | \n",
1155 | " 1 | \n",
1156 | " (-0.001, 5.0] | \n",
1157 | " NaN | \n",
1158 | "
\n",
1159 | " \n",
1160 | " | UniRef50_Z9JYV5 | \n",
1161 | " Z9JYV5 | \n",
1162 | " 211.0 | \n",
1163 | " NaN | \n",
1164 | " NaN | \n",
1165 | " NaN | \n",
1166 | " NaN | \n",
1167 | " NaN | \n",
1168 | " 99.53 | \n",
1169 | " Z9JYV5 | \n",
1170 | " 0 | \n",
1171 | " ... | \n",
1172 | " 45.124692 | \n",
1173 | " 4.0 | \n",
1174 | " 45.124692 | \n",
1175 | " 45.124692 | \n",
1176 | " 1 | \n",
1177 | " 1 | \n",
1178 | " 1 | \n",
1179 | " 1 | \n",
1180 | " (95.0, 100.0] | \n",
1181 | " NaN | \n",
1182 | "
\n",
1183 | " \n",
1184 | " | UniRef50_Z9JYW2 | \n",
1185 | " Z9JYW2 | \n",
1186 | " 261.0 | \n",
1187 | " NaN | \n",
1188 | " NaN | \n",
1189 | " Z9JYW2 | \n",
1190 | " 261.0 | \n",
1191 | " 73.070268 | \n",
1192 | " 99.62 | \n",
1193 | " Z9JYW2 | \n",
1194 | " 0 | \n",
1195 | " ... | \n",
1196 | " 73.070268 | \n",
1197 | " 4.0 | \n",
1198 | " 73.070268 | \n",
1199 | " 73.070268 | \n",
1200 | " 1 | \n",
1201 | " 1 | \n",
1202 | " 1 | \n",
1203 | " 1 | \n",
1204 | " (95.0, 100.0] | \n",
1205 | " NaN | \n",
1206 | "
\n",
1207 | " \n",
1208 | " | UniRef50_Z9JYW9 | \n",
1209 | " Z9JYW9 | \n",
1210 | " 171.0 | \n",
1211 | " NaN | \n",
1212 | " NaN | \n",
1213 | " NaN | \n",
1214 | " NaN | \n",
1215 | " NaN | \n",
1216 | " 78.36 | \n",
1217 | " Z9JYW9 | \n",
1218 | " 0 | \n",
1219 | " ... | \n",
1220 | " 34.725965 | \n",
1221 | " 4.0 | \n",
1222 | " 34.725965 | \n",
1223 | " 34.725965 | \n",
1224 | " 1 | \n",
1225 | " 1 | \n",
1226 | " 1 | \n",
1227 | " 1 | \n",
1228 | " (75.0, 80.0] | \n",
1229 | " NaN | \n",
1230 | "
\n",
1231 | " \n",
1232 | " | UniRef50_Z9JZ05 | \n",
1233 | " A0A7X9C7W4 | \n",
1234 | " 297.0 | \n",
1235 | " A0A1B0ZIR3 | \n",
1236 | " 340.0 | \n",
1237 | " A0A2N6U4Z5 | \n",
1238 | " 341.0 | \n",
1239 | " 86.651672 | \n",
1240 | " 98.58 | \n",
1241 | " UPI0015607FF1 | \n",
1242 | " 0 | \n",
1243 | " ... | \n",
1244 | " 89.525017 | \n",
1245 | " 4.0 | \n",
1246 | " 86.824971 | \n",
1247 | " 84.585676 | \n",
1248 | " 58 | \n",
1249 | " 28 | \n",
1250 | " 44 | \n",
1251 | " 22 | \n",
1252 | " (95.0, 100.0] | \n",
1253 | " NaN | \n",
1254 | "
\n",
1255 | " \n",
1256 | "
\n",
1257 | "
53625854 rows × 22 columns
\n",
1258 | "
"
1259 | ],
1260 | "text/plain": [
1261 | " AF2_REP_best AF2_REP_best_len AF2_REP_worst \\\n",
1262 | "UniRef50_A0A007 A0A007 407.0 NaN \n",
1263 | "UniRef50_A0A009DWD5 A0A009DWD5 39.0 NaN \n",
1264 | "UniRef50_A0A009DWJ5 A0A009DWJ5 47.0 NaN \n",
1265 | "UniRef50_A0A009DWL0 A0A009DWL0 76.0 NaN \n",
1266 | "UniRef50_A0A009DY31 A0A2D5KFP4 46.0 A0A0K6IRR6 \n",
1267 | "... ... ... ... \n",
1268 | "UniRef50_Z9JYV3 Z9JYV3 151.0 NaN \n",
1269 | "UniRef50_Z9JYV5 Z9JYV5 211.0 NaN \n",
1270 | "UniRef50_Z9JYW2 Z9JYW2 261.0 NaN \n",
1271 | "UniRef50_Z9JYW9 Z9JYW9 171.0 NaN \n",
1272 | "UniRef50_Z9JZ05 A0A7X9C7W4 297.0 A0A1B0ZIR3 \n",
1273 | "\n",
1274 | " AF2_REP_worst_len AF2_longest_best70 \\\n",
1275 | "UniRef50_A0A007 NaN A0A007 \n",
1276 | "UniRef50_A0A009DWD5 NaN A0A009DWD5 \n",
1277 | "UniRef50_A0A009DWJ5 NaN A0A009DWJ5 \n",
1278 | "UniRef50_A0A009DWL0 NaN NaN \n",
1279 | "UniRef50_A0A009DY31 57.0 A0A009DY31 \n",
1280 | "... ... ... \n",
1281 | "UniRef50_Z9JYV3 NaN NaN \n",
1282 | "UniRef50_Z9JYV5 NaN NaN \n",
1283 | "UniRef50_Z9JYW2 NaN Z9JYW2 \n",
1284 | "UniRef50_Z9JYW9 NaN NaN \n",
1285 | "UniRef50_Z9JZ05 340.0 A0A2N6U4Z5 \n",
1286 | "\n",
1287 | " AF2_longest_best70_len AF2_longest_best70_pLDDT \\\n",
1288 | "UniRef50_A0A007 407.0 88.248698 \n",
1289 | "UniRef50_A0A009DWD5 39.0 71.991282 \n",
1290 | "UniRef50_A0A009DWJ5 47.0 80.863404 \n",
1291 | "UniRef50_A0A009DWL0 NaN NaN \n",
1292 | "UniRef50_A0A009DY31 48.0 70.708333 \n",
1293 | "... ... ... \n",
1294 | "UniRef50_Z9JYV3 NaN NaN \n",
1295 | "UniRef50_Z9JYV5 NaN NaN \n",
1296 | "UniRef50_Z9JYW2 261.0 73.070268 \n",
1297 | "UniRef50_Z9JYW9 NaN NaN \n",
1298 | "UniRef50_Z9JZ05 341.0 86.651672 \n",
1299 | "\n",
1300 | " FULL_noDUF REP SP ... max_pLDDT \\\n",
1301 | "UniRef50_A0A007 96.81 A0A007 0 ... 88.248698 \n",
1302 | "UniRef50_A0A009DWD5 0.00 NaN 0 ... 71.991282 \n",
1303 | "UniRef50_A0A009DWJ5 80.85 A0A009DWJ5 0 ... 80.863404 \n",
1304 | "UniRef50_A0A009DWL0 98.67 UPI0018888667 0 ... 52.397763 \n",
1305 | "UniRef50_A0A009DY31 93.75 A0A009DY31 0 ... 83.109348 \n",
1306 | "... ... ... .. ... ... \n",
1307 | "UniRef50_Z9JYV3 0.00 NaN 0 ... 64.583444 \n",
1308 | "UniRef50_Z9JYV5 99.53 Z9JYV5 0 ... 45.124692 \n",
1309 | "UniRef50_Z9JYW2 99.62 Z9JYW2 0 ... 73.070268 \n",
1310 | "UniRef50_Z9JYW9 78.36 Z9JYW9 0 ... 34.725965 \n",
1311 | "UniRef50_Z9JZ05 98.58 UPI0015607FF1 0 ... 89.525017 \n",
1312 | "\n",
1313 | " median_Evidence median_pLDDT min_pLDDT nACCs nAF2 \\\n",
1314 | "UniRef50_A0A007 4.0 88.248698 88.248698 1 1 \n",
1315 | "UniRef50_A0A009DWD5 4.0 71.991282 71.991282 1 1 \n",
1316 | "UniRef50_A0A009DWJ5 4.0 80.863404 80.863404 1 1 \n",
1317 | "UniRef50_A0A009DWL0 4.0 52.397763 52.397763 3 1 \n",
1318 | "UniRef50_A0A009DY31 4.0 76.051401 59.774386 4 4 \n",
1319 | "... ... ... ... ... ... \n",
1320 | "UniRef50_Z9JYV3 4.0 64.583444 64.583444 1 1 \n",
1321 | "UniRef50_Z9JYV5 4.0 45.124692 45.124692 1 1 \n",
1322 | "UniRef50_Z9JYW2 4.0 73.070268 73.070268 1 1 \n",
1323 | "UniRef50_Z9JYW9 4.0 34.725965 34.725965 1 1 \n",
1324 | "UniRef50_Z9JZ05 4.0 86.824971 84.585676 58 28 \n",
1325 | "\n",
1326 | " nUniRef100 nUniRef90 darkness_bins Has_duf \n",
1327 | "UniRef50_A0A007 1 1 (95.0, 100.0] NaN \n",
1328 | "UniRef50_A0A009DWD5 1 1 (-0.001, 5.0] 0.0 \n",
1329 | "UniRef50_A0A009DWJ5 1 1 (80.0, 85.0] NaN \n",
1330 | "UniRef50_A0A009DWL0 3 3 (95.0, 100.0] NaN \n",
1331 | "UniRef50_A0A009DY31 4 4 (90.0, 95.0] NaN \n",
1332 | "... ... ... ... ... \n",
1333 | "UniRef50_Z9JYV3 1 1 (-0.001, 5.0] NaN \n",
1334 | "UniRef50_Z9JYV5 1 1 (95.0, 100.0] NaN \n",
1335 | "UniRef50_Z9JYW2 1 1 (95.0, 100.0] NaN \n",
1336 | "UniRef50_Z9JYW9 1 1 (75.0, 80.0] NaN \n",
1337 | "UniRef50_Z9JZ05 44 22 (95.0, 100.0] NaN \n",
1338 | "\n",
1339 | "[53625854 rows x 22 columns]"
1340 | ]
1341 | },
1342 | "execution_count": 4,
1343 | "metadata": {},
1344 | "output_type": "execute_result"
1345 | }
1346 | ],
1347 | "source": [
1348 | "# get DUF distribution of all darks and merge with the data\n",
1349 | "\n",
1350 | "duf_dark_data = 'data_generated_v2/AFDBv4_DUF_dark_diggestion_UniRef50_2023-02-06.csv'\n",
1351 | "duf_dark_data = pd.read_csv(duf_dark_data)\n",
1352 | "duf_dark_data = duf_dark_data.sort_values(by='unirefID')\n",
1353 | "duf_dark_data = duf_dark_data.set_index(\"unirefID\")\n",
1354 | "duf_dark_data = duf_dark_data[:-1]\n",
1355 | "\n",
1356 | "indata = pd.concat([indata, duf_dark_data], axis=1)\n",
1357 | "indata"
1358 | ]
1359 | },
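{
"cell_type": "markdown",
"id": "added-concat-note",
"metadata": {},
"source": [
"`pd.concat([...], axis=1)` aligns the two tables on their shared `unirefID` index, so clusters absent from the DUF table end up with `NaN` in `Has_duf`, as seen above. A minimal sketch with made-up values:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"a = pd.DataFrame({'FULL_noDUF': [96.81, 0.00]}, index=['u1', 'u2'])\n",
"b = pd.DataFrame({'Has_duf': [0.0]}, index=['u2'])\n",
"pd.concat([a, b], axis=1)  # 'u1' gets NaN in Has_duf; 'u2' gets 0.0\n",
"```"
]
},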
1360 | {
1361 | "cell_type": "markdown",
1362 | "id": "norman-qualification",
1363 | "metadata": {},
1364 | "source": [
1365 | "## 1.2. Make histogram at different pLDDT cutoffs"
1366 | ]
1367 | },
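{
"cell_type": "markdown",
"id": "added-histogram-note",
"metadata": {},
"source": [
"The cell below bins `FULL_noDUF` with `np.histogram`, converts the counts to percentages, and takes the first bin (0-5% brightness) as the \"dark\" fraction. A toy version of that step, with made-up brightness values:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"x = np.array([0.0, 2.0, 50.0, 97.0, 99.0])\n",
"h, edges = np.histogram(x, bins=list(range(0, 105, 5)))\n",
"pct = h * 100 / h.sum()  # % of clusters per 5%-wide bin\n",
"n_dark = h[0]            # count in the first 0-5% bin\n",
"```"
]
},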
1368 | {
1369 | "cell_type": "code",
1370 | "execution_count": 5,
1371 | "id": "eleven-omaha",
1372 | "metadata": {},
1373 | "outputs": [
1374 | {
1375 | "name": "stdout",
1376 | "output_type": "stream",
1377 | "text": [
1378 | "Full n = 53625854 n_dark = 18249414 uniprot_n_dark = 37761108.0 % uniprot = 10.308197827140427\n",
1379 | "Full n = 53625854 n_dark = 18249414 uniref100_n_dark = 33852950.0 % uniref100 = 10.767738572191368\n",
1380 | "% UniRef50 dark with dufs = 0.08570847130870293\n",
1381 | "\n",
1382 | "AFDB n = 41983663 n_dark = 12339265 uniprot_n_dark = 29763470.0 % uniprot = 8.614381123145668\n",
1383 | "AFDB n = 41983663 n_dark = 12339265 uniref100_n_dark = 26286720.0 % uniref100 = 8.887961218741369\n",
1384 | "% UniRef50 dark with dufs = 0.10342012509280826\n",
1385 | "\n",
1386 | "AFDB70 n = 26228839 n_dark = 5618293 uniprot_n_dark = 19979438.0 % uniprot = 6.4303307109762855\n",
1387 | "AFDB70 n = 26228839 n_dark = 5618293 uniref100_n_dark = 17307646.0 % uniref100 = 6.5541442844322315\n",
1388 | "% UniRef50 dark with dufs = 0.18025675909453015\n",
1389 | "\n",
1390 | "AFDB90 n = 6136321 n_dark = 927430 uniprot_n_dark = 3696194.0 % uniprot = 6.043489091347138\n",
1391 | "AFDB90 n = 6136321 n_dark = 927430 uniref100_n_dark = 3275219.0 % uniref100 = 6.180251211639788\n",
1392 | "% UniRef50 dark with dufs = 0.06771225311444018\n",
1393 | "\n"
1394 | ]
1395 | },
1396 | {
1397 | "data": {
1398 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3wAAADQCAYAAABcImMqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABBEUlEQVR4nO3dd5xdVb3+8c+TRmihBRBChxAIkRJCwEuRJk0gCEiXyEXRnyAiegVFqRcuRSwIikhXShAQQu9BpEmoSQgYakghEEqo6d/fH3tNODmZcmbm1D3P+/Xar9lnt7X25Jmds3ZZWxGBmZmZmZmZ5U+3WlfAzMzMzMzMKsMNPjMzMzMzs5xyg8/MzMzMzCyn3OAzMzMzMzPLKTf4zMzMzMzMcsoNPjMzMzMzs5xygy9nJIWk9dL4lZL+t9Z1MjMzM7PGJ+n/JB1X4rI3Sdq9wlWyErjBV2WSRkn6QNJiJSw3U9InBcNXqlVPMwBJb0j6POXvA0l3SFq9hPWulDRX0ipF00+VNKco1z9L85oy/7GkjyQ9LenEwr+VZtYfL2m/8u+5NTJJh0ganTIyVdJdkrZpY51T0wmzLYumf1vSvKLMXpjmXSlpdsrsx5LGpi9Dy7Sy/muS/l/B/G2Ltv1Jqsd+Bcv8WNLb6e/i8rb+/7B8KTgOfyzpQ0mPSfq+pDa/w9XyWCzpO5JeSfPvlrRqwTxJOkfSe2k4R5I6/9uySpK0InA48Of0eXtJ85s5hjV9Xz0HaPHCQzPrT5J0g6QtCpZZKx0TexStu+CiRmvH6WpI5f+rjNsr+wUbN/iqSNJawLZAAHuXsMoxEbFUwfB4RSto1ry9ImIpYBVgGvCH1haWtCSwHzADOKyZRUYU5frcgnnHRMTSqayfAAcBdxZ9EViwPnAc8DdJK3d05yxfJB0P/A44C1gZWAP4IzCslXVE9iXm/fSz2ONFmT2mYN65KbMrAkcAWwGPpr+DRdYn+9s4V9JmABHxSOG2gT2BT4C7U912BU4EdgLWBNYBTmvXL8XyYK+UszWBs4ETgMtaW6GWx2JJ25P9DQ4DlgdeB64rWPcoYB9gE2BjYC/ge238Dqz2vg3cGRGfF0ybUpSjBd9XI+LfQB9JQ1rZ5pSUoaXJjp8vAY9I2qmddWvtON3lucFXXYcDTwBXAsM7soF05u07BZ/LelbBrCURMRO4ERjYxqL7AR8Cp9PBnEfEpxExiuzEyFeAr7ew3D3Ax8C6HSnH8iVdWTsdODoibk45mhMRt0XE/7Sy6rZkX2yPBQ6S1Ku9ZUfEzIh4iiyzK5A1/ppb7llgPLBhC5saDtwYEZ8WfL4sIsZFxAfAGWRfuqwLiogZETESOBAYLmlQK4vX8li8J/D3lNvZZLndTlLT/OHA+RExKSImA+fjXDeC3YGH27nOKFrITaHITIqIk4FLya4Olp2k5SVdIWmKsjuXbimY9910Vfp9SSOLrkpHurI+IV1pvyhdqd4QuBj4Srqy+GFafjFJv5Y0UdI0SRdLWjzN2z5dzfyJpHeU3YlyRJp3FHAo8LO0vdvKsd9u8FXX4cA1adjVVyWskUhaguxLxhNtLDqc7Ezu9cAGkjbvaJkRMREYTfaFvLg+kvR1oBfwYkfLsFz5CtAb+Ec71xsO3AbckD7v1dEKRMTHwH00k1mAdKvS+mS5Lp63JLA/cFXB5I2A5ws+Pw+sLGmFjtbRGl+6cjKJFnKW1PpYrGbGmxqozeV6o47Wz6rmy8DL7VxnPNmV3Pa4GRhcdKdEufwVWIIsbysBvwWQtCPwf8ABZCcA3yT72ym0J7AF2VXpA4BdI2I88H2+uMK4bFr2bLJj/abAekA/4OSCbX0JWCZNPxK4SNJyEXEJWTvh3LS9Dv9/VMgNvipR9vzImsANEfE08CpwSBurXZDOInwo6ZmKV9KsebekM1YzgK8B57W0oKQ1gB2AayNiGvAAi94id0BBrj8sPIPWgilktwQttD7ZbW8jgbMi4sN27I/l1wrA9IiYW+oK6UTGN8kyO4fsKnZxZrcqyuxWbWy2OLNN638M/JvsC8eEZtbbF5jOwmfQlyL722vSNL50G3Ww/CvO2QJ1cCy+O83fOF3VOJnscZYl0vzmcr1U0S2jVn+WJbuSW2jVohx9WNRQ+zit1x5TyE4StGe9No/Typ5l3R34fkR8kO4AaTreHgpcHhHPRMQs4OdkV+3WKtjE2RHxYToB8hBZY24RKcdHAT+OiPfTicCzyG6NbjIHOD3V4U6yv6MB7djfdnGDr3qGA/dGxPT0+VravsXi2IhYNg2DK1s9sxbtk85Y9QaOAR6W9KUWlv0WMD4inkufrwEOkdSzYJkbCnK9bERMaaP8fmTPVhWvvyTZ7UOHS/KzHwbwHtBXRQ/3t+EbwFzgzvT5GmB3ZZ0TNHmiKLNtXeUuzmzT+kuTndXdiOw//2LDgasjIgqmfQL0KfjcNF78pcu6nuKcFarpsTgi7gdOAW4C3kjDx2RXJaH5XH9SlH2rPx+w6MmmKUU5WrbglnTS8h+2s5x+ZCcIPiQ7PgP0LFqmJ1mjqUkpx+nVgffT7fHFViW7qgdARHxC9n9Kv4Jl3i4Y/4zsxEVzViQ7ufF0UwOU7CRI4f8r7xWdnGxte53mBl8VpLNbBwBfVdbT2tvAj4FNJLX3MvenfHGGDLIvD2YVFxHzIuJmYB7QUo+HhwPrFOT8N0BfYI+OlKmsR9DNgUdaqNMbwF104hY8y5XHgVlknUGUajjZf7ITU2b/TvZFoq07MJolaSlgZ1rO7DSyL8F7Fa23OrA9cHXRKuNY+HaoTYBpEfFeR+pn+ZBuDe4HtPQMf82PxRFxUUT0j4iVyTLfAxibZjeX63EdqZtV1Qtktym2x4YsfPtuKb4BPJMajlPJGnZrFS2zNgUNtBK9BSwvadlm5k0huxMPWHCL/QrA5BK2W3yiYjrwObBRQQN0mdQ5TSnKfuLDDb7q2IfsS/JAssu/m5L9ATxC8z3CteY5YF9JSyh7396R5aqkWWvScxrDgOXI7skvnv8VsrO8Q/ki54PIrma3K+cp318FbiW7Be7OFpZbDdgNf1Ewsg4tyG4du0jSPilHPSXtLunc4uUl9SPr/XJPvsjsJmSdBbQ3s4ulZ6RuITsLfkULy61A9mWmOLPfAh6LiFeLpl8NHClpYPqS8kuyjr+sC5LUR9KeZM8W/S0ixjSzTM2PxZJ6SxqU/t9YA7gE+H3BlZWrgeMl9Uu3kv4E57oR3Al8tZ3rfJXsZECrUlb6SToF+A7wC8hONpOdMDhT0grpmH4w2XfqNrdbKCKmpnX+KGm5tK3t0uzrgCMkbarsFSRnAU+mkxltmQasptThV0TMB/4C/FbSSmn/+inrdbkU08h6ZC4bN/iqYzhwRURMjIi3mwbgQuDQdt5+9FtgNlkYriK7TcOskm6T9AnwEXAmMDwimmtgDQdujYgxRTn/PbCnpGafNSlyYXrOaRpZ1/o3Abulg2eTA5XeswM8BTyKu6m3JCLOB44naxi9S3ZG9xiyhlixbwHPRcS9RZ
m9ANhYrfeA2ORnKbPvkX2JfRr4r6Jbmr5SkNnxqV4/LNrO4SzcWUvT/twNnEv2vMhEsjPap5RQL8uX21LO3gJOIrti12xPsNTHsbg3WQPzE7KG4uPArwrW/TNZR0ljyK763ZGmWX27Gtgj3bnWZFUt+h6+/WDBlehPUidDLVk1ZagpR18Gto+IewuW+QHZ7cQvAO+QHdO/nu6YaK9vkV0xfClt6zhYcBvyr8iyPpXspMlBzW9iEQ+Snex4W1LTo1snAK8AT0j6CLif0p/RuwwYmG4HvaXEdVol3y5tZmZmZmZtkXQW8E5E/K6EZW8ie61Ms1eGrXrc4DMzMzMzM8upit3SKelyZS8THFswbXlJ9yl7aeF9kpZL0yXpAmUvO3xBknuktLrjTFueOM+WN8605YGk3SS9nLJ5YjPzj5f0YsrtA5IKOxoZnrI+QVJbPcFbF1LJZ/iuJHuAt9CJwAMR0Z/snTBNQd4d6J+Go4A/VbBeZh11Jc605ceVOM+WL1fiTFsDk9QduIgsnwOBgyUNLFrsWWBIRGxM9t7Qc9O6y5M937slWYc9pzSd4DCrWIMvIv7Jou+HGcYXD6VfxRddZw8jvXsovTdjWWUvRzSrG8605YnzbHnjTFsODAVeiYjXImI2WW+swwoXiIiHIuKz9PEJYLU0vitwX2Qv+v4AuI9FT4BYF9We3iHLYeXUJSpkLy9cOY33I+t5qsmkNG0qRSQdRXY2jiWXXHLz9ddv7+tArF49++yz0yNixbaXrCvOtLWoATPtPFuLGjDP4ExbK+ow083lcstWlj+SL15N0FKmF+I851tLma52g2+BiAhJ7e4xJiIuIXufC4MHD46HH3647HWz2ujTp097X6BZV5xpK9bImXaerVgj5xmcaVtUI2da0mHAENr5XjznOd9aynS138M3remWifTznTR9MrB6wXKrUdqb7c1qzZm2PHGeLW+caWskJeVS0s5k72PcOyJmtWdd65qq3eAbSfZCUNLPWwumH556zdoKmFFwC4ZZPXOmLU+cZ8sbZ9oayVNAf0lrS+pF9uLvkYULSNqM7CX1e0fEOwWz7gF2kbRc6qxllzTNrHK3dEq6Dtge6CtpElnPQWcDN0g6EngTOCAtfiewB9kb6T8DjqhUvcw6ypm2PHGeLW+caWt0ETFX0jFkDbXuwOURMU7S6cDoiBgJnAcsBfxdEsDEiNg7It6XdAZZoxHg9Igo7sTIuqiKNfgi4uAWZu3UzLIBHF2pupiVgzNteeI8W94405YHEXEn2QmJwmknF4zv3Mq6lwOXV6521qiqfUunmZmZmZmZVYkbfGZmZmZmnSTp3oLxn9eyLmaF3OAzMzMzM+u8wveffbNmtTAr4gafmZmZmVnntfs9j2bVULMXr5uZmZmZ5cg6kkYCKhhfICL2rk21rKtzg8/MzMzMrPOGFYz/uma1MCviBp+ZmZmZWSdFxMO1roNZc9zgMzMzMzPrJEljWPg5vgCmAw8Bv46ImTWpmHV5bvCZmZmZmXXens1MWx4YDvwB+G51q2OWcYPPzMzMzKyTIuLNZia/CTwr6dlq18e+MKD/BkydNqXi5ayy8qq8POGlipfTXm7wmZmZmZlVll+FVkNTp03hwP6nVLycERNOq3gZHeEGn5mZmZlZJ0ka3Mzk5YDDgH9WuTpmC7jBZ2ZmZmbWeecXfQ7gPWAUcEnVa2OWtNngk7Q18FxEfCrpMGAw8PsW7lM2q3vOtOWJ82x540xbo4qIHWpdB7PmlHI/8Z+AzyRtAvwEeBW4uqK1MqssZ9ryxHm2vHGmrWFJGiDpfEl3pOHXktavdb2sayulwTc3IgIYBlwYERcBS1e2WmYV5UxbnjjPljfOtDUkSV8hu33zE7JbOP8CfAqMkrRVDatmXVwpz/B9LOnnZA+cbiepG9CzstUyqyhn2vLEeba8caatUZ0MHBwRowqm3SLpQeAUYPea1Mq6vFKu8B0IzAKOjIi3gdWA8ypaK7PKcqYtT5xnyxtn2hrVukWNPQAi4mFgnepXxyzT6hU+Sd2B6wofQo2IifheemtQzrTlifNseeNMW4P7uJV5n1atFmZFWm3wRcQ8SfMlLRMRM6pVKbNKcaYtT5xnyxtn2hrc6pIuaGa6gH7VroxZk1Ke4fsEGCPpPgrOTkTEsRWrlVllOdOWJ86z5Y0zbY3qf1qZN7pqtTArUkqD7+Y0lI2kHwPfIXsh5RjgCGAV4HpgBeBp4FsRMbuc5ZolzrTlifNseeNMW0OKiKtqXQez5rTZ4IuIqyQtDqwRES93tkBJ/YBjgYER8bmkG4CDgD2A30bE9ZIuBo4kexePWVk505YnzrPljTNtZlZebfbSKWkv4Dng7vR5U0kjO1luD2BxST2AJYCpwI7AjWn+VcA+nSzDrFnOtOWJ82x540ybmZVXKa9lOBUYCnwIEBHP0YmuZSNiMvBrYCLZAXcG2a0UH0bE3LTYJFp4uFXSUZJGSxo9ffr0jlbDurZTcaYtP07FebZ8ORVn2sysbEpp8M1ppqes+R0tUNJywDBgbWBVYElgt1LXj4hLImJIRAzp27dvR6thXZszbXniPFveONPW0CStKOkXki6RdHnTUOt6WddVSqct4yQdAnSX1J/sPvjHOlHmzsDrEfEugKSbga2BZSX1SGfbVgMmd6IMs9Y405YnzrPljTNtje5W4BHgfmBejetiVtIVvh8CGwGzgGvJboX4USfKnAhsJWkJSQJ2Al4EHgL2T8sMJ/tjMasEZ9ryxHm2vHGmrdEtEREnRMQNEXFT01DrSlnXVUqD7+sRcVJEbJGGXwJ7d7TAiHiS7CHpZ8i6Ru4GXAKcABwv6RWyLpIv62gZZm1wpi1PnGfLG2faGt3tkvboyIqSdpP0sqRXJJ3YzPztJD0jaa6k/YvmzZP0XBo629GR5Ugpt3T+HPh7CdNKFhGnAKcUTX6N7CFts0pzpi1PnGfLG2faGpKkj8ne9SjgF5JmAXPS54iIPm2s3x24CPgaWUdCT0kaGREvFiw2Efg28NNmNvF5RGza2f2w/GmxwSdpd7J31PSTdEHBrD7A3ObXMqtfzrTlifNseeNMW6OLiKU7uYmhwCsR8RqApOvJOhxa0OCLiDfSvA53ZGRdT2u3dE4BRgMzybovbhpGArtWvmpmZedMW544z5Y3zrTlgqQHSpnWjH7AWwWfW3xdSAt6p1eIPCFpnxbq5teMdEEtXuGLiOeB5yVdGxFzYEHXxqtHxAfVqqBZuTjTlifOs+WNM22NTlJvstd+9E3ZVZrVh/Y13DpqzYiYLGkd4EFJYyLi1cIFIuISsmdYGTx4cFShTlYHSum05T5JfSQtT/bA818k/bbC9TKrJGfa8sR5trxxpq1RfY/sKvUGZNltukp9K3BhCetPBlYv+Nyu14VExOT08zVgFLBZqetavpXS4FsmIj4C9gWujogtybo0NmtUzrTlifNseeNMW0OKiN9HxNrATyNi7YJhk4gopcH3FNBf0tqSegEHkd3S3CZJy0laLI33JXvX5Iutr2VdRSkNvh6SVgEOAG6vcH3Mq
sGZtjxxni1vnGlrSJJ2TKOTJe1bPLS1fkTMBY4B7gHGAzdExDhJp0vaO5WxhaRJwDeBP0sal1bfEBgt6Xmyd0yeXdS7p3VhpbyW4XSy4P0rIp5K9wVPqGy1zCrKmbY8cZ4tb5xpa1RfBR4E9mpmXgA3t7WBiLgTuLNo2skF40+R3epZvN5jwJfbWV/rItps8EXE3yl49026L3i/SlbKrJKcacsT59nyxpm2RpXe90hEHFHrupgVarPBJ+kKsrMSC4mI/65IjcwqzJm2PHGeLW+caWt0kl4FngAeAR6JiHFtrGJWUaXc0ll4/3xv4Btk78oxa1TOtOWJ82x540xboxsIbAlsC5wnaQDwQkR8o7bVsq6qlFs6byr8LOk64F8Vq5FZhTnTlifOs+WNM205MA+Yk37OB95Jg1lNlHKFr1h/YKVyV8SshpxpyxPn2fLGmbZG8xEwBvgN8JeIeK/G9bEurpRn+D4mu5de6efbwAkVrpdZxTjTlifOs+WNM205cDCwDfAD4DuSHgP+GREP1LZa1lWVckvn0tWoiFm1ONOWJ86z5Y0zbY0uIm4FbpW0AbA7cBzwM2DxWtbLuq4WG3ySBre2YkQ8U/7qmFWOM2154jxb3jjTlheSbgI2AV4F/gkcDjxZ00pZl9baFb7zW5kXwI5lrotZpTnTlifOs+WNM2158X/AsxExr9YVMYNWGnwRsUM1K2JWac605YnzbHnjTFteRMToWtfBrFC3thaQdLSkZQs+LyfpBxWtlVkFOdOWJ86z5Y0zbWZWXm02+IDvRsSHTR8i4gPguxWrkVnlOdOWJ86z5Y0zbWZWRqU0+LpLUtMHSd2BXpWrklnFOdOWJ86z5Y0zbQ1N0taSlkzjh0n6jaQ1a10v67pKafDdDYyQtJOknYDr0rQOk7SspBslvSRpvKSvSFpe0n2SJqSfy3WmDLNWONOWJ86z5Y0zbY3uT8BnkjYBfkLWW+fVta2SdWWlNPhOAB4E/l8aHiB7l0hn/B64OyI2IOu2djxwIvBARPRPZZzYyTLMWuJMW544z5Y3zrQ1urkREcAw4MKIuAjw+yWtZkp58fp84OI0dJqkZYDtgG+n7c8GZksaBmyfFrsKGEV20DcrK2fa8sR5trxxpi0HPpb0c+AwYDtJ3YCeNa6TdWGlXOErt7WBd4ErJD0r6dJ0n/PKETE1LfM2sHJzK0s6StJoSaOnT59epSqbtcqZtjxxni1vnGmrtgOBWcCREfE2sBpwXm2rZF1ZLRp8PYDBwJ8iYjPgU4puo0iXwaO5lSPikogYEhFD+vbtW/HKmpXAmbY8cZ4tb5xpq5rUydB1EfGbiHgEICImRoSf4bOaqUWDbxIwKSKeTJ9vJDsQT5O0CkD6+U4N6mbWEc605YnzbHnjTFvVRMQ8YH66ldisLrTY4JO0jKSzU49W70t6L/VsdXbhC1HbK13afkvSgDRpJ+BFYCQwPE0bDtza0TLMmuNMW544z5Y3zrTlyCfAGEmXSbqgaah1pazraq3TlhvIesnaPh0skfQlsoPiDcAunSj3h8A1knoBrwFHkDU+b5B0JPAmcEAntm/WHGfa8sR5trxxpi0vbk6DWV1orcG3VkScUzghHYDPkfTfnSk0Ip4DhjQza6fObNesDc605YnzbHnjTFsuRMRVkhYH1oiIl2tdH7PWnuF7U9LPJC3otUrSypJOAN6qfNXMys6Ztjxxni1vnGnLBUl7Ac8Bd6fPm0oaWdNKWZfWWoPvQGAF4OF0L/37ZO+oWR7f9mCNyZm2PHGeLW+cacuLU4GhwIew4ArzOrWrjnV1Ld7SGREfkL2A1C8htVxwpi1PnGfLG2facmRORMyQVDhtfq0qY9baM3wLkbQN2dmKsRFxb+WqZFYdzrTlifNseeNMWwMbJ+kQoLuk/sCxwGM1rpN1Ya29luHfBePfBS4ElgZOkXRiS+uZ1Stn2vLEeba8caYtR34IbATMAq4FZgA/qmmNrEtr7Rm+ngXjRwFfi4jTyLpFPrSitTKrDGfa8sR5trxxpi0vvh4RJ0XEFmn4JbB3rStlXVdrDb5ukpaTtAKgiHgXICI+BeZWpXZm5eVMW544z5Y3zrTlxc9LnLYISbtJelnSK81d2Za0naRnJM2VtH/RvOGSJqRheAfrbjnU2jN8ywBPAwJC0ioRMVXSUmmaWaNxpi1PnGfLG2faGpqk3YE9gH6SLiiY1YcSTlpI6g5cBHwNmAQ8JWlkRLxYsNhE4NvAT4vWXR44hex9kwE8ndb9oON7VFkD+m/A1GlTal2NLqG1Bt+OEfFaM9PnA9+oUH3MKsmZtjxxni1vnGlrdFOA0WS3bz5dMP1j4MclrD8UeKXp70DS9cAwYEGDLyLeSPOKe/3cFbgvIt5P8+8DdgOu68iOVMPUaVM4sP8pVSlrxITTqlJOvWrtls6/A0h6oHBiRHwWEa9XtFZmleFMW544z5Y3zrQ1tIh4PiKuAtaLiKvS+EiyRlwpV9r6AW8VfJ6UppWipHUlHSVptKTR06dPL3HT1uhau8LXTdIvgPUlHV88MyJ+U7lqmVWEM2154jxb3jjTlhf3Sdqb7Hv208A7kh6LiFKu8lVURFwCXAIwePDgqHF1rEpau8J3EDCPLKxLNzOYNRpn2vLEeba8caat0wYNGkSfPn0WGQYNGlTNaiwTER8B+wJXR8SWwE4lrDcZWL3g82ppWik6s67lXItX+CLiZeAcSS9ExF1VrJNZRTjTlifOs+WNM23lMHHiRCIWvXAlVbXfnx6SVgEOAE5qx3pPAf0lrU3WWDsIOKTEde8BzpK0XPq8CyX2DGr519oVvibPSLpM0l0AkgZKOrLC9TKrJGfa8sR5trxxpq3RnU7WAHslIp6StA4woa2VImIucExadzxwQ0SMk3R6ukUUSVtImgR8E/izpHFp3feBM8gajU8Bpzd14GJWSoPvSrLgrZo+/wc4rkL1MauGK3GmLT+uxHm2fLkSZ9oaWET8PSI2jogfpM+vRcR+Ja57Z0SsHxHrRsSZadrJETEyjT8VEatFxJIRsUJEbFSw7uURsV4arqjEvlljKqXB1zcibiDrFrnp7MO8itbKrLKcacsT59nyxpm2hibpCkmXFw+1rpd1Xa310tnkU0krkL3EEUlbATMqWiuzynKmLU+cZ8sbZ9oa3e0F473J3iPpN4xbzZTS4Due7B0i60p6FFgR2L+itTKrLGfa8sR5trxxpq2hRcRNhZ8lXQf8q0bVMWu7wRcRz0j6KjAAEPAyMLTSFTOrFGfa8sR5trxxpi2H+gMr1boS1nW12OCT1J2sO9l+wF2pl6A9yV7WuDiwWXWqaFYezrTlifNseeNMW15I+pjslmSln28DJ9S0UtaltXaF7zKyFzj+G/iDpCnA5sDPI+KWKtTNrNycacsT59nyxpm2XIiIpWtdh84Y0H8Dpk7zI4d50lqDbwiwcUTMl9Sb7OzEuhHxXjkKTmfyRgOTI2LP9JLJ64EVgKeBb0XE7HKUZZY405YnzrPljTNtDU3S4NbmR8Qz1apLZ0ydNoUD+59S8XJGTDit
4mVYprXXMsyOiKYukWcCr5XroJv8iOylkk3OAX4bEesBHwB+yaqVmzNteeI8W94409bozm9l+HUN62VdXGsNvg0kvZCGMQWfx0h6oTOFSloN+DpwafosYEfgxrTIVcA+Hd3+55/M4uwjruW8o0Zw5uHXMP7fby6Y9+jIsXx/6G8XfP7bWfdz5uHX8OjIsQDMnjmHS35xOxHR0eKtfjVsps2a4Txb3jjT1tAiYodWhh1rXT/rulq7pXPDCpb7O+BnQNM9zisAH6aXqwJMIntoexGSjgKOAlh99dWb3fhiS/Tif/5yEN17dOPdSR/y55/fzi//uiZzZs3l6QcmsPyXsmI/nfE5H777MSdcdhDnfXcEW+89iLuvfordhg8l+7/AcqZhM23WDOfZ8saZtlyQdDRwTUR8mD4vBxwcEX+sacWsy2rxCl9EvNna0NECU49b70TE0x1ZPyIuiYghETGkb9++zS7TrZvo3iPbtc8/nc1q/VcE4IHrn+Gr+20MqTHXo1cPZs+cy7y58+nZqwfvTp7BrE9ns8YA95ybR42cabNizrPljTNtOfLdpsYeQER8AHy3dtWxrq611zL8KyK2KehadsEsICKiTwfL3BrYW9IeQG+gD/B7YFlJPdLZttWAyR3cPgAfvPMxl5x4O9MmfsDwk3fl049m8p9nJrPb8KGMOH8UAIst3pMhXxvAVWfcwz5Hb81dlz/JLocPYcT5D9G9Z3f2Puor9OrdszPVsDrS6Jk2K+Q8W94405Yj3SUp0vNBqcOgXjWuk3VhrV3h2yb9XDoi+hQMS3fioEtE/DwiVouItYCDgAcj4lDgIWD/tNhw4NaOlgGw3EpLc8LlB/OLqw/lunMf5K4rnmS34Vssstx2+27MUWftycxP57DOxqvw+O0vssUuG7DGgJV48q7xzWzZGlWjZ9qskPNseeNMW47cDYyQtJOknYDr0jSzmmit05YFJHWXtKqkNZqGCtTlBOB4Sa+Q3Vt/WUc3NGf23AXjiy/Zi95L9GLamx9w5+VP8rtjbmLG9E/484m3LVhm7px5/Puel9h670HM+mw2c+fMY+6cecz8bE4ndsfqWaNl2qw1zrPljTNtDe4E4EHg/6XhAbJnSM1qorVOWwCQ9EPgFGAaMD9NDmDjzhYeEaOAUWn8NWBoZ7cJMOXV9xhx/kN069aNefPmc+BPt2fDoWsumP+LYZfxvbP3WvD5wRHPstNBmyGJbfb5MledcS/de3Tje2fvWY7qWJ1pxEybtcR5trxxpq3RRfZ6kYvTYFZzbTb4yN5bMyDK+y6cilpzw5X52aUHtTj/rFsXftXOLocNWTC+Wv8VOenqQytWN6sLDZXpiS9N49pzH6RbN9GtezeG/2oXFlu8J5efcjdzZ89j+S8tzbd++TV69urB3866nzdfmsb2+2/C1nsPYvbMOVx5+j1898yvu+fZ/GqoPJuVwJk2MyujUhp8bwEzKl0RsypqqEwv03cpjvvDfvReshdj/vUaI//8GEsuszj/tddGDN11A+668t88fvuLbL5Tf79mpGtqqDyblcCZNjMro1IafK8BoyTdAcxqmhgRv6lYrcwqq6EyvUzfJReM9+jVnW7duzFt4gfseOCmAKy90Zd45B9j2HL3Df2aka6pofJsVgJn2hqapLUj4vWiaVtExFO1qpN1baU0+CamoRcN0KXsgP4bMHXalEWmr7Lyqrw84aUa1MjqUENlusmsz+dwyx8fZfjJu/LY7eMY+9gb7Hjgcox59HU+/WimXzPSdTVkns1a4Uxbo7tJ0l4RMRlA0leBC4Ev17ZaVmnd1J0+fTrcqXDJ2tuuabPBFxGndapGVTZ12hQO7H/KItNHTGio3bAKarRMQ9aT7J9PvJ3dhm/BquuswB5HbMl15z7Arx+cwOrrr8iyK2ZXAbfbd2O223djxj72xkKvGZk+ZQZP3jWebb/R6T4PrM40Yp7NWuNMWw58D7hF0l7AYOD/gD1qWyWrhvkxr9l2SLm1t13T2ovXb2PhF58GMB14KCL+1qHamdVQo2Z6/vzgsl/dyWbbr8tmO/QHYImlF+PIM7L/O26+8BEGbvlFL7RNrxk54tRdGfHrh/yakZxq1DybtcSZtryIiKckHQvcC8wEdo6Id2tcLevCWrvC9+tmpi0PHCZpUEScWKE6mVVKQ2b62QcnMOZfr/PR+5/xxF3j6bdeXzbbvj93XPoE6iY2GLoGX95mnQXL+zUjXUZD5tmsFc60NbRmTlosQdYB0WWSiIi9a1Mz6+pabPBFxMPNTZc0Enga8IHXGkqjZnrznddn853XX2T6hkObfw+xXzPSNTRqns1a4kxbDjR30sKs5krptGUhETHPXbxbnjjTlifOs7XXoEGDmDhx4kLT1lhjDcaOHVujGi3MmbZGEREPS+oO3B8RO9S6PmZNWnuGb/lmJi8HHA6Mq1iNzCqkUTPdXM+z7nXWGjXPVn8mTpxIRCw0rRYNLGfa8iCdoJgvaZmI8PskrS60doXvabL7kJuO+k0PT48C/l9lq2VWEQ2Z6eZ6nnWvs0aD5tmsFc605cUnwBhJ9wGfNk2MiGNrVyXrylp7hm/talbErNKcacsT59nyxpm2HLk5DWZ1od3P8JmZmXXGxJemce25D9Ktm+jWvRvDf7ULM977lL+deT/T3vqAM285kuVXXhqAv511P2++NI3t99+ErfcexOyZc7jy9Hv47plfr8lth2ZmbYmIq2pdB7NC3WpdATMz61qW6bsUx/1hP3526UHs+q0hjPzzY6y6Tl9OvPJg1vnyKguW+3TG53z47seccNlB/PPmFwC4++qn2G34UDf2zKxuSeov6UZJL0p6rWkocd3dJL0s6RVJi/RMK2kxSSPS/CclrZWmryXpc0nPpeHiMu+WNbAWG3yStk4/F6tedcwqx5m2PGnkPC/Td0l6L9kLgB69utOtezeWWHoxei/Ra6HlevTqweyZc5k3dz49e/Xg3ckzmPXpbNYYsFItqm0V1siZNityBfAnYC6wA3A18Le2Vko9fF4E7A4MBA6WNLBosSOBDyJiPeC3wDkF816NiE3T8P3O74blRWtX+C5IPx+vRkXMqsCZtjxp+DzP+nwOt/zxUXY9fItm5y+2eE+GfG0AV51xD/scvTV3Xf4k2+67MSPOf4gbL/gns2fOqXKNrcIaPtNmyeIR8QCgiHgzIk4Fvl7CekOBVyLitYiYDVwPDCtaZhjQdMvojcBO8i0P1obWnuGbI+kSoJ+kC4pnuqcha0DOtOVJQ+d57px5/PnE29lt+Basus4KLS633b4bs92+GzP2sTdYZ+NVePz2F9lilw2YPmUGT941nm2/sXEVa20V1tCZNiswS1I3YIKkY4DJwFIlrNcPeKvg8yRgy5aWiYi5kmYATQfRtSU9C3wE/DIiHikuQNJRwFEAq6++eul7ZA2ttQbfnsDOwK5kXSWbNTpn2vKkYfM8f35w2a/uZLPt12WzHfq3ufzcOfP49z0vccSpuzLi1w8xd8485s6Zx8zPfIUvZxo202ZFfgQsARwLnAHsCAyvcJlTgTU
i4j1JmwO3SNooIj4qXCgiLgEuARg8eHA0sx3LodZeyzAduF7S+Ih4vop1MqsIZ9rypJHz/OyDExjzr9f56P3PeOKu8fRbry87HrgZ1579AJP+8y5/+cUdbLnbBmz/zU0BeHDEs+x00GZIYpt9vsxVZ9xL9x7d+N7Ze9Z2R6ysGjnTZoUi4qk0+glwRDtWnQwUXnZbLU1rbplJknoAywDvRUQAs1L5T0t6FVgfGN3+PbC8KeW1DO9J+gewdfr8CPCjiJhUuWqZVZQzbXnScHnefOf12Xzn9ReZfvyfvtns8rscNmTB+Gr9V+Skqw+tWN2sLjRcps0AJN0GtHjVLCL2bmMTTwH9Ja1N1rA7CDikaJmRZFcLHwf2Bx6MiJC0IvB+RMyTtA7QHyipZ1DLv1Jey3AFWbhWTcNtaVqHSFpd0kOpq9pxkn6Upi8v6T5JE9LP5TpahlkbnGnLE+fZ8saZtkb1a+B84HXgc+AvafgEeLWtlSNiLnAMcA8wHrghIsZJOl1SU2PxMmAFSa8AxwNNr27YDnhB0nNknbl8PyLeL9eOWWMrpcG3UkRcERFz03AlsGInypwL/CQiBgJbAUenLmdPBB6IiP7AA3wRYLNyc6atwwYNGkSfPn0WGQYNGlSrKjVcngf032CR39+A/ht0osqWMw2X6d8efSM/3umP3H7pEwC8PPotfrrLxZx31AjOO2oEb46fBsDtlz7BmYdfs2C5iOCSX9zOnFlzO7F7Vi8i4uGIeBjYOiIOjIjb0nAIsG2J27gzItaPiHUj4sw07eSIGJnGZ0bENyNivYgYGhGvpek3RcRG6ZUMgyPitkrtpzWeUm7pnC7pMOC69Plg4L2OFhgRU8keLCUiPpY0nqzHoWHA9mmxq4BRwAkdLcesFc60ddjEiRPJHpVYWA17xW64PE+dNoUD+5+y0LQRE07rWIUtjxou08NP3pXxT77JB+98smDal7dZm+En77rQcmMfe52Trj6UMw79K3t+ZyseHTmWLXfbkJ6LlfJ1zBrIkpLWaWqMpVs0l6xxnawLK+UK338DBwBvkx0w96d9D6C2SNJawGbAk8DK6aBMKmvlFtY5StJoSaOnT59ejmpY1+NMW544z5Y3DZfp5VdeepFp4554g3OOvJ5rz33gi3dGRjBv7ny69+jOZx/P4pXnJrPJdut2dres/vwYGCVplKSHgYfIeu40q4k2TylFxJtAWw+ZtpukpYCbgOMi4qPCs+Pp4dNmH3p1d7KNZ9CgQUycOHGhaWussQZjx46tSX2cacsT59nyJg+ZXnPDlTnzH0fSc7Ee/OOif3HvX0ez53e/wm7Dh3LpL+9gjyOGctcVT7LzIZtz4wX/ZN6ceezx31uy9HJLdH5HreYi4m5J/YGme9VfiohZtayTdW2lXOErO0k9yQ6610TEzWnyNEmrpPmrAO/Uom5Wfk23wBUOxQ3ARudMW544z5Y31c507yV7LbhNc8vdN+SN9Azfptuvx/fO3osVV1+WXr178tqYqay5wUoM2WUA91/3TLmKtzoQEbMi4vk0uLFnNVX1Bp+yU2qXAeMj4jcFs5q6mSX9vLXadTPrCGfa8sR5trypRaY/+/iL7/cvPTWRL625cAeg9/1tNLsePoRZn89hzux5zJszn1mfzi5X8WZmC6nFU8JbA98CxqSuYwF+AZwN3CDpSOBNsvv3zRqBM2154jxb3lQ801efcS+vvDCFubPn8eaLbzNwq7V4dORYevXuwVLLLs63T/mi85bR973MJtutS6/ePRnytfX584m3M2/ufIb/apcO76DVB0lbR8SjkhbzVT2rJyU3+CRtBZwK9AZ+FxG3dKTAiPgX0FJ3djt1ZJtmHeFMW544z5Y3jZTpw5tprO1wwKbNLjvkawMWjC+30tKcePnB5aiC1YcLgM3JXoo+uMZ1MVugxQafpC9FxNsFk44HvkF20HwSuKWyVTMrL2fa8sR5trxxpi0H5ki6BOgn6YLimRFxbA3qZNbqFb6LJT0DnBsRM4EPybpGng98VIW6mZWbM2154jxb3jjT1uj2BHYGdgWernFdzBZoscEXEftI2gu4XdLVwHHAIcASwD5VqZ1ZGTnTlifOs+VNo2Z6QP8NmDptykLTVll5VV6e8FKNamS1EhHTgesljY+I52tdH7MmrT7DFxG3SboT+AHwD+DMiPhnVWpmVgHOtOWJ82x504iZnjptCgf2P2WhaSMmnFaj2lideE/SP8g6DAJ4BPhRREyqYZ2sC2vxtQyS9pb0EHA3MBY4EBgm6XpJ61argmbl4kxbnjjPljfOtOXIFWSv/Vg1DbelaWY10doVvv8FhgKLA/dExFDgJ5L6A2cCB1Whfmbl5ExbnjjPljfOtLXLwIEDmTSpLi+arRQRhQ28KyUdV6vKmLXW4JsB7Et27/w7TRMjYgI+6FpjcqYtT5xnyxtn2tpl0qRJjBo1aqFp22+/fU3qUmS6pMOA69Lng4H3algf6+JavKWTrCvkFcgahYdUpzpmFeVMW544z9ZuAwcOpE+fPgsNdcSZtrz4b+AA4G1gKllvs0fUtEbWpbXWS+d04A9VrItZRTnTlifOs3VEHV8RcaYtNyLiTWDvWtfDrElrV/jMzMzMzMysgbnBZ2ZmZmZmllOtvofPzKwrq+Me4MzMzMxK4gaflY2/HFve1PPzTmZmVt8kbQWcCvQGfhcRt9S0QtZlucFnZdPcl2PwF2Qzs2rzCTiz6pP0pYh4u2DS8WS9zwp4ErilFvUyc4PPzMwsZ3wCzqwmLpb0DHBuRMwEPiR7JcN84KNaVsy6NnfaYmZmZmZ1rc7fIQlAROwDPAvcLulw4DhgMbL3S+5Ts4pZl+crfNYhvl3I8saZNjOrX43yTHVE3CbpTuAHwD+AMyPinzWulnVxbvBZhzTKgdesVM60NSqfrDCrD5L2Bn4MzAXOAv4K/ErSD4CTIuLVWtbPui43+KxN/jJheeI8W974ZIXlTQMfp/8XGAosDtwTEUOBn0jqD5wJHFTLylnX5QafLaSlg6y/TFgjKjXP4ExbY2jgL8JmJWvgkxgzgH2BJYB3miZGxATc2LMacoOvC2juC0Lv3r2ZOXNms8s36EHWcqilL7fN5belTDvPVu98jLauKocnML4BHAzMAQ6pcV3MFqirBp+k3YDfA92BSyPi7GrXoT3/8Zb6pbMS67dnm9D8FwRf5ai8ame61PzWY05LzS40n9+Wpll51cNxut5U6sSEj9GV5zxXT1e44yIipgN/6Mw22sqkpMWAq4HNgfeAAyPijTTv58CRwDzg2Ii4pzN1sfyomwafpO7ARcDXgEnAU5JGRsSL5dh+z549F+nCtxz/8Zb6pbPc67d3m1Z9lcx0c3luUq1MVWubVj+qnWmfmLBKqvT3jnrT2RMTlTr5bF8oMZNHAh9ExHqSDgLOAQ6UNJDsttGNgFWB+yWtHxHzqrsXVo/qpsFH9pDrKxHxGoCk64FhQFkOvHPmzPF/vFZtFct0c3kG59cqrqqZ9okJq7CK5bmlk3K1vOMCOn9iwiefK66UTA4DTk3jNwIXSl
(base64-encoded PNG data omitted; the rendered figure is the five-panel functional-brightness/DUF-content histogram that this cell also saves as plots/AFDBv4_uniref50_histogram_dark_content.png)\n",
1399 | "text/plain": [
1400 | ""
1401 | ]
1402 | },
1403 | "metadata": {
1404 | "needs_background": "light"
1405 | },
1406 | "output_type": "display_data"
1407 | }
1408 | ],
1409 | "source": [
1410 | "modes = ['Full', 'AFDB', 'AFDB70', 'AFDB90']\n",
1411 | "panel = ['A', 'B', 'C', 'D', 'E']\n",
1412 | "\n",
1413 | "fig, ax = plt.subplots(1, len(panel), figsize=(2.5*len(panel), 3))\n",
1414 | "percentage_dufs = []\n",
1415 | "\n",
1416 | "for j, mode in enumerate(modes):\n",
1417 | " if mode == 'Full':\n",
1418 | " tmp = indata\n",
1419 | " if 'AFDB' in mode:\n",
1420 | " tmp = indata.loc[indata.nAF2.astype(float) > 0]\n",
1421 | " if len(mode.split('AFDB')[-1]) > 0:\n",
1422 | " cut = int(mode.split('AFDB')[-1])\n",
1423 | " tmp = tmp.loc[tmp.AF2_longest_best70_pLDDT.astype(float) >= cut]\n",
1424 | " \n",
1425 | " h,_ = np.histogram(tmp.FULL_noDUF.astype(float), bins=[i for i in range(0, 105, 5)])\n",
1426 | " n_dark = h[0]\n",
1427 | " h = h*100/sum(h)\n",
1428 | "\n",
1429 | " colors = ['#57257F']\n",
1430 | " for i in range(len(h)-2):\n",
1431 | " colors.append('silver')\n",
1432 | " colors.append('white')\n",
1433 | "\n",
1434 | " x = list(range(len(h)))\n",
1435 | " y = list(h)\n",
1436 | "\n",
1437 | " ax[j].bar(x,y,1, align='edge', color=colors, edgecolor='k')\n",
1438 | " ax[j].set_facecolor('#F2F2F2')\n",
1439 | " ax[j].set_xticks(range(0,21,5))\n",
1440 | " ax[j].set_xticklabels(range(0,101,25))\n",
1441 | " ax[j].set_ylabel('% of UniRef50 clusters')\n",
1442 | " ax[j].set_xlabel('Functional Brightness (%)')\n",
1443 | " \n",
1444 | " ax[j].title.set_text('{} {}'.format(panel[j], mode))\n",
1445 | "\n",
1446 | " ax[j].set_ylim(0,100)\n",
1447 | " \n",
1448 | " percentage_dark = round(h[0])\n",
1449 | " ax[j].text(-0.1, percentage_dark+1, '{}%'.format(percentage_dark),\n",
1450 | " verticalalignment='bottom', horizontalalignment='left',\n",
1451 | " color='#57257F', fontsize=9)\n",
1452 | " \n",
1453 | " uniprot_n_dark = sum(tmp.loc[tmp.FULL_noDUF.astype(float) <=5].nACCs.astype(float))\n",
1454 | " print(mode, 'n =', len(tmp), 'n_dark =', n_dark, 'uniprot_n_dark =', uniprot_n_dark, '% uniprot =', uniprot_n_dark*100/sum(tmp.nACCs.astype(float)))\n",
1455 | "\n",
1456 | " uniref_n_dark = sum(tmp.loc[tmp.FULL_noDUF.astype(float) <=5].nUniRef100.astype(float))\n",
1457 | " print(mode, 'n =', len(tmp), 'n_dark =', n_dark, 'uniref100_n_dark =', uniref_n_dark, '% uniref100 =', uniref_n_dark*100/sum(tmp.nUniRef100.astype(float)))\n",
1458 | " \n",
1459 | " percentage_duf = len(tmp.loc[tmp.Has_duf == 1])*100/len(tmp.loc[tmp.FULL_noDUF.astype(float) <=5])\n",
1460 | " print('% UniRef50 dark with dufs =', percentage_duf)\n",
1461 | " print()\n",
1462 | " \n",
1463 | " percentage_dufs.append(percentage_duf)\n",
1464 | "\n",
1465 | "ax[j+1].bar(panel[:-1],percentage_dufs,1, align='center', color=['#57257F' for i in modes], edgecolor='k')\n",
1466 | "ax[j+1].set_facecolor('#F2F2F2')\n",
1467 | "ax[j+1].set_ylabel('% of dark clusters with DUF')\n",
1468 | "ax[j+1].set_xlabel('Set')\n",
1469 | "ax[j+1].title.set_text('({}) DUF content'.format(panel[j+1]))\n",
1470 | "ax[j+1].set_ylim(0,0.2)\n",
1471 | " \n",
1472 | "plt.tight_layout()\n",
1473 | "plt.savefig('plots/AFDBv4_uniref50_histogram_dark_content.pdf')\n",
1474 | "plt.savefig('plots/AFDBv4_uniref50_histogram_dark_content.png', dpi=2000)"
1475 | ]
1476 | },
1477 | {
1478 | "cell_type": "code",
1479 | "execution_count": 6,
1480 | "id": "latest-canberra",
1481 | "metadata": {},
1482 | "outputs": [
1483 | {
1484 | "name": "stdout",
1485 | "output_type": "stream",
1486 | "text": [
1487 | "brightness vs size Correlation AFDB90: 0.0\n"
1488 | ]
1489 | }
1490 | ],
1491 | "source": [
1492 | "print('brightness vs size Correlation {}:'.format(mode), scipy.stats.pearsonr(indata['FULL_noDUF'], indata['nUniRef100'])[1])"
1493 | ]
1494 | },
1495 | {
1496 | "cell_type": "code",
1497 | "execution_count": 7,
1498 | "id": "confidential-mistress",
1499 | "metadata": {},
1500 | "outputs": [
1501 | {
1502 | "data": {
1503 | "text/html": [
1504 | "\n",
1505 | "\n",
1518 | "
\n",
1519 | " \n",
1520 | " \n",
1521 | " | \n",
1522 | " mean | \n",
1523 | " std | \n",
1524 | " median | \n",
1525 | "
\n",
1526 | " \n",
1527 | " | darkness_bins | \n",
1528 | " | \n",
1529 | " | \n",
1530 | " | \n",
1531 | "
\n",
1532 | " \n",
1533 | " \n",
1534 | " \n",
1535 | " | (-0.001, 5.0] | \n",
1536 | " 1.854932 | \n",
1537 | " 7.471334 | \n",
1538 | " 1 | \n",
1539 | "
\n",
1540 | " \n",
1541 | " | (5.0, 10.0] | \n",
1542 | " 3.238855 | \n",
1543 | " 18.929983 | \n",
1544 | " 1 | \n",
1545 | "
\n",
1546 | " \n",
1547 | " | (10.0, 15.0] | \n",
1548 | " 3.051652 | \n",
1549 | " 16.244046 | \n",
1550 | " 1 | \n",
1551 | "
\n",
1552 | " \n",
1553 | " | (15.0, 20.0] | \n",
1554 | " 2.881375 | \n",
1555 | " 15.197970 | \n",
1556 | " 1 | \n",
1557 | "
\n",
1558 | " \n",
1559 | " | (20.0, 25.0] | \n",
1560 | " 2.758930 | \n",
1561 | " 15.388115 | \n",
1562 | " 1 | \n",
1563 | "
\n",
1564 | " \n",
1565 | " | (25.0, 30.0] | \n",
1566 | " 2.670532 | \n",
1567 | " 13.003847 | \n",
1568 | " 1 | \n",
1569 | "
\n",
1570 | " \n",
1571 | " | (30.0, 35.0] | \n",
1572 | " 2.595750 | \n",
1573 | " 11.021570 | \n",
1574 | " 1 | \n",
1575 | "
\n",
1576 | " \n",
1577 | " | (35.0, 40.0] | \n",
1578 | " 2.644472 | \n",
1579 | " 12.318777 | \n",
1580 | " 1 | \n",
1581 | "
\n",
1582 | " \n",
1583 | " | (40.0, 45.0] | \n",
1584 | " 2.672398 | \n",
1585 | " 11.483546 | \n",
1586 | " 1 | \n",
1587 | "
\n",
1588 | " \n",
1589 | " | (45.0, 50.0] | \n",
1590 | " 2.666221 | \n",
1591 | " 10.795142 | \n",
1592 | " 1 | \n",
1593 | "
\n",
1594 | " \n",
1595 | " | (50.0, 55.0] | \n",
1596 | " 2.747724 | \n",
1597 | " 11.707789 | \n",
1598 | " 1 | \n",
1599 | "
\n",
1600 | " \n",
1601 | " | (55.0, 60.0] | \n",
1602 | " 2.814912 | \n",
1603 | " 11.031360 | \n",
1604 | " 1 | \n",
1605 | "
\n",
1606 | " \n",
1607 | " | (60.0, 65.0] | \n",
1608 | " 2.905467 | \n",
1609 | " 12.595227 | \n",
1610 | " 1 | \n",
1611 | "
\n",
1612 | " \n",
1613 | " | (65.0, 70.0] | \n",
1614 | " 2.974943 | \n",
1615 | " 13.504286 | \n",
1616 | " 1 | \n",
1617 | "
\n",
1618 | " \n",
1619 | " | (70.0, 75.0] | \n",
1620 | " 3.055948 | \n",
1621 | " 13.730255 | \n",
1622 | " 1 | \n",
1623 | "
\n",
1624 | " \n",
1625 | " | (75.0, 80.0] | \n",
1626 | " 3.233934 | \n",
1627 | " 14.495583 | \n",
1628 | " 1 | \n",
1629 | "
\n",
1630 | " \n",
1631 | " | (80.0, 85.0] | \n",
1632 | " 3.583951 | \n",
1633 | " 15.663360 | \n",
1634 | " 1 | \n",
1635 | "
\n",
1636 | " \n",
1637 | " | (85.0, 90.0] | \n",
1638 | " 4.109898 | \n",
1639 | " 18.881363 | \n",
1640 | " 1 | \n",
1641 | "
\n",
1642 | " \n",
1643 | " | (90.0, 95.0] | \n",
1644 | " 5.100711 | \n",
1645 | " 24.738266 | \n",
1646 | " 1 | \n",
1647 | "
\n",
1648 | " \n",
1649 | " | (95.0, 100.0] | \n",
1650 | " 18.727700 | \n",
1651 | " 122.980629 | \n",
1652 | " 2 | \n",
1653 | "
\n",
1654 | " \n",
1655 | "
\n",
1656 | "
"
1657 | ],
1658 | "text/plain": [
1659 | " mean std median\n",
1660 | "darkness_bins \n",
1661 | "(-0.001, 5.0] 1.854932 7.471334 1\n",
1662 | "(5.0, 10.0] 3.238855 18.929983 1\n",
1663 | "(10.0, 15.0] 3.051652 16.244046 1\n",
1664 | "(15.0, 20.0] 2.881375 15.197970 1\n",
1665 | "(20.0, 25.0] 2.758930 15.388115 1\n",
1666 | "(25.0, 30.0] 2.670532 13.003847 1\n",
1667 | "(30.0, 35.0] 2.595750 11.021570 1\n",
1668 | "(35.0, 40.0] 2.644472 12.318777 1\n",
1669 | "(40.0, 45.0] 2.672398 11.483546 1\n",
1670 | "(45.0, 50.0] 2.666221 10.795142 1\n",
1671 | "(50.0, 55.0] 2.747724 11.707789 1\n",
1672 | "(55.0, 60.0] 2.814912 11.031360 1\n",
1673 | "(60.0, 65.0] 2.905467 12.595227 1\n",
1674 | "(65.0, 70.0] 2.974943 13.504286 1\n",
1675 | "(70.0, 75.0] 3.055948 13.730255 1\n",
1676 | "(75.0, 80.0] 3.233934 14.495583 1\n",
1677 | "(80.0, 85.0] 3.583951 15.663360 1\n",
1678 | "(85.0, 90.0] 4.109898 18.881363 1\n",
1679 | "(90.0, 95.0] 5.100711 24.738266 1\n",
1680 | "(95.0, 100.0] 18.727700 122.980629 2"
1681 | ]
1682 | },
1683 | "execution_count": 7,
1684 | "metadata": {},
1685 | "output_type": "execute_result"
1686 | }
1687 | ],
1688 | "source": [
1689 | "indata.groupby(['darkness_bins'])['nUniRef100'].agg([np.mean, np.std, np.median])"
1690 | ]
1691 | },
1692 | {
1693 | "cell_type": "markdown",
1694 | "id": "behavioral-veteran",
1695 | "metadata": {},
1696 | "source": [
1697 | "# 2. Define AFDB90 set and collect all associated sequences from previously contructed mongoDB\n",
1698 | "\n",
1699 | "The AFDB90 set corresponds to those UniRef50 clusters where the longest member with a pLDDT >70% has a pLDDT >90%. We thus select only those from the table above and save them as a table and the corresponding fasta file. \n",
1700 | "\n",
1701 | "The table will be used for further analysis in the other jupyter notebooks. The fasta file will be used for the all-against-all mmseqs2 searches that make the base of the sequence similarity network."
1702 | ]
1703 | },
1704 | {
1705 | "cell_type": "code",
1706 | "execution_count": 8,
1707 | "id": "joint-sample",
1708 | "metadata": {},
1709 | "outputs": [],
1710 | "source": [
1711 | "AFDB90 = indata.loc[indata.AF2_longest_best70_pLDDT.astype(float) >= 90]\n",
1712 | "AFDB90.to_csv('data_generated_v2/AFDB90v4_data.csv')"
1713 | ]
1714 | },
1715 | {
1716 | "cell_type": "code",
1717 | "execution_count": null,
1718 | "id": "cardiac-instrument",
1719 | "metadata": {},
1720 | "outputs": [],
1721 | "source": [
1722 | "dbuilder_path = None # change accordingly\n",
1723 | "\n",
1724 | "import sys\n",
1725 | "import os\n",
1726 | "sys.path.append(dbuilder_path)\n",
1727 | "\n",
1728 | "import extract_uniprot as uniprot\n",
1729 | "\n",
1730 | "MONGO_HOST = \"10.1.0.202\"\n",
1731 | "MONGO_PORT = 30077\n",
1732 | "\n",
1733 | "uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)"
1734 | ]
1735 | },
1736 | {
1737 | "cell_type": "code",
1738 | "execution_count": null,
1739 | "id": "quality-harris",
1740 | "metadata": {},
1741 | "outputs": [],
1742 | "source": [
1743 | "outfasta = 'data_generated_v2/AFDBv4_90.fasta'\n",
1744 | "\n",
1745 | "count = 0\n",
1746 | "step = 100000\n",
1747 | "\n",
1748 | "target_ids = [i.split('_')[1] for i in AFDB90.index]\n",
1749 | "n_entries = len(target_ids)\n",
1750 | "\n",
1751 | "chuncks = [target_ids[i:i+step] if i+step < len(target_ids) else target_ids[i:] for i in range(0, n_entries, step)]\n",
1752 | "collected_ids = []\n",
1753 | "\n",
1754 | "print('Getting sequences for {} chuncks'.format(len(chuncks)))\n",
1755 | " \n",
1756 | "with open(outfasta, 'w') as out:\n",
1757 | " for i, chunck in enumerate(chuncks):\n",
1758 | " documents = uniprot_db.col.find({'_id': {'$in': chunck}})\n",
1759 | " for doc in documents:\n",
1760 | " out.write('>{}\\n{}\\n'.format(doc['_id'], doc['data']['SEQ']))\n",
1761 | "\n",
1762 | " "
1763 | ]
1764 | }
1765 | ],
1766 | "metadata": {
1767 | "kernelspec": {
1768 | "display_name": "Python 3 (ipykernel)",
1769 | "language": "python",
1770 | "name": "python3"
1771 | },
1772 | "language_info": {
1773 | "codemirror_mode": {
1774 | "name": "ipython",
1775 | "version": 3
1776 | },
1777 | "file_extension": ".py",
1778 | "mimetype": "text/x-python",
1779 | "name": "python",
1780 | "nbconvert_exporter": "python",
1781 | "pygments_lexer": "ipython3",
1782 | "version": "3.6.6"
1783 | }
1784 | },
1785 | "nbformat": 4,
1786 | "nbformat_minor": 5
1787 | }
1788 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2021 schwede
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AFDB90v4
2 |
3 | This repository contains all analysis code, data and metadata generated for the current submission of our manuscript "What is hidden in the darkness? Deep-learning assisted large-scale protein family curation uncovers novel protein families and folds".
4 |
5 | ## Repo organisation
6 |
7 | The code is organised into Python notebooks (for major data analysis), Python scripts (for large-scale data generation and processing) and bash scripts. The notebooks are divided into four main analysis tasks and describe which scripts were used to generate and analyse the data (also provided precomputed at https://zenodo.org/record/8121336).
8 |
9 | ## How to use this repo
10 |
11 | To use the code in this repo, download it together with the data available on [Zenodo](https://zenodo.org/record/8121336) (a download example is given at the end of this README) and follow the Jupyter notebooks from 1 to 4. Each notebook corresponds to a specific analysis step and lists which scripts were run to generate the data to be analysed.
12 |
13 | The code in `make_shapemers.py` for the AFDB dataset is written for the entire AFDB download (with tar and zip archives, etc.), but there are also functions for running it on individual files or folders with PDB/CIF files; a usage sketch is given below.
14 |
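15 | Roughly, running it on a local folder of structures might look like the sketch below. Note this is only a sketch: `process_folder` is a **hypothetical** name standing in for those folder-level functions -- check `scripts/make_shapemers.py` for the actual entry points and their signatures.
16 | 
17 | ```python
18 | # hypothetical usage sketch -- the helper name and signature are assumptions,
19 | # not the actual API of make_shapemers.py; see that script for the real
20 | # folder-level functions
21 | import sys
22 | 
23 | sys.path.append('scripts')
24 | import make_shapemers  # provided in this repository
25 | 
26 | # assumed: reads PDB/CIF files from a folder and writes shape-mers to an output file
27 | make_shapemers.process_folder('my_structures/', 'my_shapemers.txt')
28 | ```
29 | 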
15 | A script to predict outlier scores for user input proteins is coming soon.
16 |
17 | ## Dependencies
18 |
19 | The code was written in Python 3.6 (network generation and analysis) and 3.9+ (shape-mer generation and outlier detection).
20 |
21 | For the *analysis of the data*, common, standard python modules were used. Extra modules required are:
22 | - networkx
23 | - scipy
24 | - seaborn
25 | - pandas
26 | - datashader
27 | - geometricus
28 | - torch
29 | - numba
30 | - numpy
31 | - tqdm
33 | - gensim
34 | - scikit-learn
35 |
36 | For the *generation* of the data, we used the `dbuilder` package, which is part of the ProteinUniverseAtlas project and can be found at https://github.com/ProteinUniverseAtlas/dbuilder. Shape-mers were generated using the trained ShapemerLearn model from geometricus; the model and training code can be found at [https://github.com/TurtleTools/geometricus/tree/master/training](https://github.com/TurtleTools/geometricus/tree/master/training).
37 |
38 |
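39 | ## Example: downloading the precomputed data
40 | 
41 | A minimal sketch, using only the standard library, for fetching a single precomputed file from the Zenodo record. The file name below is a placeholder -- check the record page for the actual file names.
42 | 
43 | ```python
44 | # minimal sketch: download one file from Zenodo record 8121336
45 | # (the file name is a placeholder; see https://zenodo.org/record/8121336)
46 | import urllib.request
47 | 
48 | record = '8121336'
49 | filename = 'AFDB90v4_data.csv'  # placeholder -- replace with a real file name
50 | url = 'https://zenodo.org/record/{}/files/{}?download=1'.format(record, filename)
51 | urllib.request.urlretrieve(url, filename)
52 | ```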
--------------------------------------------------------------------------------
/plots/AFDB90v4_component_darkness_histogram.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_component_darkness_histogram.pdf
--------------------------------------------------------------------------------
/plots/AFDB90v4_component_darkness_histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_component_darkness_histogram.png
--------------------------------------------------------------------------------
/plots/AFDB90v4_histogram_dark_content.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_histogram_dark_content.pdf
--------------------------------------------------------------------------------
/plots/AFDB90v4_histogram_dark_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_histogram_dark_content.png
--------------------------------------------------------------------------------
/plots/AFDBv4_uniref50_histogram_components_word_diversity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_components_word_diversity.pdf
--------------------------------------------------------------------------------
/plots/AFDBv4_uniref50_histogram_components_word_diversity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_components_word_diversity.png
--------------------------------------------------------------------------------
/plots/AFDBv4_uniref50_histogram_dark_content.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_dark_content.pdf
--------------------------------------------------------------------------------
/plots/AFDBv4_uniref50_histogram_dark_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_dark_content.png
--------------------------------------------------------------------------------
/plots/community_cosmograph_layout_darkness.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/community_cosmograph_layout_darkness.png
--------------------------------------------------------------------------------
/plots/component159_community_cosmograph_layout_darkness.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/component159_community_cosmograph_layout_darkness.png
--------------------------------------------------------------------------------
/plots/outliers.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/outliers.pdf
--------------------------------------------------------------------------------
/scripts/AFDBv4_DUF_analysis_dark.py:
--------------------------------------------------------------------------------
1 | path_to_dbuilder = None  # change accordingly: path to a local copy of dbuilder
2 |
3 | import sys
4 | import os
5 |
6 | sys.path.append(path_to_dbuilder)
7 | from src import extract_uniref as uniref
8 | from src import extract_interpro as interpro
9 | from src import extract_uniparc as uniparc
10 | from src import extract_uniprot as uniprot
11 |
12 | import time
13 | import numpy as np
14 |
15 | def write_data_to_file(data, target_uniref, outfile):
16 |
17 | if not os.path.isfile(outfile):
18 | with open(outfile, 'w') as outp:
19 | outp.write(','.join(sorted(list(data.keys()))))
20 | outp.write('\n')
21 |
22 | with open(outfile, 'a+') as outp:
23 | for i in range(len(data['unirefID'])):
24 | line_data = [str(data[key][i]) if data[key][i] is not None else str(np.nan) for key in sorted(list(data.keys()))]
25 | outp.write(','.join(line_data))
26 | outp.write('\n')
27 |
28 | return {key: [] for key in data}
29 |
30 |
31 | # LOAD TARGET UNIREF DATABASE
32 |
33 | target_uniref = sys.argv[1] # either UniRef90 or UniRef50
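34 | # example invocation: python3 AFDBv4_DUF_analysis_dark.py UniRef50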
34 |
35 | MONGO_HOST = "10.1.0.202"
36 | MONGO_PORT = 30077
37 |
38 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref)
39 | uniref_db.index_db()
40 |
41 | interpro_db = interpro.interpro_db_diggested(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
42 | interpro_db.index_db()
43 |
44 | uniparc_db = uniparc.uniparc_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
45 | uniparc_db.index_db()
46 |
47 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
48 | uniprot_db.index_db()
49 |
50 | # COUNT ENTRIES IN THE UNIREF DATABASE
51 |
52 | print('\nCOUNTING DATABASE SIZE')
53 |
54 | n_entries = uniref_db.col.count_documents({})
55 | print(' ... FOUND {} ENTRIES'.format(n_entries))
56 |
57 | # DEFINE OUTPUT FILE AND CHECK THE UNIREF IDS ALREADY THERE
58 |
59 | print('\nDEFINING OUTPUT FILE')
60 |
61 | outfile = 'AFDBv4_DUF_dark_diggestion_{}_2023-02-06.csv'.format(target_uniref)
62 |
63 | # GO THROUGH EACH CHUNCK AND COLLECT THE TARGET DATA
64 |
65 | start = time.time()
66 |
67 | print('\nGOING THROUGH THE CHUNCKS')
68 |
69 | write_step = 50000
70 |
71 | data = {'unirefID': [], 'Has_duf': []}
72 |
73 | curr_count = 0
74 | for document in uniref_db.col.find():
75 | uniref_id = document['_id']
76 | uniref_dt = document['data']
77 |
78 | curr_count += 1
79 |
80 | if uniref_dt['DARKNESS']['FULL_noDUF'] <= 5:
81 | rep = uniref_dt['DARKNESS']['REP']
82 | has_duf = 0
83 | domains = []
84 |
85 | if rep is not None:
86 | if not rep.startswith('UP'):
87 | try:
88 | domains = interpro_db.query(rep)[0]['data']
89 | except:
90 | pass
91 |
92 | uniprot_dt = uniprot_db.query(rep)[0]['data']
93 | if 'CHAINS' in uniprot_dt:
94 | domains += uniprot_dt['CHAINS']
95 |
96 | else:
97 | try:
98 | domains = uniparc_db.query(rep)[0]['data']['ANNO']
99 | except:
100 | pass
101 |
102 | if len(domains) > 0:
103 | for domain in domains:
104 | if 'DUF' in domain[0]:
105 | has_duf = 1
106 |
107 | data['unirefID'].append(uniref_id)
108 | data['Has_duf'].append(has_duf)
109 |
110 | if curr_count % write_step == 0:
111 | data = write_data_to_file(data, target_uniref, outfile)
112 |
113 | numb_seconds = time.time() - start
114 | time_to_end = round(((numb_seconds/curr_count)*n_entries)-numb_seconds)
115 |
116 | print('{} out of {}'.format(curr_count, n_entries), 'Time passed since start: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))))
117 |
118 |
119 | data = write_data_to_file(data, target_uniref, outfile)
120 |
121 | numb_seconds = time.time() - start
122 | print('\nFINISHED AFTER: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
123 |
124 |
--------------------------------------------------------------------------------
/scripts/AFDBv4_pLDDT_analysis.py:
--------------------------------------------------------------------------------
1 | path_to_dbuilder = None  # change accordingly: path to a local copy of dbuilder
2 |
3 | import sys
4 | import os
5 |
6 | sys.path.append(path_to_dbuilder)
7 | from src import extract_uniref as uniref
8 | from src import extract_alphafold as alphafold
9 | from src import extract_uniprot as uniprot
10 |
11 | import time
12 | import numpy as np
13 |
14 | def write_data_to_file(data, target_uniref, outfile):
15 |
16 | if not os.path.isfile(outfile):
17 | with open(outfile, 'w') as outp:
18 | outp.write(','.join(sorted(list(data.keys()))))
19 | outp.write('\n')
20 |
21 | with open(outfile, 'a+') as outp:
22 | for i in range(len(data['unirefID'])):
23 | line_data = [str(data[key][i]) if data[key][i] is not None else str(np.nan) for key in sorted(list(data.keys()))]
24 | outp.write(','.join(line_data))
25 | outp.write('\n')
26 |
27 | return {key: [] for key in data}
28 |
29 |
30 | # LOAD TARGET UNIREF DATABASE
31 |
32 | target_uniref = sys.argv[1] # either UniRef90 or UniRef50
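33 | # example invocation: python3 AFDBv4_pLDDT_analysis.py UniRef50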
33 |
34 | MONGO_HOST = "10.1.0.202"
35 | MONGO_PORT = 30077
36 |
37 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref)
38 | uniref_db.index_db()
39 |
40 | alphafold_db = alphafold.alphafold_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
41 | alphafold_db.index_db()
42 |
43 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
44 | uniprot_db.index_db()
45 |
46 | # COUNT ENTRIES IN THE UNIREF DATABASE
47 |
48 | print('\nCOUNTING DATABASE SIZE')
49 |
50 | n_entries = uniref_db.col.count_documents({})
51 | print(' ... FOUND {} ENTRIES'.format(n_entries))
52 |
53 | # DEFINE OUTPUT FILE AND CHECK THE UNIREF IDS ALREADY THERE
54 |
55 | print('\nDEFINING OUTPUT FILE')
56 |
57 | outfile = 'data_generated/AFDBv4_pLDDT_diggestion_{}.csv'.format(target_uniref)
58 |
59 | # GO THROUGH EACH CHUNCK AND COLLECT THE TARGET DATA
60 |
61 | start = time.time()
62 |
63 | print('\nGOING THROUGH THE CHUNCKS')
64 |
65 | write_step = 50000
66 |
67 | data = {'unirefID': [], 'median_pLDDT': [], 'max_pLDDT': [], 'min_pLDDT': [], 'delta_pLDDT': [], 'nACCs': [],
68 | 'nUniRef100': [], 'nUniRef90': [], 'nAF2': [],'AF2_REP_best_len': [], 'AF2_REP_worst_len': [],
69 | 'AF2_REP_best': [], 'AF2_REP_worst': [], 'AF2_longest_best70': [],'AF2_longest_best70_pLDDT': [],
70 | 'AF2_longest_best70_len': [], 'median_Evidence': []}
71 | # , 'uniref_rep_len': []}
72 |
73 | curr_count = 0
74 | for document in uniref_db.col.find():
75 | uniref_id = document['_id']
76 | uniref_dt = document['data']
77 |
78 | curr_count += 1
79 |
80 | for key in uniref_dt['DARKNESS']:
81 | if key != 'pLDDTs':
82 | if 'AF2_REP' in key:
83 | if uniref_dt['DARKNESS'][key] is not None:
84 | data['{}_len'.format(key)].append(uniref_dt['DARKNESS'][key]['LEN'])
85 | data[key].append(uniref_dt['DARKNESS'][key]['ACC'])
86 |
87 | else:
88 | data['{}_len'.format(key)].append(np.nan)
89 | data[key].append(None)
90 |
91 | else:
92 | if key not in data:
93 | data[key] = []
94 | try:
95 | data[key].append(round(uniref_dt['DARKNESS'][key], 2))
96 | except:
97 | data[key].append(uniref_dt['DARKNESS'][key])
98 |
99 | if key == 'pLDDTs':
100 | if len(uniref_dt['DARKNESS'][key]) > 0:
101 | median = np.median(uniref_dt['DARKNESS'][key])
102 | maximum = max(uniref_dt['DARKNESS'][key])
103 | minimum = min(uniref_dt['DARKNESS'][key])
104 |                 delta = minimum - maximum  # note: non-positive by construction; its magnitude is the pLDDT range
105 |
106 | data['median_pLDDT'].append(median)
107 | data['max_pLDDT'].append(maximum)
108 | data['min_pLDDT'].append(minimum)
109 | data['delta_pLDDT'].append(delta)
110 |
111 | else:
112 | data['median_pLDDT'].append(np.nan)
113 | data['max_pLDDT'].append(np.nan)
114 | data['min_pLDDT'].append(np.nan)
115 | data['delta_pLDDT'].append(np.nan)
116 |
117 | # get the longest protein with a pLDDT > 70
118 | longest_pLDDT_best70 = None
119 | best_pLDDT = None
120 | best_n_res = 0
121 |
122 | af_docs = alphafold_db.col.find({'_id': {'$in': uniref_dt['ACC']}})
123 | for af_document in af_docs:
124 | curr_dt = af_document['data']
125 |
126 | # get plddt
127 | avgPLDDT = []
128 | n_res = 0
129 |         for fragment in curr_dt:
130 |             avgPLDDT.append(curr_dt[fragment]['pLDDT']['avg_pLDDT']*curr_dt[fragment]['pLDDT']['Lenght'])  # 'Lenght' (sic) is the key spelling used in the DB documents
131 |             n_res += curr_dt[fragment]['pLDDT']['Lenght']
132 |
133 | fullprotein_pLDDT = sum(avgPLDDT)/n_res
134 |
135 | if fullprotein_pLDDT > 70 and n_res > best_n_res:
136 | longest_pLDDT_best70 = af_document['_id']
137 | best_pLDDT = fullprotein_pLDDT
138 | best_n_res = n_res
139 |
140 | if best_n_res == 0:
141 | best_n_res = None
142 |
143 | # rep_length = len(uniprot_db.query(uniref_id.split('_')[-1])[0]['data']['SEQ'])
144 |
145 | # get the median evidence level
146 | evidence_level = []
147 | uniprot_docs = uniprot_db.col.find({'_id': {'$in': uniref_dt['ACC']}})
148 | for up_document in uniprot_docs:
149 | evidence_level.append(up_document['data']['EVIDENCE']['LEVEL'])
150 | if len(evidence_level) > 0:
151 | median_evidence = np.median(evidence_level)
152 | else:
153 | median_evidence = np.nan
154 |
155 | data['nACCs'].append(len(uniref_dt['ACC']))
156 | data['nUniRef100'].append(len(uniref_dt['UNIREF']['UniRef100']))
157 | data['nUniRef90'].append(len(uniref_dt['UNIREF']['UniRef90']))
158 | data['nAF2'].append(len(uniref_dt['DARKNESS']['pLDDTs']))
159 | data['unirefID'].append(uniref_id)
160 | # data['uniref_rep_len'].append(rep_length)
161 | data['AF2_longest_best70'].append(longest_pLDDT_best70)
162 | data['AF2_longest_best70_len'].append(best_n_res)
163 | data['AF2_longest_best70_pLDDT'].append(best_pLDDT)
164 | data['median_Evidence'].append(median_evidence)
165 |
166 | if curr_count % write_step == 0:
167 | data = write_data_to_file(data, target_uniref, outfile)
168 |
169 | numb_seconds = time.time() - start
170 | time_to_end = round(((numb_seconds/curr_count)*n_entries)-numb_seconds)
171 |
172 | print('{} out of {}'.format(curr_count, n_entries), 'Time passed since start: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))))
173 |
174 |
175 | data = write_data_to_file(data, target_uniref, outfile)
176 |
177 | numb_seconds = time.time() - start
178 | print('\nFINISHED AFTER: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
179 |
180 |
--------------------------------------------------------------------------------
/scripts/get_communities_summary.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pandas as pd
4 | import json
5 | import networkx as nx
6 | import numpy as np
7 |
8 | import scipy
9 | from scipy import stats
10 |
11 | from multiprocessing.pool import ThreadPool
12 | from collections import Counter
13 |
14 | import sys
15 |
16 | # LOAD MY DBs
17 |
18 | path_to_dbuilder = None  # change accordingly: path to a local copy of dbuilder
19 |
20 | sys.path.append(path_to_dbuilder)
21 |
22 | from src import extract_uniref as uniref
23 | from src import extract_uniprot as uniprot
24 | from src import extract_uniparc as uniparc
25 |
26 | target_uniref = 'UniRef50'
27 |
28 | MONGO_HOST = "10.1.0.202"
29 | MONGO_PORT = 30077
30 |
31 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref)
32 | uniref_db.index_db()
33 |
34 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
35 | uniprot_db.index_db()
36 |
37 | uniparc_db = uniparc.uniparc_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
38 | uniparc_db.index_db()
39 |
40 |
41 | # GET INPUTS (example invocation: python3 get_communities_summary.py <AFDB_data.csv> <uniprot_taxonomy.csv> <threads>)
42 | AFDB_data = sys.argv[1]    # per-UniRef50 table with a communityID column
43 | uniprot_tax = sys.argv[2]  # taxonomy table with communityIDs and superkingdom columns
44 | threads = int(sys.argv[3]) # number of worker threads
45 | infolder = AFDB_data.split('/')[-2]  # outputs are written to the folder containing the input table
46 |
47 | # LOAD INPUTS
48 |
49 | print('1. Loading AFDB data')
50 | AFDB90_CC = pd.read_csv(AFDB_data, dtype = {'communityID': str})
51 | AFDB90_CC = AFDB90_CC.sort_values(by='unirefID')
52 | AFDB90_CC = AFDB90_CC.set_index("unirefID")
53 |
54 | print('2. Loading taxonomy data')
55 | taxonomy = pd.read_csv(uniprot_tax, index_col=0)
56 |
57 | print('3. Getting outlier data')
58 | outliers = '/scicore/home/schwede/durair0000/projects/turtle_tools/afdb-geometricus/data/outlier_results/'
59 | outliers_data = {}
60 |
61 | files = sorted(os.listdir(outliers))
62 | for i, file in enumerate(files):
63 | if i % 10 == 0:
64 | print(i, len(files))
65 |
66 | with open('{}/{}'.format(outliers, file)) as f:
67 | for line in f:
68 | line = line.strip().split()
69 | uniprot_id = line[0].split('-')[0]
70 | if uniprot_id not in outliers_data:
71 | outliers_data[uniprot_id] = [float(line[1])]
72 | else:
73 | outliers_data[uniprot_id].append(float(line[1]))
74 |
75 |
76 | # DEFINE ROUTINES
77 |
78 | def chunk_list(l, n):
79 |
80 | chunks = np.array_split(np.array(l), n)
81 |
82 | chunks = [list(chunk) for chunk in chunks]
83 | return chunks
84 |
85 | def get_tax_from_menzi(subgr_members, max_chunk_size=10000):
86 | 
87 |     superkingdoms = []
88 | 
89 |     for unirefID in subgr_members.index:
90 | 
91 |         curr_accs = uniref_db.query(unirefID)[0]['data']['UNIREF']['UniRef100']
92 |         curr_accs = [i.split('_')[-1] for i in curr_accs]
93 | 
94 |         if len(curr_accs) > max_chunk_size:
95 |             # split into ceil(len(curr_accs)/max_chunk_size) chunks
96 |             n_chunks = int(np.ceil(len(curr_accs)/max_chunk_size))
97 |             chunks = chunk_list(curr_accs, n_chunks)
98 |         else:
99 |             chunks = [curr_accs]
100 | 
101 |         for chunk in chunks:
102 | 
103 |             up_docs = uniprot_db.col.find({'_id': {'$in': chunk}})
107 | for doc in up_docs:
108 | acc = doc['_id']
109 | try:
110 | tax = doc['data']['TAXID'][2][0]
111 | superkingdoms.append(tax)
112 | except:
113 | pass
114 |
115 | uparc_docs = uniparc_db.col.find({'_id': {'$in': curr_accs}})
116 | for doc in uparc_docs:
117 | acc = doc['_id']
118 | try:
119 | tax = doc['data']['TAXID'][2][0]
120 | superkingdoms.append(tax)
121 | except:
122 | pass
123 |
124 | try:
125 | count = Counter(superkingdoms)
126 | return count.most_common(1)[0][0], count.most_common(1)[0][1]*100/len(superkingdoms)
127 | except:
128 | return np.nan, np.nan
129 |
130 | def get_communities_summary_for_communities(arguments):
131 |
132 | target_communities = arguments[0]
133 | AFDB90_CC = arguments[1]
134 | taxonomy = arguments[2]
135 | outlier_data = arguments[3]
136 | thread_id = arguments[4]
137 |
138 | curr_data = AFDB90_CC.loc[AFDB90_CC.communityID.isin(target_communities)]
139 | curr_tax = taxonomy.loc[taxonomy.communityIDs.isin(target_communities)]
140 |
141 | start = time.time()
142 |
143 | communities_summary = {'Community': [], 'Subgraph': [], 'Avg_darkness': [], 'SD_darkness': [],
144 | 'Avg_outlier_score': [], 'SD_outlier_score':[], 'N_members': [], 'TM': [],
145 | 'SP': [], 'Median_length': [], 'MAD_length': [], 'Median_darkness': [], 'MAD_darkness': [],
146 | 'Median_representative': [], 'Longest_representative': [], 'Median_rep_title': [],
147 | 'Mode_superkingdom': [], 'Freq_superkingdom': []}
148 |
149 | n_expected = len(target_communities)
150 | for i, community_class in enumerate(target_communities):
151 |
152 |         subgr_members = curr_data.loc[curr_data.communityID == community_class].copy()  # .copy() so adding dist_to_median below does not warn on a slice
153 |
154 | communities_summary['Subgraph'].append(community_class.split('[')[0])
155 | communities_summary['Community'].append(community_class)
156 |
157 | communities_summary['Avg_darkness'].append(np.mean(subgr_members.FULL_noDUF.astype(float)))
158 | communities_summary['SD_darkness'].append(np.std(subgr_members.FULL_noDUF.astype(float)))
159 |
160 | communities_summary['Median_darkness'].append(np.median(subgr_members.FULL_noDUF.astype(float)))
161 | communities_summary['MAD_darkness'].append(stats.median_abs_deviation(subgr_members.FULL_noDUF.astype(float)))
162 |
163 | outlier_scores = [outlier_data[i] for i in subgr_members.AF2_longest_best70 if i in outlier_data]
164 | if len(outlier_scores) > 0:
165 | communities_summary['Avg_outlier_score'].append(np.mean(outlier_scores))
166 | communities_summary['SD_outlier_score'].append(np.std(outlier_scores))
167 | else:
168 | communities_summary['Avg_outlier_score'].append(np.nan)
169 | communities_summary['SD_outlier_score'].append(np.nan)
170 |
171 | communities_summary['N_members'].append(len(subgr_members))
172 |
173 | communities_summary['TM'].append((len(subgr_members)-list(subgr_members.TM).count(0))*100/len(subgr_members))
174 | communities_summary['SP'].append((len(subgr_members)-list(subgr_members.SP).count(0))*100/len(subgr_members))
175 |
176 | median = np.median(subgr_members.AF2_longest_best70_len.astype(float))
177 | communities_summary['Median_length'].append(median)
178 | communities_summary['MAD_length'].append(stats.median_abs_deviation(subgr_members.AF2_longest_best70_len.astype(float)))
179 |
180 | subgr_members['dist_to_median'] = abs(subgr_members.AF2_REP_best_len - median)
181 |
182 |         median_rep = subgr_members.sort_values(by='dist_to_median', ascending=True).AF2_longest_best70.iloc[0]
183 | communities_summary['Median_representative'].append(median_rep)
184 |
185 |         longest_rep = subgr_members.sort_values(by='AF2_REP_best_len', ascending=False).AF2_longest_best70.iloc[0]
186 | communities_summary['Longest_representative'].append(longest_rep)
187 |
188 | median_title = uniprot_db.query(median_rep)[0]['data']['NAME']['TITLE']
189 | communities_summary['Median_rep_title'].append(median_title)
190 |
191 | try:
192 | subgrp_tax = curr_tax.loc[curr_tax.communityIDs == community_class]
193 | tax = subgrp_tax.superkingdom.mode()[0]
194 | frq = subgrp_tax['superkingdom'].value_counts()[tax]*100/len(subgrp_tax)
195 | except:
196 | tax, frq = get_tax_from_menzi(subgr_members)
197 |
198 | communities_summary['Mode_superkingdom'].append(tax)
199 | communities_summary['Freq_superkingdom'].append(frq)
200 |
201 | if i % 100 == 0:
202 | numb_seconds = time.time() - start
203 | time_to_end = round(((numb_seconds/(i+1))*n_expected)-numb_seconds)
204 | print('thread {}:'.format(thread_id), i+1, n_expected, 'CURR COMMUNITY:', community_class, 'CURR TITLE', median_title, ' ... Time passed: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))), flush = True)
205 |
206 | return communities_summary
207 |
208 |
209 | def get_communities_summary(AFDB90_CC, outlier_data, taxonomy, threads):
210 |
211 | target_communities = list(set(AFDB90_CC.communityID))
212 | separated_jobs = chunk_list(target_communities, threads)
213 |
214 | list_arguments = [i for i in zip(separated_jobs, [AFDB90_CC for job in separated_jobs], [taxonomy for job in separated_jobs], [outlier_data for job in separated_jobs], range(threads))]
215 |
216 | pool = ThreadPool(threads)
217 |     results = pool.imap_unordered(get_communities_summary_for_communities, list_arguments)
218 |
219 | all_results = {}
220 | for dic in results:
221 | for key in dic:
222 | if key not in all_results:
223 | all_results[key] = dic[key]
224 | else:
225 | all_results[key] += dic[key]
226 |
227 | all_results = pd.DataFrame(all_results)
228 | all_results = all_results.set_index('Community')
229 | all_results = all_results.sort_values(by='N_members', ascending=False)
230 |
231 | return all_results
232 |
233 |
234 | # GET COMMUNITIES SUMMARY
235 |
236 | print('4. Getting communities summary')
237 |
238 | if not os.path.isfile('{}/communities_summary_noreps.csv'.format(infolder)):
239 |     communities_summary = get_communities_summary(AFDB90_CC, outliers_data, taxonomy, threads=threads)
240 | communities_summary.to_csv('{}/communities_summary_noreps.csv'.format(infolder))
241 |
242 | else:
243 | communities_summary = pd.read_csv('{}/communities_summary_noreps.csv'.format(infolder), dtype = {'Community': str})
244 | communities_summary = communities_summary.set_index("Community")
245 |
246 | communities_summary.to_csv('{}/communities_summary.csv'.format(infolder))
--------------------------------------------------------------------------------
/scripts/get_connected_components.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import json
4 | import networkx as nx
5 | import numpy as np
6 | import time
7 | import gzip
8 | import io
9 |
10 | from networkx.algorithms.community import asyn_lpa_communities
11 |
12 | # LOAD INPUTS
13 |
14 | infasta = sys.argv[1]
15 | inmmsqs = sys.argv[2]
16 | outfolder = sys.argv[3]
17 |
18 | if not os.path.isdir(outfolder):
19 | os.mkdir(outfolder)
20 |
21 | # helping routines
22 |
23 | def get_seqs_index(infasta, indx = None):
24 |     indx = {} if indx is None else indx  # avoid a mutable default argument
25 |     print('\nReading sequences from input fasta and generating node index')
26 |
27 | start = time.time()
28 |
29 | count = len(indx)
30 |
31 | if infasta.endswith('.gz'):
32 | with gzip.open(infasta, 'rb') as inf:
33 | with io.TextIOWrapper(inf, encoding='utf-8') as decoder:
34 | for line in decoder:
35 | if line.startswith('>'):
36 | if '|' in line:
37 | line = line.split('|')[1].strip('>')
38 | else:
39 | line = line.split()[0].strip('>')
40 | indx[line] = count
41 | count+=1
42 |
43 | else:
44 | with open(infasta, 'r') as inf:
45 | for line in inf:
46 | if line.startswith('>'):
47 | if '|' in line:
48 | line = line.split('|')[1].strip('>')
49 | else:
50 | line = line.split()[0].strip('>')
51 | indx[line] = count
52 | count+=1
53 |
54 | print(' ... No. of expected nodes:', len(indx))
55 |
56 | numb_seconds = time.time() - start
57 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
58 |
59 | return indx
60 |
61 |
62 | def get_neighbors_simple(inmmsqs, indexes, mineval = 1e-4, mincov = 0, simplex = True, nmax=None):
63 |
64 | print('\nCollecting edges from input mmseqs file')
65 |
66 | start = time.time()
67 |
68 | edges = set()
69 | nodes = set()
70 |
71 | if not simplex:
72 | edges = {}
73 |
74 | mincov = mincov/100
75 | previous_len = None
76 | with open(inmmsqs, 'r') as inmm:
77 | for line in inmm:
78 | line = line.split('\t')
79 | i, j, cov, evalue = line[0], line[1], line[2], line[10]
80 |
81 | if i in indexes and j in indexes:
82 | evalue = float(evalue.strip())
83 | cov = float(cov)
84 |
85 | if i != j and evalue <= mineval and cov >= mincov:
86 | if simplex:
87 | # edges.add(tuple(sorted([indexes[i], indexes[j]])))
88 | edges.add(tuple(sorted([i, j])))
89 | else:
90 | # edge = sorted([indexes[i], indexes[j]])
91 | edge = sorted([i, j])
92 | if edge[0] in edges:
93 | if edge[1] in edges[edge[0]]:
94 | if edges[edge[0]][edge[1]][0] > evalue:
95 | edges[edge[0]][edge[1]] = (evalue, cov)
96 | else:
97 | edges[edge[0]][edge[1]] = (evalue, cov)
98 | else:
99 | edges[edge[0]] = {edge[1]: (evalue, cov)}
100 |
101 | nodes.add(i)
102 | nodes.add(j)
103 |
104 | if len(nodes)> 0 and len(nodes) % 100000 == 0 and len(nodes) != previous_len:
105 | print(len(nodes))
106 | previous_len = len(nodes)
107 |
108 | if nmax is not None and len(nodes) == nmax:
109 | break
110 |
111 | print(' ... Total number of hubs:', len(edges))
112 | print(' ... Total number of nodes:', len(nodes))
113 |
114 | numb_seconds = time.time() - start
115 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
116 |
117 | return edges, nodes
118 |
119 | def generate_pairs(neighbors, topN = None, min_weight = 0):
120 | 
121 |     print('\nRemoving redundant edges')
122 | 
123 |     start = time.time()
124 | 
125 |     edges = dict()
126 |     weights = list()
127 | 
128 |     for i in neighbors:
129 |         if topN is not None:
130 |             curr_neighbors = {k: v for k, v in sorted(neighbors[i].items(), key=lambda item: item[1])[:topN]}  # keep the topN lowest-E-value hits
131 |         else:
132 |             curr_neighbors = neighbors[i]
133 | 
134 |         for j in curr_neighbors:
135 |             i_index, j_index = sorted([i, j])
136 |             evalue = curr_neighbors[j][0]
137 |             cov = curr_neighbors[j][1]
138 |
139 | edge = (i_index, j_index)
140 | edges[edge] = {'evalue': evalue, 'cov': cov*100}
141 | # edges.add(edge)
142 | # weights.append(-np.log10(evalue)*cov)
143 |
144 | print(' ... Total number of edges:', len(edges))
145 |
146 | # # normalise weights
147 | # max_weigth = max(weights)
148 | # min_weigth = int(min(weights))
149 | # normalised_weigths = [(weight-min_weight)/(max_weigth-min_weight) for weight in weights]
150 |
151 | # for i, edge in enumerate(edges):
152 | # edges[edge]['weight'] = normalised_weigths[i]
153 |
154 | numb_seconds = time.time() - start
155 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
156 |
157 | return edges
158 |
159 | def build_graph(edges, indexes, outgraph = None, outfolder = outfolder, map_properties = False, properties=['Darkness']):
160 |
161 | print('\nBuilding the graph')
162 |
163 | start = time.time()
164 |
165 | G=nx.Graph()
166 | G.add_nodes_from(list(indexes.keys()))
167 | G.add_edges_from(list(edges.keys()))
168 |
169 | nx.set_edge_attributes(G, edges)
170 |
171 | if map_properties:
172 | properties = get_nodes_properties(indexes.keys(), properties)
173 | nx.set_node_attributes(G, properties)
174 |
175 | if outgraph is None:
176 | nx.write_gml(G, "{}/full_graph.gml".format(outfolder))
177 | else:
178 | nx.write_gml(G, outgraph)
179 |
180 | numb_seconds = time.time() - start
181 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
182 |
183 | return G
184 |
185 |
186 | def collect_connected_components(G, nodes, min_size = 0, outfolder=outfolder, outgraph=None):
187 |
188 | print('\nCollecting individual subgraphs/connected components')
189 |
190 | sec_outfolder = '{}/subgraphs'.format(outfolder)
191 | if not os.path.isdir(sec_outfolder):
192 | os.mkdir(sec_outfolder)
193 |
194 | start = time.time()
195 |
196 | # Get connected components
197 | components = sorted(nx.connected_components(G), key=len, reverse=True)
198 |
199 | print(' ... Found {} subgraphs'.format(nx.number_connected_components(G)))
200 | print(' ... ... The largest has {} nodes'.format(len(components[0])))
201 |
202 | count = 0
203 | edge_count = 0
204 | node_count = 0
205 |
206 | node_cluster_class = {'node': [], 'subgraphID': [], 'communityID': []}
207 |
208 | for component_index, c in enumerate(components):
209 |
210 | curr_size = len(c)
211 |
212 | if curr_size >= min_size:
213 |
214 | component = G.subgraph(c).copy()
215 |
216 | curr_outf = '{}/subgraph_{:06d}.gml'.format(sec_outfolder, component_index)
217 | nx.write_gml(component, curr_outf)
218 |
219 | # Get communities by label propagation
220 | communities = list(asyn_lpa_communities(component, weight=None))
221 |
222 | for community_index, community in enumerate(communities):
223 | node_cluster_class['node'] += community
224 | node_cluster_class['communityID'] += ['{}[{}]'.format(component_index, community_index) for node in community]
225 | node_cluster_class['subgraphID'] += [component_index for node in community]
226 |
227 |             print(' ... ... Subgraph:', component_index, 'No. nodes:', curr_size, 'No. communities:', len(communities))
228 | 
229 |             # update the totals reported in the final summary below
230 |             count += 1
231 |             node_count += curr_size
232 |             edge_count += component.number_of_edges()
233 | 
234 |     json.dump(node_cluster_class, open('{}/node_class.json'.format(outfolder), 'w'))
232 |
233 | # if outgraph is None:
234 | # nx.write_gml(G, "{}/full_graph.gml".format(outfolder))
235 | # else:
236 | # nx.write_gml(G, outgraph)
237 |
238 | print(' ... Wrote {} subgraphs, totalling {} nodes and {} edges'.format(count, node_count, edge_count))
239 |
240 | numb_seconds = time.time() - start
241 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
242 |
243 |
244 | # MAIN CODE
245 |
246 | outgraph = "{}/full_graph.gml".format(outfolder)
247 |
248 | indexes = get_seqs_index(infasta)
249 |
250 | if not os.path.isfile(outgraph):
251 | hubs, nodes = get_neighbors_simple(inmmsqs, indexes, mineval = 1e-4, mincov = 50, simplex = False, nmax=None)
252 | edges = generate_pairs(hubs, topN = 4)
253 |
254 | graph = build_graph(edges, indexes, outgraph=outgraph)
255 |
256 | else:
257 | print('Graph already produced. Will just load it')
258 |
259 | start = time.time()
260 | graph = nx.read_gml(outgraph)
261 |
262 | numb_seconds = time.time() - start
263 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
264 |
265 |
266 | collect_connected_components(graph, nodes=list(indexes.keys()), min_size = 2)
267 |
--------------------------------------------------------------------------------
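
A minimal sketch, not the production pipeline above, of the two-level decomposition that `get_connected_components.py` performs: build a `networkx` graph from deduplicated edges, split it into connected components, then split each component into label-propagation communities. All node names and edge attributes below are invented for illustration.

```python
import networkx as nx
from networkx.algorithms.community import asyn_lpa_communities

# stand-ins for the filtered, deduplicated mmseqs hits that generate_pairs returns
edges = {('A', 'B'): {'evalue': 1e-30, 'cov': 95.0},
         ('B', 'C'): {'evalue': 1e-10, 'cov': 80.0},
         ('D', 'E'): {'evalue': 1e-50, 'cov': 99.0}}

G = nx.Graph()
G.add_nodes_from(['A', 'B', 'C', 'D', 'E', 'F'])  # 'F' stays a singleton component
G.add_edges_from(edges.keys())
nx.set_edge_attributes(G, edges)

# connected components, largest first, then label-propagation communities within each
components = sorted(nx.connected_components(G), key=len, reverse=True)
for component_index, c in enumerate(components):
    component = G.subgraph(c)
    communities = list(asyn_lpa_communities(component))
    print(component_index, len(c), len(communities))
```

Keying the attribute dict by node pairs is exactly the shape `nx.set_edge_attributes` expects, which is why `build_graph` can hand the output of `generate_pairs` to it directly.
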
/scripts/get_uniprot_taxonomy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pandas as pd
4 | import json
5 | import itertools
6 | import networkx as nx
7 | import numpy as np
8 |
9 | import scipy
10 | from scipy import stats
11 |
12 | from ete3 import NCBITaxa
13 |
14 | from multiprocessing.pool import ThreadPool
15 |
16 | import sys
17 |
18 | import warnings
19 | warnings.filterwarnings("ignore")
20 |
21 | # LOAD MY DBs
22 |
23 | path_to_dbuilder = '/path/to/dbuilder'  # placeholder: point this at your local dbuilder checkout
24 | sys.path.append(path_to_dbuilder)
25 |
26 | from src import extract_uniref as uniref
27 | from src import extract_uniprot as uniprot
28 | from src import extract_uniparc as uniparc
29 | from src import extract_alphafold as alphafold
30 |
31 | target_uniref = 'UniRef50'
32 |
33 | MONGO_HOST = "10.1.0.202"
34 | MONGO_PORT = 30077
35 |
36 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref)
37 | uniref_db.index_db()
38 |
39 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
40 | uniprot_db.index_db()
41 |
42 | uniparc_db = uniparc.uniparc_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
43 | uniparc_db.index_db()
44 |
45 | alphafold_db = alphafold.alphafold_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)
46 | alphafold_db.index_db()
47 |
48 |
49 | # GET INPUTS
50 | AFDB_data = sys.argv[1]
51 | threads = int(sys.argv[2])
52 | jobid = sys.argv[3]
53 | infolder = AFDB_data.split('/')[-2]
54 |
55 | label = AFDB_data.split('/')[-1].split('.')[0]
56 |
57 | # LOAD INPUTS
58 |
59 | print('1. Loading AFDB data')
60 | AFDB90_CC = pd.read_csv(AFDB_data, dtype = {'communityID': str})
61 | AFDB90_CC = AFDB90_CC.sort_values(by='unirefID')
62 | AFDB90_CC = AFDB90_CC.set_index("unirefID")
63 |
64 | # ROUTINES
65 |
66 | def chunk_list(l, n, counts=None):
67 |
68 | print(len(l))
69 |
70 |     print('Making chunks')
71 |
72 | if counts is not None:
73 | b = [[l[i]]*counts[i] for i in range(len(l))]
74 | b = list(itertools.chain.from_iterable(b))
75 | else:
76 | b = l
77 |
78 | chunks = np.array_split(np.array(b), n)
79 | # chunks = [list(chunk) for chunk in chunks]
80 |
81 | final_chunks = []
82 | for i, chunk in enumerate(chunks):
83 | chunk = set(chunk)
84 |
85 | if i > 0:
86 | last_chunk = set(final_chunks[-1])
87 | if len(chunk.intersection(last_chunk)) > 0:
88 | chunk = chunk - last_chunk
89 |
90 | chunk = list(chunk)
91 | final_chunks.append(chunk)
92 | print(len(chunk), chunk[0])
93 |
94 | sumlen = sum([len(i) for i in final_chunks])
95 |     print(' ... Made {} chunks ({} jobs in total)'.format(len(final_chunks), sumlen), len(l))
96 |
97 | return final_chunks
98 |
99 | def get_taxonomy_for_unirefs(arguments, max_chunk_size = 10000):
100 |
101 | NCBI = NCBITaxa()
102 |
103 | target_unirefs = arguments[0]
104 | AFDB90_CC = arguments[1]
105 | outfolder = arguments[2]
106 | thread_id = arguments[3]
107 | jobid = arguments[4]
108 |
109 | curr_data = AFDB90_CC.loc[AFDB90_CC.index.isin(target_unirefs)]
110 |
111 | target_ranks = ['superkingdom','phylum','class','order','genus','species']
112 |
113 | out_json = '../{}/{}_taxonomy.json'.format(outfolder, thread_id)
114 | out_summary = '../{}/{}_taxonomy_summary.json'.format(outfolder, thread_id)
115 |
116 | try:
117 | taxonomy = json.load(open(out_json, 'r'))
118 | target_unirefs = list(set(target_unirefs)-set(taxonomy['UniRef50IDs']))
119 |     except (FileNotFoundError, json.JSONDecodeError, KeyError):  # no resumable partial results yet
120 | taxonomy = {rank: [] for rank in target_ranks}
121 | taxonomy['uniprotIDs'] = []
122 | taxonomy['UniRef50IDs'] = []
123 | taxonomy['communityIDs'] = []
124 | # taxonomy['pLDDT'] = []
125 |
126 | count = 0
127 | n_expected = len(target_unirefs)
128 |
129 | start = time.time()
130 |
131 | for unirefID in target_unirefs:
132 |
133 | row = curr_data.loc[unirefID]
134 | curr_community = row.communityID
135 | # curr_accs = uniref_db.query(unirefID)[0]['data']['UNIREF']['UniRef100']
136 | curr_accs = uniref_db.query(unirefID)[0]['data']['ACC']
137 | curr_accs = [i.split('_')[-1] for i in curr_accs]
138 |
139 |         if len(curr_accs) > max_chunk_size:
140 |             ratio = len(curr_accs)/max_chunk_size
141 |             if ratio - round(ratio) > 0:
142 |                 n_chunks = round(ratio) + 1
143 |             else:
144 |                 n_chunks = round(ratio)
145 |             chunks = chunk_list(curr_accs, n_chunks, counts=None)
146 |         else:
147 |             chunks = [curr_accs]
148 | 
149 |         for chunk in chunks:
150 |             up_docs = uniprot_db.col.find({'_id': {'$in': chunk}})
151 | for doc in up_docs:
152 | acc = doc['_id']
153 | taxid = doc['data']['TAXID'][0]
154 |
155 | curr_tax = {rank: np.nan for rank in target_ranks}
156 | try:
157 | lineage = NCBI.get_lineage(taxid)
158 | translation = NCBI.get_taxid_translator(lineage)
159 | ranks = NCBI.get_rank(lineage)
160 |
161 | for level in lineage:
162 | if ranks[level] in target_ranks:
163 | curr_tax[ranks[level]] = translation[level]
164 |                 except Exception:  # unknown or obsolete taxid
165 | pass
166 |
167 | for rank in curr_tax:
168 | taxonomy[rank].append(curr_tax[rank])
169 |
170 | taxonomy['uniprotIDs'].append(acc)
171 | taxonomy['UniRef50IDs'].append(unirefID)
172 | taxonomy['communityIDs'].append(curr_community)
173 |
174 | uparc_docs = uniparc_db.col.find({'_id': {'$in': curr_accs}})
175 | for doc in uparc_docs:
176 | acc = doc['_id']
177 | taxid = doc['data']['TAXID'][0]
178 |
179 | curr_tax = {rank: np.nan for rank in target_ranks}
180 | try:
181 | lineage = NCBI.get_lineage(taxid)
182 | translation = NCBI.get_taxid_translator(lineage)
183 | ranks = NCBI.get_rank(lineage)
184 |
185 | for level in lineage:
186 | if ranks[level] in target_ranks:
187 | curr_tax[ranks[level]] = translation[level]
188 |             except Exception:  # unknown or obsolete taxid
189 | pass
190 |
191 | for rank in curr_tax:
192 | taxonomy[rank].append(curr_tax[rank])
193 |
194 | taxonomy['uniprotIDs'].append(acc)
195 | taxonomy['UniRef50IDs'].append(unirefID)
196 | taxonomy['communityIDs'].append(curr_community)
197 |
198 | if count % 100 == 0:
199 | numb_seconds = time.time() - start
200 | time_to_end = round(((numb_seconds/(count+1))*n_expected)-numb_seconds)
201 | print('thread {}:'.format(thread_id), count+1, n_expected, ' ... Time passed: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))), flush = True)
202 |
203 | if count % 1000 == 0:
204 | json.dump(taxonomy, open(out_json, 'w'), indent=4)
205 |
206 | count+=1
207 |
208 | json.dump(taxonomy, open(out_json, 'w'), indent=4)
209 |
210 | return taxonomy
211 |
212 | # GET TAXONOMY
213 |
214 | separated_jobs = chunk_list(list(AFDB90_CC.index), threads, counts=list(AFDB90_CC.nUniRef100))
215 |
216 | list_arguments = [i for i in zip(separated_jobs, [AFDB90_CC for job in separated_jobs], [infolder for job in separated_jobs], range(threads), [jobid for job in separated_jobs])]
217 |
218 | pool = ThreadPool(threads)
219 | results = pool.imap_unordered(get_taxonomy_for_unirefs, list_arguments)
220 |
221 | all_results = {}
222 | for dic in results:
223 | for key in dic:
224 | if key not in all_results:
225 | all_results[key] = dic[key]
226 | else:
227 | all_results[key] += dic[key]
228 |
229 | taxonomy = pd.DataFrame(all_results)
230 | taxonomy = taxonomy.set_index('uniprotIDs')
231 |
232 | taxonomy.to_csv('../{}/{}_uniprot_community_taxonomy_map.csv'.format(infolder, label))
233 |
234 |
--------------------------------------------------------------------------------
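
The heart of `get_taxonomy_for_unirefs` is the per-taxid rank lookup via ete3. Below is a self-contained sketch of just that step; it assumes a local ete3 taxonomy database (`NCBITaxa()` downloads one on first use), and the taxid 9606 (human) is only an example.

```python
import numpy as np
from ete3 import NCBITaxa

NCBI = NCBITaxa()
target_ranks = ['superkingdom', 'phylum', 'class', 'order', 'genus', 'species']

def lineage_for_taxid(taxid):
    curr_tax = {rank: np.nan for rank in target_ranks}
    try:
        lineage = NCBI.get_lineage(taxid)                  # ancestor taxids, root to leaf
        translation = NCBI.get_taxid_translator(lineage)   # taxid -> scientific name
        ranks = NCBI.get_rank(lineage)                     # taxid -> rank label
        for level in lineage:
            if ranks[level] in target_ranks:
                curr_tax[ranks[level]] = translation[level]
    except Exception:  # unknown or obsolete taxid
        pass
    return curr_tax

print(lineage_for_taxid(9606))
```
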
/scripts/make_communities_map.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import json
4 | import networkx as nx
5 | import numpy as np
6 | import pandas as pd
7 | import time
8 |
9 | infolder = sys.argv[1]
10 |
11 | ingraph = '{}/full_graph.gml'.format(infolder)
12 | node_data = '{}/node_class.json'.format(infolder)
13 |
14 | def get_node_index(data, target_level = 'communityID', min_size = None):
15 |
16 | node_index = {}
17 |
18 | for i, node in enumerate(data['node']):
19 | node_index[node] = data[target_level][i]
20 |
21 | return node_index
22 |
23 | def collapse_graph(graph, node_index):
24 |
25 | new_nodes = set(node_index.values())
26 | new_edges = set()
27 |
28 | n_expected = len(graph.edges)
29 |
30 | start = time.time()
31 |
32 | count = 0
33 | for edge in graph.edges:
34 | new_edge = [node_index[edge[0]], node_index[edge[1]]]
35 | if len(set(new_edge)) > 1:
36 | new_edge = tuple(sorted(new_edge))
37 | new_edges.add(new_edge)
38 |
39 | count+=1
40 |
41 | if count % 1000000 == 0:
42 | numb_seconds = time.time() - start
43 | time_to_end = round(((numb_seconds/count)*n_expected)-numb_seconds)
44 | print(count, n_expected, ' ... Time passed: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))), flush = True)
45 |
46 | print(' ... Defined {} edges connecting {} nodes'.format(len(new_edges), len(new_nodes)))
47 |
48 | return new_edges
49 |
50 | def write_edges_list(edges, infolder):
51 |
52 | print('Writing edges file for cosmograph')
53 |
54 | start = time.time()
55 |
56 | outfile = '{}/communities_edge_list.csv'.format(infolder)
57 |
58 | with open(outfile, 'w') as outp:
59 | outp.write('innode,outnode\n')
60 | for edge in edges:
61 | if edge[0] != edge[1]:
62 | outp.write('{},{}\n'.format(edge[0], edge[1]))
63 |
64 | numb_seconds = time.time() - start
65 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))))
66 |
67 |
68 | # load edges attributes
69 | data = json.load(open(node_data, 'r'))
70 |
71 | # load graph
72 | graph = nx.read_gml(ingraph)
73 |
74 | # get node index
75 | node_index = get_node_index(data)
76 |
77 | # collapse nodes into their communities
78 | new_edges = collapse_graph(graph, node_index)
79 |
80 | # write edges file for cosmograph
81 | write_edges_list(new_edges, infolder)
--------------------------------------------------------------------------------
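
The collapse in `collapse_graph` is a plain edge contraction by community label. A toy illustration with invented node and community IDs:

```python
import networkx as nx

graph = nx.Graph([('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')])
node_index = {'a': '0[0]', 'b': '0[0]', 'c': '0[0]', 'd': '0[1]', 'e': '0[1]'}

new_edges = set()
for u, v in graph.edges:
    new_edge = [node_index[u], node_index[v]]
    if len(set(new_edge)) > 1:                 # skip within-community edges
        new_edges.add(tuple(sorted(new_edge)))

print(new_edges)  # {('0[0]', '0[1]')}: a single community-level edge remains
```
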
/scripts/make_shapemers.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gzip
3 | import io
4 | import itertools
5 |
6 | import prody as pd  # note: pd is ProDy here, not pandas
7 | import tarfile
8 | import torch
9 | from scipy import ndimage
10 | from time import time
11 |
12 | from geometricus import MultipleMomentInvariants, ShapemerLearn
13 |
14 | from pathlib import Path
15 | import json
16 | import numba as nb
17 | import numpy as np
18 | from tqdm import tqdm
19 |
20 | import proteinnet_parser
21 |
22 |
23 | def parse_pae_file(pae_json_data):
24 | if type(pae_json_data) == str or type(pae_json_data) == Path:
25 | with open(pae_json_data, "rt") as f:
26 | data = json.load(f)[0]
27 | else:
28 | data = json.load(pae_json_data)[0]
29 |
30 | if 'residue1' in data and 'distance' in data:
31 | # Legacy PAE format, keep for backwards compatibility.
32 | r1, d = data['residue1'], data['distance']
33 | size = max(r1)
34 | matrix = np.empty((size, size), dtype=np.float64)
35 | matrix.ravel()[:] = d
36 | elif 'predicted_aligned_error' in data:
37 | # New PAE format.
38 | matrix = np.array(data['predicted_aligned_error'], dtype=np.float64)
39 | else:
40 | raise ValueError('Invalid PAE JSON format.')
41 |
42 | return matrix
43 |
44 |
45 | @nb.njit
46 | def get_plddt_matrix(plddt):
47 | size = len(plddt)
48 | matrix = np.empty((size, size), dtype=np.float64)
49 | for i in range(size):
50 |         matrix[i, i] = 2 * plddt[i]  # doubled so the diagonal also becomes 100 - pLDDT below
51 | for j in range(i + 1, size):
52 | matrix[i, j] = matrix[j, i] = (plddt[i] + plddt[j])
53 | return 100 - matrix / 2
54 |
55 |
56 | def get_domains_networkx(pae_matrix, plddt_matrix, cutoff=20, graph_resolution=0.5):
57 | """
58 | Adapted from https://github.com/tristanic/pae_to_domains
59 |
60 | Takes a predicted aligned error (PAE) matrix representing the predicted error in distances between each
61 | pair of residues in a model, and uses a graph-based community clustering algorithm to partition the model
62 | into approximately rigid groups.
63 |
64 | Arguments:
65 |
66 | * pae_matrix: a (n_residues x n_residues) numpy array. Diagonal elements should be set to some non-zero value
67 | to avoid divide-by-zero warnings
68 | * plddt_matrix: a (n_residues x n_residues) numpy array containing average pairwise PLDDT values
69 | * cutoff (optional, default=20): graph edges will only be created for residue pairs with pae+avg.PLDDT <
70 | cutoff
71 | * graph_resolution (optional, default=0.5): regulates how aggressively the clustering algorithm is.
72 | Smaller values lead to larger clusters. Value should be larger than zero, and values larger than 5 are
73 | unlikely to be useful.
74 |
75 | Returns: a series of lists, where each list contains the indices of residues belonging to one cluster.
76 | """
77 | try:
78 | import networkx as nx
79 | from networkx.algorithms import community
80 | except ImportError:
81 | print(
82 | 'ERROR: This method requires NetworkX (>=2.6.2) to be installed. Please install it using "pip install '
83 | 'networkx" in a Python >=3.7 environment and try again.')
84 | import sys
85 | sys.exit()
86 | matrix = pae_matrix + plddt_matrix
87 | weights = 1 / matrix
88 | g = nx.Graph()
89 | size = weights.shape[0]
90 | g.add_nodes_from(range(size))
91 | edges = np.argwhere(matrix < cutoff)
92 | sel_weights = weights[edges.T[0], edges.T[1]]
93 | wedges = [(i, j, w) for (i, j), w in zip(edges, sel_weights)]
94 | g.add_weighted_edges_from(wedges)
95 | clusters = community.greedy_modularity_communities(g, weight='weight', resolution=graph_resolution)
96 | return clusters
97 |
98 |
99 | def get_domains_igraph(pae_matrix, plddt_matrix, cutoff=20, graph_resolution=0.5):
100 | """
101 | Adapted from https://github.com/tristanic/pae_to_domains
102 |
103 | Takes a predicted aligned error (PAE) matrix representing the predicted error in distances between each
104 | pair of residues in a model, and uses a graph-based community clustering algorithm to partition the model
105 | into approximately rigid groups.
106 |
107 | Arguments:
108 |
109 | * pae_matrix: a (n_residues x n_residues) numpy array. Diagonal elements should be set to some non-zero
110 | value to avoid divide-by-zero warnings
111 | * plddt_matrix: a (n_residues x n_residues) numpy array containing average pairwise PLDDT values
112 |     * cutoff (optional, default=20): graph edges will only be created for residue pairs with pae+avg.PLDDT <
113 |       cutoff
114 |     * graph_resolution (optional, default=0.5): regulates how aggressively the clustering algorithm is.
115 |       Smaller values lead to larger clusters. Value should be larger than zero, and values larger than 5 are
116 |       unlikely to be useful.
117 | 
118 |     Returns: a series of lists, where each list contains the indices of residues belonging to one cluster.
119 |     """
120 |     try:
121 |         import igraph
122 |     except ImportError:
123 |         print('ERROR: This method requires python-igraph to be installed. Please install it using "pip install python-igraph" in a Python >=3.6 environment and try again.')
124 | import sys
125 | sys.exit()
126 | matrix = pae_matrix + plddt_matrix
127 | weights = 1 / matrix
128 | g = igraph.Graph()
129 | size = weights.shape[0]
130 | g.add_vertices(range(size))
131 |     edges = np.argwhere(matrix < cutoff)  # threshold pae + avg. pLDDT, matching the networkx variant above
132 | sel_weights = weights[edges.T[0], edges.T[1]]
133 | g.add_edges(edges)
134 | g.es['weight'] = sel_weights
135 |
136 | vc = g.community_leiden(weights='weight', resolution_parameter=graph_resolution / 100, n_iterations=-1)
137 | membership = np.array(vc.membership)
138 | from collections import defaultdict
139 | clusters = defaultdict(list)
140 | for i, c in enumerate(membership):
141 | clusters[c].append(i)
142 | return clusters.values()
143 |
144 |
145 | def clusters_to_domains(protein, clusters, min_length=20, avg_plddt_cutoff=70):
146 | chain = "A"
147 | for cl in clusters:
148 | start, stop = min(cl), max(cl)
149 | if (stop - start) >= min_length:
150 | domain = protein.select(f"resnum {start}:{stop}")
151 | if domain.getBetas().mean() >= avg_plddt_cutoff:
152 | for res in domain:
153 | res.setChid(chain)
154 | yield domain
155 | chain = chr(ord(chain) + 1)
156 |
157 |
158 | def split_alphafold_protein(prody_protein, pae_file=None, plddt_threshold=70, sigma=5):
159 | """
160 | Splits an AlphaFold protein into fragments based on a Gaussian-smoothed version of the PLDDT score.
161 | Parameters
162 | ----------
163 | prody_protein
164 | ProDy protein object of calpha atoms
165 | pae_file
166 | pae_file_data
167 | plddt_threshold
168 | Fragments will be split according to residues with a (smoothed) PLDDT score below this threshold.
169 | sigma
170 | Sigma for the smoothing of the PLDDT score.
171 |
172 | Returns
173 | -------
174 | (start, end) indices for each split
175 | """
176 | if pae_file is not None:
177 | pae_matrix = parse_pae_file(pae_file)
178 | beta_list = prody_protein.getBetas()
179 | plddt_matrix = get_plddt_matrix(beta_list)
180 | clusters = get_domains_igraph(pae_matrix, plddt_matrix)
181 | beta_list = ndimage.gaussian_filter1d(beta_list, sigma=sigma)
182 | all_slices = []
183 | for cl in clusters:
184 | chain_start, chain_stop = min(cl), max(cl)
185 | length = chain_stop - chain_start
186 | if length < 20:
187 | continue
188 | indices = np.ones(length, dtype=int)
189 | indices[np.where(beta_list[chain_start:chain_stop] < plddt_threshold)] = 0
190 | slices = ndimage.find_objects(ndimage.label(indices)[0])
191 | slices = [(s[0].start, s[0].stop) for s in slices]
192 | all_slices += [(chain_start + start, chain_start + stop) for start, stop in slices]
193 | else:
194 | beta_list = ndimage.gaussian_filter1d(prody_protein.getBetas(), sigma=sigma)
195 | indices = np.ones(beta_list.shape[0], dtype=int)
196 | indices[np.where(beta_list < plddt_threshold)] = 0
197 | slices = ndimage.find_objects(ndimage.label(indices)[0])
198 | all_slices = [(s[0].start, s[0].stop) for s in slices]
199 | return all_slices
200 |
201 |
202 | def split_pdb_protein(prody_protein):
203 | """
204 | Splits a protein into fragments based on chain.
205 | Parameters
206 | ----------
207 | prody_protein
208 | ProDy protein object.
209 |
210 | Returns
211 | -------
212 | (start, end, chid) indices for each split
213 | """
214 | slices = []
215 | chains = set(a.getChid() for a in prody_protein)
216 | if len(chains):
217 | for chain in chains:
218 | if not len(chain.strip()):
219 | chain = prody_protein
220 | else:
221 | chain = prody_protein.select(f"chain {chain}")
222 | slices.append((chain[0].getResindex(), chain[-1].getResindex() + 1, chain[0].getChid()))
223 | else:
224 |         slices.append((prody_protein[0].getResindex(), prody_protein[-1].getResindex() + 1, ''))
225 | return sorted(slices)
226 |
227 |
228 | def get_shapemers(calpha_protein,
229 | model,
230 | is_af=False,
231 | pae_file_data=None,
232 | length_threshold=20,
233 | plddt_threshold=70,
234 | sigma=5):
235 | """
236 | Retrieves the moments of the protein.
237 | Parameters
238 | ----------
239 | calpha_protein
240 | prody object
241 | model
242 | ShapemerLearn model
243 | is_af
244 | Whether the protein is an AlphaFold protein
245 | pae_file_data
246 |         PAE file as an extracted gzip stream or as a filename
247 | length_threshold
248 | Proteins with fewer (filtered) residues than this threshold will be ignored.
249 | plddt_threshold
250 | Residues with a (smoothed) PLDDT score below this threshold will be ignored.
251 | sigma
252 | Sigma for the smoothing of the PLDDT score.
253 |
254 | Returns
255 | -------
256 | """
257 | if is_af:
258 | residue_slices = split_alphafold_protein(calpha_protein, pae_file_data, plddt_threshold, sigma)
259 | else:
260 | residue_slices = split_pdb_protein(calpha_protein)
261 | coords = calpha_protein.getCoords()
262 | return get_shapemers_from_coords(coords, model, length_threshold=length_threshold, residue_slices=residue_slices)
263 |
264 |
265 | def get_shapemers_from_coords(coords, model, length_threshold=20, residue_slices=None):
266 | shapemers = []
267 | indices = [len(coords)]
268 | if residue_slices is None:
269 | residue_slices = [(0, len(coords))]
270 | try:
271 | for x in residue_slices:
272 | start_index, end_index, *_ = x
273 | if end_index - start_index > length_threshold:
274 | indices += list(range(start_index, end_index))
275 |                 shapemers += MultipleMomentInvariants.from_coordinates(
276 |                     "name", coords[start_index:end_index]).get_shapemers_model(model)
279 | if len(shapemers):
280 | assert len(shapemers) == len(indices) - 1
281 | return indices, shapemers
282 | except Exception as e:
283 | print(f"Error {e}")
284 | return [], []
285 |
286 |
287 | def make_corpus_proteome(taxid, db_folder, output_folder):
288 | model = ShapemerLearn.load()
289 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension)))
290 | key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys))))
291 | start = time()
292 | index = 0
293 | f_s = open(output_folder / f"{taxid}_shapemers.txt", "w")
294 | f_i = open(output_folder / f"{taxid}_indices.txt", "w")
295 | for f in db_folder.glob(f"proteome-tax_id-{taxid}-*_v4.tar"):
296 | with tarfile.open(f) as tar:
297 | for fh in tar.getmembers():
298 | if '.cif' in fh.name:
299 | if index % 1000 == 0:
300 | print(f"{index} proteins processed in {time() - start} seconds")
301 | uniprot_ac = '-'.join(fh.name.split('-')[1:3])
302 | with io.TextIOWrapper(gzip.open(tar.extractfile(fh), 'r'), encoding='utf-8') as mmcif:
303 | with gzip.open(tar.extractfile(
304 | tar.getmember(f"AF-{uniprot_ac}-predicted_aligned_error_v4.json.gz"))) as pae:
305 | protein = pd.parseMMCIFStream(mmcif)
306 | protein = protein.select("protein and calpha")
307 | indices, shapemers = get_shapemers(protein, model, is_af=True, pae_file_data=pae)
308 | if len(shapemers):
309 | f_i.write(f"{uniprot_ac}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n")
310 | f_s.write(f"{uniprot_ac}\t{' '.join(str(key_to_index[s]) for s in shapemers)}\n")
311 | index += 1
312 | f_s.close()
313 | f_i.close()
314 |
315 |
316 | def make_corpus_from_file(filename, db_folder, output_folder):
317 | model = ShapemerLearn.load()
318 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension)))
319 | shapemer_key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys))))
320 | f_s = open(output_folder / f"{filename.stem}_shapemers.txt", "w")
321 | f_i = open(output_folder / f"{filename.stem}_indices.txt", "w")
322 | with open(filename) as f:
323 | num_lines = sum(1 for _ in f)
324 | with tarfile.open(db_folder / f"{filename.stem}.tar") as tar:
325 | with open(filename) as f:
326 | for line in tqdm(f, total=num_lines):
327 | fh = line.strip()
328 | uniprot_ac = '-'.join(fh.split('-')[1:3])
329 | with io.TextIOWrapper(gzip.open(tar.extractfile(tar.getmember(fh)), 'r'), encoding='utf-8') as mmcif:
330 | with gzip.open(tar.extractfile(
331 | tar.getmember(f"AF-{uniprot_ac}-predicted_aligned_error_v4.json.gz"))) as pae:
332 | protein = pd.parseMMCIFStream(mmcif)
333 | protein = protein.select("protein and calpha")
334 | indices, shapemers = get_shapemers(protein, model, is_af=True, pae_file_data=pae)
335 | if len(shapemers):
336 | f_i.write(f"{uniprot_ac}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n")
337 | f_s.write(
338 | f"{uniprot_ac}\t{' '.join(str(shapemer_key_to_index[s]) for s in shapemers)}\n")
339 | f_s.close()
340 | f_i.close()
341 |
342 |
343 | def make_corpus_pdb_folder(db_folder_divided, output_folder):
344 | model = ShapemerLearn.load()
345 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension)))
346 | key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys))))
347 | f_s = open(output_folder / f"{db_folder_divided.stem}_shapemers.txt", "w")
348 | f_i = open(output_folder / f"{db_folder_divided.stem}_indices.txt", "w")
349 | for filename in tqdm(db_folder_divided.glob(f"*.ent.gz")):
350 | try:
351 | uid = filename.stem.split(".")[0].split("pdb")[1]
352 | with gzip.open(filename, 'r') as pdb:
353 | with io.TextIOWrapper(pdb, encoding='utf-8') as decoder:
354 | protein = pd.parsePDBStream(decoder)
355 | protein = protein.select("protein and calpha")
356 | if protein is None:
357 | continue
358 | coords = protein.getCoords()
359 | residue_slices = split_pdb_protein(protein)
360 | for start_index, end_index, chain in residue_slices:
361 | indices, shapemers = get_shapemers_from_coords(coords, model, residue_slices=[(start_index, end_index)])
362 | if len(shapemers):
363 | f_i.write(f"{uid}_{chain}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n")
364 | f_s.write(f"{uid}_{chain}\t{' '.join(str(key_to_index[s]) for s in shapemers)}\n")
365 | except Exception as e:
366 | print(e)
367 | f_s.close()
368 | f_i.close()
369 |
370 |
371 | def make_corpus_proteinnet(db_folder, output_folder):
372 | model = ShapemerLearn.load()
373 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension)))
374 | key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys))))
375 | f_s = open(output_folder / f"proteinnet_shapemers.txt", "w")
376 | f_i = open(output_folder / f"proteinnet_indices.txt", "w")
377 | for filename in [db_folder / x for x in ["training_100", "validation", "testing"]]:
378 | with open(filename) as f:
379 | total = sum(1 for line in f if line == "[ID]\n")
380 | for entry in tqdm(proteinnet_parser.yield_records_from_file(filename, 20), total=total):
381 | entry = proteinnet_parser.clean_entry(entry, 'ca')
382 | uid = entry["ID"]
383 | indices, shapemers = get_shapemers_from_coords(entry["tertiary"], model)
384 | if len(shapemers):
385 | f_i.write(f"{uid}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n")
386 | f_s.write(f"{uid}\t{' '.join(str(key_to_index[s]) for s in shapemers)}\n")
387 | f_s.close()
388 | f_i.close()
389 |
390 |
391 | def main():
392 | parser = argparse.ArgumentParser()
393 | parser.add_argument("filename", type=Path)
394 | parser.add_argument("db_folder", type=Path)
395 | parser.add_argument("output_folder", type=Path)
396 | args = parser.parse_args()
397 | if args.filename:
398 | make_corpus_from_file(args.filename, args.db_folder, Path(str(args.output_folder).strip()))
399 |
400 |
401 | def main_pdb():
402 | parser = argparse.ArgumentParser()
403 | parser.add_argument("db_folder_divided", type=Path)
404 | parser.add_argument("output_folder", type=Path)
405 | args = parser.parse_args()
406 | make_corpus_pdb_folder(args.db_folder_divided, Path(str(args.output_folder).strip()))
407 |
408 |
409 | if __name__ == '__main__':
410 | main_pdb()
411 |
--------------------------------------------------------------------------------
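
At the core of `split_alphafold_protein` is a one-dimensional segmentation of the pLDDT profile: smooth, mask residues under the threshold, and keep the surviving contiguous runs. A standalone sketch with an invented profile (the 70 / 20-residue thresholds mirror the defaults above):

```python
import numpy as np
from scipy import ndimage

plddt = np.array([30] * 10 + [90] * 40 + [40] * 8 + [85] * 30, dtype=float)  # toy profile
smoothed = ndimage.gaussian_filter1d(plddt, sigma=5)

indices = np.ones(smoothed.shape[0], dtype=int)
indices[np.where(smoothed < 70)] = 0                       # mask low-confidence residues
slices = ndimage.find_objects(ndimage.label(indices)[0])   # contiguous high-confidence runs
fragments = [(s[0].start, s[0].stop) for s in slices]
fragments = [(a, b) for a, b in fragments if b - a >= 20]  # same 20-residue length cutoff

print(fragments)  # expected: two well-ordered segments split by the low-pLDDT stretch
```
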
/scripts/sbatch_community_summary.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=AF90coms
4 | #SBATCH --qos=1day
5 | #SBATCH --cpus-per-task=128
6 | #SBATCH --mem=50G # --mem=1024G will send to bigmem
7 | #SBATCH --output=slurm_output/AF90_communities_output%A.out
8 | #SBATCH --error=slurm_output/AF90_communities_error%A.err
9 |
10 | python3 get_communities_summary.py data_generated_v2/AFDB90v4_cc_data.csv data_generated_v2/AFDB90v4_cc_data_uniprot_community_taxonomy_map.csv 128
--------------------------------------------------------------------------------
/scripts/sbatch_connect_component_collection.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=AF90AvsA
4 | #SBATCH --qos=1day
5 | #SBATCH --cpus-per-task=1
6 | #SBATCH --mem=100G # --mem=1024G will send to bigmem /// 300G for Uniref50
7 | #SBATCH --output=slurm_output/UR50_comp_output%A.out
8 | #SBATCH --error=slurm_output/UR50_comp_error%A.err
9 |
10 | DATABASES="databases"
11 |
12 | python3 get_connected_components.py ../data_generated/AFDBv4_90.fasta ../data_generated/AFDB90v4_all-against-all.m8 ../data_generated
--------------------------------------------------------------------------------
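
For reference, `get_connected_components.py` consumes the `.m8` table written by the mmseqs job further below. A compressed sketch of that filtering pass (file name taken from the command above; thresholds mirror the script's defaults, where `mincov=50` is divided by 100; the script reads column 3 as a fractional coverage-like score and column 11 as the e-value):

```python
# filter tab-separated mmseqs .m8 hits into a set of undirected edges
mineval, mincov = 1e-4, 0.5
edges = set()
with open('../data_generated/AFDB90v4_all-against-all.m8') as inmm:
    for line in inmm:
        fields = line.rstrip('\n').split('\t')
        i, j = fields[0], fields[1]
        cov, evalue = float(fields[2]), float(fields[10])
        if i != j and evalue <= mineval and cov >= mincov:
            edges.add(tuple(sorted((i, j))))
print(len(edges), 'edges kept')
```
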
/scripts/sbatch_make_communities_graph.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=AF90cmgr
4 | #SBATCH --qos=1week
5 | #SBATCH --cpus-per-task=1
6 | #SBATCH --mem=20G # --mem=1024G will send to bigmem /// 300G for Uniref50
7 | #SBATCH --output=slurm_output/UR50_comp_output%A.out
8 | #SBATCH --error=slurm_output/UR50_comp_error%A.err
9 |
10 | python3 make_communities_map.py ../data_generated
--------------------------------------------------------------------------------
/scripts/sbatch_mmseqs_AFDB90_all-against-all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=UR50AvsA
4 | #SBATCH --qos=1day
5 | #SBATCH --cpus-per-task=60
6 | #SBATCH --mem=100G # --mem=1024G will send to bigmem
7 | #SBATCH --output=slurm_output/UR50_mmseqs_output%A.out
8 | #SBATCH --error=slurm_output/UR50_mmseqs_error%A.err
9 |
10 | DATABASES="../databases"
11 | DBFOLDER="${DATABASES}/mmseqsDBs"
12 |
13 | # load MMseqs
14 | ml MMseqs2
15 |
16 | # # # create database for mmseqs2
17 | # # mkdir $DBFOLDER
18 | mmseqs createdb AFDBv4_90.fasta ${DBFOLDER}/AFDB90v4
19 |
20 | # run mmseqs
21 | mmseqs easy-search AFDBv4_90.fasta ${DBFOLDER}/AFDB90v4 ../data_generated/AFDB90v4_all-against-all.m8 tmp -e 1e-4 --threads 60
22 |
--------------------------------------------------------------------------------