├── 1. Analyse UniRef50 and AFDB darkness.ipynb
├── 2. Analyse AFDB90 graph.ipynb
├── 3. UniProt name assessement.ipynb
├── 4. Outlier_Analysis.ipynb
├── LICENSE
├── README.md
├── plots
│   ├── AFDB90v4_component_darkness_histogram.pdf
│   ├── AFDB90v4_component_darkness_histogram.png
│   ├── AFDB90v4_histogram_dark_content.pdf
│   ├── AFDB90v4_histogram_dark_content.png
│   ├── AFDBv4_uniref50_histogram_components_word_diversity.pdf
│   ├── AFDBv4_uniref50_histogram_components_word_diversity.png
│   ├── AFDBv4_uniref50_histogram_dark_content.pdf
│   ├── AFDBv4_uniref50_histogram_dark_content.png
│   ├── community_cosmograph_layout_darkness.png
│   ├── component159_community_cosmograph_layout_darkness.png
│   └── outliers.pdf
└── scripts
    ├── AFDBv4_DUF_analysis_dark.py
    ├── AFDBv4_pLDDT_analysis.py
    ├── get_communities_summary.py
    ├── get_connected_components.py
    ├── get_uniprot_taxonomy.py
    ├── make_communities_map.py
    ├── make_shapemers.py
    ├── sbatch_community_summary.sh
    ├── sbatch_connect_component_collection.sh
    ├── sbatch_make_communities_graph.sh
    └── sbatch_mmseqs_AFDB90_all-against-all.sh

/1. Analyse UniRef50 and AFDB darkness.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "strange-jewelry", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import pandas as pd\n", 12 | "import json\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "import seaborn as sns\n", 15 | "import numpy as np\n", 16 | "import scipy" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "written-andrew", 22 | "metadata": {}, 23 | "source": [ 24 | "# 1. Analyse the darkness content of UniRef50 and AlphaFold DB (v4)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "solar-joining", 30 | "metadata": {}, 31 | "source": [ 32 | "### 1.1. Load data\n", 33 | "First, run:\n", 34 | "\n", 35 | "`python3 scripts/AFDBv4_pLDDT_analysis.py UniRef50`\n", 36 | "\n", 37 | "This will generate the file `data_generated/AFDBv4_pLDDT_diggestion.csv`. The corresponding file for the AFDB90v4 paper is `data_generated/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv`, which we load in the next cell.\n", 38 | "\n", 39 | "This file contains the per-UniRef50-cluster data on functional darkness, cluster size, median evidence score for the proteins contained, and pLDDT distributions in the AlphaFold database if there is at least one member of the cluster in AFDB. This table refers to ALL UniRef50 clusters and not only those with members in AFDB.\n", 40 | "\n", 41 | "The column corresponding to functional darkness is \"FULL_noDUF\", which should be read as \"full annotation coverage, excluding DUFs and related terms\"." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "fatal-beijing", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stderr", 52 | "output_type": "stream", 53 | "text": [ 54 | "/scicore/home/schwede/soares0000/projects/dark_protein_universe/venv/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3072: DtypeWarning: Columns (2,4) have mixed types.Specify dtype option on import or set low_memory=False.\n", 55 | " interactivity=interactivity, compiler=compiler, result=result)\n" 56 | ] 57 | }, 58 | { 59 | "data": { 60 | "text/html": [ 61 | "<div>
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 
349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | "
AF2_REP_bestAF2_REP_best_lenAF2_REP_worstAF2_REP_worst_lenAF2_longest_best70AF2_longest_best70_lenAF2_longest_best70_pLDDTFULL_noDUFREPSPTMdelta_pLDDTmax_pLDDTmedian_Evidencemedian_pLDDTmin_pLDDTnACCsnAF2nUniRef100nUniRef90
unirefID
UniRef50_A0A007A0A007407.0NaNNaNA0A007407.088.24869896.81A0A007000.00000088.2486984.088.24869888.2486981111
UniRef50_A0A009DWD5A0A009DWD539.0NaNNaNA0A009DWD539.071.9912820.00NaN000.00000071.9912824.071.99128271.9912821111
UniRef50_A0A009DWJ5A0A009DWJ547.0NaNNaNA0A009DWJ547.080.86340480.85A0A009DWJ5000.00000080.8634044.080.86340480.8634041111
UniRef50_A0A009DWL0A0A009DWL076.0NaNNaNNaNNaNNaN98.67UPI0018888667000.00000052.3977634.052.39776352.3977633133
UniRef50_A0A009DY31A0A2D5KFP446.0A0A0K6IRR657.0A0A009DY3148.070.70833393.75A0A009DY3100-23.33496283.1093484.076.05140159.7743864444
...............................................................
UniRef50_Z9JYV3Z9JYV3151.0NaNNaNNaNNaNNaN0.00NaN000.00000064.5834444.064.58344464.5834441111
UniRef50_Z9JYV5Z9JYV5211.0NaNNaNNaNNaNNaN99.53Z9JYV5000.00000045.1246924.045.12469245.1246921111
UniRef50_Z9JYW2Z9JYW2261.0NaNNaNZ9JYW2261.073.07026899.62Z9JYW2000.00000073.0702684.073.07026873.0702681111
UniRef50_Z9JYW9Z9JYW9171.0NaNNaNNaNNaNNaN78.36Z9JYW9000.00000034.7259654.034.72596534.7259651111
UniRef50_Z9JZ05A0A7X9C7W4297.0A0A1B0ZIR3340.0A0A2N6U4Z5341.086.65167298.58UPI0015607FF100-4.93934089.5250174.086.82497184.58567658284422
\n", 380 | "

53625854 rows × 20 columns

\n", 381 | "
" 382 | ], 383 | "text/plain": [ 384 | " AF2_REP_best AF2_REP_best_len AF2_REP_worst \\\n", 385 | "unirefID \n", 386 | "UniRef50_A0A007 A0A007 407.0 NaN \n", 387 | "UniRef50_A0A009DWD5 A0A009DWD5 39.0 NaN \n", 388 | "UniRef50_A0A009DWJ5 A0A009DWJ5 47.0 NaN \n", 389 | "UniRef50_A0A009DWL0 A0A009DWL0 76.0 NaN \n", 390 | "UniRef50_A0A009DY31 A0A2D5KFP4 46.0 A0A0K6IRR6 \n", 391 | "... ... ... ... \n", 392 | "UniRef50_Z9JYV3 Z9JYV3 151.0 NaN \n", 393 | "UniRef50_Z9JYV5 Z9JYV5 211.0 NaN \n", 394 | "UniRef50_Z9JYW2 Z9JYW2 261.0 NaN \n", 395 | "UniRef50_Z9JYW9 Z9JYW9 171.0 NaN \n", 396 | "UniRef50_Z9JZ05 A0A7X9C7W4 297.0 A0A1B0ZIR3 \n", 397 | "\n", 398 | " AF2_REP_worst_len AF2_longest_best70 \\\n", 399 | "unirefID \n", 400 | "UniRef50_A0A007 NaN A0A007 \n", 401 | "UniRef50_A0A009DWD5 NaN A0A009DWD5 \n", 402 | "UniRef50_A0A009DWJ5 NaN A0A009DWJ5 \n", 403 | "UniRef50_A0A009DWL0 NaN NaN \n", 404 | "UniRef50_A0A009DY31 57.0 A0A009DY31 \n", 405 | "... ... ... \n", 406 | "UniRef50_Z9JYV3 NaN NaN \n", 407 | "UniRef50_Z9JYV5 NaN NaN \n", 408 | "UniRef50_Z9JYW2 NaN Z9JYW2 \n", 409 | "UniRef50_Z9JYW9 NaN NaN \n", 410 | "UniRef50_Z9JZ05 340.0 A0A2N6U4Z5 \n", 411 | "\n", 412 | " AF2_longest_best70_len AF2_longest_best70_pLDDT \\\n", 413 | "unirefID \n", 414 | "UniRef50_A0A007 407.0 88.248698 \n", 415 | "UniRef50_A0A009DWD5 39.0 71.991282 \n", 416 | "UniRef50_A0A009DWJ5 47.0 80.863404 \n", 417 | "UniRef50_A0A009DWL0 NaN NaN \n", 418 | "UniRef50_A0A009DY31 48.0 70.708333 \n", 419 | "... ... ... \n", 420 | "UniRef50_Z9JYV3 NaN NaN \n", 421 | "UniRef50_Z9JYV5 NaN NaN \n", 422 | "UniRef50_Z9JYW2 261.0 73.070268 \n", 423 | "UniRef50_Z9JYW9 NaN NaN \n", 424 | "UniRef50_Z9JZ05 341.0 86.651672 \n", 425 | "\n", 426 | " FULL_noDUF REP SP TM delta_pLDDT \\\n", 427 | "unirefID \n", 428 | "UniRef50_A0A007 96.81 A0A007 0 0 0.000000 \n", 429 | "UniRef50_A0A009DWD5 0.00 NaN 0 0 0.000000 \n", 430 | "UniRef50_A0A009DWJ5 80.85 A0A009DWJ5 0 0 0.000000 \n", 431 | "UniRef50_A0A009DWL0 98.67 UPI0018888667 0 0 0.000000 \n", 432 | "UniRef50_A0A009DY31 93.75 A0A009DY31 0 0 -23.334962 \n", 433 | "... ... ... .. .. ... \n", 434 | "UniRef50_Z9JYV3 0.00 NaN 0 0 0.000000 \n", 435 | "UniRef50_Z9JYV5 99.53 Z9JYV5 0 0 0.000000 \n", 436 | "UniRef50_Z9JYW2 99.62 Z9JYW2 0 0 0.000000 \n", 437 | "UniRef50_Z9JYW9 78.36 Z9JYW9 0 0 0.000000 \n", 438 | "UniRef50_Z9JZ05 98.58 UPI0015607FF1 0 0 -4.939340 \n", 439 | "\n", 440 | " max_pLDDT median_Evidence median_pLDDT min_pLDDT \\\n", 441 | "unirefID \n", 442 | "UniRef50_A0A007 88.248698 4.0 88.248698 88.248698 \n", 443 | "UniRef50_A0A009DWD5 71.991282 4.0 71.991282 71.991282 \n", 444 | "UniRef50_A0A009DWJ5 80.863404 4.0 80.863404 80.863404 \n", 445 | "UniRef50_A0A009DWL0 52.397763 4.0 52.397763 52.397763 \n", 446 | "UniRef50_A0A009DY31 83.109348 4.0 76.051401 59.774386 \n", 447 | "... ... ... ... ... \n", 448 | "UniRef50_Z9JYV3 64.583444 4.0 64.583444 64.583444 \n", 449 | "UniRef50_Z9JYV5 45.124692 4.0 45.124692 45.124692 \n", 450 | "UniRef50_Z9JYW2 73.070268 4.0 73.070268 73.070268 \n", 451 | "UniRef50_Z9JYW9 34.725965 4.0 34.725965 34.725965 \n", 452 | "UniRef50_Z9JZ05 89.525017 4.0 86.824971 84.585676 \n", 453 | "\n", 454 | " nACCs nAF2 nUniRef100 nUniRef90 \n", 455 | "unirefID \n", 456 | "UniRef50_A0A007 1 1 1 1 \n", 457 | "UniRef50_A0A009DWD5 1 1 1 1 \n", 458 | "UniRef50_A0A009DWJ5 1 1 1 1 \n", 459 | "UniRef50_A0A009DWL0 3 1 3 3 \n", 460 | "UniRef50_A0A009DY31 4 4 4 4 \n", 461 | "... ... ... ... ... 
\n", 462 | "UniRef50_Z9JYV3 1 1 1 1 \n", 463 | "UniRef50_Z9JYV5 1 1 1 1 \n", 464 | "UniRef50_Z9JYW2 1 1 1 1 \n", 465 | "UniRef50_Z9JYW9 1 1 1 1 \n", 466 | "UniRef50_Z9JZ05 58 28 44 22 \n", 467 | "\n", 468 | "[53625854 rows x 20 columns]" 469 | ] 470 | }, 471 | "execution_count": 2, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "indata = 'data_generated_v2/AFDBv4_pLDDT_diggestion_UniRef50_2023-02-01.csv'\n", 478 | "indata = pd.read_csv(indata)\n", 479 | "indata = indata.sort_values(by='unirefID')\n", 480 | "indata = indata.set_index(\"unirefID\")\n", 481 | "indata = indata[:-1]\n", 482 | "indata" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 7, 488 | "id": "norman-brunei", 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/html": [ 494 | "
\n", 495 | "\n", 508 | "\n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 
780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | "
AF2_REP_bestAF2_REP_best_lenAF2_REP_worstAF2_REP_worst_lenAF2_longest_best70AF2_longest_best70_lenAF2_longest_best70_pLDDTFULL_noDUFREPSP...delta_pLDDTmax_pLDDTmedian_Evidencemedian_pLDDTmin_pLDDTnACCsnAF2nUniRef100nUniRef90darkness_bins
unirefID
UniRef50_A0A007A0A007407.0NaNNaNA0A007407.088.24869896.81A0A0070...0.00000088.2486984.088.24869888.2486981111(95.0, 100.0]
UniRef50_A0A009DWD5A0A009DWD539.0NaNNaNA0A009DWD539.071.9912820.00NaN0...0.00000071.9912824.071.99128271.9912821111(-0.001, 5.0]
UniRef50_A0A009DWJ5A0A009DWJ547.0NaNNaNA0A009DWJ547.080.86340480.85A0A009DWJ50...0.00000080.8634044.080.86340480.8634041111(80.0, 85.0]
UniRef50_A0A009DWL0A0A009DWL076.0NaNNaNNaNNaNNaN98.67UPI00188886670...0.00000052.3977634.052.39776352.3977633133(95.0, 100.0]
UniRef50_A0A009DY31A0A2D5KFP446.0A0A0K6IRR657.0A0A009DY3148.070.70833393.75A0A009DY310...-23.33496283.1093484.076.05140159.7743864444(90.0, 95.0]
..................................................................
UniRef50_Z9JYV3Z9JYV3151.0NaNNaNNaNNaNNaN0.00NaN0...0.00000064.5834444.064.58344464.5834441111(-0.001, 5.0]
UniRef50_Z9JYV5Z9JYV5211.0NaNNaNNaNNaNNaN99.53Z9JYV50...0.00000045.1246924.045.12469245.1246921111(95.0, 100.0]
UniRef50_Z9JYW2Z9JYW2261.0NaNNaNZ9JYW2261.073.07026899.62Z9JYW20...0.00000073.0702684.073.07026873.0702681111(95.0, 100.0]
UniRef50_Z9JYW9Z9JYW9171.0NaNNaNNaNNaNNaN78.36Z9JYW90...0.00000034.7259654.034.72596534.7259651111(75.0, 80.0]
UniRef50_Z9JZ05A0A7X9C7W4297.0A0A1B0ZIR3340.0A0A2N6U4Z5341.086.65167298.58UPI0015607FF10...-4.93934089.5250174.086.82497184.58567658284422(95.0, 100.0]
\n", 826 | "

53625854 rows × 21 columns

\n", 827 | "
" 828 | ], 829 | "text/plain": [ 830 | " AF2_REP_best AF2_REP_best_len AF2_REP_worst \\\n", 831 | "unirefID \n", 832 | "UniRef50_A0A007 A0A007 407.0 NaN \n", 833 | "UniRef50_A0A009DWD5 A0A009DWD5 39.0 NaN \n", 834 | "UniRef50_A0A009DWJ5 A0A009DWJ5 47.0 NaN \n", 835 | "UniRef50_A0A009DWL0 A0A009DWL0 76.0 NaN \n", 836 | "UniRef50_A0A009DY31 A0A2D5KFP4 46.0 A0A0K6IRR6 \n", 837 | "... ... ... ... \n", 838 | "UniRef50_Z9JYV3 Z9JYV3 151.0 NaN \n", 839 | "UniRef50_Z9JYV5 Z9JYV5 211.0 NaN \n", 840 | "UniRef50_Z9JYW2 Z9JYW2 261.0 NaN \n", 841 | "UniRef50_Z9JYW9 Z9JYW9 171.0 NaN \n", 842 | "UniRef50_Z9JZ05 A0A7X9C7W4 297.0 A0A1B0ZIR3 \n", 843 | "\n", 844 | " AF2_REP_worst_len AF2_longest_best70 \\\n", 845 | "unirefID \n", 846 | "UniRef50_A0A007 NaN A0A007 \n", 847 | "UniRef50_A0A009DWD5 NaN A0A009DWD5 \n", 848 | "UniRef50_A0A009DWJ5 NaN A0A009DWJ5 \n", 849 | "UniRef50_A0A009DWL0 NaN NaN \n", 850 | "UniRef50_A0A009DY31 57.0 A0A009DY31 \n", 851 | "... ... ... \n", 852 | "UniRef50_Z9JYV3 NaN NaN \n", 853 | "UniRef50_Z9JYV5 NaN NaN \n", 854 | "UniRef50_Z9JYW2 NaN Z9JYW2 \n", 855 | "UniRef50_Z9JYW9 NaN NaN \n", 856 | "UniRef50_Z9JZ05 340.0 A0A2N6U4Z5 \n", 857 | "\n", 858 | " AF2_longest_best70_len AF2_longest_best70_pLDDT \\\n", 859 | "unirefID \n", 860 | "UniRef50_A0A007 407.0 88.248698 \n", 861 | "UniRef50_A0A009DWD5 39.0 71.991282 \n", 862 | "UniRef50_A0A009DWJ5 47.0 80.863404 \n", 863 | "UniRef50_A0A009DWL0 NaN NaN \n", 864 | "UniRef50_A0A009DY31 48.0 70.708333 \n", 865 | "... ... ... \n", 866 | "UniRef50_Z9JYV3 NaN NaN \n", 867 | "UniRef50_Z9JYV5 NaN NaN \n", 868 | "UniRef50_Z9JYW2 261.0 73.070268 \n", 869 | "UniRef50_Z9JYW9 NaN NaN \n", 870 | "UniRef50_Z9JZ05 341.0 86.651672 \n", 871 | "\n", 872 | " FULL_noDUF REP SP ... delta_pLDDT \\\n", 873 | "unirefID ... \n", 874 | "UniRef50_A0A007 96.81 A0A007 0 ... 0.000000 \n", 875 | "UniRef50_A0A009DWD5 0.00 NaN 0 ... 0.000000 \n", 876 | "UniRef50_A0A009DWJ5 80.85 A0A009DWJ5 0 ... 0.000000 \n", 877 | "UniRef50_A0A009DWL0 98.67 UPI0018888667 0 ... 0.000000 \n", 878 | "UniRef50_A0A009DY31 93.75 A0A009DY31 0 ... -23.334962 \n", 879 | "... ... ... .. ... ... \n", 880 | "UniRef50_Z9JYV3 0.00 NaN 0 ... 0.000000 \n", 881 | "UniRef50_Z9JYV5 99.53 Z9JYV5 0 ... 0.000000 \n", 882 | "UniRef50_Z9JYW2 99.62 Z9JYW2 0 ... 0.000000 \n", 883 | "UniRef50_Z9JYW9 78.36 Z9JYW9 0 ... 0.000000 \n", 884 | "UniRef50_Z9JZ05 98.58 UPI0015607FF1 0 ... -4.939340 \n", 885 | "\n", 886 | " max_pLDDT median_Evidence median_pLDDT min_pLDDT \\\n", 887 | "unirefID \n", 888 | "UniRef50_A0A007 88.248698 4.0 88.248698 88.248698 \n", 889 | "UniRef50_A0A009DWD5 71.991282 4.0 71.991282 71.991282 \n", 890 | "UniRef50_A0A009DWJ5 80.863404 4.0 80.863404 80.863404 \n", 891 | "UniRef50_A0A009DWL0 52.397763 4.0 52.397763 52.397763 \n", 892 | "UniRef50_A0A009DY31 83.109348 4.0 76.051401 59.774386 \n", 893 | "... ... ... ... ... \n", 894 | "UniRef50_Z9JYV3 64.583444 4.0 64.583444 64.583444 \n", 895 | "UniRef50_Z9JYV5 45.124692 4.0 45.124692 45.124692 \n", 896 | "UniRef50_Z9JYW2 73.070268 4.0 73.070268 73.070268 \n", 897 | "UniRef50_Z9JYW9 34.725965 4.0 34.725965 34.725965 \n", 898 | "UniRef50_Z9JZ05 89.525017 4.0 86.824971 84.585676 \n", 899 | "\n", 900 | " nACCs nAF2 nUniRef100 nUniRef90 darkness_bins \n", 901 | "unirefID \n", 902 | "UniRef50_A0A007 1 1 1 1 (95.0, 100.0] \n", 903 | "UniRef50_A0A009DWD5 1 1 1 1 (-0.001, 5.0] \n", 904 | "UniRef50_A0A009DWJ5 1 1 1 1 (80.0, 85.0] \n", 905 | "UniRef50_A0A009DWL0 3 1 3 3 (95.0, 100.0] \n", 906 | "UniRef50_A0A009DY31 4 4 4 4 (90.0, 95.0] \n", 907 | "... ... ... 
... ... ... \n", 908 | "UniRef50_Z9JYV3 1 1 1 1 (-0.001, 5.0] \n", 909 | "UniRef50_Z9JYV5 1 1 1 1 (95.0, 100.0] \n", 910 | "UniRef50_Z9JYW2 1 1 1 1 (95.0, 100.0] \n", 911 | "UniRef50_Z9JYW9 1 1 1 1 (75.0, 80.0] \n", 912 | "UniRef50_Z9JZ05 58 28 44 22 (95.0, 100.0] \n", 913 | "\n", 914 | "[53625854 rows x 21 columns]" 915 | ] 916 | }, 917 | "execution_count": 7, 918 | "metadata": {}, 919 | "output_type": "execute_result" 920 | } 921 | ], 922 | "source": [ 923 | "indata['darkness_bins'] = pd.cut(indata['FULL_noDUF'].astype(float), bins=[i for i in range(0, 105, 5)], include_lowest=True)\n", 924 | "indata['median_Evidence'] = indata['median_Evidence'].fillna(0)\n", 925 | "indata" 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "id": "promising-importance", 931 | "metadata": {}, 932 | "source": [ 933 | "To add DUF counts into the dataframe, run `python3 scripts/AFDBv4_DUF_analysis_dark.py UniRef50`, which will generate the file `generated_data/AFDBv4_DUF_dark_diggestion_UniRef50.csv`.\n", 934 | "\n", 935 | "For the AFDB90v4 paper, the precomputed file is `data_generated/AFDBv4_DUF_dark_diggestion_UniRef50_2023-02-06.csv`.\n", 936 | "\n", 937 | "This table lists each UniRef50 cluster and states whether there are proteins annotated for DUFs in it. We want to merge this information into the dataframe above." 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 4, 943 | "id": "varying-network", 944 | "metadata": {}, 945 | "outputs": [ 946 | { 947 | "data": { 948 | "text/html": [ 949 | "<div>
\n", 950 | "\n", 963 | "\n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 
| " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | "
AF2_REP_bestAF2_REP_best_lenAF2_REP_worstAF2_REP_worst_lenAF2_longest_best70AF2_longest_best70_lenAF2_longest_best70_pLDDTFULL_noDUFREPSP...max_pLDDTmedian_Evidencemedian_pLDDTmin_pLDDTnACCsnAF2nUniRef100nUniRef90darkness_binsHas_duf
UniRef50_A0A007A0A007407.0NaNNaNA0A007407.088.24869896.81A0A0070...88.2486984.088.24869888.2486981111(95.0, 100.0]NaN
UniRef50_A0A009DWD5A0A009DWD539.0NaNNaNA0A009DWD539.071.9912820.00NaN0...71.9912824.071.99128271.9912821111(-0.001, 5.0]0.0
UniRef50_A0A009DWJ5A0A009DWJ547.0NaNNaNA0A009DWJ547.080.86340480.85A0A009DWJ50...80.8634044.080.86340480.8634041111(80.0, 85.0]NaN
UniRef50_A0A009DWL0A0A009DWL076.0NaNNaNNaNNaNNaN98.67UPI00188886670...52.3977634.052.39776352.3977633133(95.0, 100.0]NaN
UniRef50_A0A009DY31A0A2D5KFP446.0A0A0K6IRR657.0A0A009DY3148.070.70833393.75A0A009DY310...83.1093484.076.05140159.7743864444(90.0, 95.0]NaN
..................................................................
UniRef50_Z9JYV3Z9JYV3151.0NaNNaNNaNNaNNaN0.00NaN0...64.5834444.064.58344464.5834441111(-0.001, 5.0]NaN
UniRef50_Z9JYV5Z9JYV5211.0NaNNaNNaNNaNNaN99.53Z9JYV50...45.1246924.045.12469245.1246921111(95.0, 100.0]NaN
UniRef50_Z9JYW2Z9JYW2261.0NaNNaNZ9JYW2261.073.07026899.62Z9JYW20...73.0702684.073.07026873.0702681111(95.0, 100.0]NaN
UniRef50_Z9JYW9Z9JYW9171.0NaNNaNNaNNaNNaN78.36Z9JYW90...34.7259654.034.72596534.7259651111(75.0, 80.0]NaN
UniRef50_Z9JZ05A0A7X9C7W4297.0A0A1B0ZIR3340.0A0A2N6U4Z5341.086.65167298.58UPI0015607FF10...89.5250174.086.82497184.58567658284422(95.0, 100.0]NaN
\n", 1257 | "

53625854 rows × 22 columns

\n", 1258 | "
" 1259 | ], 1260 | "text/plain": [ 1261 | " AF2_REP_best AF2_REP_best_len AF2_REP_worst \\\n", 1262 | "UniRef50_A0A007 A0A007 407.0 NaN \n", 1263 | "UniRef50_A0A009DWD5 A0A009DWD5 39.0 NaN \n", 1264 | "UniRef50_A0A009DWJ5 A0A009DWJ5 47.0 NaN \n", 1265 | "UniRef50_A0A009DWL0 A0A009DWL0 76.0 NaN \n", 1266 | "UniRef50_A0A009DY31 A0A2D5KFP4 46.0 A0A0K6IRR6 \n", 1267 | "... ... ... ... \n", 1268 | "UniRef50_Z9JYV3 Z9JYV3 151.0 NaN \n", 1269 | "UniRef50_Z9JYV5 Z9JYV5 211.0 NaN \n", 1270 | "UniRef50_Z9JYW2 Z9JYW2 261.0 NaN \n", 1271 | "UniRef50_Z9JYW9 Z9JYW9 171.0 NaN \n", 1272 | "UniRef50_Z9JZ05 A0A7X9C7W4 297.0 A0A1B0ZIR3 \n", 1273 | "\n", 1274 | " AF2_REP_worst_len AF2_longest_best70 \\\n", 1275 | "UniRef50_A0A007 NaN A0A007 \n", 1276 | "UniRef50_A0A009DWD5 NaN A0A009DWD5 \n", 1277 | "UniRef50_A0A009DWJ5 NaN A0A009DWJ5 \n", 1278 | "UniRef50_A0A009DWL0 NaN NaN \n", 1279 | "UniRef50_A0A009DY31 57.0 A0A009DY31 \n", 1280 | "... ... ... \n", 1281 | "UniRef50_Z9JYV3 NaN NaN \n", 1282 | "UniRef50_Z9JYV5 NaN NaN \n", 1283 | "UniRef50_Z9JYW2 NaN Z9JYW2 \n", 1284 | "UniRef50_Z9JYW9 NaN NaN \n", 1285 | "UniRef50_Z9JZ05 340.0 A0A2N6U4Z5 \n", 1286 | "\n", 1287 | " AF2_longest_best70_len AF2_longest_best70_pLDDT \\\n", 1288 | "UniRef50_A0A007 407.0 88.248698 \n", 1289 | "UniRef50_A0A009DWD5 39.0 71.991282 \n", 1290 | "UniRef50_A0A009DWJ5 47.0 80.863404 \n", 1291 | "UniRef50_A0A009DWL0 NaN NaN \n", 1292 | "UniRef50_A0A009DY31 48.0 70.708333 \n", 1293 | "... ... ... \n", 1294 | "UniRef50_Z9JYV3 NaN NaN \n", 1295 | "UniRef50_Z9JYV5 NaN NaN \n", 1296 | "UniRef50_Z9JYW2 261.0 73.070268 \n", 1297 | "UniRef50_Z9JYW9 NaN NaN \n", 1298 | "UniRef50_Z9JZ05 341.0 86.651672 \n", 1299 | "\n", 1300 | " FULL_noDUF REP SP ... max_pLDDT \\\n", 1301 | "UniRef50_A0A007 96.81 A0A007 0 ... 88.248698 \n", 1302 | "UniRef50_A0A009DWD5 0.00 NaN 0 ... 71.991282 \n", 1303 | "UniRef50_A0A009DWJ5 80.85 A0A009DWJ5 0 ... 80.863404 \n", 1304 | "UniRef50_A0A009DWL0 98.67 UPI0018888667 0 ... 52.397763 \n", 1305 | "UniRef50_A0A009DY31 93.75 A0A009DY31 0 ... 83.109348 \n", 1306 | "... ... ... .. ... ... \n", 1307 | "UniRef50_Z9JYV3 0.00 NaN 0 ... 64.583444 \n", 1308 | "UniRef50_Z9JYV5 99.53 Z9JYV5 0 ... 45.124692 \n", 1309 | "UniRef50_Z9JYW2 99.62 Z9JYW2 0 ... 73.070268 \n", 1310 | "UniRef50_Z9JYW9 78.36 Z9JYW9 0 ... 34.725965 \n", 1311 | "UniRef50_Z9JZ05 98.58 UPI0015607FF1 0 ... 89.525017 \n", 1312 | "\n", 1313 | " median_Evidence median_pLDDT min_pLDDT nACCs nAF2 \\\n", 1314 | "UniRef50_A0A007 4.0 88.248698 88.248698 1 1 \n", 1315 | "UniRef50_A0A009DWD5 4.0 71.991282 71.991282 1 1 \n", 1316 | "UniRef50_A0A009DWJ5 4.0 80.863404 80.863404 1 1 \n", 1317 | "UniRef50_A0A009DWL0 4.0 52.397763 52.397763 3 1 \n", 1318 | "UniRef50_A0A009DY31 4.0 76.051401 59.774386 4 4 \n", 1319 | "... ... ... ... ... ... \n", 1320 | "UniRef50_Z9JYV3 4.0 64.583444 64.583444 1 1 \n", 1321 | "UniRef50_Z9JYV5 4.0 45.124692 45.124692 1 1 \n", 1322 | "UniRef50_Z9JYW2 4.0 73.070268 73.070268 1 1 \n", 1323 | "UniRef50_Z9JYW9 4.0 34.725965 34.725965 1 1 \n", 1324 | "UniRef50_Z9JZ05 4.0 86.824971 84.585676 58 28 \n", 1325 | "\n", 1326 | " nUniRef100 nUniRef90 darkness_bins Has_duf \n", 1327 | "UniRef50_A0A007 1 1 (95.0, 100.0] NaN \n", 1328 | "UniRef50_A0A009DWD5 1 1 (-0.001, 5.0] 0.0 \n", 1329 | "UniRef50_A0A009DWJ5 1 1 (80.0, 85.0] NaN \n", 1330 | "UniRef50_A0A009DWL0 3 3 (95.0, 100.0] NaN \n", 1331 | "UniRef50_A0A009DY31 4 4 (90.0, 95.0] NaN \n", 1332 | "... ... ... ... ... 
\n", 1333 | "UniRef50_Z9JYV3 1 1 (-0.001, 5.0] NaN \n", 1334 | "UniRef50_Z9JYV5 1 1 (95.0, 100.0] NaN \n", 1335 | "UniRef50_Z9JYW2 1 1 (95.0, 100.0] NaN \n", 1336 | "UniRef50_Z9JYW9 1 1 (75.0, 80.0] NaN \n", 1337 | "UniRef50_Z9JZ05 44 22 (95.0, 100.0] NaN \n", 1338 | "\n", 1339 | "[53625854 rows x 22 columns]" 1340 | ] 1341 | }, 1342 | "execution_count": 4, 1343 | "metadata": {}, 1344 | "output_type": "execute_result" 1345 | } 1346 | ], 1347 | "source": [ 1348 | "# get DUF distribution of all darks and merge with the data\n", 1349 | "\n", 1350 | "duf_dark_data = 'data_generated_v2/AFDBv4_DUF_dark_diggestion_UniRef50_2023-02-06.csv'\n", 1351 | "duf_dark_data = pd.read_csv(duf_dark_data)\n", 1352 | "duf_dark_data = duf_dark_data.sort_values(by='unirefID')\n", 1353 | "duf_dark_data = duf_dark_data.set_index(\"unirefID\")\n", 1354 | "duf_dark_data = duf_dark_data[:-1]\n", 1355 | "\n", 1356 | "indata = pd.concat([indata, duf_dark_data], axis=1)\n", 1357 | "indata" 1358 | ] 1359 | }, 1360 | { 1361 | "cell_type": "markdown", 1362 | "id": "norman-qualification", 1363 | "metadata": {}, 1364 | "source": [ 1365 | "## 1.2. Make histogram at different pLDDT cutoffs" 1366 | ] 1367 | }, 1368 | { 1369 | "cell_type": "code", 1370 | "execution_count": 5, 1371 | "id": "eleven-omaha", 1372 | "metadata": {}, 1373 | "outputs": [ 1374 | { 1375 | "name": "stdout", 1376 | "output_type": "stream", 1377 | "text": [ 1378 | "Full n = 53625854 n_dark = 18249414 uniprot_n_dark = 37761108.0 % uniprot = 10.308197827140427\n", 1379 | "Full n = 53625854 n_dark = 18249414 uniref100_n_dark = 33852950.0 % uniref100 = 10.767738572191368\n", 1380 | "% UniRef50 dark with dufs = 0.08570847130870293\n", 1381 | "\n", 1382 | "AFDB n = 41983663 n_dark = 12339265 uniprot_n_dark = 29763470.0 % uniprot = 8.614381123145668\n", 1383 | "AFDB n = 41983663 n_dark = 12339265 uniref100_n_dark = 26286720.0 % uniref100 = 8.887961218741369\n", 1384 | "% UniRef50 dark with dufs = 0.10342012509280826\n", 1385 | "\n", 1386 | "AFDB70 n = 26228839 n_dark = 5618293 uniprot_n_dark = 19979438.0 % uniprot = 6.4303307109762855\n", 1387 | "AFDB70 n = 26228839 n_dark = 5618293 uniref100_n_dark = 17307646.0 % uniref100 = 6.5541442844322315\n", 1388 | "% UniRef50 dark with dufs = 0.18025675909453015\n", 1389 | "\n", 1390 | "AFDB90 n = 6136321 n_dark = 927430 uniprot_n_dark = 3696194.0 % uniprot = 6.043489091347138\n", 1391 | "AFDB90 n = 6136321 n_dark = 927430 uniref100_n_dark = 3275219.0 % uniref100 = 6.180251211639788\n", 1392 | "% UniRef50 dark with dufs = 0.06771225311444018\n", 1393 | "\n" 1394 | ] 1395 | }, 1396 | { 1397 | "data": { 1398 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA3wAAADQCAYAAABcImMqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAABBEUlEQVR4nO3dd5xdVb3+8c+TRmihBRBChxAIkRJCwEuRJk0gCEiXyEXRnyAiegVFqRcuRSwIikhXShAQQu9BpEmoSQgYakghEEqo6d/fH3tNODmZcmbm1D3P+/Xar9lnt7X25Jmds3ZZWxGBmZmZmZmZ5U+3WlfAzMzMzMzMKsMNPjMzMzMzs5xyg8/MzMzMzCyn3OAzMzMzMzPLKTf4zMzMzMzMcsoNPjMzMzMzs5xygy9nJIWk9dL4lZL+t9Z1MjMzM7PGJ+n/JB1X4rI3Sdq9wlWyErjBV2WSRkn6QNJiJSw3U9InBcNXqlVPMwBJb0j6POXvA0l3SFq9hPWulDRX0ipF00+VNKco1z9L85oy/7GkjyQ9LenEwr+VZtYfL2m/8u+5NTJJh0ganTIyVdJdkrZpY51T0wmzLYumf1vSvKLMXpjmXSlpdsrsx5LGpi9Dy7Sy/muS/l/B/G2Ltv1Jqsd+Bcv8WNLb6e/i8rb+/7B8KTgOfyzpQ0mPSfq+pDa/w9XyWCzpO5JeSfPvlrRqwTxJOkfSe2k4R5I6/9uySpK0InA48Of0eXtJ85s5hjV9Xz0HaPHCQzPrT5J0g6QtCpZZKx0TexStu+CiRmvH6WpI5f+rjNsr+wUbN/iqSNJawLZAAHuXsMoxEbFUwfB4RSto1ry9ImIpYBVgGvCH1haWtCSwHzADOKyZRUYU5frcgnnHRMTSqayfAAcBdxZ9EViwPnAc8DdJK3d05yxfJB0P/A44C1gZWAP4IzCslXVE9iXm/fSz2ONFmT2mYN65KbMrAkcAWwGPpr+DRdYn+9s4V9JmABHxSOG2gT2BT4C7U912BU4EdgLWBNYBTmvXL8XyYK+UszWBs4ETgMtaW6GWx2JJ25P9DQ4DlgdeB64rWPcoYB9gE2BjYC/ge238Dqz2vg3cGRGfF0ybUpSjBd9XI+LfQB9JQ1rZ5pSUoaXJjp8vAY9I2qmddWvtON3lucFXXYcDTwBXAsM7soF05u07BZ/LelbBrCURMRO4ERjYxqL7AR8Cp9PBnEfEpxExiuzEyFeAr7ew3D3Ax8C6HSnH8iVdWTsdODoibk45mhMRt0XE/7Sy6rZkX2yPBQ6S1Ku9ZUfEzIh4iiyzK5A1/ppb7llgPLBhC5saDtwYEZ8WfL4sIsZFxAfAGWRfuqwLiogZETESOBAYLmlQK4vX8li8J/D3lNvZZLndTlLT/OHA+RExKSImA+fjXDeC3YGH27nOKFrITaHITIqIk4FLya4Olp2k5SVdIWmKsjuXbimY9910Vfp9SSOLrkpHurI+IV1pvyhdqd4QuBj4Srqy+GFafjFJv5Y0UdI0SRdLWjzN2z5dzfyJpHeU3YlyRJp3FHAo8LO0vdvKsd9u8FXX4cA1adjVVyWskUhaguxLxhNtLDqc7Ezu9cAGkjbvaJkRMREYTfaFvLg+kvR1oBfwYkfLsFz5CtAb+Ec71xsO3AbckD7v1dEKRMTHwH00k1mAdKvS+mS5Lp63JLA/cFXB5I2A5ws+Pw+sLGmFjtbRGl+6cjKJFnKW1PpYrGbGmxqozeV6o47Wz6rmy8DL7VxnPNmV3Pa4GRhcdKdEufwVWIIsbysBvwWQtCPwf8ABZCcA3yT72ym0J7AF2VXpA4BdI2I88H2+uMK4bFr2bLJj/abAekA/4OSCbX0JWCZNPxK4SNJyEXEJWTvh3LS9Dv9/VMgNvipR9vzImsANEfE08CpwSBurXZDOInwo6ZmKV9KsebekM1YzgK8B57W0oKQ1gB2AayNiGvAAi94id0BBrj8sPIPWgilktwQttD7ZbW8jgbMi4sN27I/l1wrA9IiYW+oK6UTGN8kyO4fsKnZxZrcqyuxWbWy2OLNN638M/JvsC8eEZtbbF5jOwmfQlyL722vSNL50G3Ww/CvO2QJ1cCy+O83fOF3VOJnscZYl0vzmcr1U0S2jVn+WJbuSW2jVohx9WNRQ+zit1x5TyE4StGe9No/Typ5l3R34fkR8kO4AaTreHgpcHhHPRMQs4OdkV+3WKtjE2RHxYToB8hBZY24RKcdHAT+OiPfTicCzyG6NbjIHOD3V4U6yv6MB7djfdnGDr3qGA/dGxPT0+VravsXi2IhYNg2DK1s9sxbtk85Y9QaOAR6W9KUWlv0WMD4inkufrwEOkdSzYJkbCnK9bERMaaP8fmTPVhWvvyTZ7UOHS/KzHwbwHtBXRQ/3t+EbwFzgzvT5GmB3ZZ0TNHmiKLNtXeUuzmzT+kuTndXdiOw//2LDgasjIgqmfQL0KfjcNF78pcu6nuKcFarpsTgi7gdOAW4C3kjDx2RXJaH5XH9SlH2rPx+w6MmmKUU5WrbglnTS8h+2s5x+ZCcIPiQ7PgP0LFqmJ1mjqUkpx+nVgffT7fHFViW7qgdARHxC9n9Kv4Jl3i4Y/4zsxEVzViQ7ufF0UwOU7CRI4f8r7xWdnGxte53mBl8VpLNbBwBfVdbT2tvAj4FNJLX3MvenfHGGDLIvD2YVFxHzIuJmYB7QUo+HhwPrFOT8N0BfYI+OlKmsR9DNgUdaqNMbwF104hY8y5XHgVlknUGUajjZf7ITU2b/TvZFoq07MJolaSlgZ1rO7DSyL8F7Fa23OrA9cHXRKuNY+HaoTYBpEfFeR+pn+ZBuDe4HtPQMf82PxRFxUUT0j4iVyTLfAxibZjeX63EdqZtV1Qtktym2x4YsfPtuKb4BPJMajlPJGnZrFS2zNgUNtBK9BSwvadlm5k0huxMPWHCL/QrA5BK2W3yiYjrwObBRQQN0mdQ5TSnKfuLDDb7q2IfsS/JAssu/m5L9ATxC8z3CteY5YF9JSyh7396R5aqkWWvScxrDgOXI7skvnv8VsrO8Q/ki54PIrma3K+cp318FbiW7Be7OFpZbDdgNf1Ewsg4tyG4du0jSPilHPSXtLunc4uUl9SPr/XJPvsjsJmSdBbQ3s4ulZ6RuITsLfkULy61A9mWmOLPfAh6LiFeLpl8NHClpYPqS8kuyjr+sC5LUR9KeZM8W/S0ixjSzTM2PxZJ6SxqU/t9YA7gE+H3BlZWrgeMl9Uu3kv4E57oR3Al8tZ3rfJXsZECrUlb6SToF+A7wC8hONpOdMDhT0grpmH4w2XfqNrdbKCKmpnX+KGm5tK3t0uzrgCMkbarsFSRnAU+mkxltmQasptThV0TMB/4C/FbSSmn/+inrdbkU08h6ZC4bN/iqYzhwRURMjIi3mwbgQuDQdt5+9FtgNlkYriK7TcOskm6T9AnwEXAmMDwimmtgDQdujYgxRTn/PbCnpGafNSlyYXrOaRpZ1/o3Abulg2eTA5XeswM8BTyKu6m3JCLOB44naxi9S3ZG9xiyhlixbwHPRcS9RZm9ANhYrfeA2ORnKbPvkX
2JfRr4r6Jbmr5SkNnxqV4/LNrO4SzcWUvT/twNnEv2vMhEsjPap5RQL8uX21LO3gJOIrti12xPsNTHsbg3WQPzE7KG4uPArwrW/TNZR0ljyK763ZGmWX27Gtgj3bnWZFUt+h6+/WDBlehPUidDLVk1ZagpR18Gto+IewuW+QHZ7cQvAO+QHdO/nu6YaK9vkV0xfClt6zhYcBvyr8iyPpXspMlBzW9iEQ+Snex4W1LTo1snAK8AT0j6CLif0p/RuwwYmG4HvaXEdVol3y5tZmZmZmZtkXQW8E5E/K6EZW8ie61Ms1eGrXrc4DMzMzMzM8upit3SKelyZS8THFswbXlJ9yl7aeF9kpZL0yXpAmUvO3xBknuktLrjTFueOM+WN8605YGk3SS9nLJ5YjPzj5f0YsrtA5IKOxoZnrI+QVJbPcFbF1LJZ/iuJHuAt9CJwAMR0Z/snTBNQd4d6J+Go4A/VbBeZh11Jc605ceVOM+WL1fiTFsDk9QduIgsnwOBgyUNLFrsWWBIRGxM9t7Qc9O6y5M937slWYc9pzSd4DCrWIMvIv7Jou+HGcYXD6VfxRddZw8jvXsovTdjWWUvRzSrG8605YnzbHnjTFsODAVeiYjXImI2WW+swwoXiIiHIuKz9PEJYLU0vitwX2Qv+v4AuI9FT4BYF9We3iHLYeXUJSpkLy9cOY33I+t5qsmkNG0qRSQdRXY2jiWXXHLz9ddv7+tArF49++yz0yNixbaXrCvOtLWoATPtPFuLGjDP4ExbK+ow083lcstWlj+SL15N0FKmF+I851tLma52g2+BiAhJ7e4xJiIuIXufC4MHD46HH3647HWz2ujTp097X6BZV5xpK9bImXaerVgj5xmcaVtUI2da0mHAENr5XjznOd9aynS138M3remWifTznTR9MrB6wXKrUdqb7c1qzZm2PHGeLW+caWskJeVS0s5k72PcOyJmtWdd65qq3eAbSfZCUNLPWwumH556zdoKmFFwC4ZZPXOmLU+cZ8sbZ9oayVNAf0lrS+pF9uLvkYULSNqM7CX1e0fEOwWz7gF2kbRc6qxllzTNrHK3dEq6Dtge6CtpElnPQWcDN0g6EngTOCAtfiewB9kb6T8DjqhUvcw6ypm2PHGeLW+caWt0ETFX0jFkDbXuwOURMU7S6cDoiBgJnAcsBfxdEsDEiNg7It6XdAZZoxHg9Igo7sTIuqiKNfgi4uAWZu3UzLIBHF2pupiVgzNteeI8W94405YHEXEn2QmJwmknF4zv3Mq6lwOXV6521qiqfUunmZmZmZmZVYkbfGZmZmZmnSTp3oLxn9eyLmaF3OAzMzMzM+u8wveffbNmtTAr4gafmZmZmVnntfs9j2bVULMXr5uZmZmZ5cg6kkYCKhhfICL2rk21rKtzg8/MzMzMrPOGFYz/uma1MCviBp+ZmZmZWSdFxMO1roNZc9zgMzMzMzPrJEljWPg5vgCmAw8Bv46ImTWpmHV5bvCZmZmZmXXens1MWx4YDvwB+G51q2OWcYPPzMzMzKyTIuLNZia/CTwr6dlq18e+MKD/BkydNqXi5ayy8qq8POGlipfTXm7wmZmZmZlVll+FVkNTp03hwP6nVLycERNOq3gZHeEGn5mZmZlZJ0ka3Mzk5YDDgH9WuTpmC7jBZ2ZmZmbWeecXfQ7gPWAUcEnVa2OWtNngk7Q18FxEfCrpMGAw8PsW7lM2q3vOtOWJ82x540xbo4qIHWpdB7PmlHI/8Z+AzyRtAvwEeBW4uqK1MqssZ9ryxHm2vHGmrWFJGiDpfEl3pOHXktavdb2sayulwTc3IgIYBlwYERcBS1e2WmYV5UxbnjjPljfOtDUkSV8hu33zE7JbOP8CfAqMkrRVDatmXVwpz/B9LOnnZA+cbiepG9CzstUyqyhn2vLEeba8caatUZ0MHBwRowqm3SLpQeAUYPea1Mq6vFKu8B0IzAKOjIi3gdWA8ypaK7PKcqYtT5xnyxtn2hrVukWNPQAi4mFgnepXxyzT6hU+Sd2B6wofQo2IifheemtQzrTlifNseeNMW4P7uJV5n1atFmZFWm3wRcQ8SfMlLRMRM6pVKbNKcaYtT5xnyxtn2hrc6pIuaGa6gH7VroxZk1Ke4fsEGCPpPgrOTkTEsRWrlVllOdOWJ86z5Y0zbY3qf1qZN7pqtTArUkqD7+Y0lI2kHwPfIXsh5RjgCGAV4HpgBeBp4FsRMbuc5ZolzrTlifNseeNMW0OKiKtqXQez5rTZ4IuIqyQtDqwRES93tkBJ/YBjgYER8bmkG4CDgD2A30bE9ZIuBo4kexePWVk505YnzrPljTNtZlZebfbSKWkv4Dng7vR5U0kjO1luD2BxST2AJYCpwI7AjWn+VcA+nSzDrFnOtOWJ82x540ybmZVXKa9lOBUYCnwIEBHP0YmuZSNiMvBrYCLZAXcG2a0UH0bE3LTYJFp4uFXSUZJGSxo9ffr0jlbDurZTcaYtP07FebZ8ORVn2sysbEpp8M1ppqes+R0tUNJywDBgbWBVYElgt1LXj4hLImJIRAzp27dvR6thXZszbXniPFveONPW0CStKOkXki6RdHnTUOt6WddVSqct4yQdAnSX1J/sPvjHOlHmzsDrEfEugKSbga2BZSX1SGfbVgMmd6IMs9Y405YnzrPljTNtje5W4BHgfmBejetiVtIVvh8CGwGzgGvJboX4USfKnAhsJWkJSQJ2Al4EHgL2T8sMJ/tjMasEZ9ryxHm2vHGmrdEtEREnRMQNEXFT01DrSlnXVUqD7+sRcVJEbJGGXwJ7d7TAiHiS7CHpZ8i6Ru4GXAKcABwv6RWyLpIv62gZZm1wpi1PnGfLG2faGt3tkvboyIqSdpP0sqRXJJ3YzPztJD0jaa6k/YvmzZP0XBo629GR5Ugpt3T+HPh7CdNKFhGnAKcUTX6N7CFts0pzpi1PnGfLG2faGpKkj8ne9SjgF5JmAXPS54iIPm2s3x24CPgaWUdCT0kaGREvFiw2Efg28NNmNvF5RGza2f2w/GmxwSdpd7J31PSTdEHBrD7A3ObXMqtfzrTlifNseeNMW6OLiKU7uYmhwCsR8RqApOvJOhxa0OCLiDfSvA53ZGRdT2u3dE4BRgMzybovbhpGArtWvmpmZedMW544z5Y3zrTlgqQHSpnWjH7AWwWfW3xdSAt6p1eIPCFpnxbq5teMdEEtXuGLiOeB5yVdGxFzYEHXxqtHxAfVqqBZuTjTlifOs+WNM22NTlJvstd+9E3ZVZrVh/Y13DpqzYiYLGkd4EFJYyLi1cIFIuISsmdYGTx4cFShTlYHSum05T5JfSQtT/bA818k/bbC9TKrJGfa8sR5trxxpq1RfY/sKvUGZNltukp9K3BhCetPBlYv+Nyu14VExOT08zVgFLBZqetavpXS4FsmIj4C9gWujogtybo0NmtUzrTlifNseeNMW0OKiN9HxNrATyNi7YJhk4gopcH3FNBf0tqSegEHkd3S3CZJy0laLI33JXvX5Iutr2VdRSkNvh6SVgEOAG6vcH3MqsGZtjxxni1vnGlrSJJ2T
KOTJe1bPLS1fkTMBY4B7gHGAzdExDhJp0vaO5WxhaRJwDeBP0sal1bfEBgt6Xmyd0yeXdS7p3VhpbyW4XSy4P0rIp5K9wVPqGy1zCrKmbY8cZ4tb5xpa1RfBR4E9mpmXgA3t7WBiLgTuLNo2skF40+R3epZvN5jwJfbWV/rItps8EXE3yl49026L3i/SlbKrJKcacsT59nyxpm2RpXe90hEHFHrupgVarPBJ+kKsrMSC4mI/65IjcwqzJm2PHGeLW+caWt0kl4FngAeAR6JiHFtrGJWUaXc0ll4/3xv4Btk78oxa1TOtOWJ82x540xboxsIbAlsC5wnaQDwQkR8o7bVsq6qlFs6byr8LOk64F8Vq5FZhTnTlifOs+WNM205MA+Yk37OB95Jg1lNlHKFr1h/YKVyV8SshpxpyxPn2fLGmbZG8xEwBvgN8JeIeK/G9bEurpRn+D4mu5de6efbwAkVrpdZxTjTlifOs+WNM205cDCwDfAD4DuSHgP+GREP1LZa1lWVckvn0tWoiFm1ONOWJ86z5Y0zbY0uIm4FbpW0AbA7cBzwM2DxWtbLuq4WG3ySBre2YkQ8U/7qmFWOM2154jxb3jjTlheSbgI2AV4F/gkcDjxZ00pZl9baFb7zW5kXwI5lrotZpTnTlifOs+WNM2158X/AsxExr9YVMYNWGnwRsUM1K2JWac605YnzbHnjTFteRMToWtfBrFC3thaQdLSkZQs+LyfpBxWtlVkFOdOWJ86z5Y0zbWZWXm02+IDvRsSHTR8i4gPguxWrkVnlOdOWJ86z5Y0zbWZWRqU0+LpLUtMHSd2BXpWrklnFOdOWJ86z5Y0zbQ1N0taSlkzjh0n6jaQ1a10v67pKafDdDYyQtJOknYDr0rQOk7SspBslvSRpvKSvSFpe0n2SJqSfy3WmDLNWONOWJ86z5Y0zbY3uT8BnkjYBfkLWW+fVta2SdWWlNPhOAB4E/l8aHiB7l0hn/B64OyI2IOu2djxwIvBARPRPZZzYyTLMWuJMW544z5Y3zrQ1urkREcAw4MKIuAjw+yWtZkp58fp84OI0dJqkZYDtgG+n7c8GZksaBmyfFrsKGEV20DcrK2fa8sR5trxxpi0HPpb0c+AwYDtJ3YCeNa6TdWGlXOErt7WBd4ErJD0r6dJ0n/PKETE1LfM2sHJzK0s6StJoSaOnT59epSqbtcqZtjxxni1vnGmrtgOBWcCREfE2sBpwXm2rZF1ZLRp8PYDBwJ8iYjPgU4puo0iXwaO5lSPikogYEhFD+vbtW/HKmpXAmbY8cZ4tb5xpq5rUydB1EfGbiHgEICImRoSf4bOaqUWDbxIwKSKeTJ9vJDsQT5O0CkD6+U4N6mbWEc605YnzbHnjTFvVRMQ8YH66ldisLrTY4JO0jKSzU49W70t6L/VsdXbhC1HbK13afkvSgDRpJ+BFYCQwPE0bDtza0TLMmuNMW544z5Y3zrTlyCfAGEmXSbqgaah1pazraq3TlhvIesnaPh0skfQlsoPiDcAunSj3h8A1knoBrwFHkDU+b5B0JPAmcEAntm/WHGfa8sR5trxxpi0vbk6DWV1orcG3VkScUzghHYDPkfTfnSk0Ip4DhjQza6fObNesDc605YnzbHnjTFsuRMRVkhYH1oiIl2tdH7PWnuF7U9LPJC3otUrSypJOAN6qfNXMys6Ztjxxni1vnGnLBUl7Ac8Bd6fPm0oaWdNKWZfWWoPvQGAF4OF0L/37ZO+oWR7f9mCNyZm2PHGeLW+cacuLU4GhwIew4ArzOrWrjnV1Ld7SGREfkL2A1C8htVxwpi1PnGfLG2facmRORMyQVDhtfq0qY9baM3wLkbQN2dmKsRFxb+WqZFYdzrTlifNseeNMWwMbJ+kQoLuk/sCxwGM1rpN1Ya29luHfBePfBS4ElgZOkXRiS+uZ1Stn2vLEeba8caYtR34IbATMAq4FZgA/qmmNrEtr7Rm+ngXjRwFfi4jTyLpFPrSitTKrDGfa8sR5trxxpi0vvh4RJ0XEFmn4JbB3rStlXVdrDb5ukpaTtAKgiHgXICI+BeZWpXZm5eVMW544z5Y3zrTlxc9LnLYISbtJelnSK81d2Za0naRnJM2VtH/RvOGSJqRheAfrbjnU2jN8ywBPAwJC0ioRMVXSUmmaWaNxpi1PnGfLG2faGpqk3YE9gH6SLiiY1YcSTlpI6g5cBHwNmAQ8JWlkRLxYsNhE4NvAT4vWXR44hex9kwE8ndb9oON7VFkD+m/A1GlTal2NLqG1Bt+OEfFaM9PnA9+oUH3MKsmZtjxxni1vnGlrdFOA0WS3bz5dMP1j4MclrD8UeKXp70DS9cAwYEGDLyLeSPOKe/3cFbgvIt5P8+8DdgOu68iOVMPUaVM4sP8pVSlrxITTqlJOvWrtls6/A0h6oHBiRHwWEa9XtFZmleFMW544z5Y3zrQ1tIh4PiKuAtaLiKvS+EiyRlwpV9r6AW8VfJ6UppWipHUlHSVptKTR06dPL3HT1uhau8LXTdIvgPUlHV88MyJ+U7lqmVWEM2154jxb3jjTlhf3Sdqb7Hv208A7kh6LiFKu8lVURFwCXAIwePDgqHF1rEpau8J3EDCPLKxLNzOYNRpn2vLEeba8caat0wYNGkSfPn0WGQYNGlTNaiwTER8B+wJXR8SWwE4lrDcZWL3g82ppWik6s67lXItX+CLiZeAcSS9ExF1VrJNZRTjTlifOs+WNM23lMHHiRCIWvXAlVbXfnx6SVgEOAE5qx3pPAf0lrU3WWDsIOKTEde8BzpK0XPq8CyX2DGr519oVvibPSLpM0l0AkgZKOrLC9TKrJGfa8sR5trxxpq3RnU7WAHslIp6StA4woa2VImIucExadzxwQ0SMk3R6ukUUSVtImgR8E/izpHFp3feBM8gajU8Bpzd14GJWSoPvSrLgrZo+/wc4rkL1MauGK3GmLT+uxHm2fLkSZ9oaWET8PSI2jogfpM+vRcR+Ja57Z0SsHxHrRsSZadrJETEyjT8VEatFxJIRsUJEbFSw7uURsV4arqjEvlljKqXB1zcibiDrFrnp7MO8itbKrLKcacsT59nyxpm2hibpCkmXFw+1rpd1Xa310tnkU0krkL3EEUlbATMqWiuzynKmLU+cZ8sbZ9oa3e0F473J3iPpN4xbzZTS4Due7B0i60p6FFgR2L+itTKrLGfa8sR5trxxpq2hRcRNhZ8lXQf8q0bVMWu7wRcRz0j6KjAAEPAyMLTSFTOrFGfa8sR5trxxpi2H+gMr1boS1nW12OCT1J2sO9l+wF2pl6A9yV7WuDiwWXWqaFYezrTlifNseeNMW15I+pjslmSln28DJ9S0UtaltXaF7zKyFzj+G/iDpCnA5sDPI+KWKtTNrNycacsT59nyxpm2XIiIpWtdh84Y0H8Dpk7zI4d50lqDbwiwcUTMl9Sb7OzEuhHxXjkKTmfyRgOTI2LP9JLJ64EVgKeBb0XE7HKUZZY405YnzrPljTNtDU3S4NbmR8Qz1apLZ0ydNoUD+59S8XJGTDit4mVYprXXMsyOiKYukWcC
r5XroJv8iOylkk3OAX4bEesBHwB+yaqVmzNteeI8W94409bozm9l+HUN62VdXGsNvg0kvZCGMQWfx0h6oTOFSloN+DpwafosYEfgxrTIVcA+Hd3+55/M4uwjruW8o0Zw5uHXMP7fby6Y9+jIsXx/6G8XfP7bWfdz5uHX8OjIsQDMnjmHS35xOxHR0eKtfjVsps2a4Txb3jjT1tAiYodWhh1rXT/rulq7pXPDCpb7O+BnQNM9zisAH6aXqwJMIntoexGSjgKOAlh99dWb3fhiS/Tif/5yEN17dOPdSR/y55/fzi//uiZzZs3l6QcmsPyXsmI/nfE5H777MSdcdhDnfXcEW+89iLuvfordhg8l+7/AcqZhM23WDOfZ8saZtlyQdDRwTUR8mD4vBxwcEX+sacWsy2rxCl9EvNna0NECU49b70TE0x1ZPyIuiYghETGkb9++zS7TrZvo3iPbtc8/nc1q/VcE4IHrn+Gr+20MqTHXo1cPZs+cy7y58+nZqwfvTp7BrE9ns8YA95ybR42cabNizrPljTNtOfLdpsYeQER8AHy3dtWxrq611zL8KyK2KehadsEsICKiTwfL3BrYW9IeQG+gD/B7YFlJPdLZttWAyR3cPgAfvPMxl5x4O9MmfsDwk3fl049m8p9nJrPb8KGMOH8UAIst3pMhXxvAVWfcwz5Hb81dlz/JLocPYcT5D9G9Z3f2Puor9OrdszPVsDrS6Jk2K+Q8W94405Yj3SUp0vNBqcOgXjWuk3VhrV3h2yb9XDoi+hQMS3fioEtE/DwiVouItYCDgAcj4lDgIWD/tNhw4NaOlgGw3EpLc8LlB/OLqw/lunMf5K4rnmS34Vssstx2+27MUWftycxP57DOxqvw+O0vssUuG7DGgJV48q7xzWzZGlWjZ9qskPNseeNMW47cDYyQtJOknYDr0jSzmmit05YFJHWXtKqkNZqGCtTlBOB4Sa+Q3Vt/WUc3NGf23AXjiy/Zi95L9GLamx9w5+VP8rtjbmLG9E/484m3LVhm7px5/Puel9h670HM+mw2c+fMY+6cecz8bE4ndsfqWaNl2qw1zrPljTNtDe4E4EHg/6XhAbJnSM1qorVOWwCQ9EPgFGAaMD9NDmDjzhYeEaOAUWn8NWBoZ7cJMOXV9xhx/kN069aNefPmc+BPt2fDoWsumP+LYZfxvbP3WvD5wRHPstNBmyGJbfb5MledcS/de3Tje2fvWY7qWJ1pxEybtcR5trxxpq3RRfZ6kYvTYFZzbTb4yN5bMyDK+y6cilpzw5X52aUHtTj/rFsXftXOLocNWTC+Wv8VOenqQytWN6sLDZXpiS9N49pzH6RbN9GtezeG/2oXFlu8J5efcjdzZ89j+S8tzbd++TV69urB3866nzdfmsb2+2/C1nsPYvbMOVx5+j1898yvu+fZ/GqoPJuVwJk2MyujUhp8bwEzKl0RsypqqEwv03cpjvvDfvReshdj/vUaI//8GEsuszj/tddGDN11A+668t88fvuLbL5Tf79mpGtqqDyblcCZNjMro1IafK8BoyTdAcxqmhgRv6lYrcwqq6EyvUzfJReM9+jVnW7duzFt4gfseOCmAKy90Zd45B9j2HL3Df2aka6pofJsVgJn2hqapLUj4vWiaVtExFO1qpN1baU0+CamoRcN0KXsgP4bMHXalEWmr7Lyqrw84aUa1MjqUENlusmsz+dwyx8fZfjJu/LY7eMY+9gb7Hjgcox59HU+/WimXzPSdTVkns1a4Uxbo7tJ0l4RMRlA0leBC4Ev17ZaVmnd1J0+fTrcqXDJ2tuuabPBFxGndapGVTZ12hQO7H/KItNHTGio3bAKarRMQ9aT7J9PvJ3dhm/BquuswB5HbMl15z7Arx+cwOrrr8iyK2ZXAbfbd2O223djxj72xkKvGZk+ZQZP3jWebb/R6T4PrM40Yp7NWuNMWw58D7hF0l7AYOD/gD1qWyWrhvkxr9l2SLm1t13T2ovXb2PhF58GMB14KCL+1qHamdVQo2Z6/vzgsl/dyWbbr8tmO/QHYImlF+PIM7L/O26+8BEGbvlFL7RNrxk54tRdGfHrh/yakZxq1DybtcSZtryIiKckHQvcC8wEdo6Id2tcLevCWrvC9+tmpi0PHCZpUEScWKE6mVVKQ2b62QcnMOZfr/PR+5/xxF3j6bdeXzbbvj93XPoE6iY2GLoGX95mnQXL+zUjXUZD5tmsFc60NbRmTlosQdYB0WWSiIi9a1Mz6+pabPBFxMPNTZc0Enga8IHXGkqjZnrznddn853XX2T6hkObfw+xXzPSNTRqns1a4kxbDjR30sKs5krptGUhETHPXbxbnjjTlifOs7XXoEGDmDhx4kLT1lhjDcaOHVujGi3MmbZGEREPS+oO3B8RO9S6PmZNWnuGb/lmJi8HHA6Mq1iNzCqkUTPdXM+z7nXWGjXPVn8mTpxIRCw0rRYNLGfa8iCdoJgvaZmI8PskrS60doXvabL7kJuO+k0PT48C/l9lq2VWEQ2Z6eZ6nnWvs0aD5tmsFc605cUnwBhJ9wGfNk2MiGNrVyXrylp7hm/talbErNKcacsT59nyxpm2HLk5DWZ1od3P8JmZmXXGxJemce25D9Ktm+jWvRvDf7ULM977lL+deT/T3vqAM285kuVXXhqAv511P2++NI3t99+ErfcexOyZc7jy9Hv47plfr8lth2ZmbYmIq2pdB7NC3WpdATMz61qW6bsUx/1hP3526UHs+q0hjPzzY6y6Tl9OvPJg1vnyKguW+3TG53z47seccNlB/PPmFwC4++qn2G34UDf2zKxuSeov6UZJL0p6rWkocd3dJL0s6RVJi/RMK2kxSSPS/CclrZWmryXpc0nPpeHiMu+WNbAWG3yStk4/F6tedcwqx5m2PGnkPC/Td0l6L9kLgB69utOtezeWWHoxei/Ra6HlevTqweyZc5k3dz49e/Xg3ckzmPXpbNYYsFItqm0V1siZNityBfAnYC6wA3A18Le2Vko9fF4E7A4MBA6WNLBosSOBDyJiPeC3wDkF816NiE3T8P3O74blRWtX+C5IPx+vRkXMqsCZtjxp+DzP+nwOt/zxUXY9fItm5y+2eE+GfG0AV51xD/scvTV3Xf4k2+67MSPOf4gbL/gns2fOqXKNrcIaPtNmyeIR8QCgiHgzIk4Fvl7CekOBVyLitYiYDVwPDCtaZhjQdMvojcBO8i0P1obWnuGbI+kSoJ+kC4pnuqcha0DOtOVJQ+d57px5/PnE29lt+Basus4KLS633b4bs92+GzP2sTdYZ+NVePz2F9lilw2YPmUGT941nm2/sXEVa20V1tCZNiswS1I3YIKkY4DJwFIlrNcPeKvg8yRgy5aWiYi5kmYATQfRtSU9C3wE/DIiHikuQNJRwFEAq6++eul7ZA2ttQbfnsDOwK5kXSWbNTpn2vKkYfM8f35w2a/uZLPt12WzHfq3ufzcOfP49z0vccSpuzLi1w8xd8485s6Zx8zPfIUvZxo202ZFfgQsARwLnAHsCAyvcJlTgTUi4j1JmwO3SNooIj4qXCg
iLgEuARg8eHA0sx3LodZeyzAduF7S+Ih4vop1MqsIZ9rypJHz/OyDExjzr9f56P3PeOKu8fRbry87HrgZ1579AJP+8y5/+cUdbLnbBmz/zU0BeHDEs+x00GZIYpt9vsxVZ9xL9x7d+N7Ze9Z2R6ysGjnTZoUi4qk0+glwRDtWnQwUXnZbLU1rbplJknoAywDvRUQAs1L5T0t6FVgfGN3+PbC8KeW1DO9J+gewdfr8CPCjiJhUuWqZVZQzbXnScHnefOf12Xzn9ReZfvyfvtns8rscNmTB+Gr9V+Skqw+tWN2sLjRcps0AJN0GtHjVLCL2bmMTTwH9Ja1N1rA7CDikaJmRZFcLHwf2Bx6MiJC0IvB+RMyTtA7QHyipZ1DLv1Jey3AFWbhWTcNtaVqHSFpd0kOpq9pxkn6Upi8v6T5JE9LP5TpahlkbnGnLE+fZ8saZtkb1a+B84HXgc+AvafgEeLWtlSNiLnAMcA8wHrghIsZJOl1SU2PxMmAFSa8AxwNNr27YDnhB0nNknbl8PyLeL9eOWWMrpcG3UkRcERFz03AlsGInypwL/CQiBgJbAUenLmdPBB6IiP7AA3wRYLNyc6atwwYNGkSfPn0WGQYNGlSrKjVcngf032CR39+A/ht0osqWMw2X6d8efSM/3umP3H7pEwC8PPotfrrLxZx31AjOO2oEb46fBsDtlz7BmYdfs2C5iOCSX9zOnFlzO7F7Vi8i4uGIeBjYOiIOjIjb0nAIsG2J27gzItaPiHUj4sw07eSIGJnGZ0bENyNivYgYGhGvpek3RcRG6ZUMgyPitkrtpzWeUm7pnC7pMOC69Plg4L2OFhgRU8keLCUiPpY0nqzHoWHA9mmxq4BRwAkdLcesFc60ddjEiRPJHpVYWA17xW64PE+dNoUD+5+y0LQRE07rWIUtjxou08NP3pXxT77JB+98smDal7dZm+En77rQcmMfe52Trj6UMw79K3t+ZyseHTmWLXfbkJ6LlfJ1zBrIkpLWaWqMpVs0l6xxnawLK+UK338DBwBvkx0w96d9D6C2SNJawGbAk8DK6aBMKmvlFtY5StJoSaOnT59ejmpY1+NMW544z5Y3DZfp5VdeepFp4554g3OOvJ5rz33gi3dGRjBv7ny69+jOZx/P4pXnJrPJdut2dres/vwYGCVplKSHgYfIeu40q4k2TylFxJtAWw+ZtpukpYCbgOMi4qPCs+Pp4dNmH3p1d7KNZ9CgQUycOHGhaWussQZjx46tSX2cacsT59nyJg+ZXnPDlTnzH0fSc7Ee/OOif3HvX0ez53e/wm7Dh3LpL+9gjyOGctcVT7LzIZtz4wX/ZN6ceezx31uy9HJLdH5HreYi4m5J/YGme9VfiohZtayTdW2lXOErO0k9yQ6610TEzWnyNEmrpPmrAO/Uom5Wfk23wBUOxQ3ARudMW544z5Y31c507yV7LbhNc8vdN+SN9Azfptuvx/fO3osVV1+WXr178tqYqay5wUoM2WUA91/3TLmKtzoQEbMi4vk0uLFnNVX1Bp+yU2qXAeMj4jcFs5q6mSX9vLXadTPrCGfa8sR5trypRaY/+/iL7/cvPTWRL625cAeg9/1tNLsePoRZn89hzux5zJszn1mfzi5X8WZmC6nFU8JbA98CxqSuYwF+AZwN3CDpSOBNsvv3zRqBM2154jxb3lQ801efcS+vvDCFubPn8eaLbzNwq7V4dORYevXuwVLLLs63T/mi85bR973MJtutS6/ePRnytfX584m3M2/ufIb/apcO76DVB0lbR8SjkhbzVT2rJyU3+CRtBZwK9AZ+FxG3dKTAiPgX0FJ3djt1ZJtmHeFMW544z5Y3jZTpw5tprO1wwKbNLjvkawMWjC+30tKcePnB5aiC1YcLgM3JXoo+uMZ1MVugxQafpC9FxNsFk44HvkF20HwSuKWyVTMrL2fa8sR5trxxpi0H5ki6BOgn6YLimRFxbA3qZNbqFb6LJT0DnBsRM4EPybpGng98VIW6mZWbM2154jxb3jjT1uj2BHYGdgWernFdzBZoscEXEftI2gu4XdLVwHHAIcASwD5VqZ1ZGTnTlifOs+VNo2Z6QP8NmDptykLTVll5VV6e8FKNamS1EhHTgesljY+I52tdH7MmrT7DFxG3SboT+AHwD+DMiPhnVWpmVgHOtOWJ82x504iZnjptCgf2P2WhaSMmnFaj2lideE/SP8g6DAJ4BPhRREyqYZ2sC2vxtQyS9pb0EHA3MBY4EBgm6XpJ61argmbl4kxbnjjPljfOtOXIFWSv/Vg1DbelaWY10doVvv8FhgKLA/dExFDgJ5L6A2cCB1Whfmbl5ExbnjjPljfOtLXLwIEDmTSpLi+arRQRhQ28KyUdV6vKmLXW4JsB7Et27/w7TRMjYgI+6FpjcqYtT5xnyxtn2tpl0qRJjBo1aqFp22+/fU3qUmS6pMOA69Lng4H3algf6+JavKWTrCvkFcgahYdUpzpmFeVMW544z9ZuAwcOpE+fPgsNdcSZtrz4b+AA4G1gKllvs0fUtEbWpbXWS+d04A9VrItZRTnTlifOs3VEHV8RcaYtNyLiTWDvWtfDrElrV/jMzMzMzMysgbnBZ2ZmZmZmllOtvofPzKwrq+Me4MzMzMxK4gaflY2/HFve1PPzTmZmVt8kbQWcCvQGfhcRt9S0QtZlucFnZdPcl2PwF2Qzs2rzCTiz6pP0pYh4u2DS8WS9zwp4ErilFvUyc4PPzMwsZ3wCzqwmLpb0DHBuRMwEPiR7JcN84KNaVsy6NnfaYmZmZmZ1rc7fIQlAROwDPAvcLulw4DhgMbL3S+5Ts4pZl+crfNYhvl3I8saZNjOrX43yTHVE3CbpTuAHwD+AMyPinzWulnVxbvBZhzTKgdesVM60NSqfrDCrD5L2Bn4MzAXOAv4K/ErSD4CTIuLVWtbPui43+KxN/jJheeI8W974ZIXlTQMfp/8XGAosDtwTEUOBn0jqD5wJHFTLylnX5QafLaSlg6y/TFgjKjXP4ExbY2jgL8JmJWvgkxgzgH2BJYB3miZGxATc2LMacoOvC2juC0Lv3r2ZOXNms8s36EHWcqilL7fN5belTDvPVu98jLauKocnML4BHAzMAQ6pcV3MFqirBp+k3YDfA92BSyPi7GrXoT3/8Zb6pbMS67dnm9D8FwRf5ai8ame61PzWY05LzS40n9+Wpll51cNxut5U6sSEj9GV5zxXT1e44yIipgN/6Mw22sqkpMWAq4HNgfeAAyPijTTv58CRwDzg2Ii4pzN1sfyomwafpO7ARcDXgEnAU5JGRsSL5dh+z549F+nCtxz/8Zb6pbPc67d3m1Z9lcx0c3luUq1MVWubVj+qnWmfmLBKqvT3jnrT2RMTlTr5bF8oMZNHAh9ExHqSDgLOAQ6UNJDsttGNgFWB+yWtHxHzqrsXVo/qpsFH9pDrKxHxGoCk64FhQFkOvHPmzPF/vFZtFct0c3kG59cqrqqZ9okJq7CK5bmlk3K1vOMCOn9iwiefK66UTA4DTk3jNwIXSlKafn1EzAJel/RK2t7jVa
q71TFFRK3rAICk/YHdIuI76fO3gC0j4pii5Y4CjkofBwAvF22qLzC9wtWtdlldZZ/WjIgVq1B2VTRgprtKzqpZVpfLdAl5hvz9+9c6Z9Uqp8vlOU2vl2N0NcvKWzktlVVXmS7xGDs2LTMpfX4V2JKsEfhERPwtTb8MuCsibiwqo5RjdEd0lczUeznNZrqervCVJCIuAS5pab6k0RExpBp1qVZZ3qd8q5dMO2eNU1Y9ayvPkL9//zzmzHn+Qr0co6tZVt7KqXZZ9ayUY3RHODP1XU63Sm68nSYDqxd8Xi1NM2tUzrTljTNteeI8W70pJZMLlpHUA1iGrPMW59laVE8NvqeA/pLWltSL7MHTkTWuk1lnONOWN8605YnzbPWmlEyOBIan8f2BByN7PmskcJCkxSStDfQH/l2leludq5tbOiNirqRjgHvIuqK9PCLGdWBTZb9MXQdleZ8aUANm2jlrnLJqwpmueTnVLMt5Lp3//eu/nGqX1SEtZVLS6cDoiBgJXAb8NXXK8j7phe5puRvIOniZCxxd5R46nZk6LqduOm0xMzMzMzOz8qqnWzrNzMzMzMysjNzgMzMzMzMzy6lcNfgk7SbpZUmvSDqxjNtdXdJDkl6UNE7Sj9L0UyVNlvRcGvYoU3lvSBqTtjk6TVte0n2SJqSfy3WyjAEF9X5O0keSjivXPkm6XNI76X0xTdOa3QdlLkj/bi9IGtyZfcuLSuU5bbtqma5GntM2K5Zp57k8fIxuVxk+RjeAPGTax2jnubMk7SMpJG1QwTLmpXw8L+kZSf9VwbK+JOl6Sa9KelrSnZLWL3MZTfszLu3TTyRVrl0WEbkYyB5ufRVYB+gFPA8MLNO2VwEGp/Glgf8AA8lecvnTCuzLG0DfomnnAiem8ROBc8r8u3sbWLNc+wRsBwwGxra1D8AewF2AgK2AJ2udp1oPlcxz2n7VMl3tPBf8/sqWaee5bP8mPkZ3/HfnY3SdDXnJtI/RznMZ/j1HAI8Ap1WwjE8KxncFHq5QOQIeB75fMG0TYNsK7s9KwP2V/P3l6QrfUOCViHgtImYD1wPDyrHhiJgaEc+k8Y+B8UC/cmy7HYYBV6Xxq4B9yrjtnYBXI+LNcm0wIv5J1ntUoZb2YRhwdWSeAJaVtEq56tKgKpZnqItMVzLPUOZMO89l4WN0x/kYXZ/ynGkfo60kkpYCtgGOJPUYWgV9gA8qtO0dgDkRcXHThIh4PiIeqVB5RMQ7wFHAMZJUiTLy1ODrB7xV8HkSFTg4SloL2Ax4Mk06Jt0ScHk5bnlIArg3XUY+Kk1bOSKmpvG3gZXLVBZkf6DXFXyuxD5By/tQlX+7BlO130kVMl3tPEN1Mu08t4+P0R3nY3R9ykumfYx2njtjGHB3RPwHeE/S5hUqZ/F0C+RLwKXAGRUqZxDwdIW23aKIeI3syvdKldh+nhp8FZfOYtwEHBcRHwF/AtYFNgWmAueXqahtImIwsDtwtKTtCmdGdv23LO/TUPZiz72Bv6dJldqnhZRzH6zjqpTpquUZapNp57k++BhdPs50ffAxujyc54o6mOzqNunnwRUq5/OI2DQiNgB2A66u1NWwPMpTg28ysHrB59XStLKQ1JPsoHtNRNwMEBHTImJeRMwH/kJ2e0enRcTk9PMd4B9pu9OabjlIP98pR1lkB/hnImJaKrMi+5S0tA8V/bdrUBX/nVQr01XOM1Qv085z+/gY3TE+RtevXGTax2jnuaMkLQ/sCFwq6Q3gf4ADKt0Qi4jHgb7AihXY/DigUlcpWyRpHWAe5f1bWyBPDb6ngP6S1k5njw4CRpZjwym4lwHjI+I3BdML7/n+BjC2eN0OlLWkpKWbxoFd0nZHAsPTYsOBWztbVnIwBbdVVGKfCrS0DyOBw1PPWVsBMwpuw+iqKpZnqF6ma5BnqF6mnef28TG6Y3yMrl8Nn2kfo53nTtof+GtErBkRa0XE6sDrwLaVLFRZb6DdgfcqsPkHgcUKbm9G0saSKrZPklYELgYuTFejyy9q0JtPpQaynpf+Q9Zr1kll3O42ZLcCvAA8l4Y9gL8CY9L0kcAqZShrHbKevp4nO8twUpq+AvAAMIGsJ5/ly1DWkmR/LMsUTCvLPpEdzKcCc8jujz+ypX0g6xHpovTvNgYYUuss1cNQqTynbVcl09XMc9puRTLtPNd3pn2M7tC2nenyZK+hM+1jtPPcyX/Ph4DdiqYdC/ypAmXNK/hbeB74egX3a1XghpSRccAdQP8K7c+4tD8/BbpVap+UCjUzMzMzM7OcydMtnWZmZmZmZlbADT4zMzMzM7OccoPPzMzMzMwsp9zgMzMzMzMzyyk3+MzMzMzMzHKqYRp8kuZJeq5gWKuM295H0sCCz6dL2rlc20/b3F7S7S1Mn5H26QVJ90taqYVtDJF0QRvlrCWp2XfYSPq2pFU7tgedI+l3krZL49ekfT2rYP4vJe1T8HlPSafXoKpV40w703niPDvPeeNMO9PWWCSdJGlc+rd+TtKWrSxbs2zWQsM0+IDPI2LTguGNMm57H2DBgTciTo6I+8u4/bY8kvZpY7IXuR5dvICkHhExOiKO7UQ53yZ7t0hVSVoB2Coi/ilpY7J/y42BLSQto+ylq1tGxC0Fq90B7CVpiWrXt4qcaWc6T5xn5zlvnGln2hqEpK8AewKD07/1zsBbrazybWqQzVpppAbfIiS9IalvGh8iaVQaP1XS5ZJGSXpN0rEF6xyeWv7PS/qrpP8C9gbOS2cD1pV0paT90/I7SXpW0pi0zcUKyj5N0jNp3gZp+lBJj6d1HpM0oB37I2Bp4IOC/firpEeBvxaerZO0oqT70pmMSyW92fS7ALpL+kuad6+kxdP+DAGuSfu5eCv7sGTa13+n/RiWpm+UpjWdFeyflr0j/T7HSjqwmV3bD7g7jc8BFpfUDehJ9uLJ04FTCleI7AWRo8j+eLsMZ9qZzhPn2XnOG2famba6tQowPSJmAUTE9IiYImlzSQ9LelrSPZJWaS6bNa15NVTqje7lHvjijfTPAf9I094A+qbxIcCoNH4q8BiwGNAXeI/sj3wj4D8F6yyffl4J7F9Q1pXA/kBvsrMD66fpVwPHFZT9wzT+A+DSNN4H6JHGdwZuSuPbA7c3s1/bAzPSfr0FvAT0KdiPp4HFi7cBXAj8PI3vBkTa17WAucCmad4NwGFpfBQwpKDslvbhrIJ1lk2/syWBPwCHpum9gMXJDqp/KdjmMs3s41XAXgWff5f29yfApsBlLfybHwr8odbZc6adaWfaecZ57lJ5dqadaQ+NNQBLpX/j/wB/BL6a/gYfA1ZMyxwIXJ7GF8pm3oceNI7PI2LTdix/R2St/FmS3gFWBnYE/h4R0wEi4v02tjEAeD0i/pM+X0V228Pv0ueb08+ngX3T+DLAVZL6kx0Me5ZQ10ciYk8ASScA5wLfT/NGRsTnzayzDfCNtB93S/qgYN7rEfFcQd3WaqXs5vZhF2BvST9Nn3sDawCPAydJWg24OSImSBoDnC/pHLL/FB5ppoxVgHebPkTEcU3jkm4DvifpJGAT4
L6I+Eua/Q75vtzuTC/MmW5szvPCnOfG50wvzJm2uhURn0jaHNgW2AEYAfwvMAi4L7uYTXdgas0qWUMNfUsn2Rmlpn3oXTRvVsH4PKhI47apjMLtnwE8FBGDgL2aqVdbRgLbFXz+tBP1Kq5ba8sWLidgv/jiuYU1ImJ8RFxLdhvK58CdknZM/ykNBsYA/yvp5GbK+Jxmfg/plo2nyc7KrBsRBwD764v753undbsSZ7r1ehXXrbVlnenac55br1dx3Vpb1nmuD8506/UqrltryzrTVlYRMS8iRkXEKcAxZFeDxxXk6ssRsUuNq1kTjd7gewPYPI3vV8LyDwLfVPYwL5KWT9M/JruHvdjLwFqS1kufvwU83EYZywCT0/i3S6hTsW2AV0tY7lHgAABJuwDLlbBOS/tZ7B7gh+nefiRtln6uA7wWERcAtwIbK+vh6LOI+BtwHtlBuNh4YL3CCZJ6AseRnVVcnOysJGRnX3ql8fWBZnv+yrE3cKad6fx4A+fZec6XN3CmnWmrO5IGpKvcTTYly8GKyjp0QVJPSRul+aVmMxcavcF3GvB7SaPJzhS1KiLGAWcCD0t6HvhNmnU98D/KHhRet2D5mcARwN/TLQTzgYvbKOZc4P8kPUvpZ/e2TQ+NPk92cP9JCeucBuyirCvkbwJvk4W3NVcCF5fwgOoZZLeEvCBpXPoM2YF+rKTnyC6RXw18Gfh3mnYK2eXzYneQPQdQ6Gjgqoj4DHgBWCL9jp+OiA/TMjukdbsSZ9qZzhPn2XnOG2fambb6tBTZrc0vSnqBrBfck8mejT0nZf054L/S8ldSWjZzQRHR9lJWd5T12jUvIuamMxd/auezBlUl6V/AngUH1baWXxm4NiJ2qmjFrG4405YnzrPljTNt1rjc4GtQ6bL1DWRXaWcDP4iIp2pbq5Ype/nl5xHxQonLbwHMKXgI3HLOmbY8cZ4tb5xps8blBp+ZmZmZmVlONfozfGZmZmZmZtYCN/jMzMzMzMxyyg0+MzMzMzOznHKDz8zMzMzMLKfc4DMzMzMzM8up/w97t2q86MRMQQAAAABJRU5ErkJggg==\n", 1399 | "text/plain": [ 1400 | "
" 1401 | ] 1402 | }, 1403 | "metadata": { 1404 | "needs_background": "light" 1405 | }, 1406 | "output_type": "display_data" 1407 | } 1408 | ], 1409 | "source": [ 1410 | "modes = ['Full', 'AFDB', 'AFDB70', 'AFDB90']\n", 1411 | "panel = ['A', 'B', 'C', 'D', 'E']\n", 1412 | "\n", 1413 | "fig, ax = plt.subplots(1, len(panel), figsize=(2.5*len(panel), 3))\n", 1414 | "percentage_dufs = []\n", 1415 | "\n", 1416 | "for j, mode in enumerate(modes):\n", 1417 | " if mode == 'Full':\n", 1418 | " tmp = indata\n", 1419 | " if 'AFDB' in mode:\n", 1420 | " tmp = indata.loc[indata.nAF2.astype(float) > 0]\n", 1421 | " if len(mode.split('AFDB')[-1]) > 0:\n", 1422 | " cut = int(mode.split('AFDB')[-1])\n", 1423 | " tmp = tmp.loc[tmp.AF2_longest_best70_pLDDT.astype(float) >= cut]\n", 1424 | " \n", 1425 | " h,_ = np.histogram(tmp.FULL_noDUF.astype(float), bins=[i for i in range(0, 105, 5)])\n", 1426 | " n_dark = h[0]\n", 1427 | " h = h*100/sum(h)\n", 1428 | "\n", 1429 | " colors = ['#57257F']\n", 1430 | " for i in range(len(h)-2):\n", 1431 | " colors.append('silver')\n", 1432 | " colors.append('white')\n", 1433 | "\n", 1434 | " x = list(range(len(h)))\n", 1435 | " y = list(h)\n", 1436 | "\n", 1437 | " ax[j].bar(x,y,1, align='edge', color=colors, edgecolor='k')\n", 1438 | " ax[j].set_facecolor('#F2F2F2')\n", 1439 | " ax[j].set_xticks(range(0,21,5))\n", 1440 | " ax[j].set_xticklabels(range(0,101,25))\n", 1441 | " ax[j].set_ylabel('% of UniRef50 clusters')\n", 1442 | " ax[j].set_xlabel('Functional Brightness (%)')\n", 1443 | " \n", 1444 | " ax[j].title.set_text('{} {}'.format(panel[j], mode))\n", 1445 | "\n", 1446 | " ax[j].set_ylim(0,100)\n", 1447 | " \n", 1448 | " percentage_dark = round(h[0])\n", 1449 | " ax[j].text(-0.1, percentage_dark+1, '{}%'.format(percentage_dark),\n", 1450 | " verticalalignment='bottom', horizontalalignment='left',\n", 1451 | " color='#57257F', fontsize=9)\n", 1452 | " \n", 1453 | " uniprot_n_dark = sum(tmp.loc[tmp.FULL_noDUF.astype(float) <=5].nACCs.astype(float))\n", 1454 | " print(mode, 'n =', len(tmp), 'n_dark =', n_dark, 'uniprot_n_dark =', uniprot_n_dark, '% uniprot =', uniprot_n_dark*100/sum(tmp.nACCs.astype(float)))\n", 1455 | "\n", 1456 | " uniref_n_dark = sum(tmp.loc[tmp.FULL_noDUF.astype(float) <=5].nUniRef100.astype(float))\n", 1457 | " print(mode, 'n =', len(tmp), 'n_dark =', n_dark, 'uniref100_n_dark =', uniref_n_dark, '% uniref100 =', uniref_n_dark*100/sum(tmp.nUniRef100.astype(float)))\n", 1458 | " \n", 1459 | " percentage_duf = len(tmp.loc[tmp.Has_duf == 1])*100/len(tmp.loc[tmp.FULL_noDUF.astype(float) <=5])\n", 1460 | " print('% UniRef50 dark with dufs =', percentage_duf)\n", 1461 | " print()\n", 1462 | " \n", 1463 | " percentage_dufs.append(percentage_duf)\n", 1464 | "\n", 1465 | "ax[j+1].bar(panel[:-1],percentage_dufs,1, align='center', color=['#57257F' for i in modes], edgecolor='k')\n", 1466 | "ax[j+1].set_facecolor('#F2F2F2')\n", 1467 | "ax[j+1].set_ylabel('% of dark clusters with DUF')\n", 1468 | "ax[j+1].set_xlabel('Set')\n", 1469 | "ax[j+1].title.set_text('({}) DUF content'.format(panel[j+1]))\n", 1470 | "ax[j+1].set_ylim(0,0.2)\n", 1471 | " \n", 1472 | "plt.tight_layout()\n", 1473 | "plt.savefig('plots/AFDBv4_uniref50_histogram_dark_content.pdf')\n", 1474 | "plt.savefig('plots/AFDBv4_uniref50_histogram_dark_content.png', dpi=2000)" 1475 | ] 1476 | }, 1477 | { 1478 | "cell_type": "code", 1479 | "execution_count": 6, 1480 | "id": "latest-canberra", 1481 | "metadata": {}, 1482 | "outputs": [ 1483 | { 1484 | "name": "stdout", 1485 | "output_type": "stream", 1486 | 
"text": [ 1487 | "brightness vs size Correlation AFDB90: 0.0\n" 1488 | ] 1489 | } 1490 | ], 1491 | "source": [ 1492 | "print('brightness vs size Correlation {}:'.format(mode), scipy.stats.pearsonr(indata['FULL_noDUF'], indata['nUniRef100'])[1])" 1493 | ] 1494 | }, 1495 | { 1496 | "cell_type": "code", 1497 | "execution_count": 7, 1498 | "id": "confidential-mistress", 1499 | "metadata": {}, 1500 | "outputs": [ 1501 | { 1502 | "data": { 1503 | "text/html": [ 1504 | "
"[styled HTML rendering of the darkness_bins summary table (mean, std and median nUniRef100 cluster size per 5% FULL_noDUF bin); markup was stripped on export and the same values appear in the text/plain output below]\n",
1656 | ""
" 1657 | ], 1658 | "text/plain": [ 1659 | " mean std median\n", 1660 | "darkness_bins \n", 1661 | "(-0.001, 5.0] 1.854932 7.471334 1\n", 1662 | "(5.0, 10.0] 3.238855 18.929983 1\n", 1663 | "(10.0, 15.0] 3.051652 16.244046 1\n", 1664 | "(15.0, 20.0] 2.881375 15.197970 1\n", 1665 | "(20.0, 25.0] 2.758930 15.388115 1\n", 1666 | "(25.0, 30.0] 2.670532 13.003847 1\n", 1667 | "(30.0, 35.0] 2.595750 11.021570 1\n", 1668 | "(35.0, 40.0] 2.644472 12.318777 1\n", 1669 | "(40.0, 45.0] 2.672398 11.483546 1\n", 1670 | "(45.0, 50.0] 2.666221 10.795142 1\n", 1671 | "(50.0, 55.0] 2.747724 11.707789 1\n", 1672 | "(55.0, 60.0] 2.814912 11.031360 1\n", 1673 | "(60.0, 65.0] 2.905467 12.595227 1\n", 1674 | "(65.0, 70.0] 2.974943 13.504286 1\n", 1675 | "(70.0, 75.0] 3.055948 13.730255 1\n", 1676 | "(75.0, 80.0] 3.233934 14.495583 1\n", 1677 | "(80.0, 85.0] 3.583951 15.663360 1\n", 1678 | "(85.0, 90.0] 4.109898 18.881363 1\n", 1679 | "(90.0, 95.0] 5.100711 24.738266 1\n", 1680 | "(95.0, 100.0] 18.727700 122.980629 2" 1681 | ] 1682 | }, 1683 | "execution_count": 7, 1684 | "metadata": {}, 1685 | "output_type": "execute_result" 1686 | } 1687 | ], 1688 | "source": [ 1689 | "indata.groupby(['darkness_bins'])['nUniRef100'].agg([np.mean, np.std, np.median])" 1690 | ] 1691 | }, 1692 | { 1693 | "cell_type": "markdown", 1694 | "id": "behavioral-veteran", 1695 | "metadata": {}, 1696 | "source": [ 1697 | "# 2. Define the AFDB90 set and collect all associated sequences from the previously constructed MongoDB\n", 1698 | "\n", 1699 | "The AFDB90 set corresponds to those UniRef50 clusters where the longest member with a pLDDT >70% has a pLDDT >90%. We thus select only those clusters from the table above and save them as a table and the corresponding fasta file. \n", 1700 | "\n", 1701 | "The table will be used for further analysis in the other Jupyter notebooks. The fasta file will be used for the all-against-all mmseqs2 searches that form the basis of the sequence similarity network."
1702 | ] 1703 | }, 1704 | { 1705 | "cell_type": "code", 1706 | "execution_count": 8, 1707 | "id": "joint-sample", 1708 | "metadata": {}, 1709 | "outputs": [], 1710 | "source": [ 1711 | "AFDB90 = indata.loc[indata.AF2_longest_best70_pLDDT.astype(float) >= 90]\n", 1712 | "AFDB90.to_csv('data_generated_v2/AFDB90v4_data.csv')" 1713 | ] 1714 | }, 1715 | { 1716 | "cell_type": "code", 1717 | "execution_count": null, 1718 | "id": "cardiac-instrument", 1719 | "metadata": {}, 1720 | "outputs": [], 1721 | "source": [ 1722 | "dbuilder_path = None # change accordingly\n", 1723 | "\n", 1724 | "import sys\n", 1725 | "import os\n", 1726 | "sys.path.append(dbuilder_path)\n", 1727 | "\n", 1728 | "import extract_uniprot as uniprot\n", 1729 | "\n", 1730 | "MONGO_HOST = \"10.1.0.202\"\n", 1731 | "MONGO_PORT = 30077\n", 1732 | "\n", 1733 | "uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT)" 1734 | ] 1735 | }, 1736 | { 1737 | "cell_type": "code", 1738 | "execution_count": null, 1739 | "id": "quality-harris", 1740 | "metadata": {}, 1741 | "outputs": [], 1742 | "source": [ 1743 | "outfasta = 'data_generated_v2/AFDBv4_90.fasta'\n", 1744 | "\n", 1745 | "count = 0\n", 1746 | "step = 100000\n", 1747 | "\n", 1748 | "target_ids = [i.split('_')[1] for i in AFDB90.index]\n", 1749 | "n_entries = len(target_ids)\n", 1750 | "\n", 1751 | "chuncks = [target_ids[i:i+step] if i+step < len(target_ids) else target_ids[i:] for i in range(0, n_entries, step)]\n", 1752 | "collected_ids = []\n", 1753 | "\n", 1754 | "print('Getting sequences for {} chuncks'.format(len(chuncks)))\n", 1755 | " \n", 1756 | "with open(outfasta, 'w') as out:\n", 1757 | " for i, chunck in enumerate(chuncks):\n", 1758 | " documents = uniprot_db.col.find({'_id': {'$in': chunck}})\n", 1759 | " for doc in documents:\n", 1760 | " out.write('>{}\\n{}\\n'.format(doc['_id'], doc['data']['SEQ']))\n", 1761 | "\n", 1762 | " " 1763 | ] 1764 | } 1765 | ], 1766 | "metadata": { 1767 | "kernelspec": { 1768 | "display_name": "Python 3 (ipykernel)", 1769 | "language": "python", 1770 | "name": "python3" 1771 | }, 1772 | "language_info": { 1773 | "codemirror_mode": { 1774 | "name": "ipython", 1775 | "version": 3 1776 | }, 1777 | "file_extension": ".py", 1778 | "mimetype": "text/x-python", 1779 | "name": "python", 1780 | "nbconvert_exporter": "python", 1781 | "pygments_lexer": "ipython3", 1782 | "version": "3.6.6" 1783 | } 1784 | }, 1785 | "nbformat": 4, 1786 | "nbformat_minor": 5 1787 | } 1788 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 schwede 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AFDB90v4 2 | 3 | This repository contains all analysis code, data and metadata generated for the current submission of our manuscript "What is hidden in the darkness? Deep-learning assisted large-scale protein family curation uncovers novel protein families and folds". 4 | 5 | ## Repo organisation 6 | 7 | The code is organised in python notebooks (for major data analysis), python scripts (for large-scale data generation and processing) and bash scripts. The Notebooks are divided into four main analysis tasks, and describe which scripts were used to generate and analyse the data (which is also provided precomputed in https://zenodo.org/record/8121336). 8 | 9 | ## How to use this repo 10 | 11 | To use the code in this repo, just download it as well as the data available [Zenodo](https://zenodo.org/record/8121336) and follow the Jupyter Notebooks from 1-4. Each notebook corresponds to a specific analysis step, and lists which scripts were run to generate the data to be analysed. 12 | 13 | The code in `make_shapemers.py` for the AFDB dataset is written for the entire AFDB download (with tar and zip etc.) but there are functions for running it on individual files or folders with PDB/CIF files 14 | 15 | A script to predict outlier scores for user input proteins is coming soon. 16 | 17 | ## Dependencies 18 | 19 | The code was written in Python 3.6 (network generation and analysis) and 3.9+ (shape-mer generation and outlier detection). 20 | 21 | For the *analysis of the data*, common, standard python modules were used. Extra modules required are: 22 | - networkx 23 | - scipy 24 | - seaborn 25 | - pandas 26 | - datashader 27 | - geometricus 28 | - torch 29 | - numba 30 | - numpy 31 | - tqdm 32 | - sklearn 33 | - gensim 34 | - scikit-learn 35 | 36 | For the *generation* of the data, we used the `dbuilder` package, which is part of the ProteinUniverseAtlas project and can be found in https://github.com/ProteinUniverseAtlas/dbuilder. Shape-mers were generated using the trained ShapemerLearn model from geometricus, for which model and training code can be found in [https://github.com/TurtleTools/geometricus/tree/master/training](https://github.com/TurtleTools/geometricus/tree/master/training). 
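As a companion to the "How to use this repo" section above, here is a minimal sketch (not part of the original notebooks) of the first selection step that Notebook 1 performs on the precomputed data: loading the per-UniRef50-cluster table and keeping the AFDB90 subset. The CSV path is an assumption, so point it at whichever per-cluster table you extracted from the Zenodo record (for example, an `AFDBv4_pLDDT_diggestion_*.csv` file written by `scripts/AFDBv4_pLDDT_analysis.py`); the column names `AF2_longest_best70_pLDDT` and `FULL_noDUF` are the ones used throughout the notebooks.

```python
# Minimal sketch, assuming the Zenodo archive provides the per-cluster CSV
# written by scripts/AFDBv4_pLDDT_analysis.py; adjust the path to your download.
import pandas as pd

indata = pd.read_csv("data_generated/AFDBv4_pLDDT_diggestion_UniRef50.csv", low_memory=False)

# AFDB90: UniRef50 clusters whose longest member with pLDDT > 70 reaches pLDDT >= 90
afdb90 = indata.loc[indata.AF2_longest_best70_pLDDT.astype(float) >= 90]

# Functionally dark clusters: at most 5% annotation coverage, DUFs excluded
dark = afdb90.loc[afdb90.FULL_noDUF.astype(float) <= 5]

print(len(indata), "UniRef50 clusters,", len(afdb90), "in AFDB90,", len(dark), "of them dark")
```

This mirrors the cell in Notebook 1 that writes `data_generated_v2/AFDB90v4_data.csv` before the corresponding sequences are pulled from MongoDB and written to the fasta file used for the all-against-all mmseqs2 search.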
37 | 38 | -------------------------------------------------------------------------------- /plots/AFDB90v4_component_darkness_histogram.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_component_darkness_histogram.pdf -------------------------------------------------------------------------------- /plots/AFDB90v4_component_darkness_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_component_darkness_histogram.png -------------------------------------------------------------------------------- /plots/AFDB90v4_histogram_dark_content.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_histogram_dark_content.pdf -------------------------------------------------------------------------------- /plots/AFDB90v4_histogram_dark_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDB90v4_histogram_dark_content.png -------------------------------------------------------------------------------- /plots/AFDBv4_uniref50_histogram_components_word_diversity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_components_word_diversity.pdf -------------------------------------------------------------------------------- /plots/AFDBv4_uniref50_histogram_components_word_diversity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_components_word_diversity.png -------------------------------------------------------------------------------- /plots/AFDBv4_uniref50_histogram_dark_content.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_dark_content.pdf -------------------------------------------------------------------------------- /plots/AFDBv4_uniref50_histogram_dark_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/AFDBv4_uniref50_histogram_dark_content.png -------------------------------------------------------------------------------- /plots/community_cosmograph_layout_darkness.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/community_cosmograph_layout_darkness.png -------------------------------------------------------------------------------- /plots/component159_community_cosmograph_layout_darkness.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/component159_community_cosmograph_layout_darkness.png -------------------------------------------------------------------------------- /plots/outliers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ProteinUniverseAtlas/AFDB90v4/721bfef9e7b725d4f7ab3e7dc82d1b0835aa0f1f/plots/outliers.pdf -------------------------------------------------------------------------------- /scripts/AFDBv4_DUF_analysis_dark.py: -------------------------------------------------------------------------------- 1 | path_to_dbuilder = #path to dbuilder 2 | 3 | import sys 4 | import os 5 | 6 | sys.path.append(path_to_dbuilder) 7 | from src import extract_uniref as uniref 8 | from src import extract_interpro as interpro 9 | from src import extract_uniparc as uniparc 10 | from src import extract_uniprot as uniprot 11 | 12 | import time 13 | import numpy as np 14 | 15 | def write_data_to_file(data, target_uniref, outfile): 16 | 17 | if not os.path.isfile(outfile): 18 | with open(outfile, 'w') as outp: 19 | outp.write(','.join(sorted(list(data.keys())))) 20 | outp.write('\n') 21 | 22 | with open(outfile, 'a+') as outp: 23 | for i in range(len(data['unirefID'])): 24 | line_data = [str(data[key][i]) if data[key][i] is not None else str(np.nan) for key in sorted(list(data.keys()))] 25 | outp.write(','.join(line_data)) 26 | outp.write('\n') 27 | 28 | return {key: [] for key in data} 29 | 30 | 31 | # LOAD TARGET UNIREF DATABASE 32 | 33 | target_uniref = sys.argv[1] # either UniRef90 or UniRef50 34 | 35 | MONGO_HOST = "10.1.0.202" 36 | MONGO_PORT = 30077 37 | 38 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref) 39 | uniref_db.index_db() 40 | 41 | interpro_db = interpro.interpro_db_diggested(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 42 | interpro_db.index_db() 43 | 44 | uniparc_db = uniparc.uniparc_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 45 | uniparc_db.index_db() 46 | 47 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 48 | uniprot_db.index_db() 49 | 50 | # COUNT ENTRIES IN THE UNIREF DATABASE 51 | 52 | print('\nCOUNTING DATABASE SIZE') 53 | 54 | n_entries = uniref_db.col.count_documents({}) 55 | print(' ... 
FOUND {} ENTRIES'.format(n_entries)) 56 | 57 | # DEFINE OUTPUT FILE AND CHECK THE UNIREF IDS ALREADY THERE 58 | 59 | print('\nDEFINING OUTPUT FILE') 60 | 61 | outfile = 'AFDBv4_DUF_dark_diggestion_{}_2023-02-06.csv'.format(target_uniref) 62 | 63 | # GO THROUGH EACH CHUNCK AND COLLECT THE TARGET DATA 64 | 65 | start = time.time() 66 | 67 | print('\nGOING THROUGH THE CHUNCKS') 68 | 69 | write_step = 50000 70 | 71 | data = {'unirefID': [], 'Has_duf': []} 72 | 73 | curr_count = 0 74 | for document in uniref_db.col.find(): 75 | uniref_id = document['_id'] 76 | uniref_dt = document['data'] 77 | 78 | curr_count += 1 79 | 80 | if uniref_dt['DARKNESS']['FULL_noDUF'] <= 5: 81 | rep = uniref_dt['DARKNESS']['REP'] 82 | has_duf = 0 83 | domains = [] 84 | 85 | if rep is not None: 86 | if not rep.startswith('UP'): 87 | try: 88 | domains = interpro_db.query(rep)[0]['data'] 89 | except: 90 | pass 91 | 92 | uniprot_dt = uniprot_db.query(rep)[0]['data'] 93 | if 'CHAINS' in uniprot_dt: 94 | domains += uniprot_dt['CHAINS'] 95 | 96 | else: 97 | try: 98 | domains = uniparc_db.query(rep)[0]['data']['ANNO'] 99 | except: 100 | pass 101 | 102 | if len(domains) > 0: 103 | for domain in domains: 104 | if 'DUF' in domain[0]: 105 | has_duf = 1 106 | 107 | data['unirefID'].append(uniref_id) 108 | data['Has_duf'].append(has_duf) 109 | 110 | if curr_count % write_step == 0: 111 | data = write_data_to_file(data, target_uniref, outfile) 112 | 113 | numb_seconds = time.time() - start 114 | time_to_end = round(((numb_seconds/curr_count)*n_entries)-numb_seconds) 115 | 116 | print('{} out of {}'.format(curr_count, n_entries), 'Time passed since start: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end)))) 117 | 118 | 119 | data = write_data_to_file(data, target_uniref, outfile) 120 | 121 | numb_seconds = time.time() - start 122 | print('\nFINISHED AFTER: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 123 | 124 | -------------------------------------------------------------------------------- /scripts/AFDBv4_pLDDT_analysis.py: -------------------------------------------------------------------------------- 1 | path_to_dbuilder = #path to dbuilder 2 | 3 | import sys 4 | import os 5 | 6 | sys.path.append(path_to_dbuilder) 7 | from src import extract_uniref as uniref 8 | from src import extract_alphafold as alphafold 9 | from src import extract_uniprot as uniprot 10 | 11 | import time 12 | import numpy as np 13 | 14 | def write_data_to_file(data, target_uniref, outfile): 15 | 16 | if not os.path.isfile(outfile): 17 | with open(outfile, 'w') as outp: 18 | outp.write(','.join(sorted(list(data.keys())))) 19 | outp.write('\n') 20 | 21 | with open(outfile, 'a+') as outp: 22 | for i in range(len(data['unirefID'])): 23 | line_data = [str(data[key][i]) if data[key][i] is not None else str(np.nan) for key in sorted(list(data.keys()))] 24 | outp.write(','.join(line_data)) 25 | outp.write('\n') 26 | 27 | return {key: [] for key in data} 28 | 29 | 30 | # LOAD TARGET UNIREF DATABASE 31 | 32 | target_uniref = 
sys.argv[1] # either UniRef90 or UniRef50 33 | 34 | MONGO_HOST = "10.1.0.202" 35 | MONGO_PORT = 30077 36 | 37 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref) 38 | uniref_db.index_db() 39 | 40 | alphafold_db = alphafold.alphafold_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 41 | alphafold_db.index_db() 42 | 43 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 44 | uniprot_db.index_db() 45 | 46 | # COUNT ENTRIES IN THE UNIREF DATABASE 47 | 48 | print('\nCOUNTING DATABASE SIZE') 49 | 50 | n_entries = uniref_db.col.count_documents({}) 51 | print(' ... FOUND {} ENTRIES'.format(n_entries)) 52 | 53 | # DEFINE OUTPUT FILE AND CHECK THE UNIREF IDS ALREADY THERE 54 | 55 | print('\nDEFINING OUTPUT FILE') 56 | 57 | outfile = 'data_generated/AFDBv4_pLDDT_diggestion_{}.csv'.format(target_uniref) 58 | 59 | # GO THROUGH EACH CHUNCK AND COLLECT THE TARGET DATA 60 | 61 | start = time.time() 62 | 63 | print('\nGOING THROUGH THE CHUNCKS') 64 | 65 | write_step = 50000 66 | 67 | data = {'unirefID': [], 'median_pLDDT': [], 'max_pLDDT': [], 'min_pLDDT': [], 'delta_pLDDT': [], 'nACCs': [], 68 | 'nUniRef100': [], 'nUniRef90': [], 'nAF2': [],'AF2_REP_best_len': [], 'AF2_REP_worst_len': [], 69 | 'AF2_REP_best': [], 'AF2_REP_worst': [], 'AF2_longest_best70': [],'AF2_longest_best70_pLDDT': [], 70 | 'AF2_longest_best70_len': [], 'median_Evidence': []} 71 | # , 'uniref_rep_len': []} 72 | 73 | curr_count = 0 74 | for document in uniref_db.col.find(): 75 | uniref_id = document['_id'] 76 | uniref_dt = document['data'] 77 | 78 | curr_count += 1 79 | 80 | for key in uniref_dt['DARKNESS']: 81 | if key != 'pLDDTs': 82 | if 'AF2_REP' in key: 83 | if uniref_dt['DARKNESS'][key] is not None: 84 | data['{}_len'.format(key)].append(uniref_dt['DARKNESS'][key]['LEN']) 85 | data[key].append(uniref_dt['DARKNESS'][key]['ACC']) 86 | 87 | else: 88 | data['{}_len'.format(key)].append(np.nan) 89 | data[key].append(None) 90 | 91 | else: 92 | if key not in data: 93 | data[key] = [] 94 | try: 95 | data[key].append(round(uniref_dt['DARKNESS'][key], 2)) 96 | except: 97 | data[key].append(uniref_dt['DARKNESS'][key]) 98 | 99 | if key == 'pLDDTs': 100 | if len(uniref_dt['DARKNESS'][key]) > 0: 101 | median = np.median(uniref_dt['DARKNESS'][key]) 102 | maximum = max(uniref_dt['DARKNESS'][key]) 103 | minimum = min(uniref_dt['DARKNESS'][key]) 104 | delta = min(uniref_dt['DARKNESS'][key])-max(uniref_dt['DARKNESS'][key]) 105 | 106 | data['median_pLDDT'].append(median) 107 | data['max_pLDDT'].append(maximum) 108 | data['min_pLDDT'].append(minimum) 109 | data['delta_pLDDT'].append(delta) 110 | 111 | else: 112 | data['median_pLDDT'].append(np.nan) 113 | data['max_pLDDT'].append(np.nan) 114 | data['min_pLDDT'].append(np.nan) 115 | data['delta_pLDDT'].append(np.nan) 116 | 117 | # get the longest protein with a pLDDT > 70 118 | longest_pLDDT_best70 = None 119 | best_pLDDT = None 120 | best_n_res = 0 121 | 122 | af_docs = alphafold_db.col.find({'_id': {'$in': uniref_dt['ACC']}}) 123 | for af_document in af_docs: 124 | curr_dt = af_document['data'] 125 | 126 | # get plddt 127 | avgPLDDT = [] 128 | n_res = 0 129 | for fragment in curr_dt: 130 | avgPLDDT.append(curr_dt[fragment]['pLDDT']['avg_pLDDT']*curr_dt[fragment]['pLDDT']['Lenght']) 131 | n_res += curr_dt[fragment]['pLDDT']['Lenght'] 132 | 133 | fullprotein_pLDDT = sum(avgPLDDT)/n_res 134 | 135 | if fullprotein_pLDDT > 70 and n_res > best_n_res: 136 | longest_pLDDT_best70 = af_document['_id'] 
137 | best_pLDDT = fullprotein_pLDDT 138 | best_n_res = n_res 139 | 140 | if best_n_res == 0: 141 | best_n_res = None 142 | 143 | # rep_length = len(uniprot_db.query(uniref_id.split('_')[-1])[0]['data']['SEQ']) 144 | 145 | # get the median evidence level 146 | evidence_level = [] 147 | uniprot_docs = uniprot_db.col.find({'_id': {'$in': uniref_dt['ACC']}}) 148 | for up_document in uniprot_docs: 149 | evidence_level.append(up_document['data']['EVIDENCE']['LEVEL']) 150 | if len(evidence_level) > 0: 151 | median_evidence = np.median(evidence_level) 152 | else: 153 | median_evidence = np.nan 154 | 155 | data['nACCs'].append(len(uniref_dt['ACC'])) 156 | data['nUniRef100'].append(len(uniref_dt['UNIREF']['UniRef100'])) 157 | data['nUniRef90'].append(len(uniref_dt['UNIREF']['UniRef90'])) 158 | data['nAF2'].append(len(uniref_dt['DARKNESS']['pLDDTs'])) 159 | data['unirefID'].append(uniref_id) 160 | # data['uniref_rep_len'].append(rep_length) 161 | data['AF2_longest_best70'].append(longest_pLDDT_best70) 162 | data['AF2_longest_best70_len'].append(best_n_res) 163 | data['AF2_longest_best70_pLDDT'].append(best_pLDDT) 164 | data['median_Evidence'].append(median_evidence) 165 | 166 | if curr_count % write_step == 0: 167 | data = write_data_to_file(data, target_uniref, outfile) 168 | 169 | numb_seconds = time.time() - start 170 | time_to_end = round(((numb_seconds/curr_count)*n_entries)-numb_seconds) 171 | 172 | print('{} out of {}'.format(curr_count, n_entries), 'Time passed since start: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end)))) 173 | 174 | 175 | data = write_data_to_file(data, target_uniref, outfile) 176 | 177 | numb_seconds = time.time() - start 178 | print('\nFINISHED AFTER: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 179 | 180 | -------------------------------------------------------------------------------- /scripts/get_communities_summary.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pandas as pd 4 | import json 5 | import networkx as nx 6 | import numpy as np 7 | 8 | import scipy 9 | from scipy import stats 10 | 11 | from multiprocessing.pool import ThreadPool 12 | from collections import Counter 13 | 14 | import sys 15 | 16 | # LOAD MY DBs 17 | 18 | path_to_dbuilder = #path to dbuilder 19 | 20 | sys.path.append(path_to_dbuilder) 21 | 22 | from src import extract_uniref as uniref 23 | from src import extract_uniprot as uniprot 24 | from src import extract_uniparc as uniparc 25 | 26 | target_uniref = 'UniRef50' 27 | 28 | MONGO_HOST = "10.1.0.202" 29 | MONGO_PORT = 30077 30 | 31 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref) 32 | uniref_db.index_db() 33 | 34 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 35 | uniprot_db.index_db() 36 | 37 | uniparc_db = uniparc.uniparc_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 38 | uniparc_db.index_db() 39 | 40 | 41 | 
# GET INPUTS 42 | AFDB_data = sys.argv[1] 43 | uniprot_tax = sys.argv[2] 44 | threads = int(sys.argv[3]) 45 | infolder = AFDB_data.split('/')[-2] 46 | 47 | # LOAD INPUTS 48 | 49 | print('1. Loading AFDB data') 50 | AFDB90_CC = pd.read_csv(AFDB_data, dtype = {'communityID': str}) 51 | AFDB90_CC = AFDB90_CC.sort_values(by='unirefID') 52 | AFDB90_CC = AFDB90_CC.set_index("unirefID") 53 | 54 | print('2. Loading taxonomy data') 55 | taxonomy = pd.read_csv(uniprot_tax, index_col=0) 56 | 57 | print('3. Getting outlier data') 58 | outliers = '/scicore/home/schwede/durair0000/projects/turtle_tools/afdb-geometricus/data/outlier_results/' 59 | outliers_data = {} 60 | 61 | files = sorted(os.listdir(outliers)) 62 | for i, file in enumerate(files): 63 | if i % 10 == 0: 64 | print(i, len(files)) 65 | 66 | with open('{}/{}'.format(outliers, file)) as f: 67 | for line in f: 68 | line = line.strip().split() 69 | uniprot_id = line[0].split('-')[0] 70 | if uniprot_id not in outliers_data: 71 | outliers_data[uniprot_id] = [float(line[1])] 72 | else: 73 | outliers_data[uniprot_id].append(float(line[1])) 74 | 75 | 76 | # DEFINE ROUTINES 77 | 78 | def chunk_list(l, n): 79 | 80 | chunks = np.array_split(np.array(l), n) 81 | 82 | chunks = [list(chunk) for chunk in chunks] 83 | return chunks 84 | 85 | def get_tax_from_menzi(subgr_members, max_chunck_size=10000): 86 | 87 | superkingdoms = [] 88 | 89 | for unirefID in subgr_members.index: 90 | 91 | curr_accs = uniref_db.query(unirefID)[0]['data']['UNIREF']['UniRef100'] 92 | curr_accs = [i.split('_')[-1] for i in curr_accs] 93 | 94 | if len(curr_accs) > max_chunck_size: 95 | ratio = len(curr_accs)/max_chunck_size 96 | if ratio - round(ratio) > 0: 97 | n_chunks = round(ratio) + 1 98 | else: 99 | n_chunks = round(ratio) 100 | chuncks = chunk_list(curr_accs, n_chunks, counts=None) 101 | else: 102 | chuncks = [curr_accs] 103 | 104 | for chunck in chuncks: 105 | 106 | up_docs = uniprot_db.col.find({'_id': {'$in': chunck}}) 107 | for doc in up_docs: 108 | acc = doc['_id'] 109 | try: 110 | tax = doc['data']['TAXID'][2][0] 111 | superkingdoms.append(tax) 112 | except: 113 | pass 114 | 115 | uparc_docs = uniparc_db.col.find({'_id': {'$in': curr_accs}}) 116 | for doc in uparc_docs: 117 | acc = doc['_id'] 118 | try: 119 | tax = doc['data']['TAXID'][2][0] 120 | superkingdoms.append(tax) 121 | except: 122 | pass 123 | 124 | try: 125 | count = Counter(superkingdoms) 126 | return count.most_common(1)[0][0], count.most_common(1)[0][1]*100/len(superkingdoms) 127 | except: 128 | return np.nan, np.nan 129 | 130 | def get_comunities_summary_for_communities(arguments): 131 | 132 | target_communities = arguments[0] 133 | AFDB90_CC = arguments[1] 134 | taxonomy = arguments[2] 135 | outlier_data = arguments[3] 136 | thread_id = arguments[4] 137 | 138 | curr_data = AFDB90_CC.loc[AFDB90_CC.communityID.isin(target_communities)] 139 | curr_tax = taxonomy.loc[taxonomy.communityIDs.isin(target_communities)] 140 | 141 | start = time.time() 142 | 143 | communities_summary = {'Community': [], 'Subgraph': [], 'Avg_darkness': [], 'SD_darkness': [], 144 | 'Avg_outlier_score': [], 'SD_outlier_score':[], 'N_members': [], 'TM': [], 145 | 'SP': [], 'Median_length': [], 'MAD_length': [], 'Median_darkness': [], 'MAD_darkness': [], 146 | 'Median_representative': [], 'Longest_representative': [], 'Median_rep_title': [], 147 | 'Mode_superkingdom': [], 'Freq_superkingdom': []} 148 | 149 | n_expected = len(target_communities) 150 | for i, community_class in enumerate(target_communities): 151 | 152 | subgr_members = 
curr_data.loc[curr_data.communityID == community_class] 153 | 154 | communities_summary['Subgraph'].append(community_class.split('[')[0]) 155 | communities_summary['Community'].append(community_class) 156 | 157 | communities_summary['Avg_darkness'].append(np.mean(subgr_members.FULL_noDUF.astype(float))) 158 | communities_summary['SD_darkness'].append(np.std(subgr_members.FULL_noDUF.astype(float))) 159 | 160 | communities_summary['Median_darkness'].append(np.median(subgr_members.FULL_noDUF.astype(float))) 161 | communities_summary['MAD_darkness'].append(stats.median_abs_deviation(subgr_members.FULL_noDUF.astype(float))) 162 | 163 | outlier_scores = [outlier_data[i] for i in subgr_members.AF2_longest_best70 if i in outlier_data] 164 | if len(outlier_scores) > 0: 165 | communities_summary['Avg_outlier_score'].append(np.mean(outlier_scores)) 166 | communities_summary['SD_outlier_score'].append(np.std(outlier_scores)) 167 | else: 168 | communities_summary['Avg_outlier_score'].append(np.nan) 169 | communities_summary['SD_outlier_score'].append(np.nan) 170 | 171 | communities_summary['N_members'].append(len(subgr_members)) 172 | 173 | communities_summary['TM'].append((len(subgr_members)-list(subgr_members.TM).count(0))*100/len(subgr_members)) 174 | communities_summary['SP'].append((len(subgr_members)-list(subgr_members.SP).count(0))*100/len(subgr_members)) 175 | 176 | median = np.median(subgr_members.AF2_longest_best70_len.astype(float)) 177 | communities_summary['Median_length'].append(median) 178 | communities_summary['MAD_length'].append(stats.median_abs_deviation(subgr_members.AF2_longest_best70_len.astype(float))) 179 | 180 | subgr_members['dist_to_median'] = abs(subgr_members.AF2_REP_best_len - median) 181 | 182 | median_rep = subgr_members.sort_values(by='dist_to_median', ascending=True).AF2_longest_best70[0] 183 | communities_summary['Median_representative'].append(median_rep) 184 | 185 | longest_rep = subgr_members.sort_values(by='AF2_REP_best_len', ascending=False).AF2_longest_best70[0] 186 | communities_summary['Longest_representative'].append(longest_rep) 187 | 188 | median_title = uniprot_db.query(median_rep)[0]['data']['NAME']['TITLE'] 189 | communities_summary['Median_rep_title'].append(median_title) 190 | 191 | try: 192 | subgrp_tax = curr_tax.loc[curr_tax.communityIDs == community_class] 193 | tax = subgrp_tax.superkingdom.mode()[0] 194 | frq = subgrp_tax['superkingdom'].value_counts()[tax]*100/len(subgrp_tax) 195 | except: 196 | tax, frq = get_tax_from_menzi(subgr_members) 197 | 198 | communities_summary['Mode_superkingdom'].append(tax) 199 | communities_summary['Freq_superkingdom'].append(frq) 200 | 201 | if i % 100 == 0: 202 | numb_seconds = time.time() - start 203 | time_to_end = round(((numb_seconds/(i+1))*n_expected)-numb_seconds) 204 | print('thread {}:'.format(thread_id), i+1, n_expected, 'CURR COMMUNITY:', community_class, 'CURR TITLE', median_title, ' ... 
Time passed: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))), flush = True) 205 | 206 | return communities_summary 207 | 208 | 209 | def get_comunities_summary(AFDB90_CC, outlier_data, taxonomy, threads): 210 | 211 | target_communities = list(set(AFDB90_CC.communityID)) 212 | separated_jobs = chunk_list(target_communities, threads) 213 | 214 | list_arguments = [i for i in zip(separated_jobs, [AFDB90_CC for job in separated_jobs], [taxonomy for job in separated_jobs], [outlier_data for job in separated_jobs], range(threads))] 215 | 216 | pool = ThreadPool(threads) 217 | results = pool.imap_unordered(get_comunities_summary_for_communities, list_arguments) 218 | 219 | all_results = {} 220 | for dic in results: 221 | for key in dic: 222 | if key not in all_results: 223 | all_results[key] = dic[key] 224 | else: 225 | all_results[key] += dic[key] 226 | 227 | all_results = pd.DataFrame(all_results) 228 | all_results = all_results.set_index('Community') 229 | all_results = all_results.sort_values(by='N_members', ascending=False) 230 | 231 | return all_results 232 | 233 | 234 | # GET COMMUNITIES SUMMARY 235 | 236 | print('3. Getting communities summary') 237 | 238 | if not os.path.isfile('{}/communities_summary_noreps.csv'.format(infolder)): 239 | communities_summary = get_comunities_summary(AFDB90_CC, outliers_data, taxonomy, threads=threads) 240 | communities_summary.to_csv('{}/communities_summary_noreps.csv'.format(infolder)) 241 | 242 | else: 243 | communities_summary = pd.read_csv('{}/communities_summary_noreps.csv'.format(infolder), dtype = {'Community': str}) 244 | communities_summary = communities_summary.set_index("Community") 245 | 246 | communities_summary.to_csv('{}/communities_summary.csv'.format(infolder)) -------------------------------------------------------------------------------- /scripts/get_connected_components.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import networkx as nx 5 | import numpy as np 6 | import time 7 | import gzip 8 | import io 9 | 10 | from networkx.algorithms.community import asyn_lpa_communities 11 | 12 | # LOAD INPUTS 13 | 14 | infasta = sys.argv[1] 15 | inmmsqs = sys.argv[2] 16 | outfolder = sys.argv[3] 17 | 18 | if not os.path.isdir(outfolder): 19 | os.mkdir(outfolder) 20 | 21 | # helping routines 22 | 23 | def get_seqs_index(infasta, indx = {}): 24 | 25 | print('\nReading sequences from input fasta and generating node index') 26 | 27 | start = time.time() 28 | 29 | count = len(indx) 30 | 31 | if infasta.endswith('.gz'): 32 | with gzip.open(infasta, 'rb') as inf: 33 | with io.TextIOWrapper(inf, encoding='utf-8') as decoder: 34 | for line in decoder: 35 | if line.startswith('>'): 36 | if '|' in line: 37 | line = line.split('|')[1].strip('>') 38 | else: 39 | line = line.split()[0].strip('>') 40 | indx[line] = count 41 | count+=1 42 | 43 | else: 44 | with open(infasta, 'r') as inf: 45 | for line in inf: 46 | if line.startswith('>'): 47 | if '|' in line: 48 | line = line.split('|')[1].strip('>') 49 | else: 50 | line = line.split()[0].strip('>') 51 | indx[line] = count 52 | count+=1 53 | 54 
| print(' ... No. of expected nodes:', len(indx)) 55 | 56 | numb_seconds = time.time() - start 57 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 58 | 59 | return indx 60 | 61 | 62 | def get_neighbors_simple(inmmsqs, indexes, mineval = 1e-4, mincov = 0, simplex = True, nmax=None): 63 | 64 | print('\nCollecting edges from input mmseqs file') 65 | 66 | start = time.time() 67 | 68 | edges = set() 69 | nodes = set() 70 | 71 | if not simplex: 72 | edges = {} 73 | 74 | mincov = mincov/100 75 | previous_len = None 76 | with open(inmmsqs, 'r') as inmm: 77 | for line in inmm: 78 | line = line.split('\t') 79 | i, j, cov, evalue = line[0], line[1], line[2], line[10] 80 | 81 | if i in indexes and j in indexes: 82 | evalue = float(evalue.strip()) 83 | cov = float(cov) 84 | 85 | if i != j and evalue <= mineval and cov >= mincov: 86 | if simplex: 87 | # edges.add(tuple(sorted([indexes[i], indexes[j]]))) 88 | edges.add(tuple(sorted([i, j]))) 89 | else: 90 | # edge = sorted([indexes[i], indexes[j]]) 91 | edge = sorted([i, j]) 92 | if edge[0] in edges: 93 | if edge[1] in edges[edge[0]]: 94 | if edges[edge[0]][edge[1]][0] > evalue: 95 | edges[edge[0]][edge[1]] = (evalue, cov) 96 | else: 97 | edges[edge[0]][edge[1]] = (evalue, cov) 98 | else: 99 | edges[edge[0]] = {edge[1]: (evalue, cov)} 100 | 101 | nodes.add(i) 102 | nodes.add(j) 103 | 104 | if len(nodes)> 0 and len(nodes) % 100000 == 0 and len(nodes) != previous_len: 105 | print(len(nodes)) 106 | previous_len = len(nodes) 107 | 108 | if nmax is not None and len(nodes) == nmax: 109 | break 110 | 111 | print(' ... Total number of hubs:', len(edges)) 112 | print(' ... Total number of nodes:', len(nodes)) 113 | 114 | numb_seconds = time.time() - start 115 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 116 | 117 | return edges, nodes 118 | 119 | def generate_pairs(neigbhrs, topN = None, min_weight = 0): 120 | 121 | print('\nRemoving redundant edges') 122 | 123 | start = time.time() 124 | 125 | edges = dict() 126 | weights = list() 127 | 128 | for i in neigbhrs: 129 | if topN is not None: 130 | curr_neighbrs = {k: v for k, v in sorted(neigbhrs[i].items(), key=lambda item: item[1])[:topN]} 131 | else: 132 | curr_neighbrs = neigbhrs[i] 133 | 134 | for j in curr_neighbrs: 135 | i_index, j_index = sorted([i, j]) 136 | evalue = curr_neighbrs[j][0] 137 | cov = curr_neighbrs[j][1] 138 | 139 | edge = (i_index, j_index) 140 | edges[edge] = {'evalue': evalue, 'cov': cov*100} 141 | # edges.add(edge) 142 | # weights.append(-np.log10(evalue)*cov) 143 | 144 | print(' ... Total number of edges:', len(edges)) 145 | 146 | # # normalise weights 147 | # max_weigth = max(weights) 148 | # min_weigth = int(min(weights)) 149 | # normalised_weigths = [(weight-min_weight)/(max_weigth-min_weight) for weight in weights] 150 | 151 | # for i, edge in enumerate(edges): 152 | # edges[edge]['weight'] = normalised_weigths[i] 153 | 154 | numb_seconds = time.time() - start 155 | print(' ... 
Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 156 | 157 | return edges 158 | 159 | def build_graph(edges, indexes, outgraph = None, outfolder = outfolder, map_properties = False, properties=['Darkness']): 160 | 161 | print('\nBuilding the graph') 162 | 163 | start = time.time() 164 | 165 | G=nx.Graph() 166 | G.add_nodes_from(list(indexes.keys())) 167 | G.add_edges_from(list(edges.keys())) 168 | 169 | nx.set_edge_attributes(G, edges) 170 | 171 | if map_properties: 172 | properties = get_nodes_properties(indexes.keys(), properties) 173 | nx.set_node_attributes(G, properties) 174 | 175 | if outgraph is None: 176 | nx.write_gml(G, "{}/full_graph.gml".format(outfolder)) 177 | else: 178 | nx.write_gml(G, outgraph) 179 | 180 | numb_seconds = time.time() - start 181 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 182 | 183 | return G 184 | 185 | 186 | def collect_connected_components(G, nodes, min_size = 0, outfolder=outfolder, outgraph=None): 187 | 188 | print('\nCollecting individual subgraphs/connected components') 189 | 190 | sec_outfolder = '{}/subgraphs'.format(outfolder) 191 | if not os.path.isdir(sec_outfolder): 192 | os.mkdir(sec_outfolder) 193 | 194 | start = time.time() 195 | 196 | # Get connected components 197 | components = sorted(nx.connected_components(G), key=len, reverse=True) 198 | 199 | print(' ... Found {} subgraphs'.format(nx.number_connected_components(G))) 200 | print(' ... ... The largest has {} nodes'.format(len(components[0]))) 201 | 202 | count = 0 203 | edge_count = 0 204 | node_count = 0 205 | 206 | node_cluster_class = {'node': [], 'subgraphID': [], 'communityID': []} 207 | 208 | for component_index, c in enumerate(components): 209 | 210 | curr_size = len(c) 211 | 212 | if curr_size >= min_size: 213 | 214 | component = G.subgraph(c).copy() 215 | 216 | curr_outf = '{}/subgraph_{:06d}.gml'.format(sec_outfolder, component_index) 217 | nx.write_gml(component, curr_outf) 218 | 219 | # Get communities by label propagation 220 | communities = list(asyn_lpa_communities(component, weight=None)) 221 | 222 | for community_index, community in enumerate(communities): 223 | node_cluster_class['node'] += community 224 | node_cluster_class['communityID'] += ['{}[{}]'.format(component_index, community_index) for node in community] 225 | node_cluster_class['subgraphID'] += [component_index for node in community] 226 | 227 | print(' ... ... Subgraph:', component_index, 'No. nodes:', curr_size, 'No. communities:', len(communities)) 228 | print(' ... ... Subgraph:', component_index, 'No. nodes:', curr_size) 229 | 230 | 231 | json.dump(node_cluster_class, open('{}/node_class.json'.format(outfolder), 'w')) 232 | 233 | # if outgraph is None: 234 | # nx.write_gml(G, "{}/full_graph.gml".format(outfolder)) 235 | # else: 236 | # nx.write_gml(G, outgraph) 237 | 238 | print(' ... Wrote {} subgraphs, totalling {} nodes and {} edges'.format(count, node_count, edge_count)) 239 | 240 | numb_seconds = time.time() - start 241 | print(' ... 
Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 242 | 243 | 244 | # MAIN CODE 245 | 246 | outgraph = "{}/full_graph.gml".format(outfolder) 247 | 248 | indexes = get_seqs_index(infasta) 249 | 250 | if not os.path.isfile(outgraph): 251 | hubs, nodes = get_neighbors_simple(inmmsqs, indexes, mineval = 1e-4, mincov = 50, simplex = False, nmax=None) 252 | edges = generate_pairs(hubs, topN = 4) 253 | 254 | graph = build_graph(edges, indexes, outgraph=outgraph) 255 | 256 | else: 257 | print('Graph already produced. Will just load it') 258 | 259 | start = time.time() 260 | graph = nx.read_gml(outgraph) 261 | 262 | numb_seconds = time.time() - start 263 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 264 | 265 | 266 | collect_connected_components(graph, nodes=list(indexes.keys()), min_size = 2) 267 | -------------------------------------------------------------------------------- /scripts/get_uniprot_taxonomy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pandas as pd 4 | import json 5 | import itertools 6 | import networkx as nx 7 | import numpy as np 8 | 9 | import scipy 10 | from scipy import stats 11 | 12 | from ete3 import NCBITaxa 13 | 14 | from multiprocessing.pool import ThreadPool 15 | 16 | import sys 17 | 18 | import warnings 19 | warnings.filterwarnings("ignore") 20 | 21 | # LOAD MY DBs 22 | 23 | path_to_dbuilder = '' # path to dbuilder (must be set before running) 24 | sys.path.append(path_to_dbuilder) 25 | 26 | from src import extract_uniref as uniref 27 | from src import extract_uniprot as uniprot 28 | from src import extract_uniparc as uniparc 29 | from src import extract_alphafold as alphafold 30 | 31 | target_uniref = 'UniRef50' 32 | 33 | MONGO_HOST = "10.1.0.202" 34 | MONGO_PORT = 30077 35 | 36 | uniref_db = uniref.uniref_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT, name = target_uniref) 37 | uniref_db.index_db() 38 | 39 | uniprot_db = uniprot.uniprot_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 40 | uniprot_db.index_db() 41 | 42 | uniparc_db = uniparc.uniparc_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 43 | uniparc_db.index_db() 44 | 45 | alphafold_db = alphafold.alphafold_extractor(mongo_host = MONGO_HOST, mongo_port = MONGO_PORT) 46 | alphafold_db.index_db() 47 | 48 | 49 | # GET INPUTS 50 | AFDB_data = sys.argv[1] 51 | threads = int(sys.argv[2]) 52 | jobid = sys.argv[3] 53 | infolder = AFDB_data.split('/')[-2] 54 | 55 | label = AFDB_data.split('/')[-1].split('.')[0] 56 | 57 | # LOAD INPUTS 58 | 59 | print('1. 
Loading AFDB data') 60 | AFDB90_CC = pd.read_csv(AFDB_data, dtype = {'communityID': str}) 61 | AFDB90_CC = AFDB90_CC.sort_values(by='unirefID') 62 | AFDB90_CC = AFDB90_CC.set_index("unirefID") 63 | 64 | # ROUTINES 65 | 66 | def chunk_list(l, n, counts=None): 67 | 68 | print(len(l)) 69 | 70 | print('Making chuncks') 71 | 72 | if counts is not None: 73 | b = [[l[i]]*counts[i] for i in range(len(l))] 74 | b = list(itertools.chain.from_iterable(b)) 75 | else: 76 | b = l 77 | 78 | chunks = np.array_split(np.array(b), n) 79 | # chunks = [list(chunk) for chunk in chunks] 80 | 81 | final_chunks = [] 82 | for i, chunk in enumerate(chunks): 83 | chunk = set(chunk) 84 | 85 | if i > 0: 86 | last_chunk = set(final_chunks[-1]) 87 | if len(chunk.intersection(last_chunk)) > 0: 88 | chunk = chunk - last_chunk 89 | 90 | chunk = list(chunk) 91 | final_chunks.append(chunk) 92 | print(len(chunk), chunk[0]) 93 | 94 | sumlen = sum([len(i) for i in final_chunks]) 95 | print(' ... Made {} chuncks ({} jobs in total)'.format(len(final_chunks), sumlen), len(l)) 96 | 97 | return final_chunks 98 | 99 | def get_taxonomy_for_unirefs(arguments,max_chunck_size = 10000): 100 | 101 | NCBI = NCBITaxa() 102 | 103 | target_unirefs = arguments[0] 104 | AFDB90_CC = arguments[1] 105 | outfolder = arguments[2] 106 | thread_id = arguments[3] 107 | jobid = arguments[4] 108 | 109 | curr_data = AFDB90_CC.loc[AFDB90_CC.index.isin(target_unirefs)] 110 | 111 | target_ranks = ['superkingdom','phylum','class','order','genus','species'] 112 | 113 | out_json = '../{}/{}_taxonomy.json'.format(outfolder, thread_id) 114 | out_summary = '../{}/{}_taxonomy_summary.json'.format(outfolder, thread_id) 115 | 116 | try: 117 | taxonomy = json.load(open(out_json, 'r')) 118 | target_unirefs = list(set(target_unirefs)-set(taxonomy['UniRef50IDs'])) 119 | except: 120 | taxonomy = {rank: [] for rank in target_ranks} 121 | taxonomy['uniprotIDs'] = [] 122 | taxonomy['UniRef50IDs'] = [] 123 | taxonomy['communityIDs'] = [] 124 | # taxonomy['pLDDT'] = [] 125 | 126 | count = 0 127 | n_expected = len(target_unirefs) 128 | 129 | start = time.time() 130 | 131 | for unirefID in target_unirefs: 132 | 133 | row = curr_data.loc[unirefID] 134 | curr_community = row.communityID 135 | # curr_accs = uniref_db.query(unirefID)[0]['data']['UNIREF']['UniRef100'] 136 | curr_accs = uniref_db.query(unirefID)[0]['data']['ACC'] 137 | curr_accs = [i.split('_')[-1] for i in curr_accs] 138 | 139 | if len(curr_accs) > max_chunck_size: 140 | ratio = len(curr_accs)/max_chunck_size 141 | if ratio - round(ratio) > 0: 142 | n_chunks = round(ratio) + 1 143 | else: 144 | n_chunks = round(ratio) 145 | chuncks = chunk_list(curr_accs, n_chunks, counts=None) 146 | else: 147 | chuncks = [curr_accs] 148 | 149 | for chunck in chuncks: 150 | up_docs = uniprot_db.col.find({'_id': {'$in': chunck}}) 151 | for doc in up_docs: 152 | acc = doc['_id'] 153 | taxid = doc['data']['TAXID'][0] 154 | 155 | curr_tax = {rank: np.nan for rank in target_ranks} 156 | try: 157 | lineage = NCBI.get_lineage(taxid) 158 | translation = NCBI.get_taxid_translator(lineage) 159 | ranks = NCBI.get_rank(lineage) 160 | 161 | for level in lineage: 162 | if ranks[level] in target_ranks: 163 | curr_tax[ranks[level]] = translation[level] 164 | except: 165 | pass 166 | 167 | for rank in curr_tax: 168 | taxonomy[rank].append(curr_tax[rank]) 169 | 170 | taxonomy['uniprotIDs'].append(acc) 171 | taxonomy['UniRef50IDs'].append(unirefID) 172 | taxonomy['communityIDs'].append(curr_community) 173 | 174 | uparc_docs = uniparc_db.col.find({'_id': 
{'$in': curr_accs}}) 175 | for doc in uparc_docs: 176 | acc = doc['_id'] 177 | taxid = doc['data']['TAXID'][0] 178 | 179 | curr_tax = {rank: np.nan for rank in target_ranks} 180 | try: 181 | lineage = NCBI.get_lineage(taxid) 182 | translation = NCBI.get_taxid_translator(lineage) 183 | ranks = NCBI.get_rank(lineage) 184 | 185 | for level in lineage: 186 | if ranks[level] in target_ranks: 187 | curr_tax[ranks[level]] = translation[level] 188 | except: 189 | pass 190 | 191 | for rank in curr_tax: 192 | taxonomy[rank].append(curr_tax[rank]) 193 | 194 | taxonomy['uniprotIDs'].append(acc) 195 | taxonomy['UniRef50IDs'].append(unirefID) 196 | taxonomy['communityIDs'].append(curr_community) 197 | 198 | if count % 100 == 0: 199 | numb_seconds = time.time() - start 200 | time_to_end = round(((numb_seconds/(count+1))*n_expected)-numb_seconds) 201 | print('thread {}:'.format(thread_id), count+1, n_expected, ' ... Time passed: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))), flush = True) 202 | 203 | if count % 1000 == 0: 204 | json.dump(taxonomy, open(out_json, 'w'), indent=4) 205 | 206 | count+=1 207 | 208 | json.dump(taxonomy, open(out_json, 'w'), indent=4) 209 | 210 | return taxonomy 211 | 212 | # GET TAXONOMY 213 | 214 | separated_jobs = chunk_list(list(AFDB90_CC.index), threads, counts=list(AFDB90_CC.nUniRef100)) 215 | 216 | list_arguments = [i for i in zip(separated_jobs, [AFDB90_CC for job in separated_jobs], [infolder for job in separated_jobs], range(threads), [jobid for job in separated_jobs])] 217 | 218 | pool = ThreadPool(threads) 219 | results = pool.imap_unordered(get_taxonomy_for_unirefs, list_arguments) 220 | 221 | all_results = {} 222 | for dic in results: 223 | for key in dic: 224 | if key not in all_results: 225 | all_results[key] = dic[key] 226 | else: 227 | all_results[key] += dic[key] 228 | 229 | taxonomy = pd.DataFrame(all_results) 230 | taxonomy = taxonomy.set_index('uniprotIDs') 231 | 232 | taxonomy.to_csv('../{}/{}_uniprot_community_taxonomy_map.csv'.format(infolder, label)) 233 | 234 | -------------------------------------------------------------------------------- /scripts/make_communities_map.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import networkx as nx 5 | import numpy as np 6 | import pandas as pd 7 | import time 8 | 9 | infolder = sys.argv[1] 10 | 11 | ingraph = '{}/full_graph.gml'.format(infolder) 12 | node_data = '{}/node_class.json'.format(infolder) 13 | 14 | def get_node_index(data, target_level = 'communityID', min_size = None): 15 | 16 | node_index = {} 17 | 18 | for i, node in enumerate(data['node']): 19 | node_index[node] = data[target_level][i] 20 | 21 | return node_index 22 | 23 | def colapse_graph(graph, node_index): 24 | 25 | new_nodes = set(node_index.values()) 26 | new_edges = set() 27 | 28 | n_expected = len(graph.edges) 29 | 30 | start = time.time() 31 | 32 | count = 0 33 | for edge in graph.edges: 34 | new_edge = [node_index[edge[0]], node_index[edge[1]]] 35 | if len(set(new_edge)) > 1: 36 | new_edge = tuple(sorted(new_edge)) 37 | new_edges.add(new_edge) 38 | 39 | count+=1 40 | 41 
| if count % 1000000 == 0: 42 | numb_seconds = time.time() - start 43 | time_to_end = round(((numb_seconds/count)*n_expected)-numb_seconds) 44 | print(count, n_expected, ' ... Time passed: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds))), 'Expected to finish in: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(time_to_end)))-1, int(time.strftime('%d', time.gmtime(time_to_end)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(time_to_end))), flush = True) 45 | 46 | print(' ... Defined {} edges connecting {} nodes'.format(len(new_edges), len(new_nodes))) 47 | 48 | return new_edges 49 | 50 | def write_edges_list(edges, infolder): 51 | 52 | print('Writing edges file for cosmograph') 53 | 54 | start = time.time() 55 | 56 | outfile = '{}/communities_edge_list.csv'.format(infolder) 57 | 58 | with open(outfile, 'w') as outp: 59 | outp.write('innode,outnode\n') 60 | for edge in edges: 61 | if edge[0] != edge[1]: 62 | outp.write('{},{}\n'.format(edge[0], edge[1])) 63 | 64 | numb_seconds = time.time() - start 65 | print(' ... Took me: {} months {} days {}'.format(int(time.strftime('%m', time.gmtime(numb_seconds)))-1, int(time.strftime('%d', time.gmtime(numb_seconds)))-1, time.strftime('%H hours %M min %S sec', time.gmtime(numb_seconds)))) 66 | 67 | 68 | # load edges attributes 69 | data = json.load(open(node_data, 'r')) 70 | 71 | # load graph 72 | graph = nx.read_gml(ingraph) 73 | 74 | # get node index 75 | node_index = get_node_index(data) 76 | 77 | # colapse nodes 78 | new_edges = colapse_graph(graph, node_index) 79 | 80 | # write edges file for cosmograph 81 | write_edges_list(new_edges, infolder) -------------------------------------------------------------------------------- /scripts/make_shapemers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import io 4 | import itertools 5 | 6 | import prody as pd 7 | import tarfile 8 | import torch 9 | from scipy import ndimage 10 | from time import time 11 | 12 | from geometricus import MultipleMomentInvariants, ShapemerLearn 13 | 14 | from pathlib import Path 15 | import json 16 | import numba as nb 17 | import numpy as np 18 | from tqdm import tqdm 19 | 20 | import proteinnet_parser 21 | 22 | 23 | def parse_pae_file(pae_json_data): 24 | if type(pae_json_data) == str or type(pae_json_data) == Path: 25 | with open(pae_json_data, "rt") as f: 26 | data = json.load(f)[0] 27 | else: 28 | data = json.load(pae_json_data)[0] 29 | 30 | if 'residue1' in data and 'distance' in data: 31 | # Legacy PAE format, keep for backwards compatibility. 32 | r1, d = data['residue1'], data['distance'] 33 | size = max(r1) 34 | matrix = np.empty((size, size), dtype=np.float64) 35 | matrix.ravel()[:] = d 36 | elif 'predicted_aligned_error' in data: 37 | # New PAE format. 
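# (Illustrative note on the two PAE layouts handled here; values are made up, not from a real file.)
# Legacy files flatten the matrix into parallel 'residue1'/'distance' lists, as parsed in the branch above,
# whereas v3/v4 files store the full n_res x n_res matrix directly, roughly:
# [{"predicted_aligned_error": [[0.2, 3.1, ...], [2.8, 0.3, ...], ...]}]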
38 | matrix = np.array(data['predicted_aligned_error'], dtype=np.float64) 39 | else: 40 | raise ValueError('Invalid PAE JSON format.') 41 | 42 | return matrix 43 | 44 | 45 | @nb.njit 46 | def get_plddt_matrix(plddt): 47 | size = len(plddt) 48 | matrix = np.empty((size, size), dtype=np.float64) 49 | for i in range(size): 50 | matrix[i, i] = plddt[i] 51 | for j in range(i + 1, size): 52 | matrix[i, j] = matrix[j, i] = (plddt[i] + plddt[j]) 53 | return 100 - matrix / 2 54 | 55 | 56 | def get_domains_networkx(pae_matrix, plddt_matrix, cutoff=20, graph_resolution=0.5): 57 | """ 58 | Adapted from https://github.com/tristanic/pae_to_domains 59 | 60 | Takes a predicted aligned error (PAE) matrix representing the predicted error in distances between each 61 | pair of residues in a model, and uses a graph-based community clustering algorithm to partition the model 62 | into approximately rigid groups. 63 | 64 | Arguments: 65 | 66 | * pae_matrix: a (n_residues x n_residues) numpy array. Diagonal elements should be set to some non-zero value 67 | to avoid divide-by-zero warnings 68 | * plddt_matrix: a (n_residues x n_residues) numpy array containing average pairwise PLDDT values 69 | * cutoff (optional, default=20): graph edges will only be created for residue pairs with pae+avg.PLDDT < 70 | cutoff 71 | * graph_resolution (optional, default=0.5): regulates how aggressively the clustering algorithm is. 72 | Smaller values lead to larger clusters. Value should be larger than zero, and values larger than 5 are 73 | unlikely to be useful. 74 | 75 | Returns: a series of lists, where each list contains the indices of residues belonging to one cluster. 76 | """ 77 | try: 78 | import networkx as nx 79 | from networkx.algorithms import community 80 | except ImportError: 81 | print( 82 | 'ERROR: This method requires NetworkX (>=2.6.2) to be installed. Please install it using "pip install ' 83 | 'networkx" in a Python >=3.7 environment and try again.') 84 | import sys 85 | sys.exit() 86 | matrix = pae_matrix + plddt_matrix 87 | weights = 1 / matrix 88 | g = nx.Graph() 89 | size = weights.shape[0] 90 | g.add_nodes_from(range(size)) 91 | edges = np.argwhere(matrix < cutoff) 92 | sel_weights = weights[edges.T[0], edges.T[1]] 93 | wedges = [(i, j, w) for (i, j), w in zip(edges, sel_weights)] 94 | g.add_weighted_edges_from(wedges) 95 | clusters = community.greedy_modularity_communities(g, weight='weight', resolution=graph_resolution) 96 | return clusters 97 | 98 | 99 | def get_domains_igraph(pae_matrix, plddt_matrix, cutoff=20, graph_resolution=0.5): 100 | """ 101 | Adapted from https://github.com/tristanic/pae_to_domains 102 | 103 | Takes a predicted aligned error (PAE) matrix representing the predicted error in distances between each 104 | pair of residues in a model, and uses a graph-based community clustering algorithm to partition the model 105 | into approximately rigid groups. 106 | 107 | Arguments: 108 | 109 | * pae_matrix: a (n_residues x n_residues) numpy array. 
Diagonal elements should be set to some non-zero 110 | value to avoid divide-by-zero warnings 111 | * plddt_matrix: a (n_residues x n_residues) numpy array containing average pairwise PLDDT values 112 | * cutoff (optional, default=20): graph edges will only be created for residue pairs with pae+avg.PLDDT < 113 | cutoff 114 | * graph_resolution (optional, default=0.5): regulates how aggressively the clustering algorithm is. 115 | Smaller values lead to larger clusters. Value should be larger than zero, and values larger than 5 are 116 | unlikely to be useful. 117 | 118 | Returns: a series of lists, where each list contains the indices of residues belonging to one cluster. 119 | """ 120 | try: 121 | import igraph 122 | except ImportError: 123 | print('ERROR: This method requires python-igraph to be installed. Please install it using "pip install python-igraph" in a Python >=3.6 environment and try again.') 124 | import sys 125 | sys.exit() 126 | matrix = pae_matrix + plddt_matrix 127 | weights = 1 / matrix 128 | g = igraph.Graph() 129 | size = weights.shape[0] 130 | g.add_vertices(range(size)) 131 | edges = np.argwhere(pae_matrix < cutoff) 132 | sel_weights = weights[edges.T[0], edges.T[1]] 133 | g.add_edges(edges) 134 | g.es['weight'] = sel_weights 135 | 136 | vc = g.community_leiden(weights='weight', resolution_parameter=graph_resolution / 100, n_iterations=-1) 137 | membership = np.array(vc.membership) 138 | from collections import defaultdict 139 | clusters = defaultdict(list) 140 | for i, c in enumerate(membership): 141 | clusters[c].append(i) 142 | return clusters.values() 143 | 144 | 145 | def clusters_to_domains(protein, clusters, min_length=20, avg_plddt_cutoff=70): 146 | chain = "A" 147 | for cl in clusters: 148 | start, stop = min(cl), max(cl) 149 | if (stop - start) >= min_length: 150 | domain = protein.select(f"resnum {start}:{stop}") 151 | if domain.getBetas().mean() >= avg_plddt_cutoff: 152 | for res in domain: 153 | res.setChid(chain) 154 | yield domain 155 | chain = chr(ord(chain) + 1) 156 | 157 | 158 | def split_alphafold_protein(prody_protein, pae_file=None, plddt_threshold=70, sigma=5): 159 | """ 160 | Splits an AlphaFold protein into fragments based on a Gaussian-smoothed version of the PLDDT score. 161 | Parameters 162 | ---------- 163 | prody_protein 164 | ProDy protein object of calpha atoms 165 | pae_file 166 | Path or file object of the predicted aligned error (PAE) JSON, parsed with parse_pae_file 167 | plddt_threshold 168 | Fragments will be split according to residues with a (smoothed) PLDDT score below this threshold. 169 | sigma 170 | Sigma for the smoothing of the PLDDT score. 171 | 172 | Returns 173 | ------- 174 | (start, end) indices for each split 175 | """ 176 | if pae_file is not None: 177 | pae_matrix = parse_pae_file(pae_file) 178 | beta_list = prody_protein.getBetas() 179 | plddt_matrix = get_plddt_matrix(beta_list) 180 | clusters = get_domains_igraph(pae_matrix, plddt_matrix) 181 | beta_list = ndimage.gaussian_filter1d(beta_list, sigma=sigma) 182 | all_slices = [] 183 | for cl in clusters: 184 | chain_start, chain_stop = min(cl), max(cl) 185 | length = chain_stop - chain_start 186 | if length < 20: 187 | continue 188 | indices = np.ones(length, dtype=int) 189 | indices[np.where(beta_list[chain_start:chain_stop] < plddt_threshold)] = 0 190 | slices = ndimage.find_objects(ndimage.label(indices)[0]) 191 | slices = [(s[0].start, s[0].stop) for s in slices] 192 | all_slices += [(chain_start + start, chain_start + stop) for start, stop in slices] 193 | else: 194 | beta_list = ndimage.gaussian_filter1d(prody_protein.getBetas(), sigma=sigma) 195 | indices = np.ones(beta_list.shape[0], dtype=int) 196 | indices[np.where(beta_list < plddt_threshold)] = 0 197 | slices = ndimage.find_objects(ndimage.label(indices)[0]) 198 | all_slices = [(s[0].start, s[0].stop) for s in slices] 199 | return all_slices 200 | 201 | 202 | def split_pdb_protein(prody_protein): 203 | """ 204 | Splits a protein into fragments based on chain. 205 | Parameters 206 | ---------- 207 | prody_protein 208 | ProDy protein object. 
209 | 210 | Returns 211 | ------- 212 | (start, end, chid) indices for each split 213 | """ 214 | slices = [] 215 | chains = set(a.getChid() for a in prody_protein) 216 | if len(chains): 217 | for chain in chains: 218 | if not len(chain.strip()): 219 | chain = prody_protein 220 | else: 221 | chain = prody_protein.select(f"chain {chain}") 222 | slices.append((chain[0].getResindex(), chain[-1].getResindex() + 1, chain[0].getChid())) 223 | else: 224 | slices.append((prody_protein[0].getResindex(), prody_protein[-1].getResindex(), '')) 225 | return sorted(slices) 226 | 227 | 228 | def get_shapemers(calpha_protein, 229 | model, 230 | is_af=False, 231 | pae_file_data=None, 232 | length_threshold=20, 233 | plddt_threshold=70, 234 | sigma=5): 235 | """ 236 | Retrieves the moments of the protein. 237 | Parameters 238 | ---------- 239 | calpha_protein 240 | prody object 241 | model 242 | ShapemerLearn model 243 | is_af 244 | Whether the protein is an AlphaFold protein 245 | pae_file_data 246 | pae file as extrected gzip or as filename 247 | length_threshold 248 | Proteins with fewer (filtered) residues than this threshold will be ignored. 249 | plddt_threshold 250 | Residues with a (smoothed) PLDDT score below this threshold will be ignored. 251 | sigma 252 | Sigma for the smoothing of the PLDDT score. 253 | 254 | Returns 255 | ------- 256 | """ 257 | if is_af: 258 | residue_slices = split_alphafold_protein(calpha_protein, pae_file_data, plddt_threshold, sigma) 259 | else: 260 | residue_slices = split_pdb_protein(calpha_protein) 261 | coords = calpha_protein.getCoords() 262 | return get_shapemers_from_coords(coords, model, length_threshold=length_threshold, residue_slices=residue_slices) 263 | 264 | 265 | def get_shapemers_from_coords(coords, model, length_threshold=20, residue_slices=None): 266 | shapemers = [] 267 | indices = [len(coords)] 268 | if residue_slices is None: 269 | residue_slices = [(0, len(coords))] 270 | try: 271 | for x in residue_slices: 272 | start_index, end_index, *_ = x 273 | if end_index - start_index > length_threshold: 274 | indices += list(range(start_index, end_index)) 275 | shapemers += MultipleMomentInvariants.from_coordinates("name", 276 | coords[ 277 | start_index:end_index]).get_shapemers_model( 278 | model) 279 | if len(shapemers): 280 | assert len(shapemers) == len(indices) - 1 281 | return indices, shapemers 282 | except Exception as e: 283 | print(f"Error {e}") 284 | return [], [] 285 | 286 | 287 | def make_corpus_proteome(taxid, db_folder, output_folder): 288 | model = ShapemerLearn.load() 289 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension))) 290 | key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys)))) 291 | start = time() 292 | index = 0 293 | f_s = open(output_folder / f"{taxid}_shapemers.txt", "w") 294 | f_i = open(output_folder / f"{taxid}_indices.txt", "w") 295 | for f in db_folder.glob(f"proteome-tax_id-{taxid}-*_v4.tar"): 296 | with tarfile.open(f) as tar: 297 | for fh in tar.getmembers(): 298 | if '.cif' in fh.name: 299 | if index % 1000 == 0: 300 | print(f"{index} proteins processed in {time() - start} seconds") 301 | uniprot_ac = '-'.join(fh.name.split('-')[1:3]) 302 | with io.TextIOWrapper(gzip.open(tar.extractfile(fh), 'r'), encoding='utf-8') as mmcif: 303 | with gzip.open(tar.extractfile( 304 | tar.getmember(f"AF-{uniprot_ac}-predicted_aligned_error_v4.json.gz"))) as pae: 305 | protein = pd.parseMMCIFStream(mmcif) 306 | protein = protein.select("protein and calpha") 307 | indices, shapemers = 
get_shapemers(protein, model, is_af=True, pae_file_data=pae) 308 | if len(shapemers): 309 | f_i.write(f"{uniprot_ac}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n") 310 | f_s.write(f"{uniprot_ac}\t{' '.join(str(key_to_index[s]) for s in shapemers)}\n") 311 | index += 1 312 | f_s.close() 313 | f_i.close() 314 | 315 | 316 | def make_corpus_from_file(filename, db_folder, output_folder): 317 | model = ShapemerLearn.load() 318 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension))) 319 | shapemer_key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys)))) 320 | f_s = open(output_folder / f"{filename.stem}_shapemers.txt", "w") 321 | f_i = open(output_folder / f"{filename.stem}_indices.txt", "w") 322 | with open(filename) as f: 323 | num_lines = sum(1 for _ in f) 324 | with tarfile.open(db_folder / f"{filename.stem}.tar") as tar: 325 | with open(filename) as f: 326 | for line in tqdm(f, total=num_lines): 327 | fh = line.strip() 328 | uniprot_ac = '-'.join(fh.split('-')[1:3]) 329 | with io.TextIOWrapper(gzip.open(tar.extractfile(tar.getmember(fh)), 'r'), encoding='utf-8') as mmcif: 330 | with gzip.open(tar.extractfile( 331 | tar.getmember(f"AF-{uniprot_ac}-predicted_aligned_error_v4.json.gz"))) as pae: 332 | protein = pd.parseMMCIFStream(mmcif) 333 | protein = protein.select("protein and calpha") 334 | indices, shapemers = get_shapemers(protein, model, is_af=True, pae_file_data=pae) 335 | if len(shapemers): 336 | f_i.write(f"{uniprot_ac}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n") 337 | f_s.write( 338 | f"{uniprot_ac}\t{' '.join(str(shapemer_key_to_index[s]) for s in shapemers)}\n") 339 | f_s.close() 340 | f_i.close() 341 | 342 | 343 | def make_corpus_pdb_folder(db_folder_divided, output_folder): 344 | model = ShapemerLearn.load() 345 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension))) 346 | key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys)))) 347 | f_s = open(output_folder / f"{db_folder_divided.stem}_shapemers.txt", "w") 348 | f_i = open(output_folder / f"{db_folder_divided.stem}_indices.txt", "w") 349 | for filename in tqdm(db_folder_divided.glob(f"*.ent.gz")): 350 | try: 351 | uid = filename.stem.split(".")[0].split("pdb")[1] 352 | with gzip.open(filename, 'r') as pdb: 353 | with io.TextIOWrapper(pdb, encoding='utf-8') as decoder: 354 | protein = pd.parsePDBStream(decoder) 355 | protein = protein.select("protein and calpha") 356 | if protein is None: 357 | continue 358 | coords = protein.getCoords() 359 | residue_slices = split_pdb_protein(protein) 360 | for start_index, end_index, chain in residue_slices: 361 | indices, shapemers = get_shapemers_from_coords(coords, model, residue_slices=[(start_index, end_index)]) 362 | if len(shapemers): 363 | f_i.write(f"{uid}_{chain}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n") 364 | f_s.write(f"{uid}_{chain}\t{' '.join(str(key_to_index[s]) for s in shapemers)}\n") 365 | except Exception as e: 366 | print(e) 367 | f_s.close() 368 | f_i.close() 369 | 370 | 371 | def make_corpus_proteinnet(db_folder, output_folder): 372 | model = ShapemerLearn.load() 373 | shapemer_keys = list(map(tuple, itertools.product([0, 1], repeat=model.output_dimension))) 374 | key_to_index = dict(zip(shapemer_keys, range(len(shapemer_keys)))) 375 | f_s = open(output_folder / f"proteinnet_shapemers.txt", "w") 376 | f_i = open(output_folder / f"proteinnet_indices.txt", "w") 377 | for filename in [db_folder / x for x in ["training_100", "validation", 
"testing"]]: 378 | with open(filename) as f: 379 | total = sum(1 for line in f if line == "[ID]\n") 380 | for entry in tqdm(proteinnet_parser.yield_records_from_file(filename, 20), total=total): 381 | entry = proteinnet_parser.clean_entry(entry, 'ca') 382 | uid = entry["ID"] 383 | indices, shapemers = get_shapemers_from_coords(entry["tertiary"], model) 384 | if len(shapemers): 385 | f_i.write(f"{uid}\t{indices[0]}\t{' '.join(str(s) for s in indices[1:])}\n") 386 | f_s.write(f"{uid}\t{' '.join(str(key_to_index[s]) for s in shapemers)}\n") 387 | f_s.close() 388 | f_i.close() 389 | 390 | 391 | def main(): 392 | parser = argparse.ArgumentParser() 393 | parser.add_argument("filename", type=Path) 394 | parser.add_argument("db_folder", type=Path) 395 | parser.add_argument("output_folder", type=Path) 396 | args = parser.parse_args() 397 | if args.filename: 398 | make_corpus_from_file(args.filename, args.db_folder, Path(str(args.output_folder).strip())) 399 | 400 | 401 | def main_pdb(): 402 | parser = argparse.ArgumentParser() 403 | parser.add_argument("db_folder_divided", type=Path) 404 | parser.add_argument("output_folder", type=Path) 405 | args = parser.parse_args() 406 | make_corpus_pdb_folder(args.db_folder_divided, Path(str(args.output_folder).strip())) 407 | 408 | 409 | if __name__ == '__main__': 410 | main_pdb() 411 | -------------------------------------------------------------------------------- /scripts/sbatch_community_summary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=AF90coms 4 | #SBATCH --qos=1day 5 | #SBATCH --cpus-per-task=128 6 | #SBATCH --mem=50G # --mem=1024G will send to bigmem 7 | #SBATCH --output=slurm_output/AF90_communities_output%A.out 8 | #SBATCH --error=slurm_output/AF90_communities_error%A.err 9 | 10 | python3 get_communities_summary.py data_generated_v2/AFDB90v4_cc_data.csv data_generated_v2/AFDB90v4_cc_data_uniprot_community_taxonomy_map.csv 128 -------------------------------------------------------------------------------- /scripts/sbatch_connect_component_collection.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=AF90AvsA 4 | #SBATCH --qos=1day 5 | #SBATCH --cpus-per-task=1 6 | #SBATCH --mem=100G # --mem=1024G will send to bigmem /// 300G for Uniref50 7 | #SBATCH --output=slurm_output/UR50_comp_output%A.out 8 | #SBATCH --error=slurm_output/UR50_comp_error%A.err 9 | 10 | DATABASES="databases" 11 | 12 | python3 get_connected_components.py ../data_generated/AFDBv4_90.fasta ../data_generated/AFDB90v4_all-gainst-all.m8 ../data_generated -------------------------------------------------------------------------------- /scripts/sbatch_make_communities_graph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=AF90cmgr 4 | #SBATCH --qos=1week 5 | #SBATCH --cpus-per-task=1 6 | #SBATCH --mem=20G # --mem=1024G will send to bigmem /// 300G for Uniref50 7 | #SBATCH --output=slurm_output/UR50_comp_output%A.out 8 | #SBATCH --error=slurm_output/UR50_comp_error%A.err 9 | 10 | python3 make_communities_map.py ../data_generated -------------------------------------------------------------------------------- /scripts/sbatch_mmseqs_AFDB90_all-against-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=UR50AvsA 4 | #SBATCH --qos=1day 5 | #SBATCH --cpus-per-task=60 6 | 
#SBATCH --mem=100G # --mem=1024G will send to bigmem 7 | #SBATCH --output=slurm_output/UR50_mmseqs_output%A.out 8 | #SBATCH --error=slurm_output/UR50_mmseqs_error%A.err 9 | 10 | DATABASES="../databases" 11 | DBFOLDER="${DATABASES}/mmseqsDBs" 12 | 13 | # load MMseqs 14 | ml MMseqs2 15 | 16 | # # # create database for mmseqs2 17 | # # mkdir $DBFOLDER 18 | mmseqs createdb AFDBv4_90.fasta ${DBFOLDER}/AFDB90v4 19 | 20 | # run mmseqs 21 | mmseqs easy-search AFDBv4_90.fasta ${DBFOLDER}/AFDB90v4 ../data_generated/AFDB90v4_all-gainst-all.m8 tmp -e 1e-4 --threads 60 22 | --------------------------------------------------------------------------------
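For orientation, the snippet below is a minimal sketch of how the main outputs of the graph pipeline can be inspected, assuming the default ../data_generated locations passed to the scripts in the sbatch files above (that folder name is an assumption taken from those job scripts): get_connected_components.py writes full_graph.gml and node_class.json (per-node subgraphID and communityID assignments), and make_communities_map.py writes communities_edge_list.csv with innode/outnode columns.

import json
import pandas as pd
import networkx as nx

outfolder = '../data_generated'  # assumption: the folder passed to the scripts above

# node -> connected component / community assignments written by get_connected_components.py
node_class = pd.DataFrame(json.load(open('{}/node_class.json'.format(outfolder))))
print(node_class.groupby('subgraphID').size().sort_values(ascending=False).head())
print(node_class.groupby('communityID').size().sort_values(ascending=False).head())

# sequence-similarity graph written by build_graph()
graph = nx.read_gml('{}/full_graph.gml'.format(outfolder))
print(graph.number_of_nodes(), graph.number_of_edges())

# community-level edge list written by make_communities_map.py (input for Cosmograph)
community_edges = pd.read_csv('{}/communities_edge_list.csv'.format(outfolder))
print(community_edges.head())

node_class.json is the same per-node table that make_communities_map.py collapses into the community-level graph rendered with Cosmograph.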