├── .gitignore ├── 1-preparing-data.ipynb ├── 2-topic-modelling.ipynb ├── 3-analysis.ipynb ├── README.md └── src ├── __init__.py ├── make_data.py ├── plotting.py ├── proteinnet_parser.py └── uniprot_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /1-preparing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from src import make_data\n", 20 | "from pathlib import Path" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": { 27 | "pycharm": { 28 | "name": "#%%\n" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# Folder to store all the data\n", 34 | "DATA_FOLDER = Path(\"/path/to/data/folder\")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "pycharm": { 41 | "name": "#%% md\n" 42 | } 43 | }, 44 | "source": [ 45 | "Download AlphaFold (AF) database proteins as tar files and extract" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "pycharm": { 53 | "name": "#%%\n" 54 | } 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "make_data.download_data(DATA_FOLDER)\n", 59 | "make_data.extract_data(DATA_FOLDER, DATA_FOLDER)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "pycharm": { 66 | "name": "#%% md\n" 67 | } 68 | }, 69 | "source": [ 70 | "Download UniProt annotations for all AF proteins, split by species" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "pycharm": { 78 | "name": "#%%\n" 79 | } 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "uniprot_folder = DATA_FOLDER / \"uniprot_files\"\n", 84 | "if not uniprot_folder.exists():\n", 85 | " uniprot_folder.mkdir()\n", 86 | "make_data.get_uniprot_info(DATA_FOLDER, uniprot_folder)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "pycharm": { 93 | "name": "#%% md\n" 94 | } 95 | }, 96 | "source": [ 97 | "Get average pLDDT scores, number of high confidence residues, and total number of residues for each AF protein" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "metadata": { 104 | "pycharm": { 105 | "name": "#%%\n" 106 | }, 107 | "scrolled": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | "3988it [01:33, 42.74it/s]\n", 115 | "19694it [10:20, 31.73it/s]\n", 116 | "12622it [09:22, 22.44it/s]\n", 117 | "23391it [21:38, 18.01it/s]\n", 118 | "27434it [16:18, 28.05it/s]\n", 119 | "39299it [22:24, 29.22it/s]\n", 120 | "4363it [02:29, 29.27it/s]\n", 121 | "13458it [09:59, 22.45it/s]\n", 122 | "1773it [00:51, 34.39it/s]\n", 123 | "5187it [05:02, 17.14it/s]\n", 124 | "19036it [13:44, 23.09it/s]\n", 125 | "6040it [04:20, 23.15it/s]\n", 126 | "5128it [03:46, 22.66it/s]\n", 127 | "21272it [16:47, 21.10it/s]\n", 128 | "7924it [07:17, 18.10it/s]\n", 129 | "2888it [01:25, 33.86it/s]\n", 130 | "55799it [32:34, 28.55it/s]\n", 131 | "5974it [04:12, 23.64it/s]]\n", 132 | 
"21615it [15:28, 23.29it/s]\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "avg_scores, lengths_high_confidence, lengths_full = make_data.get_AF_protein_information(DATA_FOLDER)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "uniprot_folder = DATA_FOLDER / \"uniprot_go\"" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "pycharm": { 153 | "name": "#%% md\n" 154 | } 155 | }, 156 | "source": [ 157 | "Combine all UniProt data and scores into a dataframe" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 9, 163 | "metadata": { 164 | "pycharm": { 165 | "name": "#%%\n" 166 | } 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stderr", 171 | "output_type": "stream", 172 | "text": [ 173 | "/mnt/backup2/geometric/geo_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3357: DtypeWarning: Columns (5) have mixed types.Specify dtype option on import or set low_memory=False.\n", 174 | " if (await self.run_code(code, result, async_=asy)):\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "import pandas as pnd\n", 180 | "AF_dataframe = pnd.concat([pnd.read_csv(filename, sep=\"\\t\") for filename in uniprot_folder.glob(\"UP*_uniprot.txt\")])\n", 181 | "AF_dataframe[\"Protein family\"] = [str(val).split(\",\")[0] for val in AF_dataframe[\"Protein families\"]] # Superfamily\n", 182 | "AF_dataframe[\"Organism\"] = [\" \".join(str(val).split(\" (\")[0].split(\" \")[:2]) for val in AF_dataframe[\"Organism\"]]\n", 183 | "AF_dataframe[\"ID\"] = [f\"AF-{k}-F1-model_v1.pdb\" for k in AF_dataframe[\"Entry\"]]\n", 184 | "AF_dataframe[\"Avg. score\"] = [avg_scores[key] if key in avg_scores else 40 for key in AF_dataframe[\"ID\"]]\n", 185 | "AF_dataframe[\"Length\"] = [lengths_full[key] if key in lengths_full else 0 for key in AF_dataframe[\"ID\"]]\n", 186 | "AF_dataframe[\"High confidence length\"] = [lengths_high_confidence[key] if key in lengths_high_confidence else 0 for key in AF_dataframe[\"ID\"]]\n", 187 | "\n", 188 | "AF_dataframe = AF_dataframe[[c for c in AF_dataframe.columns if not c.startswith(\"yourlist\")]]\n", 189 | "AF_dataframe.to_csv(DATA_FOLDER / \"AF_dataframe.txt\", sep=\"\\t\")\n", 190 | "AF_dataframe = AF_dataframe.set_index(\"ID\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "pycharm": { 197 | "name": "#%% md\n" 198 | } 199 | }, 200 | "source": [ 201 | "Calculate shapemers for each AF protein" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "pycharm": { 209 | "name": "#%%\n" 210 | } 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "make_data.get_AF_shapemers(DATA_FOLDER)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "pycharm": { 221 | "name": "#%% md\n" 222 | } 223 | }, 224 | "source": [ 225 | "Download and extract CASP12 data from\n", 226 | "`https://sharehost.hms.harvard.edu/sysbio/alquraishi/proteinnet/human_readable/casp12.tar.gz`\n", 227 | "into DATA_FOLDER / casp12\n", 228 | "\n", 229 | "Calculate shapemers for all CASP12 proteins" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "pycharm": { 237 | "name": "#%%\n" 238 | } 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "make_data.get_PDB_shapemers(DATA_FOLDER / \"casp12\" / \"training_100\",\n", 243 | " DATA_FOLDER)\n", 244 | 
"make_data.get_PDB_shapemers(DATA_FOLDER / \"casp12\" / \"validation\",\n", 245 | " DATA_FOLDER)\n", 246 | "make_data.get_PDB_shapemers(DATA_FOLDER / \"casp12\" / \"testing\",\n", 247 | " DATA_FOLDER)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "pycharm": { 254 | "name": "#%% md\n" 255 | } 256 | }, 257 | "source": [ 258 | "Get UniProt annotations for all PDB proteins" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 16, 264 | "metadata": { 265 | "pycharm": { 266 | "name": "#%%\n" 267 | } 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | "1043it [16:36, 1.05it/s]\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "import itertools\n", 280 | "from src import uniprot_parser\n", 281 | "import pickle\n", 282 | "\n", 283 | "corpus_files = DATA_FOLDER.glob(\"*_ids_corpus_res4_6*.txt\")\n", 284 | "keys = (line.strip().split(\"\\t\")[0] for line in itertools.chain.from_iterable((open(file) for file in corpus_files)))\n", 285 | "pdb_ids = []\n", 286 | "for k in keys:\n", 287 | " if k.endswith(\".pdb\"):\n", 288 | " continue\n", 289 | " if \"#\" in k:\n", 290 | " if \"TBM\" in k or \"FM\" in k:\n", 291 | " continue\n", 292 | " k = k.split(\"#\")[1][:4]\n", 293 | " else:\n", 294 | " k = k[:4]\n", 295 | " pdb_ids.append(k)\n", 296 | "uniprot_parser.get_uniprot_info_from_ids(pdb_ids,\n", 297 | " DATA_FOLDER / \"uniprot_go\" / \"casp12_uniprot.txt\",\n", 298 | " identifier=\"PDB_ID\",\n", 299 | " columns=make_data.UNIPROT_COLUMNS,\n", 300 | " chunk=True)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 11, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "import pickle" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "pycharm": { 317 | "name": "#%%\n" 318 | } 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "coords = make_data.get_PDB_protein_information([DATA_FOLDER / \"casp12\" / f for f in [\"training_100\",\n", 323 | " \"validation\",\n", 324 | " \"testing\"]])" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 12, 330 | "metadata": { 331 | "pycharm": { 332 | "name": "#%%\n" 333 | } 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "with open(DATA_FOLDER / \"PDB_coords.pkl\", \"wb\") as f:\n", 338 | " pickle.dump(coords, f)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 17, 344 | "metadata": { 345 | "pycharm": { 346 | "name": "#%%\n" 347 | } 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "PDB_dataframe = pnd.read_csv(DATA_FOLDER / \"uniprot_go\" / \"casp12_uniprot.txt\", sep=\"\\t\")\n", 352 | "mapping_column = [c for c in PDB_dataframe.columns if c.startswith(\"yourlist\")][0]\n", 353 | "PDB_dataframe[\"PDB_ID\"] = PDB_dataframe[mapping_column]\n", 354 | "PDB_dataframe[\"Protein family\"] = [str(val).split(\",\")[0] for val in PDB_dataframe[\"Protein families\"]] # Superfamily\n", 355 | "PDB_dataframe[\"Organism\"] = [\" \".join(str(val).split(\" (\")[0].split(\" \")[:2]) for val in PDB_dataframe[\"Organism\"]]\n", 356 | "PDB_dataframe = PDB_dataframe[[c for c in PDB_dataframe.columns if c != mapping_column]]" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "pycharm": { 363 | "name": "#%% md\n" 364 | } 365 | }, 366 | "source": [ 367 | "Match AF proteins with previously determined PDB proteins" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | 
"execution_count": 18, 373 | "metadata": { 374 | "pycharm": { 375 | "name": "#%%\n" 376 | } 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "import numpy as np\n", 381 | "from collections import defaultdict\n", 382 | "\n", 383 | "AF_PDB_cross_references = AF_dataframe['Cross-reference (PDB)'][AF_dataframe['Cross-reference (PDB)'].notna()]\n", 384 | "AF_PDB_mapping = {key: AF_PDB_cross_references[key] for key in AF_PDB_cross_references.keys()}\n", 385 | "PDB_AF_mapping = defaultdict(list)\n", 386 | "for p in AF_PDB_mapping:\n", 387 | " if type(AF_PDB_mapping[p]) == str:\n", 388 | " for p1 in AF_PDB_mapping[p][:-1].split(\";\"):\n", 389 | " PDB_AF_mapping[p1].append(p)\n", 390 | " else:\n", 391 | " for p1 in AF_PDB_mapping[p].values:\n", 392 | " PDB_AF_mapping[p1[:-1]].append(p)\n", 393 | "\n", 394 | "PDB_dataframe[\"AF\"] = [\";\".join(PDB_AF_mapping[p]) if p in PDB_AF_mapping else np.nan for p in PDB_dataframe[\"PDB_ID\"]]\n", 395 | "PDB_dataframe.to_csv(DATA_FOLDER / \"PDB_dataframe.txt\", sep=\"\\t\")" 396 | ] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "geo_env", 402 | "language": "python", 403 | "name": "geo_env" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.8.8" 416 | } 417 | }, 418 | "nbformat": 4, 419 | "nbformat_minor": 1 420 | } 421 | -------------------------------------------------------------------------------- /2-topic-modelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 12 | "from sklearn.preprocessing import StandardScaler\n", 13 | "import itertools\n", 14 | "from pathlib import Path\n", 15 | "from sklearn.decomposition import NMF\n", 16 | "import openTSNE\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "pycharm": { 25 | "name": "#%%\n" 26 | } 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "DATA_FOLDER = Path(\"data\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "pycharm": { 37 | "name": "#%% md\n" 38 | } 39 | }, 40 | "source": [ 41 | "Loading all corpus files:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "pycharm": { 49 | "name": "#%%\n" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "corpus_files = DATA_FOLDER.glob(\"*_ids_corpus_resolution_4_6*.txt\")\n", 55 | "keys_corpus = (line.strip().split(\"\\t\") for line in itertools.chain.from_iterable((open(file) for file in corpus_files)))\n", 56 | "keys, corpus = itertools.tee(keys_corpus)\n", 57 | "keys = [k[0] for k in keys]\n", 58 | "corpus = (k[1] for k in corpus)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "pycharm": { 65 | "name": "#%% md\n" 66 | } 67 | }, 68 | "source": [ 69 | "Calculating the TFIDF matrix:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "pycharm": { 77 | "name": "#%%\n" 78 | } 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "print(f\"Getting TFIDF matrix for {len(keys)} 
proteins...\")\n", 83 | "vectorizer = TfidfVectorizer(min_df=2)\n", 84 | "tfidf_matrix = vectorizer.fit_transform(corpus)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Fitting NMF model:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "pycharm": { 99 | "name": "#%%\n" 100 | } 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "num_topics = 250\n", 105 | "topic_model = NMF(n_components=num_topics,\n", 106 | " random_state=42,\n", 107 | " solver='cd', tol=0.0005,\n", 108 | " max_iter=500,\n", 109 | " alpha=.1,\n", 110 | " l1_ratio=.5,\n", 111 | " verbose=1)\n", 112 | "w_matrix = topic_model.fit_transform(tfidf_matrix)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Normalizing $W$ matrix for plotting:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "pycharm": { 127 | "name": "#%%\n" 128 | } 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "scaler = StandardScaler()\n", 133 | "w_matrix_norm = scaler.fit_transform(w_matrix)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Fitting t-SNE model initialized with PCA on $W$ matrix:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "pycharm": { 148 | "name": "#%%\n" 149 | } 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "tsne_reducer = openTSNE.TSNE(\n", 154 | " perplexity=50,\n", 155 | " initialization=\"pca\",\n", 156 | " metric=\"cosine\",\n", 157 | " n_jobs=14,\n", 158 | " random_state=42,\n", 159 | " n_iter=1000,\n", 160 | " verbose=True\n", 161 | " )\n", 162 | "reduced = tsne_reducer.fit(w_matrix_norm)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Saving everything:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "pycharm": { 177 | "name": "#%%\n" 178 | } 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "with open(DATA_FOLDER / \"topic_modelling_data.pkl\", \"wb\") as f:\n", 183 | " pickle.dump((keys,\n", 184 | " vectorizer, tfidf_matrix,\n", 185 | " topic_model, w_matrix,\n", 186 | " scaler, w_matrix_norm,\n", 187 | " tsne_reducer, reduced), f)" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.8.8" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 1 212 | } 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Structural fold space for AF2 predicted models across 21 proteomes 2 | 3 | Topic modelling of structural shape-mers to explore differences between the [AlphaFold DB](https://alphafold.ebi.ac.uk/) and the [PDB](https://www.rcsb.org/) 4 | 5 | ## Requirements 6 | 7 | * numpy 8 | * scipy 9 | * scikit-learn 10 | * geometricus (https://github.com/TurtleTools/geometricus) 11 | * portein (https://github.com/TurtleTools/portein) 12 | * 
kneed (https://github.com/arvkevi/kneed)
13 | * pandas
14 | * prody
15 | * tqdm
16 | * openTSNE
17 | * matplotlib
18 | * requests
19 | 
20 | 
21 | ## Publications
22 | 
23 | Akdel, M., Pires, D.E., Pardo, E.P., Jänes, J., Zalevsky, A.O., Mészáros, B., Bryant, P., Good, L.L., Laskowski, R.A., Pozzati, G. and Shenoy, A., 2021. A structural biology community assessment of AlphaFold 2 applications. bioRxiv.
24 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TurtleTools/alphafold-structural-space/d4baeb461ba76bd3c684418920e9ef78e7f45480/src/__init__.py
--------------------------------------------------------------------------------
/src/make_data.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from ftplib import FTP
3 | import prody as pd
4 | from dataclasses import dataclass
5 | import numpy as np
6 | import typing as ty
7 | from geometricus import MomentInvariants, SplitType
8 | import tarfile
9 | from time import time
10 | from tqdm import tqdm
11 | from scipy import ndimage
12 | 
13 | from src import uniprot_parser, proteinnet_parser
14 | 
15 | UNIPROT_COLUMNS = ",".join(("id", "entry name", 'genes', 'genes(PREFERRED)', 'genes(ALTERNATIVE)',
16 |                             'genes(OLN)', 'genes(ORF)', "organism", "protein names", "families",
17 |                             'go', 'go(biological process)', 'go(molecular function)',
18 |                             'go(cellular component)', 'database(PDB)', 'database(Pfam)'))
19 | 
20 | 
21 | @dataclass
22 | class MomentInvariantsSavable:
23 |     name: str
24 |     moments: ty.Union[np.ndarray, None]
25 |     coordinates: ty.Union[np.ndarray, None]
26 | 
27 |     @classmethod
28 |     def from_invariant(cls, invariant: MomentInvariants):
29 |         return cls(invariant.name, invariant.moments, invariant.coordinates)
30 | 
31 | 
32 | def download_data(output_folder: Path):
33 |     if not output_folder.exists():
34 |         output_folder.mkdir()
35 |     ftp = FTP('ftp.ebi.ac.uk')
36 |     ftp.login()
37 |     ftp.cwd("/pub/databases/alphafold")
38 |     for filename in ftp.nlst():
39 |         print(f"Retrieving {filename}")
40 |         with open(output_folder / filename, 'wb') as f:
41 |             ftp.retrbinary('RETR ' + filename, f.write)
42 | 
43 | 
44 | def extract_data(input_folder, output_folder):
45 |     for filename in input_folder.glob("*.tar"):
46 |         tar = tarfile.open(str(filename))
47 |         folder = output_folder / str(filename.stem)
48 |         if not folder.exists():
49 |             folder.mkdir()
50 |         tar.extractall(str(folder))  # specify which folder to extract to
51 |         tar.close()
52 | 
53 | 
54 | def get_uniprot_info(data_folder, aux_folder, extension="pdb.gz"):
55 |     start_time = time()
56 |     for folder in data_folder.iterdir():
57 |         if folder.is_dir() and folder.stem.startswith("UP0"):
58 |             uniprot_file = aux_folder / f"{folder.stem}_uniprot.txt"
59 |             uniprot_ids = [filename.stem.split("-")[1] for filename in folder.glob(f"*{extension}")]
60 |             if not uniprot_file.exists():
61 |                 uniprot_parser.get_uniprot_info_from_ids(uniprot_ids, uniprot_file, chunk=True,
62 |                                                          columns=UNIPROT_COLUMNS)
63 |             print(f"{folder.stem}: Time elapsed: {time() - start_time}s")
64 | 
65 | 
66 | def get_AF_shapemers(root_folder,
67 |                      resolution_kmer=4,
68 |                      resolution_radius=6,
69 |                      length_threshold=50):
70 |     root_folder = Path(root_folder)
71 |     with open(
72 |             root_folder /
73 |             f"AF_ids_corpus_resolution_{resolution_kmer}_{resolution_radius}_threshold_{length_threshold}.txt",
74 |             "w") as corpus_file:
75 |         for folder in root_folder.iterdir():
76 |             if folder.is_dir() and folder.stem.startswith("UP0"):
77 
| print(folder.stem) 78 | for i, pdb_file in tqdm(enumerate(folder.glob("*.pdb.gz"))): 79 | key = pdb_file.stem 80 | pdb = pd.parsePDB(str(pdb_file)).select("protein and calpha") 81 | betas = ndimage.gaussian_filter1d(pdb.getBetas(), sigma=5) 82 | coords = pdb.getCoords() 83 | sequence = pdb.getSequence() 84 | 85 | indices = np.ones(betas.shape[0], dtype=int) 86 | indices[np.where(betas < 70)] = 0 87 | 88 | slices = ndimage.find_objects(ndimage.label(indices)[0]) 89 | index = 0 90 | shapemers = [] 91 | for s in slices: 92 | s = s[0] 93 | if s.stop - s.start > length_threshold: 94 | index += 1 95 | invariants = MomentInvariants.from_coordinates( 96 | key, 97 | coords[s.start: s.stop], 98 | sequence[s.start: s.stop], 99 | split_type=SplitType.KMER_CUT, 100 | split_size=16 101 | ) 102 | shapemers += [f"k{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 103 | (np.log1p(invariants.moments) * resolution_kmer).astype(int)] 104 | invariants = MomentInvariants.from_coordinates( 105 | key, 106 | coords[s.start: s.stop], 107 | sequence[s.start: s.stop], 108 | split_type=SplitType.RADIUS, 109 | split_size=10 110 | ) 111 | shapemers += [f"r{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 112 | (np.log1p(invariants.moments) * resolution_radius).astype(int)] 113 | if index > 0: 114 | corpus_file.write(key + "\t" + " ".join(shapemers) + "\n") 115 | 116 | 117 | def get_PDB_shapemers(casp_file, root_folder, resolution_kmer=4, resolution_radius=6): 118 | with open(Path(root_folder) / f"PDB_{casp_file.stem}_ids_corpus_resolution_{resolution_kmer}_{resolution_radius}.txt", 119 | "w") as corpus_file: 120 | for entry in tqdm(proteinnet_parser.yield_records_from_file(casp_file, 20)): 121 | entry = proteinnet_parser.clean_entry(entry, 'ca') 122 | invariants = MomentInvariants.from_coordinates( 123 | entry["ID"], 124 | entry["tertiary"], 125 | entry["primary"], 126 | split_type=SplitType.KMER_CUT, 127 | split_size=16 128 | ) 129 | shapemers = [f"k{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 130 | (np.log1p(invariants.moments) * resolution_kmer).astype(int)] 131 | invariants = MomentInvariants.from_coordinates( 132 | entry["ID"], 133 | entry["tertiary"], 134 | entry["primary"], 135 | split_type=SplitType.RADIUS, 136 | split_size=10 137 | ) 138 | shapemers += [f"r{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 139 | (np.log1p(invariants.moments) * resolution_radius).astype(int)] 140 | if len(shapemers): 141 | corpus_file.write(entry["ID"] + "\t" + " ".join(shapemers) + "\n") 142 | 143 | 144 | def get_AF_protein_information(data_folder): 145 | data_folder = Path(data_folder) 146 | avg_scores = {} 147 | lengths_high_confidence = {} 148 | lengths_full = {} 149 | for folder in data_folder.iterdir(): 150 | if folder.is_dir() and folder.stem.startswith("UP0"): 151 | for filename in tqdm(folder.glob("*.pdb.gz")): 152 | pdb = pd.parsePDB(str(filename)) 153 | if pdb is None: 154 | continue 155 | key = filename.stem 156 | avg_scores[key] = np.median(pdb.getBetas()) 157 | pdb = pdb.select("protein and calpha") 158 | if pdb is None: 159 | continue 160 | lengths_full[key] = len(pdb) 161 | pdb = pdb.select("beta > 70") 162 | if pdb is None: 163 | continue 164 | lengths_high_confidence[key] = len(pdb) 165 | return avg_scores, lengths_high_confidence, lengths_full 166 | 167 | 168 | def get_PDB_protein_information(casp_files): 169 | coords = {} 170 | for casp_file in casp_files: 171 | for entry in tqdm(proteinnet_parser.yield_records_from_file(casp_file, 20)): 172 | entry = proteinnet_parser.clean_entry(entry, 'ca') 173 | coords[entry["ID"]] = entry["tertiary"] 174 | 
return coords 175 | -------------------------------------------------------------------------------- /src/plotting.py: -------------------------------------------------------------------------------- 1 | from portein import get_best_transformation, apply_transformation, find_size 2 | import prody as pd 3 | from geometricus import MomentInvariants, SplitType 4 | from scipy.signal import resample 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_coords_topic_scores(coords, topic_id, h_matrix_norm, shapemer_to_index): 10 | def shapemer_to_topic_value(s_string): 11 | if s_string in shapemer_to_index: 12 | return h_matrix_norm[topic_id][shapemer_to_index[s_string]] 13 | else: 14 | return 0 15 | 16 | resolution_kmer = 4 17 | resolution_radius = 6 18 | weights = np.zeros(coords.shape[0]) 19 | protein_invariants = MomentInvariants.from_coordinates("protein_id", 20 | coords, 21 | None, 22 | split_type=SplitType.KMER_CUT) 23 | 24 | def get_similarity(x1, x2, gamma=0.03): 25 | return np.exp(-gamma * np.sum((coords[x1] - coords[x2]) ** 2, axis=-1)) 26 | 27 | shapemers = (np.log1p(protein_invariants.moments) * resolution_kmer).astype(int) 28 | for i, x in enumerate(shapemers): 29 | weight = shapemer_to_topic_value(f"k{x[0]}i{x[1]}i{x[2]}i{x[3]}") 30 | for index in range(i, i + 16): 31 | weights[i] += weight * get_similarity(i, index) 32 | protein_invariants = MomentInvariants.from_coordinates( 33 | "protein_id", 34 | coords, 35 | None, 36 | split_type=SplitType.RADIUS, 37 | split_size=10) 38 | shapemers = (np.log1p(protein_invariants.moments) * resolution_radius).astype(int) 39 | for i, x in enumerate(shapemers): 40 | weight = shapemer_to_topic_value(f"r{x[0]}i{x[1]}i{x[2]}i{x[3]}") 41 | for index in protein_invariants.split_indices[i]: 42 | weights[index] += weight * get_similarity(i, index) 43 | return weights 44 | 45 | def get_protein_topic_scores(path, topic_id, h_matrix_norm, shapemer_to_index, matplotlib=True): 46 | pdb = pd.parsePDB(str(path)) 47 | pdb_alpha = pdb.select("protein and calpha") 48 | opacities = pdb_alpha.getBetas() / 100 49 | coords = pdb_alpha.getCoords() 50 | weights = get_coords_topic_scores(coords, topic_id, h_matrix_norm, shapemer_to_index) 51 | if matplotlib: 52 | coords = apply_transformation(coords, get_best_transformation(coords)) 53 | return coords, weights, opacities 54 | else: 55 | matrix = get_best_transformation(coords) 56 | pdb = pd.applyTransformation(pd.Transformation(matrix), pdb) 57 | for i, res in enumerate(pdb.iterResidues()): 58 | res.setBetas([weights[i]] * len(res)) 59 | return pdb 60 | 61 | 62 | def plot_protein(coords, weights, opacities, max_value, upsample_rate=3): 63 | coords = resample(coords[:, :2], upsample_rate * coords.shape[0]) 64 | weights = np.repeat(weights, upsample_rate) 65 | opacities = np.repeat(opacities, upsample_rate) 66 | colors = [plt.cm.coolwarm(int(256 * (x / max_value))) for x in weights] 67 | fig, ax = plt.subplots(figsize=find_size(coords, height=5, width=None)) 68 | for i in range(coords.shape[0] - upsample_rate): 69 | ax.plot(coords[:, 0][i:i + 2], coords[:, 1][i:i + 2], 70 | lw=2, color=colors[i], alpha=opacities[i]) 71 | plt.axis("off") 72 | return fig, ax 73 | -------------------------------------------------------------------------------- /src/proteinnet_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text-based parser for ProteinNet Records. 
3 | """ 4 | 5 | __author__ = "Mohammed AlQuraishi" 6 | __copyright__ = "Copyright 2019, Harvard Medical School" 7 | __license__ = "MIT" 8 | 9 | # !/usr/bin/python 10 | 11 | # imports 12 | import sys 13 | import re 14 | import numpy as np 15 | from itertools import groupby 16 | 17 | # Constants 18 | NUM_DIMENSIONS = 3 19 | 20 | # Functions for conversion from Mathematica protein files to TFRecords 21 | _aa_dict = {'A': '0', 'C': '1', 'D': '2', 'E': '3', 'F': '4', 'G': '5', 'H': '6', 'I': '7', 'K': '8', 'L': '9', 'M': '10', 'N': '11', 'P': '12', 'Q': '13', 'R': '14', 'S': '15', 'T': '16', 'V': '17', 'W': '18', 'Y': '19'} 22 | _dssp_dict = {'L': '0', 'H': '1', 'B': '2', 'E': '3', 'G': '4', 'I': '5', 'T': '6', 'S': '7'} 23 | _mask_dict = {'-': '0', '+': '1'} 24 | 25 | 26 | def letter_to_num(string, dict_): 27 | """ Convert string of letters to list of ints """ 28 | patt = re.compile('[' + ''.join(dict_.keys()) + ']') 29 | num_string = patt.sub(lambda m: dict_[m.group(0)] + ' ', string) 30 | return [int(i) for i in num_string.split()] 31 | 32 | 33 | def yield_records_from_file(file, num_evo_entries: int = 20): 34 | def get_record(lines): 35 | entry = {"ID": lines[0].strip()} 36 | for i, line in enumerate(lines): 37 | if line == '[PRIMARY]' + '\n': 38 | primary = lines[i + 1].strip() 39 | entry.update({'primary': primary}) 40 | elif line == '[EVOLUTIONARY]' + '\n': 41 | evolutionary = [] 42 | for residue in range(num_evo_entries): 43 | evolutionary.append( 44 | [float(step) for step in lines[i + 1].strip().split()] 45 | ) 46 | entry.update({'evolutionary': np.array(evolutionary)}) 47 | elif line == '[SECONDARY]' + '\n': 48 | secondary = letter_to_num(lines[i + 1].strip(), _dssp_dict) 49 | entry.update({'secondary': secondary}) 50 | elif line == '[TERTIARY]' + '\n': 51 | tertiary = [] 52 | for axis in range(NUM_DIMENSIONS): 53 | tertiary.append([float(coord) for coord in lines[i + 1 + axis].strip().split()]) 54 | entry.update({'tertiary': np.array(tertiary).T}) 55 | elif line == '[MASK]' + '\n': 56 | mask = letter_to_num(lines[i + 1].strip(), _mask_dict) 57 | entry.update({'mask': mask}) 58 | else: 59 | continue 60 | return entry 61 | 62 | for k, g in groupby(open(file, "r"), lambda x: x.startswith("[ID]")): 63 | if not k: 64 | yield get_record(list(g)) 65 | 66 | 67 | def clean_entry(entry, atom="ca"): 68 | sequence = "primary" 69 | mask = np.where(np.array(entry['mask']) == 1)[0] 70 | entry[sequence] = ''.join(entry[sequence][x] for x in mask) 71 | mask_3d = np.array([i for n in mask for i in range(n*3, n*3+3)]).astype(int) 72 | entry['tertiary'] = entry['tertiary'][mask_3d] 73 | if atom == "ca": 74 | index = 1 75 | elif atom == "n": 76 | index = 0 77 | elif atom == "cb": 78 | index = 2 79 | else: 80 | raise ValueError("atom must be one of n, ca, cb") 81 | entry['tertiary'] = entry['tertiary'][np.arange(index, entry['tertiary'].shape[0]+index, 3)] / 100 82 | assert entry['tertiary'].shape[0] == len(entry[sequence]), (entry['tertiary'].shape[0], len(entry[sequence])) 83 | return entry 84 | -------------------------------------------------------------------------------- /src/uniprot_parser.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from tqdm import tqdm 3 | from pathlib import Path 4 | 5 | # Cross References 6 | DB_ABBREVS = ["database(EMBL)"] + ["database(" + line.strip().split(": ")[1] + ")" for line in 7 | requests.get("https://www.uniprot.org/docs/dbxref.txt").text.split("\n") if 8 | "Abbrev:" in line] 9 | 10 | COLUMN_NAMES = [ 
11 |     # Names & Taxonomy
12 |     'id', 'entry name', 'genes', 'genes(PREFERRED)', 'genes(ALTERNATIVE)',
13 |     'genes(OLN)', 'genes(ORF)', 'organism', 'organism-id', 'protein names',
14 |     'proteome', 'lineage(ALL)', 'lineage-id', 'virus hosts',
15 |     # Sequences
16 |     'fragment', 'sequence', 'length', 'mass', 'encodedon',
17 |     'comment(ALTERNATIVE PRODUCTS)', 'comment(ERRONEOUS GENE MODEL PREDICTION)',
18 |     'comment(ERRONEOUS INITIATION)', 'comment(ERRONEOUS TERMINATION)',
19 |     'comment(ERRONEOUS TRANSLATION)', 'comment(FRAMESHIFT)',
20 |     'comment(MASS SPECTROMETRY)', 'comment(POLYMORPHISM)',
21 |     'comment(RNA EDITING)', 'comment(SEQUENCE CAUTION)',
22 |     'feature(ALTERNATIVE SEQUENCE)', 'feature(NATURAL VARIANT)',
23 |     'feature(NON ADJACENT RESIDUES)',
24 |     'feature(NON STANDARD RESIDUE)', 'feature(NON TERMINAL RESIDUE)',
25 |     'feature(SEQUENCE CONFLICT)', 'feature(SEQUENCE UNCERTAINTY)',
26 |     'version(sequence)',
27 |     # Family and Domains
28 |     'domains', 'domain', 'comment(DOMAIN)', 'comment(SIMILARITY)',
29 |     'feature(COILED COIL)', 'feature(COMPOSITIONAL BIAS)',
30 |     'feature(DOMAIN EXTENT)', 'feature(MOTIF)', 'feature(REGION)',
31 |     'feature(REPEAT)', 'feature(ZINC FINGER)',
32 |     # Function
33 |     'ec', 'comment(ABSORPTION)', 'comment(CATALYTIC ACTIVITY)',
34 |     'comment(COFACTOR)', 'comment(ENZYME REGULATION)', 'comment(FUNCTION)',
35 |     'comment(KINETICS)', 'comment(PATHWAY)', 'comment(REDOX POTENTIAL)',
36 |     'comment(TEMPERATURE DEPENDENCE)', 'comment(PH DEPENDENCE)',
37 |     'feature(ACTIVE SITE)', 'feature(BINDING SITE)', 'feature(DNA BINDING)',
38 |     'feature(METAL BINDING)', 'feature(NP BIND)', 'feature(SITE)',
39 |     # Gene Ontology
40 |     'go', 'go(biological process)', 'go(molecular function)',
41 |     'go(cellular component)', 'go-id',
42 |     # InterPro
43 |     'interpro',
44 |     # Interaction
45 |     'interactor', 'comment(SUBUNIT)',
46 |     # Publications
47 |     'citation', 'citationmapping',
48 |     # Date of
49 |     'created', 'last-modified', 'sequence-modified', 'version(entry)',
50 |     # Structure
51 |     '3d', 'feature(BETA STRAND)', 'feature(HELIX)', 'feature(TURN)',
52 |     # Subcellular location
53 |     'comment(SUBCELLULAR LOCATION)', 'feature(INTRAMEMBRANE)',
54 |     'feature(TOPOLOGICAL DOMAIN)',
55 |     'feature(TRANSMEMBRANE)',
56 |     # Miscellaneous
57 |     'annotation score', 'score', 'features', 'comment(CAUTION)',
58 |     'comment(TISSUE SPECIFICITY)',
59 |     'comment(GENERAL)', 'keywords', 'context', 'existence', 'tools',
60 |     'reviewed', 'feature', 'families', 'subcellular locations', 'taxonomy',
61 |     'version', 'clusters', 'comments', 'database', 'keyword-id', 'pathway',
62 |     'score',
63 |     # Pathology & Biotech
64 |     'comment(ALLERGEN)', 'comment(BIOTECHNOLOGY)', 'comment(DISRUPTION PHENOTYPE)',
65 |     'comment(DISEASE)', 'comment(PHARMACEUTICAL)', 'comment(TOXIC DOSE)',
66 |     # PTM / Processing
67 |     'comment(PTM)', 'feature(CHAIN)', 'feature(CROSS LINK)', 'feature(DISULFIDE BOND)',
68 |     'feature(GLYCOSYLATION)', 'feature(INITIATOR METHIONINE)', 'feature(LIPIDATION)',
69 |     'feature(MODIFIED RESIDUE)', 'feature(PEPTIDE)', 'feature(PROPEPTIDE)',
70 |     'feature(SIGNAL)', 'feature(TRANSIT)',
71 |     # Taxonomic lineage
72 |     'lineage(all)', 'lineage(SUPERKINGDOM)', 'lineage(KINGDOM)', 'lineage(SUBKINGDOM)',
73 |     'lineage(SUPERPHYLUM)', 'lineage(PHYLUM)', 'lineage(SUBPHYLUM)', 'lineage(SUPERCLASS)',
74 |     'lineage(CLASS)', 'lineage(SUBCLASS)', 'lineage(INFRACLASS)', 'lineage(SUPERORDER)',
75 |     'lineage(ORDER)', 'lineage(SUBORDER)', 'lineage(INFRAORDER)', 'lineage(PARVORDER)',
76 |     'lineage(SUPERFAMILY)', 'lineage(FAMILY)',
'lineage(SUBFAMILY)', 'lineage(TRIBE)',
77 |     'lineage(SUBTRIBE)', 'lineage(GENUS)', 'lineage(SUBGENUS)', 'lineage(SPECIES GROUP)',
78 |     'lineage(SPECIES SUBGROUP)', 'lineage(SPECIES)', 'lineage(SUBSPECIES)', 'lineage(VARIETAS)',
79 |     'lineage(FORMA)',
80 |     # Taxonomic identifier
81 |     'lineage-id(all)', 'lineage-id(SUPERKINGDOM)', 'lineage-id(KINGDOM)', 'lineage-id(SUBKINGDOM)',
82 |     'lineage-id(SUPERPHYLUM)', 'lineage-id(PHYLUM)', 'lineage-id(SUBPHYLUM)', 'lineage-id(SUPERCLASS)',
83 |     'lineage-id(CLASS)', 'lineage-id(SUBCLASS)', 'lineage-id(INFRACLASS)', 'lineage-id(SUPERORDER)',
84 |     'lineage-id(ORDER)', 'lineage-id(SUBORDER)', 'lineage-id(INFRAORDER)', 'lineage-id(PARVORDER)',
85 |     'lineage-id(SUPERFAMILY)', 'lineage-id(FAMILY)', 'lineage-id(SUBFAMILY)', 'lineage-id(TRIBE)',
86 |     'lineage-id(SUBTRIBE)', 'lineage-id(GENUS)', 'lineage-id(SUBGENUS)', 'lineage-id(SPECIES GROUP)',
87 |     'lineage-id(SPECIES SUBGROUP)', 'lineage-id(SPECIES)', 'lineage-id(SUBSPECIES)', 'lineage-id(VARIETAS)',
88 |     'lineage-id(FORMA)']
89 | 
90 | 
91 | def get_uniprot_info_from_ids(ids: list, filename: Path, chunk=False, identifier: str = "ACC+ID", to: str = "ACC",
92 |                               columns: str = ",".join(COLUMN_NAMES)):
93 |     """
94 |     Batch retrieval of IDs and information from UniProt.
95 | 
96 |     Parameters
97 |     ----------
98 |     ids
99 |         input IDs
100 |     filename
101 |         write to this file
102 |     chunk
103 |         split into multiple queries of size 100 and join results
104 |     identifier
105 |         type of input IDs
106 |     to
107 |         output ID format - ACC returns all column information
108 |     columns
109 |         column names to return, preformatted string (",".join(column_names))
110 | 
111 | 
112 |     Returns
113 |     -------
114 |     None; all information is written to `filename` as newline-separated, tab-delimited text.
115 |     """
116 |     mapping_url = 'http://www.uniprot.org/uploadlists/'
117 |     mapping_params = {
118 |         'from': identifier,
119 |         'to': to,
120 |         'format': 'tab',
121 |         'columns': columns
122 |     }
123 |     with open(filename, "w") as f:
124 |         if chunk:
125 |             num_tries = 5
126 |             for i, id_i in tqdm(enumerate(range(0, len(ids), 100))):
127 |                 id_chunk = ids[id_i: id_i + 100]
128 |                 good_text = False
129 |                 text = ""
130 |                 try_number = 0
131 |                 while not good_text and try_number < num_tries:
132 |                     mapping_params['query'] = ' '.join(id_chunk)
133 |                     response = requests.post(mapping_url, params=mapping_params)
134 |                     text = response.text
135 |                     if "<html" in text:  # a failed query returns an HTML error page instead of tab-separated text; retry
136 |                         good_text = False
137 |                     else:
138 |                         good_text = True
139 |                     try_number += 1
140 |                 if i == 0:
141 |                     f.write(text)
142 |                 else:
143 |                     f.write("\n".join(text.split("\n")[1:]))
144 |         else:
145 |             mapping_params['query'] = ' '.join(ids)
146 |             response = requests.post(mapping_url, params=mapping_params)
147 |             f.write(response.text)
148 | 
--------------------------------------------------------------------------------
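For orientation, a minimal sketch of how the pieces above fit together: reading one shapemer corpus file written by make_data.get_AF_shapemers and fitting the same TFIDF + NMF topic model as 2-topic-modelling.ipynb (which additionally normalizes the W matrix and embeds it with openTSNE). The corpus path, the topic inspected, and the final print are illustrative assumptions, not part of the repository.

# Hypothetical usage sketch (not a file in the original repository).
from pathlib import Path

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# One protein per line: "ID<tab>space-separated shapemer tokens",
# e.g. "AF-P12345-F1-model_v1.pdb<tab>k3i2i1i0 r5i4i2i1 ..." (ID is hypothetical).
corpus_file = Path("data/AF_ids_corpus_resolution_4_6_threshold_50.txt")  # assumed location
keys, documents = [], []
for line in corpus_file.open():
    key, shapemers = line.strip().split("\t")
    keys.append(key)
    documents.append(shapemers)

# Shapemer tokens are plain alphanumeric "words", so the default tokenizer applies;
# min_df=2 drops shapemers seen in only one protein, as in the notebooks.
vectorizer = TfidfVectorizer(min_df=2)
tfidf_matrix = vectorizer.fit_transform(documents)  # proteins x shapemer vocabulary

# 250 topics as in 2-topic-modelling.ipynb; the notebook's alpha/l1_ratio
# regularization arguments are omitted here because they changed in sklearn >= 1.2.
topic_model = NMF(n_components=250, random_state=42, max_iter=500)
w_matrix = topic_model.fit_transform(tfidf_matrix)  # proteins x topics
h_matrix = topic_model.components_                  # topics x shapemers

# Inspect the ten highest-weighted shapemers of the first topic.
vocabulary = vectorizer.get_feature_names_out()  # get_feature_names() on sklearn < 1.0
top_indices = h_matrix[0].argsort()[::-1][:10]
print([vocabulary[i] for i in top_indices])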