├── .gitignore ├── 1-preparing-data.ipynb ├── 2-topic-modelling.ipynb ├── 3-analysis.ipynb ├── README.md └── src ├── __init__.py ├── make_data.py ├── plotting.py ├── proteinnet_parser.py └── uniprot_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /1-preparing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from src import make_data\n", 20 | "from pathlib import Path" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": { 27 | "pycharm": { 28 | "name": "#%%\n" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# Folder to store all the data\n", 34 | "DATA_FOLDER = Path(\"/path/to/data/folder\")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "pycharm": { 41 | "name": "#%% md\n" 42 | } 43 | }, 44 | "source": [ 45 | "Download AlphaFold (AF) database proteins as tar files and extract" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "pycharm": { 53 | "name": "#%%\n" 54 | } 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "make_data.download_data(DATA_FOLDER)\n", 59 | "make_data.extract_data(DATA_FOLDER, DATA_FOLDER)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "pycharm": { 66 | "name": "#%% md\n" 67 | } 68 | }, 69 | "source": [ 70 | "Download UniProt annotations for all AF proteins, split by species" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "pycharm": { 78 | "name": "#%%\n" 79 | } 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "uniprot_folder = DATA_FOLDER / \"uniprot_files\"\n", 84 | "if not uniprot_folder.exists():\n", 85 | " uniprot_folder.mkdir()\n", 86 | "make_data.get_uniprot_info(DATA_FOLDER, uniprot_folder)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "pycharm": { 93 | "name": "#%% md\n" 94 | } 95 | }, 96 | "source": [ 97 | "Get average pLDDT scores, number of high confidence residues, and total number of residues for each AF protein" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "metadata": { 104 | "pycharm": { 105 | "name": "#%%\n" 106 | }, 107 | "scrolled": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | "3988it [01:33, 42.74it/s]\n", 115 | "19694it [10:20, 31.73it/s]\n", 116 | "12622it [09:22, 22.44it/s]\n", 117 | "23391it [21:38, 18.01it/s]\n", 118 | "27434it [16:18, 28.05it/s]\n", 119 | "39299it [22:24, 29.22it/s]\n", 120 | "4363it [02:29, 29.27it/s]\n", 121 | "13458it [09:59, 22.45it/s]\n", 122 | "1773it [00:51, 34.39it/s]\n", 123 | "5187it [05:02, 17.14it/s]\n", 124 | "19036it [13:44, 23.09it/s]\n", 125 | "6040it [04:20, 23.15it/s]\n", 126 | "5128it [03:46, 22.66it/s]\n", 127 | "21272it [16:47, 21.10it/s]\n", 128 | "7924it [07:17, 18.10it/s]\n", 129 | "2888it [01:25, 33.86it/s]\n", 130 | "55799it [32:34, 28.55it/s]\n", 131 | "5974it [04:12, 23.64it/s]]\n", 132 | 
"21615it [15:28, 23.29it/s]\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "avg_scores, lengths_high_confidence, lengths_full = make_data.get_AF_protein_information(DATA_FOLDER)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "uniprot_folder = DATA_FOLDER / \"uniprot_go\"" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "pycharm": { 153 | "name": "#%% md\n" 154 | } 155 | }, 156 | "source": [ 157 | "Combine all UniProt data and scores into a dataframe" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 9, 163 | "metadata": { 164 | "pycharm": { 165 | "name": "#%%\n" 166 | } 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stderr", 171 | "output_type": "stream", 172 | "text": [ 173 | "/mnt/backup2/geometric/geo_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3357: DtypeWarning: Columns (5) have mixed types.Specify dtype option on import or set low_memory=False.\n", 174 | " if (await self.run_code(code, result, async_=asy)):\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "import pandas as pnd\n", 180 | "AF_dataframe = pnd.concat([pnd.read_csv(filename, sep=\"\\t\") for filename in uniprot_folder.glob(\"UP*_uniprot.txt\")])\n", 181 | "AF_dataframe[\"Protein family\"] = [str(val).split(\",\")[0] for val in AF_dataframe[\"Protein families\"]] # Superfamily\n", 182 | "AF_dataframe[\"Organism\"] = [\" \".join(str(val).split(\" (\")[0].split(\" \")[:2]) for val in AF_dataframe[\"Organism\"]]\n", 183 | "AF_dataframe[\"ID\"] = [f\"AF-{k}-F1-model_v1.pdb\" for k in AF_dataframe[\"Entry\"]]\n", 184 | "AF_dataframe[\"Avg. score\"] = [avg_scores[key] if key in avg_scores else 40 for key in AF_dataframe[\"ID\"]]\n", 185 | "AF_dataframe[\"Length\"] = [lengths_full[key] if key in lengths_full else 0 for key in AF_dataframe[\"ID\"]]\n", 186 | "AF_dataframe[\"High confidence length\"] = [lengths_high_confidence[key] if key in lengths_high_confidence else 0 for key in AF_dataframe[\"ID\"]]\n", 187 | "\n", 188 | "AF_dataframe = AF_dataframe[[c for c in AF_dataframe.columns if not c.startswith(\"yourlist\")]]\n", 189 | "AF_dataframe.to_csv(DATA_FOLDER / \"AF_dataframe.txt\", sep=\"\\t\")\n", 190 | "AF_dataframe = AF_dataframe.set_index(\"ID\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "pycharm": { 197 | "name": "#%% md\n" 198 | } 199 | }, 200 | "source": [ 201 | "Calculate shapemers for each AF protein" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "pycharm": { 209 | "name": "#%%\n" 210 | } 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "make_data.get_AF_shapemers(DATA_FOLDER)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "pycharm": { 221 | "name": "#%% md\n" 222 | } 223 | }, 224 | "source": [ 225 | "Download and extract CASP12 data from\n", 226 | "`https://sharehost.hms.harvard.edu/sysbio/alquraishi/proteinnet/human_readable/casp12.tar.gz`\n", 227 | "into DATA_FOLDER / casp12\n", 228 | "\n", 229 | "Calculate shapemers for all CASP12 proteins" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "pycharm": { 237 | "name": "#%%\n" 238 | } 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "make_data.get_PDB_shapemers(DATA_FOLDER / \"casp12\" / \"training_100\",\n", 243 | " DATA_FOLDER)\n", 244 | 
"make_data.get_PDB_shapemers(DATA_FOLDER / \"casp12\" / \"validation\",\n", 245 | " DATA_FOLDER)\n", 246 | "make_data.get_PDB_shapemers(DATA_FOLDER / \"casp12\" / \"testing\",\n", 247 | " DATA_FOLDER)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "pycharm": { 254 | "name": "#%% md\n" 255 | } 256 | }, 257 | "source": [ 258 | "Get UniProt annotations for all PDB proteins" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 16, 264 | "metadata": { 265 | "pycharm": { 266 | "name": "#%%\n" 267 | } 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stderr", 272 | "output_type": "stream", 273 | "text": [ 274 | "1043it [16:36, 1.05it/s]\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "import itertools\n", 280 | "from src import uniprot_parser\n", 281 | "import pickle\n", 282 | "\n", 283 | "corpus_files = DATA_FOLDER.glob(\"*_ids_corpus_res4_6*.txt\")\n", 284 | "keys = (line.strip().split(\"\\t\")[0] for line in itertools.chain.from_iterable((open(file) for file in corpus_files)))\n", 285 | "pdb_ids = []\n", 286 | "for k in keys:\n", 287 | " if k.endswith(\".pdb\"):\n", 288 | " continue\n", 289 | " if \"#\" in k:\n", 290 | " if \"TBM\" in k or \"FM\" in k:\n", 291 | " continue\n", 292 | " k = k.split(\"#\")[1][:4]\n", 293 | " else:\n", 294 | " k = k[:4]\n", 295 | " pdb_ids.append(k)\n", 296 | "uniprot_parser.get_uniprot_info_from_ids(pdb_ids,\n", 297 | " DATA_FOLDER / \"uniprot_go\" / \"casp12_uniprot.txt\",\n", 298 | " identifier=\"PDB_ID\",\n", 299 | " columns=make_data.UNIPROT_COLUMNS,\n", 300 | " chunk=True)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 11, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "import pickle" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "pycharm": { 317 | "name": "#%%\n" 318 | } 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "coords = make_data.get_PDB_protein_information([DATA_FOLDER / \"casp12\" / f for f in [\"training_100\",\n", 323 | " \"validation\",\n", 324 | " \"testing\"]])" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 12, 330 | "metadata": { 331 | "pycharm": { 332 | "name": "#%%\n" 333 | } 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "with open(DATA_FOLDER / \"PDB_coords.pkl\", \"wb\") as f:\n", 338 | " pickle.dump(coords, f)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 17, 344 | "metadata": { 345 | "pycharm": { 346 | "name": "#%%\n" 347 | } 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "PDB_dataframe = pnd.read_csv(DATA_FOLDER / \"uniprot_go\" / \"casp12_uniprot.txt\", sep=\"\\t\")\n", 352 | "mapping_column = [c for c in PDB_dataframe.columns if c.startswith(\"yourlist\")][0]\n", 353 | "PDB_dataframe[\"PDB_ID\"] = PDB_dataframe[mapping_column]\n", 354 | "PDB_dataframe[\"Protein family\"] = [str(val).split(\",\")[0] for val in PDB_dataframe[\"Protein families\"]] # Superfamily\n", 355 | "PDB_dataframe[\"Organism\"] = [\" \".join(str(val).split(\" (\")[0].split(\" \")[:2]) for val in PDB_dataframe[\"Organism\"]]\n", 356 | "PDB_dataframe = PDB_dataframe[[c for c in PDB_dataframe.columns if c != mapping_column]]" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "pycharm": { 363 | "name": "#%% md\n" 364 | } 365 | }, 366 | "source": [ 367 | "Match AF proteins with previously determined PDB proteins" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | 
"execution_count": 18, 373 | "metadata": { 374 | "pycharm": { 375 | "name": "#%%\n" 376 | } 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "import numpy as np\n", 381 | "from collections import defaultdict\n", 382 | "\n", 383 | "AF_PDB_cross_references = AF_dataframe['Cross-reference (PDB)'][AF_dataframe['Cross-reference (PDB)'].notna()]\n", 384 | "AF_PDB_mapping = {key: AF_PDB_cross_references[key] for key in AF_PDB_cross_references.keys()}\n", 385 | "PDB_AF_mapping = defaultdict(list)\n", 386 | "for p in AF_PDB_mapping:\n", 387 | " if type(AF_PDB_mapping[p]) == str:\n", 388 | " for p1 in AF_PDB_mapping[p][:-1].split(\";\"):\n", 389 | " PDB_AF_mapping[p1].append(p)\n", 390 | " else:\n", 391 | " for p1 in AF_PDB_mapping[p].values:\n", 392 | " PDB_AF_mapping[p1[:-1]].append(p)\n", 393 | "\n", 394 | "PDB_dataframe[\"AF\"] = [\";\".join(PDB_AF_mapping[p]) if p in PDB_AF_mapping else np.nan for p in PDB_dataframe[\"PDB_ID\"]]\n", 395 | "PDB_dataframe.to_csv(DATA_FOLDER / \"PDB_dataframe.txt\", sep=\"\\t\")" 396 | ] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": "geo_env", 402 | "language": "python", 403 | "name": "geo_env" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.8.8" 416 | } 417 | }, 418 | "nbformat": 4, 419 | "nbformat_minor": 1 420 | } 421 | -------------------------------------------------------------------------------- /2-topic-modelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 12 | "from sklearn.preprocessing import StandardScaler\n", 13 | "import itertools\n", 14 | "from pathlib import Path\n", 15 | "from sklearn.decomposition import NMF\n", 16 | "import openTSNE\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "pycharm": { 25 | "name": "#%%\n" 26 | } 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "DATA_FOLDER = Path(\"data\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "pycharm": { 37 | "name": "#%% md\n" 38 | } 39 | }, 40 | "source": [ 41 | "Loading all corpus files:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "pycharm": { 49 | "name": "#%%\n" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "corpus_files = DATA_FOLDER.glob(\"*_ids_corpus_resolution_4_6*.txt\")\n", 55 | "keys_corpus = (line.strip().split(\"\\t\") for line in itertools.chain.from_iterable((open(file) for file in corpus_files)))\n", 56 | "keys, corpus = itertools.tee(keys_corpus)\n", 57 | "keys = [k[0] for k in keys]\n", 58 | "corpus = (k[1] for k in corpus)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "pycharm": { 65 | "name": "#%% md\n" 66 | } 67 | }, 68 | "source": [ 69 | "Calculating the TFIDF matrix:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "pycharm": { 77 | "name": "#%%\n" 78 | } 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "print(f\"Getting TFIDF matrix for {len(keys)} 
proteins...\")\n", 83 | "vectorizer = TfidfVectorizer(min_df=2)\n", 84 | "tfidf_matrix = vectorizer.fit_transform(corpus)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Fitting NMF model:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "pycharm": { 99 | "name": "#%%\n" 100 | } 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "num_topics = 250\n", 105 | "topic_model = NMF(n_components=num_topics,\n", 106 | " random_state=42,\n", 107 | " solver='cd', tol=0.0005,\n", 108 | " max_iter=500,\n", 109 | " alpha=.1,\n", 110 | " l1_ratio=.5,\n", 111 | " verbose=1)\n", 112 | "w_matrix = topic_model.fit_transform(tfidf_matrix)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Normalizing $W$ matrix for plotting:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "pycharm": { 127 | "name": "#%%\n" 128 | } 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "scaler = StandardScaler()\n", 133 | "w_matrix_norm = scaler.fit_transform(w_matrix)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Fitting t-SNE model initialized with PCA on $W$ matrix:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "pycharm": { 148 | "name": "#%%\n" 149 | } 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "tsne_reducer = openTSNE.TSNE(\n", 154 | " perplexity=50,\n", 155 | " initialization=\"pca\",\n", 156 | " metric=\"cosine\",\n", 157 | " n_jobs=14,\n", 158 | " random_state=42,\n", 159 | " n_iter=1000,\n", 160 | " verbose=True\n", 161 | " )\n", 162 | "reduced = tsne_reducer.fit(w_matrix_norm)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Saving everything:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "pycharm": { 177 | "name": "#%%\n" 178 | } 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "with open(DATA_FOLDER / \"topic_modelling_data.pkl\", \"wb\") as f:\n", 183 | " pickle.dump((keys,\n", 184 | " vectorizer, tfidf_matrix,\n", 185 | " topic_model, w_matrix,\n", 186 | " scaler, w_matrix_norm,\n", 187 | " tsne_reducer, reduced), f)" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.8.8" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 1 212 | } 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Structural fold space for AF2 predicted models across 21 proteomes 2 | 3 | Topic modelling of structural shape-mers to explore differences between the [AlphaFold DB](https://alphafold.ebi.ac.uk/) and the [PDB](https://www.rcsb.org/) 4 | 5 | ## Requirements 6 | 7 | * numpy 8 | * scipy 9 | * scikit-learn 10 | * geometricus (https://github.com/TurtleTools/geometricus) 11 | * portein (https://github.com/TurtleTools/portein) 12 | * 
kneed (https://github.com/arvkevi/kneed)
13 | * pandas
14 | * prody
15 | * tqdm
16 | * openTSNE
17 | * matplotlib
18 | * requests
19 | 
20 | 
21 | ## Publications
22 | 
23 | Akdel, M., Pires, D.E., Pardo, E.P., Jänes, J., Zalevsky, A.O., Mészáros, B., Bryant, P., Good, L.L., Laskowski, R.A., Pozzati, G. and Shenoy, A., 2021. A structural biology community assessment of AlphaFold 2 applications. bioRxiv.
24 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TurtleTools/alphafold-structural-space/d4baeb461ba76bd3c684418920e9ef78e7f45480/src/__init__.py
--------------------------------------------------------------------------------
/src/make_data.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from ftplib import FTP
3 | import prody as pd
4 | from dataclasses import dataclass
5 | import numpy as np
6 | import typing as ty
7 | from geometricus import MomentInvariants, SplitType
8 | import tarfile
9 | from time import time
10 | from tqdm import tqdm
11 | from scipy import ndimage
12 | 
13 | from src import uniprot_parser, proteinnet_parser
14 | 
15 | UNIPROT_COLUMNS = ",".join(("id", "entry name", 'genes', 'genes(PREFERRED)', 'genes(ALTERNATIVE)',
16 |                             'genes(OLN)', 'genes(ORF)', "organism", "protein names", "families",
17 |                             'go', 'go(biological process)', 'go(molecular function)',
18 |                             'go(cellular component)', 'database(PDB)', 'database(Pfam)'))
19 | 
20 | 
21 | @dataclass
22 | class MomentInvariantsSavable:
23 |     name: str
24 |     moments: ty.Union[np.ndarray, None]
25 |     coordinates: ty.Union[np.ndarray, None]
26 | 
27 |     @classmethod
28 |     def from_invariant(cls, invariant: MomentInvariants):
29 |         return cls(invariant.name, invariant.moments, invariant.coordinates)
30 | 
31 | 
32 | def download_data(output_folder: Path):
33 |     if not output_folder.exists():
34 |         output_folder.mkdir()
35 |     ftp = FTP('ftp.ebi.ac.uk')
36 |     ftp.login()
37 |     ftp.cwd("/pub/databases/alphafold")
38 |     for filename in ftp.nlst():
39 |         print(f"Retrieving {filename}")
40 |         with open(output_folder / filename, 'wb') as f:
41 |             ftp.retrbinary('RETR ' + filename, f.write)
42 | 
43 | 
44 | def extract_data(input_folder, output_folder):
45 |     for filename in input_folder.glob("*.tar"):
46 |         tar = tarfile.open(str(filename))
47 |         folder = output_folder / str(filename.stem)
48 |         if not folder.exists():
49 |             folder.mkdir()
50 |         tar.extractall(str(folder))  # specify which folder to extract to
51 |         tar.close()
52 | 
53 | 
54 | def get_uniprot_info(data_folder, aux_folder, extension="pdb.gz"):
55 |     start_time = time()
56 |     for folder in data_folder.iterdir():
57 |         if folder.is_dir() and folder.stem.startswith("UP0"):
58 |             uniprot_file = aux_folder / f"{folder.stem}_uniprot.txt"
59 |             uniprot_ids = [filename.stem.split("-")[1] for filename in folder.glob(f"*{extension}")]
60 |             if not uniprot_file.exists():
61 |                 uniprot_parser.get_uniprot_info_from_ids(uniprot_ids, uniprot_file, chunk=True,
62 |                                                          columns=UNIPROT_COLUMNS)
63 |             print(f"{folder.stem}: Time elapsed: {time() - start_time}s")
64 | 
65 | 
66 | def get_AF_shapemers(root_folder,
67 |                      resolution_kmer=4,
68 |                      resolution_radius=6,
69 |                      length_threshold=50):
70 |     root_folder = Path(root_folder)
71 |     with open(
72 |             root_folder /
73 |             f"AF_ids_corpus_resolution_{resolution_kmer}_{resolution_radius}_threshold_{length_threshold}.txt",
74 |             "w") as corpus_file:
75 |         for folder in root_folder.iterdir():
76 |             if folder.is_dir() and folder.stem.startswith("UP0"):
77 
| print(folder.stem) 78 | for i, pdb_file in tqdm(enumerate(folder.glob("*.pdb.gz"))): 79 | key = pdb_file.stem 80 | pdb = pd.parsePDB(str(pdb_file)).select("protein and calpha") 81 | betas = ndimage.gaussian_filter1d(pdb.getBetas(), sigma=5) 82 | coords = pdb.getCoords() 83 | sequence = pdb.getSequence() 84 | 85 | indices = np.ones(betas.shape[0], dtype=int) 86 | indices[np.where(betas < 70)] = 0 87 | 88 | slices = ndimage.find_objects(ndimage.label(indices)[0]) 89 | index = 0 90 | shapemers = [] 91 | for s in slices: 92 | s = s[0] 93 | if s.stop - s.start > length_threshold: 94 | index += 1 95 | invariants = MomentInvariants.from_coordinates( 96 | key, 97 | coords[s.start: s.stop], 98 | sequence[s.start: s.stop], 99 | split_type=SplitType.KMER_CUT, 100 | split_size=16 101 | ) 102 | shapemers += [f"k{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 103 | (np.log1p(invariants.moments) * resolution_kmer).astype(int)] 104 | invariants = MomentInvariants.from_coordinates( 105 | key, 106 | coords[s.start: s.stop], 107 | sequence[s.start: s.stop], 108 | split_type=SplitType.RADIUS, 109 | split_size=10 110 | ) 111 | shapemers += [f"r{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 112 | (np.log1p(invariants.moments) * resolution_radius).astype(int)] 113 | if index > 0: 114 | corpus_file.write(key + "\t" + " ".join(shapemers) + "\n") 115 | 116 | 117 | def get_PDB_shapemers(casp_file, root_folder, resolution_kmer=4, resolution_radius=6): 118 | with open(Path(root_folder) / f"PDB_{casp_file.stem}_ids_corpus_resolution_{resolution_kmer}_{resolution_radius}.txt", 119 | "w") as corpus_file: 120 | for entry in tqdm(proteinnet_parser.yield_records_from_file(casp_file, 20)): 121 | entry = proteinnet_parser.clean_entry(entry, 'ca') 122 | invariants = MomentInvariants.from_coordinates( 123 | entry["ID"], 124 | entry["tertiary"], 125 | entry["primary"], 126 | split_type=SplitType.KMER_CUT, 127 | split_size=16 128 | ) 129 | shapemers = [f"k{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 130 | (np.log1p(invariants.moments) * resolution_kmer).astype(int)] 131 | invariants = MomentInvariants.from_coordinates( 132 | entry["ID"], 133 | entry["tertiary"], 134 | entry["primary"], 135 | split_type=SplitType.RADIUS, 136 | split_size=10 137 | ) 138 | shapemers += [f"r{x[0]}i{x[1]}i{x[2]}i{x[3]}" for x in 139 | (np.log1p(invariants.moments) * resolution_radius).astype(int)] 140 | if len(shapemers): 141 | corpus_file.write(entry["ID"] + "\t" + " ".join(shapemers) + "\n") 142 | 143 | 144 | def get_AF_protein_information(data_folder): 145 | data_folder = Path(data_folder) 146 | avg_scores = {} 147 | lengths_high_confidence = {} 148 | lengths_full = {} 149 | for folder in data_folder.iterdir(): 150 | if folder.is_dir() and folder.stem.startswith("UP0"): 151 | for filename in tqdm(folder.glob("*.pdb.gz")): 152 | pdb = pd.parsePDB(str(filename)) 153 | if pdb is None: 154 | continue 155 | key = filename.stem 156 | avg_scores[key] = np.median(pdb.getBetas()) 157 | pdb = pdb.select("protein and calpha") 158 | if pdb is None: 159 | continue 160 | lengths_full[key] = len(pdb) 161 | pdb = pdb.select("beta > 70") 162 | if pdb is None: 163 | continue 164 | lengths_high_confidence[key] = len(pdb) 165 | return avg_scores, lengths_high_confidence, lengths_full 166 | 167 | 168 | def get_PDB_protein_information(casp_files): 169 | coords = {} 170 | for casp_file in casp_files: 171 | for entry in tqdm(proteinnet_parser.yield_records_from_file(casp_file, 20)): 172 | entry = proteinnet_parser.clean_entry(entry, 'ca') 173 | coords[entry["ID"]] = entry["tertiary"] 174 | 
return coords 175 | -------------------------------------------------------------------------------- /src/plotting.py: -------------------------------------------------------------------------------- 1 | from portein import get_best_transformation, apply_transformation, find_size 2 | import prody as pd 3 | from geometricus import MomentInvariants, SplitType 4 | from scipy.signal import resample 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_coords_topic_scores(coords, topic_id, h_matrix_norm, shapemer_to_index): 10 | def shapemer_to_topic_value(s_string): 11 | if s_string in shapemer_to_index: 12 | return h_matrix_norm[topic_id][shapemer_to_index[s_string]] 13 | else: 14 | return 0 15 | 16 | resolution_kmer = 4 17 | resolution_radius = 6 18 | weights = np.zeros(coords.shape[0]) 19 | protein_invariants = MomentInvariants.from_coordinates("protein_id", 20 | coords, 21 | None, 22 | split_type=SplitType.KMER_CUT) 23 | 24 | def get_similarity(x1, x2, gamma=0.03): 25 | return np.exp(-gamma * np.sum((coords[x1] - coords[x2]) ** 2, axis=-1)) 26 | 27 | shapemers = (np.log1p(protein_invariants.moments) * resolution_kmer).astype(int) 28 | for i, x in enumerate(shapemers): 29 | weight = shapemer_to_topic_value(f"k{x[0]}i{x[1]}i{x[2]}i{x[3]}") 30 | for index in range(i, i + 16): 31 | weights[i] += weight * get_similarity(i, index) 32 | protein_invariants = MomentInvariants.from_coordinates( 33 | "protein_id", 34 | coords, 35 | None, 36 | split_type=SplitType.RADIUS, 37 | split_size=10) 38 | shapemers = (np.log1p(protein_invariants.moments) * resolution_radius).astype(int) 39 | for i, x in enumerate(shapemers): 40 | weight = shapemer_to_topic_value(f"r{x[0]}i{x[1]}i{x[2]}i{x[3]}") 41 | for index in protein_invariants.split_indices[i]: 42 | weights[index] += weight * get_similarity(i, index) 43 | return weights 44 | 45 | def get_protein_topic_scores(path, topic_id, h_matrix_norm, shapemer_to_index, matplotlib=True): 46 | pdb = pd.parsePDB(str(path)) 47 | pdb_alpha = pdb.select("protein and calpha") 48 | opacities = pdb_alpha.getBetas() / 100 49 | coords = pdb_alpha.getCoords() 50 | weights = get_coords_topic_scores(coords, topic_id, h_matrix_norm, shapemer_to_index) 51 | if matplotlib: 52 | coords = apply_transformation(coords, get_best_transformation(coords)) 53 | return coords, weights, opacities 54 | else: 55 | matrix = get_best_transformation(coords) 56 | pdb = pd.applyTransformation(pd.Transformation(matrix), pdb) 57 | for i, res in enumerate(pdb.iterResidues()): 58 | res.setBetas([weights[i]] * len(res)) 59 | return pdb 60 | 61 | 62 | def plot_protein(coords, weights, opacities, max_value, upsample_rate=3): 63 | coords = resample(coords[:, :2], upsample_rate * coords.shape[0]) 64 | weights = np.repeat(weights, upsample_rate) 65 | opacities = np.repeat(opacities, upsample_rate) 66 | colors = [plt.cm.coolwarm(int(256 * (x / max_value))) for x in weights] 67 | fig, ax = plt.subplots(figsize=find_size(coords, height=5, width=None)) 68 | for i in range(coords.shape[0] - upsample_rate): 69 | ax.plot(coords[:, 0][i:i + 2], coords[:, 1][i:i + 2], 70 | lw=2, color=colors[i], alpha=opacities[i]) 71 | plt.axis("off") 72 | return fig, ax 73 | -------------------------------------------------------------------------------- /src/proteinnet_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text-based parser for ProteinNet Records. 
3 | """ 4 | 5 | __author__ = "Mohammed AlQuraishi" 6 | __copyright__ = "Copyright 2019, Harvard Medical School" 7 | __license__ = "MIT" 8 | 9 | # !/usr/bin/python 10 | 11 | # imports 12 | import sys 13 | import re 14 | import numpy as np 15 | from itertools import groupby 16 | 17 | # Constants 18 | NUM_DIMENSIONS = 3 19 | 20 | # Functions for conversion from Mathematica protein files to TFRecords 21 | _aa_dict = {'A': '0', 'C': '1', 'D': '2', 'E': '3', 'F': '4', 'G': '5', 'H': '6', 'I': '7', 'K': '8', 'L': '9', 'M': '10', 'N': '11', 'P': '12', 'Q': '13', 'R': '14', 'S': '15', 'T': '16', 'V': '17', 'W': '18', 'Y': '19'} 22 | _dssp_dict = {'L': '0', 'H': '1', 'B': '2', 'E': '3', 'G': '4', 'I': '5', 'T': '6', 'S': '7'} 23 | _mask_dict = {'-': '0', '+': '1'} 24 | 25 | 26 | def letter_to_num(string, dict_): 27 | """ Convert string of letters to list of ints """ 28 | patt = re.compile('[' + ''.join(dict_.keys()) + ']') 29 | num_string = patt.sub(lambda m: dict_[m.group(0)] + ' ', string) 30 | return [int(i) for i in num_string.split()] 31 | 32 | 33 | def yield_records_from_file(file, num_evo_entries: int = 20): 34 | def get_record(lines): 35 | entry = {"ID": lines[0].strip()} 36 | for i, line in enumerate(lines): 37 | if line == '[PRIMARY]' + '\n': 38 | primary = lines[i + 1].strip() 39 | entry.update({'primary': primary}) 40 | elif line == '[EVOLUTIONARY]' + '\n': 41 | evolutionary = [] 42 | for residue in range(num_evo_entries): 43 | evolutionary.append( 44 | [float(step) for step in lines[i + 1].strip().split()] 45 | ) 46 | entry.update({'evolutionary': np.array(evolutionary)}) 47 | elif line == '[SECONDARY]' + '\n': 48 | secondary = letter_to_num(lines[i + 1].strip(), _dssp_dict) 49 | entry.update({'secondary': secondary}) 50 | elif line == '[TERTIARY]' + '\n': 51 | tertiary = [] 52 | for axis in range(NUM_DIMENSIONS): 53 | tertiary.append([float(coord) for coord in lines[i + 1 + axis].strip().split()]) 54 | entry.update({'tertiary': np.array(tertiary).T}) 55 | elif line == '[MASK]' + '\n': 56 | mask = letter_to_num(lines[i + 1].strip(), _mask_dict) 57 | entry.update({'mask': mask}) 58 | else: 59 | continue 60 | return entry 61 | 62 | for k, g in groupby(open(file, "r"), lambda x: x.startswith("[ID]")): 63 | if not k: 64 | yield get_record(list(g)) 65 | 66 | 67 | def clean_entry(entry, atom="ca"): 68 | sequence = "primary" 69 | mask = np.where(np.array(entry['mask']) == 1)[0] 70 | entry[sequence] = ''.join(entry[sequence][x] for x in mask) 71 | mask_3d = np.array([i for n in mask for i in range(n*3, n*3+3)]).astype(int) 72 | entry['tertiary'] = entry['tertiary'][mask_3d] 73 | if atom == "ca": 74 | index = 1 75 | elif atom == "n": 76 | index = 0 77 | elif atom == "cb": 78 | index = 2 79 | else: 80 | raise ValueError("atom must be one of n, ca, cb") 81 | entry['tertiary'] = entry['tertiary'][np.arange(index, entry['tertiary'].shape[0]+index, 3)] / 100 82 | assert entry['tertiary'].shape[0] == len(entry[sequence]), (entry['tertiary'].shape[0], len(entry[sequence])) 83 | return entry 84 | -------------------------------------------------------------------------------- /src/uniprot_parser.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from tqdm import tqdm 3 | from pathlib import Path 4 | 5 | # Cross References 6 | DB_ABBREVS = ["database(EMBL)"] + ["database(" + line.strip().split(": ")[1] + ")" for line in 7 | requests.get("https://www.uniprot.org/docs/dbxref.txt").text.split("\n") if 8 | "Abbrev:" in line] 9 | 10 | COLUMN_NAMES = [ 
11 |     # Names & Taxonomy
12 |     'id', 'entry name', 'genes', 'genes(PREFERRED)', 'genes(ALTERNATIVE)',
13 |     'genes(OLN)', 'genes(ORF)', 'organism', 'organism-id', 'protein names',
14 |     'proteome', 'lineage(ALL)', 'lineage-id', 'virus hosts',
15 |     # Sequences
16 |     'fragment', 'sequence', 'length', 'mass', 'encodedon',
17 |     'comment(ALTERNATIVE PRODUCTS)', 'comment(ERRONEOUS GENE MODEL PREDICTION)',
18 |     'comment(ERRONEOUS INITIATION)', 'comment(ERRONEOUS TERMINATION)',
19 |     'comment(ERRONEOUS TRANSLATION)', 'comment(FRAMESHIFT)',
20 |     'comment(MASS SPECTROMETRY)', 'comment(POLYMORPHISM)',
21 |     'comment(RNA EDITING)', 'comment(SEQUENCE CAUTION)',
22 |     'feature(ALTERNATIVE SEQUENCE)', 'feature(NATURAL VARIANT)',
23 |     'feature(NON ADJACENT RESIDUES)',
24 |     'feature(NON STANDARD RESIDUE)', 'feature(NON TERMINAL RESIDUE)',
25 |     'feature(SEQUENCE CONFLICT)', 'feature(SEQUENCE UNCERTAINTY)',
26 |     'version(sequence)',
27 |     # Family and Domains
28 |     'domains', 'domain', 'comment(DOMAIN)', 'comment(SIMILARITY)',
29 |     'feature(COILED COIL)', 'feature(COMPOSITIONAL BIAS)',
30 |     'feature(DOMAIN EXTENT)', 'feature(MOTIF)', 'feature(REGION)',
31 |     'feature(REPEAT)', 'feature(ZINC FINGER)',
32 |     # Function
33 |     'ec', 'comment(ABSORPTION)', 'comment(CATALYTIC ACTIVITY)',
34 |     'comment(COFACTOR)', 'comment(ENZYME REGULATION)', 'comment(FUNCTION)',
35 |     'comment(KINETICS)', 'comment(PATHWAY)', 'comment(REDOX POTENTIAL)',
36 |     'comment(TEMPERATURE DEPENDENCE)', 'comment(PH DEPENDENCE)',
37 |     'feature(ACTIVE SITE)', 'feature(BINDING SITE)', 'feature(DNA BINDING)',
38 |     'feature(METAL BINDING)', 'feature(NP BIND)', 'feature(SITE)',
39 |     # Gene Ontology
40 |     'go', 'go(biological process)', 'go(molecular function)',
41 |     'go(cellular component)', 'go-id',
42 |     # InterPro
43 |     'interpro',
44 |     # Interaction
45 |     'interactor', 'comment(SUBUNIT)',
46 |     # Publications
47 |     'citation', 'citationmapping',
48 |     # Date of
49 |     'created', 'last-modified', 'sequence-modified', 'version(entry)',
50 |     # Structure
51 |     '3d', 'feature(BETA STRAND)', 'feature(HELIX)', 'feature(TURN)',
52 |     # Subcellular location
53 |     'comment(SUBCELLULAR LOCATION)', 'feature(INTRAMEMBRANE)',
54 |     'feature(TOPOLOGICAL DOMAIN)',
55 |     'feature(TRANSMEMBRANE)',
56 |     # Miscellaneous
57 |     'annotation score', 'score', 'features', 'comment(CAUTION)',
58 |     'comment(TISSUE SPECIFICITY)',
59 |     'comment(GENERAL)', 'keywords', 'context', 'existence', 'tools',
60 |     'reviewed', 'feature', 'families', 'subcellular locations', 'taxonomy',
61 |     'version', 'clusters', 'comments', 'database', 'keyword-id', 'pathway',
62 |     'score',
63 |     # Pathology & Biotech
64 |     'comment(ALLERGEN)', 'comment(BIOTECHNOLOGY)', 'comment(DISRUPTION PHENOTYPE)',
65 |     'comment(DISEASE)', 'comment(PHARMACEUTICAL)', 'comment(TOXIC DOSE)',
66 |     # PTM / Processing
67 |     'comment(PTM)', 'feature(CHAIN)', 'feature(CROSS LINK)', 'feature(DISULFIDE BOND)',
68 |     'feature(GLYCOSYLATION)', 'feature(INITIATOR METHIONINE)', 'feature(LIPIDATION)',
69 |     'feature(MODIFIED RESIDUE)', 'feature(PEPTIDE)', 'feature(PROPEPTIDE)',
70 |     'feature(SIGNAL)', 'feature(TRANSIT)',
71 |     # Taxonomic lineage
72 |     'lineage(all)', 'lineage(SUPERKINGDOM)', 'lineage(KINGDOM)', 'lineage(SUBKINGDOM)',
73 |     'lineage(SUPERPHYLUM)', 'lineage(PHYLUM)', 'lineage(SUBPHYLUM)', 'lineage(SUPERCLASS)',
74 |     'lineage(CLASS)', 'lineage(SUBCLASS)', 'lineage(INFRACLASS)', 'lineage(SUPERORDER)',
75 |     'lineage(ORDER)', 'lineage(SUBORDER)', 'lineage(INFRAORDER)', 'lineage(PARVORDER)',
76 |     'lineage(SUPERFAMILY)', 'lineage(FAMILY)',
'lineage(SUBFAMILY)', 'lineage(TRIBE)',
77 |     'lineage(SUBTRIBE)', 'lineage(GENUS)', 'lineage(SUBGENUS)', 'lineage(SPECIES GROUP)',
78 |     'lineage(SPECIES SUBGROUP)', 'lineage(SPECIES)', 'lineage(SUBSPECIES)', 'lineage(VARIETAS)',
79 |     'lineage(FORMA)',
80 |     # Taxonomic identifier
81 |     'lineage-id(all)', 'lineage-id(SUPERKINGDOM)', 'lineage-id(KINGDOM)', 'lineage-id(SUBKINGDOM)',
82 |     'lineage-id(SUPERPHYLUM)', 'lineage-id(PHYLUM)', 'lineage-id(SUBPHYLUM)', 'lineage-id(SUPERCLASS)',
83 |     'lineage-id(CLASS)', 'lineage-id(SUBCLASS)', 'lineage-id(INFRACLASS)', 'lineage-id(SUPERORDER)',
84 |     'lineage-id(ORDER)', 'lineage-id(SUBORDER)', 'lineage-id(INFRAORDER)', 'lineage-id(PARVORDER)',
85 |     'lineage-id(SUPERFAMILY)', 'lineage-id(FAMILY)', 'lineage-id(SUBFAMILY)', 'lineage-id(TRIBE)',
86 |     'lineage-id(SUBTRIBE)', 'lineage-id(GENUS)', 'lineage-id(SUBGENUS)', 'lineage-id(SPECIES GROUP)',
87 |     'lineage-id(SPECIES SUBGROUP)', 'lineage-id(SPECIES)', 'lineage-id(SUBSPECIES)', 'lineage-id(VARIETAS)',
88 |     'lineage-id(FORMA)']
89 | 
90 | 
91 | def get_uniprot_info_from_ids(ids: list, filename: Path, chunk=False, identifier: str = "ACC+ID", to: str = "ACC",
92 |                               columns: str = ",".join(COLUMN_NAMES)):
93 |     """
94 |     Batch retrieval of IDs and information from UniProt.
95 | 
96 |     Parameters
97 |     ----------
98 |     ids
99 |         input IDs
100 |     filename
101 |         write to this file
102 |     chunk
103 |         split into multiple queries of size 100 and join results
104 |     identifier
105 |         type of input IDs
106 |     to
107 |         output ID format - ACC returns all column information
108 |     columns
109 |         column names to return, preformatted string (",".join(column_names))
110 | 
111 | 
112 |     Returns
113 |     -------
114 |     None; all information is written to `filename` as newline-separated, tab-delimited text.
115 |     """
116 |     mapping_url = 'http://www.uniprot.org/uploadlists/'
117 |     mapping_params = {
118 |         'from': identifier,
119 |         'to': to,
120 |         'format': 'tab',
121 |         'columns': columns
122 |     }
123 |     with open(filename, "w") as f:
124 |         if chunk:
125 |             num_tries = 5
126 |             for i, id_i in tqdm(enumerate(range(0, len(ids), 100))):
127 |                 id_chunk = ids[id_i: id_i + 100]
128 |                 good_text = False
129 |                 text = ""
130 |                 try_number = 0
131 |                 while not good_text and try_number < num_tries:
132 |                     mapping_params['query'] = ' '.join(id_chunk)
133 |                     response = requests.post(mapping_url, params=mapping_params)
134 |                     text = response.text
135 |                     if "<html" in text:  # a failed query returns an HTML error page instead of tab-separated text; retry
136 |                         good_text = False
137 |                     else:
138 |                         good_text = True
139 |                     try_number += 1
140 |                 if i == 0:
141 |                     f.write(text)
142 |                 else:
143 |                     f.write("\n".join(text.split("\n")[1:]))
144 |         else:
145 |             mapping_params['query'] = ' '.join(ids)
146 |             response = requests.post(mapping_url, params=mapping_params)
147 |             f.write(response.text)
148 | 
--------------------------------------------------------------------------------
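For orientation, a minimal sketch of how the pieces above fit together: reading one shapemer corpus file written by make_data.get_AF_shapemers and fitting the same TFIDF + NMF topic model as 2-topic-modelling.ipynb (which additionally normalizes the W matrix and embeds it with openTSNE). The corpus path, the topic inspected, and the final print are illustrative assumptions, not part of the repository.

# Hypothetical usage sketch (not a file in the original repository).
from pathlib import Path

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# One protein per line: "ID<tab>space-separated shapemer tokens",
# e.g. "AF-P12345-F1-model_v1.pdb<tab>k3i2i1i0 r5i4i2i1 ..." (ID is hypothetical).
corpus_file = Path("data/AF_ids_corpus_resolution_4_6_threshold_50.txt")  # assumed location
keys, documents = [], []
for line in corpus_file.open():
    key, shapemers = line.strip().split("\t")
    keys.append(key)
    documents.append(shapemers)

# Shapemer tokens are plain alphanumeric "words", so the default tokenizer applies;
# min_df=2 drops shapemers seen in only one protein, as in the notebooks.
vectorizer = TfidfVectorizer(min_df=2)
tfidf_matrix = vectorizer.fit_transform(documents)  # proteins x shapemer vocabulary

# 250 topics as in 2-topic-modelling.ipynb; the notebook's alpha/l1_ratio
# regularization arguments are omitted here because they changed in sklearn >= 1.2.
topic_model = NMF(n_components=250, random_state=42, max_iter=500)
w_matrix = topic_model.fit_transform(tfidf_matrix)  # proteins x topics
h_matrix = topic_model.components_                  # topics x shapemers

# Inspect the ten highest-weighted shapemers of the first topic.
vocabulary = vectorizer.get_feature_names_out()  # get_feature_names() on sklearn < 1.0
top_indices = h_matrix[0].argsort()[::-1][:10]
print([vocabulary[i] for i in top_indices])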