├── .gitignore ├── LICENSE ├── README.md ├── UPGMA_classification-rdkit.ipynb ├── UPGMA_classification.arthor.ipynb ├── requirements.txt ├── setup.cfg ├── setup.py └── src └── automated_series_classification ├── Butinaclustering.py ├── UPGMAclustering.py ├── __init__.py ├── dataprep.py ├── mainSeriesClassification.py ├── utilsDataPrep.py ├── utilsDrawing.py └── utilsStructureEval.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # 132 | *.pkl 133 | *.db 134 | *.sdf.gz 135 | *.xls 136 | rdk_db 137 | CDK2Kinase 138 | BACE 139 | 140 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 iwatobipen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutomatedSeriesClassification 2 | 3 | This is code for automated chemical series classification 4 | 5 | 6 | ## Original article 7 | 8 | Automated Identification of Chemical Series: Classifying like a Medicinal Chemist 9 | https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00204 10 | 11 | ## Installation 12 | 13 | First, you should have RDKit installed. Then, the code can be downloaded and installed with: 14 | 15 | ```bash 16 | git clone https://github.com/rdkit/AutomatedSeriesClassification 17 | cd AutomatedSeriesClassification 18 | pip install -e . 19 | ``` 20 | 21 | The ``-e`` flag means it gets installed in editable mode. 22 | 23 | ## Example usage 24 | 25 | ### Data Preparation 26 | 27 | 1. The following script will download chembl27.sdf.gz and make substructurefingerprint library. 28 | If you want to use an alternate version of chembl, specify the `--chebml-version` flag. You 29 | can run `python -m automated_series_classification.dataprep --help` in your shell to see all options. 30 | 31 | ``` 32 | $ python -m automated_series_classification.dataprep # it'll take ~30 or more minutes on my PC 33 | ``` 34 | 35 | 2. Then launch jupyter notebook, the notebook use same dataset as original articles. But you'll get different results compared to the article. This is because I used more newer version of ChEMBL for this code. If you would like to use same dataset to original article it is easy, just changing download link of chembl 36 | 37 | 38 | ## Acknoledgements 39 | 40 | - Greg Landrum 41 | 42 | 43 | ## etc 44 | 45 | Any comments, requests and suggestions will be greatly appreciated. 46 | 47 | 48 | 49 | ## License 50 | [MIT](https://choosealicense.com/licenses/mit/) 51 | -------------------------------------------------------------------------------- /UPGMA_classification-rdkit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "RDKit version: 2021.03.3\n", 13 | "Pandas version: 1.2.4\n", 14 | "Numpy version: 1.20.3\n", 15 | "MatplotLib version: 3.4.2\n", 16 | "Sklearn version: 0.24.2\n", 17 | "Seaborn version: 0.11.1\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "import imp\n", 23 | "import pickle\n", 24 | "import os\n", 25 | "import sys\n", 26 | "import time\n", 27 | "\n", 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "import sklearn\n", 31 | "import seaborn as sns\n", 32 | "import matplotlib as mpl\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from ipywidgets import interact,fixed,IntSlider\n", 35 | "from IPython.display import SVG\n", 36 | "from matplotlib import rcParams\n", 37 | "from matplotlib.colors import hex2color\n", 38 | "from rdkit import Chem, rdBase\n", 39 | "from rdkit.Chem import rdFMCS, Draw, PandasTools, AllChem, DataStructs, Descriptors\n", 40 | "from rdkit.Chem import rdSubstructLibrary\n", 41 | "from rdkit.Chem.Draw import IPythonConsole\n", 42 | "from rdkit import DataStructs\n", 43 | "\n", 44 | "print('RDKit version: ',rdBase.rdkitVersion)\n", 45 | "print('Pandas version:', pd.__version__)\n", 46 | "print('Numpy version:', np.__version__)\n", 47 | "print('MatplotLib version:', mpl.__version__)\n", 48 | "print('Sklearn version:', sklearn.__version__)\n", 49 | "print('Seaborn version:', sns.__version__)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "" 61 | ] 62 | }, 63 | "execution_count": 2, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "import automated_series_classification\n", 70 | "imp.reload(automated_series_classification)\n", 71 | "from automated_series_classification import utilsDataPrep\n", 72 | "from automated_series_classification import mainSeriesClassification\n", 73 | "imp.reload(mainSeriesClassification)\n", 74 | "from automated_series_classification import utilsDrawing\n", 75 | "from automated_series_classification import UPGMAclustering\n", 76 | "imp.reload(UPGMAclustering)\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Load ChEMBL database for substructure matching (constructed during data preprocessing)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "CPU times: user 20.9 s, sys: 620 ms, total: 21.6 s\n", 96 | "Wall time: 21.6 s\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "%%time\n", 102 | "with open('./data/chembl27_sssdata.pkl','rb') as file:\n", 103 | " chembldb = pickle.load(file)\n", 104 | " " 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "The library has 1855322 molecules.\n", 117 | "CPU times: user 2.71 s, sys: 148 ms, total: 2.86 s\n", 118 | "Wall time: 2.86 s\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "%%time\n", 124 | "mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()\n", 125 | "fps = rdSubstructLibrary.PatternHolder()\n", 126 | "for smi,fp in chembldb:\n", 127 | " mols.AddSmiles(smi)\n", 128 | " fps.AddFingerprint(fp)\n", 129 | "chembl_library = rdSubstructLibrary.SubstructLibrary(mols,fps)\n", 130 | "chembldb = None\n", 131 | "print(f\"The library has {len(chembl_library)} molecules.\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "Nchembl = len(chembl_library)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Define Parameters, note that flimit corresponds to the specificity limit E(p) as described in the paper\n", 148 | "\n", 149 | "TIPS:dbpath is not automatically updated so if you would like to try to analyze different dataset, you should remove rdk_db or define dbpath by different name" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "flimit=2e-3\n", 159 | "MinClusterSize=20 #20\n", 160 | "proj='CDK2Kinase'\n", 161 | "dbpath='./rdk_db/cdk2_db'\n", 162 | "filename='moldata_preprocessed.csv'\n", 163 | "datapath='./{0}/'.format(proj)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### UPGMA classification" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Set \"calcDists\" to True only if the pairwise molecular distance matrix for clustering is not calculated yet (this will take a while). Set \"calcScores\" only if you are interested in the intra-cluster distance metric, this slows down the clustering." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 7, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "read 3187 molecules\n", 190 | "creating database\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "UPGMAClassification=mainSeriesClassification.Classification(proj, datapath, dbpath, filename, chembl_library, \n", 196 | " flimit, MinClusterSize, clustering='UPGMA', \n", 197 | " calcDists=True, calcScores=False, useArthor=False)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 8, 203 | "metadata": { 204 | "scrolled": true 205 | }, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "Time elapsed during the calculation: 1121.0\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "#imp.reload(UPGMAclustering)\n", 217 | "#imp.reload(mainSeriesClassification)\n", 218 | "start=time.time()\n", 219 | "UPGMAClassification.ApplyClustering()\n", 220 | "end=time.time()\n", 221 | "print(f\"Time elapsed during the calculation: {end-start:.1f}\")" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Benchmark against human-defined classification (seriescolumn is the column with the human series assignment to each molecule in the dataframe moldata)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 9, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "UPGMAClassification.CalculatePerformance(seriescolumn='series assignment')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "moldata_proj=UPGMAClassification.moldata_proj\n", 247 | "ProjectClusters=UPGMAClassification.MCSdict\n", 248 | "PerformanceClusters=UPGMAClassification.PerformanceClusters" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 11, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "dict_keys([6347, 6149, 6317, 6319, 6016, 6181, 6230, 6147, 5970, 6313, 6316, 6168, 6277, 6104, 6020, 6198, 6105, 6151, 6067, 5996, 5740])" 260 | ] 261 | }, 262 | "execution_count": 11, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "UPGMAClassification.MCSdict.keys()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Calculate performance metrics" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 12, 281 | "metadata": { 282 | "scrolled": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "UPGMAClassification.CalculatePerformance()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 13, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "fraction of assigned molecules: 0.8650768748038908 , ambiguity score: 1.1454479506710191\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "Nmol=len(moldata_proj)\n", 304 | "Nrep=sum(moldata_proj['ClusterID'].map(lambda x: len(x)).tolist())\n", 305 | "Nunassigned=len(moldata_proj.loc[moldata_proj['ClusterID'].map(lambda x: len(x)==0)])\n", 306 | "frac_assigned=(Nmol-Nunassigned)/Nmol\n", 307 | "a=Nrep/(Nmol-Nunassigned)\n", 308 | "print('fraction of assigned molecules:',frac_assigned,', ambiguity score:',a)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### Plot benchmarking between automatically identified and human defined series" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "rearrange performance results" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 37, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "scaflist=list(set(moldata_proj['scaffold'].tolist()))\n", 332 | "scaflist.sort()\n", 333 | "N_auto_series=len(PerformanceClusters['recall'])\n", 334 | "LinkVector=PerformanceClusters['linked series']\n", 335 | "\n", 336 | "dict_recall={LinkVector[ind,0]:np.zeros(len(scaflist)) for ind in range(N_auto_series)}\n", 337 | "dict_prec={LinkVector[ind,0]:np.zeros(len(scaflist)) for ind in range(N_auto_series)}\n", 338 | "for ind in range(N_auto_series):\n", 339 | " scafind=np.where(np.array(scaflist)==LinkVector[ind,1])\n", 340 | " dict_recall[LinkVector[ind,0]][scafind]=PerformanceClusters['recall'][ind]\n", 341 | " dict_prec[LinkVector[ind,0]][scafind]=PerformanceClusters['precision'][ind]\n", 342 | " \n", 343 | "keylist=LinkVector[:,0].tolist()\n", 344 | "keylist.sort()\n", 345 | "dict_recall_sorted={k:dict_recall[k] for k in keylist}\n", 346 | "dict_prec_sorted={k:dict_prec[k] for k in keylist}" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 39, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "image/png": "\n", 357 | "text/plain": [ 358 | "
" 359 | ] 360 | }, 361 | "metadata": { 362 | "needs_background": "light" 363 | }, 364 | "output_type": "display_data" 365 | } 366 | ], 367 | "source": [ 368 | "legend=['Series '+s[-2:] for s in scaflist]\n", 369 | "rcParams.update({'figure.autolayout': True})\n", 370 | "fig,ax=plt.subplots(2,1,figsize=(10,10))\n", 371 | "fig,_=utilsDrawing.barplot_vertical(fig, ax[0],dict_recall_sorted,'recall linked h. series',[],['']*len(keylist),1.02)\n", 372 | "fig,_=utilsDrawing.barplot_vertical(fig, ax[1],dict_prec_sorted,'precision linked h. series',legend,keylist,1.02)\n", 373 | "ax[1].set_xlabel('ID automatically-identified series',fontsize=22)\n", 374 | "plt.show()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "### Draw MCS of automatically-identified series and linked human-defined series" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 40, 387 | "metadata": { 388 | "scrolled": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "imp.reload(utilsDrawing)\n", 393 | "def renderMCS(seriesid,newscafslist,newscafsnames,linkedscafslist,linkedscafsnames):\n", 394 | " listid=np.where(np.array(newscafsnames)==seriesid)[0][0]\n", 395 | " svg_new=utilsDrawing.moltosvg(Chem.MolFromSmarts(newscafslist[listid]), molSize=(450,250))\n", 396 | " svg_linked=utilsDrawing.moltosvg(Chem.MolFromSmarts(linkedscafslist[listid]), molSize=(450,250))\n", 397 | " labels=[newscafsnames[listid],linkedscafsnames[listid]]\n", 398 | " svgGrid = utilsDrawing.SvgsToGrid([svg_new,svg_linked], labels=labels, svgsPerRow=2, molSize=(450,250))\n", 399 | " return(SVG(svgGrid))" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 41, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/plain": [ 410 | "'./CDK2Kinase/'" 411 | ] 412 | }, 413 | "execution_count": 41, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "datapath" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 42, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "application/vnd.jupyter.widget-view+json": { 430 | "model_id": "dd26780701c94c718c9fc3706742bb73", 431 | "version_major": 2, 432 | "version_minor": 0 433 | }, 434 | "text/plain": [ 435 | "interactive(children=(Dropdown(description='seriesid', options=('5740', '5970', '5996', '6016', '6020', '6067'…" 436 | ] 437 | }, 438 | "metadata": {}, 439 | "output_type": "display_data" 440 | } 441 | ], 442 | "source": [ 443 | "seriesdata_proj=pd.read_csv('{0}seriesdata.csv'.format(datapath))\n", 444 | "LinkVector_sorted=np.array([[x,y] for x,y in sorted(zip(LinkVector[:,0],LinkVector[:,1]))])\n", 445 | "linkedScafsList=[seriesdata_proj['MCSsampled'].loc[seriesdata_proj['ScafName']==x].iloc[0] for x in LinkVector_sorted[:,1]]\n", 446 | "linkedScafsNames=[x for x in LinkVector_sorted[:,1]]\n", 447 | "MCSlist=[ProjectClusters[int(k)][2] for k in keylist]\n", 448 | "\n", 449 | "interact(renderMCS, seriesid=keylist, newscafslist=fixed(MCSlist),newscafsnames=fixed(keylist),linkedscafslist=fixed(linkedScafsList),linkedscafsnames=fixed(linkedScafsNames));\n" 450 | ] 451 | } 452 | ], 453 | "metadata": { 454 | "kernelspec": { 455 | "display_name": "Python 3", 456 | "language": "python", 457 | "name": "python3" 458 | }, 459 | "language_info": { 460 | "codemirror_mode": { 461 | "name": "ipython", 462 | "version": 3 463 | }, 464 | "file_extension": ".py", 465 | "mimetype": "text/x-python", 466 | "name": "python", 467 | "nbconvert_exporter": "python", 468 | "pygments_lexer": "ipython3", 469 | "version": "3.8.10" 470 | } 471 | }, 472 | "nbformat": 4, 473 | "nbformat_minor": 2 474 | } 475 | -------------------------------------------------------------------------------- /UPGMA_classification.arthor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "RDKit version: 2021.03.3\n", 13 | "Pandas version: 1.2.4\n", 14 | "Numpy version: 1.20.3\n", 15 | "MatplotLib version: 3.4.2\n", 16 | "Arthor version: 3.4\n", 17 | "Sklearn version: 0.24.2\n", 18 | "Seaborn version: 0.11.1\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "import sys, os\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "import sklearn\n", 27 | "from rdkit import Chem, rdBase\n", 28 | "from rdkit.Chem import rdFMCS, Draw, PandasTools, AllChem, DataStructs, Descriptors\n", 29 | "from rdkit.Chem.Draw import IPythonConsole\n", 30 | "from IPython.display import SVG\n", 31 | "import arthor\n", 32 | "import matplotlib as mpl\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from matplotlib import rcParams\n", 35 | "from matplotlib.colors import hex2color\n", 36 | "import seaborn as sns\n", 37 | "import time\n", 38 | "\n", 39 | "print('RDKit version: ',rdBase.rdkitVersion)\n", 40 | "print('Pandas version:', pd.__version__)\n", 41 | "print('Numpy version:', np.__version__)\n", 42 | "print('MatplotLib version:', mpl.__version__)\n", 43 | "print('Arthor version:', arthor.__version__)\n", 44 | "print('Sklearn version:', sklearn.__version__)\n", 45 | "print('Seaborn version:', sns.__version__)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "" 57 | ] 58 | }, 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "import imp\n", 66 | "import automated_series_classification\n", 67 | "imp.reload(automated_series_classification)\n", 68 | "from automated_series_classification import utilsDataPrep\n", 69 | "from automated_series_classification import mainSeriesClassification\n", 70 | "imp.reload(mainSeriesClassification)\n", 71 | "\n", 72 | "from automated_series_classification import utilsDrawing\n", 73 | "from automated_series_classification import UPGMAclustering\n", 74 | "imp.reload(UPGMAclustering)\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Load ChEMBL database for substructure matching (constructed during data preprocessing)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "1941410\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "chembldb=arthor.SubDb('./arthor/chembl_27.atdb')\n", 99 | "Nchembl=len(chembldb.search('*'))\n", 100 | "print(Nchembl)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Define Parameters, note that flimit corresponds to the specificity limit E(p) as described in the paper" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "flimit=2e-3\n", 117 | "MinClusterSize=20\n", 118 | "proj='CDK2Kinase'\n", 119 | "filename='moldata_preprocessed.csv'\n", 120 | "arthorpath='./arthor/'\n", 121 | "datapath='./{0}/'.format(proj)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### UPGMA classification" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Set \"calcDists\" to True only if the pairwise molecular distance matrix for clustering is not calculated yet (this will take a while). Set \"calcScores\" only if you are interested in the intra-cluster distance metric, this slows down the clustering." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "read 3187 molecules\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "UPGMAClassification=mainSeriesClassification.Classification(proj, datapath, arthorpath, filename, chembldb, \n", 153 | " flimit, MinClusterSize, clustering='UPGMA', \n", 154 | " calcDists=True, calcScores=False)\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "Time elapsed during the calculation: 98.2\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "start=time.time()\n", 172 | "UPGMAClassification.ApplyClustering()\n", 173 | "end=time.time()\n", 174 | "print(f\"Time elapsed during the calculation: {end-start:.1f}\") " 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Benchmark against human-defined classification (seriescolumn is the column with the human series assignment to each molecule in the dataframe moldata)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 7, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "UPGMAClassification.CalculatePerformance(seriescolumn='series assignment')" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "moldata_proj=UPGMAClassification.moldata_proj\n", 200 | "ProjectClusters=UPGMAClassification.MCSdict\n", 201 | "PerformanceClusters=UPGMAClassification.PerformanceClusters" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Calculate performance metrics" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "fraction of assigned molecules: 0.8650768748038908 , ambiguity score: 1.1454479506710191\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "Nmol=len(moldata_proj)\n", 226 | "Nrep=sum(moldata_proj['ClusterID'].map(lambda x: len(x)).tolist())\n", 227 | "Nunassigned=len(moldata_proj.loc[moldata_proj['ClusterID'].map(lambda x: len(x)==0)])\n", 228 | "frac_assigned=(Nmol-Nunassigned)/Nmol\n", 229 | "a=Nrep/(Nmol-Nunassigned)\n", 230 | "print('fraction of assigned molecules:',frac_assigned,', ambiguity score:',a)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### Plot benchmarking between automatically identified and human defined series" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "rearrange performance results" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 10, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "scaflist=list(set(moldata_proj['scaffold'].tolist()))\n", 254 | "scaflist.sort()\n", 255 | "N_auto_series=len(PerformanceClusters['recall'])\n", 256 | "LinkVector=PerformanceClusters['linked series']\n", 257 | "\n", 258 | "dict_recall={LinkVector[ind,0]:np.zeros(len(scaflist)) for ind in range(N_auto_series)}\n", 259 | "dict_prec={LinkVector[ind,0]:np.zeros(len(scaflist)) for ind in range(N_auto_series)}\n", 260 | "for ind in range(N_auto_series):\n", 261 | " scafind=np.where(np.array(scaflist)==LinkVector[ind,1])\n", 262 | " dict_recall[LinkVector[ind,0]][scafind]=PerformanceClusters['recall'][ind]\n", 263 | " dict_prec[LinkVector[ind,0]][scafind]=PerformanceClusters['precision'][ind]\n", 264 | " \n", 265 | "keylist=LinkVector[:,0].tolist()\n", 266 | "keylist.sort()\n", 267 | "dict_recall_sorted={k:dict_recall[k] for k in keylist}\n", 268 | "dict_prec_sorted={k:dict_prec[k] for k in keylist}" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 11, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "image/png": "\n", 279 | "text/plain": [ 280 | "
" 281 | ] 282 | }, 283 | "metadata": { 284 | "needs_background": "light" 285 | }, 286 | "output_type": "display_data" 287 | } 288 | ], 289 | "source": [ 290 | "legend=['Series '+s[-2:] for s in scaflist]\n", 291 | "rcParams.update({'figure.autolayout': True})\n", 292 | "fig,ax=plt.subplots(2,1,figsize=(10,10))\n", 293 | "fig,_=utilsDrawing.barplot_vertical(fig, ax[0],dict_recall_sorted,'recall linked h. series',[],['']*len(keylist),1.02)\n", 294 | "fig,_=utilsDrawing.barplot_vertical(fig, ax[1],dict_prec_sorted,'precision linked h. series',legend,keylist,1.02)\n", 295 | "ax[1].set_xlabel('ID automatically-identified series',fontsize=22)\n", 296 | "plt.show()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "### Draw MCS of automatically-identified series and linked human-defined series" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 12, 309 | "metadata": { 310 | "scrolled": true 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "from ipywidgets import interact,fixed,IntSlider\n", 315 | "def renderMCS(seriesid,newscafslist,newscafsnames,linkedscafslist,linkedscafsnames):\n", 316 | " listid=np.where(np.array(newscafsnames)==seriesid)[0][0]\n", 317 | " svg_new=utilsDrawing.moltosvg(Chem.MolFromSmarts(newscafslist[listid]), molSize=(450,250))\n", 318 | " svg_linked=utilsDrawing.moltosvg(Chem.MolFromSmarts(linkedscafslist[listid]), molSize=(450,250))\n", 319 | " labels=[newscafsnames[listid],linkedscafsnames[listid]]\n", 320 | " svgGrid = utilsDrawing.SvgsToGrid([svg_new,svg_linked], labels=labels, svgsPerRow=2, molSize=(450,250))\n", 321 | " \n", 322 | " return(display(SVG(svgGrid)))" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 13, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "application/vnd.jupyter.widget-view+json": { 333 | "model_id": "8b770e6b46754dbc84bfc81ebb062aed", 334 | "version_major": 2, 335 | "version_minor": 0 336 | }, 337 | "text/plain": [ 338 | "interactive(children=(Dropdown(description='seriesid', options=('5740', '5970', '5996', '6016', '6020', '6067'…" 339 | ] 340 | }, 341 | "metadata": {}, 342 | "output_type": "display_data" 343 | } 344 | ], 345 | "source": [ 346 | "seriesdata_proj=pd.read_csv('{0}seriesdata.csv'.format(datapath))\n", 347 | "LinkVector_sorted=np.array([[x,y] for x,y in sorted(zip(LinkVector[:,0],LinkVector[:,1]))])\n", 348 | "linkedScafsList=[seriesdata_proj['MCSsampled'].loc[seriesdata_proj['ScafName']==x].iloc[0] for x in LinkVector_sorted[:,1]]\n", 349 | "linkedScafsNames=[x for x in LinkVector_sorted[:,1]]\n", 350 | "MCSlist=[ProjectClusters[int(k)][2] for k in keylist]\n", 351 | "\n", 352 | "interact(renderMCS, seriesid=keylist, newscafslist=fixed(MCSlist),newscafsnames=fixed(keylist),linkedscafslist=fixed(linkedScafsList),linkedscafsnames=fixed(linkedScafsNames));\n" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "### 10fold cross-validation by sampling 40% of all compounds" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 14, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "Time elapsed during the calculation: 492.38566040992737\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "fraction_sample=0.4\n", 377 | "N_sample=10\n", 378 | "start=time.time()\n", 379 | "UPGMAClassification.ClassificationCrossValidation(fraction_sample, N_sample)\n", 380 | "end=time.time()\n", 381 | "print(\"Time elapsed during the calculation:\", end - start) " 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 15, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "UPGMAClassification.EvaluationCrossValidation()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 16, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "image/png": "\n", 401 | "text/plain": [ 402 | "
" 403 | ] 404 | }, 405 | "metadata": { 406 | "needs_background": "light" 407 | }, 408 | "output_type": "display_data" 409 | } 410 | ], 411 | "source": [ 412 | "rcParams.update({'figure.autolayout': True})\n", 413 | "fig,ax=plt.subplots(1,1,figsize=(12,8))\n", 414 | "ax = sns.boxplot(x=\"series id\", y=\"fscore\", data=UPGMAClassification.EvalCrossval, color=[0.5,0.6,1.0],ax=ax)\n", 415 | "ax.set_ylabel('F$_1$ score',fontsize=26)\n", 416 | "ax.set_xlabel('series ID',fontsize=26)\n", 417 | "\n", 418 | "plt.setp(ax.get_xticklabels(), fontsize=22, rotation='vertical')\n", 419 | "plt.setp(ax.get_yticklabels(), fontsize=22)\n", 420 | "plt.show()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [] 429 | } 430 | ], 431 | "metadata": { 432 | "kernelspec": { 433 | "display_name": "Python 3", 434 | "language": "python", 435 | "name": "python3" 436 | }, 437 | "language_info": { 438 | "codemirror_mode": { 439 | "name": "ipython", 440 | "version": 3 441 | }, 442 | "file_extension": ".py", 443 | "mimetype": "text/x-python", 444 | "name": "python", 445 | "nbconvert_exporter": "python", 446 | "pygments_lexer": "ipython3", 447 | "version": "3.8.10" 448 | } 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 2 452 | } 453 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | rdkit 2 | seaborn 3 | xlrd 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | ########################## 2 | # Setup.py Configuration # 3 | ########################## 4 | [metadata] 5 | name = automated_series_classification 6 | version = 0.0.1 7 | long_description = file: README.rst 8 | 9 | # URLs associated with the project 10 | url = https://github.com/iwatobipen/AutomatedSeriesClassification 11 | download_url = https://github.com/iwatobipen/AutomatedSeriesClassification/releases 12 | project_urls = 13 | Bug Tracker = https://github.com/iwatobipen/AutomatedSeriesClassification/issues 14 | Source Code = https://github.com/iwatobipen/AutomatedSeriesClassification 15 | 16 | # Author information 17 | author = iwatobipen 18 | # author_email = ... 19 | maintainer = iwatobipen 20 | # maintainer_email = ... 21 | 22 | # License Information 23 | license = MIT 24 | license_file = LICENSE 25 | 26 | [options] 27 | # Requirements (formerly known as requirements.txt) 28 | install_requires = 29 | numpy 30 | pandas 31 | sklearn 32 | matplotlib 33 | wget 34 | tqdm 35 | click 36 | # Since RDKit can't be installed through pip, don't put it here 37 | 38 | # Where is my code (Part 1/3) 39 | packages = find: 40 | # Where is my code (Part 2/3) 41 | package_dir = 42 | = src 43 | 44 | [options.packages.find] 45 | # Where is my code (Part 3/3) 46 | where = src 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Setup module.""" 4 | 5 | import setuptools 6 | 7 | if __name__ == '__main__': 8 | setuptools.setup() 9 | -------------------------------------------------------------------------------- /src/automated_series_classification/Butinaclustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jan 28 14:14:10 2020 5 | 6 | @author: krugefr1 7 | """ 8 | 9 | import numpy as np 10 | from automated_series_classification import utilsStructureEval 11 | 12 | def ApplyButina(distdata, moldata, chembldb, flimit, MinClusterSize,calcScores): 13 | MCSdict={} 14 | # sort assaydata ascending (i.e., lowest value is most active -> preprocess data accordingly) 15 | # indices: list in which molecules are selected as cluster centers 16 | assaydata=np.array(moldata['assay'].tolist()) 17 | indices=np.argsort(assaydata) 18 | Nchembl=len(chembldb.search('*')) 19 | 20 | while len(indices)>0: 21 | # assign all molecules to cluster center that comply with distthresh 22 | # distthresh is adjusted iteratively until MCS complies with specificity threshold flimit 23 | distthresh=0.8 24 | cluster=np.where(distdata[indices[0],:]=MinClusterSize: 26 | fChembl,Smarts=utilsStructureEval.MCSFromMollist(moldata.Molecule.iloc[cluster].tolist(),chembldb,Nchembl) 27 | step=0.1 28 | cluster_upper=cluster 29 | if fChembl>=flimit and len(cluster_upper)>MinClusterSize: 30 | while fChembl>=flimit: 31 | fChembl_upper=fChembl 32 | cluster_upper=cluster 33 | distupper=distthresh 34 | distthresh-=step 35 | cluster=np.where(distdata[indices[0],:]1: 37 | fChembl,Smarts=utilsStructureEval.MCSFromMollist(moldata.Molecule.iloc[cluster].tolist(),chembldb,Nchembl) 38 | else: break 39 | distlower=distthresh 40 | fChembl_lower=fChembl 41 | fChembl_lower_old=0 42 | while ((fChembl_lower-fChembl_lower_old)>1e-8) and (len(cluster_upper)>=MinClusterSize): 43 | distthresh=(distupper+distlower)/2 44 | cluster=np.where(distdata[indices[0],:]1: 46 | fChembl,Smarts=utilsStructureEval.MCSFromMollist(moldata.Molecule.iloc[cluster].tolist(),chembldb,Nchembl) 47 | else: break 48 | if fChembl>=flimit: 49 | fChembl_upper=fChembl 50 | distupper=distthresh 51 | cluster_upper=cluster 52 | else: 53 | fChembl_lower_old=fChembl_lower 54 | fChembl_lower=fChembl 55 | distlower=distthresh 56 | 57 | # select cluster/MCS if compliant with flimit and MinClusterSize 58 | if fChembl=MinClusterSize: 59 | if calcScores: 60 | MCSdict[indices[0]]=(fChembl,len(cluster),Smarts,distthresh) 61 | else: 62 | MCSdict[indices[0]]=(fChembl,len(cluster),Smarts) 63 | # remove molecules that were assigned to cluster (also incompliant, i.e. too small clusters) 64 | indices=np.array([x for x in indices if x not in cluster]) 65 | distdata[cluster,:]=1 66 | distdata[:,cluster]=1 67 | return MCSdict -------------------------------------------------------------------------------- /src/automated_series_classification/UPGMAclustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 27 16:07:23 2020 5 | 6 | @author: krugefr1 7 | """ 8 | import numpy as np 9 | import sklearn 10 | from sklearn.cluster import AgglomerativeClustering 11 | from automated_series_classification import utilsStructureEval 12 | try: 13 | import arthor 14 | except ImportError: 15 | arthor = None 16 | 17 | def CalcSizeAndAssignment(children,Ndata): 18 | # Assigns molecules to the clusters of the UPGMA tree 19 | NumMolList=[] 20 | MolDict={} 21 | for i in range(len(children)): 22 | N=0 23 | mols_assigned=[] 24 | for j in range(len(children[i])): 25 | if children[i][j]0: 70 | childlayer=[] 71 | for c in currlayer: 72 | if c>=Ndata: 73 | if len(MolDict[c])>=MinClusterSize: 74 | fChembl,Smarts=utilsStructureEval.MCSFromMollist(moldata.Molecule.iloc[MolDict[c]].tolist(),chembldb,Nchembl,onlyCompleteRings=onlyCompleteRings) 75 | if fChembl>=flimit: 76 | childlayer+=children[c-Ndata].tolist() 77 | else: 78 | if calcScores: 79 | MCSdict[c]=(fChembl,len(MolDict[c]),Smarts,ScoreDict[c]) 80 | else: 81 | MCSdict[c]=(fChembl,len(MolDict[c]),Smarts) 82 | currlayer=childlayer 83 | return MCSdict 84 | 85 | def ApplyUPGMA(distdata_proj,moldata_proj,chembldb, flimit, MinClusterSize, calcScores, 86 | onlyCompleteRings=False, useArthor=True): 87 | global arthor 88 | if not useArthor: 89 | arthor = None 90 | utilsStructureEval.arthor = None 91 | # Apply UPGMA clustering 92 | cluster=AgglomerativeClustering(n_clusters=2, compute_full_tree=True, affinity='precomputed',linkage='average') 93 | cluster.fit(distdata_proj) 94 | 95 | # Assign Clusters 96 | NumMolList, MolDict= CalcSizeAndAssignment(cluster.children_,len(distdata_proj)) 97 | # Calculate intra-cluster distance scores 98 | if calcScores: 99 | ScoreDict=CalcScore(cluster.children_,distdata_proj,NumMolList) 100 | else: 101 | ScoreDict={} 102 | 103 | # filter out irrelevant clusters and calculate MCS on selected clusters 104 | MCSdict=DetermineRelevantMCS(len(distdata_proj),cluster.children_,MolDict,ScoreDict,chembldb,moldata_proj,flimit,MinClusterSize, calcScores, onlyCompleteRings=onlyCompleteRings) 105 | 106 | return MCSdict 107 | 108 | 109 | -------------------------------------------------------------------------------- /src/automated_series_classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdkit/AutomatedSeriesClassification/a0be9997fbfd4a6708fddd136e27b7b373dcd57f/src/automated_series_classification/__init__.py -------------------------------------------------------------------------------- /src/automated_series_classification/dataprep.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import pickle 4 | 5 | import click 6 | import wget 7 | from rdkit import Chem, RDLogger, rdBase 8 | from rdkit.Chem import rdSubstructLibrary 9 | from tqdm import tqdm 10 | 11 | RDLogger.DisableLog("rdApp.warning") 12 | 13 | # See https://pubs.acs.org/doi/10.1021/jm020472j 14 | bradley_url = 'https://pubs.acs.org/doi/suppl/10.1021/jm020472j/suppl_file/jm020472j_s2.xls' 15 | 16 | 17 | @click.command() 18 | @click.option( 19 | '--directory', type=click.Path(file_okay=False, dir_okay=True), 20 | default=os.getcwd, help='Defaults to current directory', 21 | ) 22 | @click.option('--chebml-version', default='27', show_default=True) 23 | def main(directory: str, chebml_version: str): 24 | """Download the ChEBML data.""" 25 | os.makedirs(directory, exist_ok=True) 26 | 27 | bradley_path = os.path.join(directory, 'jm020472j_s2.xls') 28 | if not os.path.exists(bradley_path): 29 | try: 30 | wget.download(bradley_url, out=directory) 31 | except: 32 | click.echo('There goes ACS stopping science') 33 | 34 | chembl_url = ( 35 | f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/' 36 | f'chembl_{chebml_version}/chembl_{chebml_version}.sdf.gz' 37 | ) 38 | 39 | sdf_path = os.path.join(directory, f'chembl_{chebml_version}.sdf.gz') 40 | if not os.path.exists(sdf_path): 41 | wget.download(chembl_url, out=directory) 42 | 43 | sss_path = os.path.join(directory, f'chembl{chebml_version}_sssdata.pkl') 44 | if not os.path.exists(sss_path): 45 | click.echo(f'RDKit Version: {rdBase.rdkitVersion}') 46 | data = [] 47 | 48 | with gzip.GzipFile(sdf_path) as gz: 49 | suppl = Chem.ForwardSDMolSupplier(gz) 50 | for mol in tqdm(suppl, desc=f'Processing ChEBML {chebml_version}', unit_scale=True): 51 | if mol is None or mol.GetNumAtoms() > 50: 52 | continue 53 | fp = Chem.PatternFingerprint(mol) 54 | smi = Chem.MolToSmiles(mol) 55 | data.append((smi, fp)) 56 | 57 | click.echo(f'Outputting to {sss_path}') 58 | with open(sss_path, 'wb') as file: 59 | mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder() 60 | fps = rdSubstructLibrary.PatternHolder() 61 | for smi,fp in data: 62 | mols.AddSmiles(smi) 63 | fps.AddFingerprint(fp) 64 | library = rdSubstructLibrary.SubstructLibrary(mols,fps) 65 | pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL) 66 | 67 | click.echo('Done ;)') 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /src/automated_series_classification/mainSeriesClassification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 27 16:40:49 2020 5 | 6 | @author: krugefr1 7 | """ 8 | 9 | import numpy as np 10 | import os 11 | try: 12 | import arthor 13 | except ImportError: 14 | arthor = None 15 | from rdkit import Chem 16 | from rdkit.Chem import rdSubstructLibrary 17 | import pickle 18 | import random 19 | import pandas as pd 20 | import copy 21 | 22 | from automated_series_classification import UPGMAclustering, Butinaclustering, utilsDataPrep 23 | 24 | 25 | class Classification: 26 | def __init__(self, 27 | proj, 28 | datapath, 29 | dbpath, 30 | filename, 31 | chembldb, 32 | flimit=1e-3, 33 | MinClusterSize=20, 34 | clustering='UPGMA', 35 | calcDists=True, 36 | calcScores=False, 37 | smilesCol='Smiles', 38 | idCol='ID', 39 | onlyCompleteRings=False, 40 | useArthor=True): 41 | global arthor 42 | if not useArthor: 43 | arthor = None 44 | self.useArthor = useArthor 45 | self.proj = proj 46 | self.datapath = datapath 47 | self.dbpath = dbpath 48 | self.chembldb = chembldb 49 | self.flimit = flimit 50 | self.MinClusterSize = MinClusterSize 51 | self.clustering = clustering 52 | self.calcScores = calcScores 53 | self.calcDists = calcDists 54 | self.smilesCol = smilesCol 55 | self.idCol = idCol 56 | self.onlyCompleteRings = onlyCompleteRings 57 | # load data 58 | self.moldata_proj, self.distdata_proj = utilsDataPrep.PrepareData( 59 | self.proj, 60 | self.datapath, 61 | filename, 62 | distMeasure='Tanimoto', 63 | FP='Morgan2', 64 | calcDists=self.calcDists, 65 | smilesCol=smilesCol) 66 | if arthor is not None: 67 | if not os.path.isdir(dbpath): 68 | os.mkdir(dbpath) 69 | # set up project database for arthor substructure matching 70 | df = self.moldata_proj[[smilesCol, idCol]] 71 | df.to_csv('./arthor/{0}.smi'.format(self.proj), 72 | header=None, 73 | index=None, 74 | sep=' ') 75 | os.system('smi2atdb -j 0 -l {0}{1}.smi {0}{1}.atdb'.format( 76 | self.dbpath, self.proj)) 77 | os.system('atdb2fp -j 0 {0}{1}.atdb'.format( 78 | self.dbpath, self.proj)) 79 | self.proj_db = arthor.SubDb('{0}{1}.atdb'.format( 80 | self.dbpath, self.proj)) 81 | else: 82 | if type(dbpath) == rdSubstructLibrary.SubstructLibrary: 83 | self.proj_db = dbpath 84 | self.db_size = len(self.proj_db) 85 | else: 86 | if not os.path.exists(dbpath): 87 | print("creating database") 88 | mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder() 89 | fps = rdSubstructLibrary.PatternHolder() 90 | for smi in self.moldata_proj[smilesCol]: 91 | m = Chem.MolFromSmiles(smi) 92 | mols.AddSmiles(Chem.MolToSmiles(m)) 93 | fps.AddFingerprint(Chem.PatternFingerprint(m)) 94 | self.proj_db = rdSubstructLibrary.SubstructLibrary( 95 | mols, fps) 96 | self.db_size = len(mols) 97 | pickle.dump(self.proj_db, open(dbpath, 'wb+')) 98 | else: 99 | self.proj_db = pickle.load(open(dbpath, 'rb')) 100 | self.db_size = len(self.proj_db) 101 | 102 | def AssignSeriesToMCS(self, MCSdict): 103 | # assign series to MCS of selected clusters 104 | smartslist = [v[2] for v in MCSdict.values()] 105 | MolAssign_prel = {} 106 | MolAssignment = {} 107 | for s in range(len(smartslist)): 108 | if arthor is not None: 109 | res = self.proj_db.search(smartslist[s]) 110 | mols = [int(i) for i in res.to_array()] 111 | else: 112 | mols = self.proj_db.GetMatches(Chem.MolFromSmarts( 113 | smartslist[s]), 114 | maxResults=self.db_size) 115 | MolAssign_prel[list(MCSdict.keys())[s]] = list(mols) 116 | 117 | # remove all series that are entirely in another series 118 | for key1 in MolAssign_prel.keys(): 119 | add = 1 120 | for key2 in MolAssign_prel.keys(): 121 | if key2 != key1: 122 | if set(MolAssign_prel[key1]).issubset( 123 | set(MolAssign_prel[key2])): 124 | if set(MolAssign_prel[key2]).issubset( 125 | set(MolAssign_prel[key1])) and ( 126 | MCSdict[key1][0] >= MCSdict[key2][0]): 127 | add = 1 128 | else: 129 | add = 0 130 | break 131 | if add == 1 and MolAssign_prel[key1] not in MolAssignment.values(): 132 | MolAssignment[key1] = MolAssign_prel[key1] 133 | 134 | MolAssignment = { 135 | k: MolAssignment[k] 136 | for k in MolAssignment.keys() 137 | if len(MolAssignment[k]) > self.MinClusterSize 138 | } 139 | if self.calcScores: 140 | MCSdict = { 141 | k: (MCSdict[k][0], len(MolAssignment[k]), MCSdict[k][2], 142 | MCSdict[k][3], MolAssignment[k]) 143 | for k in MolAssignment.keys() 144 | } 145 | else: 146 | MCSdict = { 147 | k: (MCSdict[k][0], len(MolAssignment[k]), MCSdict[k][2], 148 | MolAssignment[k]) 149 | for k in MolAssignment.keys() 150 | } 151 | return MolAssignment, MCSdict 152 | 153 | def ApplyClustering(self): 154 | # apply custering and calculate MCS 155 | if self.clustering == 'UPGMA': 156 | MCSdict = UPGMAclustering.ApplyUPGMA( 157 | self.distdata_proj, 158 | self.moldata_proj, 159 | self.chembldb, 160 | self.flimit, 161 | self.MinClusterSize, 162 | self.calcScores, 163 | onlyCompleteRings=self.onlyCompleteRings, 164 | useArthor=self.useArthor) 165 | elif self.clustering == 'Butina': 166 | distdata = copy.deepcopy(self.distdata_proj) 167 | MCSdict = Butinaclustering.ApplyButina(distdata, 168 | self.moldata_proj, 169 | self.chembldb, 170 | self.flimit, 171 | self.MinClusterSize, 172 | self.calcScores, 173 | useArthor=self.useArthor) 174 | else: 175 | print('Clustering algorithm not implemented.') 176 | return 177 | 178 | # assign series through substructure matching and filtering 179 | self.MolAssignment, self.MCSdict = self.AssignSeriesToMCS(MCSdict) 180 | 181 | # prepare and save output 182 | self.moldata_proj['ClusterID'] = [ 183 | list() for x in range(self.moldata_proj.shape[0]) 184 | ] 185 | 186 | for k, vs in self.MolAssignment.items(): 187 | for v in vs: 188 | self.moldata_proj['ClusterID'].iloc[v].append(k) 189 | if self.clustering == 'UPGMA': 190 | self.moldata_proj.to_csv('{0}moldata_UPGMA.csv'.format( 191 | self.datapath)) 192 | with open('{0}ClusterData_UPGMA.pkl'.format(self.datapath), 193 | 'wb') as fileout: 194 | pickle.dump(self.MCSdict, fileout) 195 | elif self.clustering == 'Butina': 196 | self.moldata_proj.to_csv('{0}moldata_Butina.csv'.format( 197 | self.datapath)) 198 | with open('{0}ClusterData_Butina.pkl'.format(self.datapath), 199 | 'wb') as fileout: 200 | pickle.dump(self.MCSdict, fileout) 201 | else: 202 | print('Clustering algorithm not implemented.') 203 | return 204 | 205 | def CalculatePerformance(self, seriescolumn='series assignment'): 206 | 207 | # benchmark the automated classification against a different (probably human-defined) classification 208 | # human-defined compound assignment is specified in the column "seriescolumn" of the dataframe "moldata" 209 | # automated classification assignment specified in dict "MolAssignment" 210 | 211 | # calculates F1 score of automatically-identified series w.r.t. to all human-defined series, then links 212 | # each automatically-identified series to the human-defined series with highest F1 score 213 | 214 | scaflist = list(set(self.moldata_proj['scaffold'].tolist())) 215 | scaflist.sort() 216 | 217 | intersect_matrix = np.zeros((len(scaflist), len(self.MolAssignment))) 218 | NMatchScaf = [] 219 | NMatchCluster = np.array([len(v) for v in self.MolAssignment.values()]) 220 | for scaf_ind in range(len(scaflist)): 221 | mollist = self.moldata_proj[self.idCol].loc[self.moldata_proj[ 222 | seriescolumn].map(lambda x: scaflist[scaf_ind] in x)].tolist() 223 | intersect_scaf = np.array([ 224 | len(list(set(mollist) & set(clusterlist))) 225 | for clusterlist in self.MolAssignment.values() 226 | ]) 227 | intersect_matrix[scaf_ind, :] = intersect_scaf 228 | NMatchScaf.append(len(mollist)) 229 | 230 | NMatchScaf = np.array(NMatchScaf) 231 | RecallMatrix = intersect_matrix / NMatchScaf[:, None] 232 | PrecMatrix = intersect_matrix / NMatchCluster[None, :] 233 | Fscore = (2 * RecallMatrix * PrecMatrix) / (RecallMatrix + PrecMatrix + 234 | 1e-9) 235 | maxscore = np.argmax(Fscore, axis=0) 236 | 237 | PrecVector = np.zeros(len(self.MolAssignment)) 238 | RecallVector = np.zeros(len(self.MolAssignment)) 239 | FscoreVector = np.zeros(len(self.MolAssignment)) 240 | LinkVector = [] 241 | 242 | for col in range(len(self.MolAssignment)): 243 | PrecVector[col] = PrecMatrix[maxscore[col], col] 244 | RecallVector[col] = RecallMatrix[maxscore[col], col] 245 | FscoreVector[col] = Fscore[maxscore[col], col] 246 | LinkVector.append((list(self.MolAssignment.keys())[col], 247 | scaflist[maxscore[col]])) 248 | 249 | LinkVector = np.array(LinkVector) 250 | self.PerformanceClusters = { 251 | 'recall': RecallVector, 252 | 'precision': PrecVector, 253 | 'Fscore': FscoreVector, 254 | 'linked series': LinkVector 255 | } 256 | 257 | if self.clustering == 'UPGMA': 258 | with open('{0}PerformanceData_UPGMA.pkl'.format(self.datapath), 259 | 'wb') as fileout: 260 | pickle.dump(self.PerformanceClusters, fileout) 261 | elif self.clustering == 'Butina': 262 | with open('{0}PerformanceData_Butina.pkl'.format(self.datapath), 263 | 'wb') as fileout: 264 | pickle.dump(self.PerformanceClusters, fileout) 265 | else: 266 | print('Clustering algorithm not implemented.') 267 | return 268 | 269 | def ClassificationCrossValidation(self, fraction_sample, N_sample): 270 | samplerange = np.arange(len(self.moldata_proj)) 271 | invfrac = 1 / fraction_sample 272 | self.SampledSeries = {} 273 | for i in range(N_sample): 274 | 275 | # random sampling 276 | random.seed((i + 1) * 10) 277 | molinds = random.sample(population=samplerange.tolist(), 278 | k=int( 279 | len(samplerange.tolist()) // invfrac)) 280 | moldata_sample = self.moldata_proj.iloc[molinds] 281 | distdata_sample = self.distdata_proj[molinds, :] 282 | distdata_sample = distdata_sample[:, molinds] 283 | 284 | # apply custering and calculate MCS 285 | if self.clustering == 'UPGMA': 286 | MCSdict_sampled = UPGMAclustering.ApplyUPGMA( 287 | distdata_sample, 288 | moldata_sample, 289 | self.chembldb, 290 | self.flimit, 291 | self.MinClusterSize, 292 | self.calcScores, 293 | useArthor=self.useArthor) 294 | elif self.clustering == 'Butina': 295 | MCSdict_sampled = Butinaclustering.ApplyButina( 296 | distdata_sample, 297 | moldata_sample, 298 | self.chembldb, 299 | self.flimit, 300 | self.MinClusterSize, 301 | self.calcScores, 302 | useArthor=self.useArthor) 303 | else: 304 | print('Clustering algorithm not implemented.') 305 | return 306 | 307 | # assign series through substructure matching and filtering 308 | MolAssignment_sampled, MCSdict_sampled = self.AssignSeriesToMCS( 309 | MCSdict_sampled) 310 | self.SampledSeries[i] = MCSdict_sampled 311 | 312 | if self.clustering == 'UPGMA': 313 | with open( 314 | '{0}SampledSeries{1}_UPGMA.pkl'.format( 315 | self.datapath, int(fraction_sample * 100)), 316 | 'wb') as fileout: 317 | pickle.dump(self.SampledSeries, fileout) 318 | elif self.clustering == 'Butina': 319 | with open( 320 | '{0}SampledSeries{1}_Butina.pkl'.format( 321 | self.datapath, int(fraction_sample * 100)), 322 | 'wb') as fileout: 323 | pickle.dump(self.SampledSeries, fileout) 324 | else: 325 | print('Clustering algorithm not implemented.') 326 | return 327 | 328 | return 329 | 330 | def EvaluationCrossValidation(self): 331 | # Compare the classification obtained from sampling ("SampledSeries") against the original classification ("MCSdict") 332 | self.EvalCrossval = pd.DataFrame( 333 | columns=['series id', 'repetition', 'fscore']) 334 | for rep in self.SampledSeries.keys(): 335 | rep_dict = self.SampledSeries[rep] 336 | keylist = [k for k in rep_dict.keys()] 337 | for k in self.MCSdict.keys(): 338 | intersect = [ 339 | len(set(self.MCSdict[k][-1]) & set(v[-1])) 340 | for v in rep_dict.values() 341 | ] 342 | recall = np.array([ 343 | intersect[i] / len(rep_dict[keylist[i]][-1]) 344 | for i in range(len(keylist)) 345 | ]) 346 | precision = np.array(intersect) / len(self.MCSdict[k][-1]) 347 | fscore = max(2 * recall * precision / 348 | (recall + precision + 1e-9)) 349 | row = [int(k), int(rep), fscore] 350 | self.EvalCrossval.loc[len(self.EvalCrossval)] = row 351 | self.EvalCrossval['series id'] = self.EvalCrossval['series id'].apply( 352 | int) 353 | -------------------------------------------------------------------------------- /src/automated_series_classification/utilsDataPrep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 27 14:51:39 2020 5 | 6 | @author: krugefr1 7 | """ 8 | 9 | from rdkit import Chem 10 | from rdkit.Chem import AllChem, PandasTools, DataStructs 11 | import pickle 12 | import pandas as pd 13 | import numpy as np 14 | 15 | def calcDistMatrix(df, distMeasure): 16 | # calculates the distance matrix between all paris of molecules, standard: Tanimoto and Morgan2 FPs 17 | dists=np.zeros([len(df),len(df)]) 18 | if distMeasure=='Tanimoto': 19 | for i in range(1,len(df)): 20 | ds = DataStructs.BulkTanimotoSimilarity(df.FP.iloc[i],list(df.FP.iloc[:i]),returnDistance=1) 21 | for j in range(i): 22 | dists[i,j] = ds[j] 23 | dists[j,i] = ds[j] 24 | else: 25 | print(distMeasure, 'distance metric not implemented.') 26 | return 27 | return dists 28 | 29 | def readProjectData(filename, FP, smilesCol): 30 | # reads in the project data and calculates fingerprints 31 | df_proj=pd.read_csv(filename)#,names=['ID','Structure','mol name','scaffold','series assignment','assay'], skiprows=[0]) 32 | #df_proj = df_proj.head(100) 33 | PandasTools.AddMoleculeColumnToFrame(df_proj,smilesCol=smilesCol,molCol='Molecule') 34 | df_proj=df_proj.loc[df_proj['Molecule'].map(lambda x: x is not None)] 35 | if FP=='Morgan2': 36 | df_proj['FP']=df_proj.Molecule.map(lambda x : AllChem.GetMorganFingerprint(x,2)) 37 | else: 38 | print(FP, ' fingerprint not implemented.') 39 | return 40 | return df_proj 41 | 42 | 43 | def PrepareData(proj,datapath,filename,distMeasure='Tanimoto',FP='Morgan2', calcDists=False, smilesCol='Smiles'): 44 | # reads in project data and distance matrix (or calculate distance matrix) 45 | filename='{0}{1}'.format(datapath,filename) 46 | moldata=readProjectData(filename, FP, smilesCol) 47 | print(f'read {len(moldata)} molecules') 48 | if calcDists: 49 | dists=calcDistMatrix(moldata, distMeasure) 50 | with open('{0}distmatrix.txt'.format(datapath), 'wb') as fileout: 51 | pickle.dump(dists,fileout) 52 | else: 53 | with open('{0}distmatrix.txt'.format(datapath), 'rb') as filein: 54 | dists=pickle.load(filein) 55 | 56 | return moldata, dists 57 | 58 | -------------------------------------------------------------------------------- /src/automated_series_classification/utilsDrawing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 27 19:10:21 2020 5 | 6 | @author: krugefr1 7 | """ 8 | 9 | from rdkit.Chem import rdDepictor 10 | from rdkit.Chem.Draw import rdMolDraw2D 11 | from rdkit import Chem 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import re 15 | 16 | def moltosvg(mol,molSize=(450,250),kekulize=True): 17 | mc = Chem.Mol(mol.ToBinary()) 18 | if kekulize: 19 | try: 20 | Chem.Kekulize(mc) 21 | except: 22 | mc = Chem.Mol(mol.ToBinary()) 23 | if not mc.GetNumConformers(): 24 | rdDepictor.Compute2DCoords(mc) 25 | drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1]) 26 | # the MolDraw2D code is not very good at the moment at dealing with atom queries, 27 | # this is a workaround until that's fixed. 28 | # The rendering is still not going to be perfect because query bonds are not properly indicated 29 | opts = drawer.drawOptions() 30 | for atom in mc.GetAtoms(): 31 | if atom.HasQuery() and atom.DescribeQuery().find('AtomAtomicNum')!=0: 32 | opts.atomLabels[atom.GetIdx()]=atom.GetSmarts() 33 | drawer.DrawMolecule(mc) 34 | drawer.FinishDrawing() 35 | svg = drawer.GetDrawingText() 36 | # It seems that the svg renderer used doesn't quite hit the spec. 37 | # Here are some fixes to make it work in the notebook, although I think 38 | # the underlying issue needs to be resolved at the generation step 39 | return svg.replace('svg:','') 40 | 41 | def SvgsToGrid(svgs, labels, svgsPerRow=4,molSize=(250,150),fontSize=12): 42 | 43 | matcher = re.compile(r'^(<.*>\n)(\n)(.*)',re.DOTALL) 44 | hdr='' 45 | ftr='' 46 | rect='' 47 | nRows = len(svgs)//svgsPerRow 48 | if len(svgs)%svgsPerRow : nRows+=1 49 | blocks = ['']*(nRows*svgsPerRow) 50 | labelSizeDist = fontSize*5 51 | fullSize=(svgsPerRow*(molSize[0]+molSize[0]/10.0),nRows*(molSize[1]+labelSizeDist)) 52 | print(fullSize) 53 | 54 | count=0 55 | for svg,name in zip(svgs,labels): 56 | h,r,b = matcher.match(svg).groups() 57 | if not hdr: 58 | hdr = h.replace("width='"+str(molSize[0])+"px'","width='%dpx'"%fullSize[0]) 59 | hdr = hdr.replace("height='"+str(molSize[1])+"px'","height='%dpx'"%fullSize[1]) 60 | hdr = hdr.replace("viewBox='0 0 %d %d'"%(molSize[0],molSize[1]), 61 | "viewBox='0 0 %d %d'"%(fullSize[0],fullSize[1])) 62 | 63 | if not rect: 64 | rect = r 65 | legend = '\n' 66 | legend += ''+name.split('|')[0]+'\n' 67 | if len(name.split('|')) > 1: 68 | legend += ''+name.split('|')[1]+'\n' 69 | legend += '\n' 70 | blocks[count] = b + legend 71 | count+=1 72 | 73 | for i,elem in enumerate(blocks): 74 | row = i//svgsPerRow 75 | col = i%svgsPerRow 76 | elem = rect+elem 77 | blocks[i] = '%s'%(col*(molSize[0]+molSize[0]/10.0),row*(molSize[1]+labelSizeDist),elem) 78 | res = hdr + '\n'.join(blocks)+ftr 79 | return res 80 | 81 | def barplot_vertical(fig, ax, dict_input,ylabel,legend,xticks,legend_x_pos): 82 | N=len(dict_input) 83 | N2=len(list(dict_input.values())[0]) 84 | ind=np.arange(N) 85 | width=0.35 86 | if N2<=20: 87 | clist=list(np.arange(0,20,2))+list(np.arange(1,21,2)) 88 | else: 89 | clist=list(np.arange(0,N2//2*2+2,2))+list(np.arange(1,N2//2*2+1,2)) 90 | colors=plt.cm.tab20(clist[0:len(list(dict_input.values())[0])]) 91 | #p=np.zeros(N) 92 | bottom=np.zeros(N) 93 | for i in range(len(list(dict_input.values())[0])): 94 | ax.bar(ind,[v[i] for v in dict_input.values()], width, bottom=bottom, color=colors[i]) 95 | bottom = bottom + np.array([v[i] for v in dict_input.values()]) 96 | ax.set_ylabel(ylabel,fontsize=22) 97 | ax.set_xticks(ind) 98 | ax.set_xticklabels(xticks, rotation='vertical') 99 | if legend!=[]: 100 | ax.legend(legend, fontsize=20, ncol=4,bbox_to_anchor=(legend_x_pos, -0.3)) 101 | plt.setp(ax.get_xticklabels(), fontsize=18) 102 | plt.setp(ax.get_yticklabels(), fontsize=18) 103 | return fig,ax -------------------------------------------------------------------------------- /src/automated_series_classification/utilsStructureEval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Jan 27 16:29:23 2020 5 | 6 | @author: krugefr1 7 | """ 8 | from rdkit import Chem 9 | from rdkit.Chem import rdFMCS 10 | from rdkit.Chem import rdSubstructLibrary 11 | import pickle 12 | try: 13 | import arthor 14 | except ImportError: 15 | arthor = None 16 | 17 | 18 | def MCSFromMollist(mollist,chembldb,Nchembl,onlyCompleteRings=False): 19 | MCSSmarts2=rdFMCS.FindMCS(mollist,atomCompare=rdFMCS.AtomCompare.CompareAny,bondCompare=rdFMCS.BondCompare.CompareOrderExact,ringMatchesRingOnly=onlyCompleteRings,completeRingsOnly=onlyCompleteRings,timeout=1).smartsString 20 | MCSSmarts=rdFMCS.FindMCS(mollist,atomCompare=rdFMCS.AtomCompare.CompareElements,bondCompare=rdFMCS.BondCompare.CompareOrder,ringMatchesRingOnly=onlyCompleteRings,completeRingsOnly=onlyCompleteRings,timeout=1).smartsString 21 | if MCSSmarts2=='': fChembl2=1 22 | else: fChembl2=getFChembl(MCSSmarts2,chembldb,Nchembl) 23 | if MCSSmarts=='': fChembl=1 24 | else:fChembl=getFChembl(MCSSmarts,chembldb,Nchembl) 25 | if fChembl2