├── .DS_Store ├── .gitignore ├── LICENSE ├── README.asciidoc ├── ch00_cover.asciidoc ├── ch01_introduction.asciidoc ├── ch02_installation.asciidoc ├── ch03_python.asciidoc ├── ch04_database.asciidoc ├── ch05_rdkit.asciidoc ├── ch06_similarity.asciidoc ├── ch07_graph.asciidoc ├── ch08_visualization.asciidoc ├── ch09_qsar.asciidoc ├── ch10_deeplearning.asciidoc ├── ch11_dlqsar.asciidoc ├── ch12_generativemodels.asciidoc ├── ch13_beyond.asciidoc ├── images ├── by-nc-sa.png ├── ch02 │ └── anaconda01.png ├── ch04 │ ├── chembl01.png │ ├── chembl02.png │ ├── chembl03.png │ ├── chembl04.png │ ├── chembl05.png │ ├── chembl06.png │ ├── chembl07.png │ └── zinc01.png ├── ch05 │ ├── ch05_01.png │ ├── ch05_02.png │ ├── ch05_03.png │ ├── ch05_04.png │ ├── ch05_05.png │ ├── ch05_06.png │ ├── ch05_07.png │ └── ch05_08.png ├── ch06 │ ├── apx_rvx.png │ ├── apx_rvx_suf.png │ ├── cls01.png │ └── vs01.png ├── ch07 │ ├── chemviz2.png │ ├── mcs01.png │ ├── mcs02.png │ ├── mcs03.png │ ├── mcs04.png │ ├── mcs05.png │ ├── mmp01.png │ ├── mmp02.png │ ├── mmp03.png │ ├── mmp04.png │ ├── mms01.png │ └── scaffold.png ├── ch08 │ ├── pca01.png │ ├── pca02.png │ └── tsne01.png ├── ch10 │ ├── ch10_1.png │ ├── ch10_2.png │ └── ch10_3.png ├── ch11 │ ├── ch11_01.png │ └── ch11_nfp.png ├── jupyter.png ├── mishimasyk.png ├── python_for_ci.png └── souyakuchan.png ├── mkpdf.sh ├── notebooks ├── Chembl_FXa.txt ├── ch05_Sildenafil vs Vardenafil.ipynb ├── ch05_compounds.sdf ├── ch05_hetero_shuffle.ipynb ├── ch05_rdkit.ipynb ├── ch06_nov_hts.sdf ├── ch06_similarity.ipynb ├── ch07_MCS.ipynb ├── ch07_MMS.ipynb ├── ch08 │ ├── CHEMBL2380240.sdf │ ├── CHEMBL3098111.sdf │ ├── CHEMBL3112474.sdf │ ├── CHEMBL3351489.sdf │ ├── CHEMBL3352684.sdf │ ├── CHEMBL3526050.sdf │ ├── CHEMBL3739366.sdf │ ├── CHEMBL3739395.sdf │ ├── CHEMBL3769367.sdf │ └── CHEMBL3867477.sdf ├── ch08_compounds.txt ├── ch08_visualization.ipynb ├── ch09_compounds.txt ├── ch09_qsar.ipynb ├── ch11_simple_dnn.ipynb └── ch12_rnn.ipynb ├── pdf └── 
py4chemoinformatics.pdf ├── py4c-theme.yml └── py4c.asciidoc /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | fonts 2 | *-checkpoint.ipynb 3 | */.ipynb_checkpoints/* 4 | .ipynb_checkpoints 5 | */.DS_Store 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 
23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. 
More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. 
Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. 
For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. 
For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. 
Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. 
identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. 
You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. 
THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 
371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. 
Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. 438 | 439 | -------------------------------------------------------------------------------- /README.asciidoc: -------------------------------------------------------------------------------- 1 | = Table of Contents 2 | :imagesdir: images 3 | 4 | *Update 03_2019:* forked and tried to translate to english. Corrections are welcome. 5 | 6 | I added a little (2018.12.12). Since the web interface is likely to be beyond the scope of introductory, I will consider how to do it. 
7 | 8 | - https://asciidoctor.org/docs/asciidoc-syntax-quick-reference/#formatted-text[AsciiDoc Syntax Quick Reference] 9 | 10 | image::python_for_ci.png[py4chemoinformatics, width=250] 11 | 12 | == link:ch01_introduction.asciidoc[01 Introduction] 13 | 14 | - What is chemoinformatics? 15 | - What is RDKit? 16 | - Target audience 17 | - Acknowledgment 18 | - License 19 | 20 | == link:ch02_installation.asciidoc[02 Let's prepare the environment for chemoinformatics] 21 | 22 | - Anaconda(Python, Jupyter, scikit-learn) 23 | - RDKit 24 | 25 | == link:ch03_python.asciidoc[03 Basics of Python programming] 26 | 27 | - Python basics 28 | - Let's use it conveniently with Jupyter notebook 29 | - To do machine learning with Python 30 | 31 | == link:ch04_database.asciidoc[04 Public database for chemoinformatics] 32 | 33 | - ChEMBL 34 | - PubChem 35 | - Search for the information you want on ChEMBL 36 | 37 | == link:ch05_rdkit.asciidoc[05 Handling Structural Information with RDKit] 38 | 39 | - What is SMILES? 40 | - Let's draw the structure 41 | - How to handle multiple compounds at once? 
42 | 43 | == link:ch06_similarity.asciidoc[06 Try to evaluate the similarity of compounds] 44 | 45 | - Descriptor, fingerprint 46 | - Calculate similarity 47 | - Virtual screening 48 | 49 | == link:ch07_graph.asciidoc[07 Evaluation of similarity using graph structure] 50 | 51 | - Classification by major skeleton (MCS) 52 | - Compound Network by Matched Molecular Pair 53 | - Visualize MMP networks using Cytoscape 54 | 55 | == link:ch08_visualization.asciidoc[08 I want to view many compounds at once] 56 | 57 | - What is Chemical Space? 58 | - Mapping using tSNE 59 | 60 | == link:ch09_qsar.asciidoc[09 Basics of Quantitative Structure-Activity Relationship (QSAR)] 61 | 62 | - Consider the cause of the effect (Classification problem) 63 | - Predict the efficacy of drugs (regression problem) 64 | - Model applicability (applicability domain) 65 | 66 | == link:ch10_deeplearning.asciidoc[10 Introduction to Deep-Learning] 67 | 68 | - About TensorFlow and Keras 69 | - Google colab 70 | - Let's install 71 | 72 | == link:ch11_dlqsar.asciidoc[11 Structure-activity relationship using deep-learning] 73 | 74 | - Predictive model construction using DNN 75 | - I will devise a descriptor (neural fingerprint) 76 | 77 | == link:ch12_generativemodels.asciidoc[12 Let the computer think about chemical structure] 78 | 79 | - Structure generation using Recurrent Neural Network 80 | 81 | == link:ch13_beyond.asciidoc[13 Conclusion] 82 | 83 | - Final remarks and further reading 84 | -------------------------------------------------------------------------------- /ch00_cover.asciidoc: -------------------------------------------------------------------------------- 1 | = Introduction to chemoinformatics for AI drug discovery 2 | @fmkz___, @iwatobipen 3 | v0.40002(Draft) 2019/03/20 4 | :toc: 5 | :toc-title: Table of Contents 6 | :lang: en 7 | :doctype: book 8 | :docname: Introduction to chemoinformatics for AI drug discovery 9 | :imagesdir: ./images 10 | :pdf-fontsdir: fonts 11 | :pdf-style: py4c-theme.yml 12 | 
:source-highlighter: coderay 13 | :title-logo-image: image::souyakuchan.png[mishima.syk] 14 | -------------------------------------------------------------------------------- /ch01_introduction.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 1: Introduction 2 | :imagesdir: ./images 3 | 4 | Chemoinformatics is a methodology that is used to analyze mainly chemical-related data using a computer and solve various problems. The term chemoinformatics was defined in the late 1990s and early 2000s, and in the pharmaceutical industry and pharmaceutical academia it is used in a wide variety of processes, including analyzing the relationship between drug effects and compound characteristics, visualizing large amounts of compound information, and clustering compounds based on similarity. 5 | 6 | In recent years, applications of deep learning to drug discovery have been explored. Research is being actively conducted not only on QSAR (Quantitative Structure-Activity Relationship) for predicting activity and physical properties, but also on areas that conventional chemoinformatics did not address, such as **new design proposals** and **synthetic route proposals**. 7 | 8 | 9 | Compound design is innovative 10 | 11 | **** 12 | What kind of compound should we make in the first place? And how to synthesize it? Thinking these questions through is an area where background knowledge and imagination are required, and conventionally it has been regarded as a difficult area for anyone other than humans to handle, but the advance of what is also called AI into such areas has progressed rapidly in the last several years (2017-2019). 13 | **** 14 | 15 | Chemoinformatics has already been used in various situations, but there was not much relevant information. 
There are several possible reasons for this, but there is no doubt that the two main reasons are that there were no open source toolkits and no public databases. However, with the advent of an open source chemoinformatics toolkit called RDKit and a public database called ChEMBL, this has been resolved. 16 | 17 | In recent years, in chemoinformatics as in bioinformatics, a lot of information can be obtained immediately by searching on the web, and it is possible to learn by yourself, but as a set of information for taking a first step, we decided to prepare "content with which you can learn the basics of chemoinformatics and apply them". Considering the recent AI drug discovery boom, the later chapters cover compound activity prediction and compound proposal using deep learning used in the context of “AI drug discovery”, so with this one-stop learning you should be able to keep up with the recent trends. 18 | 19 | <<< 20 | 21 | === What is RDKit 22 | 23 | warning:: Here is a subsection of @iwatobipen's talk about RDKit. At the draft stage, the words such as "I will say" or "based on" are used as they are, and the self-proclaimed is a "comprehensive" @iwatobipen-style style of "gozuru" tone. 24 | 25 | My name is @iwatobipen, and I am writing a part of this book. I'm going to talk passionately about RDKit here. 26 | 27 | What is the RD of RDKit? Actually, it is an abbreviation of **Rational Discovery**, and a framework that is the predecessor of the current open source version was developed in 2000. It is quite old. Then, in 2006, the code became open source and was released on sourceforge. Readers who think of OpenBabel besides RDKit when it comes to Python chemoinformatics toolkits are also welcome. OpenBabel was first released in 2005. Both are toolkits with more than 10 years of history. I remember that OpenBabel was the major toolkit around 2012, when I began to be interested in this area. 
At that time, there were almost no articles in Japanese, and the author of this section of the link:https://kzfm.hatenablog.com/archive[book] wrote RDKit code by trial and error, referring to the link:https://kzfm.hatenablog.com/archive[chemo info] cookbook of @fmkz___, who is a co- 28 | author of this book and a pioneer in the industry. If you want to keep track of chemoinfo related history, you should read this link:http://blog.kzfmix.com/entry/1542711744[article]. 29 | 30 | 31 | Developer Greg Landrum says 32 | 33 | [quote, Greg Landrum] 34 | RDKit is the Swiss Army Knife in chemoinformatics and is a collection of various functional pieces 35 | 36 | This expression hits the mark exactly. As you can see if you look at the link:https://www.rdkit.org/docs/[official document], it already has various features, starting with reading and writing of compound information, drawing of structure, 3D structure conformation generation, R group decomposition, descriptor and fingerprint calculation, pharmacophore calculation, etc. It can cover a wide range from analysis to visualization. Furthermore, the tools developed by contributors and others using RDKit are packed in the link:https://github.com/rdkit/rdkit/tree/master/Contrib[Contrib] folder along with their enthusiasm. How do you want to use it? Now I want to write code with RDKit as soon as possible, I can't wait ;) 37 | 38 | NOTE: @iwatobipen is, of course, one of the contributors, and provides code called link:https://github.com/rdkit/rdkit/tree/master/Contrib/Fastcluster[Fastcluster] for quickly clustering large compound libraries. (by @fmkz___) 39 | 40 | RDKit also has an active developer and user community, with more features being added. The style in which talented researchers from all over the world build up and develop as a whole is the strength and attraction of open source. If you have a chance, consider joining the annual RDKit User Group Meeting. 
Nothing can really replace face-to-face discussion among users. In addition, I said that there was almost no information in Japanese at the time when I began to use it, but in recent years there have been a lot of very good Japanese articles. Here are a few examples: There are many articles posted on Qiita. 41 | 42 | In addition, link:http://rdkit-users.jp/[RDKit-users-jp] by volunteers has also been launched. If asking a question in English seems a bit difficult, you can ask it here. Also, Japanese documents are merged into the latest version of RDKit's repository. This will also be helpful. This document only uses some of RDKit's features. You should still feel that you can do a lot of things. Once you have taken the first step of interest, you should go ahead with your own interest and motivation. If you do not understand something, ask the above community or post it to the repository of this book as an issue. **Well then let's get started!** 43 | 44 | ==== Main Japanese Commentary Sites 45 | 46 | - link:http://rdkit-users.jp/[rdkit-users.jp] 47 | - link:https://magattaca.github.io/RDKit_unofficial_translation_JP/[RDKitドキュメンテーション非公式日本語版サイト:Unofficial site of rdkit documentation] 48 | - link:https://future-chem.com/[化学の新しいカタチ:The shape of new chemistry] 49 | 50 | === Target audience 51 | 52 | The following people are assumed as readers. 
53 | 54 | - Postdoctoral student who wants to do data analysis of graduate students in pharmacy and medicine and pharmacy 55 | - Pharmacist at a pharmaceutical company who wants to analyze his own data 56 | - Those who feel the need for chemoinformatics in drug discovery chemists and those who are assigned suddenly due to the power of mystery 57 | - Bioinformaticians who are thinking of learning chemoinformatics 58 | - People who are interested in AI drug discovery but do not know what to start with 59 | 60 | === About the code of this book 61 | 62 | All of the programming code used in this book is located in the notebooks directory of the link:https://github.com/Mishima-syk/py4chemoinformatics[py4cheminformatics repository of Mishima.syk]. The first one of each of the image:jupyter.png[width="20"] chapter please see properly because it stretched a link to the chapter of Jupyter notebook to. 63 | 64 | The installation of Chapter 2 will enable you to use git commands, so you can download all the data in this manual including pdf with the following command 65 | 66 | [source, bash] 67 | ---- 68 | $ git clone https://github.com/Mishima-syk/py4chemoinformatics.git 69 | ---- 70 | 71 | === bonus 72 | 73 | .Chemoinformatics or Cheminformatics? 74 | **** 75 | Chemoinformatics or Cheminformatics? 76 | Originally I remember that Bio and the combination of the word “Chemo” appeared, but it was widely separated from Chem for a while by the launch of the link:https://jcheminf.biomedcentral.com/[Journal of Cheminformatics]. 77 | 78 | According to the recent link:https://trends.google.co.jp/trends/explore?date=all&q=chemoinformatics,cheminformatics[Google trend], it seems either way, but personally I think that it is better to put emphasis on Rhyme, so I will use Chemo in this book. 
79 | **** 80 | 81 | <<< 82 | 83 | === Acknowledgment 84 | 85 | We would like to thank the following people for their bug fixes and suggestions for improvement when writing this document: 86 | 87 | link:https://twitter.com/antiplastics[@antiplastics], 88 | link:https://twitter.com/bonohu[@bonohu], 89 | link:https://twitter.com/ReLuTropy[@ReLuTropy], 90 | link:https://twitter.com/ski_nanko[@ski_nanko], 91 | link:https://twitter.com/torusengoku[@torusengoku], 92 | link:https://twitter.com/yamasaKit_[@yamasaKit_], 93 | link:https://twitter.com/4Elemento[@4Elemento], 94 | @4Elemento, thanks a lot for the translation task!!!! (from @iwatobipen) 95 | 96 | From here onwards I wrote while listening to Nujabes-reflection eternal by @fmkz___ 20/03/20 97 | 98 | First of all, I would like to thank link:https://twitter.com/bonohu[@bonohu], whose book link:https://www.amazon.co.jp/dp/4895929019[Dr. Bono's analysis of life science data] triggered me to write this book. At a meeting of Mishima.syk we talked about how "a Chemoinformatics version of the Bono book" would be nice. There is no doubt that what triggered me to write this book was the thought, "Well, if so, why not write it?" Also, link:https://twitter.com/souyakuchan[@souyakuchan]'s link:https://adventar.org/calendars/3041[Drug Advent Calendar 2018, written in Japanese] has also become a good stimulus for writing. In other words, I think that I did not start to move specifically if I did not make a chapter here. 99 | 100 | Also, it is the existence of y-sama that should not be forgotten. link:http://mishima-syk.github.io/[Mishima.syk] y-sama has been away at the beginning and has fallen forever on 2019/01/06. 
He wrote wonderful post such as link:https://qiita.com/y\__sama/items/5b62d31cb7e6ed50f02c[Python environment construction of the person who aims at the data scientist 2016] and link:https://medium.com/@y__sama/druglikeness%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6%E3%81%AE%E3%82%88%E3%82%82%E3%82%84%E3%81%BE%E8%A9%B1-8310cec5ffc6[Small talk about drug likeness: written in Japanese]. If he was alive, we would probably write by three people and the content would have been more complete. This event also gave us a strong motivation to write. 101 | 102 | Finally, I would like to thank the participants who participated in Mishima.syk for drinking good wine and beer and having a hot discussion every time. Some content is based on the presentation at Mishima.syk, and has been revised based on your feedback. 103 | 104 | If you have read this book, and if you feel that chemoinformatics is interesting or you want to do drug discovery, please join Mishima.syk. I think it will be fun. In future drug discovery research, it will be important to push each other across affiliations and improve their skills. In fact, I think it is already such a society. I hope this book will help you have a pleasant research life. 105 | 106 | [quote, y__sama] 107 | I do what I want to do I live myself, I have no regrets in my life. 108 | Life enjoys winning. 109 | I think it would be fun to enjoy your life by chasing your joy to the fullest by saying that you hate something you hate. 110 | I wish you all the best in your life. 111 | 112 | === License 113 | 114 | This document is copyright (C) 2019 by @fmkz___ and @iwatobipen 115 | 116 | This document is link:https://github.com/Mishima-syk/py4chemoinformatics/blob/master/LICENSE[Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 117 | Public License]. 
118 | 119 | image::by-nc-sa.png[CC-BY-NC-SA, width=100] 120 | 121 | <<< 122 | -------------------------------------------------------------------------------- /ch02_installation.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 2: Prepare the Environment for Chemoinformatics 2 | :imagesdir: images 3 | 4 | We will build the environment required for this document. 5 | 6 | === About Anaconda 7 | 8 | Anaconda is a package for easy environment creation and management for doing machine learning. You can also easily install packages, like RDKit, which will be explained later. 9 | 10 | 11 | ==== Q&A 12 | 13 | Why use Anaconda?:: 14 | The programming language Python has a relatively large number of standard libraries, but you need to install the libraries for chemoinformatics yourself. This is not a big deal if you get used to it, but it will be troublesome for beginners. Anaconda comes into play in order to reduce this effort. 15 | 16 | 17 | There are two major versions of Python: 2.x and 3.x.:: 18 | link:https://pythonclock.org/[Support for 2.x will end in 2020], so new learners do not need to use 2.x. 19 | 20 | === How to install Anaconda 21 | 22 | Now let's install Anaconda. Visit the link:https://www.anaconda.com/[official site] and download the Python 3 installer for your environment. If the OS is Linux / Mac, you can select the installer of GUI / CUI, so download Python 3.7 64-bit command line installer. 23 | 24 | image::ch02/anaconda01.png[APX+RVX, width=600, pdfwidth=60%] 25 | 26 | [source, bash] 27 | ---- 28 | $ bash ~/Downloads/Anaconda3-4.1.0-Linux-x86_64.sh # Please change the installer name accordingly 29 | ---- 30 | 31 | Press Enter 32 | 33 | [source, bash] 34 | ---- 35 | Welcome to Anaconda3 2018.12 36 | 37 | In order to continue the installation process, please review the license 38 | agreement. 
39 | Please, press ENTER to continue 40 | >>> 41 | ---- 42 | 43 | Keep pressing Enter to scroll through the license, then type yes at the yes/no prompt 44 | 45 | [source, bash] 46 | ---- 47 | Do you accept the license terms? [yes|no] 48 | [no] >>> 49 | ---- 50 | 51 | You will be asked where to install, but the default location is usually fine. 52 | Press Return. 53 | 54 | [source, bash] 55 | ---- 56 | Anaconda3 will now be installed into this location: 57 | /Users/kzfm/anaconda3 58 | 59 | - Press ENTER to confirm the location 60 | - Press CTRL-C to abort the installation 61 | - Or specify a different location below 62 | ---- 63 | 64 | After the installation you will also be asked whether you want to install VSCode, so answer no. 65 | 66 | [source, bash] 67 | ---- 68 | Thank you for installing Anaconda3! 69 | 70 | =========================================================================== 71 | 72 | Anaconda is partnered with Microsoft! Microsoft VSCode is a streamlined 73 | code editor with support for development operations like debugging, task 74 | running and version control. 75 | 76 | To install Visual Studio Code, you will need: 77 | - Internet connectivity 78 | 79 | Visual Studio Code License: https://code.visualstudio.com/license 80 | 81 | Do you wish to proceed with the installation of Microsoft VSCode? [yes|no] 82 | >>> Please answer 'yes' or 'no': 83 | >>> 84 | ---- 85 | 86 | Once the Anaconda installation is complete, you will be able to use the 'conda' command from a command prompt or terminal. 87 | 88 | === Build a Virtual Environment and Install a Package 89 | 90 | Python installed with Anaconda is 3.7, but the latest RDKit distributed at the time of this writing requires Python 3.6. So build a virtual environment with conda and install the required version of Python. The name given after -n in the command is "py4chemoinformatics" here, but you can use any name you like. After creating the virtual environment, install the packages used in this chapter and later. 
91 | 92 | [source, bash] 93 | ---- 94 | $ conda create -n py4chemoinformatics python=3.6 95 | $ source activate py4chemoinformatics # Mac/Linux 96 | $ activate py4chemoinformatics # Windows 97 | 98 | # install packages 99 | $ conda install -c conda-forge rdkit 100 | $ conda install -c conda-forge seaborn 101 | $ conda install -c conda-forge ggplot 102 | $ conda install -c conda-forge git 103 | ---- 104 | 105 | === Description of installed package 106 | 107 | ==== RDKit 108 | 109 | RDKit is one of the most commonly used toolkits in the field of chemoinformatics. It is open source software (OSS) and can be used free of charge. For more information, please refer to link:ch01_introduction.asciidoc[Introduction]. 110 | 111 | ==== seaborn 112 | It is one of the packages for link:https://seaborn.pydata.org/[visualizing statistical data]. 113 | 114 | ==== ggplot 115 | 116 | It is one of the graph drawing packages; its feature is that it can draw plots rationally with a consistent grammar. Originally developed for the statistical analysis language R, it was ported to Python by the company link:http://ggplot.yhathq.com/[yhat]. 117 | 118 | ==== Git 119 | 120 | It is a version control system. I will not explain Git in this book, but if you do not know Git at all, take a look at link:https://backlog.com/ja/git-tutorial/[Git Primer], a tutorial billed as one that "even a monkey can understand". 121 | 122 | As explained in "Introduction", all data including pdf will be downloaded by the following command, so please download it as necessary. 123 | 124 | [source, bash] 125 | ---- 126 | $ git clone https://github.com/Mishima-syk/py4chemoinformatics.git 127 | ---- 128 | 129 | === Learn more about Conda 130 | 131 | Why create a virtual environment:: 132 | Some systems use Python internally to provide various features, so changing the Python version for a particular package can cause problems. Virtual environments solve these problems. 
Even if the package requires different library versions, you can set up a virtual Python environment for trial and error. If it becomes unnecessary, the virtual environment can be easily deleted without causing any problems in the original environment. So, by being able to create separate development environments in one system, you will not be bothered by library dependency problems and Python version differences that often occur during development. 133 | 134 | In this document, only one virtual environment is prepared, but in practice many virtual environments are often created and used for development. Therefore, I will list the conda subcommands that I use frequently. 135 | 136 | [source, bash] 137 | ---- 138 | $ conda install <package_name> # install package 139 | $ conda create -n <env_name> python=<version> # Create virtual environment. 140 | $ conda info -e # Display virtual environment list created 141 | $ conda remove -n <env_name> --all # Virtual environment deletion 142 | $ source activate <env_name> # Using virtual environment (Mac/Linux) 143 | $ activate <env_name> # Using virtual environment (Windows) 144 | $ source deactivate # leaving virtual environment 145 | $ conda list # Display a list of libraries installed in the virtual environment you are using now 146 | ---- 147 | 148 | <<< 149 | -------------------------------------------------------------------------------- /ch03_python.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 3: Basics of Python programming 2 | 3 | === Python basics 4 | 5 | This chapter introduces web sites and books **for effective learning** for python beginners. 6 | If there is something you do not understand in the following chapters, this information will help you. 
7 | //// 8 | この章ではPythonに触れたことのない読者のために**効率的に勉強するため**のサイトや本などを紹介します。 9 | もしこれ以降の章でわからないことなどがあったら、この章のサイトや本を参考に学んでみてください。 10 | //// 11 | 12 | ==== Would like to learn Python from books 13 | 14 | https://www.amazon.co.jp/dp/4774196436/[Pythonスタートブック増補改訂版:Python start book]:: 15 | We recommend the book if you are beginner of programming. 16 | 17 | https://www.amazon.co.jp/dp/B01NCOIC2P/[みんなのPython 第4版:Python for everyone]:: 18 | If you have any experience in programming such as Javascript and/or Java, and would like to learn python now, this book is recommended. 19 | //// 20 | JavascriptやJavaなどのなにかプログラミングを少しかじっていて、これからPythonを覚えたいのであればこちらの本をおすすめします。 21 | //// 22 | 23 | ==== Would like to learn Python from any sources 24 | 25 | https://www.pycon.jp/support/bootcamp.html[Python Boot Camp(tutorial for python biginners)]:: 26 | This is a python tutorial event for beginner held by PyconJP. The events held on all the places of Japan. How about join the event when it take place neighbour? 27 | //// 28 | 一般社団法人PyCon JPが開催している初心者向けPythonチュートリアルイベントです。全国各地で行われているので近くで開催される場合には参加するとよいでしょう 29 | //// 30 | 31 | https://connpass.com/category/Python/[Local communities]:: 32 | It seems good to increase your motivation to join study workshops for beginners or for professionals. You can find many workshops in connpass site. 33 | //// 34 | あちこちで入門者向けからガチのヒト向けまでの勉強会やコミュニティなどもあるので、そういうのに参加してモチベーションを高めるのもよい方法です。 35 | //// 36 | 37 | https://www.udemy.com/topic/python/[udemy/python]:: 38 | It is effective way to learn programming with online learning service but we have never tried. You should ask a reputation around. And also there are many resources in YouTube. 
39 | //// 40 | オンライン学習サービスを利用するのも効果的な手段のひとつですが、筆者は試したことがないのでわかりません。 41 | 周りの評判を聞いてみても良いでしょう。YouTubeを探すのもありです。 42 | //// 43 | 44 | ==== If you have something that is not understood in this book 45 | 46 | https://github.com/Mishima-syk/py4chemoinformatics/issues[py4chemoinformaticsのissues]:: 47 | We are happy to answer your question if you put questions in the issue of py4chemoinformatics. If there are something that is difficult to understand we will correct them. 48 | The cycle will make the document better and everybody will be happy ;) 49 | //// 50 | py4chemoinformaticsのissuesに質問していただければお答えします。わかりにくい場合だったら修正しますので、よりよくなってみんなハッピー。 51 | //// 52 | 53 | //// 54 | https://qiita.com/[Qiita]:: 55 | Qiitaで探せば大抵答えが見つかるはずです。 56 | Qiita is a community for Japanese. All documents are written in Japanese 57 | //// 58 | 59 | https://stackoverflow.com/[stackoverflow]:: 60 | Stack Overflow is good community. You should search in SOF first and then ask the community. 61 | 62 | http://mishima-syk.github.io/[Mishima.syk]:: 63 | Mishima.syk is the community where people who write the book gather. Topics are not limited to python but there are many presentations about python now. Discussion level is high but the community is also beginner friendly. We have planned hands-on sessions and they have an established reputation. The community members should be able to answer your questions. 64 | //// 65 | 本書を書いている人たちが集まるコミュニティです。特に話題をPythonに限定していませんが、Pythonを使ったネタが多めです。かなりガチですが、初心者対応も万全でハンズオンに定評があります。質問されれば大体答えられます。 66 | //// 67 | 68 | === Let's use it conveniently with Jupyter notebook 69 | By using link:https://jupyter.org/[Jupyter notebook], it is easy to write code and check the results. 70 | //// 71 | link:https://jupyter.org/[Jupyter notebook]を利用すると、コードを書いて結果を確認するということがとても簡単にできるようになります。 72 | //// 73 | 74 | The Jupyter Notebook is an open-source web application that allows you to embed code, rich text, math equation and etc. 
And it is easy to make high quality visualizations of the results. It is a nice platform for chemoinformatics because Jupyter Notebook can run code and draw chemical structures and many kinds of plots. Also, it has many features which improve programming productivity such as syntax highlight and auto indent. We recommend to use Jupyter especially for programming beginners. 75 | //// 76 | Jupyter notebookはWebブラウザーベースのツールで、コードだけではなくリッチテキスト、数式、なども同時にノートブックに埋め込めます。また結果を非常に綺麗な図として可視化することも容易にできます。つまり、化学構造やグラフも描画できるため、ケモインフォマティクスのためのプラットフォームとして使いやすいです。さらに、プログラミングの生産性を上げるような、ブラウザ上でコードを書くとシンタックスハイライトや、インデント挿入を自動で行ってくれたりという便利な機能もついているので、特に初学者は積極的に使うべきでしょう。 77 | //// 78 | 79 | ==== How to use? 80 | 81 | from terminal (in Windows, anaconda prompt) 82 | 83 | [source, bash] 84 | ---- 85 | $ jupyter notebook 86 | ---- 87 | 88 | After type the command above, Jupyter Notebook will be launched. In this book, all code is run on Jupyter Notebooks. 89 | //// 90 | と打てばJupyter Notebookが立ち上がります。本書ではこれ以降特に断らない限りJupyter Notebook上でのコードを実行することとします。 91 | //// 92 | 93 | === For machine learning with Python 94 | 95 | **Machine learning** is a must for learning informatics not only chemoinformatics. Some background knowledge of machine learning is required in the following sessions. link:https://scikit-learn.org/stable/[Scikit-learn] is used for machine learning with python. Scikit-learn is de facto standard for machine learning library for python. We use the package without any descriptions but we would like to share some links for beginners. 96 | 97 | link:http://shop.oreilly.com/product/0636920030515.do[Introduction to Machine Learning with Python]:: 98 | You can learn basics of machine learning with python. It is easy to read because there is less mathematical representations. 99 | 100 | link:https://github.com/Mishima-syk/sklearn-tutorial[sklearn-tutorial]:: 101 | Sklearn tutorial hands-on by @y-sama. Written in jupyter notebook. 
102 | 103 | //// 104 | ケモインフォマティクスに限らず、インフォマティクスを学ぶにあたり、機械学習は外せません。本書でもある程度の機械学習の知識があることを前提に進めていきます。Pythonで機械学習をするにはlink:https://scikit-learn.org/stable/[Scikit-learn]というライブラリを利用するのが定番であり、本書でも特に説明せずに利用していきますが、初学者のために参考となる書籍などをすすめておきます。 105 | 106 | link:https://www.amazon.co.jp/dp/4873117984/[Pythonではじめる機械学習 ―scikit-learnで学ぶ特徴量エンジニアリングと機械学習の基礎]:: 107 | Pythonで機械学習をやるための基礎を学べます。数学的な表現があまりないので読みやすいです。 108 | 109 | link:https://github.com/Mishima-syk/sklearn-tutorial[sklearn-tutorial]:: 110 | y-samaによるsklearnのチュートリアルハンズオンのjupyter notebookです。 111 | //// 112 | 113 | <<< 114 | -------------------------------------------------------------------------------- /ch04_database.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 4: Public databases for chemoinformatics 2 | :imagesdir: images 3 | 4 | //// 5 | この章ではケモインフォマティクスでよく使うデータベースを紹介します。 6 | //// 7 | The section describes common databases which are used for chemoinformatics. 8 | 9 | === ChEMBL 10 | 11 | //// 12 | link:https://www.ebi.ac.uk/chembl/[ChEMBL]はEBIのChEMBLチームにより維持管理されている医薬品及び開発化合物の結合データ、薬物動態、薬理活性を収録したデータベースです。データは主にメディシナルケミストリ関連のジャーナルから手動で抽出されており、大体3,4ヶ月に一度データの更新があります。 13 | 14 | メディシナルケミストリ関連のジャーナルからデータを収集しているため、QSARに関連する情報や背景知識を論文そのものに求めることが可能であり、創薬研究をする際には有用です。 15 | 16 | NOTE: ChEMBLはもともとはlink:http://chembl.blogspot.com/2009/11/faq-where-can-i-download-starlite.html[StARlite]という商用データベースでした。詳しくはlink:http://cbi-society.org/home/documents/seminar/2009to12/CBI_Ikeda_511_d.pdf[慶応大学池田先生のChEMBLに関する資料]を参照してください。 17 | //// 18 | link:https://www.ebi.ac.uk/chembl/[ChEMBL] is a manually curated database of ADMET, physchem and bioactive molecules with drug like properties. The data is mostly curated from medicinal chemistry journals and updated every 3-4 months. 19 | 20 | The database is useful for drug discovery research because user can access a QSAR information and background knowledge of original reference journal from the database. 
21 | 22 | NOTE: Originally, ChEMBL was a commercial database named link:http://chembl.blogspot.com/2009/11/faq-where-can-i-download-starlite.html[StARlite]. Details are described in this slide deck link:https://www.ebi.ac.uk/sites/ebi.ac.uk/files/content.ebi.ac.uk/materials/2012/121008_SME/chembl_-_anne_hersey.pdf[about ChEMBL]. 23 | 24 | === PubChem 25 | 26 | //// 27 | link:https://pubchem.ncbi.nlm.nih.gov/[PubChem]はNCBIにより維持管理されている低分子化合物とその生物学的活性データを収録している公開リポジトリです。5000万件以上の化合物情報と、100万件を超えるアッセイデータを含みそのデータ量の多さが特徴とも言えます。もうひとつの特徴はデータをアカデミアからの化合物登録やアッセイ結果の登録により成長することであり、ここが先のChEMBLとの大きな違いです。 28 | 29 | 特にPubChemは初期スクリーニングのデータが多いため、そのようなデータに対しなんらかのマイニングや分析を行いたい場合は有用だと考えられます。 30 | 31 | どちらを使うべき?:: 32 | QSARをやりたい場合にはやはりChEMBLのデータを利用することが多いです。IC50のようなデータが得られていることが多いですし、モデルの解釈に元論文をあたることができるというのが大きな理由です。 33 | //// 34 | link:https://pubchem.ncbi.nlm.nih.gov/[PubChem] is an open chemistry database of molecules and their biological activities which is maintained by NCBI. It has data on more than 50 million compounds and more than 1 million biological assay datasets. Its large amount of data is one of the main features of PubChem. Another feature is that the database grows through compound and assay data registration from academia, which is the biggest difference from ChEMBL. 35 | You can check more details of the data sources at the current link:https://pubchem.ncbi.nlm.nih.gov/sources/[URL]. 36 | 37 | In particular, PubChem has a large amount of early stage screening data, so it will be useful when the user would like to analyze or mine such data. 38 | 39 | Which database should I use, ChEMBL or PubChem?:: 40 | We think ChEMBL is preferred for QSAR analysis because ChEMBL provides many data such as IC50 and the user can access the original journal article for QSAR model interpretation. 
41 | 42 | === Search Data which you want in ChEMBL 43 | 44 | //// 45 | NOTE: ChEMBLはユーザーインターフェースを刷新中で現在beta版のテストを行っていますが、いずれこちらに置き換わると思うので新バージョンのインターフェースでの検索方法を紹介します。 46 | 47 | まずはlink:https://www.ebi.ac.uk/chembl/[ChEMBL]にアクセスし、画面上部のCheck out our New Interface (Beta). というリンクをクリックして新しいインターフェース画面に移行します。 48 | 49 | image::ch04/chembl01.png[ChEMBL] 50 | 51 | ChEMBLのデータは主に4つのカテゴリに分かれていて、一意なIDが振られており相互に関連付けされています。それぞれのカテゴリについて簡単に説明すると 52 | 53 | Targets:: 54 | ターゲット分子についてその分子を対象としてアッセイされた論文に関してまとめられており、どういったジャーナルに投稿されているかや、どの年に投稿されたのかといった情報がまとめられています。また、アッセイに関しても同様にまとめられています。 55 | Compounds:: 56 | 化合物に関する基本的な物理量(分子量など)のほか、Rule of 5を満たしているかといった分子の特性情報や、臨床情報などの創薬関連情報のほか、ChEMBLでの関連アッセイ、関連論文のサマリがまとめられています。 57 | Assays:: 58 | アッセイに関する情報と元論文との関連付けがされているほか、アッセイに供された化合物データへのリンクが貼られています。 59 | Documents:: 60 | 論文のタイトル、ジャーナル名、アブストラクトの他に関連論文データへのリンクと、その論文中で行われたアッセイへのリンクと使われた化合物データへのリンクが貼られています。 61 | //// 62 | NOTE: The user interface of ChEMBL is being refreshed and a beta version is being tested now. This section describes how to search data from the new UI because it will become the main UI in the near future. 63 | 64 | At first, go to link:https://www.ebi.ac.uk/chembl/[ChEMBL] and click the link 'Check out our New Interface (Beta)' on the top of the screen. Then you will move to the new search page. 65 | 66 | image::ch04/chembl01.png[ChEMBL] 67 | 68 | ChEMBL mainly has 4 data categories; each record has a unique id and is related to the other categories. Brief introductions are below. 69 | 70 | Targets:: 71 | The category has assay and reported journal information of target molecules. 72 | Compounds:: 73 | The category has basic physicochemical properties of molecules such as Molecular Weight, and whether the molecule passes Lipinski's Rule of 5 or not. It also has other information about the molecule such as clinical information, related assays which are stored in ChEMBL and a summary of journals. 74 | Assays:: 75 | The category has the relationship between assay information and the original journal, and links to the compounds which were assayed. 
76 | Documents:: 77 | The category has journal name, title, abstract, links to related journals and links to data of the compounds which are used in the journal. 78 | 79 | 80 | ==== If you want to find compounds which are related to a specific target 81 | 82 | //// 83 | ある創薬ターゲット分子がどのくらい研究開発されているかを知るために、それをターゲットとしてどのくらいの化合物が合成されたのか?さらに骨格のバリエーションはどのくらい存在するのかを調べたい場合がよくあります。ChEMBLを利用するとターゲット名で探索して関連化合物をダウンロードすることができます。 84 | 85 | ここでは抗がん剤のターゲットとして知られているTopoisomerase2を検索します。画面上部のフォームにtopoisomeraseと入力して検索するとスクリーンショットのように表示されるはずです。 86 | //// 87 | It is very common that we want to know how extensively a target has been studied, how many compounds have been synthesized for it and how many kinds of scaffolds there are. With ChEMBL you can search by target name and download the related compounds. 88 | 89 | In this section, let's search for Topoisomerase2, which is known as a popular target of cancer chemotherapy treatments. When you input the word **topoisomerase** into the form which is located on top of the screen and search, you can see the result as below. 90 | 91 | image::ch04/chembl02.png[ChEMBL] 92 | 93 | //// 94 | サジェスト機能による絞り込みでいくつか候補をリスト表示してくるのでTOP2Bを選んでください。画面をスクロールするとAssociated Compoundsセクションがありますのでグラフのタイトル(Associated Compounds for Target CHEMBL3396)をクリックすると関連化合物一覧画面が開きます。 95 | //// 96 | The system provides a list of candidates with its suggest feature, so you should select TOP2B. You can find the 'Associated Compounds' section when you scroll the screen. Click the title of the graph named **Associated Compounds for Target CHEMBL3396**, then the list of related compounds will appear. 97 | 98 | image::ch04/chembl03.png[ChEMBL] 99 | 100 | //// 101 | 259化合物存在することがわかります。スクロールすると全体をみることができます。画面右のアイコンをクリックするとそれぞれCSV(カンマ区切りテキスト),TSV(タブ区切りテキスト),SDF(5章で説明しています)の形式でダウンロードできます。 102 | //// 103 | There are 259 compounds in the result. All data can be seen by scrolling the screen. The data can be downloaded in CSV, TSV or SDF format when you click the icons which are located on the top right side of the screen. 
104 | TIPS:: TSV means tab separated values, CSV means comma separated values 105 | 106 | image::ch04/chembl04.png[ChEMBL] 107 | 108 | ==== If you want to retrieve compound structures and assay data from ChEMBL 109 | 110 | //// 111 | QSARモデルを作る場合、アッセイの活性値と対応する化合物の構造情報が必要です。ChEMBLの場合アッセイのページからダウンロードすることでQSARモデル作成のためのデータを得ることができます。 112 | 113 | 大体次のような手順を辿ることがおおいです。 114 | 115 | - 論文データを検索してからそれに関連付けられているアッセイデータを辿る 116 | - ターゲットを検索してそれに紐付いているアッセイデータからQSARに使えそうなものを選ぶ 117 | 118 | ここでは後者のターゲットから検索してQSARモデルに使えそうなアッセイデータを探します。心毒性関連ターゲットとしてよく知られているhERGのQSARモデルを作りたいという状況を想定しています。 119 | 120 | 検索フォームにhERGと入力して、Search hERG for all in Assaysを選びます。361件ヒットしました。 121 | 122 | image::ch04/chembl05.png[ChEMBL] 123 | 124 | モデル構築のためのデータが欲しいのでデータ数が多い順に並べ替えます。ヘッダーのCompoundsをクリックして降順に並べ替えます。 125 | 126 | image::ch04/chembl06.png[ChEMBL] 127 | 128 | 論文由来で最もアッセイ数の多いCHEMBL829152を選んでクリックしてアッセイページを開きます。Activity chartの円グラフをクリックすると詳細画面が開くのでSelect allで全選択してTSV形式でダウンロードします。 129 | 130 | image::ch04/chembl07.png[ChEMBL] 131 | //// 132 | The structures and activity values of the compounds are needed when you would like to build a QSAR model. You can download the data for QSAR from the **Assay** page in ChEMBL. 133 | 134 | You will typically follow one of the two approaches below. 135 | 136 | - Search journal data and then retrieve assay data which is related to the journal. 137 | - Search the target which you want to use and retrieve assay data which is related to the target. 138 | 139 | In this section, let's try the second approach and retrieve data from the target. We suppose that we would like to build a QSAR model for hERG inhibition. hERG (the Kv11.1 channel) is best known for its contribution to the electrical activity of the heart, and hERG blockers have a risk of cardiotoxicity. 140 | 141 | Input **hERG** to the search form and push **Search hERG for all assays**. You will get 361 or more hits. 142 | 143 | image::ch04/chembl05.png[ChEMBL] 144 | 145 | Sort in descending order of number of data for modeling. 
Click **Compounds** on the header to do it. 146 | 147 | image::ch04/chembl06.png[ChEMBL] 148 | 149 | Click CHEMBL829152, which has the largest amount of data in the results, and the assay page will open. Click the pie chart of activity and the details of the data will be shown; then select all and download the data in TSV format. 150 | 151 | image::ch04/chembl07.png[ChEMBL] 152 | 153 | NOTE:: 154 | **** 155 | The data might look garbled when you open it in a text editor, like \^@C^@h\^@E^@M\^@B^@L^@. The reason is that the data is encoded as UTF-16LE. (Because the encoding is preferred for Excel) 156 | 157 | If you are using vi, you can fix the issue by just typing ':e ++enc=utf16le'. 158 | **** 159 | 160 | === Other useful databases 161 | 162 | ==== link:http://zinc15.docking.org/[ZINC] 163 | 164 | //// 165 | ZINCは購入可能な試薬をコレクションしたデータベースです。現在のバージョンは15で約7億5000万の構造が収載されています。 166 | もともとがドッキングシミュレーションでの利用を想定して開発されているため、三次元化したデータをダウンロードすることも可能です。ZINCのデータでバーチャルスクリーニング(6章で説明します)を行い、ヒットした化合物を購入し実際のアッセイに供するというのが主な使い方だと思います。 167 | 168 | データのダウンロード方法は上部のTranchesタブをクリックすると次の画面に縦軸にLogP横軸に分子量の大きさで分類されそれぞれの区画にいくつの化合物が収載されているかの表が表示されます。 169 | //// 170 | ZINC is a database which collects commercially available reagents. The current version is 15 and about 750 million compounds are recorded. 171 | Users can download 3D molecular structure data because the database was originally developed with docking simulation in mind. I think that the main usage is to conduct virtual screening (described in Chapter 6) with data from ZINC, purchase the hit compounds and assay them. 172 | 173 | How to download data? 174 | Click the Tranches tab; the next screen shows a table, divided by LogP on the vertical axis and molecular weight on the horizontal axis, of how many compounds are listed in each section. 
175 | 176 | image::ch04/zinc01.png[ZINC] 177 | 178 | //// 179 | ここから必要なデータセットを選んでダウンロードボタンを押すと、実際にデータセットのURLが列挙されたテキストファイルが得られますのでそれぞれにアクセスしてデータをダウンロードします。 180 | //// 181 | Select the datasets which you want and click the download button; you will get a text file which lists the URLs of the datasets. The data can be downloaded by accessing each URL. 182 | 183 | ==== link:http://togotv.dbcls.jp/[統合TV:Togo TV] 184 | 185 | //// 186 | 統合TVは生命科学分野の有用なデータベースやツールの使い方を動画で紹介するサイトで、link:https://dbcls.rois.ac.jp/[ライフサイエンス統合データベースセンター(DBCLS)]により管理、運用されています。その名の通りバイオインフォマティクス関連の動画が多いですが、ケモインフォマティクスを紹介した動画もいくつかありますので参考にしてみてください。link:http://togotv.dbcls.jp/information.html[文献・辞書・プログラミング]のカテゴリも役に立つはずです。 187 | //// 188 | Togo TV is a video site which describes how to use useful databases and tools in the life sciences, and is managed and maintained by link:https://dbcls.rois.ac.jp/[Database Center for Life Science(DBCLS)]. As its name suggests, there are many videos about bioinformatics, but some chemoinformatics videos are also provided. Please refer to the site. The link:http://togotv.dbcls.jp/information.html[journal・dictionary・programming] category might be useful too. 189 | **Language of TogoTV is Japanese** 190 | 191 | - link:https://doi.org/10.7875/togotv.2017.121[PubChemを利用して化学物質やアッセイの結果を調べる 2017/Search compound and assay data by using PubChem 2017] 192 | - link:https://doi.org/10.7875/togotv.2014.014[ChEMBLを使って医薬品候補となる化合物について調べる/Search drug candidate compounds with ChEMBL] 193 | 194 | //// 195 | NOTE:: これ以外にもケモインフォマティクスに有用なデータベースがあればお知らせください。IssueやPRでも受け付けてます。 196 | //// 197 | NOTE:: If you know other useful databases for chemoinformatics, please inform us. Issues or Pull requests are also appreciated. 
198 | 199 | <<< 200 | -------------------------------------------------------------------------------- /ch05_rdkit.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 5: Handling Structural Information with RDKit 2 | :imagesdir: images 3 | 4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch05_rdkit.ipynb"] 5 | 6 | //// 7 | この章ではRDKitを使って分子の読み込みの基本を覚えます。 8 | //// 9 | In this chapter we will learn the basics of reading molecules with RDKit. 10 | 11 | === What is SMILES? 12 | 13 | //// 14 | Simplified molecular input line entry system(SMILES)とは化学構造を文字列で表現するための表記方法です。 15 | 詳しくはlink:http://www.daylight.com/meetings/summerschool98/course/dave/smiles-intro.html#TOC[SMILES Tutorial]で説明されていますが、例えばc1ccccc1は6つの芳香族炭素が最初と最後をつないでループになっている構造、つまりベンゼンを表現していることになります。 16 | //// 17 | Simplified molecular input line entry system(SMILES) is a specification in the form of a line notation for describing the structure of chemical species using short ASCII strings. More detials are described in link:http://www.daylight.com/meetings/summerschool98/course/dave/smiles-intro.html#TOC[SMILES Tutorial]. For example **c1ccccc1** means that there are six aromatic carbon atoms and has a loop structure which is connected with start and end, you know it means benzene. 18 | 19 | === Let's draw chemical strcutre with SMILES :) 20 | 21 | //// 22 | SMILESで分子を表現することがわかったので、SMILESを読み込んで分子を描画させてみましょう。まずはRDKitのライブラリからChemクラスを読み込みます。二行目はJupyter Notebook上で構造を描画するための設定です。 23 | //// 24 | We could understand SMILES can represent molecules, so let's read SMILES and draw molecule. At first import Chem class from RDKit to do that. And the function in the second line named 'IPythonConsole' is read for drawing molecules on Notebook. 
25 | **The majority of the basic molecular functionality is found in module rdkit.Chem** 26 | 27 | [source, python] 28 | ---- 29 | from rdkit import Chem 30 | from rdkit.Chem.Draw import IPythonConsole 31 | from rdkit.Chem import Draw 32 | ---- 33 | 34 | //// 35 | RDKitにはSMILES文字列を読み込むためにMolFromSmilesというメソッドが用意されていますので、これを使い分子を読み込みます。 36 | //// 37 | RDKit has MolFromSmiles method which reads SMILES. RDKit mol object can be constructed from SMILES with the function like below. 38 | 39 | [source, python] 40 | ---- 41 | mol = Chem.MolFromSmiles("c1ccccc1") 42 | ---- 43 | 44 | //// 45 | 続いて構造を描画しますが、単純にmolを評価するだけで構造が表示されます。 46 | //// 47 | Next we draws molecular structure. It is very simple, just evaluate mol object. 48 | 49 | [source, python] 50 | ---- 51 | mol 52 | ---- 53 | 54 | //// 55 | 図のように構造が表示されているはずです。 56 | //// 57 | Molecular structure will be drawn like following figure. 58 | 59 | image::ch05/ch05_01.png[Depict benzene] 60 | 61 | //// 62 | 上のように原子を線でつなぎ構造を表現する方法(構造式)と、SMILES表記はどちらも同じものを表現しています。構造式は人が見てわかりやすいですが、SMILESはASCII文字列で表現されるのでより少ないデータ量で表現できるというメリットがあります。 63 | 64 | NOTE: 文字列で表現できるということは、文字列生成アルゴリズムを応用することで新規な化学構造を生成することも可能ということです。この内容に関しては12章で詳しく説明します。 65 | //// 66 | Both methods connect atoms with bonds(2D Structure) and SMILES can represent same molecule. 2D structure is easy to understand for us and SMILES is not. But SMILES can define molecule as ASCII strings so SMILES can store molecule in low data volume. 67 | 68 | === How to handle multiple molecules at once? 69 | 70 | //// 71 | 複数の化合物を一つのファイルに格納する方法にはいくつかありますが、sdfというファイル形式を利用するのが一般的です。 72 | 73 | .sdfフォーマットとは? 74 | **** 75 | MDL社で開発された分子表現のためのフォーマットにMOL形式というものがあります。このMOL形式を拡張したものがSDF形式です。具体的にはMOL形式で表現されたものを"$$$$"という行で区切ることにより、複数の分子を取り扱えるようにしてあります。 76 | 77 | MOL形式は分子の三次元座標を格納することができ二次元だけでなく立体構造を表現できる点はSMILESとの大きな違いです。 78 | **** 79 | //// 80 | There are several ways to store multiple molecules in a file but SDF format file is common. 81 | 82 | .What's sdf format? 
83 | **** 84 | The MOL format was developed by MDL, and the SDF format is an extension of it. Specifically, multiple compounds are stored as MOL records delimited by lines consisting of four dollar signs ($$$$). A feature of the SDF format is its ability to include associated data. 85 | 86 | A major difference between the MOL format and SMILES is that the MOL format can store the 3D coordinates of a molecule, so it can describe not only the 2D structure but also the three-dimensional structure. 87 | **** 88 | 89 | ==== Download sdf file from ChEMBL 90 | 91 | //// 92 | 4章を参考にlink:https://www.ebi.ac.uk/chembl/beta/[ChEMBL]のトポイソメラーゼII阻害試験(CHEMBL669726)の構造データをsdfファイル形式でダウンロードします。 93 | 94 | NOTE:: 95 | **** 96 | 具体的な手順はリンクのページを開いて、検索フォームにCHEMBL669726を入力すると検索結果が表示されるので、Compoundsタブをクリックします。その後、全選択してSDFでダウンロードするとgzip圧縮されたsdfがダウンロードされるので、gunzipコマンドまたは適当な解凍ソフトで解凍してください。それをch05_compounds.sdfという名前で保存します。 97 | **** 98 | //// 99 | Referring to chapter 4, download the structure data of the topoisomerase II inhibition assay (CHEMBL669726) from link:https://www.ebi.ac.uk/chembl/beta/[ChEMBL] in sdf format. 100 | 101 | NOTE:: 102 | **** 103 | Specifically, open the linked page and enter 'CHEMBL669726' in the search form; the search results will then be displayed. Click the Compounds tab, select all, and download as SDF. The file is downloaded in gzip-compressed form. Extract it with the gunzip command or any suitable decompression software, then save it as ch05_compounds.sdf. 104 | **** 105 | 106 | ==== Handling sdf with RDKit 107 | 108 | //// 109 | RDKitでsdfファイルを読み込むにはSDMolSupplierというメソッドを利用します。複数の化合物を取り扱うことになるのでmolではなくmolsという変数に格納していることに注意してください。どういう変数を使うかの決まりはありませんが、見てわかりやすい変数名をつけることで余計なミスを減らすことは心がけるとよいでしょう。 110 | //// 111 | RDKit reads sdf files with the SDMolSupplier method. Please note that we store the result in a variable named mols instead of mol because we are handling multiple molecules. There is no rule for naming variables, but you should choose names that are easy to understand in order to reduce unnecessary mistakes.
112 | 113 | [source, python] 114 | ---- 115 | mols = Chem.SDMolSupplier("ch05_compounds.sdf") 116 | ---- 117 | 118 | //// 119 | 何件の分子が読み込まれたのか確認します。数を数えるにはlenを使います。 120 | //// 121 | Check how many coumpounds are read. len method is used to count number. 122 | 123 | [source, python] 124 | ---- 125 | len(mols) 126 | ---- 127 | 128 | //// 129 | 34件でした。 130 | //// 131 | Total 34 molecules are read. 132 | 133 | ==== Draw moleculear structures 134 | 135 | //// 136 | forループを使って、ひとつずつ分子を描画してもいいですが、RDKitには複数の分子を一度に並べて描画するメソッドが用意されているので、今回はそちらのMolsToGridImageメソッドを使います。なお一行に並べる分子の数を変更するにはmolsPerRowオプションで指定します 137 | //// 138 | You can draw molecule one by one with for loop but it is redundant. RDKit has method which can draw multiple molecules at once, so try to use the function named MolsToGridImage method. For your information the function has molsPerRow option which can change number of molecules per row. 139 | 140 | [source, python] 141 | ---- 142 | Draw.MolsToGridImage(mols) 143 | ---- 144 | 145 | image::ch05/ch05_04.png[MolsToGridImage] 146 | 147 | ===== (bonus) 148 | //// 149 | 参考までにループを回すやりかたも載せておきます。 150 | //// 151 | Following code shows draw molecule one by one with loop for your information. 
152 | 153 | [source, python] 154 | ---- 155 | from IPython.core.display import display 156 | for mol in mols: 157 | display(mol) 158 | ---- 159 | 160 | === Let's try to do hetero shuffling 161 | 162 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch05_hetero_shuffle.ipynb"] 163 | 164 | //// 165 | 創薬の化合物最適化ブロジェクトで、分子の形を変更しないで化合物の特性を変えたいということがあります。このような場合、芳香環を形成する炭素、窒素、硫黄、酸素などの原子種を入れ替えることでより良い特性の化合物が得られることがありますがこのようにヘテロ原子(水素以外の原子)を入れ替えるアプローチをヘテロシャッフリングといいます。 166 | 167 | ヘテロシャッフリングを行うことで、活性を維持したまま物性を変化させて動態を良くする、活性そのものを向上させる、特許クレームの回避といった効果が期待できます。 168 | 169 | 少しの構造の違いが選択性や薬物動態が影響を与える有名な例として、Pfizer社のlink:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL192/[Sildenafil]とGSK社のlink:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL1520/[Vardinafil]が挙げられます。 170 | 171 | 二つの構造を比較すると中心の環構造部分の窒素原子の並びが異なっているだけで極めて似ています。両分子は同じ標的蛋白質を阻害しますが、そのlink:https://www.nature.com/articles/3901525[活性や薬物動態]は異なります。 172 | 173 | image::ch05/ch05_08.png[check structures] 174 | 175 | 上記の画像を生成するコードを示します。単にDraw.MolsToGridImageを適用するのではなく 176 | Core構造をベースにアライメントしていることとDraw.MolToGridImageのオプションにlegendsを与え、分子名を表示していることに注意してください。 177 | //// 178 | At the lead optimization stage of drug discovery, researchers often want to improve the properties of a compound without changing its molecular shape. In such cases, medicinal chemists exchange the atoms that form the aromatic rings, such as carbon, nitrogen, sulphur and oxygen, and this sometimes yields compounds with a better profile. This approach of exchanging the ring-forming atoms (atoms other than hydrogen) is called hetero shuffling. 179 | 180 | Hetero shuffling is expected to improve physicochemical properties (and thereby pharmacokinetics) while keeping potency, to improve potency itself, and to help avoid existing patent claims.
181 | 182 | Pfizer's link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL192/[Sildenafil] and GSK's link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL1520/[Vardenafil] are well-known examples where small structural differences affect selectivity and pharmacokinetics. 183 | 184 | The two structures are very similar; only the arrangement of the nitrogen atoms in the central ring system differs. Both molecules inhibit the same target protein, but link:https://www.nature.com/articles/3901525[their biological activities and pharmacokinetics] are different. 185 | 186 | image::ch05/ch05_08.png[check structures] 187 | 188 | The following code shows how to generate the image above. Please note that it does not simply call Draw.MolsToGridImage: it first aligns both molecules to the common core structure, and it passes the legends option to display the molecule names. 189 | 190 | [source, python] 191 | ---- 192 | from rdkit import Chem 193 | from rdkit.Chem import AllChem 194 | from rdkit.Chem.Draw import IPythonConsole 195 | from rdkit.Chem import Draw 196 | from rdkit.Chem import rdDepictor 197 | from rdkit.Chem import rdFMCS 198 | from rdkit.Chem import TemplateAlign 199 | IPythonConsole.ipython_useSVG = True 200 | rdDepictor.SetPreferCoordGen(True) 201 | 202 | sildenafil = Chem.MolFromSmiles('CCCC1=NN(C)C2=C1NC(=NC2=O)C1=C(OCC)C=CC(=C1)S(=O)(=O)N1CCN(C)CC1') 203 | vardenafil = Chem.MolFromSmiles('CCCC1=NC(C)=C2N1NC(=NC2=O)C1=C(OCC)C=CC(=C1)S(=O)(=O)N1CCN(CC)CC1') 204 | rdDepictor.Compute2DCoords(sildenafil) 205 | rdDepictor.Compute2DCoords(vardenafil) 206 | res = rdFMCS.FindMCS([sildenafil, vardenafil], completeRingsOnly=True, atomCompare=rdFMCS.AtomCompare.CompareAny) 207 | MCS = Chem.MolFromSmarts(res.smartsString) 208 | rdDepictor.Compute2DCoords(MCS) 209 | 210 | TemplateAlign.AlignMolToTemplate2D(sildenafil, MCS) 211 | TemplateAlign.AlignMolToTemplate2D(vardenafil, MCS) 212 | Draw.MolsToGridImage([sildenafil, vardenafil], legends=['sildenafil', 'vardenafil']) 213 |
---- 214 | 215 | //// 216 | ヘテロシャッフルした分子を生成するためにHeteroShuffleというクラスを定義します。オブジェクトの生成にはシャッフルしたい分子と変換したい部分構造(Core)を与えます。クラス内のコードではまず、分子をCoreで切断し、Coreとそれ以外に分けます。CoreのAromatic原子で、置換基がついてない原子のみが置換候補になります。。シャッフル後のCoreとCore以外のパーツを再結合するための反応オブジェクトを生成するメソッドがmake_connectorです。このメソッドで作られた反応オブジェクトを利用してre_construct_molで分子を再構築しています。 217 | 218 | 考えられる原子の組み合わせを構築するために、itertools.productに、候補原子(C, S, N, O)の原子番号と、環を構成する原子数target_atomic_numsを与えます。その後に分子として生成できないものは排除するのでここでは考えられる全部の組み合わせを出します。 219 | //// 220 | HeteroShuffle class is defined to generate hetero shuffled molecules. To generate the objects, it is needed to input the molecule which would like to do hetero shuffle and core structure to shuffle. The target atoms are aromatic atoms in the core and atoms which has no substituent. The function named make_connector generates reaction objects to construct molecules from shuffled core and substituents. The function named re_construct_mol reconstruct molecules with the reaction objects. 221 | 222 | To generate possible combinations of atoms, the code pass candidates of atomic numbers (C, S, N, O) and number of atoms which constructs target ring. Invalid molecule will be removed after possible combinations is generated. 
223 | 224 | [source, python] 225 | ---- 226 | class HeteroShuffle(): 227 | 228 | def __init__(self, mol, query): 229 | self.mol = mol 230 | self.query = query 231 | self.subs = Chem.ReplaceCore(self.mol, self.query) 232 | self.core = Chem.ReplaceSidechains(self.mol, self.query) 233 | self.target_atomic_nums = [6, 7, 8, 16] 234 | 235 | 236 | def make_connectors(self): 237 | n = len(Chem.MolToSmiles(self.subs).split('.')) 238 | map_no = n+1 239 | self.rxn_dict = {} 240 | for i in range(n): 241 | self.rxn_dict[i+1] = AllChem.ReactionFromSmarts('[{0}*][*:{1}].[{0}*][*:{2}]>>[*:{1}][*:{2}]'.format(i+1, map_no, map_no+1)) 242 | return self.rxn_dict 243 | 244 | def re_construct_mol(self, core): 245 | ''' 246 | re construct mols from given substructures and core 247 | ''' 248 | keys = self.rxn_dict.keys() 249 | ps = [[core]] 250 | for key in keys: 251 | ps = self.rxn_dict[key].RunReactants([ps[0][0], self.subs]) 252 | mol = ps[0][0] 253 | try: 254 | smi = Chem.MolToSmiles(mol) 255 | mol = Chem.MolFromSmiles(smi) 256 | Chem.SanitizeMol(mol) 257 | return mol 258 | except: 259 | return None 260 | 261 | def get_target_atoms(self): 262 | ''' 263 | get target atoms for replace 264 | target atoms means atoms which don't have anyatom(*) in neighbors 265 | ''' 266 | atoms = [] 267 | for atom in self.core.GetAromaticAtoms(): 268 | neighbors = [a.GetSymbol() for a in atom.GetNeighbors()] 269 | if '*' not in neighbors and atom.GetSymbol() !='*': 270 | atoms.append(atom) 271 | print(len(atoms)) 272 | return atoms 273 | 274 | def generate_mols(self): 275 | atoms = self.get_target_atoms() 276 | idxs = [atom.GetIdx() for atom in atoms] 277 | combinations = itertools.product(self.target_atomic_nums, repeat=len(idxs)) 278 | smiles_set = set() 279 | self.make_connectors() 280 | for combination in combinations: 281 | target = copy.deepcopy(self.core) 282 | #print(Chem.MolToSmiles(target)) 283 | for i, idx in enumerate(idxs): 284 | target.GetAtomWithIdx(idx).SetAtomicNum(combination[i]) 285 | 
smi = Chem.MolToSmiles(target) 286 | #smi = smi.replace('sH','s').replace('oH','o').replace('cH3','c') 287 | #print('rep '+smi) 288 | target = Chem.MolFromSmiles(smi) 289 | if target != None: 290 | n_attachment = len([atom for atom in target.GetAtoms() if atom.GetAtomicNum() == 0]) 291 | n_aromatic_atoms = len(list(target.GetAromaticAtoms())) 292 | if target.GetNumAtoms() - n_attachment == n_aromatic_atoms: 293 | try: 294 | mol = self.re_construct_mol(target) 295 | if checkmol(mol): 296 | smiles_set.add(Chem.MolToSmiles(mol)) 297 | except: 298 | pass 299 | mols = [Chem.MolFromSmiles(smi) for smi in smiles_set] 300 | return mols 301 | ---- 302 | 303 | //// 304 | 上のコードで使われているcheckmolという関数はc1coooo1のような6員環の構造もAromaticだと判定されてしまうのでそれを避けるために使っています。O, Sが許容されるのは5員環のヘテロ芳香環のみにしました。 305 | //// 306 | The checkmol function which is used to avoid molecule such as c1coooo1 is defied as aromatic. I defined molecule which is allowd contain O, S is only five menbered hetero aromatic rings. 307 | 308 | [source, python] 309 | ---- 310 | def checkmol(mol): 311 | arom_atoms = mol.GetAromaticAtoms() 312 | symbols = [atom.GetSymbol() for atom in arom_atoms if not atom.IsInRingSize(5)] 313 | if symbols == []: 314 | return True 315 | elif 'O' in symbols or 'S' in symbols: 316 | return False 317 | else: 318 | return True 319 | ---- 320 | 321 | //// 322 | 実際に使ってみます。 323 | //// 324 | Use the function. 325 | 326 | [source, python] 327 | ---- 328 | # Gefitinib 329 | mol1 = Chem.MolFromSmiles('COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4') 330 | core1 = Chem.MolFromSmiles('c1ccc2c(c1)cncn2') 331 | # Oxaprozin 332 | mol2 = Chem.MolFromSmiles('OC(=O)CCC1=NC(=C(O1)C1=CC=CC=C1)C1=CC=CC=C1') 333 | core2 = Chem.MolFromSmiles('c1cnco1') 334 | ---- 335 | 336 | //// 337 | 元の分子 338 | //// 339 | Original molecule. 
340 | 341 | image::ch05/ch05_05.png[query] 342 | 343 | [source, python] 344 | ---- 345 | ht=HeteroShuffle(mol1, core1) 346 | res=ht.generate_mols() 347 | print(len(res)) 348 | Draw.MolsToGridImage(res, molsPerRow=5) 349 | ---- 350 | 351 | //// 352 | Gefitinibを入力とした場合の変換結果の一部です。芳香環を形成する原子が元の化合物から変化した分子が出力されています。 353 | また、Coreで指定したキナゾリン部分のみが変換されています。 354 | //// 355 | The image shows part of the results obtained with Gefitinib as input. Molecules whose aromatic ring atoms differ from those of the original compound are generated. Only the quinazoline part is changed, because quinazoline was specified as the core. 356 | 357 | image::ch05/ch05_06.png[res1] 358 | 359 | [source, python] 360 | ---- 361 | ht=HeteroShuffle(mol2, core2) 362 | res=ht.generate_mols() 363 | print(len(res)) 364 | Draw.MolsToGridImage(res, molsPerRow=5) 365 | ---- 366 | 367 | //// 368 | Oxaprozinを入力とした場合の変換結果です。こちらは中心に、link:https://en.wikipedia.org/wiki/Oxazole[オキサゾール]と呼ばれる5員環構造を有してます。5員環を形成する芳香環にはチオフェン、フランなどのように窒素や酸素を含むものもあります。以下の例でもS、Oが5員環の構成原子に含まれている分子が出力されています。 369 | //// 370 | This is the result obtained with Oxaprozin as input. This molecule has link:https://en.wikipedia.org/wiki/Oxazole[oxazole], a five-membered ring, as its core. Several five-membered hetero aromatic rings contain oxygen or sulphur, such as furan and thiophene, and the results below also include molecules with S or O in the five-membered ring. 371 | 372 | image::ch05/ch05_07.png[res2] 373 | 374 | //// 375 | どうでしょうか。二つの分子の例を示しました。一つ目の例、Gefitinibは、分子を構成する芳香環が、link:https://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%8A%E3%82%BE%E3%83%AA%E3%83%B3[キナゾリン]とベンゼンでした。キナゾリンは、ベンゼンとピリミジンという二つの6員環が縮環した構造です。6員環をベースに構成される芳香環を形成する原子の候補は炭素と窒素になります。(ピリリウムイオンなど電荷を持つものも考慮すれば酸素や硫黄も候補になりますが、通常このような構造をDrug Designで使うことは少ないので今回の説明からは外しています。link:https://ja.wikipedia.org/wiki/%E8%A4%87%E7%B4%A0%E7%92%B0%E5%BC%8F%E5%8C%96%E5%90%88%E7%89%A9[複素環式化合物の説明]) 376 | Oxaprozinはオキサゾールを有しています。5員環の芳香環を形成する原子の候補は炭素、窒素、硫黄、酸素が挙げられます。このような分子の場合の例として紹介しました。 377 | いずれのケースでも上記のコードでヘテロ原子がシャッフルされたものが生成されています 378 | //// 379 | What do you think? Two examples were shown.
The first one, Gefitinib, is a case where the aromatic rings are quinazoline and benzene. Quinazoline is a fused ring system made of two six-membered rings, benzene and pyrimidine. The candidate atoms for six-membered aromatic rings are carbon and nitrogen. (Of course, if charged species such as the pyrylium ion are considered, oxygen and sulphur could also be candidates, but such charged substructures are rarely used in drug design, so we omitted them here.) 380 | Oxaprozin has an oxazole ring. The candidate atoms for five-membered aromatic rings are carbon, nitrogen, sulphur and oxygen. The second molecule was introduced as an example of a five-membered hetero aromatic ring. 381 | In both cases, hetero-shuffled molecules are generated by the code above. 382 | 383 | //// 384 | .ヘテロシャッフリングについてもう少し詳しく 385 | **** 386 | link:https://pubs.acs.org/doi/10.1021/jm3001289[J. Med. Chem. 2012, 55, 11, 5151-5164]ではPIM-1キナーゼ阻害剤におけるNシャッフリングの効果をFragment Molecular Orbital法という量子化学的なアプローチを使って検証しています。さらにlink:https://pubs.acs.org/doi/10.1021/acs.jcim.8b00563[J. Chem. Inf. Model. 2019, 59, 1, 149-158]ではAsp–Arg塩橋とヘテロ環のスタッキングのメカニズムを量子化学計算により探っており、置換デザインの指標になりそうです。 387 | 388 | また、バイオアベイラビリティ改善のためにヘテロシャッフリングを行った例としてはlink:https://dx.doi.org/10.1021/jm101027s[J. Med. Chem. 2011, 54, 8, 3076-3080]があります。 389 | **** 390 | //// 391 | .More about hetero shuffling 392 | **** 393 | The article link:https://pubs.acs.org/doi/10.1021/jm3001289[J. Med. Chem. 2012, 55, 11, 5151-5164] analyzed the effect of nitrogen shuffling in a PIM-1 kinase inhibitor project with the Fragment Molecular Orbital method, which is a quantum chemical approach. Another article, link:https://pubs.acs.org/doi/10.1021/acs.jcim.8b00563[J. Chem. Inf. Model. 2019, 59, 1, 149-158], investigated the mechanism of stacking between an Asp-Arg salt bridge and hetero rings with quantum chemical calculations. This approach seems to be a good guide for substitution design. 394 | 395 | Also, an example of hetero shuffling for improving bioavailability is link:https://dx.doi.org/10.1021/jm101027s[J. Med. Chem.
2011, 54, 8, 3076-3080] 396 | **** 397 | 398 | <<< 399 | -------------------------------------------------------------------------------- /ch06_similarity.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 6: Try to evaluate the similarity of compounds 2 | :imagesdir: images 3 | 4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch06_similarity.ipynb"] 5 | 6 | === What does it mean that compounds are similar? 7 | 8 | Expressions that are somewhat shape is similar are not scientific. In the chemoinformatics, similarity or unsimilarity (distance) is used as quantitative metrics. 9 | 10 | In this section, we will introduce two major metrics. 11 | 12 | ==== Descriptor 13 | 14 | //// 15 | 分子の全体的な特徴を数値で表現するものを記述子と呼びます。分子量や極性表面性(PSA)、分配係数(logP)などがあり、現在までに多くの記述子が提案されています。これらの記述子の類似性を評価することで2つの分子がどのくらい似ているかを表現することが可能です。また分子全体の特徴を1つの数字で表現しており局所的な特徴ではないということに注意してください。 16 | 17 | NOTE: いくつかの記述子に関しては市販ソフトでないと計算できない場合があります。 18 | //// 19 | 20 | A parameter that represents the overall characteristics of a molecule numerically is called a descriptor. Many descriptors are proposed so far, such as molecular weight, polar surface area (PSA) and partition coefficient (logP). It is possible to evaluate a similarity between two molecules with these descriptors. Please note that descriptor represents whole molecular feature as a numeric value and it is not a local feature. 21 | 22 | NOTE: There are cases where commercial software is needed to calculate some descriptors. 23 | 24 | ==== Fingerprint 25 | 26 | A fingerprint is another feature, and is a binary representation of a partial structure of a molecule as a binary 0, 1, and it corresponds to the presence or absence of a partial structure and on (1) or off (0) of a bit, and represents a set of partial structures Represents the characteristics of the molecule. 
There are two types of fingerprints, fixed-length FP and variable-length FP. Formerly, MACSKey fixed-length FP (FP whose partial structure and index have been determined in advance) was used, but now ECFP 4 (It is common to use a variable-length FP called Morgan2). 27 | 28 | As for the RDKit fingerprint, please read link:https://www.rdkit.org/UGM/2012/Landrum_RDKit_UGM.Fingerprints.Final.pptx.pdf[Developper of RDKit, Greg's Slide] for details. 29 | 30 | Let's do similarity evaluation using this ECFP 4 (Morgan 2) this time. 31 | 32 | .Difference between SMILES and fingerprint 33 | **** 34 | SMILES is an ASCII string representation of the structure, and a fingerprint is a binary representation of the presence or absence of a substructure. The difference is that the former is one of the ** structural expressions **, while the latter is one of the ** feature expressions **. 35 | Since only the presence or absence of partial structures is expressed, information such as the relationship between partial structures (how connected by positional relationship) is lost, and the original structure is not restored. 36 | 37 | Some people call it Bag-of-Fragments because it corresponds to Bag-of-Words often used in text-mining. 38 | **** 39 | 40 | === Let's calculate similarity 41 | 42 | Let's evaluate the similarity of toluene and chlorobenzene as simple molecules. 43 | 44 | [source, python] 45 | ---- 46 | from rdkit import Chem, DataStructs 47 | from rdkit.Chem import AllChem, Draw 48 | from rdkit.Chem.Draw import IPythonConsole 49 | ---- 50 | 51 | Read molecule from SMILES. 52 | 53 | [source, python] 54 | ---- 55 | mol1 = Chem.MolFromSmiles("Cc1ccccc1") 56 | mol2 = Chem.MolFromSmiles("Clc1ccccc1") 57 | ---- 58 | 59 | Confirm it by visual observation. 60 | 61 | [source, python] 62 | ---- 63 | Draw.MolsToGridImage([mol1, mol2]) 64 | ---- 65 | 66 | Generate radius 2 morgan fingerprint which corresponds to ECFP4. 
67 | 68 | [source, python] 69 | ---- 70 | fp1 = AllChem.GetMorganFingerprint(mol1, 2) 71 | fp2 = AllChem.GetMorganFingerprint(mol2, 2) 72 | ---- 73 | 74 | Tanimoto coefficient is used for similarity evaluation. 75 | 76 | [source, python] 77 | ---- 78 | DataStructs.TanimotoSimilarity(fp1, fp2) 79 | # 0.5384615384615384 80 | ---- 81 | 82 | === Virtual screening 83 | 84 | So far we have described how to evaluate the similarity of compounds. Using this index of similarity to select a specific group of compounds from a large number of compounds is called virtual screening. 85 | 86 | For example, when a drug-like compound is published in a patent or a paper, or a promising compound is found in our own assay system, we often want to find out whether similar, possibly even more promising, compounds exist in our in-house compound library or in a database of commercially available compounds. Here, let's use link:http://zinc15.docking.org/[ZINC] to find out whether analogs of link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL466246/[Inavir], an influenza drug known as a neuraminidase inhibitor, can be purchased. 87 | 88 | The molecular weight of Inavir was about 350, and LogP was about -3. So we selected the fraction of the molecular weight 350-375 and LogP -1 from ZINC. This section is divided into 16 files, but download and use only the first set. 89 | 90 | NOTE: We described how to download the data in chapter 4. 91 | 92 | We can run a shell command on jupyter notebook by starting the line with !. The following is an example of downloading a ZINC data set with the wget command on jupyter notebook. 93 | 94 | [source, python] 95 | ---- 96 | !wget http://files.docking.org/2D/EA/EAED.smi 97 | ---- 98 | 99 | Read SMILES from file and make it a mol object, but skip the first line because it is a header. Also, the last character of each line is a newline character, so it is excluded as l[:-1].
Finally, find out how many compounds there are. 100 | 101 | [source, python] 102 | ---- 103 | mols = [] 104 | with open("EAED.smi") as f: 105 | f.readline() 106 | for l in f: 107 | mol = Chem.MolFromSmiles(l[:-1]) 108 | mols.append(mol) 109 | print(len(mols)) 110 | # 195493 111 | ---- 112 | 113 | Next, prepare a function to check the degree of similarity with Inavir (laninamivir). 114 | 115 | [source, python] 116 | ---- 117 | laninamivir = Chem.MolFromSmiles("CO[C@H]([C@H](O)CO)[C@@H]1OC(=C[C@H](NC(=N)N)[C@H]1NC(=O)C)C(=O)O") 118 | laninamivir_fp = AllChem.GetMorganFingerprint(laninamivir, 2) 119 | 120 | def calc_laninamivir_similarity(mol): 121 | fp = AllChem.GetMorganFingerprint(mol, 2) 122 | sim = DataStructs.TanimotoSimilarity(laninamivir_fp, fp) 123 | return sim 124 | ---- 125 | 126 | Check it. 127 | 128 | [source, python] 129 | ---- 130 | similar_mols =[] 131 | for mol in mols: 132 | sim = calc_laninamivir_similarity(mol) 133 | if sim > 0.2: 134 | similar_mols.append((mol, sim)) 135 | ---- 136 | 137 | Sort the results in descending order of similarity and retrieve only the first ten. 138 | 139 | [source, python] 140 | ---- 141 | similar_mols.sort(key=lambda x: x[1], reverse=True) 142 | mols = [l[0] for l in similar_mols[:10]] 143 | ---- 144 | 145 | Let's draw them. 146 | 147 | [source, python] 148 | ---- 149 | Draw.MolsToGridImage(mols, molsPerRow=5) 150 | ---- 151 | 152 | image::ch06/vs01.png[result] 153 | 154 | If you check the similarity values, you will see that among the roughly 200,000 compounds examined this time, the most similar compound has a similarity of only 23%. However, ZINC contains 750 million entries, so there should be many more similar compounds in it. 155 | 156 | === Clustering 157 | 158 | For example, when purchasing commercial compounds to build a library, we want as much diversity as possible, so we group similar compounds together and select a representative from each group so that the library is not biased toward similar compounds.
In this way, if you want to organize compounds by structural similarity, use a method called clustering. 159 | 160 | Here we cluster the 5614 hits from link:https://www.ebi.ac.uk/chembl/beta/assay_report_card/CHEMBL1040694/[Novartis's antimalarial assay]. 161 | 162 | Import the library for clustering and read the data. 163 | 164 | [source, python] 165 | ---- 166 | from rdkit.ML.Cluster import Butina 167 | mols = Chem.SDMolSupplier("ch06_nov_hts.sdf") 168 | ---- 169 | 170 | If RDKit cannot read a molecule for some reason, it generates None instead of a mol object. Since passing None to the GetMorganFingerprintAsBitVect method results in an error, we generate the fingerprints while excluding None. 171 | 172 | [source, python] 173 | ---- 174 | fps = [] 175 | valid_mols = [] 176 | 177 | for mol in mols: 178 | if mol is not None: 179 | fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2) 180 | fps.append(fp) 181 | valid_mols.append(mol) 182 | ---- 183 | 184 | Generate a distance matrix (a lower triangular distance matrix) from the fingerprints. 185 | 186 | [source, python] 187 | ---- 188 | distance_matrix = [] 189 | for i, fp in enumerate(fps): 190 | similarities = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i+1]) 191 | distance_matrix.extend([1-sim for sim in similarities]) 192 | ---- 193 | 194 | Cluster the compounds using the distance matrix. The third argument is the distance threshold. In this example, compounds within a distance of 0.2 of each other, that is, with a similarity of 80% or more, are grouped into the same cluster. 195 | 196 | [source, python] 197 | ---- 198 | clusters = Butina.ClusterData(distance_matrix, len(fps), 0.2, isDistData=True) 199 | ---- 200 | 201 | Check the number of clusters. 202 | 203 | [source, python] 204 | ---- 205 | len(clusters) 206 | #2492 207 | ---- 208 | 209 | Visualize the structures in the first cluster.
210 | 211 | [source, python] 212 | ---- 213 | mols_ =[valid_mols[i] for i in clusters[0]] 214 | Draw.MolsToGridImage(mols_, molsPerRow=5) 215 | ---- 216 | 217 | image::ch06/cls01.png[clustering result, width=600, pdfwidth=60%] 218 | 219 | 220 | In this case, clustering was performed using the library provided in RDKit, but some methods can be used with link:https://scikit-learn.org/stable/modules/clustering.html[Scikit-learn]. And in practice this method is often used. 221 | 222 | === Structure Based Drug Design(SBDD) 223 | 224 | Here we evaluate the similarity of link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL231779/[apixaban] and link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL198362/[rivaroxaban], which are marketed as anticoagulants. 225 | 226 | [source, python] 227 | ---- 228 | apx = Chem.MolFromSmiles("COc1ccc(cc1)n2nc(C(=O)N)c3CCN(C(=O)c23)c4ccc(cc4)N5CCCCC5=O") 229 | rvx = Chem.MolFromSmiles("Clc1ccc(s1)C(=O)NC[C@H]2CN(C(=O)O2)c3ccc(cc3)N4CCOCC4=O") 230 | ---- 231 | 232 | [source, python] 233 | ---- 234 | Draw.MolsToGridImage([apx, rvx], legends=["apixaban", "rivaroxaban"]) 235 | ---- 236 | 237 | image::ch06/apx_rvx.png[APX+RVX, width=600, pdfwidth=60%] 238 | 239 | The structures are quite similar as you can see, but both of these two compounds are known to bind similarly to the same pocket of the serine protease FXa and to inhibit the function of the protein. 240 | 241 | [source, python] 242 | ---- 243 | apx_fp = AllChem.GetMorganFingerprint(apx, 2) 244 | rvx_fp = AllChem.GetMorganFingerprint(rvx, 2) 245 | 246 | DataStructs.TanimotoSimilarity(apx_fp, rvx_fp) 247 | # 0.40625 248 | ---- 249 | 250 | It's about 40% similar. In fact, both link:https://www.rcsb.org/structure/2P16[apixaban] and link:https://www.rcsb.org/structure/2W26[rivaroxaban] have their complex crystal structures solved and were drawn using link:https://pymol.org/2/[PyMOL]. 
251 | 252 | NOTE:: We do not explain how to use PyMOL because it is beyond the scope of this document, but if you are interested, please refer to link:http://www.protein.osaka-u.ac.jp/rcsfp/supracryst/suzuki/jpxtal/Katsutani/index.php[here]. 253 | 254 | image::ch06/apx_rvx_suf.png[APX+RVX, width=600, pdfwidth=60%] 255 | 256 | As you can see from the figure, apixaban and rivaroxaban overlap beautifully in three dimensions. In particular, the methoxyphenyl and chlorothiophene groups are located in a site called the S1 pocket and are said to have some kind of strong interaction. As the ligand binding sites (pockets) of proteins become clearer, it becomes easier for the medicinal chemist to develop a strategy for the next modification, and the success rate and progress rate of the project will increase. 257 | 258 | An approach that optimizes the structure based on the shape of the protein determined by X-ray crystallography or cryo-electron microscopy is called Structure Based Drug Design (SBDD). Also, if you know the pocket, you can screen for compounds that physically bind to the pocket; this is called structure-based virtual screening (SBVS). It is distinguished from the similarity-based screening performed earlier in this chapter, which is called ligand-based virtual screening (LBVS). 259 | 260 | .History of Xa inhibitors and the importance of quantum chemistry calculation 261 | **** 262 | Although it is far from the chemoinformatics content of this book, it is very useful for molecular design to trace the history of FXa inhibitors and to understand what improvements have been made through the generations. In addition, the S1 pocket interaction is very difficult to interpret visually or with classical mechanics, and it can be interpreted only by quantum chemical calculation such as the Fragment Molecular Orbital method (FMO), so I think there is no doubt that quantum chemical calculation will become essential in future molecular design.
263 | **** 264 | 265 | 266 | <<< 267 | -------------------------------------------------------------------------------- /ch07_graph.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 7: Assessing similarity using graph structures 2 | :imagesdir: images 3 | 4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch07_MCS.ipynb"] 5 | 6 | A graph is a data structure consisting of nodes (vertices) and edges (branches) that indicate the connections between nodes. A chemical structure can be represented by such a graph. In other words, we can represent atoms as nodes and bonds as edges. 7 | 8 | In general, fingerprints like those introduced in Chapter 6 are often used to evaluate the similarity between molecules, but there is also a method to evaluate similarity using the graph structure. The MCS (Maximum Common Substructure) introduced next refers to the largest substructure shared by a set of target molecules. The larger the common substructure, the more similar the molecules are. 9 | 10 | === Classification by major skeleton (MCS) 11 | 12 | Maximum Common Substructure (MCS) is the largest common substructure in a given group of chemical structures. RDKit provides a module called rdFMCS for MCS search. 13 | 14 | This time, we will use the file cdk2.sdf provided with RDKit as sample data for MCS search. RDConfig.RDDocsDir is a variable that represents the directory containing the sample data, and there is a file called cdk2.sdf under its Book/data/ directory, so build the file path with os.path.join. Note that os.path.join is a function in Python's standard library that absorbs the differences in path notation between operating systems.
15 | 16 | [source, python] 17 | ---- 18 | import os 19 | from rdkit import Chem 20 | from rdkit.Chem import RDConfig 21 | from rdkit.Chem import rdFMCS 22 | from rdkit.Chem.Draw import IPythonConsole 23 | from rdkit.Chem import Draw 24 | filepath = os.path.join(RDConfig.RDDocsDir, 'Book', 'data', 'cdk2.sdf') 25 | mols = [mol for mol in Chem.SDMolSupplier(filepath)] 26 | # 構造を確認します 27 | Draw.MolsToGridImage(mols[:7], molsPerRow=5) 28 | ---- 29 | 30 | image::ch07/mcs01.png[compounds] 31 | 32 | Acquires MCS using the loaded molecule. With RDKit, you can specify multiple options for how to get MCS. The following shows an example of each option. 33 | 34 | 35 | 36 | 37 | . Default 38 | . Any atom can be used (as long as there is an order of structure and bond) 39 | . The bond order may be any (for example, benzene and cyclohexane have the same MCS) 40 | 41 | [source, python] 42 | ---- 43 | result1 = rdFMCS.FindMCS(mols[:7]) 44 | mcs1 = Chem.MolFromSmarts (result1.smartsString) 45 | mcs1 46 | print(result1.smartsString) 47 | #[#6]1:[#7]:[#6](:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7]:2)-[#7] 48 | ---- 49 | 50 | image::ch07/mcs02.png[MCS01] 51 | 52 | [source, python] 53 | ---- 54 | result2 = rdFMCS.FindMCS(mols[:7], atomCompare=rdFMCS.AtomCompare.CompareAny) 55 | mcs2 = Chem.MolFromSmarts(result2.smartsString) 56 | mcs2 57 | print(result2.smartsString) 58 | #[#6]-,:[#6]-,:[#6]-[#6]-[#8,#7]-[#6]1:[#7]:[#6](:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7]:2)-[#7] 59 | ---- 60 | 61 | image::ch07/mcs03.png[MCS02] 62 | 63 | [source, python] 64 | ---- 65 | result3 = rdFMCS.FindMCS(mols[:7], bondCompare=rdFMCS.BondCompare.CompareAny) 66 | mcs3 = Chem.MolFromSmarts(result3.smartsString) 67 | mcs3 68 | print(result3.smartsString) 69 | #[#6]1:[#7]:[#6](:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7]:2)-[#7] 70 | ---- 71 | 72 | image::ch07/mcs04.png[MCS03] 73 | 74 | In RDKit, Fraggle Similarity is implemented as an algorithm to quantify similarity based on MCS. 
By using this, clustering and analysis based on similarity can be performed. 75 | 76 | [source, python] 77 | ---- 78 | from rdkit.Chem.Fraggle import FraggleSim 79 | sim, match = FraggleSim.GetFraggleSimilarity(mols[0], mols[1]) 80 | print(sim, match) 81 | #0.925764192139738 *C(C)C.*COc1nc(N)nc2[nH]cnc12 82 | match_st = Chem.MolFromSmiles(match) 83 | match_st 84 | ---- 85 | 86 | image::ch07/mcs05.png[FraggleSimilarity] 87 | 88 | Thus, FraggleSimilarity returns similarities and matched substructures. It is often closer to the feeling of a chemist than the similarity using ECFP. Please refer to the reference link for details. 89 | 90 | Reference link 91 | 92 | - https://pubs.acs.org/doi/abs/10.1021/acs.jcim.5b00036[Efficient Heuristics for Maximum Common Substructure Search] 93 | - https://raw.github.com/rdkit/UGM_2013/master/Presentations/Hussain.Fraggle.pdf[Fraggle – A new similarity searching algorithm] 94 | 95 | === Matched Molecular Pair and Matched Molecular Series 96 | 97 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch07_MMS.ipynb"] 98 | 99 | At the structural optimization stage of drug discovery research, how to convert the starting compound (lead compound) is very important, but as the stage progresses, which structural conversion affects the activity and physical properties It is also very important to carry out a retrospective analysis of what it has exerted. 100 | 101 | TIP: If you are interested, you may read link:https://sar.pharm.or.jp/wp-content/uploads/2018/09/SARNews_19.pdf[https://sar.pharm.or.jp/wp-content/uploads/2018/09/SARNews_19.pdf]. 102 | 103 | Matched Molecular Pair (MMP) is a pair of molecules that differ only in the partial structure of some of the two molecules but are otherwise identical. As an example, chlorobenzene and fluorobenzene are MMPs because they differ only in Cl and F groups. 
By analyzing a large number of changes in the characteristics of such pairs, you can grasp the trend of substituent conversion. This is called Matched Molecular Pair Analysis (MMPA). By performing MMPA on large-scale data, it is possible to extract universal rules for property changes caused by substituent changes. If you understand these rules, you will be able to proceed efficiently with structural optimization. 104 | 105 | Here, we analyze MMP using link:https://github.com/rdkit/rdkit/tree/master/Contrib/mmpa[RDKit/Contrib/MMPA] provided in Contrib of RDKit. 106 | 107 | Move to Contrib/mmpa under the RDKit installation location and execute the Python scripts in order. 108 | 109 | [source, python] 110 | ---- 111 | python rfrag.py #save file name of the data that was fragmented 112 | # For example 113 | # python rfrag.py data/sample_fragmented.txt 114 | 115 | python indexing.py MMP_ output file.CSV 116 | # eg 117 | # python indexing.py data/mmp.csv 118 | ---- 119 | 120 | Executing the above commands will generate a csv file containing molecule A, molecule B, the ID of molecule A, the ID of molecule B, the SMIRKS of the converted structure, and the common substructure (context). MMPA can be performed by linking activity and physical property data to this output. 121 | 122 | NOTE: link:http://www.daylight.com/dayhtml/doc/theory/theory.smirks.html[SMIRKS] is a method to express the conversion of molecules in a string notation similar to SMILES. 123 | 124 | 125 | A method called Matched Molecular Series (MMS) has also been proposed as an extension of MMP. While an MMP is a pair of molecules, an MMS extends this to a series of three or more molecules that share a common structure. 126 | 127 | I will actually make an MMS. The following example uses data from Factor Xa in ChEMBL. For the implementation of MMS, we use the code from the link:https://github.com/rdkit/UGM_2014/blob/master/Presentations/OBoyle_MatchedSeries.pdf[presentation] given by Noel O'Boyle at the RDKit UGM. 128 | 129 | Let's actually make an MMS.
In the following example, Factor Xa data was link:https://www.ebi.ac.uk/chembl/bioactivity/results/1/cmpd_chemblid/asc/tab/display[downloaded from ChEMBL] and used as an example. For the implementation of MMS, we use the code of the link:https://github.com/rdkit/UGM_2014/blob/master/Presentations/OBoyle_MatchedSeries.pdf[presentation] by Noel O'Byle's RDKit UGM . 130 | 131 | First, loading the library to be used, loading the data, and desalting using SaltRemover. 132 | 133 | [source, python] 134 | ---- 135 | import sys 136 | import os 137 | import pandas as pd 138 | from rdkit import Chem 139 | from rdkit.Chem import rdMMPA 140 | from rdkit.Chem import RDConfig 141 | from rdkit.Chem import rdBase 142 | from rdkit.Chem.Draw import IPythonConsole 143 | from rdkit.Chem import Draw 144 | from rdkit.Chem import SaltRemover 145 | mmpapath = os.path.join(RDConfig.RDContribDir, 'mmpa') 146 | sys.path.append(mmpapath) 147 | df = pd.read_csv('Chembl_FXa.txt', sep='\t') 148 | remover = SaltRemover.SaltRemover() 149 | mols = [] 150 | for i, smi in enumerate(df.CANONICAL_SMILES): 151 | try: 152 | mol = Chem.MolFromSmiles(smi) 153 | mol.SetProp('CMPD_CHEMBLID', df.CMPD_CHEMBLID[i]) 154 | mol = remover.StripMol(mol) 155 | mols.append(mol) 156 | except: 157 | print(smi) 158 | ---- 159 | 160 | Then, import the mmpa rfrag registered in RDKit contrib, and divide the molecule into fragments. 161 | 162 | [source, python] 163 | ---- 164 | import rfrag 165 | rfragdata = [] 166 | for i, smi in enumerate(df.CANONICAL_SMILES): 167 | try: 168 | out = rfrag.fragment_mol(smi, df.CMPD_CHEMBLID[i]) 169 | rfragdata.append(out) 170 | except: 171 | print(smi, df.CMPD_CHEMBLID[i]) 172 | ---- 173 | 174 | Define a function to create an MMS. The code is almost the same as that described in the UGM document, but I changed the reading destination from a file to a list in order to do all processing on Jupyter. 175 | 176 | Here is an overview of the MMS creation process. 177 | 178 | . 
Cut each molecule according to a certain rule (cut by rotatable bond etc.) 179 | . Cut fragments create a dictionary of keys, store the fragments of molecules with the same key in the dictionary value 180 | 181 | By repeating the above process, molecules with common scaffold can be organized. Molecules that are grouped in a common scaffold will be molecules that have different non-scaffold substituents. 182 | 183 | .What is a scaffold? 184 | **** 185 | In drug discovery, there is a stage of structural optimization at the stage before preclinical studies, in which the major non-skeleton part of the compound is converted briefly into a balanced property suitable for drugs. 186 | 187 | This main skeleton is called a scaffold. For example, link:https://patentscope2.wipo.int/search/ja/detail.jsf?docId=JP232673446[in this patent], the part except R is fixed and this main skeleton is called a scaffold. 188 | 189 | image::ch07/scaffold.png[scaffold, width=100, pdfwidth=20%] 190 | **** 191 | 192 | [source, python] 193 | ---- 194 | from collections import namedtuple 195 | 196 | Frag = namedtuple( 'Frag', ['id', 'scaffold', 'rgroup'] ) 197 | 198 | class Series(): 199 | def __init__( self ): 200 | self.rgroups = [] 201 | self.scaffold = "" 202 | 203 | def getFrags(rfrags): 204 | frags = [] 205 | for lines in rfrags: 206 | for line in lines: 207 | broken = line.rstrip().split(",") 208 | if broken[2]: # single cut 209 | continue 210 | smiles = broken[-1].split(".") 211 | mols = [Chem.MolFromSmiles( smi ) for smi in smiles] 212 | numAtoms = [mol.GetNumAtoms() for mol in mols] 213 | if len(numAtoms) < 2: 214 | continue 215 | if numAtoms[0] > 5 and numAtoms[1] < 12: 216 | frags.append(Frag(broken[1], smiles[0], smiles[1])) 217 | if numAtoms[1] > 5 and numAtoms[0] < 12: 218 | frags.append(Frag(broken[1], smiles[1], smiles[0])) 219 | frags.sort(key=lambda x:(x.scaffold, x.rgroup)) 220 | return frags 221 | 222 | def getSeries(frags): 223 | oldfrag = Frag(None, None, None) 224 | series 
= Series() 225 | for frag in frags: 226 | if frag.scaffold != oldfrag.scaffold: 227 | if len(series.rgroups) >= 2: 228 | series.scaffold = oldfrag.scaffold 229 | yield series 230 | series = Series() 231 | series.rgroups.append((frag.rgroup, frag.id)) 232 | oldfrag = frag 233 | if len(series.rgroups) >= 2: 234 | series.scaffold = oldfrag.scaffold 235 | yield series 236 | ---- 237 | 238 | We are ready to make an MMS. Visualize only data that has four or more substituent conversions for the same scaffold. 239 | 240 | [source, python] 241 | ---- 242 | frags = getFrags(rfragdata) 243 | series = getSeries(frags) 244 | series =[i for i in series] 245 | from IPython.display import display 246 | for s in series[:50]: 247 | mols = [Chem.MolFromSmiles(s.scaffold)] 248 | ids = ['scaffold'] 249 | for r in s.rgroups: 250 | rg = Chem.MolFromSmiles(r[0]) 251 | mols.append(rg) 252 | ids.append(r[1]) 253 | if len(mols) > 5: 254 | display(Draw.MolsToGridImage(mols, molsPerRow=5, legends=ids)) 255 | print("########") 256 | ---- 257 | 258 | image::ch07/mms01.png[MMS] 259 | 260 | Five scaffolds for MMS were displayed for the scaffold. 261 | 262 | NOTE: link:https://pubs.acs.org/doi/10.1021/jm500022q[Activity prediction] can also be performed using this MMS. 263 | 264 | === Visualize MMP networks using Cytoscape 265 | 266 | WARNING: This content is beyond the content of the introductory, so please skip if you are not interested. 267 | 268 | MMP can be thought of as a graph structure that uses pre-conversion and post-conversion information as nodes and conversion rules as edges. This graph structure can be intuitively understood by using network visualization tools such as Cytoscape. 269 | 270 | In addition to the MMPA introduced earlier, RDKit has another project called link:https://github.com/rdkit/mmpdb[mmpdb]. It is provided as a command line tool group and database system, so it has the feature of being easy to manage in the long run. 
In this section, we introduce the visualization of MMP using link:https://github.com/Mishima-syk/12/tree/master/kzfm[mmpdb and Cytoscape]. 271 | 272 | 273 | NOTE: link:https://chemrxiv.org/articles/mmpdb_An_Open_Source_Matched_Molecular_Pair_Platform_for_Large_Multi-Property_Datasets/5999375[mmpdb: An Open Source Matched Molecular Pair Platform for Large Multi-Property Datasets] 274 | 275 | ==== Cytoscape installation 276 | 277 | link:https://cytoscape.org/[Cytoscape] is an open source network visualization software widely used in various scenes. You can display the structure network by using the compound structure display plug-in. 278 | 279 | Installation is as easy as downloading the corresponding OS installer from the link:https://cytoscape.org/download.html[download site] and installing according to the instructions. 280 | 281 | When installation is complete, launch Cytoscape and install the Chemviz2 plug-in for drawing compound structures. The procedure is easy, select chemviz2 from Apps → App Manager and install it. 282 | 283 | 284 | image::ch07/chemviz2.png[AppManager, width=400] 285 | 286 | ==== create a gml file from mmpdb 287 | 288 | The data to be used this time are 151 compounds of J. Med. Chem. (2008) 51: 2062-2077 . In principle, MMPA does not use HTS-like search data but scaffolds such as structure optimization. 289 | 290 | I will put the flow of the command. SMILES text and activity and property data need to be registered separately in the database. 291 | 292 | [source, bash] 293 | ---- 294 | $ mmpdb fragment smiles.txt -o CHEMBL930273.fragments # fragmentation 295 | $ mmpdb index CHEMBL930273.fragments -o CHEMBL930273.db # make db 296 | $ mmpdb loadprops -p act.txt CHEMBL930273.db # load properties 297 | ---- 298 | 299 | After that we will create a gml file for reading by Cytoscape, but this is beyond the scope of this document and will be omitted. 
If you are interested, you may want to read the link:https://github.com/Mishima-syk/12/tree/master/kzfm[code] directly, but the flow is as follows. 300 | 301 | . link:https://github.com/Mishima-syk/12/blob/master/kzfm/mmp2gml.py[Make a gml file using mmpdb and python-igraph] 302 | . link:https://github.com/Mishima-syk/12/blob/master/kzfm/CHEMBL930273.gml[Read gml file] by Cytoscape 303 | . Assign attributes to each parameter in Cytoscape to make it easier to understand visually 304 | .. Map the node size to the property value 305 | .. Map the edge color to the difference in activity 306 | .. Draw a structure with chemviz2 plugin and paste it to a node 307 | 308 | 309 | ==== Interpretation 310 | 311 | Let's look at the MMP network. MMPs with little difference in activity are clustered in the upper left. In the lower right, red edges (a large difference in activity) are observed. MMPs in which such a small substituent change produces a large difference in activity are called Activity Cliffs. It is important not to overlook such changes in activity, as an Activity Cliff often leads to a breakthrough in a drug discovery project. 312 | 313 | image:ch07/mmp01.png[MMPN, width=600, pdfwidth=48%] image:ch07/mmp02.png[MMPN, width=600, pdfwidth=48%] 314 | 315 | When we check which substitution was actually made, we find that replacing the OH group with the MeO group causes the loss of activity. 316 | 317 | Since MMP alone can only tell us facts like this, I searched for a crystal structure of a complex with an analogue in order to consider it a little more deeply. Then, a complex of GSK3β and a similar compound was found as link:https://www.rcsb.org/structure/5OY4[PDBID:5OY4]. 318 | 319 | 320 | image:ch07/mmp03.png[MMPN, width=600, pdfwidth=48%] image:ch07/mmp04.png[MMPN, width=600, pdfwidth=48%] 321 | 322 | If you replace the OH group with the MeO group, it will likely hit the wall of the pocket.
In other words, this Activity Cliff is considered to be caused by steric hindrance of ligand and protein. 323 | 324 | <<< 325 | -------------------------------------------------------------------------------- /ch08_visualization.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 8: Want to have lots of compounds at once 2 | :imagesdir: images 3 | 4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch08_visualization.ipynb"] 5 | 6 | In order to see how much data is distributed, it is common to map in an appropriate space. Especially in chemoinformatics the word chemical space is used. 7 | 8 | === What is Chemical Space 9 | 10 | Chemical space refers to the arrangement of compounds in an n-dimensional space at some scale. In general, two or three dimensions are often used (for human understanding). Although various methods have been proposed for the scale, ie, similarity, it is often decided that a distance that well characterizes a compound is defined. 11 | 12 | This time, we will visualize which pharmaceutical company is developing what kind of compound for the antagonist of Orexin Receptor, which is known as a target for sleep medicine. See Chapter 4 for how to download data. This time we used the data of 10 papers in the table. 13 | 14 | There are two main things I want to know this time: 15 | 16 | - Were there companies that developed similar compounds? 17 | - Has Merck optimized only similar frameworks, or did it optimize multiple frameworks? 18 | 19 | .Orexin Receptor Antagonist 20 | |=== 21 | |Doc ID|Journal|Pharma 22 | |CHEMBL3098111|link:https://www.sciencedirect.com/science/article/pii/S0960894X13012511?via%3Dihub[Bioorg. Med. Chem. Lett. 
(2013) 23:6620-6624]|Merck 23 | |CHEMBL3867477|link:https://www.sciencedirect.com/science/article/pii/S0960894X16310472?via%3Dihub[Bioorg Med Chem Lett (2016) 26:5809-5814]|Merck 24 | |CHEMBL2380240|link:https://www.sciencedirect.com/science/article/pii/S0960894X13002801?via%3Dihub[Bioorg. Med. Chem. Lett. (2013) 23:2653-2658]|Rottapharm 25 | |CHEMBL3352684|link:https://www.sciencedirect.com/science/article/pii/S0960894X14008853?via%3Dihub[Bioorg. Med. Chem. Lett. (2014) 24:4884-4890]|Merck 26 | |CHEMBL3769367|link:https://pubs.acs.org/doi/10.1021/acs.jmedchem.5b00832[J. Med. Chem. (2016) 59:504-530]|Merck 27 | |CHEMBL3526050|link:http://dmd.aspetjournals.org/content/41/5/1046[Drug Metab. Dispos. (2013) 41:1046-1059]|Actelion 28 | |CHEMBL3112474|link:https://www.sciencedirect.com/science/article/pii/S0960894X13014765?via%3Dihub[Bioorg. Med. Chem. Lett. (2014) 24:1201-1208]|Actelion 29 | |CHEMBL3739366|link:https://pubs.rsc.org/en/Content/ArticleLanding/2015/MD/C5MD00027K#!divAbstract[MedChemComm (2015) 6:947-955]|Heptares 30 | |CHEMBL3739395|link:https://pubs.rsc.org/en/Content/ArticleLanding/2015/MD/C5MD00074B#!divAbstract[MedChemComm (2015) 6:1054-1064]|Actelion 31 | |CHEMBL3351489|link:https://www.sciencedirect.com/science/article/pii/S0968089614006300?via%3Dihub[Bioorg. Med. Chem. (2014) 22:6071-6088]|Eisai 32 | |=== 33 | 34 | 35 | === Mapping using Euclidean distance 36 | 37 | Use ggplot for the drawing library. Principal component analysis (PCA) is used to distribute and visualize similar compounds close together. At first we import necessary library 38 | 39 | [source, python] 40 | ---- 41 | from rdkit import Chem, DataStructs 42 | from rdkit.Chem import AllChem, Draw 43 | import numpy as np 44 | import pandas as pd 45 | from ggplot import * 46 | from sklearn.decomposition import PCA 47 | import os 48 | ---- 49 | 50 | Load the downloaded sdf, and create fingerprints for each compound, enabling correspondence between drug companies and document IDs. 
If you have any questions please check Chapter 6. 51 | 52 | [source, python] 53 | ---- 54 | oxrs = [("CHEMBL3098111", "Merck" ),("CHEMBL3867477", "Merck" ), 55 |      ("CHEMBL2380240", "Rottapharm" ),("CHEMBL3352684", "Merck" ), 56 |      ("CHEMBL3769367", "Merck" ),("CHEMBL3526050", "Actelion" ), 57 |      ("CHEMBL3112474", "Actelion" ),("CHEMBL3739366", "Heptares" ), 58 |      ("CHEMBL3739395", "Actelion" ), ("CHEMBL3351489", "Eisai" )] 59 | 60 | fps = [] 61 | docs = [] 62 | companies = [] 63 | 64 | for cid, company in oxrs: 65 | sdf_file = os.path.join("ch08", cid + ".sdf") 66 | mols = Chem.SDMolSupplier(sdf_file) 67 | for mol in mols: 68 | if mol is not None: 69 | fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2) 70 | arr = np.zeros((1,)) 71 | DataStructs.ConvertToNumpyArray(fp, arr) 72 | docs.append(cid) 73 | companies.append(company) 74 | fps.append(arr) 75 | fps = np.array(fps) 76 | companies = np.array(companies) 77 | docs = np.array(docs) 78 | ---- 79 | 80 | If you check the information of the fingerprint, you can see that data of 293 compounds are obtained from 10 articles. 81 | 82 | [source, python] 83 | ---- 84 | fps.shape 85 | # (293, 2048) 86 | ---- 87 | 88 | You are now ready for principal component analysis. The number of principal components can be specified by n_components, but this time, I want to scatter two dimensions, so I set it to 2. 89 | 90 | [source, python] 91 | ---- 92 | pca = PCA(n_components=2) 93 | x = pca.fit_transform(fps) 94 | ---- 95 | 96 | Draw. I changed the color option according to each label, so I chose two attributes, COMPANY and DOCID. 97 | 98 | [source, python] 99 | ---- 100 | d = pd.DataFrame(x) 101 | d.columns = ["PCA1", "PCA2"] 102 | d["DOCID"] = docs 103 | d["COMPANY"] = companies 104 | g = ggplot(aes(x="PCA1", y="PCA2", color="COMPANY"), data=d) + geom_point() + xlab("X") + ylab("Y") 105 | g 106 | ---- 107 | 108 | You can now see what compounds each pharmaceutical company has optimized. 
Merck, Actelion, Eisai and Heptares seem to have optimized similar compounds, as there is an overlapping area in the center of the chemical space. It is interesting to consider whether Actelion successfully expanded in a unique direction (lower left), or failed to expand there and moved into the crowded "red ocean" at the center. 109 | 110 | Also, Merck seems to have optimized various frameworks. I don't know whether they were optimized in parallel or advanced as backups, but there is no doubt that many scaffold optimizations were running, so it was probably an attractive target. In fact, link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL1083659/[SUVOREXANT] was launched. 111 | 112 | image:ch08/pca01.png[PCA, size=400, pdfwidth=48%] image:ch08/pca02.png[PCA, size=400, pdfwidth=48%] 113 | 114 | .patinformatics 115 | **** 116 | In this chapter, we use data from journal articles, but article data is not used when performing this kind of analysis in practice. This is because, by the time a company publishes a paper, the project is usually over (whether it advanced to clinical trials or failed and was closed). In practice, the analysis is performed using patent data. 117 | 118 | Based on such analysis, the link:http://rkakamilan.hatenablog.com/entry/2017/12/17/235417[experience of Medicinal Chemist] and insights into these companies, a project proceeds with a belief in its own success while inferring the situation of other companies. 119 | 120 | **** 121 | 122 | === Mapping using tSNE 123 | 124 | It is said that tSNE has better resolution than PCA and is closer to the intuition of a medicinal chemist. With scikit-learn, you only need to replace PCA with TSNE. 125 | 126 | [source, python] 127 | ---- 128 | from sklearn.manifold import TSNE 129 | tsne = TSNE(n_components=2, random_state=0) 130 | tx = tsne.fit_transform(fps) 131 | ---- 132 | 133 | As you can see when drawing, it is separated better than PCA.
134 | 135 | [source, python] 136 | ---- 137 | d = pd.DataFrame(tx) 138 | d.columns = ["PCA1", "PCA2"] 139 | d["DOCID"] = docs 140 | d["COMPANY"] = companies 141 | g = ggplot(aes(x="PCA1", y="PCA2", color="COMPANY"), data=d) + geom_point() + xlab("X") + ylab("Y") 142 | g 143 | ---- 144 | 145 | image::ch08/tsne01.png[PCA, size=500] 146 | 147 | There are many other drawing methods besides PCA and tSNE introduced this time, so it is good to check. 148 | 149 | <<< 150 | -------------------------------------------------------------------------------- /ch09_qsar.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 9: Basics of Quantitative Structure-Activity Relationship (QSAR) 2 | 3 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch09_qsar.ipynb"] 4 | 5 | The correlation between chemical structure and biological activity is called Structure Activity Relationship (SAR) or Quantitative SAR (QSAR). In general, **similar compounds** are known to exhibit **similar biological activities**, and it is very important in drug discovery research to understand this correlation and apply it to drug design. 6 | 7 | In addition, there are two types of problems such as classification problems to estimate which class a compound belongs to, such as cell death or toxicity, or toxicity, and regression problems to estimate continuous values such as % inhibition. 8 | 9 | === Consider the cause of no effect (classification problem) 10 | 11 | Label the ones with an IC50 less than 1 uM with hERG inhibition and the others with no hERG inhibition using 73 data from ChEMBL link:https://www.ebi.ac.uk/chembl/assay/inspect/CHEMBL829152[hERG inhibition assay]. 12 | 13 | First, import the necessary libraries. 
14 | 15 | [source, python] 16 | ---- 17 | from rdkit import Chem, DataStructs 18 | from rdkit.Chem import AllChem, Draw 19 | from rdkit.Chem.Draw import IPythonConsole 20 | import numpy as np 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.metrics import confusion_matrix, f1_score 23 | from sklearn.ensemble import RandomForestClassifier 24 | ---- 25 | 26 | Processing of tab-delimited text downloaded with ChEMBL is almost the same as in Chapter 8, but this time I want liveness data, so I search for the column **STANDARD_VALUE** and retrieve the numerical value. If this value is less than 1000 nM, label it as POS, otherwise label it as NEG. At the end, I will make the label numpy array. 27 | 28 | [source, python] 29 | ---- 30 | mols = [] 31 | labels = [] 32 | with open("ch09_compounds.txt") as f: 33 | header = f.readline() 34 | smiles_index = -1 35 | for i, title in enumerate(header.split("\t")): 36 | if title == "CANONICAL_SMILES": 37 | smiles_index = i 38 | elif title == "STANDARD_VALUE": 39 | value_index = i 40 | for l in f: 41 | ls = l.split("\t") 42 | mol = Chem.MolFromSmiles(ls[smiles_index]) 43 | mols.append(mol) 44 | val = float(ls[value_index]) 45 | if val < 1000: 46 | labels.append("POS") 47 | else: 48 | labels.append("NEG") 49 | 50 | labels = np.array(labels) 51 | ---- 52 | 53 | Then convert the mol object into a fingerprint. From this fingerprint, create a model to predict the presence or absence of hERG inhibition. 54 | 55 | [source, python] 56 | ---- 57 | fps = [] 58 | for mol in mols: 59 | fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2) 60 | arr = np.zeros((1,)) 61 | DataStructs.ConvertToNumpyArray(fp, arr) 62 | fps.append(arr) 63 | fps = np.array(fps) 64 | ---- 65 | 66 | Divide the data set into two of the training set test set. The test set will be used later to evaluate the accuracy of the created prediction model. 
67 | 68 | [source, python] 69 | ---- 70 | x_train, x_test, y_train, y_test = train_test_split(fps, labels) 71 | ---- 72 | 73 | To create a predictive model, just create an instance and train it with the fit method. 74 | 75 | [source, python] 76 | ---- 77 | rf = RandomForestClassifier() 78 | rf.fit(x_train, y_train) 79 | ---- 80 | 81 | Predict the test set you split off earlier. 82 | 83 | [source, python] 84 | ---- 85 | y_pred = rf.predict(x_test) 86 | ---- 87 | 88 | Create a confusion matrix. 89 | 90 | .What is a confusion matrix? 91 | **** 92 | A confusion matrix is a table that summarizes the results of classification. It clearly shows whether each class was classified correctly: the more TP and TN and the fewer FP and FN there are, the better the classification. 93 | 94 | |=== 95 | || 2+|Actual class 96 | 97 | | 98 | | 99 | |Positive 100 | |Negative 101 | 102 | .2+|Predicted class 103 | |Positive 104 | |True Positive(TP) 105 | |False Positive(FP) 106 | 107 | |Negative 108 | |False Negative(FN) 109 | |True Negative(TN) 110 | |=== 111 | **** 112 | 113 | [source, python] 114 | ---- 115 | confusion_matrix(y_test, y_pred) 116 | #array([[11, 1],[ 5, 2]]) 117 | ---- 118 | 119 | |=== 120 | |11 |1 121 | 122 | |5 123 | |2 124 | |=== 125 | 126 | Let's look at the F1 score. 127 | 128 | [source, python] 129 | ---- 130 | f1_score(y_test, y_pred, pos_label="POS") 131 | #0.4 132 | ---- 133 | 134 | It is not very good. 135 | 136 | NOTE: Because the train_test_split function randomly splits the data into a training set and a test set, the values of the confusion matrix and the F1 score change with each execution.
137 | 138 | .With F1 score 139 | **** 140 | 141 | - The ratio of what is truly correct among what is predicted to be correct is called accuracy rate precision = TP / (TP + FP) 142 | - The rate at which the correct thing is predicted to be correct is called the recall rate recall = TP / (TP + FN) 143 | 144 | The F1 score is the harmonic mean of the precision rate and the recall rate 145 | 146 | It is calculated by 147 | F1 = 2 * (precision * recall) / (precision + recall) 148 | 149 | **** 150 | 151 | === Predict the efficacy of drugs (regression problem) 152 | 153 | Regression models, as discussed earlier, are models that predict continuous values. This time, create a regression model of RandomForest, and evaluate its accuracy with R2. Let's use the data from hERG's assay data used in classification problems. Import the required libraries first. 154 | 155 | [source, python] 156 | ---- 157 | from sklearn.ensemble import RandomForestRegressor 158 | from sklearn.metrics import r2_score 159 | from math import log10 160 | ---- 161 | 162 | We labeled it for classification problems, but now we want to predict continuous values, so we convert it to pIC50. (We will supplement later on why it is convenient to use pIC50) 163 | 164 | [source, python] 165 | ---- 166 | pIC50s = [] 167 | with open("ch09_compounds.txt") as f: 168 | header = f.readline() 169 | for i, title in enumerate(header.split("\t")): 170 | if title == "STANDARD_VALUE": 171 | value_index = i 172 | for l in f: 173 | ls = l.split("\t") 174 | val = float(ls[value_index]) 175 | pIC50 = 9 - log10(val) 176 | pIC50s.append(pIC50) 177 | 178 | pIC50s = np.array(pIC50s) 179 | ---- 180 | 181 | Divide the data set into two: training set and test set. The fingerprint uses what was created at the time of classification model. 182 | 183 | [source, python] 184 | ---- 185 | x_train, x_test, y_train, y_test = train_test_split(fps, pIC50s) 186 | ---- 187 | 188 | I will train. 
In Scikit-learn, the procedure is almost the same for any method: call fit, then predict. 189 | 190 | [source, python] 191 | ---- 192 | rf = RandomForestRegressor() 193 | rf.fit(x_train, y_train) 194 | ---- 195 | 196 | Let's predict. 197 | 198 | [source, python] 199 | ---- 200 | y_pred = rf.predict(x_test) 201 | ---- 202 | 203 | Let's compute the prediction accuracy with R2. 204 | 205 | [source, python] 206 | ---- 207 | r2_score(y_test, y_pred) 208 | #0.52 209 | ---- 210 | 211 | That seems about right, doesn't it? 212 | 213 | .What is the R2 score? 214 | 215 | **** 216 | It is often used as one of the evaluation indicators for the goodness of fit of a regression, and is also called the link:https://ja.wikipedia.org/wiki/%E6%B1%BA%E5%AE%9A%E4%BF%82%E6%95%B0[coefficient of determination]. 217 | **** 218 | 219 | === Model applicability (applicability domain) 220 | 221 | The method introduced here generates a model based on the hypothesis that **similar compounds exhibit similar biological activities**. What is the prediction accuracy for a compound that has no similar compounds in the training set? 222 | 223 | Of course, the predicted value is not reliable in that case. In other words, the question of how reliable a given prediction is always comes with the prediction itself. The extent to which such models can be trusted or applied is called the applicability domain. This topic is covered in detail in the article link:https://datachemeng.com/applicabilitydomain/[scope of application and model application] by Dr. Kaneko of Meiji University. 224 | 225 | ==== (Extra column) How reliable can the applicability domain be? 226 | 227 | A long time ago, Dr. Hugo Kubinyi posed the question "Do similar compounds show similar activities?" I remember being impressed by his example that converting the OH group of estradiol into a methoxy group causes a loss of activity. 228 | 229 | The applicability domain is a method to measure the accuracy of the prediction from the similarity of the training set. 
Here comes the question of who the similarity is for. It is our hand that we think this compound and this compound are similar, but it is ultimately determined by the protein whether it is similar or not. Therefore, the activity can not always be predicted from the similarity, and the activity often disappears even if the similarity is extremely high. In particular, Activity Cliff, described in the context of MMP, gives such an event its name. 230 | 231 | <<< 232 | -------------------------------------------------------------------------------- /ch10_deeplearning.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 10: Introduction to Deep Learning 2 | :imagesdir: images 3 | 4 | For this chapter, we will use deep learning to create QSAR models and generation models. 5 | 6 | === About deep learning 7 | 8 | Neurons exist in the brain of an organism, and they form a network to transmit information, store and learn. The Artificial Neural Network (ANN) is a mathematical model of this network structure. 9 | 10 | In general ANN, an input layer for inputting information for learning, an intermediate layer (or hidden layer) for learning a response (corresponding to the firing of a nerve synapse) based on a pattern of input information, and a third-layer or final output layer. However, deep learning enables highly accurate predictions by layering multiple hidden layers. 11 | 12 | Although I will not explain this in this book in particular, if you want to write and understand the code from scratch yourself, link:https://www.amazon.co.jp/dp/4873117585/[Deep Learning from scratch] can be helpful. Also, if you want to learn about theory properly, we recommend link:https://www.amazon.co.jp/dp/4048930621/[deep learning]. 13 | 14 | === About TensorFlow and Keras 15 | 16 | Tensorflow is a framework for machine learning developed by Google and released as OSS. It is often used mainly as a deep learning framework. 
17 | 18 | NOTE: Tensorflow has recently made a major update from 1.x to 2.x, but since the 2.x version has just appeared and there is little reference information, this book uses the 1.x series. Also, since the API differs between versions even within 1.x, if there is code you want to run, please check which version it was written for. 19 | 20 | Keras is a high-level API backed by a low-level framework such as Tensorflow, so you can write code more easily. Keras has been developed independently of Tensorflow, but recently Tensorflow comes with Keras, so you can use Keras without installing it separately. The version of Keras bundled with Tensorflow may not be the latest version of standalone Keras. 21 | 22 | It is a nuisance to have to decide which Keras to use, but for the sake of convenience, we will use the Tensorflow-integrated Keras. 23 | 24 | .Relationship between Keras and Tensorflow 25 | **** 26 | Let me sort out the relationship between Keras and Tensorflow a little, referring to the link:https://blog.keras.io/introducing-keras-2.html[official blog]. Keras was originally developed as a separate project from Tensorflow (and, of course, it still is), so to use Keras, Tensorflow had to be installed separately. However, around the time of the major version upgrade to Keras 2.x in 2017, the Tensorflow project integrated Keras, and it is now possible to call Keras from Tensorflow. The English below is an excerpt from the above link. 27 | 28 | _TensorFlow integration 29 | Although Keras has supported TensorFlow as a runtime backend since December 2015, the Keras API had so far been kept separate from the TensorFlow codebase. This is changing: the Keras API will now become available directly as part of TensorFlow, starting with TensorFlow 1.2. This is a big step towards making TensorFlow accessible to its next million users._ 30 | **** 31 | 32 | === Let's install 33 | 34 | Let's install Tensorflow and Keras. 
When installing with anaconda, the package to be installed differs slightly depending on whether you use the GPU-enabled version or the CPU-only version. 35 | 36 | [source, bash] 37 | ---- 38 | # CPU only 39 | $ conda install -c conda-forge tensorflow 40 | # GPU enabled 41 | $ conda install -c anaconda tensorflow-gpu 42 | ---- 43 | 44 | NOTE: You can also use the pip command to install TensorFlow. In that case, please refer to the link:https://www.tensorflow.org/install[official document]. But basically, if you created your environment with Conda, it is preferable to install packages with Conda as well. 45 | 46 | Reference link 47 | 48 | 49 | - https://keras.io/#installation 50 | - https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-pkgs.html 51 | 52 | === About Google colab 53 | 54 | .Google colab 55 | **** 56 | link:https://colab.research.google.com/notebooks/welcome.ipynb[Google colaboratory] is a Jupyter notebook environment that runs in the cloud. Deep learning frameworks such as Theano, Tensorflow, Keras and Pytorch are already installed. Usage time is limited, but because a GPU is available, it is very attractive that you can do deep learning without having a GPU machine at hand. 57 | 58 | **** 59 | 60 | You need a Google account to use it, so if you do not have a Google account, this is a good opportunity to get one. 61 | If you have a Google account, you can also run notebooks hosted on GitHub directly on Colab. Let's open the Scikit-learn hands-on notebook previously used in Mishima.syk. 62 | 63 | NOTE: This is a notebook created by @y__sama; with it you can learn everything from data preparation to link:https://automl.github.io/auto-sklearn/master/[AutoSklearn]. 64 | 65 | First of all, go to link:https://colab.research.google.com/notebooks/welcome.ipynb[Google colaboratory]. 
If you do not get the screen below, please select "Open Notebook" from "File" at the top left. 66 | 67 | image::ch10/ch10_1.png[GoogleColabTop, width=600, pdfwidth=60%] 68 | 69 | 70 | Next, click the tab named GitHub and paste in the following URL; this lets you load the Jupyter Notebook code from that repository. 71 | 72 | https://github.com/Mishima-syk/sklearn-tutorial 73 | 74 | image::ch10/ch10_2.png[GoogleColab2, width=600, pdfwidth=60%] 75 | 76 | When you open the notebook, you will see a screen similar to the Jupyter Notebook. You can execute the code of a cell with the Shift + Return keys. 77 | 78 | image::ch10/ch10_3.png[NoteBook, width=600, pdfwidth=60%] 79 | 80 | To see the libraries available by default in Google Colab, type '!pip freeze' in a cell and they will be listed. 81 | 82 | - absl-py==0.7.0 83 | - alabaster==0.7.12 84 | - snip ;) 85 | - yellowbrick==0.9.1 86 | - zict==0.1.3 87 | - zmq==0.0.0 88 | 89 | .Python deep learning framework 90 | **** 91 | There are many Python deep learning frameworks. Mainly link:http://deeplearning.net/software/theano/[Theano], 92 | link:https://www.tensorflow.org/[Tensorflow], 93 | link:https://keras.io/[Keras], 94 | link:https://mxnet.apache.org/[MXNet], 95 | link:https://chainer.org/[Chainer], 96 | link:https://pytorch.org/[PyTorch], 97 | etc. 98 | 99 | Various deep learning documents often use one of the above frameworks for implementation. You may want to try them and choose a framework that is easy for you to use. 100 | **** 101 | 102 | <<< 103 | -------------------------------------------------------------------------------- /ch11_dlqsar.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 11: Structure-activity relationship using deep learning 2 | :imagesdir: images 3 | 4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch11_simple_dnn.ipynb"] 5 | 6 | In this chapter, structure-activity relationship analysis is performed using a DNN. 
7 | 8 | === Predictive model construction using DNN 9 | 10 | First, let's build a simple prediction model using a DNN. Here we use the same data as in Chapter 9. First, create a classification model, encoding the Positive label as [0, 1] and the Negative label as [1, 0], i.e. as two-dimensional OneHot vectors. If you create a model using the Keras Model object, you can get the predicted value for each of these two dimensions. You can use Numpy's argmax function to find out which class a sample is most likely to belong to. 11 | 12 | NOTE: A OneHot vector is a vector in which one value is 1 and all the others are 0. For a classification problem with 10 classes, a class is expressed by a vector such as [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], in which one position is 1 and the remaining 9 are 0. In the above example, there are two classes, Positive / Negative, so the OneHot vector is two-dimensional. 13 | 14 | Import the required libraries. 15 | 16 | [source, python] 17 | ---- 18 | from rdkit import Chem, DataStructs 19 | from rdkit.Chem import AllChem, Draw 20 | from rdkit.Chem.Draw import IPythonConsole 21 | import numpy as np 22 | from sklearn.model_selection import train_test_split 23 | from sklearn.metrics import confusion_matrix, f1_score 24 | from tensorflow.python.keras.layers import Input 25 | from tensorflow.python.keras.layers import Dense 26 | from tensorflow.python.keras.layers import Dropout 27 | from tensorflow.python.keras.layers import Activation 28 | from tensorflow.python.keras.models import Model 29 | 30 | ---- 31 | 32 | Next, read the data. In Chapter 9 we put "POS" / "NEG" in the list of labels, so it was a one-dimensional representation, but this time it is two-dimensional. 
33 | 34 | [source, python] 35 | ---- 36 | mols = [] 37 | labels = [] 38 | with open("ch09_compounds.txt") as f: 39 | header = f.readline() 40 | smiles_index = -1 41 | for i, title in enumerate(header.split("\t")): 42 | if title == "CANONICAL_SMILES": 43 | smiles_index = i 44 | elif title == "STANDARD_VALUE": 45 | value_index = i 46 | for l in f: 47 | ls = l.split("\t") 48 | mol = Chem.MolFromSmiles(ls[smiles_index]) 49 | mols.append(mol) 50 | val = float(ls[value_index]) 51 | if val < 1000: 52 | labels.append([0,1]) # Positive 53 | else: 54 | labels.append([1,0]) # Negative 55 | labels = np.array(labels) 56 | ---- 57 | 58 | Next, we create a classification model and then a regression model, in that order. 59 | 60 | The first is the classification model, and the input uses the same ECFP as in Chapter 9. In order to construct a DNN, it is necessary to specify the dimension of the input data explicitly, so we define the variable nBits. 61 | 62 | TIP: Specifying an appropriate integer in random_state for train_test_split is useful for verification because the same split is obtained each time. 63 | 64 | [source, python] 65 | ---- 66 | nBits = 2048 67 | fps = [] 68 | for mol in mols: 69 | fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits) 70 | arr = np.zeros((1,)) 71 | DataStructs.ConvertToNumpyArray(fp, arr) 72 | fps.append(arr) 73 | fps = np.array(fps) 74 | 75 | x_train1, x_test1, y_train1, y_test1 = train_test_split(fps, labels, random_state=794) 76 | ---- 77 | 78 | Create a neural network with a 2048-dimensional input, three fully connected layers of 300 neurons each, and a two-dimensional final output layer. We used ReLU as the activation function of the hidden layers and Softmax for the output layer, since this is a two-class classification. 79 | 80 | The Dropout layer helps to prevent overfitting by randomly deactivating neurons during training. 81 | 82 | Keras constructs a model by defining the model and then calling the compile function. 
The optimizer and loss need to be chosen according to the purpose. Here we used the 'adam' optimizer with the 'categorical_crossentropy' loss, but there are many link:https://keras.io/ja/optimizers/[optimizers other than adam], so in practice finding an appropriate one requires trial and error. 83 | 84 | TIP: link:https://en.wikipedia.org/wiki/Rectifier_(neural_networks)[ReLU] is often used because it can overcome the vanishing gradient problem of the link:https://en.wikipedia.org/wiki/Sigmoid_function[Sigmoid] function. 85 | 86 | [source, python] 87 | ---- 88 | # Define DNN classifier model 89 | epochs = 10 90 | inputlayer1 = Input(shape=(nBits, )) 91 | x1 = Dense(300, activation='relu')(inputlayer1) 92 | x1 = Dropout(0.2)(x1) 93 | x1 = Dense(300, activation='relu')(x1) 94 | x1 = Dropout(0.2)(x1) 95 | x1 = Dense(300, activation='relu')(x1) 96 | output1 = Dense(2, activation='softmax')(x1) 97 | model1 = Model(inputs=[inputlayer1], outputs=[output1]) 98 | 99 | model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) 100 | ---- 101 | 102 | NOTE: 103 | Keras provides a link:https://keras.io/ja/models/sequential/[Sequential] model, which can be used to describe the network more simply than the example above (Functional API). The reason we defined the model with the Functional API is that, once you get used to it, it makes it easy to handle multiple inputs and more complex models. If you are interested in writing Sequential models, please check out the official site and Qiita. 104 | 105 | NOTE: A DNN optimizes the model by iterating the Backpropagation procedure, which compares the actual value with the value predicted from the initial, randomly generated weights, and updates the weights so as to minimize the difference (LOSS). Epochs specifies the number of repetitions. It may seem that the model gets smarter as you increase Epochs, but this raises the computational cost and the risk of overfitting, so more is not always better. Observe Loss, Accuracy, etc. 
and find the appropriate number of Epochs. 106 | 107 | .Why is there a risk of overfitting when increasing Epochs? 108 | **** 109 | Using the training data, the weights are adjusted in each Epoch to reduce the error between the correct value and the predicted value. Even if a sufficient amount of training data is used, repeating this too many times reduces the generalization performance of the model, since the same training data is learned over and over again. 110 | 111 | To detect overfitting, evaluate and plot the accuracy on the Training set / Validation set for each Epoch, and check whether the accuracy on the Validation set stops changing or deteriorates while the accuracy on the Training set keeps improving. Keras has a function called link:https://keras.io/ja/callbacks/[Early stopping], which allows you to stop training if the performance of the model has not improved after a certain number of Epochs. 112 | 113 | See the introduction and references of https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/[Early stopping] for more information. 114 | 115 | **** 116 | 117 | After building the model, you can do fit / predict in the same way as Scikit-learn. 118 | 119 | [source, python] 120 | ---- 121 | hist1 = model1.fit(x_train1, y_train1, epochs=epochs) 122 | ---- 123 | 124 | Finally, let's visualize the result. 125 | 126 | [source, python] 127 | ---- 128 | %matplotlib inline 129 | import matplotlib.pyplot as plt 130 | plt.plot(range(epochs), hist1.history['acc'], label='acc') 131 | plt.legend() 132 | plt.plot(range(epochs), hist1.history['loss'], label='loss') 133 | plt.legend() 134 | ---- 135 | 136 | In this example, the model reaches good accuracy at around epoch 6. 137 | 138 | Next, verify with test data. 
139 | 140 | [source, python] 141 | ---- 142 | y_pred1 = model1.predict(x_test1) 143 | y_pred_cls1 = np.argmax(y_pred1, axis=1) 144 | y_test_cls1 =np.argmax(y_test1, axis=1) 145 | confusion_matrix(y_test_cls1, y_pred_cls1) 146 | ---- 147 | 148 | The result is a little underwhelming... 149 | 150 | The regression model is basically the same as the classification model above. Since this is a regression, the final output layer gives the value itself, i.e. it is one-dimensional. An activation function such as Sigmoid would restrict the output to the range 0-1, so we use Linear instead. The training data is prepared with the code from Chapter 9. 151 | 152 | [source, python] 153 | ---- 154 | from math import log10 155 | from sklearn.metrics import r2_score 156 | pIC50s = [] 157 | with open("ch09_compounds.txt") as f: 158 | header = f.readline() 159 | for i, title in enumerate(header.split("\t")): 160 | if title == "STANDARD_VALUE": 161 | value_index = i 162 | for l in f: 163 | ls = l.split("\t") 164 | val = float(ls[value_index]) 165 | pIC50 = 9 - log10(val) 166 | pIC50s.append(pIC50) 167 | 168 | pIC50s = np.array(pIC50s) 169 | x_train2, x_test2, y_train2, y_test2 = train_test_split(fps, pIC50s, random_state=794) 170 | ---- 171 | 172 | Next, define the model. Note that the loss is MSE (mean squared error), unlike the classification model above. 173 | 174 | [source, python] 175 | ---- 176 | epochs = 50 177 | inputlayer2 = Input(shape=(nBits, )) 178 | x2 = Dense(300, activation='relu')(inputlayer2) 179 | x2 = Dropout(0.2)(x2) 180 | x2 = Dense(300, activation='relu')(x2) 181 | x2 = Dropout(0.2)(x2) 182 | x2 = Dense(300, activation='relu')(x2) 183 | output2 = Dense(1, activation='linear')(x2) 184 | model2 = Model(inputs=[inputlayer2], outputs=[output2]) 185 | model2.compile(optimizer='adam', loss='mean_squared_error') 186 | ---- 187 | 188 | Once you have done this, the rest is the same. 
189 | 190 | [source, python] 191 | ---- 192 | hist = model2.fit(x_train2, y_train2, epochs=epochs) 193 | y_pred2 = model2.predict(x_test2) 194 | r2_score(y_test2, y_pred2) 195 | plt.scatter(y_test2, y_pred2) 196 | plt.xlabel('exp') 197 | plt.ylabel('pred') 198 | plt.plot(np.arange(np.min(y_test2)-0.5, np.max(y_test2)+0.5), np.arange(np.min(y_test2)-0.5, np.max(y_test2)+0.5)) 199 | ---- 200 | 201 | What do you think. The prediction model looks a bit like UnderEstimate. The DNN needs to tune a number of parameters, such as the number of layers to overlap, the percentage of dropouts, the number of neurons in the hidden layer, and the type of activation function. This example was hard-coded, but it is also interesting to compare the performance of the models by changing various parameters. 202 | 203 | === I will devise a descriptor (neural fingerprint) 204 | 205 | So far, we have created models of RandomForest and DNN using molecular fingerprints as input. One of the reasons why DNN has received a great deal of attention is that models can recognize feature quantities even if people do not extract feature quantities. 206 | 207 | For example, in image classification, a human defined the feature quantity called link:https://en.wikipedia.org/wiki/Scale-invariant_feature_transform[SIFT], and a model was created using this as an input, but the current DNN basically uses the pixel information of the image itself. 208 | 209 | In terms of chemoinformatics, SIFT is equivalent to a molecular fingerprint. So isn't it possible to improve DNN's performance by changing this (input) to a more primitive expression? It is extremely natural to think that. In 2015, Alan Aspuru-Guzik et al's group at Harvard University proposed the link:https://arxiv.org/pdf/1509.09292.pdf[Neural Finger print/NFP] as a challenge. 210 | 211 | The differences between ECFP and NFP used so far are shown by citing figures in their papers. 
212 | 213 | image::ch11/ch11_nfp.png[Neural Finger Print] 214 | 215 | ECFP (Circular Fingerprints) converts information from each atom of input molecules to atoms in the vicinity of N (N is arbitrary) into Hash values (Mod in this example) to arbitrary values, and converts them into vectors of fixed length was. Roughly speaking, it is an image such as using the one where the presence or absence of the partial structure is corrected to the bit information of 0/1. On the other hand, NFP introduced this time is similar in concept to ECFP, but the part of Hash function is Sigmoid, and the part to be discretized with Mod is Softmax. Therefore, it is expected that input datasets will generate molecular fingerprints more flexibly than ECFP. 216 | 217 | A number of implementations have been published to GitHub since this paper was published, but each implementation does not work with Keras 1.x or Keras / Tensorflow, even if the Backend is Theano or Keras / Tensorflow There are a lot of environment-dependent things that are surprisingly difficult to handle. Unfortunately there is no one that works in the environment we built this time, so I created one that works with Keras 2.x / Python 3.6 based on this code . 218 | 219 | .Was it effective to use the classical method with pixel as it is in image classification? 220 | **** 221 | SIFT was proposed in 1999. According to the link:https://www.cs.ubc.ca/~lowe/papers/iccv99.pdf[original paper], the difficulty in dealing with pixels themselves in object (image) recognition seems to be in dealing with objects that differ in position, rotation, size (scale), light intensity, etc. It seems that various methods have been studied to convert these fluctuating values into universal features. 
There is no way to use the pixels themselves, but the machine learning that I started with link:https://www.oreilly.co.jp/books/9784873117980/[python], which I purchased when studying machine learning , has an example of learning and classifying human face image data. Here, with the pixel data as input, the feature of the face is extracted and classified by principal component analysis. I have not been able to find a document that was clearly valid on this question, but I think it was valid depending on the task. Please comment if you have any details. 222 | 223 | **** 224 | 225 | [source, python] 226 | ---- 227 | git clone https://github.com/iwatobipen/keras-neural-graph-fingerprint.git 228 | ---- 229 | 230 | If you look at the code in the example.py file, you will find the atmosphere somehow. In the previous examples, molecule representations were generated using RDKit for this example, but this time the fingerprint itself is learned by DNN. 231 | 232 | So, representing molecules as a graph is the input. As Atom_matrix, (max_atoms, num_atom_features) is used as Edge_matrix, (max_atoms, max_degree) as bond_tensor, and three matrices (max_atoms, max_degree, num_bond_features) are used. Since each molecule has a different number of atoms, max_atoms defines the maximum number of atoms. By doing this, it becomes input of the same matrix size for each numerator and batch learning becomes possible. 233 | 234 | If you want to execute Example, please enter the following command. 
235 | 236 | [source, python] 237 | ---- 238 | python example.py 239 | ---- 240 | 241 | Reference link: 242 | - link:https://arxiv.org/abs/1509.09292[NGF-paper] 243 | - link:https://arxiv.org/abs/1611.03199[DeepChem-paper] 244 | - link:http://www.keiserlab.org/[keiserlab] 245 | - link:https://github.com/HIPS/neural-fingerprint[HIPS NFP] 246 | - link:https://github.com/debbiemarkslab/neural-fingerprint-theano[Theano base] 247 | - link:https://github.com/GUR9000/KerasNeuralFingerprint[for keras1.x] 248 | - link:https://github.com/ericmjl/graph-fingerprint[ericmjl/graph_fp] 249 | - link:https://github.com/deepchem/deepchem[DeepChem] 250 | - link:https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/[About Eary stopping] 251 | - link:https://www.cs.ubc.ca/~lowe/papers/iccv99.pdf[SIFT original Paper] 252 | 253 | <<< 254 | -------------------------------------------------------------------------------- /ch12_generativemodels.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 12: Let the computer think about the chemical structure 2 | :imagesdir: images 3 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch12_rnn.ipynb"] 4 | 5 | A generation model is one of the things that Deep Learning has had a great impact on the medicinal chemistry. In particular, the evolution of generation models in the last few years is amazing. Here, let's propose a new synthesis proposal using link:https://github.com/MarcusOlivecrona/REINVENT[REINVENT developed by Marcus Olivecrona]. 6 | 7 | .What is a Generation Model? 8 | **** 9 | The prediction model built in Chapter 11 is generally called a discrimination model. On the other hand, by modeling the distribution of inputs, it is possible to generate sampling or input data from the model. This is called a generative model. 
10 | 11 | For more details, we recommend reading link:https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf[PRML 1.5.4] 12 | 13 | **** 14 | 15 | === Preparation 16 | Install a deep learning library called PyTorch with conda. REINVENT does not work with newer versions of PyTorch, so specify the version when installing it. 17 | 18 | 19 | .What is pytorch? 20 | Like TensorFlow and Keras, it is a library for deep learning. It is an independent framework and does not run on top of TensorFlow. 21 | 22 | [source, bash] 23 | ---- 24 | $ conda install pytorch=0.3.1 -c pytorch 25 | ---- 26 | 27 | Then clone REINVENT itself from GitHub. 28 | 29 | [source, bash] 30 | ---- 31 | $ cd 32 | $ git clone https://github.com/MarcusOlivecrona/REINVENT.git 33 | ---- 34 | 35 | Next, download a model pre-trained on about 1.1 million ChEMBL compounds and replace the original data with it. Training this model takes five or six hours even on a GTX 1080Ti GPU machine, so if you want to train it yourself, a GPU machine is a must. 36 | 37 | 38 | [source, bash] 39 | ---- 40 | $ wget https://github.com/Mishima-syk/13/raw/master/generator_handson/data.zip 41 | $ unzip data.zip 42 | $ mv data ./REINVENT/ 43 | ---- 44 | 45 | Now you are ready. 46 | 47 | === Illustration 48 | 49 | Here we create a model that produces analogues of the antidiabetic drug sitagliptin, known commercially as link:https://www.drugbank.ca/drugs/DB01261[Januvia]. 50 | 51 | First, train the model to generate highly similar structures, using the Tanimoto coefficient as the score. This time we train for 3000 steps, which takes about 7 or 8 hours on a slightly older MacBook Air. If you cannot wait, please use the data link:https://github.com/Mishima-syk/13/tree/master/generator_handson/sitagliptin_agent_3000[here]. 
52 | 53 | 54 | [source, bash] 55 | ---- 56 | ./main.py --scoring-function tanimoto --scoring-function-kwargs query_structure 'N[C@@H](CC(=O)N1CCn2c(C1)nnc2C(F)(F)F)Cc3cc(F)c(F)cc3F' --num-steps 3000 --sigma 80 57 | ---- 58 | 59 | From here, I will launch jupyter notebook. 60 | 61 | Load the necessary libraries. Specify the REINVENT directory for sys.path.append. 62 | 63 | 64 | [source, python] 65 | ---- 66 | %matplotlib inline 67 | import sys 68 | sys.path.append("[Your REINVENT DIR]") 69 | from rdkit import Chem 70 | from rdkit.Chem import AllChem, DataStructs, Draw 71 | import torch 72 | from model import RNN 73 | from data_structs import Vocabulary 74 | from utils import seq_to_smiles 75 | ---- 76 | 77 | Next, sample 50 compounds from the trained model. 78 | 79 | [source, python] 80 | ---- 81 | voc = Vocabulary(init_from_file="/Users/kzfm/mishima_syk/REINVENT/data/Voc") 82 | Agent = RNN(voc) 83 | Agent.rnn.load_state_dict(torch.load("sitagliptin_agent_3000/Agent.ckpt")) 84 | seqs, agent_likelihood, entropy = Agent.sample(50) 85 | smiles = seq_to_smiles(seqs, voc) 86 | ---- 87 | 88 | Let's see what kind of structure was actually generated. 89 | 90 | [source, python] 91 | ---- 92 | mols = [] 93 | for smi in smiles: 94 | mol = Chem.MolFromSmiles(smi) 95 | if mol is not None: 96 | mols.append(mol) 97 | 98 | Draw.MolsToGridImage(mols, molsPerRow=3, subImgSize=(500,400)) 99 | ---- 100 | 101 | Is there anything like that? 
102 | 103 | image:ch11/ch11_01.png[Sitagliptin_analogues] 104 | 105 | .About REINVENT 106 | **** 107 | By all means, please read link:https://arxiv.org/abs/1704.07555[Molecular De Novo Design through Deep Reinforcement Learning] 108 | **** 109 | 110 | <<< 111 | -------------------------------------------------------------------------------- /ch13_beyond.asciidoc: -------------------------------------------------------------------------------- 1 | == Chapter 13: Conclusion 2 | :imagesdir: images 3 | 4 | === To learn more 5 | 6 | NOTE:: If you are interested in what you are interested in, you can send requests to the issue that you want to know more, or reply with twitter. Also recommended suggestions are saved. 7 | 8 | 9 | ==== Those who want to learn more machine learning 10 | 11 | 12 | You should aim to be able to read through link:https://www.microsoft.com/en-us/research/people/cmbishop/#!prml-book[Pattern Recognition and Machine Learning(PRML)]. The PDF can be downloaded for free. 13 | 14 | If you find that PRML is tough, you may find it easier to search for "before reading PRML" etc. so you should choose the one that suits you. 15 | 16 | 17 | ==== Want to learn more about Chemoinformatics from IT 18 | 19 | Since this book focuses on AI drug discovery, it explains the basics of machine learning and analysis methods, but chemoinformatics is, like bioinformatics, an efficient way of expressing molecules and data. It also includes storage methods and fast search technology. If you are interested in chemoinformatics as such informatics (IT aspect), it is recommended to read more from link:https://www.amazon.co.jp/Chemoinformatics-Basic-Concepts-Methods-English-ebook/dp/B07MMWKNSL/[Chemoinformatics: Basic Concepts and Methods] and dig deeper on topics of interest. 
20 | 21 | ==== For a deeper understanding of medicinal chemistry and chemoinformatics 22 | 23 | If you belong to the pharmacokinetics, toxicity, or pharmacology department of a pharmaceutical company or academia and really want to grasp the key points of this book, we recommend that you read link:https://www.amazon.co.jp/Drug-Like-Properties-Concepts-Structure-Optimization-ebook/dp/B019OMDRU4/[Drug-Like Properties: Concepts, Structure Design and Methods from ADME to Toxicity Optimization]. This is a text that is generally read by new employees who are assigned to the synthesis department of a pharmaceutical company, so anyone who has read this book should enjoy it. If there are parts you cannot follow, go through the related books; using this book as a starting point for further study is a good approach. 24 | 25 | In addition, people who are involved in pharmacokinetics should be able to turn link:https://www.amazon.co.jp/Physiologically-Based-Pharmacokinetic-PBPK-Modeling-Simulations-ebook/dp/B007BGZKWO/[PBPK modeling] into a strength if they understand the QSAR / QSPR covered in this document. Since optimization of kinetic profiles is very important for drug differentiation strategies, being strong in both QSPR and PBPK may be very useful. 26 | 27 | ==== If you want to be a drug designer 28 | 29 | Although this book has introduced informatics methods based on low molecular weight compounds, understanding of the target protein is essential when interpreting the results. In other words, drug design cannot be done without understanding the three-dimensional structure of proteins. Therefore, it is good to read and learn from books related to SBDD. 30 | 31 | NOTE: Unfortunately I have not studied SBDD from books, so if you know any good books, please let me know. 32 | 33 | Furthermore, since SBDD deals with proteins, there is no need to draw a line between chemoinformatics and bioinformatics. 
If you understand both within the framework of drug discovery, you will be able to think in more depth, so aim to be able to do both. It is definitely more fun that way. link:https://www.amazon.co.jp/dp/4780909201/[DRY analysis books] and link:https://www.amazon.co.jp/dp/4297103192[information technology that supports life science data analysis] will surely help your career. 34 | 35 | As mentioned in Chapter 6, quantum chemical calculation is important for understanding protein-ligand interactions. In particular, the ability to interpret interactions on the basis of quantum chemistry will be essential in future SBDD. Please read, with an open mind, a basic quantum chemistry text such as link:https://www.amazon.co.jp/dp/4130625047/[Basic Quantum Chemistry: Thinking about Chemistry with the Orbital Concept]. If you are using link:https://www.msg.chem.iastate.edu/gamess/[GAMESS], the link:https://www.amazon.co.jp/dp/4061543881/[new version of Quantum Chemical Beginners' Manual] will help you. At the very least, learn energy decomposition analysis, which will increase your ability to interpret calculations and contribute to your project. Furthermore, it goes without saying that FMO is an indispensable tool, so understanding link:https://www.jstage.jst.go.jp/article/jccj/advpub/0/advpub_2014-0039/_pdf[each component] will help your drug design even more. 36 | 37 | ==== Beyond the "end" 38 | 39 | You can add content more advanced than this manual as a new chapter. Pull requests are welcome. Contributors will be added to the contributor list, and the author will be credited at the beginning of the chapter.
40 | 41 | <<< 42 | 43 | -------------------------------------------------------------------------------- /images/by-nc-sa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/by-nc-sa.png -------------------------------------------------------------------------------- /images/ch02/anaconda01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch02/anaconda01.png -------------------------------------------------------------------------------- /images/ch04/chembl01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl01.png -------------------------------------------------------------------------------- /images/ch04/chembl02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl02.png -------------------------------------------------------------------------------- /images/ch04/chembl03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl03.png -------------------------------------------------------------------------------- /images/ch04/chembl04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl04.png
-------------------------------------------------------------------------------- /images/ch04/chembl05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl05.png -------------------------------------------------------------------------------- /images/ch04/chembl06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl06.png -------------------------------------------------------------------------------- /images/ch04/chembl07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl07.png -------------------------------------------------------------------------------- /images/ch04/zinc01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/zinc01.png -------------------------------------------------------------------------------- /images/ch05/ch05_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_01.png -------------------------------------------------------------------------------- /images/ch05/ch05_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_02.png -------------------------------------------------------------------------------- 
/images/ch05/ch05_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_03.png -------------------------------------------------------------------------------- /images/ch05/ch05_04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_04.png -------------------------------------------------------------------------------- /images/ch05/ch05_05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_05.png -------------------------------------------------------------------------------- /images/ch05/ch05_06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_06.png -------------------------------------------------------------------------------- /images/ch05/ch05_07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_07.png -------------------------------------------------------------------------------- /images/ch05/ch05_08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_08.png -------------------------------------------------------------------------------- /images/ch06/apx_rvx.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/apx_rvx.png -------------------------------------------------------------------------------- /images/ch06/apx_rvx_suf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/apx_rvx_suf.png -------------------------------------------------------------------------------- /images/ch06/cls01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/cls01.png -------------------------------------------------------------------------------- /images/ch06/vs01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/vs01.png -------------------------------------------------------------------------------- /images/ch07/chemviz2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/chemviz2.png -------------------------------------------------------------------------------- /images/ch07/mcs01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs01.png -------------------------------------------------------------------------------- /images/ch07/mcs02.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs02.png -------------------------------------------------------------------------------- /images/ch07/mcs03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs03.png -------------------------------------------------------------------------------- /images/ch07/mcs04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs04.png -------------------------------------------------------------------------------- /images/ch07/mcs05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs05.png -------------------------------------------------------------------------------- /images/ch07/mmp01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp01.png -------------------------------------------------------------------------------- /images/ch07/mmp02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp02.png -------------------------------------------------------------------------------- /images/ch07/mmp03.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp03.png -------------------------------------------------------------------------------- /images/ch07/mmp04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp04.png -------------------------------------------------------------------------------- /images/ch07/mms01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mms01.png -------------------------------------------------------------------------------- /images/ch07/scaffold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/scaffold.png -------------------------------------------------------------------------------- /images/ch08/pca01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch08/pca01.png -------------------------------------------------------------------------------- /images/ch08/pca02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch08/pca02.png -------------------------------------------------------------------------------- /images/ch08/tsne01.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch08/tsne01.png -------------------------------------------------------------------------------- /images/ch10/ch10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch10/ch10_1.png -------------------------------------------------------------------------------- /images/ch10/ch10_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch10/ch10_2.png -------------------------------------------------------------------------------- /images/ch10/ch10_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch10/ch10_3.png -------------------------------------------------------------------------------- /images/ch11/ch11_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch11/ch11_01.png -------------------------------------------------------------------------------- /images/ch11/ch11_nfp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch11/ch11_nfp.png -------------------------------------------------------------------------------- /images/jupyter.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/jupyter.png -------------------------------------------------------------------------------- /images/mishimasyk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/mishimasyk.png -------------------------------------------------------------------------------- /images/python_for_ci.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/python_for_ci.png -------------------------------------------------------------------------------- /images/souyakuchan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/souyakuchan.png -------------------------------------------------------------------------------- /mkpdf.sh: -------------------------------------------------------------------------------- 1 | cat \ 2 | ch00_cover.asciidoc \ 3 | ch01_introduction.asciidoc \ 4 | ch02_installation.asciidoc \ 5 | ch03_python.asciidoc \ 6 | ch04_database.asciidoc \ 7 | ch05_rdkit.asciidoc \ 8 | ch06_similarity.asciidoc \ 9 | ch07_graph.asciidoc \ 10 | ch08_visualization.asciidoc \ 11 | ch09_qsar.asciidoc \ 12 | ch10_deeplearning.asciidoc \ 13 | ch11_dlqsar.asciidoc \ 14 | ch12_generativemodels.asciidoc \ 15 | ch13_beyond.asciidoc > py4c.asciidoc 16 | asciidoctor-pdf -r asciidoctor-pdf-cjk -o pdf/py4chemoinformatics.pdf py4c.asciidoc 17 | -------------------------------------------------------------------------------- /notebooks/ch09_compounds.txt: -------------------------------------------------------------------------------- 1 | CMPD_CHEMBLID 
MOLREGNO PARENT_CMPD_CHEMBLID PARENT_MOLREGNO MOL_PREF_NAME COMPOUND_KEY MOLWEIGHT ALOGP PSA NUM_RO5_VIOLATIONS CANONICAL_SMILES ACTIVITY_ID STANDARD_TYPE RELATION STANDARD_VALUE STANDARD_UNITS PCHEMBL_VALUE PUBLISHED_TYPE PUBLISHED_RELATION PUBLISHED_VALUE PUBLISHED_UNITS ACTIVITY_COMMENT DATA_VALIDITY_COMMENT POTENTIAL_DUPLICATE BAO_ENDPOINT UO_UNITS QUDT_UNITS ASSAY_ID ASSAY_CHEMBLID ASSAY_TYPE DESCRIPTION ASSAY_SRC_ID ASSAY_SRC_DESCRIPTION ASSAY_ORGANISM ASSAY_STRAIN ASSAY_TAX_ID CURATED_BY BAO_FORMAT TID TARGET_CHEMBLID TARGET_TYPE PROTEIN_ACCESSION PREF_NAME ORGANISM CONFIDENCE_SCORE TARGET_MAPPING APD_NAME APD_CONFIDENCE DOC_ID DOC_CHEMBLID PUBMED_ID JOURNAL YEAR VOLUME ISSUE FIRST_PAGE CELL_ID CELL_CHEMBL_ID CELL_NAME ACTIVITY_PARAMS ACTIVITY_PROPS 2 | CHEMBL549 14367 CHEMBL549 14367 CITALOPRAM citalopram 324.4 3.81 36.26 0 CN(C)CCCC1(OCc2cc(ccc12)C#N)c3ccc(F)cc3 1523704 IC50 = 3981.07 nM 5.4 pIC50 = 5.4 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 3 | CHEMBL196431 325943 CHEMBL196431 325943 EDDP 277.41 4.6 3.24 0 C\C=C/1\N(C)C(C)CC1(c2ccccc2)c3ccccc3 1523842 IC50 = 50118.72 nM 4.3 pIC50 = 4.3 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 4 | CHEMBL12713 10813 CHEMBL12713 10813 SERTINDOLE Sertindole 440.95 4.63 40.51 0 Fc1ccc(cc1)n2cc(C3CCN(CCN4CCNC4=O)CC3)c5cc(Cl)ccc25 1523558 IC50 = 10 nM 8 pIC50 = 8 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 5 | CHEMBL473 5638 CHEMBL473 5638 DOFETILIDE Dofetilide 441.58 1.98 104.81 0 CN(CCOc1ccc(NS(=O)(=O)C)cc1)CCc2ccc(NS(=O)(=O)C)cc2 1523555 IC50 = 10 nM 8 pIC50 = 8 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 6 | CHEMBL607 22528 CHEMBL607 22528 MEPERIDINE Meperidine 247.34 2.21 29.54 0 CCOC(=O)C1(CCN(C)CC1)c2ccccc2 1523678 IC50 = 323.59 nM 6.49 pIC50 = 6.49 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 7 | CHEMBL11 605 CHEMBL11 605 IMIPRAMINE Imipramine 280.42 3.88 6.48 0 CN(C)CCCN1c2ccccc2CCc3ccccc13 1523698 IC50 = 3388.44 nM 5.47 pIC50 = 5.47 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 8 | CHEMBL71 6216 CHEMBL71 6216 CHLORPROMAZINE chlorpromazine 318.87 4.89 6.48 0 CN(C)CCCN1c2ccccc2Sc3ccc(Cl)cc13 1523687 IC50 = 1479.11 nM 5.83 pIC50 = 5.83 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 9 | CHEMBL708 33664 CHEMBL708 33664 ZIPRASIDONE Ziprasidone 412.95 3.81 48.47 0 Clc1cc2NC(=O)Cc2cc1CCN3CCN(CC3)c4nsc5ccccc45 1523567 IC50 = 120.23 nM 6.92 pIC50 = 6.92 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 10 | CHEMBL629 27072 CHEMBL629 27072 AMITRIPTYLINE Amitriptyline 277.41 4.17 3.24 0 CN(C)CCC=C1c2ccccc2CCc3ccccc13 1523711 IC50 = 10000 nM 5 pIC50 = 5 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 11 | CHEMBL1088 148004 CHEMBL1088 148004 MESORIDAZINE Mesoridazine 386.59 4.9 23.55 0 CN1CCCCC1CCN2c3ccccc3Sc4ccc(cc24)[S+](C)[O-] 1523681 IC50 = 549.54 nM 6.26 pIC50 = 6.26 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 12 | CHEMBL533 12608 CHEMBL533 12608 IBUTILIDE Ibutilide 384.59 4.16 69.64 0 CCCCCCCN(CC)CCCC(O)c1ccc(NS(=O)(=O)C)cc1 1523559 IC50 = 10 nM 8 pIC50 = 8 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 13 | CHEMBL4 146 CHEMBL4 146 OFLOXACIN Ofloxacin 361.37 1.54 75.01 0 CC1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O 1523958 IC50 = 1412537.54 nM pIC50 = 2.85 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 14 | CHEMBL96153 153104 CHEMBL96153 153104 TERIKALANT Terikalant 381.52 4.84 30.93 0 COc1ccc(cc1OC)C2CCN(CC[C@H]3CCOc4ccccc34)CC2 1523673 IC50 = 251.19 nM 6.6 pIC50 = 6.6 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 15 | CHEMBL364512 326211 CHEMBL364512 326211 RISPERIDON Risperidon 412.51 2.92 59.39 0 CC1=C(CCN2CCC(CC2)C3NOc4cc(F)ccc34)C(=O)N5CCCCC5=N1 1523572 IC50 = 151.36 nM 6.82 pIC50 = 6.82 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 16 | CHEMBL652 27588 CHEMBL652 27588 FLECAINIDE Flecainide 414.35 3.44 59.59 0 FC(F)(F)COc1ccc(OCC(F)(F)F)c(c1)C(=O)NCC2CCCCN2 1523703 IC50 = 3890.45 nM 5.41 pIC50 = 5.41 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 17 | CHEMBL633 27185 CHEMBL633 27185 AMIODARONE Amiodarone 645.32 6.94 42.68 2 CCCCc1oc2ccccc2c1C(=O)c3cc(I)c(OCCN(CC)CC)c(I)c3 1523710 IC50 = 10000 nM 5 pIC50 = 5 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 18 | CHEMBL31 1712 CHEMBL31 1712 GATIFLOXACIN Gatifloxacin 375.4 1.98 83.8 0 COc1c(N2CCNC(C)C2)c(F)cc3C(=O)C(=CN(C4CC4)c13)C(=O)O 1523948 IC50 = 128824.96 nM pIC50 = 3.89 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 19 | CHEMBL583 17136 CHEMBL583 17136 GREPAFLOXACIN Grepafloxacin 359.4 2.28 74.57 0 CC1CN(CCN1)c2cc3N(C=C(C(=O)O)C(=O)c3c(C)c2F)C4CC4 1523840 IC50 = 50118.72 nM 4.3 pIC50 = 4.3 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 20 | CHEMBL1000 111185 CHEMBL1000 111185 CETIRIZINE cetirizine 388.9 3.15 53.01 0 OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3 1523839 IC50 = 30199.52 nM 4.52 pIC50 = 4.52 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 21 | CHEMBL485 6167 CHEMBL485 6167 CODEINE codeine 299.37 1.5 41.93 0 COc1ccc2C[C@@H]3[C@@H]4C=C[C@H](O)[C@@H]5Oc1c2[C@]45CCN3C 1523951 IC50 = 301995.17 nM pIC50 = 3.52 Outside typical range 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 22 | CHEMBL640 27341 CHEMBL640 27341 PROCAINAMIDE Procainamide 235.33 1.34 58.36 0 CCN(CC)CCNC(=O)c1ccc(N)cc1 1523949 IC50 = 138038.43 nM pIC50 = 3.86 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 23 | CHEMBL1110 153342 CHEMBL1110 153342 ALOSETRON Alosetron 294.36 2.41 53.92 0 Cc1[nH]cnc1CN2CCc3c(C2=O)c4ccccc4n3C 1523696 IC50 = 3235.94 nM 5.49 pIC50 = 5.49 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 24 | CHEMBL517 11143 CHEMBL517 11143 DISOPYRAMIDE Disopyramide 339.48 3.36 59.22 0 CC(C)N(CCC(C(=O)N)(c1ccccc1)c2ccccn2)C(C)C 1523945 IC50 = 91201.08 nM 4.04 pIC50 = 4.04 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 25 | CHEMBL479 5970 CHEMBL479 5970 THIORIDAZINE Thioridazine 370.59 5.89 6.48 1 CSc1ccc2Sc3ccccc3N(CCC4CCCCN4C)c2c1 1523670 IC50 = 190.55 nM 6.72 pIC50 = 6.72 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 26 | CHEMBL723 36662 CHEMBL723 36662 CARVEDILOL Carvedilol 406.48 3.74 75.74 0 COc1ccccc1OCCNCC(O)COc2cccc3[nH]c4ccccc4c23 1523712 IC50 = 10471.29 nM 4.98 pIC50 = 4.98 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 27 | CHEMBL715 34197 CHEMBL715 34197 OLANZAPINE Olanzapine 312.44 3.44 30.87 0 CN1CCN(CC1)C2=Nc3ccccc3Nc4sc(C)cc24 1523669 IC50 = 181.97 nM 6.74 pIC50 = 6.74 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 28 | CHEMBL92870 150806 CHEMBL92870 150806 LIDOFLAZINE Lidoflazine 491.63 5.75 35.58 1 Cc1cccc(C)c1NC(=O)CN2CCN(CCCC(c3ccc(F)cc3)c4ccc(F)cc4)CC2 1523560 IC50 = 15.85 nM 7.8 pIC50 = 7.8 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 29 | CHEMBL295698 72036 CHEMBL295698 72036 LEVOKETOCONAZOLE Ketoconazole 531.44 4.21 69.06 1 CC(=O)N1CCN(CC1)c2ccc(OC[C@@H]3CO[C@](Cn4ccnc4)(O3)c5ccc(Cl)cc5Cl)cc2 1523693 IC50 = 1905.46 nM 5.72 pIC50 = 5.72 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 30 | CHEMBL1294 255477 CHEMBL1294 255477 QUINIDINE Quinidine 324.42 3.17 45.59 0 COc1ccc2nccc([C@H](O)[C@H]3C[C@@H]4CCN3C[C@@H]4C=C)c2c1 1523674 IC50 = 323.59 nM 6.49 pIC50 = 6.49 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 31 | CHEMBL1107 152728 CHEMBL1107 152728 HALOFANTRINE Halofantrine 500.43 8.64 23.47 2 CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c3cc(ccc13)C(F)(F)F 1523672 IC50 = 199.53 nM 6.7 pIC50 = 6.7 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 32 | CHEMBL2368925 1543376 CHEMBL2368925 1543376 DOLASETRON Dolasetron 324.38 2.52 62.4 0 O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c4c[nH]c5ccccc45 1523834 IC50 = 12022.64 nM 4.92 pIC50 = 4.92 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 33 | CHEMBL54 3859 CHEMBL54 3859 HALOPERIDOL Haloperidol 375.87 4.43 40.54 0 OC1(CCN(CCCC(=O)c2ccc(F)cc2)CC1)c3ccc(Cl)cc3 1523563 IC50 = 30.2 nM 7.52 pIC50 = 7.52 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 34 | CHEMBL45816 72035 CHEMBL45816 72035 MIBEFRADIL Mibefradil 495.64 5.27 67.45 1 COCC(=O)O[C@]1(CCN(C)CCCc2nc3ccccc3[nH]2)CCc4cc(F)ccc4[C@@H]1C(C)C 1523686 IC50 = 1445.44 nM 5.84 pIC50 = 5.84 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 35 | CHEMBL289469 54547 CHEMBL289469 54547 GRANISETRON Granisetron 312.42 2.32 50.16 0 CN1C2CCCC1CC(C2)NC(=O)c3nn(C)c4ccccc34 1523699 IC50 = 3715.35 nM 5.43 pIC50 = 5.43 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 36 | CHEMBL46 3183 CHEMBL46 3183 ONDANSETRON Ondansetron 293.37 3.13 39.82 0 Cc1nccn1CC2CCc3c(C2=O)c4ccccc4n3C 1523683 IC50 = 812.83 nM 6.09 pIC50 = 6.09 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 37 | CHEMBL370805 333370 CHEMBL370805 333370 COCAINE cocaine 303.36 1.87 55.84 0 COC(=O)[C@H]1[C@H](C[C@@H]2CC[C@H]1N2C)OC(=O)c3ccccc3 1523706 IC50 = 7244.36 nM 5.14 pIC50 = 5.14 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 38 | CHEMBL33 1795 CHEMBL33 1795 LEVOFLOXACIN Levofloxacin 361.37 1.54 75.01 0 C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O 1523952 IC50 = 912010.84 nM pIC50 = 3.04 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 39 | CHEMBL651 27576 CHEMBL651 27576 METHADONE Methadone 309.45 4.29 20.31 0 CCC(=O)C(CC(C)N(C)C)(c1ccccc1)c2ccccc2 1523708 IC50 = 9772.37 nM 5.01 pIC50 = 5.01 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 40 | CHEMBL1108 152751 CHEMBL1108 152751 DROPERIDOL Droperidol 379.44 3.68 58.1 0 Fc1ccc(cc1)C(=O)CCCN2CCC(=CC2)N3C(=O)Nc4ccccc34 1523564 IC50 = 32.36 nM 7.49 pIC50 = 7.49 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 41 | CHEMBL1729 557741 CHEMBL1729 557741 CISAPRIDE cisapride 465.95 3.36 86.05 0 COC1CN(CCCOc2ccc(F)cc2)CCC1NC(=O)c3cc(Cl)c(N)cc3OC 1523565 IC50 = 39.81 nM 7.4 pIC50 = 7.4 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 42 | CHEMBL1741 570147 CHEMBL1741 570147 CLARITHROMYCIN clarithromycin 747.96 2.44 182.91 2 CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]3O[C@H](C)C[C@@H]([C@H]3O)N(C)C)[C@@](C)(C[C@@H](C)C(=O)[C@H](C)[C@@H](O)[C@]1(C)O)OC 1523843 IC50 = 58884.37 nM 4.23 pIC50 = 4.23 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 43 | CHEMBL3 115 CHEMBL3 115 NICOTINE Nicotine 162.24 1.85 16.13 0 CN1CCC[C@H]1c2cccnc2 1523950 IC50 = 245470.89 nM pIC50 = 3.61 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 44 | CHEMBL416956 51162 CHEMBL416956 51162 MEFLOQUINE Mefloquine 378.32 4.45 45.15 0 OC(C1CCCCN1)c2cc(nc3c(cccc23)C(F)(F)F)C(F)(F)F 1523705 IC50 = 5623.41 nM 5.25 pIC50 = 5.25 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 45 | CHEMBL23 1278 CHEMBL23 1278 DILTIAZEM Diltiazem 414.53 3.37 59.08 0 COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@@H]2OC(=O)C 1523835 IC50 = 17378.01 nM 4.76 pIC50 = 4.76 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 46 | CHEMBL94454 152819 CHEMBL94454 152819 MIZOLASTINE Mizolastine 432.5 3.41 70.05 0 CN(C1CCN(CC1)c2nc3ccccc3n2Cc4ccc(F)cc4)C5=NC=CC(=O)N5 1523680 IC50 = 436.52 nM 6.36 pIC50 = 6.36 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 47 | CHEMBL657 27629 CHEMBL657 27629 DIPHENHYDRAMINE Diphenhydramine 255.36 3.35 12.47 0 CN(C)CCOC(c1ccccc1)c2ccccc2 1523838 IC50 = 26915.35 nM 4.57 pIC50 = 4.57 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 48 | CHEMBL2 97 CHEMBL2 97 PRAZOSIN Prazosine 383.41 1.78 106.95 0 COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4 1523691 IC50 = 1584.89 nM 5.8 pIC50 = 5.8 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 49 | CHEMBL6966 1219 CHEMBL6966 1219 VERAPAMIL Verapamil 454.61 5.09 63.95 1 COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2)cc1OC 1523571 IC50 = 141.25 nM 6.85 pIC50 = 6.85 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 50 | CHEMBL511142 446784 CHEMBL511142 446784 BUPRENORPHINE Buprenorphine 467.65 4.41 62.16 0 CO[C@@]12CC[C@@]3(C[C@@H]1[C@](C)(O)C(C)(C)C)[C@H]4Cc5ccc(O)c6O[C@@H]2[C@]3(CCN4CC7CC7)c56 1523707 IC50 = 7585.78 nM 5.12 pIC50 = 5.12 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 51 | CHEMBL72 6243 CHEMBL72 6243 DESIPRAMINE Desipramine 266.39 3.53 15.27 0 CNCCCN1c2ccccc2CCc3ccccc13 1523684 IC50 = 1380.38 nM 5.86 pIC50 = 5.86 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 52 | CHEMBL42 2261 CHEMBL42 2261 CLOZAPINE clozapine 326.83 3.72 30.87 0 CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24 1523679 IC50 = 323.59 nM 6.49 pIC50 = 6.49 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 53 | CHEMBL1423 364141 CHEMBL1423 364141 PIMOZIDE Pimozide 461.56 5.86 41.03 1 Fc1ccc(cc1)C(CCCN2CCC(CC2)N3C(=O)Nc4ccccc34)c5ccc(F)cc5 1523566 IC50 = 50.12 nM 7.3 pIC50 = 7.3 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 54 | CHEMBL327980 152611 CHEMBL327980 152611 E-4031 401.53 2.9 79.37 0 Cc1cccc(CCN2CCC(CC2)C(=O)c3ccc(NS(=O)(=O)C)cc3)n1 1523562 IC50 = 19.95 nM 7.7 pIC50 = 7.7 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 55 | CHEMBL1008 112651 CHEMBL1008 112651 BEPRIDIL Bepridil 366.55 4.83 15.71 0 CC(C)COCC(CN(Cc1ccccc1)c2ccccc2)N3CCCC3 1523682 IC50 = 549.54 nM 6.26 pIC50 = 6.26 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 56 | CHEMBL41 2223 CHEMBL41 2223 FLUOXETINE Fluoxetine 309.33 4.44 21.26 0 CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2 1523690 IC50 = 1513.56 nM 5.82 pIC50 = 5.82 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 57 | CHEMBL193 419596 CHEMBL193 419596 NIFEDIPINE Nifedipine 346.34 2.18 107.77 0 COC(=O)C1=C(C)NC(=C(C1c2ccccc2[N+](=O)[O-])C(=O)OC)C 1523841 IC50 = 50118.72 nM 4.3 pIC50 = 4.3 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 58 | CHEMBL1514 421070 CHEMBL1514 421070 LEVOMETHADYL ACETATE Laam 353.51 4.65 29.54 0 CC[C@H](OC(=O)C)C(C[C@H](C)N(C)C)(c1ccccc1)c2ccccc2 1523694 IC50 = 2187.76 nM 5.66 pIC50 = 5.66 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 59 | CHEMBL17157 19569 CHEMBL17157 19569 TERFENADINE Terfenadine 471.69 6.45 43.7 1 CC(C)(C)c1ccc(cc1)C(O)CCCN2CCC(CC2)C(O)(c3ccccc3)c4ccccc4 1523671 IC50 = 199.53 nM 6.7 pIC50 = 6.7 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 60 | CHEMBL296419 65605 CHEMBL296419 65605 ASTEMIZOLE Astemizole (table 1) 458.58 5.35 42.32 1 COc1ccc(CCN2CCC(CC2)Nc3nc4ccccc4n3Cc5ccc(F)cc5)cc1 1523554 IC50 = 10 nM 8 pIC50 = 8 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 61 | CHEMBL596 19915 CHEMBL596 19915 FENTANYL Fentanyl 336.48 4.14 23.55 0 CCC(=O)N(C1CCN(CCc2ccccc2)CC1)c3ccccc3 1523692 IC50 = 1819.7 nM 5.74 pIC50 = 5.74 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 62 | CHEMBL998 110803 CHEMBL998 110803 LORATADINE Loratadine 382.89 4.89 42.43 0 CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc4cccnc24)CC1 1523668 IC50 = 169.82 nM 6.77 pIC50 = 6.77 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 63 | CHEMBL505 10358 CHEMBL505 10358 CHLORPHENIRAMINE chlorpheniramine 274.8 3.82 16.13 0 CN(C)CCC(c1ccc(Cl)cc1)c2ccccn2 1523837 IC50 = 20892.96 nM 4.68 pIC50 = 4.68 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 64 | CHEMBL70 6005 CHEMBL70 6005 MORPHINE Morphine 285.34 1.2 52.93 0 CN1CC[C@]23[C@H]4Oc5c(O)ccc(C[C@@H]1[C@@H]2C=C[C@@H]4O)c35 1523954 IC50 = 1000000 nM pIC50 = 3 Outside typical range 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 65 | CHEMBL475534 453570 CHEMBL475534 453570 NITRENDIPINE Nitrendipine 360.37 2.57 107.77 0 CCOC(=O)C1=C(C)NC(=C(C1c2cccc(c2)[N+](=O)[O-])C(=O)OC)C 1523709 IC50 = 10000 nM 5 pIC50 = 5 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 66 | CHEMBL850 65351 CHEMBL850 65351 SPARFLOXACIN Sparfloxacin 392.41 2.08 100.59 0 C[C@@H]1CN(C[C@H](C)N1)c2c(F)c(N)c3C(=O)C(=CN(C4CC4)c3c2F)C(=O)O 1523836 IC50 = 18197.01 nM 4.74 pIC50 = 4.74 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 67 | CHEMBL192 410802 CHEMBL192 410802 SILDENAFIL UK-92480, Sildenafil 474.59 1.61 113.42 0 CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4 1523697 IC50 = 3311.31 nM 5.48 pIC50 = 5.48 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 68 | CHEMBL1382 321964 CHEMBL1382 321964 TOLTERODINE Tolterodine 325.5 5.34 23.47 1 CC(C)N(CC[C@H](c1ccccc1)c2cc(C)ccc2O)C(C)C 1523561 IC50 = 16.98 nM 7.77 pIC50 = 7.77 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 69 | CHEMBL305660 111060 CHEMBL305660 111060 EBASTINE Ebastine 469.67 7.22 29.54 1 CC(C)(C)c1ccc(cc1)C(=O)CCCN2CCC(CC2)OC(c3ccccc3)c4ccccc4 1523695 IC50 = 3019.95 nM 5.52 pIC50 = 5.52 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 70 | CHEMBL32 1788 CHEMBL32 1788 MOXIFLOXACIN Moxifloxacin 401.44 2.37 83.8 0 COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc4C(=O)C(=CN(C5CC5)c14)C(=O)O 1523947 IC50 = 128824.96 nM pIC50 = 3.89 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 71 | CHEMBL1106 152610 CHEMBL1106 152610 EPINASTINE Epinastine 249.32 2.47 41.62 0 NC1=NCC2N1c3ccccc3Cc4ccccc24 1523946 IC50 = 91201.08 nM 4.04 pIC50 = 4.04 0 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 72 | CHEMBL219916 364745 CHEMBL219916 364745 DOMPERIDONE Domperidone 425.92 3.35 78.82 0 Clc1ccc2N(C3CCN(CCCN4C(=O)Nc5ccccc45)CC3)C(=O)Nc2c1 1523667 IC50 = 162.18 nM 6.79 pIC50 = 6.79 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 73 | CHEMBL123558 208679 CHEMBL123558 208679 AZIMILIDE Azimilide 457.96 3.23 72.6 0 CN1CCN(CCCCN2C(=O)CN(\N=C\c3oc(cc3)c4ccc(Cl)cc4)C2=O)CC1 1523685 IC50 = 1412.54 nM 5.85 pIC50 = 5.85 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 2005 15 11 2886 74 | CHEMBL8 241 CHEMBL8 241 CIPROFLOXACIN Ciprofloxacin 331.35 1.58 74.57 0 OC(=O)C1=CN(C2CC2)c3cc(N4CCNCC4)c(F)cc3C1=O 1523953 IC50 = 954992.59 nM pIC50 = 3.02 Outside typical range 1 BAO_0000199 307245 CHEMBL829152 B Inhibitory concentration against potassium channel HERG 1 Scientific Literature Expert BAO_0000357 165 CHEMBL240 SINGLE PROTEIN Q12809 HERG Homo sapiens 8 Homologous protein 20472 CHEMBL1139598 15911273 Bioorg. Med. Chem. Lett. 
2005 15 11 2886 75 | -------------------------------------------------------------------------------- /notebooks/ch09_qsar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 構造活性相関(QSAR)の基礎\n", 8 | "\n", 9 | "## 効果ありなしの原因を考えてみる(分類問題)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 17, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from rdkit import Chem, DataStructs\n", 19 | "from rdkit.Chem import AllChem, Draw\n", 20 | "from rdkit.Chem.Draw import IPythonConsole\n", 21 | "import numpy as np\n", 22 | "from sklearn.model_selection import train_test_split\n", 23 | "from sklearn.metrics import confusion_matrix, f1_score\n", 24 | "from sklearn.ensemble import RandomForestClassifier" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "mols = []\n", 34 | "labels = []\n", 35 | "with open(\"ch09_compounds.txt\") as f:\n", 36 | " header = f.readline()\n", 37 | " smiles_index = -1\n", 38 | " for i, title in enumerate(header.split(\"\\t\")):\n", 39 | " if title == \"CANONICAL_SMILES\":\n", 40 | " smiles_index = i\n", 41 | " elif title == \"STANDARD_VALUE\":\n", 42 | " value_index = i\n", 43 | " for l in f:\n", 44 | " ls = l.split(\"\\t\")\n", 45 | " mol = Chem.MolFromSmiles(ls[smiles_index])\n", 46 | " mols.append(mol)\n", 47 | " val = float(ls[value_index])\n", 48 | " if val < 1000:\n", 49 | " labels.append(\"POS\")\n", 50 | " else:\n", 51 | " labels.append(\"NEG\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "labels = np.array(labels)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 7, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "fps = []\n", 70 | "for mol in mols:\n", 71 | " 
fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)\n", 72 | " arr = np.zeros((1,))\n", 73 | " DataStructs.ConvertToNumpyArray(fp, arr)\n", 74 | " fps.append(arr)\n", 75 | "fps = np.array(fps)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 11, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "x_train, x_test, y_train, y_test = train_test_split(fps, labels)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 12, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 96 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 97 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 98 | " min_samples_leaf=1, min_samples_split=2,\n", 99 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 100 | " oob_score=False, random_state=None, verbose=0,\n", 101 | " warm_start=False)" 102 | ] 103 | }, 104 | "execution_count": 12, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "rf = RandomForestClassifier()\n", 111 | "rf.fit(x_train, y_train)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 23, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "y_pred = rf.predict(x_test)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 24, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "array([[11, 1],\n", 132 | " [ 5, 2]])" 133 | ] 134 | }, 135 | "execution_count": 24, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "confusion_matrix(y_test, y_pred)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 26, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "0.4" 153 | ] 154 | }, 155 | "execution_count": 26, 156 
| "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "f1_score(y_test, y_pred, pos_label=\"POS\" )" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | " ## 薬の効き目を予測しよう(回帰問題)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 40, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "from sklearn.ensemble import RandomForestRegressor\n", 178 | "from sklearn.metrics import r2_score\n", 179 | "from math import log10" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 35, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "pIC50s = []\n", 189 | "with open(\"ch09_compounds.txt\") as f:\n", 190 | " header = f.readline()\n", 191 | " for i, title in enumerate(header.split(\"\\t\")):\n", 192 | " if title == \"STANDARD_VALUE\":\n", 193 | " value_index = i\n", 194 | " for l in f:\n", 195 | " ls = l.split(\"\\t\")\n", 196 | " val = float(ls[value_index])\n", 197 | " pIC50 = 9 - log10(val)\n", 198 | " pIC50s.append(pIC50)\n", 199 | "\n", 200 | "pIC50s = np.array(pIC50s)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 37, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "x_train, x_test, y_train, y_test = train_test_split(fps, pIC50s)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 38, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", 221 | " max_features='auto', max_leaf_nodes=None,\n", 222 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 223 | " min_samples_leaf=1, min_samples_split=2,\n", 224 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", 225 | " oob_score=False, random_state=None, verbose=0, warm_start=False)" 226 | ] 227 | }, 228 | "execution_count": 38, 229 | "metadata": {}, 
230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "rf = RandomForestRegressor()\n", 235 | "rf.fit(x_train, y_train)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 39, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "y_pred = rf.predict(x_test)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 41, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "0.5213586033887229" 256 | ] 257 | }, 258 | "execution_count": 41, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "r2_score(y_test, y_pred)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "Python 3", 278 | "language": "python", 279 | "name": "python3" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.6.8" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 2 296 | } 297 | -------------------------------------------------------------------------------- /pdf/py4chemoinformatics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/pdf/py4chemoinformatics.pdf -------------------------------------------------------------------------------- /py4c-theme.yml: -------------------------------------------------------------------------------- 1 | font: 2 | catalog: 3 | # Noto Serif supports Latin, Latin-1 Supplement, Latin Extended-A, Greek, Cyrillic, Vietnamese & an 
assortment of symbols 4 | AozoraMincho: 5 | normal: AozoraMinchoRegular.ttf 6 | italic: AozoraMinchoMedium.ttf 7 | bold: AozoraMinchoHeavy.ttf 8 | bold_italic: AozoraMinchoBlack.ttf 9 | RictyDiminished: 10 | normal: RictyDiminished-Regular.ttf 11 | italic: RictyDiminished-Oblique.ttf 12 | bold: RictyDiminished-Bold.ttf 13 | bold_italic: RictyDiminished-BoldOblique.ttf 14 | Noto Serif: 15 | normal: notoserif-regular-subset.ttf 16 | bold: notoserif-bold-subset.ttf 17 | italic: notoserif-italic-subset.ttf 18 | bold_italic: notoserif-bold_italic-subset.ttf 19 | # M+ 1mn supports ASCII and the circled numbers used for conums 20 | M+ 1mn: 21 | normal: mplus1mn-regular-ascii-conums.ttf 22 | bold: mplus1mn-bold-ascii.ttf 23 | italic: mplus1mn-italic-ascii.ttf 24 | bold_italic: mplus1mn-bold_italic-ascii.ttf 25 | # M+ 1p supports Latin, Latin-1 Supplement, Latin Extended, Greek, Cyrillic, Vietnamese, Japanese & an assortment of symbols 26 | # It also provides arrows for ->, <-, => and <= replacements in case these glyphs are missing from font 27 | M+ 1p Fallback: 28 | normal: mplus1p-regular-fallback.ttf 29 | bold: mplus1p-regular-fallback.ttf 30 | italic: mplus1p-regular-fallback.ttf 31 | bold_italic: mplus1p-regular-fallback.ttf 32 | fallbacks: 33 | - M+ 1p Fallback 34 | page: 35 | background_color: ffffff 36 | layout: portrait 37 | margin: [0.5in, 0.67in, 0.67in, 0.67in] 38 | # margin_inner and margin_outer keys are used for recto/verso print margins when media=prepress 39 | margin_inner: 0.75in 40 | margin_outer: 0.59in 41 | size: A4 42 | base: 43 | align: justify 44 | # color as hex string (leading # is optional) 45 | font_color: 333333 46 | # color as RGB array 47 | #font_color: [51, 51, 51] 48 | # color as CMYK array (approximated) 49 | #font_color: [0, 0, 0, 0.92] 50 | #font_color: [0, 0, 0, 92%] 51 | font_family: Noto Serif 52 | # choose one of these font_size/line_height_length combinations 53 | font_size: 10 54 | line_height_length: 15 55 | #font_size: 11.25 56 | 
#line_height_length: 18 57 | #font_size: 11.2 58 | #line_height_length: 16 59 | font_size: 10.5 60 | #line_height_length: 15 61 | # correct line height for Noto Serif metrics 62 | line_height_length: 12 63 | #font_size: 11.25 64 | #line_height_length: 18 65 | line_height: $base_line_height_length / $base_font_size 66 | font_size_large: round($base_font_size * 1.25) 67 | font_size_small: round($base_font_size * 0.85) 68 | font_size_min: $base_font_size * 0.75 69 | font_style: normal 70 | border_color: eeeeee 71 | border_radius: 4 72 | border_width: 0.5 73 | # FIXME vertical_rhythm is weird; we should think in terms of ems 74 | #vertical_rhythm: $base_line_height_length * 2 / 3 75 | # correct line height for Noto Serif metrics (comes with built-in line height) 76 | vertical_rhythm: $base_line_height_length 77 | horizontal_rhythm: $base_line_height_length 78 | # QUESTION should vertical_spacing be block_spacing instead? 79 | vertical_spacing: $vertical_rhythm 80 | link: 81 | font_color: 428bca 82 | # literal is currently used for inline monospaced in prose and table cells 83 | literal: 84 | font_color: b12146 85 | font_family: M+ 1mn 86 | menu_caret_content: " \u203a " 87 | heading: 88 | align: left 89 | #font_color: 181818 90 | font_color: $base_font_color 91 | font_family: $base_font_family 92 | font_style: bold 93 | # h1 is used for part titles (book doctype only) 94 | h1_font_size: floor($base_font_size * 2.6) 95 | # h2 is used for chapter titles (book doctype only) 96 | h2_font_size: floor($base_font_size * 2.15) 97 | h3_font_size: round($base_font_size * 1.7) 98 | h4_font_size: $base_font_size_large 99 | h5_font_size: $base_font_size 100 | h6_font_size: $base_font_size_small 101 | #line_height: 1.4 102 | # correct line height for Noto Serif metrics (comes with built-in line height) 103 | line_height: 1 104 | margin_top: $vertical_rhythm * 0.4 105 | margin_bottom: $vertical_rhythm * 0.9 106 | title_page: 107 | align: right 108 | logo: 109 | top: 30% 110 | title: 
111 | top: 80% 112 | font_size: $heading_h1_font_size 113 | font_color: 999999 114 | line_height: 0.9 115 | subtitle: 116 | font_size: $heading_h3_font_size 117 | font_style: bold_italic 118 | line_height: 1 119 | authors: 120 | margin_top: $base_font_size * 1.25 121 | font_size: $base_font_size_large 122 | font_color: 181818 123 | revision: 124 | margin_top: $base_font_size * 1.25 125 | block: 126 | margin_top: 0 127 | margin_bottom: $vertical_rhythm 128 | caption: 129 | align: left 130 | font_size: $base_font_size * 0.95 131 | font_style: italic 132 | # FIXME perhaps set line_height instead of / in addition to margins? 133 | margin_inside: $vertical_rhythm / 3 134 | #margin_inside: $vertical_rhythm / 4 135 | margin_outside: 0 136 | lead: 137 | font_size: $base_font_size_large 138 | line_height: 1.4 139 | abstract: 140 | font_color: 5c6266 141 | font_size: $lead_font_size 142 | line_height: $lead_line_height 143 | font_style: italic 144 | first_line_font_style: bold 145 | title: 146 | align: center 147 | font_color: $heading_font_color 148 | font_family: $heading_font_family 149 | font_size: $heading_h4_font_size 150 | font_style: $heading_font_style 151 | admonition: 152 | column_rule_color: $base_border_color 153 | column_rule_width: $base_border_width 154 | padding: [0, $horizontal_rhythm, 0, $horizontal_rhythm] 155 | #icon: 156 | # tip: 157 | # name: fa-lightbulb-o 158 | # stroke_color: 111111 159 | # size: 24 160 | label: 161 | text_transform: uppercase 162 | font_style: bold 163 | blockquote: 164 | font_color: $base_font_color 165 | font_size: $base_font_size_large 166 | border_color: $base_border_color 167 | border_width: 5 168 | # FIXME disable negative padding bottom once margin collapsing is implemented 169 | padding: [0, $horizontal_rhythm, $block_margin_bottom * -0.75, $horizontal_rhythm + $blockquote_border_width / 2] 170 | cite_font_size: $base_font_size_small 171 | cite_font_color: 999999 172 | # code is used for source blocks (perhaps change to 
source or listing?) 173 | code: 174 | font_color: $base_font_color 175 | font_family: $literal_font_family 176 | font_size: ceil($base_font_size) 177 | padding: $code_font_size 178 | line_height: 1.25 179 | # line_gap is an experimental property to control how a background color is applied to an inline block element 180 | line_gap: 3.8 181 | background_color: f5f5f5 182 | border_color: cccccc 183 | border_radius: $base_border_radius 184 | border_width: 0.75 185 | conum: 186 | font_family: M+ 1mn 187 | font_color: $literal_font_color 188 | font_size: $base_font_size 189 | line_height: 4 / 3 190 | example: 191 | border_color: $base_border_color 192 | border_radius: $base_border_radius 193 | border_width: 0.75 194 | background_color: ffffff 195 | # FIXME reenable padding bottom once margin collapsing is implemented 196 | padding: [$vertical_rhythm, $horizontal_rhythm, 0, $horizontal_rhythm] 197 | image: 198 | align: left 199 | prose: 200 | margin_top: $block_margin_top 201 | margin_bottom: $block_margin_bottom 202 | sidebar: 203 | background_color: eeeeee 204 | border_color: e1e1e1 205 | border_radius: $base_border_radius 206 | border_width: $base_border_width 207 | # FIXME reenable padding bottom once margin collapsing is implemented 208 | padding: [$vertical_rhythm, $vertical_rhythm * 1.25, 0, $vertical_rhythm * 1.25] 209 | title: 210 | align: center 211 | font_color: $heading_font_color 212 | font_family: $heading_font_family 213 | font_size: $heading_h4_font_size 214 | font_style: $heading_font_style 215 | thematic_break: 216 | border_color: $base_border_color 217 | border_style: solid 218 | border_width: $base_border_width 219 | margin_top: $vertical_rhythm * 0.5 220 | margin_bottom: $vertical_rhythm * 1.5 221 | description_list: 222 | term_font_style: bold 223 | term_spacing: $vertical_rhythm / 4 224 | description_indent: $horizontal_rhythm * 1.25 225 | outline_list: 226 | indent: $horizontal_rhythm * 1.5 227 | #marker_font_color: 404040 228 | # NOTE 
outline_list_item_spacing applies to list items that do not have complex content 229 | item_spacing: $vertical_rhythm / 2 230 | table: 231 | background_color: $page_background_color 232 | #head_background_color: 233 | #head_font_color: $base_font_color 234 | head_font_style: bold 235 | #body_background_color: 236 | body_stripe_background_color: f9f9f9 237 | foot_background_color: f0f0f0 238 | border_color: dddddd 239 | border_width: $base_border_width 240 | cell_padding: 3 241 | toc: 242 | indent: $horizontal_rhythm 243 | line_height: 1.4 244 | dot_leader: 245 | #content: ". " 246 | font_color: a9a9a9 247 | #levels: 2 3 248 | # NOTE in addition to footer, header is also supported 249 | footer: 250 | font_size: $base_font_size_small 251 | # NOTE if background_color is set, background and border will span width of page 252 | border_color: dddddd 253 | border_width: 0.25 254 | height: $base_line_height_length * 2.5 255 | line_height: 1 256 | padding: [$base_line_height_length / 2, 1, 0, 1] 257 | vertical_align: top 258 | #image_vertical_align: <alignment> or <number> 259 | # additional attributes for content: 260 | # * {page-count} 261 | # * {page-number} 262 | # * {document-title} 263 | # * {document-subtitle} 264 | # * {chapter-title} 265 | # * {section-title} 266 | # * {section-or-chapter-title} 267 | recto: 268 | #columns: "<50% =0% >50%" 269 | right: 270 | content: '{page-number}' 271 | #content: '{section-or-chapter-title} | {page-number}' 272 | #content: '{document-title} | {page-number}' 273 | #center: 274 | # content: '{page-number}' 275 | verso: 276 | #columns: $footer_recto_columns 277 | left: 278 | content: $footer_recto_right_content 279 | #content: '{page-number} | {chapter-title}' 280 | #center: 281 | # content: '{page-number}' 282 | --------------------------------------------------------------------------------