├── .DS_Store
├── .gitignore
├── LICENSE
├── README.asciidoc
├── ch00_cover.asciidoc
├── ch01_introduction.asciidoc
├── ch02_installation.asciidoc
├── ch03_python.asciidoc
├── ch04_database.asciidoc
├── ch05_rdkit.asciidoc
├── ch06_similarity.asciidoc
├── ch07_graph.asciidoc
├── ch08_visualization.asciidoc
├── ch09_qsar.asciidoc
├── ch10_deeplearning.asciidoc
├── ch11_dlqsar.asciidoc
├── ch12_generativemodels.asciidoc
├── ch13_beyond.asciidoc
├── images
    ├── by-nc-sa.png
    ├── ch02
    │   └── anaconda01.png
    ├── ch04
    │   ├── chembl01.png
    │   ├── chembl02.png
    │   ├── chembl03.png
    │   ├── chembl04.png
    │   ├── chembl05.png
    │   ├── chembl06.png
    │   ├── chembl07.png
    │   └── zinc01.png
    ├── ch05
    │   ├── ch05_01.png
    │   ├── ch05_02.png
    │   ├── ch05_03.png
    │   ├── ch05_04.png
    │   ├── ch05_05.png
    │   ├── ch05_06.png
    │   ├── ch05_07.png
    │   └── ch05_08.png
    ├── ch06
    │   ├── apx_rvx.png
    │   ├── apx_rvx_suf.png
    │   ├── cls01.png
    │   └── vs01.png
    ├── ch07
    │   ├── chemviz2.png
    │   ├── mcs01.png
    │   ├── mcs02.png
    │   ├── mcs03.png
    │   ├── mcs04.png
    │   ├── mcs05.png
    │   ├── mmp01.png
    │   ├── mmp02.png
    │   ├── mmp03.png
    │   ├── mmp04.png
    │   ├── mms01.png
    │   └── scaffold.png
    ├── ch08
    │   ├── pca01.png
    │   ├── pca02.png
    │   └── tsne01.png
    ├── ch10
    │   ├── ch10_1.png
    │   ├── ch10_2.png
    │   └── ch10_3.png
    ├── ch11
    │   ├── ch11_01.png
    │   └── ch11_nfp.png
    ├── jupyter.png
    ├── mishimasyk.png
    ├── python_for_ci.png
    └── souyakuchan.png
├── mkpdf.sh
├── notebooks
    ├── Chembl_FXa.txt
    ├── ch05_Sildenafil vs Vardenafil.ipynb
    ├── ch05_compounds.sdf
    ├── ch05_hetero_shuffle.ipynb
    ├── ch05_rdkit.ipynb
    ├── ch06_nov_hts.sdf
    ├── ch06_similarity.ipynb
    ├── ch07_MCS.ipynb
    ├── ch07_MMS.ipynb
    ├── ch08
    │   ├── CHEMBL2380240.sdf
    │   ├── CHEMBL3098111.sdf
    │   ├── CHEMBL3112474.sdf
    │   ├── CHEMBL3351489.sdf
    │   ├── CHEMBL3352684.sdf
    │   ├── CHEMBL3526050.sdf
    │   ├── CHEMBL3739366.sdf
    │   ├── CHEMBL3739395.sdf
    │   ├── CHEMBL3769367.sdf
    │   └── CHEMBL3867477.sdf
    ├── ch08_compounds.txt
    ├── ch08_visualization.ipynb
    ├── ch09_compounds.txt
    ├── ch09_qsar.ipynb
    ├── ch11_simple_dnn.ipynb
    └── ch12_rnn.ipynb
├── pdf
    └── py4chemoinformatics.pdf
├── py4c-theme.yml
└── py4c.asciidoc


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | fonts
2 | *-checkpoint.ipynb
3 | */.ipynb_checkpoints/*
4 | .ipynb_checkpoints
5 | */.DS_Store
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Attribution-NonCommercial-ShareAlike 4.0 International
  2 | 
  3 | =======================================================================
  4 | 
  5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
  6 | does not provide legal services or legal advice. Distribution of
  7 | Creative Commons public licenses does not create a lawyer-client or
  8 | other relationship. Creative Commons makes its licenses and related
  9 | information available on an "as-is" basis. Creative Commons gives no
 10 | warranties regarding its licenses, any material licensed under their
 11 | terms and conditions, or any related information. Creative Commons
 12 | disclaims all liability for damages resulting from their use to the
 13 | fullest extent possible.
 14 | 
 15 | Using Creative Commons Public Licenses
 16 | 
 17 | Creative Commons public licenses provide a standard set of terms and
 18 | conditions that creators and other rights holders may use to share
 19 | original works of authorship and other material subject to copyright
 20 | and certain other rights specified in the public license below. The
 21 | following considerations are for informational purposes only, are not
 22 | exhaustive, and do not form part of our licenses.
 23 | 
 24 |      Considerations for licensors: Our public licenses are
 25 |      intended for use by those authorized to give the public
 26 |      permission to use material in ways otherwise restricted by
 27 |      copyright and certain other rights. Our licenses are
 28 |      irrevocable. Licensors should read and understand the terms
 29 |      and conditions of the license they choose before applying it.
 30 |      Licensors should also secure all rights necessary before
 31 |      applying our licenses so that the public can reuse the
 32 |      material as expected. Licensors should clearly mark any
 33 |      material not subject to the license. This includes other CC-
 34 |      licensed material, or material used under an exception or
 35 |      limitation to copyright. More considerations for licensors:
 36 | 	wiki.creativecommons.org/Considerations_for_licensors
 37 | 
 38 |      Considerations for the public: By using one of our public
 39 |      licenses, a licensor grants the public permission to use the
 40 |      licensed material under specified terms and conditions. If
 41 |      the licensor's permission is not necessary for any reason--for
 42 |      example, because of any applicable exception or limitation to
 43 |      copyright--then that use is not regulated by the license. Our
 44 |      licenses grant only permissions under copyright and certain
 45 |      other rights that a licensor has authority to grant. Use of
 46 |      the licensed material may still be restricted for other
 47 |      reasons, including because others have copyright or other
 48 |      rights in the material. A licensor may make special requests,
 49 |      such as asking that all changes be marked or described.
 50 |      Although not required by our licenses, you are encouraged to
 51 |      respect those requests where reasonable. More considerations
 52 |      for the public: 
 53 | 	wiki.creativecommons.org/Considerations_for_licensees
 54 | 
 55 | =======================================================================
 56 | 
 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
 58 | Public License
 59 | 
 60 | By exercising the Licensed Rights (defined below), You accept and agree
 61 | to be bound by the terms and conditions of this Creative Commons
 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
 63 | ("Public License"). To the extent this Public License may be
 64 | interpreted as a contract, You are granted the Licensed Rights in
 65 | consideration of Your acceptance of these terms and conditions, and the
 66 | Licensor grants You such rights in consideration of benefits the
 67 | Licensor receives from making the Licensed Material available under
 68 | these terms and conditions.
 69 | 
 70 | 
 71 | Section 1 -- Definitions.
 72 | 
 73 |   a. Adapted Material means material subject to Copyright and Similar
 74 |      Rights that is derived from or based upon the Licensed Material
 75 |      and in which the Licensed Material is translated, altered,
 76 |      arranged, transformed, or otherwise modified in a manner requiring
 77 |      permission under the Copyright and Similar Rights held by the
 78 |      Licensor. For purposes of this Public License, where the Licensed
 79 |      Material is a musical work, performance, or sound recording,
 80 |      Adapted Material is always produced where the Licensed Material is
 81 |      synched in timed relation with a moving image.
 82 | 
 83 |   b. Adapter's License means the license You apply to Your Copyright
 84 |      and Similar Rights in Your contributions to Adapted Material in
 85 |      accordance with the terms and conditions of this Public License.
 86 | 
 87 |   c. BY-NC-SA Compatible License means a license listed at
 88 |      creativecommons.org/compatiblelicenses, approved by Creative
 89 |      Commons as essentially the equivalent of this Public License.
 90 | 
 91 |   d. Copyright and Similar Rights means copyright and/or similar rights
 92 |      closely related to copyright including, without limitation,
 93 |      performance, broadcast, sound recording, and Sui Generis Database
 94 |      Rights, without regard to how the rights are labeled or
 95 |      categorized. For purposes of this Public License, the rights
 96 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 97 |      Rights.
 98 | 
 99 |   e. Effective Technological Measures means those measures that, in the
100 |      absence of proper authority, may not be circumvented under laws
101 |      fulfilling obligations under Article 11 of the WIPO Copyright
102 |      Treaty adopted on December 20, 1996, and/or similar international
103 |      agreements.
104 | 
105 |   f. Exceptions and Limitations means fair use, fair dealing, and/or
106 |      any other exception or limitation to Copyright and Similar Rights
107 |      that applies to Your use of the Licensed Material.
108 | 
109 |   g. License Elements means the license attributes listed in the name
110 |      of a Creative Commons Public License. The License Elements of this
111 |      Public License are Attribution, NonCommercial, and ShareAlike.
112 | 
113 |   h. Licensed Material means the artistic or literary work, database,
114 |      or other material to which the Licensor applied this Public
115 |      License.
116 | 
117 |   i. Licensed Rights means the rights granted to You subject to the
118 |      terms and conditions of this Public License, which are limited to
119 |      all Copyright and Similar Rights that apply to Your use of the
120 |      Licensed Material and that the Licensor has authority to license.
121 | 
122 |   j. Licensor means the individual(s) or entity(ies) granting rights
123 |      under this Public License.
124 | 
125 |   k. NonCommercial means not primarily intended for or directed towards
126 |      commercial advantage or monetary compensation. For purposes of
127 |      this Public License, the exchange of the Licensed Material for
128 |      other material subject to Copyright and Similar Rights by digital
129 |      file-sharing or similar means is NonCommercial provided there is
130 |      no payment of monetary compensation in connection with the
131 |      exchange.
132 | 
133 |   l. Share means to provide material to the public by any means or
134 |      process that requires permission under the Licensed Rights, such
135 |      as reproduction, public display, public performance, distribution,
136 |      dissemination, communication, or importation, and to make material
137 |      available to the public including in ways that members of the
138 |      public may access the material from a place and at a time
139 |      individually chosen by them.
140 | 
141 |   m. Sui Generis Database Rights means rights other than copyright
142 |      resulting from Directive 96/9/EC of the European Parliament and of
143 |      the Council of 11 March 1996 on the legal protection of databases,
144 |      as amended and/or succeeded, as well as other essentially
145 |      equivalent rights anywhere in the world.
146 | 
147 |   n. You means the individual or entity exercising the Licensed Rights
148 |      under this Public License. Your has a corresponding meaning.
149 | 
150 | 
151 | Section 2 -- Scope.
152 | 
153 |   a. License grant.
154 | 
155 |        1. Subject to the terms and conditions of this Public License,
156 |           the Licensor hereby grants You a worldwide, royalty-free,
157 |           non-sublicensable, non-exclusive, irrevocable license to
158 |           exercise the Licensed Rights in the Licensed Material to:
159 | 
160 |             a. reproduce and Share the Licensed Material, in whole or
161 |                in part, for NonCommercial purposes only; and
162 | 
163 |             b. produce, reproduce, and Share Adapted Material for
164 |                NonCommercial purposes only.
165 | 
166 |        2. Exceptions and Limitations. For the avoidance of doubt, where
167 |           Exceptions and Limitations apply to Your use, this Public
168 |           License does not apply, and You do not need to comply with
169 |           its terms and conditions.
170 | 
171 |        3. Term. The term of this Public License is specified in Section
172 |           6(a).
173 | 
174 |        4. Media and formats; technical modifications allowed. The
175 |           Licensor authorizes You to exercise the Licensed Rights in
176 |           all media and formats whether now known or hereafter created,
177 |           and to make technical modifications necessary to do so. The
178 |           Licensor waives and/or agrees not to assert any right or
179 |           authority to forbid You from making technical modifications
180 |           necessary to exercise the Licensed Rights, including
181 |           technical modifications necessary to circumvent Effective
182 |           Technological Measures. For purposes of this Public License,
183 |           simply making modifications authorized by this Section 2(a)
184 |           (4) never produces Adapted Material.
185 | 
186 |        5. Downstream recipients.
187 | 
188 |             a. Offer from the Licensor -- Licensed Material. Every
189 |                recipient of the Licensed Material automatically
190 |                receives an offer from the Licensor to exercise the
191 |                Licensed Rights under the terms and conditions of this
192 |                Public License.
193 | 
194 |             b. Additional offer from the Licensor -- Adapted Material.
195 |                Every recipient of Adapted Material from You
196 |                automatically receives an offer from the Licensor to
197 |                exercise the Licensed Rights in the Adapted Material
198 |                under the conditions of the Adapter's License You apply.
199 | 
200 |             c. No downstream restrictions. You may not offer or impose
201 |                any additional or different terms or conditions on, or
202 |                apply any Effective Technological Measures to, the
203 |                Licensed Material if doing so restricts exercise of the
204 |                Licensed Rights by any recipient of the Licensed
205 |                Material.
206 | 
207 |        6. No endorsement. Nothing in this Public License constitutes or
208 |           may be construed as permission to assert or imply that You
209 |           are, or that Your use of the Licensed Material is, connected
210 |           with, or sponsored, endorsed, or granted official status by,
211 |           the Licensor or others designated to receive attribution as
212 |           provided in Section 3(a)(1)(A)(i).
213 | 
214 |   b. Other rights.
215 | 
216 |        1. Moral rights, such as the right of integrity, are not
217 |           licensed under this Public License, nor are publicity,
218 |           privacy, and/or other similar personality rights; however, to
219 |           the extent possible, the Licensor waives and/or agrees not to
220 |           assert any such rights held by the Licensor to the limited
221 |           extent necessary to allow You to exercise the Licensed
222 |           Rights, but not otherwise.
223 | 
224 |        2. Patent and trademark rights are not licensed under this
225 |           Public License.
226 | 
227 |        3. To the extent possible, the Licensor waives any right to
228 |           collect royalties from You for the exercise of the Licensed
229 |           Rights, whether directly or through a collecting society
230 |           under any voluntary or waivable statutory or compulsory
231 |           licensing scheme. In all other cases the Licensor expressly
232 |           reserves any right to collect such royalties, including when
233 |           the Licensed Material is used other than for NonCommercial
234 |           purposes.
235 | 
236 | 
237 | Section 3 -- License Conditions.
238 | 
239 | Your exercise of the Licensed Rights is expressly made subject to the
240 | following conditions.
241 | 
242 |   a. Attribution.
243 | 
244 |        1. If You Share the Licensed Material (including in modified
245 |           form), You must:
246 | 
247 |             a. retain the following if it is supplied by the Licensor
248 |                with the Licensed Material:
249 | 
250 |                  i. identification of the creator(s) of the Licensed
251 |                     Material and any others designated to receive
252 |                     attribution, in any reasonable manner requested by
253 |                     the Licensor (including by pseudonym if
254 |                     designated);
255 | 
256 |                 ii. a copyright notice;
257 | 
258 |                iii. a notice that refers to this Public License;
259 | 
260 |                 iv. a notice that refers to the disclaimer of
261 |                     warranties;
262 | 
263 |                  v. a URI or hyperlink to the Licensed Material to the
264 |                     extent reasonably practicable;
265 | 
266 |             b. indicate if You modified the Licensed Material and
267 |                retain an indication of any previous modifications; and
268 | 
269 |             c. indicate the Licensed Material is licensed under this
270 |                Public License, and include the text of, or the URI or
271 |                hyperlink to, this Public License.
272 | 
273 |        2. You may satisfy the conditions in Section 3(a)(1) in any
274 |           reasonable manner based on the medium, means, and context in
275 |           which You Share the Licensed Material. For example, it may be
276 |           reasonable to satisfy the conditions by providing a URI or
277 |           hyperlink to a resource that includes the required
278 |           information.
279 |        3. If requested by the Licensor, You must remove any of the
280 |           information required by Section 3(a)(1)(A) to the extent
281 |           reasonably practicable.
282 | 
283 |   b. ShareAlike.
284 | 
285 |      In addition to the conditions in Section 3(a), if You Share
286 |      Adapted Material You produce, the following conditions also apply.
287 | 
288 |        1. The Adapter's License You apply must be a Creative Commons
289 |           license with the same License Elements, this version or
290 |           later, or a BY-NC-SA Compatible License.
291 | 
292 |        2. You must include the text of, or the URI or hyperlink to, the
293 |           Adapter's License You apply. You may satisfy this condition
294 |           in any reasonable manner based on the medium, means, and
295 |           context in which You Share Adapted Material.
296 | 
297 |        3. You may not offer or impose any additional or different terms
298 |           or conditions on, or apply any Effective Technological
299 |           Measures to, Adapted Material that restrict exercise of the
300 |           rights granted under the Adapter's License You apply.
301 | 
302 | 
303 | Section 4 -- Sui Generis Database Rights.
304 | 
305 | Where the Licensed Rights include Sui Generis Database Rights that
306 | apply to Your use of the Licensed Material:
307 | 
308 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309 |      to extract, reuse, reproduce, and Share all or a substantial
310 |      portion of the contents of the database for NonCommercial purposes
311 |      only;
312 | 
313 |   b. if You include all or a substantial portion of the database
314 |      contents in a database in which You have Sui Generis Database
315 |      Rights, then the database in which You have Sui Generis Database
316 |      Rights (but not its individual contents) is Adapted Material,
317 |      including for purposes of Section 3(b); and
318 | 
319 |   c. You must comply with the conditions in Section 3(a) if You Share
320 |      all or a substantial portion of the contents of the database.
321 | 
322 | For the avoidance of doubt, this Section 4 supplements and does not
323 | replace Your obligations under this Public License where the Licensed
324 | Rights include other Copyright and Similar Rights.
325 | 
326 | 
327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328 | 
329 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339 | 
340 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349 | 
350 |   c. The disclaimer of warranties and limitation of liability provided
351 |      above shall be interpreted in a manner that, to the extent
352 |      possible, most closely approximates an absolute disclaimer and
353 |      waiver of all liability.
354 | 
355 | 
356 | Section 6 -- Term and Termination.
357 | 
358 |   a. This Public License applies for the term of the Copyright and
359 |      Similar Rights licensed here. However, if You fail to comply with
360 |      this Public License, then Your rights under this Public License
361 |      terminate automatically.
362 | 
363 |   b. Where Your right to use the Licensed Material has terminated under
364 |      Section 6(a), it reinstates:
365 | 
366 |        1. automatically as of the date the violation is cured, provided
367 |           it is cured within 30 days of Your discovery of the
368 |           violation; or
369 | 
370 |        2. upon express reinstatement by the Licensor.
371 | 
372 |      For the avoidance of doubt, this Section 6(b) does not affect any
373 |      right the Licensor may have to seek remedies for Your violations
374 |      of this Public License.
375 | 
376 |   c. For the avoidance of doubt, the Licensor may also offer the
377 |      Licensed Material under separate terms or conditions or stop
378 |      distributing the Licensed Material at any time; however, doing so
379 |      will not terminate this Public License.
380 | 
381 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382 |      License.
383 | 
384 | 
385 | Section 7 -- Other Terms and Conditions.
386 | 
387 |   a. The Licensor shall not be bound by any additional or different
388 |      terms or conditions communicated by You unless expressly agreed.
389 | 
390 |   b. Any arrangements, understandings, or agreements regarding the
391 |      Licensed Material not stated herein are separate from and
392 |      independent of the terms and conditions of this Public License.
393 | 
394 | 
395 | Section 8 -- Interpretation.
396 | 
397 |   a. For the avoidance of doubt, this Public License does not, and
398 |      shall not be interpreted to, reduce, limit, restrict, or impose
399 |      conditions on any use of the Licensed Material that could lawfully
400 |      be made without permission under this Public License.
401 | 
402 |   b. To the extent possible, if any provision of this Public License is
403 |      deemed unenforceable, it shall be automatically reformed to the
404 |      minimum extent necessary to make it enforceable. If the provision
405 |      cannot be reformed, it shall be severed from this Public License
406 |      without affecting the enforceability of the remaining terms and
407 |      conditions.
408 | 
409 |   c. No term or condition of this Public License will be waived and no
410 |      failure to comply consented to unless expressly agreed to by the
411 |      Licensor.
412 | 
413 |   d. Nothing in this Public License constitutes or may be interpreted
414 |      as a limitation upon, or waiver of, any privileges and immunities
415 |      that apply to the Licensor or You, including from the legal
416 |      processes of any jurisdiction or authority.
417 | 
418 | =======================================================================
419 | 
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 | 
437 | Creative Commons may be contacted at creativecommons.org.
438 | 
439 | 


--------------------------------------------------------------------------------
/README.asciidoc:
--------------------------------------------------------------------------------
 1 | = Table of Contents
 2 | :imagesdir: images
 3 | 
 4 | *Update 03_2019:* Forked and attempted an English translation. Corrections are welcome.
 5 | 
 6 | I added a little (2018.12.12). Since the web interface is likely to be beyond the scope of an introductory text, I will consider how to handle it.
 7 | 
 8 | - https://asciidoctor.org/docs/asciidoc-syntax-quick-reference/#formatted-text[AsciiDoc Syntax Quick Reference]
 9 | 
10 | image::python_for_ci.png[py4chemoinformatics, width=250]
11 | 
12 | == link:ch01_introduction.asciidoc[01 Introduction]
13 | 
14 | - What is chemoinformatics?
15 | - What is RDKit?
16 | - Target audience
17 | - Acknowledgment
18 | - License
19 | 
20 | == link:ch02_installation.asciidoc[02 Let's prepare the environment for chemoinformatics]
21 | 
22 | - Anaconda(Python, Jupyter, scikit-learn)
23 | - RDKit
24 | 
25 | == link:ch03_python.asciidoc[03 Basics of Python programming]
26 | 
27 | - Python basics
28 | - Let's use it conveniently with Jupyter notebook
29 | - To do machine learning with Python
30 | 
31 | == link:ch04_database.asciidoc[04 Public database for chemoinformatics]
32 | 
33 | - ChEMBL
34 | - PubChem
35 | - Search for the information you want on ChEMBL
36 | 
37 | == link:ch05_rdkit.asciidoc[05 Handling Structural Information with RDKit]
38 | 
39 | - What is SMILES?
40 | - Let's draw the structure
41 | - How to handle multiple compounds at once?
42 | 
43 | == link:ch06_similarity.asciidoc[06 Try to evaluate the similarity of compounds]
44 | 
45 | - Descriptor, fingerprint
46 | - Calculate similarity
47 | - Virtual screening
48 | 
49 | == link:ch07_graph.asciidoc[07 Evaluation of similarity using graph structure]
50 | 
51 | - Classification by major skeleton (MCS)
52 | - Compound Network by Matched Molecular Pair
53 | - Visualize MMP networks using Cytoscape
54 | 
55 | == link:ch08_visualization.asciidoc[08 I want to see many compounds at once]
56 | 
57 | - What is Chemical Space?
58 | - Mapping using tSNE
59 | 
60 | == link:ch09_qsar.asciidoc[09 Basics of Quantitative Structure-Activity Relationship (QSAR)]
61 | 
62 | - Consider the cause of the effect (Classification problem)
63 | - Predict the efficacy of drugs (regression problem)
64 | - Model applicability (applicability domain)
65 | 
66 | == link:ch10_deeplearning.asciidoc[10 Introduction to Deep-Learning]
67 | 
68 | - About TensorFlow and Keras
69 | - Google colab
70 | - Let's install
71 | 
72 | == link:ch11_dlqsar.asciidoc[11 Structure-activity relationship using deep-learning]
73 | 
74 | - Predictive model construction using DNN
75 | - I will devise a descriptor (neural fingerprint)
76 | 
77 | == link:ch12_generativemodels.asciidoc[12 Let the computer think about chemical structure]
78 | 
79 | - Structure generation using Recurrent Neural Network
80 | 
81 | == link:ch13_beyond.asciidoc[13 Conclusion]
82 | 
83 | - Final remarks and further reading
84 | 


--------------------------------------------------------------------------------
/ch00_cover.asciidoc:
--------------------------------------------------------------------------------
 1 | = Introduction to chemoinformatics for AI drug discovery
 2 | @fmkz___, @iwatobipen
 3 | v0.40002(Draft) 2019/03/20
 4 | :toc:
 5 | :toc-title: Table of Contents
 6 | :lang: en
 7 | :doctype: book
 8 | :docname: Introduction to chemoinformatics for AI drug discovery
 9 | :imagesdir: ./images
10 | :pdf-fontsdir: fonts
11 | :pdf-style: py4c-theme.yml
12 | :source-highlighter: coderay
13 | :title-logo-image: image::souyakuchan.png[mishima.syk]
14 | 


--------------------------------------------------------------------------------
/ch01_introduction.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 1: Introduction
  2 | :imagesdir: ./images
  3 | 
  4 | Chemoinformatics is a methodology for analyzing mainly chemistry-related data with a computer in order to solve various problems. The term chemoinformatics was defined in the late 1990s and early 2000s, and in the pharmaceutical industry and pharmaceutical academia it is used in a wide variety of processes, including analyzing the relationship between drug effects and compound characteristics, visualizing large amounts of compound information, and clustering compounds based on their similarity.
  5 | 
  6 | In recent years, applications of deep learning to drug discovery have been explored. Not only QSAR (Quantitative Structure-Activity Relationship) for predicting activity and physical properties, but also applied research in areas that conventional chemoinformatics did not address, such as **new design proposals** and **synthetic route proposals**, is being actively conducted.
  7 | 
  8 | 
  9 | Compound design is innovative
 10 | 
 11 | ****
 12 | What kind of compound should we make in the first place? And how should it be synthesized? Thinking through these questions requires background knowledge and imagination, and it has conventionally been regarded as an area that is difficult for anyone other than humans to take on. However, the advance of what is called AI into such areas has progressed rapidly over the past several years (2017-2019).
 13 | ****
 14 | 
 15 | Chemoinformatics has already been used in various situations, but there was not much relevant information. There are several possible reasons for this, but there is no doubt that the two main reasons are that there were no open source toolkits and no public databases. However, with the advent of an open source chemoinformatics toolkit called RDKit and a public database called ChEMBL, this has been resolved.
 16 | 
 17 | In recent years, in chemoinformatics as in bioinformatics, a lot of information can be found immediately by searching the web, and it is possible to learn by yourself. Still, as a set of information for taking the first step, we decided to prepare "content from which you can learn the basics of chemoinformatics and apply them". Considering the recent AI drug discovery boom, the later chapters cover compound activity prediction and compound proposal using deep learning in the context of “AI drug discovery”, so you should be able to learn everything in one place and keep up with recent trends.
 18 | 
 19 | <<<
 20 | 
 21 | === What is RDKit
 22 | 
 23 | warning:: Here is a subsection of @ iwatobipen's talk about RDKit. At the draft stage, the words such as "I will say" or "based on" are used as they are, and the self-proclaimed is a "comprehensive" @ iwatobipen-style style of "gozuru" tone.
 24 | 
 25 | My name is @iwatobipen, who writes a part of this book. I'm going to talk hot about RDKit here.
 26 | 
 27 | What is the RD of RDKit? Actually, it is an abbreviation of **Rational Discovery**, and the framework that is the predecessor of the current open source version was developed in 2000. It has quite a long history. Then, in 2006, the code became open source and was released on SourceForge. Some readers may be thinking that, besides RDKit, Python's chemoinformatics toolkits also include OpenBabel. OpenBabel was first released in 2005. Both are toolkits with more than 10 years of history. I remember that OpenBabel was the major one around 2012, when I began to be interested in this area. At that time, there were almost no articles in Japanese, and I, one of the people who wrote this link:https://kzfm.hatenablog.com/archive[book], wrote RDKit code by trial and error while referring to the link:https://kzfm.hatenablog.com/archive[chemo info] cookbook of @fmkz___, who is a co-author of this book and a pioneer in the industry.
 28 | If you want to keep track of chemoinfo related history, you should read this link:http://blog.kzfmix.com/entry/1542711744[article].
 29 | 
 30 | 
 31 | Developer Greg Landrum says
 32 | 
 33 | [quote, Greg Landrum]
 34 | RDKit is the Swiss Army Knife in chemoinformatics and is a collection of various functional pieces
 35 | 
 36 | This expression hits the mark exactly. As you can see if you look at the link:https://www.rdkit.org/docs/[official document], it already has various features: reading and writing of compound information, drawing of structures, 3D conformation generation, R group decomposition, descriptor and fingerprint calculation, pharmacophore calculation, and so on. It can cover a wide range from analysis to visualization. Furthermore, the tools developed by contributors and others using RDKit are packed in the link:https://github.com/rdkit/rdkit/tree/master/Contrib[Contrib] folder along with their passion. How do you want to use it? Now I want to write code with RDKit as soon as possible, I can't wait ;)
 37 | 
 38 | NOTE: @iwatobipen is, of course, one of the contributors, and provides code to quickly cluster a large number of compound libraries called link:https://github.com/rdkit/rdkit/tree/master/Contrib/Fastcluster[Fastcluster] . (by @fmkz___)
 39 | 
 40 | RDKit also has an active developer and user community, and more features keep being added. The way talented researchers from all over the world build it up and develop it together is the strength and attraction of open source. If you have a chance, consider joining the annual RDKit User Group Meeting. Nothing can replace users discussing with each other face to face. In addition, I said that there was almost no information in Japanese at the time when I began to use it, but in recent years there have been a lot of very good Japanese articles. Some examples are listed below. There are also many articles posted on Qiita.
 41 | 
 42 | In addition, link:http://rdkit-users.jp/[RDKit-users-jp], run by volunteers, has also been launched. If asking a question in English feels a bit daunting, please ask your question there. Also, Japanese documents have been merged into the latest version of RDKit's repository. These will also be helpful. This document only uses some of RDKit's features, but you should still get the feeling that you can do a lot of things. Once you have taken the first step, go ahead following your own interest and motivation. If you do not understand something, ask the above community or post it to the repository of this book as an issue. **Well then let's get started!**
 43 | 
 44 | ==== Main Japanese Commentary Site
 45 | 
 46 | - link:http://rdkit-users.jp/[rdkit-users.jp]
 47 | - link:https://magattaca.github.io/RDKit_unofficial_translation_JP/[RDKitドキュメンテーション非公式日本語版サイト:Unofficial site of rdkit documentation]
 48 | - link:https://future-chem.com/[化学の新しいカタチ:The shape of new chemistry]
 49 | 
 50 | === Target audience
 51 | 
 52 | The following people are assumed as readers.
 53 | 
 54 | - Graduate students in pharmacy, and postdocs who want to do data analysis in medicine and pharmacy
 55 | - Pharmacist at a pharmaceutical company who wants to analyze his own data
 56 | - Drug discovery chemists who feel the need for chemoinformatics, and those who were suddenly assigned to it by some mysterious force
 57 | - Bioinformaticians who are thinking of learning chemoinformatics
 58 | - People who are interested in AI drug discovery but do not know what to start with
 59 | 
 60 | === About the code of this book
 61 | 
 62 | All of the programming code used in this book is located in the notebooks directory of the link:https://github.com/Mishima-syk/py4chemoinformatics[py4chemoinformatics repository of Mishima.syk]. At the beginning of each chapter, the image:jupyter.png[width="20"] icon marks a link to that chapter's Jupyter notebook, so please refer to it as needed.
 63 | 
 64 | The installation of Chapter 2 will enable you to use git commands, so you can download all the data in this manual including pdf with the following command
 65 | 
 66 | [source, bash]
 67 | ----
 68 | $ git clone https://github.com/Mishima-syk/py4chemoinformatics.git
 69 | ----
 70 | 
 71 | === bonus
 72 | 
 73 | .Chemoinformatics or Cheminformatics?
 74 | ****
 75 | Chemoinformatics or Cheminformatics?
 76 | Originally I remember that Bio and the combination of the word “Chemo” appeared, but it was widely separated from Chem for a while by the launch of the link:https://jcheminf.biomedcentral.com/[Journal of Cheminformatics].
 77 | 
 78 | According to the recent link:https://trends.google.co.jp/trends/explore?date=all&q=chemoinformatics,cheminformatics[Google trend], it seems either way, but personally I think that it is better to put emphasis on Rhyme, so I will use Chemo in this book.
 79 | ****
 80 | 
 81 | <<<
 82 | 
 83 | === Acknowledgment
 84 | 
 85 | We would like to thank the following people for their bug fixes and suggestions for improvement when writing this document:
 86 | 
 87 | link:https://twitter.com/antiplastics[@antiplastics],
 88 | link:https://twitter.com/bonohu[@bonohu],
 89 | link:https://twitter.com/ReLuTropy[@ReLuTropy],
 90 | link:https://twitter.com/ski_nanko[@ski_nanko],
 91 | link:https://twitter.com/torusengoku[@torusengoku],
 92 | link:https://twitter.com/yamasaKit_[@yamasaKit_],
 93 | link:https://twitter.com/4Elemento[@4Elemento],
 94 | @4Elemento, thanks a lot for the translation task!!!! (from @iwatobipen)
 95 | 
 96 | From here onwards I wrote while listening to Nujabes-reflection eternal by @fmkz___ 20/03/20
 97 | 
 98 | First of all, I would like to thank link:https://twitter.com/bonohu[@bonohu], who triggered me to write this book. @bonohu is the author of link:https://www.amazon.co.jp/dp/4895929019[Dr. Bono's analysis of life science data]. At a meeting of Mishima.syk we said that "a chemoinformatics version of the Bono book" would be nice. There is no doubt that what triggered me to write this book was the thought, "Well, if so, why not write it?" Also, link:https://twitter.com/souyakuchan[@souyakuchan]'s link:https://adventar.org/calendars/3041[Drug Advent Calendar 2018, written in Japanese] was a good stimulus for writing. In other words, I think I would not have actually started moving if I had not made a chapter here.
 99 | 
100 | Also, we must not forget y-sama. y-sama, who had been with link:http://mishima-syk.github.io/[Mishima.syk] since its beginning, passed away on 2019/01/06. He wrote wonderful posts such as link:https://qiita.com/y\__sama/items/5b62d31cb7e6ed50f02c[Python environment construction of the person who aims at the data scientist 2016] and link:https://medium.com/@y__sama/druglikeness%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6%E3%81%AE%E3%82%88%E3%82%82%E3%82%84%E3%81%BE%E8%A9%B1-8310cec5ffc6[Small talk about drug likeness: written in Japanese]. If he were alive, the three of us would probably have written this book together and the content would have been more complete. This event also gave us a strong motivation to write.
101 | 
102 | Finally, I would like to thank the participants who participated in Mishima.syk for drinking good wine and beer and having a hot discussion every time. Some content is based on the presentation at Mishima.syk, and has been revised based on your feedback.
103 | 
104 | If you have read this book, and if you feel that chemoinformatics is interesting or you want to do drug discovery, please join Mishima.syk. I think it will be fun. In future drug discovery research, it will be important to push each other across affiliations and improve their skills. In fact, I think it is already such a society. I hope this book will help you have a pleasant research life.
105 | 
106 | [quote, y__sama]
107 | I do what I want to do I live myself, I have no regrets in my life.
108 | Life enjoys winning.
109 | I think it would be fun to enjoy your life by chasing your joy to the fullest by saying that you hate something you hate.
110 | I wish you all the best in your life.
111 | 
112 | === License
113 | 
114 | This document is copyright (C) 2019 by @fmkz___ and @iwatobipen
115 | 
116 | This document is link:https://github.com/Mishima-syk/py4chemoinformatics/blob/master/LICENSE[Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
117 | Public License].
118 | 
119 | image::by-nc-sa.png[CC-BY-NC-SA, width=100]
120 | 
121 | <<<
122 | 


--------------------------------------------------------------------------------
/ch02_installation.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 2: Prepare the Environment for Chemoinformatics
  2 | :imagesdir: images
  3 | 
  4 | We will build the environment required for this document.
  5 | 
  6 | === About Anaconda
  7 | 
  8 | Anaconda is a package for easy environment creation and management for doing machine learning. You can also easily install packages, like RDKit, which will be explained later.
  9 | 
 10 | 
 11 | ==== Q&A
 12 | 
 13 | Why use Anaconda?::
 14 | The programming language Python has a relatively large number of standard libraries, but you need to install the libraries for chemoinformatics yourself. This is not a big deal if you get used to it, but it will be troublesome for beginners. Anaconda comes into play in order to reduce this effort.
 15 | 
 16 | 
 17 | There are two major versions of Python: 2.x and 3.x.::
 18 |   link:https://pythonclock.org/[Support for 2.x will end in 2020], so new learners do not need to use 2.x.
 19 | 
 20 | === How to install Anaconda
 21 | 
 22 | Now let's install Anaconda. Visit the link:https://www.anaconda.com/[official site] and download the Python 3 installer for your environment. If the OS is Linux / Mac, you can select the installer of GUI / CUI, so download Python 3.7 64-bit command line installer.
 23 | 
 24 | image::ch02/anaconda01.png[APX+RVX, width=600, pdfwidth=60%]
 25 | 
 26 | [source, bash]
 27 | ----
 28 | $ bash ~/Downloads/Anaconda3-4.1.0-Linux-x86_64.sh # Please change the installer name accordingly
 29 | ----
 30 | 
 31 | Press Enter
 32 | 
 33 | [source, bash]
 34 | ----
 35 | Welcome to Anaconda3 2018.12
 36 | 
 37 | In order to continue the installation process, please review the license
 38 | agreement.
 39 | Please, press ENTER to continue
 40 | >>> 
 41 | ----
 42 | 
 43 | Keep pressing Enter, and type yes when asked to answer yes or no.
 44 | 
 45 | [source, bash]
 46 | ----
 47 | Do you accept the license terms? [yes|no]
 48 | [no] >>> 
 49 | ----
 50 | 
 51 | You will be asked where to install, but the default location is usually fine.
 52 | Press Return.
 53 | 
 54 | [source, bash]
 55 | ----
 56 | Anaconda3 will now be installed into this location:
 57 | /Users/kzfm/anaconda3
 58 | 
 59 |   - Press ENTER to confirm the location
 60 |   - Press CTRL-C to abort the installation
 61 |   - Or specify a different location below
 62 | ----
 63 | 
 64 | After installation you will also be asked whether you want to install VSCode, so answer no.
 65 | 
 66 | [source, bash]
 67 | ----
 68 | Thank you for installing Anaconda3!
 69 | 
 70 | ===========================================================================
 71 | 
 72 | Anaconda is partnered with Microsoft! Microsoft VSCode is a streamlined
 73 | code editor with support for development operations like debugging, task
 74 | running and version control.
 75 | 
 76 | To install Visual Studio Code, you will need:
 77 |   - Internet connectivity
 78 | 
 79 | Visual Studio Code License: https://code.visualstudio.com/license
 80 | 
 81 | Do you wish to proceed with the installation of Microsoft VSCode? [yes|no]
 82 | >>> Please answer 'yes' or 'no':
 83 | >>> 
 84 | ----
 85 | 
 86 | Once the Anaconda installation is complete, you will be able to use the 'conda' command from a command prompt or terminal.
 87 | 
 88 | === Build a Virtual Environment and Install a Package
 89 | 
 90 | Python installed with Anaconda is 3.7, but the latest RDKit distributed at the time of this writing requires Python 3.6. So build a virtual environment with conda and install the required version of Python. The name after -n in the command is "py4chemoinformatics", but you can use any name you like. After creating the virtual environment, install the packages used in this chapter and later.
 91 | 
 92 | [source, bash]
 93 | ----
 94 | $ conda create -n py4chemoinformatics python=3.6
 95 | $ source activate py4chemoinformatics # Mac/Linux
 96 | $ activate py4chemoinformatics # Windows
 97 | 
 98 | # install packages
 99 | $ conda install -c conda-forge rdkit
100 | $ conda install -c conda-forge seaborn
101 | $ conda install -c conda-forge ggplot
102 | $ conda install -c conda-forge git
103 | ----
104 | 
105 | === Description of installed package
106 | 
107 | ==== RDKit
108 | 
109 | RDKit is one of the most commonly used toolkits in the field of chemoinformatics. It is so-called open source software (OSS) and can be used free of charge. For more information, please refer to link:ch01_introduction.asciidoc[Introduction].
110 | 
111 | ==== seaborn
112 | It is one of the packages for link:https://seaborn.pydata.org/[visualizing statistical data].
113 | 
114 | ==== ggplot
115 | 
116 | It is one of the graph drawing packages, and its feature is that graphs can be drawn in a rational way with a consistent grammar. Originally developed for the statistical analysis language R, it was ported to Python by the company link:http://ggplot.yhathq.com/[yhat].
117 | 
118 | ==== Git
119 | 
120 | It is a version control system. I will not explain Git in this book, but if you do not know Git at all , take a look at link:https://backlog.com/ja/git-tutorial/[Git Primer], which can be understood by monkeys.
121 | 
122 | As explained in "Introduction", all data including pdf will be downloaded by the following command, so please download it as necessary.
123 | 
124 | [source, bash]
125 | ----
126 | $ git clone https://github.com/Mishima-syk/py4chemoinformatics.git
127 | ----
128 | 
129 | === Learn more about Conda
130 | 
131 | Why create a virtual environment::
132 | Some systems use Python internally to provide various features, so changing the Python version for a particular package can cause problems. Virtual environments solve these problems. Even if the package requires different library versions, you can set up a virtual Python environment for trial and error. If it becomes unnecessary, the virtual environment can be easily deleted without causing any problems in the original environment. So, by being able to create separate development environments in one system, you will not be bothered by library dependencies problems and Python version differences that often occur during development.
133 | 
134 | In this document, only one virtual environment is prepared, but in practice many virtual environments are often created for development. Therefore, I will list the conda subcommands that I use frequently.
135 | 
136 | [source, bash]
137 | ----
138 | $ conda install <package name> # install package
139 | $ conda create -n <Name-of-virtual-environment> python=<version> # Create virtual environment.
140 | $ conda info -e  # Display virtual environment list created 
141 | $ conda remove -n <environment-name> --all # Virtual environment deletion
142 | $ source activate <environment-name> # Using virtual environment ( Mac/Linux) 
143 | $ activate <environment-name> # Using virtual environment (Windows)
144 | $ source deactivate # leaving virtual environment 
145 | $ conda list # Display a list of libraries installed in the virtual environment you are using now
146 | ----
147 | 
148 | <<<
149 | 


--------------------------------------------------------------------------------
/ch03_python.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 3: Basics of Python programming
  2 | 
  3 | === Python basics
  4 | 
  5 | This chapter introduces web sites and books **for effective learning** for python beginners.
  6 | If there is something you do not understand in the following chapters, these resources will help you.
  7 | ////
  8 | この章ではPythonに触れたことのない読者のために**効率的に勉強するため**のサイトや本などを紹介します。
  9 | もしこれ以降の章でわからないことなどがあったら、この章のサイトや本を参考に学んでみてください。
 10 | ////
 11 | 
 12 | ==== Would like to learn Python from books
 13 | 
 14 | https://www.amazon.co.jp/dp/4774196436/[Pythonスタートブック増補改訂版:Python start book]::
 15 | We recommend this book if you are a beginner at programming.
 16 | 
 17 | https://www.amazon.co.jp/dp/B01NCOIC2P/[みんなのPython 第4版:Python for everyone]::
 18 | If you have any experience in programming such as Javascript and/or Java, and would like to learn python now, this book is recommended.
 19 | ////
 20 | JavascriptやJavaなどのなにかプログラミングを少しかじっていて、これからPythonを覚えたいのであればこちらの本をおすすめします。
 21 | ////
 22 | 
 23 | ==== Would like to learn Python from any sources
 24 | 
 25 | https://www.pycon.jp/support/bootcamp.html[Python Boot Camp(tutorial for python beginners)]::
 26 | This is a Python tutorial event for beginners held by PyCon JP. The events are held all over Japan. How about joining one when it takes place near you?
 27 | ////
 28 | 一般社団法人PyCon JPが開催している初心者向けPythonチュートリアルイベントです。全国各地で行われているので近くで開催される場合には参加するとよいでしょう
 29 | ////
 30 | 
 31 | https://connpass.com/category/Python/[Local communities]::
 32 | It seems good to increase your motivation to join study workshops for beginners or for professionals. You can find many workshops in connpass site. 
 33 | ////
 34 | あちこちで入門者向けからガチのヒト向けまでの勉強会やコミュニティなどもあるので、そういうのに参加してモチベーションを高めるのもよい方法です。
 35 | ////
 36 | 
 37 | https://www.udemy.com/topic/python/[udemy/python]::
 38 | Using an online learning service is an effective way to learn programming, but we have never tried it ourselves. You should ask around about its reputation. There are also many resources on YouTube.
 39 | ////
 40 | オンライン学習サービスを利用するのも効果的な手段のひとつですが、筆者は試したことがないのでわかりません。
 41 | 周りの評判を聞いてみても良いでしょう。YouTubeを探すのもありです。
 42 | ////
 43 | 
 44 | ==== If there is something you do not understand in this book
 45 | 
 46 | https://github.com/Mishima-syk/py4chemoinformatics/issues[py4chemoinformatics issues]::
 47 | We are happy to answer your questions if you post them as issues in the py4chemoinformatics repository. If something is difficult to understand, we will correct it.
 48 | The cycle will make the document better and everybody will be happy ;)
 49 | ////
 50 | py4chemoinformaticsのissuesに質問していただければお答えします。わかりにくい場合だったら修正しますので、よりよくなってみんなハッピー。
 51 | ////
 52 | 
 53 | ////
 54 | https://qiita.com/[Qiita]::
 55 | Qiitaで探せば大抵答えが見つかるはずです。
 56 | Qiita is a community for Japanese. All documents are written in Japanese
 57 | ////
 58 | 
 59 | https://stackoverflow.com/[stackoverflow]::
 60 | Stack Overflow is a good community. You should search Stack Overflow first and then ask the community.
 61 | 
 62 | http://mishima-syk.github.io/[Mishima.syk]::
 63 | Mishima.syk is the community where people who write the book gather. Topics are not limited to python but there are many presentations about python now. Discussion level is high but the community is also beginner friendly. We have planned hands-on sessions and they have an established reputation. The community members should be able to answer your questions. 
 64 | ////
 65 | 本書を書いている人たちが集まるコミュニティです。特に話題をPythonに限定していませんが、Pythonを使ったネタが多めです。かなりガチですが、初心者対応も万全でハンズオンに定評があります。質問されれば大体答えられます。
 66 | ////
 67 | 
 68 | === Let's use it conveniently with Jupyter notebook
 69 | By using link:https://jupyter.org/[Jupyter notebook], it is easy to write code and check the results.
 70 | ////
 71 | link:https://jupyter.org/[Jupyter notebook]を利用すると、コードを書いて結果を確認するということがとても簡単にできるようになります。
 72 | ////
 73 | 
 74 | The Jupyter Notebook is an open-source web application that allows you to embed code, rich text, math equation and etc. And it is easy to make high quality visualizations of the results. It is a nice platform for chemoinformatics because Jupyter Notebook can run code and draw chemical structures and many kinds of plots. Also, it has many features which improve programming productivity such as syntax highlight and auto indent. We recommend to use Jupyter especially for programming beginners.
 75 | ////
 76 | Jupyter notebookはWebブラウザーベースのツールで、コードだけではなくリッチテキスト、数式、なども同時にノートブックに埋め込めます。また結果を非常に綺麗な図として可視化することも容易にできます。つまり、化学構造やグラフも描画できるため、ケモインフォマティクスのためのプラットフォームとして使いやすいです。さらに、プログラミングの生産性を上げるような、ブラウザ上でコードを書くとシンタックスハイライトや、インデント挿入を自動で行ってくれたりという便利な機能もついているので、特に初学者は積極的に使うべきでしょう。
 77 | ////
 78 | 
 79 | ==== How to use?
 80 | 
 81 | from terminal (in Windows, anaconda prompt)
 82 | 
 83 | [source, bash]
 84 | ----
 85 | $ jupyter notebook
 86 | ----
 87 | 
 88 | After you type the command above, Jupyter Notebook will be launched. In this book, all code is run on Jupyter Notebook unless otherwise noted.
 89 | ////
 90 | と打てばJupyter Notebookが立ち上がります。本書ではこれ以降特に断らない限りJupyter Notebook上でのコードを実行することとします。
 91 | ////
 92 | 
 93 | === For machine learning with Python
 94 | 
 95 | **Machine learning** is a must for learning informatics, not only chemoinformatics. Some background knowledge of machine learning is assumed in the following chapters. link:https://scikit-learn.org/stable/[Scikit-learn] is the de facto standard machine learning library for Python. We use the package without any particular explanation, but we would like to share some resources for beginners.
 96 | 
 97 | link:http://shop.oreilly.com/product/0636920030515.do[Introduction to Machine Learning with Python]::
 98 | You can learn the basics of machine learning with Python. It is easy to read because there are few mathematical expressions.
 99 | 
100 | link:https://github.com/Mishima-syk/sklearn-tutorial[sklearn-tutorial]::
101 | Sklearn tutorial hands-on by @y-sama. Written in jupyter notebook.
102 | 
103 | ////
104 | ケモインフォマティクスに限らず、インフォマティクスを学ぶにあたり、機械学習は外せません。本書でもある程度の機械学習の知識があることを前提に進めていきます。Pythonで機械学習をするにはlink:https://scikit-learn.org/stable/[Scikit-learn]というライブラリを利用するのが定番であり、本書でも特に説明せずに利用していきますが、初学者のために参考となる書籍などをすすめておきます。
105 | 
106 | link:https://www.amazon.co.jp/dp/4873117984/[Pythonではじめる機械学習 ―scikit-learnで学ぶ特徴量エンジニアリングと機械学習の基礎]::
107 | Pythonで機械学習をやるための基礎を学べます。数学的な表現があまりないので読みやすいです。
108 | 
109 | link:https://github.com/Mishima-syk/sklearn-tutorial[sklearn-tutorial]::
110 | y-samaによるsklearnのチュートリアルハンズオンのjupyter notebookです。
111 | ////
112 | 
113 | <<<
114 | 


--------------------------------------------------------------------------------
/ch04_database.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 4: Public databases for chemoinformatics
  2 | :imagesdir: images
  3 | 
  4 | ////
  5 | この章ではケモインフォマティクスでよく使うデータベースを紹介します。
  6 | ////
  7 | This chapter describes common databases that are used in chemoinformatics.
  8 | 
  9 | === ChEMBL
 10 | 
 11 | ////
 12 | link:https://www.ebi.ac.uk/chembl/[ChEMBL]はEBIのChEMBLチームにより維持管理されている医薬品及び開発化合物の結合データ、薬物動態、薬理活性を収録したデータベースです。データは主にメディシナルケミストリ関連のジャーナルから手動で抽出されており、大体3,4ヶ月に一度データの更新があります。
 13 | 
 14 | メディシナルケミストリ関連のジャーナルからデータを収集しているため、QSARに関連する情報や背景知識を論文そのものに求めることが可能であり、創薬研究をする際には有用です。
 15 | 
 16 | NOTE: ChEMBLはもともとはlink:http://chembl.blogspot.com/2009/11/faq-where-can-i-download-starlite.html[StARlite]という商用データベースでした。詳しくはlink:http://cbi-society.org/home/documents/seminar/2009to12/CBI_Ikeda_511_d.pdf[慶応大学池田先生のChEMBLに関する資料]を参照してください。
 17 | ////
 18 | link:https://www.ebi.ac.uk/chembl/[ChEMBL] is a manually curated database of ADMET, physchem and bioactive molecules with drug like properties. The data is mostly curated  from medicinal chemistry journals and updated every 3-4 months.
 19 | 
 20 | The database is useful for drug discovery research because users can look up QSAR-related information and background knowledge in the original journal articles that the data came from.
 21 | 
 22 | NOTE: Originally, ChEMBL was a commercial database named link:http://chembl.blogspot.com/2009/11/faq-where-can-i-download-starlite.html[StARlite]. Details are described in this slide deck link:https://www.ebi.ac.uk/sites/ebi.ac.uk/files/content.ebi.ac.uk/materials/2012/121008_SME/chembl_-_anne_hersey.pdf[about ChEMBL].
 23 | 
 24 | === PubChem
 25 | 
 26 | ////
 27 | link:https://pubchem.ncbi.nlm.nih.gov/[PubChem]はNCBIにより維持管理されている低分子化合物とその生物学的活性データを収録している公開リポジトリです。5000万件以上の化合物情報と、100万件を超えるアッセイデータを含みそのデータ量の多さが特徴とも言えます。もうひとつの特徴はデータをアカデミアからの化合物登録やアッセイ結果の登録により成長することであり、ここが先のChEMBLとの大きな違いです。
 28 | 
 29 | 特にPubChemは初期スクリーニングのデータが多いため、そのようなデータに対しなんらかのマイニングや分析を行いたい場合は有用だと考えられます。
 30 | 
 31 | どちらを使うべき?::
 32 | QSARをやりたい場合にはやはりChEMBLのデータを利用することが多いです。IC50のようなデータが得られていることが多いですし、モデルの解釈に元論文をあたることができるというのが大きな理由です。
 33 | ////
 34 | link:https://pubchem.ncbi.nlm.nih.gov/[PubChem] is an open chemistry database of molecules and their biological activities, maintained by NCBI. It has data on more than 50 million compounds and more than 1 million biological assays. This large amount of data is one of the main features of PubChem. Another feature is that the database grows through compound and assay data registered by academia, which is the biggest difference from ChEMBL.
 35 | You can check more details of the data source from current link:https://pubchem.ncbi.nlm.nih.gov/sources/[URL].
 36 | 
 37 | In particular, PubChem has a large amount of early-stage screening data, so it will be useful when you would like to analyze or mine such data.
 38 | 
 39 | Which database should I use, ChEMBL or PubChem?::
 40 | We think ChEMBL is preferred for QSAR analysis because ChEMBL provides many data such as IC50 and users can access the original journal articles for QSAR model interpretation.
 41 | 
 42 | === Search Data which you want in ChEMBL
 43 | 
 44 | ////
 45 | NOTE: ChEMBLはユーザーインターフェースを刷新中で現在beta版のテストを行っていますが、いずれこちらに置き換わると思うので新バージョンのインターフェースでの検索方法を紹介します。
 46 | 
 47 | まずはlink:https://www.ebi.ac.uk/chembl/[ChEMBL]にアクセスし、画面上部のCheck out our New Interface (Beta). というリンクをクリックして新しいインターフェース画面に移行します。
 48 | 
 49 | image::ch04/chembl01.png[ChEMBL]
 50 | 
 51 | ChEMBLのデータは主に4つのカテゴリに分かれていて、一意なIDが振られており相互に関連付けされています。それぞれのカテゴリについて簡単に説明すると
 52 | 
 53 | Targets::
 54 | ターゲット分子についてその分子を対象としてアッセイされた論文に関してまとめられており、どういったジャーナルに投稿されているかや、どの年に投稿されたのかといった情報がまとめられています。また、アッセイに関しても同様にまとめられています。
 55 | Compounds::
 56 | 化合物に関する基本的な物理量（分子量など）のほか、Rule of 5を満たしているかといった分子の特性情報や、臨床情報などの創薬関連情報のほか、ChEMBLでの関連アッセイ、関連論文のサマリがまとめられています。
 57 | Assays::
 58 | アッセイに関する情報と元論文との関連付けがされているほか、アッセイに供された化合物データへのリンクが貼られています。
 59 | Documents::
 60 | 論文のタイトル、ジャーナル名、アブストラクトの他に関連論文データへのリンクと、その論文中で行われたアッセイへのリンクと使われた化合物データへのリンクが貼られています。
 61 | ////
 62 | NOTE: The user interface of ChEMBL is being refreshed, and a beta version is currently being tested. This section describes how to search data with the new UI because it will become the main interface in the near future.
 63 | 
 64 | At first, go to link:https://www.ebi.ac.uk/chembl/[ChEMBL] and click the link 'Check out our New Interface (Beta)' on the top of the screen. Then you can move to new search page.
 65 | 
 66 | image::ch04/chembl01.png[ChEMBL]
 67 | 
 68 | ChEMBL data falls mainly into 4 categories; each entry has a unique ID and is linked to entries in the other categories. Brief introductions are below.
 69 | 
 70 | Targets::
 71 | This category holds, for each target molecule, information on the assays run against it and the journals in which they were reported.
 72 | Compounds::
 73 | This category holds basic physicochemical properties of molecules, such as molecular weight and whether the molecule passes Lipinski's Rule of 5. It also holds other information about the molecule, such as clinical information, related assays stored in ChEMBL, and a summary of related journal articles.
 74 | Assays::
 75 | This category links assay information to the original journal article and to the compounds that were assayed.
 76 | Documents::
 77 | This category holds the journal name, title, and abstract, along with links to related journal articles and to the data of the compounds used in the article.
 78 | 
 79 | 
 80 | ==== If you want to find compounds which are related to a specific target
 81 | 
 82 | ////
 83 | ある創薬ターゲット分子がどのくらい研究開発されているかを知るために、それをターゲットとしてどのくらいの化合物が合成されたのか？さらに骨格のバリエーションはどのくらい存在するのかを調べたい場合がよくあります。ChEMBLを利用するとターゲット名で探索して関連化合物をダウンロードすることができます。
 84 | 
 85 | ここでは抗がん剤のターゲットとして知られているTopoisomerase2を検索します。画面上部のフォームにtopoisomeraseと入力して検索するとスクリーンショットのように表示されるはずです。
 86 | ////
 87 | We often want to know how much a target has been studied, how many compounds have been synthesized for it, and what kinds of scaffolds exist. With ChEMBL, you can search by target name and download the related compounds.
 88 | 
 89 | In this section, let's search for Topoisomerase2, which is a well-known target of cancer chemotherapy. When you type the word **topoisomerase** into the form at the top of the screen and search, you will see a result like the one below.
 90 | 
 91 | image::ch04/chembl02.png[ChEMBL]
 92 | 
 93 | ////
 94 | サジェスト機能による絞り込みでいくつか候補をリスト表示してくるのでTOP2Bを選んでください。画面をスクロールするとAssociated Compoundsセクションがありますのでグラフのタイトル(Associated Compounds for Target CHEMBL3396)をクリックすると関連化合物一覧画面が開きます。
 95 | ////
 96 | The suggest feature narrows the search down and lists several candidates, so select TOP2B. When you scroll down the screen you will find the 'Associated Compounds' section; click the graph title **Associated Compounds for Target CHEMBL3396** and the list of related compounds will appear.
 97 | 
 98 | image::ch04/chembl03.png[ChEMBL]
 99 | 
100 | ////
101 | 259化合物存在することがわかります。スクロールすると全体をみることができます。画面右のアイコンをクリックするとそれぞれCSV(カンマ区切りテキスト),TSV(タブ区切りテキスト),SDF(5章で説明しています)の形式でダウンロードできます。
102 | ////
103 | There are 259 compounds in the result. You can see all of the data by scrolling the screen. The data can be downloaded in CSV, TSV, or SDF format by clicking the icons located at the top right side of the screen.
104 | TIPS:: TSV means tab separated values, CSV means comma separated values
105 | 
106 | image::ch04/chembl04.png[ChEMBL]
107 | 
108 | ==== If you want to retrieve compound structures and assay data from ChEMBL
109 | 
110 | ////
111 | QSARモデルを作る場合、アッセイの活性値と対応する化合物の構造情報が必要です。ChEMBLの場合アッセイのページからダウンロードすることでQSARモデル作成のためのデータを得ることができます。
112 | 
113 | 大体次のような手順を辿ることがおおいです。
114 | 
115 | - 論文データを検索してからそれに関連付けられているアッセイデータを辿る
116 | - ターゲットを検索してそれに紐付いているアッセイデータからQSARに使えそうなものを選ぶ
117 | 
118 | ここでは後者のターゲットから検索してQSARモデルに使えそうなアッセイデータを探します。心毒性関連ターゲットとしてよく知られているhERGのQSARモデルを作りたいという状況を想定しています。
119 | 
120 | 検索フォームにhERGと入力して、Search hERG for all in Assaysを選びます。361件ヒットしました。
121 | 
122 | image::ch04/chembl05.png[ChEMBL]
123 | 
124 | モデル構築のためのデータが欲しいのでデータ数が多い順に並べ替えます。ヘッダーのCompoundsをクリックして降順に並べ替えます。
125 | 
126 | image::ch04/chembl06.png[ChEMBL]
127 | 
128 | 論文由来で最もアッセイ数の多いCHEMBL829152を選んでクリックしてアッセイページを開きます。Activity chartの円グラフをクリックすると詳細画面が開くのでSelect allで全選択してTSV形式でダウンロードします。
129 | 
130 | image::ch04/chembl07.png[ChEMBL]
131 | ////
132 | Building a QSAR model requires both the assay activity values and the structures of the corresponding compounds. In ChEMBL you can download this data from the **Assay** page.
133 | 
134 | You can follow the steps outlined below.
135 | 
136 | - Search for journal articles and then retrieve the assay data related to them.
137 | - Search for the target of interest and pick assay data suitable for QSAR from the assays linked to it.
138 | 
139 | In this section, let's try the second approach and retrieve data starting from the target. Suppose that we would like to build a QSAR model for hERG inhibition. hERG (the Kv11.1 channel) is best known for its contribution to the electrical activity of the heart, so hERG blockers carry a risk of cardiotoxicity.
140 | 
141 | Enter **hERG** in the search form and choose **Search hERG for all in Assays**. You will get 361 or more hits.
142 | 
143 | image::ch04/chembl05.png[ChEMBL]
144 | 
145 | Because we want data for model building, sort the assays by the number of compounds in descending order: click **Compounds** in the table header.
146 | 
147 | image::ch04/chembl06.png[ChEMBL]
148 | 
149 | Click CHEMBL829152, which has the most data among the literature-derived assays, to open its assay page. Click the pie chart in the Activity chart section to open the detail view, then choose Select all and download the data in TSV format.
150 | 
151 | image::ch04/chembl07.png[ChEMBL]
152 | 
153 | NOTE::
154 | ****
155 | The data might look garbled when you open it in a text editor, like \^@C^@h\^@E^@M\^@B^@L^@. This is because the file is encoded as UTF-16LE (an encoding that Excel prefers).
156 | 
157 | If you are using vi, you can fix the issue by just typing ':e ++enc=utf16le'.
158 | ****
159 | 
160 | === Other useful databases
161 | 
162 | ==== link:http://zinc15.docking.org/[ZINC]
163 | 
164 | ////
165 | ZINCは購入可能な試薬をコレクションしたデータベースです。現在のバージョンは15で約7億5000万の構造が収載されています。
166 | もともとがドッキングシミュレーションでの利用を想定して開発されているため、三次元化したデータをダウンロードすることも可能です。ZINCのデータでバーチャルスクリーニング(6章で説明します)を行い、ヒットした化合物を購入し実際のアッセイに供するというのが主な使い方だと思います。
167 | 
168 | データのダウンロード方法は上部のTranchesタブをクリックすると次の画面に縦軸にLogP横軸に分子量の大きさで分類されそれぞれの区画にいくつの化合物が収載されているかの表が表示されます。
169 | ////
170 | ZINC is a database of commercially available compounds. The current version is 15, and it contains about 750 million structures.
171 | Because the database was originally developed for docking simulations, you can also download 3D structures. A typical use is to run a virtual screening (explained in chapter 6) against ZINC, purchase the hit compounds and test them in a real assay.
172 | 
173 | How to download data?
174 | Click the Tranches tab. The next screen shows a table whose vertical axis is LogP and whose horizontal axis is molecular weight; each cell shows how many compounds fall into that range.
175 | 
176 | image::ch04/zinc01.png[ZINC]
177 | 
178 | ////
179 | ここから必要なデータセットを選んでダウンロードボタンを押すと、実際にデータセットのURLが列挙されたテキストファイルが得られますのでそれぞれにアクセスしてデータをダウンロードします。
180 | ////
181 | Select the datasets you need and click the download button to get a text file listing the URLs of the datasets. Access each URL to download the data.
182 | 
183 | ==== link:http://togotv.dbcls.jp/[統合TV:Togo TV]
184 | 
185 | ////
186 | 統合TVは生命科学分野の有用なデータベースやツールの使い方を動画で紹介するサイトで、link:https://dbcls.rois.ac.jp/[ライフサイエンス統合データベースセンター(DBCLS)]により管理、運用されています。その名の通りバイオインフォマティクス関連の動画が多いですが、ケモインフォマティクスを紹介した動画もいくつかありますので参考にしてみてください。link:http://togotv.dbcls.jp/information.html[文献・辞書・プログラミング]のカテゴリも役に立つはずです。
187 | ////
188 | Togo TV is a video site that introduces useful databases and tools for life science; it is managed and maintained by the link:https://dbcls.rois.ac.jp/[Database Center for Life Science (DBCLS)]. As its name suggests, most videos are about bioinformatics, but there are also some chemoinformatics videos, so please refer to the site. The link:http://togotv.dbcls.jp/information.html[literature, dictionary and programming] category should also be useful.
189 | **Language of TogoTV is Japanese**
190 | 
191 | - link:https://doi.org/10.7875/togotv.2017.121[PubChemを利用して化学物質やアッセイの結果を調べる 2017/Search compound and assay data by using PubChem 2017]
192 | - link:https://doi.org/10.7875/togotv.2014.014[ChEMBLを使って医薬品候補となる化合物について調べる/Search drug candidate compounds with ChEMBL]
193 | 
194 | ////
195 | NOTE:: これ以外にもケモインフォマティクスに有用なデータベースがあればお知らせください。IssueやPRでも受け付けてます。
196 | ////
197 | NOTE:: If you know of other useful databases for chemoinformatics, please let us know. Issues and pull requests are also welcome.
198 | 
199 | <<<
200 | 


--------------------------------------------------------------------------------
/ch05_rdkit.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 5: Handling Structural Information with RDKit
  2 | :imagesdir: images
  3 | 
  4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch05_rdkit.ipynb"]
  5 | 
  6 | ////
  7 | この章ではRDKitを使って分子の読み込みの基本を覚えます。
  8 | ////
  9 | In this chapter we will learn the basics of reading molecules with RDKit.
 10 | 
 11 | === What is SMILES?
 12 | 
 13 | ////
 14 | Simplified molecular input line entry system(SMILES)とは化学構造を文字列で表現するための表記方法です。
 15 | 詳しくはlink:http://www.daylight.com/meetings/summerschool98/course/dave/smiles-intro.html#TOC[SMILES Tutorial]で説明されていますが、例えばc1ccccc1は6つの芳香族炭素が最初と最後をつないでループになっている構造、つまりベンゼンを表現していることになります。
 16 | ////
 17 | Simplified molecular input line entry system (SMILES) is a line notation that describes the structure of a chemical species with a short ASCII string. More details are given in the link:http://www.daylight.com/meetings/summerschool98/course/dave/smiles-intro.html#TOC[SMILES Tutorial]. For example, **c1ccccc1** describes six aromatic carbon atoms whose first and last atoms are joined into a ring; in other words, benzene.
 18 | 
 19 | === Let's draw a chemical structure from SMILES :)
 20 | 
 21 | ////
 22 | SMILESで分子を表現することがわかったので、SMILESを読み込んで分子を描画させてみましょう。まずはRDKitのライブラリからChemクラスを読み込みます。二行目はJupyter Notebook上で構造を描画するための設定です。
 23 | ////
 24 | Now that we know SMILES can represent molecules, let's read a SMILES string and draw the molecule. First, import the Chem module from RDKit. The second line imports 'IPythonConsole', which enables drawing molecules in a Jupyter Notebook.
 25 |  **The majority of the basic molecular functionality is found in module rdkit.Chem**
 26 | 
 27 | [source, python]
 28 | ----
 29 | from rdkit import Chem
 30 | from rdkit.Chem.Draw import IPythonConsole
 31 | from rdkit.Chem import Draw
 32 | ----
 33 | 
 34 | ////
 35 | RDKitにはSMILES文字列を読み込むためにMolFromSmilesというメソッドが用意されていますので、これを使い分子を読み込みます。
 36 | ////
 37 | RDKit provides the MolFromSmiles function for reading SMILES. It builds an RDKit mol object from a SMILES string, as shown below.
 38 | 
 39 | [source, python]
 40 | ----
 41 | mol = Chem.MolFromSmiles("c1ccccc1")
 42 | ----
 43 | 
 44 | ////
 45 | 続いて構造を描画しますが、単純にmolを評価するだけで構造が表示されます。
 46 | ////
 47 | Next, we draw the molecular structure. It is very simple: just evaluate the mol object.
 48 | 
 49 | [source, python]
 50 | ----
 51 | mol
 52 | ----
 53 | 
 54 | ////
 55 | 図のように構造が表示されているはずです。
 56 | ////
 57 | Molecular structure will be drawn like following figure.
 58 | 
 59 | image::ch05/ch05_01.png[Depict benzene]
 60 | 
 61 | ////
 62 | 上のように原子を線でつなぎ構造を表現する方法（構造式）と、SMILES表記はどちらも同じものを表現しています。構造式は人が見てわかりやすいですが、SMILESはASCII文字列で表現されるのでより少ないデータ量で表現できるというメリットがあります。
 63 | 
 64 | NOTE: 文字列で表現できるということは、文字列生成アルゴリズムを応用することで新規な化学構造を生成することも可能ということです。この内容に関しては12章で詳しく説明します。
 65 | ////
 66 | The 2D structure above, which connects atoms with bonds, and the SMILES notation both represent the same molecule. A 2D structure is easier for people to understand, whereas SMILES, being an ASCII string, can store a molecule in a smaller amount of data.
 67 | 
 68 | === How to handle multiple molecules at once？
 69 | 
 70 | ////
 71 | 複数の化合物を一つのファイルに格納する方法にはいくつかありますが、sdfというファイル形式を利用するのが一般的です。
 72 | 
 73 | .sdfフォーマットとは？
 74 | ****
 75 | MDL社で開発された分子表現のためのフォーマットにMOL形式というものがあります。このMOL形式を拡張したものがSDF形式です。具体的にはMOL形式で表現されたものを"$$$$"という行で区切ることにより、複数の分子を取り扱えるようにしてあります。
 76 | 
 77 | MOL形式は分子の三次元座標を格納することができ二次元だけでなく立体構造を表現できる点はSMILESとの大きな違いです。
 78 | ****
 79 | ////
 80 | There are several ways to store multiple molecules in a file but SDF format file is common.
 81 | 
 82 | .What's sdf format?
 83 | ****
 84 | The MOL format is a molecular file format developed by MDL, and the SDF format is an extension of it. Specifically, records written in MOL format are separated by lines consisting of four dollar signs ($$$$), so that one file can hold multiple molecules. Another feature of the SDF format is its ability to include associated data.
 85 | 
 86 | A major difference between the MOL and SMILES formats is that the MOL format can store the 3D coordinates of a molecule, so it can describe not only the 2D structure but also the 3D structure.
 87 | ****
 88 | 
 89 | ==== Download sdf file from ChEMBL
 90 | 
 91 | ////
 92 | 4章を参考にlink:https://www.ebi.ac.uk/chembl/beta/[ChEMBL]のトポイソメラーゼII阻害試験(CHEMBL669726)の構造データをsdfファイル形式でダウンロードします。
 93 | 
 94 | NOTE::
 95 | ****
 96 | 具体的な手順はリンクのページを開いて、検索フォームにCHEMBL669726を入力すると検索結果が表示されるので、Compoundsタブをクリックします。その後、全選択してSDFでダウンロードするとgzip圧縮されたsdfがダウンロードされるので、gunzipコマンドまたは適当な解凍ソフトで解凍してください。それをch05_compounds.sdfという名前で保存します。
 97 | ****
 98 | ////
 99 | Following chapter 4, download the structure data of the Topoisomerase II inhibition assay (CHEMBL669726) from link:https://www.ebi.ac.uk/chembl/beta/[ChEMBL] in sdf format.
100 | 
101 | NOTE::
102 | ****
103 | Specifically, open the linked page and enter 'CHEMBL669726' in the search form to display the search results. Then click the Compounds tab, select all and download as SDF. The file is downloaded in gzip-compressed form, so extract it with the gunzip command or any suitable tool and save it as ch05_compounds.sdf.
104 | ****
105 | 
106 | ==== Handling sdf with RDKit
107 | 
108 | ////
109 | RDKitでsdfファイルを読み込むにはSDMolSupplierというメソッドを利用します。複数の化合物を取り扱うことになるのでmolではなくmolsという変数に格納していることに注意してください。どういう変数を使うかの決まりはありませんが、見てわかりやすい変数名をつけることで余計なミスを減らすことは心がけるとよいでしょう。
110 | ////
111 | Use SDMolSupplier to read an sdf file with RDKit. Note that we store the result in a variable named mols rather than mol because it holds multiple molecules. There is no rule for naming variables, but names that are easy to understand help to avoid unnecessary mistakes.
112 | 
113 | [source, python]
114 | ----
115 | mols = Chem.SDMolSupplier("ch05_compounds.sdf")
116 | ----
117 | 
118 | ////
119 | 何件の分子が読み込まれたのか確認します。数を数えるにはlenを使います。
120 | ////
121 | Check how many compounds were read. Use len to count them.
122 | 
123 | [source, python]
124 | ----
125 | len(mols)
126 | ----
127 | 
128 | ////
129 | 34件でした。
130 | ////
131 | A total of 34 molecules were read.
132 | 
133 | ==== Draw molecular structures
134 | 
135 | ////
136 | forループを使って、ひとつずつ分子を描画してもいいですが、RDKitには複数の分子を一度に並べて描画するメソッドが用意されているので、今回はそちらのMolsToGridImageメソッドを使います。なお一行に並べる分子の数を変更するにはmolsPerRowオプションで指定します
137 | ////
138 | You could draw the molecules one by one in a for loop, but RDKit provides a function that draws multiple molecules in a grid at once, so this time we use MolsToGridImage. The number of molecules per row can be changed with the molsPerRow option.
139 | 
140 | [source, python]
141 | ----
142 | Draw.MolsToGridImage(mols)
143 | ----
144 | 
145 | image::ch05/ch05_04.png[MolsToGridImage]
146 | 
147 | ===== (bonus)
148 | ////
149 | 参考までにループを回すやりかたも載せておきます。
150 | ////
151 | For reference, the following code draws the molecules one by one in a loop.
152 | 
153 | [source, python]
154 | ----
155 | from IPython.core.display import display
156 | for mol in mols:
157 |     display(mol)
158 | ----
159 | 
160 | === Let's try to do hetero shuffling
161 | 
162 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch05_hetero_shuffle.ipynb"]
163 | 
164 | ////
165 | 創薬の化合物最適化ブロジェクトで、分子の形を変更しないで化合物の特性を変えたいということがあります。このような場合、芳香環を形成する炭素、窒素、硫黄、酸素などの原子種を入れ替えることでより良い特性の化合物が得られることがありますがこのようにヘテロ原子(水素以外の原子)を入れ替えるアプローチをヘテロシャッフリングといいます。
166 | 
167 | ヘテロシャッフリングを行うことで、活性を維持したまま物性を変化させて動態を良くする、活性そのものを向上させる、特許クレームの回避といった効果が期待できます。
168 | 
169 | 少しの構造の違いが選択性や薬物動態が影響を与える有名な例として、Pfizer社のlink:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL192/[Sildenafil]とGSK社のlink:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL1520/[Vardinafil]が挙げられます。
170 | 
171 | 二つの構造を比較すると中心の環構造部分の窒素原子の並びが異なっているだけで極めて似ています。両分子は同じ標的蛋白質を阻害しますが、そのlink:https://www.nature.com/articles/3901525[活性や薬物動態]は異なります。
172 | 
173 | image::ch05/ch05_08.png[check structures]
174 | 
175 | 上記の画像を生成するコードを示します。単にDraw.MolsToGridImageを適用するのではなく
176 | Core構造をベースにアライメントしていることとDraw.MolToGridImageのオプションにlegendsを与え、分子名を表示していることに注意してください。
177 | ////
178 | At the lead optimization stage of drug discovery, researchers often want to improve the properties of a molecule without changing its shape. In such cases, medicinal chemists exchange the atoms that form the aromatic rings, such as carbon, nitrogen, sulphur and oxygen, which sometimes yields compounds with a better profile. This approach of exchanging ring atoms, including heteroatoms (atoms other than carbon and hydrogen), is called hetero shuffling.
179 | 
180 | Hetero shuffling is expected to improve physicochemical properties (and thus pharmacokinetics) while keeping potency, to improve potency itself, or to get around existing patent claims.
181 | 
182 | Pfizer's link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL192/[Sildenafil] and GSK's link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL1520/[Vardenafil] are well-known examples of how small structural differences can affect selectivity and pharmacokinetics.
183 | 
184 | The two structures are very similar; only the arrangement of the nitrogen atoms in the central ring system differs. Both molecules inhibit the same target protein, but link:https://www.nature.com/articles/3901525[their biological activities and pharmacokinetics] are different.
185 | 
186 | image::ch05/ch05_08.png[check structures]
187 | 
188 | The following code generates the image above. Note that it does not simply call Draw.MolsToGridImage: it first aligns both molecules to their common core structure, and it passes the legends option to show the molecule names.
189 | 
190 | [source, python]
191 | ----
192 | from rdkit import Chem
193 | from rdkit.Chem import AllChem
194 | from rdkit.Chem.Draw import IPythonConsole
195 | from rdkit.Chem import Draw
196 | from rdkit.Chem import rdDepictor
197 | from rdkit.Chem import rdFMCS
198 | from rdkit.Chem import TemplateAlign
199 | IPythonConsole.ipython_useSVG = True  # render molecules as SVG in the notebook
200 | rdDepictor.SetPreferCoordGen(True)  # prefer CoordGen when computing 2D coordinates
201 | 
202 | sildenafil = Chem.MolFromSmiles('CCCC1=NN(C)C2=C1NC(=NC2=O)C1=C(OCC)C=CC(=C1)S(=O)(=O)N1CCN(C)CC1')
203 | vardenafil = Chem.MolFromSmiles('CCCC1=NC(C)=C2N1NC(=NC2=O)C1=C(OCC)C=CC(=C1)S(=O)(=O)N1CCN(CC)CC1')
204 | rdDepictor.Compute2DCoords(sildenafil)
205 | rdDepictor.Compute2DCoords(vardenafil)
206 | res = rdFMCS.FindMCS([sildenafil, vardenafil], completeRingsOnly=True, atomCompare=rdFMCS.AtomCompare.CompareAny)  # maximum common substructure; rings kept whole, any atom type may match
207 | MCS = Chem.MolFromSmarts(res.smartsString)  # the common core as a query molecule
208 | rdDepictor.Compute2DCoords(MCS)
209 | # Align both molecules to the common core so that they are drawn in the same orientation.
210 | TemplateAlign.AlignMolToTemplate2D(sildenafil, MCS)
211 | TemplateAlign.AlignMolToTemplate2D(vardenafil, MCS)
212 | Draw.MolsToGridImage([sildenafil, vardenafil], legends=['sildenafil', 'vardenafil'])
213 | ----
214 | 
215 | ////
216 | ヘテロシャッフルした分子を生成するためにHeteroShuffleというクラスを定義します。オブジェクトの生成にはシャッフルしたい分子と変換したい部分構造（Core）を与えます。クラス内のコードではまず、分子をCoreで切断し、Coreとそれ以外に分けます。CoreのAromatic原子で、置換基がついてない原子のみが置換候補になります。。シャッフル後のCoreとCore以外のパーツを再結合するための反応オブジェクトを生成するメソッドがmake_connectorです。このメソッドで作られた反応オブジェクトを利用してre_construct_molで分子を再構築しています。
217 | 
218 | 考えられる原子の組み合わせを構築するために、itertools.productに、候補原子（C, S, N, O）の原子番号と、環を構成する原子数target_atomic_numsを与えます。その後に分子として生成できないものは排除するのでここでは考えられる全部の組み合わせを出します。
219 | ////
220 | The HeteroShuffle class is defined to generate hetero-shuffled molecules. To create an object, pass the molecule to shuffle and the substructure (core) to be modified. The class first cuts the molecule at the core, separating it into the core and the rest. Only aromatic atoms of the core that carry no substituent are candidates for replacement. The make_connectors method generates the reaction objects used to rejoin the shuffled core with the other parts, and re_construct_mol uses those reaction objects to rebuild the molecule.
221 | 
222 | To build the possible combinations of atoms, the code passes the atomic numbers of the candidate atoms (C, N, O, S) in target_atomic_nums to itertools.product, together with the number of atoms to replace. All combinations are enumerated first, and those that cannot form a valid molecule are removed afterwards.
223 | 
224 | [source, python]
225 | ----
226 | class HeteroShuffle():
227 |     
228 |     def __init__(self, mol, query):
229 |         self.mol = mol
230 |         self.query = query
231 |         self.subs = Chem.ReplaceCore(self.mol, self.query)
232 |         self.core = Chem.ReplaceSidechains(self.mol, self.query)
233 |         self.target_atomic_nums = [6, 7, 8, 16]
234 |     
235 |     
236 |     def make_connectors(self):
237 |         n = len(Chem.MolToSmiles(self.subs).split('.'))
238 |         map_no = n+1
239 |         self.rxn_dict = {}
240 |         for i in range(n):
241 |             self.rxn_dict[i+1] = AllChem.ReactionFromSmarts('[{0}*][*:{1}].[{0}*][*:{2}]>>[*:{1}][*:{2}]'.format(i+1, map_no, map_no+1))
242 |         return self.rxn_dict
243 | 
244 |     def re_construct_mol(self, core):
245 |         '''
246 |         re construct mols from given substructures and core
247 |         '''
248 |         keys = self.rxn_dict.keys()
249 |         ps = [[core]]
250 |         for key in keys:
251 |             ps = self.rxn_dict[key].RunReactants([ps[0][0], self.subs])
252 |         mol = ps[0][0]
253 |         try:
254 |             smi = Chem.MolToSmiles(mol)
255 |             mol = Chem.MolFromSmiles(smi)
256 |             Chem.SanitizeMol(mol)
257 |             return mol
258 |         except:
259 |             return None
260 | 
261 |     def get_target_atoms(self):
262 |         '''
263 |         get target atoms for replace
264 |         target atoms means atoms which don't have anyatom(*) in neighbors
265 |         '''
266 |         atoms = []
267 |         for atom in self.core.GetAromaticAtoms():
268 |             neighbors = [a.GetSymbol() for a in atom.GetNeighbors()]
269 |             if '*' not in neighbors and atom.GetSymbol() !='*':
270 |                 atoms.append(atom)
271 |         print(len(atoms))
272 |         return atoms
273 |     
274 |     def generate_mols(self):
275 |         atoms = self.get_target_atoms()
276 |         idxs = [atom.GetIdx() for atom in atoms]
277 |         combinations = itertools.product(self.target_atomic_nums, repeat=len(idxs))
278 |         smiles_set = set()
279 |         self.make_connectors()
280 |         for combination in combinations:
281 |             target = copy.deepcopy(self.core)
282 |             #print(Chem.MolToSmiles(target))
283 |             for i, idx in enumerate(idxs):
284 |                 target.GetAtomWithIdx(idx).SetAtomicNum(combination[i])
285 |             smi = Chem.MolToSmiles(target)
286 |             #smi = smi.replace('sH','s').replace('oH','o').replace('cH3','c')
287 |             #print('rep '+smi)
288 |             target = Chem.MolFromSmiles(smi)
289 |             if target != None:
290 |                 n_attachment = len([atom for atom in target.GetAtoms() if atom.GetAtomicNum() == 0])
291 |                 n_aromatic_atoms = len(list(target.GetAromaticAtoms()))
292 |                 if target.GetNumAtoms() - n_attachment == n_aromatic_atoms:
293 |                     try:
294 |                         mol = self.re_construct_mol(target)  
295 |                         if checkmol(mol):
296 |                             smiles_set.add(Chem.MolToSmiles(mol))
297 |                     except:
298 |                         pass
299 |         mols = [Chem.MolFromSmiles(smi) for smi in smiles_set]
300 |         return mols
301 | ----
302 | 
303 | ////
304 | 上のコードで使われているcheckmolという関数はc1coooo1のような６員環の構造もAromaticだと判定されてしまうのでそれを避けるために使っています。O, Sが許容されるのは５員環のヘテロ芳香環のみにしました。
305 | ////
306 | The checkmol function used in the code above is needed because six-membered rings such as c1coooo1 would otherwise also be judged aromatic. Aromatic O and S atoms are accepted only in five-membered heteroaromatic rings.
307 | 
308 | [source, python]
309 | ----
310 | def checkmol(mol):
311 |     arom_atoms = mol.GetAromaticAtoms()
312 |     symbols = [atom.GetSymbol() for atom in arom_atoms if not atom.IsInRingSize(5)]
313 |     if symbols == []:
314 |         return True
315 |     elif 'O' in symbols or 'S' in symbols:
316 |         return False
317 |     else:
318 |         return True
319 | ----
320 | 
321 | ////
322 | 実際に使ってみます。
323 | ////
324 | Use the function.
325 | 
326 | [source, python]
327 | ----
328 | # Gefitinib
329 | mol1 = Chem.MolFromSmiles('COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4')
330 | core1 = Chem.MolFromSmiles('c1ccc2c(c1)cncn2')
331 | #  Oxaprozin
332 | mol2 = Chem.MolFromSmiles('OC(=O)CCC1=NC(=C(O1)C1=CC=CC=C1)C1=CC=CC=C1')
333 | core2 =  Chem.MolFromSmiles('c1cnco1')
334 | ----
335 | 
336 | ////
337 | 元の分子
338 | ////
339 | Original molecule.
340 | 
341 | image::ch05/ch05_05.png[query]
342 | 
343 | [source, python]
344 | ----
345 | ht=HeteroSuffle(mol1, core1)
346 | res=ht.generate_mols()
347 | print(len(res))
348 | Draw.MolsToGridImage(res, molsPerRow=5)
349 | ----
350 | 
351 | ////
352 | Gefitinibを入力とした場合の変換結果の一部です。芳香環を形成する原子が元の化合物から変化した分子が出力されています。
353 | また、Coreで指定したキナゾリン部分のみが変換されています。
354 | ////
355 | The image shows part of the results obtained with Gefitinib as input. The generated molecules differ from the original in the atoms that form the aromatic ring. Note that only the quinazoline part, which was specified as the core, has been changed.
356 | 
357 | image::ch05/ch05_06.png[res1]
358 | 
359 | [source, python]
360 | ----
361 | ht=HeteroSuffle(mol2, core2)
362 | res=ht.generate_mols()
363 | print(len(res))
364 | Draw.MolsToGridImage(res, molsPerRow=5)
365 | ----
366 | 
367 | ////
368 | Oxaprozinを入力とした場合の変換結果です。こちらは中心に、link:https://en.wikipedia.org/wiki/Oxazole[オキサゾール]と呼ばれる5員環構造を有してます。５員環を形成する芳香環にはチオフェン、フランなどのように窒素や酸素を含むものもあります。以下の例でもS、Oが5員環の構成原子に含まれている分子が出力されています。
369 | ////
370 | This is the result obtained with Oxaprozin as input. This molecule has link:https://en.wikipedia.org/wiki/Oxazole[oxazole], a five-membered ring, as its core. Five-membered aromatic rings include heterocycles that contain sulphur or oxygen, such as thiophene and furan, and the output below also contains molecules with S or O in the five-membered ring.
371 | 
372 | image::ch05/ch05_07.png[res2]
373 | 
374 | ////
375 | どうでしょうか。二つの分子の例を示しました。一つ目の例、Gefitinibは、分子を構成する芳香環が、link:https://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%8A%E3%82%BE%E3%83%AA%E3%83%B3[キナゾリン]とベンゼンでした。キナゾリンは、ベンゼンとピリミジンという二つの６員環が縮環した構造です。６員環をベースに構成される芳香環を形成する原子の候補は炭素と窒素になります。（ピリリウムイオンなど電荷を持つものも考慮すれば酸素や硫黄も候補になりますが、通常このような構造をDrug Designで使うことは少ないので今回の説明からは外しています。link:https://ja.wikipedia.org/wiki/%E8%A4%87%E7%B4%A0%E7%92%B0%E5%BC%8F%E5%8C%96%E5%90%88%E7%89%A9[複素環式化合物の説明]）
376 | Oxaprozinはオキサゾールを有しています。５員環の芳香環を形成する原子の候補は炭素、窒素、硫黄、酸素が挙げられます。このような分子の場合の例として紹介しました。
377 | いずれのケースでも上記のコードでヘテロ原子がシャッフルされたものが生成されています
378 | ////
379 | What do you think? We showed two examples. In the first one, Gefitinib, the aromatic rings are quinazoline and benzene. Quinazoline is a fused ring system made of two six-membered rings, benzene and pyrimidine. The candidate atoms for six-membered aromatic rings are carbon and nitrogen. (Of course, if we consider charged species such as the pyrylium ion, oxygen and sulphur are also candidates, but such charged substructures are rarely used in drug design, so we omitted them here.)
380 | Oxaprozin has an oxazole ring. The candidate atoms for five-membered aromatic rings are carbon, nitrogen, sulphur and oxygen, and the second example was chosen to illustrate such a five-membered heteroaromatic ring.
381 | Hetero-shuffled molecules are generated in both cases.
382 | 
383 | ////
384 | .ヘテロシャッフリングについてもう少し詳しく
385 | ****
386 | link:https://pubs.acs.org/doi/10.1021/jm3001289[J. Med. Chem. 2012,  55, 11, 5151-5164]ではPIM-1キナーゼ阻害剤におけるNシャッフリングの効果をFragment Molecular Orbital法という量子化学的なアプローチを使って検証しています。さらにlink:https://pubs.acs.org/doi/10.1021/acs.jcim.8b00563[J. Chem. Inf. Model. 2019,  59, 1, 149-158]ではAsp–Arg塩橋とヘテロ環のスタッキングのメカニズムを量子化学計算により探っており、置換デザインの指標になりそうです。
387 | 
388 | また、バイオアベイラビリティ改善のためにヘテロシャッフリングを行った例としてはlink:https://dx.doi.org/10.1021/jm101027s[J. Med. Chem. 2011,  54, 8, 3076-3080]があります。
389 | ****
390 | ////
391 | .More about hetero shuffling
392 | ****
393 | The article link:https://pubs.acs.org/doi/10.1021/jm3001289[J. Med. Chem. 2012,  55, 11, 5151-5164] analyzes the effect of nitrogen shuffling in PIM-1 kinase inhibitors with the Fragment Molecular Orbital method, a quantum chemical approach. Another article, link:https://pubs.acs.org/doi/10.1021/acs.jcim.8b00563[J. Chem. Inf. Model. 2019,  59, 1, 149-158], investigates the mechanism of stacking between Asp-Arg salt bridges and heterocycles with quantum chemical calculations, which may serve as a guide for designing replacements.
394 | 
395 | Also, link:https://dx.doi.org/10.1021/jm101027s[J. Med. Chem. 2011,  54, 8, 3076-3080] is an example of hetero shuffling used to improve bioavailability.
396 | ****
397 | 
398 | <<<
399 | 


--------------------------------------------------------------------------------
/ch06_similarity.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 6: Try to evaluate the similarity of compounds
  2 | :imagesdir: images
  3 | 
  4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch06_similarity.ipynb"]
  5 | 
  6 | === What does it mean that compounds are similar？
  7 | 
  8 | Saying that two compounds "somehow look alike" is not scientific. In chemoinformatics, similarity or dissimilarity (distance) is used as a quantitative metric.
  9 | 
 10 | In this section, we will introduce two major metrics.
 11 | 
 12 | ==== Descriptor
 13 | 
 14 | ////
 15 | 分子の全体的な特徴を数値で表現するものを記述子と呼びます。分子量や極性表面性（PSA）、分配係数(logP)などがあり、現在までに多くの記述子が提案されています。これらの記述子の類似性を評価することで２つの分子がどのくらい似ているかを表現することが可能です。また分子全体の特徴を1つの数字で表現しており局所的な特徴ではないということに注意してください。
 16 | 
 17 | NOTE: いくつかの記述子に関しては市販ソフトでないと計算できない場合があります。
 18 | ////
 19 | 
 20 | A parameter that represents the overall characteristics of a molecule numerically is called a descriptor. Many descriptors are proposed so far, such as molecular weight, polar surface area (PSA) and partition coefficient (logP). It is possible to evaluate a similarity between two molecules with these descriptors. Please note that descriptor represents whole molecular feature as a numeric value and it is not a local feature.
 21 | 
 22 | NOTE: There are cases where commercial software is needed to calculate some descriptors.
 23 | 
 24 | ==== Fingerprint
 25 | 
 26 | A fingerprint is another kind of feature: a binary (0/1) representation of the partial structures of a molecule. Each bit is on (1) or off (0) according to the presence or absence of a partial structure, and the whole set of bits represents the characteristics of the molecule. There are two types of fingerprints, fixed-length FP and variable-length FP. Formerly, MACCS keys, a fixed-length FP whose partial structures and bit indices are determined in advance, were widely used, but now it is common to use a variable-length FP called ECFP4 (Morgan2).
 27 | 
 28 | For details of the fingerprints in RDKit, please read link:https://www.rdkit.org/UGM/2012/Landrum_RDKit_UGM.Fingerprints.Final.pptx.pdf[the slides by Greg Landrum, the developer of RDKit].
 29 | 
 30 | This time, let's evaluate similarity using ECFP4 (Morgan2).
 31 | 
 32 | .Difference between SMILES and fingerprint
 33 | ****
 34 | SMILES is an ASCII string representation of the structure, and a fingerprint is a binary representation of the presence or absence of substructures. The difference is that the former is a **structural representation**, while the latter is a **feature representation**.
 35 | Since a fingerprint records only the presence or absence of partial structures, information such as how the partial structures are connected and positioned relative to each other is lost, and the original structure cannot be restored from it.
 36 | 
 37 | Some people call it Bag-of-Fragments because it corresponds to Bag-of-Words often used in text-mining.
 38 | ****
 39 | 
 40 | === Let's calculate similarity
 41 | 
 42 | Let's evaluate the similarity of toluene and chlorobenzene as simple molecules.
 43 | 
 44 | [source, python]
 45 | ----
 46 | from rdkit import Chem, DataStructs
 47 | from rdkit.Chem import AllChem, Draw
 48 | from rdkit.Chem.Draw import IPythonConsole
 49 | ----
 50 | 
 51 | Read molecule from SMILES.
 52 | 
 53 | [source, python]
 54 | ----
 55 | mol1 = Chem.MolFromSmiles("Cc1ccccc1")
 56 | mol2 = Chem.MolFromSmiles("Clc1ccccc1")
 57 | ----
 58 | 
 59 | Confirm it by visual observation.
 60 | 
 61 | [source, python]
 62 | ----
 63 | Draw.MolsToGridImage([mol1, mol2])
 64 | ----
 65 | 
 66 | Generate radius 2 morgan fingerprint which corresponds to ECFP4.
 67 | 
 68 | [source, python]
 69 | ----
 70 | fp1 = AllChem.GetMorganFingerprint(mol1, 2)
 71 | fp2 = AllChem.GetMorganFingerprint(mol2, 2)
 72 | ----
 73 | 
 74 | Tanimoto coefficient is used for similarity evaluation.
 75 | 
 76 | [source, python]
 77 | ----
 78 | DataStructs.TanimotoSimilarity(fp1, fp2)
 79 | # 0.5384615384615384
 80 | ----
 81 | 
 82 | === Virtual screening 
 83 | 
 84 | So far we have described how to evaluate the similarity of compounds. Using this index of similarity to select a specific group of compounds from a large number of compounds is called virtual screening.
 85 | 
 86 | For example, when a drug-like compound is disclosed in a patent or a paper, or when a promising compound is found in an in-house assay, we want to know whether even more promising analogs exist in our company's compound library or in a database of commercially available compounds. Here, let's use link:http://zinc15.docking.org/[ZINC] to find out whether analogs of link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL466246/[Inavir], an influenza drug known as a neuraminidase inhibitor, can be purchased.
 87 | 
 88 | The molecular weight of Inavir is about 350 and its LogP is about -3, so we selected the tranche with molecular weight 350-375 and LogP -1 from ZINC. This tranche is split into 16 files, but we download and use only the first one.
 89 | 
 90 | NOTE: We described how to download the data in chapter 4.
 91 | 
 92 | In a Jupyter notebook, a line starting with ! is executed as a shell command. The following example downloads a ZINC data set with the wget command from a Jupyter notebook.
 93 | 
 94 | [source, python]
 95 | ----
 96 | !wget http://files.docking.org/2D/EA/EAED.smi
 97 | ----
 98 | 
 99 | Read the SMILES from the file and convert them to mol objects, skipping the first line because it is a header. The last character of each line is a newline character, so it is removed with l[:-1]. Finally, check how many compounds there are.
100 | 
101 | [source, python]
102 | ----
103 | mols = []
104 | with open("EAED.smi") as f:
105 |     f.readline()
106 |     for l in f:
107 |         mol = Chem.MolFromSmiles(l[:-1])
108 |         mols.append(mol)
109 | print(len(mols))
110 | # 195493
111 | ----
112 | 
113 | Next, prepare a function that calculates the similarity to Inavir (laninamivir).
114 | 
115 | [source, python]
116 | ----
117 | laninamivir = Chem.MolFromSmiles("CO[C@H]([C@H](O)CO)[C@@H]1OC(=C[C@H](NC(=N)N)[C@H]1NC(=O)C)C(=O)O")
118 | laninamivir_fp = AllChem.GetMorganFingerprint(laninamivir, 2)
119 | 
120 | def calc_laninamivir_similarity(mol):
121 |     fp = AllChem.GetMorganFingerprint(mol, 2)
122 |     sim = DataStructs.TanimotoSimilarity(laninamivir_fp, fp)
123 |     return sim
124 | ----
125 | 
126 | Check it.
127 | 
128 | [source, python]
129 | ----
130 | similar_mols =[]
131 | for mol in mols:
132 |     sim = calc_laninamivir_similarity(mol)
133 |     if sim > 0.2:
134 |         similar_mols.append((mol, sim))
135 | ----
136 | 
137 | Sort the results in descending order of similarity and retrieve only the first ten.
138 | 
139 | [source, python]
140 | ----
141 | similar_mols.sort(key=lambda x: x[1], reverse=True)
142 | mols = [l[0] for l in similar_mols[:10]]
143 | ----
144 | 
145 | Let's draw them.
146 | 
147 | [source, python]
148 | ----
149 | Draw.MolsToGridImage(mols, molsPerRow=5)
150 | ----
151 | 
152 | image::ch06/vs01.png[result]
153 | 
154 | As the similarity values show, among the roughly 200,000 compounds examined this time, the most similar compound has a similarity of only 23%. However, ZINC contains 750 million entries, so there should be many more similar compounds in it.
155 | 
156 | === Clustering
157 | 
158 | For example, when purchasing commercial compounds to build a library, we want as much diversity as possible, so we group similar compounds together and select a representative from each group so that the library is not biased toward similar compounds. To organize compounds by structural similarity in this way, use a method called clustering.
159 | 
160 | Here we cluster 5614 hits from link:https://www.ebi.ac.uk/chembl/beta/assay_report_card/CHEMBL1040694/[Novartis's antimalarial assay].
161 | 
162 | Import library for clustering and reading data.
163 | 
164 | [source, python]
165 | ----
166 | from rdkit.ML.Cluster import Butina
167 | mols = Chem.SDMolSupplier("ch06_nov_hts.sdf")
168 | ----
169 | 
170 | If RDKit cannot read a molecule for some reason, it returns None instead of a mol object. Passing None to the GetMorganFingerprintAsBitVect method results in an error, so we generate the fingerprints while excluding None.
171 | 
172 | [source, python]
173 | ----
174 | fps = []
175 | valid_mols = []
176 | 
177 | for mol in mols:
178 |     if mol is not None:
179 |         fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
180 |         fps.append(fp)
181 |         valid_mols.append(mol)
182 | ----
183 | 
184 | Generate a distance matrix (a lower triangular distance matrix) from the fingerprints.
185 | 
186 | [source, python]
187 | ----
188 | distance_matrix = []
189 | for i, fp in enumerate(fps):
190 |     similarities = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i+1])
191 |     distance_matrix.extend([1-sim for sim in similarities])
192 | ----
193 | 
194 | Cluster the compounds using the distance matrix. The third argument is the distance threshold. In this example, compounds within a distance of 0.2 of each other, i.e. with a similarity of 80% or more, are clustered together.
195 | 
196 | [source, python]
197 | ----
198 | clusters = Butina.ClusterData(distance_matrix, len(fps), 0.2, isDistData=True)
199 | ----
200 | 
201 | Check the number of clusters.
202 | 
203 | [source, python]
204 | ----
205 | len(clusters)
206 | #2492
207 | ----
208 | 
209 | Visualize structures of first cluster.
210 | 
211 | [source, python]
212 | ----
213 | mols_ =[valid_mols[i] for i in clusters[0]]
214 | Draw.MolsToGridImage(mols_, molsPerRow=5)
215 | ----
216 | 
217 | image::ch06/cls01.png[clustering result, width=600, pdfwidth=60%]
218 | 
219 | 
220 | In this case, clustering was performed using the library provided in RDKit, but some methods can be used with link:https://scikit-learn.org/stable/modules/clustering.html[Scikit-learn].  And in practice this method is often used.
221 | 
222 | === Structure Based Drug Design(SBDD)
223 | 
224 | Here we evaluate the similarity of link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL231779/[apixaban] and link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL198362/[rivaroxaban], which are marketed as anticoagulants.
225 | 
226 | [source, python]
227 | ----
228 | apx = Chem.MolFromSmiles("COc1ccc(cc1)n2nc(C(=O)N)c3CCN(C(=O)c23)c4ccc(cc4)N5CCCCC5=O")
229 | rvx = Chem.MolFromSmiles("Clc1ccc(s1)C(=O)NC[C@H]2CN(C(=O)O2)c3ccc(cc3)N4CCOCC4=O")
230 | ----
231 | 
232 | [source, python]
233 | ----
234 | Draw.MolsToGridImage([apx, rvx], legends=["apixaban", "rivaroxaban"])
235 | ----
236 | 
237 | image::ch06/apx_rvx.png[APX+RVX, width=600, pdfwidth=60%]
238 | 
239 | The structures are quite similar as you can see, but both of these two compounds are known to bind similarly to the same pocket of the serine protease FXa and to inhibit the function of the protein.
240 | 
241 | [source, python]
242 | ----
243 | apx_fp = AllChem.GetMorganFingerprint(apx, 2)
244 | rvx_fp = AllChem.GetMorganFingerprint(rvx, 2)
245 | 
246 | DataStructs.TanimotoSimilarity(apx_fp, rvx_fp)
247 | # 0.40625
248 | ----
249 | 
250 | It's about 40% similar.  In fact, the crystal structures of the complexes of both link:https://www.rcsb.org/structure/2P16[apixaban] and link:https://www.rcsb.org/structure/2W26[rivaroxaban] have been solved, and the figure below was drawn from them using link:https://pymol.org/2/[PyMOL].
251 | 
252 | NOTE: How to use PyMOL is beyond the scope of this document and is not explained here, but if you are interested, please refer to link:http://www.protein.osaka-u.ac.jp/rcsfp/supracryst/suzuki/jpxtal/Katsutani/index.php[here].
253 | 
254 | image::ch06/apx_rvx_suf.png[APX+RVX, width=600, pdfwidth=60%]
255 | 
256 | As you can see from the figure, apixaban and rivaroxaban overlap beautifully in three dimensions. In particular, the methoxyphenyl and chlorothiophene groups are located in a site called the S1 pocket and are said to have some kind of strong interaction. As the ligand binding sites (pockets) of proteins become clearer, it becomes easier for the medicinal chemist to develop a strategy for the next modification, and the success rate and progress rate of the project will increase.
257 | 
258 | An approach that optimizes the structure based on the shape of the protein determined by X-ray crystallography or cryo-electron microscopy is called Structure Based Drug Design (SBDD). Also, if you know the pocket, you can screen for compounds that physically bind to it, which is called structure-based virtual screening (SBVS). It is sometimes distinguished from the virtual screening based on ligand similarity that we performed earlier in this chapter, which is called ligand-based virtual screening (LBVS).
259 | 
260 | .History of Xa inhibitors and the importance of quantum chemistry calculation
261 | ****
262 | Although it is far removed from the chemoinformatics covered in this book, tracing the history of FXa inhibitors and understanding what improvements were made over the generations is very useful for molecular design. In addition, the S1 pocket interaction is very difficult to interpret visually or with classical mechanics and can only be interpreted by quantum chemical calculations such as the Fragment Molecular Orbital (FMO) method, so I think there is no doubt that quantum chemical calculation will become essential in future molecular design.
263 | ****
264 | 
265 | 
266 | <<<
267 | 


--------------------------------------------------------------------------------
/ch07_graph.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 7: Assessing similarity using graph structures
  2 | :imagesdir: images
  3 | 
  4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch07_MCS.ipynb"]
  5 | 
  6 | A graph is data consisting of nodes (vertices) and edges (branches) that indicate the connection between nodes. The chemical structure can be represented by this graph. In other words, we can represent atoms in a graph with nodes and bonds as edges.
  7 | 
  8 | In general, fingerprints like those introduced in Chapter 6 are often used to evaluate the similarity between molecules, but there is also a method to evaluate similarity using a graph structure. The MCS (Maximum Common Substructure) introduced next refers to the common substructure of the target molecule set. The more common substructures, the more similar their molecules are.
  9 | 
 10 | === Classification by major skeleton (MCS)
 11 | 
 12 | Maximum Common Substructure (MCS) is the largest common substructure in a given group of chemical structures. RDKit provides a module called rdFMCS for MCS search.
 13 | 
 14 | This time, we will use the file cdk2.sdf provided with RDKit as sample data for the MCS search. RDConfig.RDDocsDir is a variable that points to the directory containing the sample data, and the file cdk2.sdf is located under its Book/data/ directory, so we build the file path with os.path.join. Note that os.path.join is a function in Python's standard library that absorbs the differences in path notation between operating systems.
 15 | 
 16 | [source, python]
 17 | ----
 18 | import os
 19 | from rdkit import Chem
 20 | from rdkit.Chem import RDConfig
 21 | from rdkit.Chem import rdFMCS
 22 | from rdkit.Chem.Draw import IPythonConsole
 23 | from rdkit.Chem import Draw
 24 | filepath = os.path.join(RDConfig.RDDocsDir, 'Book', 'data', 'cdk2.sdf')
 25 | mols = [mol for mol in Chem.SDMolSupplier(filepath)]
 26 | # Check the structures
 27 | Draw.MolsToGridImage(mols[:7], molsPerRow=5)
 28 | ----
 29 | 
 30 | image::ch07/mcs01.png[compounds]
 31 | 
 32 | Next, we obtain the MCS of the loaded molecules. With RDKit, you can specify several options for how the MCS is determined. The following shows an example of each option.
 33 | 
 34 | 
 35 | 
 36 | 
 37 | . Default
 38 | . Any atom can be used (as long as the structure and bond orders match)
 39 | . The bond order may be any (for example, benzene and cyclohexane have the same MCS)
 40 | 
 41 | [source, python]
 42 | ----
 43 | result1 = rdFMCS.FindMCS(mols[:7])
 44 | mcs1 = Chem.MolFromSmarts　(result1.smartsString)
 45 | mcs1
 46 | print(result1.smartsString)
 47 | #[#6]1:[#7]:[#6](:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7]:2)-[#7]
 48 | ----
 49 | 
 50 | image::ch07/mcs02.png[MCS01]
 51 | 
 52 | [source, python]
 53 | ----
 54 | result2 = rdFMCS.FindMCS(mols[:7], atomCompare=rdFMCS.AtomCompare.CompareAny)
 55 | mcs2 = Chem.MolFromSmarts(result2.smartsString)
 56 | mcs2
 57 | print(result2.smartsString)
 58 | #[#6]-,:[#6]-,:[#6]-[#6]-[#8,#7]-[#6]1:[#7]:[#6](:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7]:2)-[#7]
 59 | ----
 60 | 
 61 | image::ch07/mcs03.png[MCS02]
 62 | 
 63 | [source, python]
 64 | ----
 65 | result3 = rdFMCS.FindMCS(mols[:7], bondCompare=rdFMCS.BondCompare.CompareAny)
 66 | mcs3 = Chem.MolFromSmarts(result3.smartsString)
 67 | mcs3
 68 | print(result3.smartsString)
 69 | #[#6]1:[#7]:[#6](:[#7]:[#6]2:[#6]:1:[#7]:[#6]:[#7]:2)-[#7]
 70 | ----
 71 | 
 72 | image::ch07/mcs04.png[MCS03]
 73 | 
 74 | In RDKit, Fraggle Similarity is implemented as an algorithm to quantify similarity based on MCS. By using this, clustering and analysis based on similarity can be performed.
 75 | 
 76 | [source, python]
 77 | ----
 78 | from rdkit.Chem.Fraggle import FraggleSim
 79 | sim, match = FraggleSim.GetFraggleSimilarity(mols[0], mols[1])
 80 | print(sim, match)
 81 | #0.925764192139738 *C(C)C.*COc1nc(N)nc2[nH]cnc12
 82 | match_st = Chem.MolFromSmiles(match)
 83 | match_st
 84 | ----
 85 | 
 86 | image::ch07/mcs05.png[FraggleSimilarity]
 87 | 
 88 | Thus, FraggleSimilarity returns similarities and matched substructures. It is often closer to the feeling of a chemist than the similarity using ECFP. Please refer to the reference link for details.
 89 | 
 90 | Reference link
 91 | 
 92 | - https://pubs.acs.org/doi/abs/10.1021/acs.jcim.5b00036[Efficient Heuristics for Maximum Common Substructure Search]
 93 | - https://raw.github.com/rdkit/UGM_2013/master/Presentations/Hussain.Fraggle.pdf[Fraggle – A new similarity searching algorithm]
 94 | 
 95 | === Matched Molecular Pair and Matched Molecular Series
 96 | 
 97 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch07_MMS.ipynb"]
 98 | 
 99 | At the structural optimization stage of drug discovery research, how to modify the starting compound (lead compound) is very important. As the stage progresses, it is also very important to carry out a retrospective analysis of which structural modifications affected the activity and physical properties, and how.
100 | 
101 | TIP: If you are interested, you may read link:https://sar.pharm.or.jp/wp-content/uploads/2018/09/SARNews_19.pdf[https://sar.pharm.or.jp/wp-content/uploads/2018/09/SARNews_19.pdf].
102 | 
103 | A Matched Molecular Pair (MMP) is a pair of molecules that differ only in one part of their structure and are otherwise identical. As an example, chlorobenzene and fluorobenzene are MMPs because they differ only in Cl and F groups. By analyzing a large number of changes in the characteristics of such pairs, you can grasp the trend of substituent conversion. This is called Matched Molecular Pair Analysis (MMPA). By performing MMPA on large-scale data, it is possible to extract universal rules for property changes caused by substituent changes. If you understand these rules, you will be able to proceed efficiently with structural optimization.
104 | 
105 | Here, we analyze MMP using link:https://github.com/rdkit/rdkit/tree/master/Contrib/mmpa[RDKit/Contrib/MMPA] provided in Contrib of RDKit.
106 | 
107 | Move to Contrib/mmpa under the RDKit installation location and execute the python script sequentially.
108 | 
109 | [source, bash]
110 | ----
111 | python rfrag.py <name of the File you want to implement the MMPA> #save file name of the data that was fragmented
112 | # For example
113 | # python rfrag.py <data/sample.smi >data/sample_fragmented.txt
114 | 
115 | python indexing.py <can be in the previous command Fragment file > MMP_ output file.CSV 
116 | # eg
117 | # python index.py <data/sample_fragmented.txt >data/mmp.csv
118 | ----
119 | 
120 | Executing the above command will generate a csv file of molecule A, molecule B, ID of molecule A, ID of molecule B, SMIRKS of converted structure, and common part structure (context). MMPA can be performed by linking activity and physical properties based on this data.
121 | 
122 | NOTE: link:http://www.daylight.com/dayhtml/doc/theory/theory.smirks.html[SMIRKS] is a method to express conversion of molecules by string notation like SMILES.
123 | 
124 | 
125 | A method called Matched Molecular Series (MMS) has also been proposed as an extension of MMP. Although MMP is a pair of molecules, MMS is a list of this pair as a group of three or more with common structure.
126 | 
127 | I will actually make MMS. The following example uses data from Factor Xa in ChEMBL. For the implementation of MMS, we use the code from link:https://github.com/rdkit/UGM_2014/blob/master/Presentations/OBoyle_MatchedSeries.pdf[Noel O'Boyle's presentation] at the RDKit UGM.
128 | 
129 | Let's actually make an MMS. In the following example, Factor Xa data was link:https://www.ebi.ac.uk/chembl/bioactivity/results/1/cmpd_chemblid/asc/tab/display[downloaded from ChEMBL] and used as an example. For the implementation of MMS, we use the code from link:https://github.com/rdkit/UGM_2014/blob/master/Presentations/OBoyle_MatchedSeries.pdf[Noel O'Boyle's presentation] at the RDKit UGM.
130 | 
131 | First, loading the library to be used, loading the data, and desalting using SaltRemover.
132 | 
133 | [source, python]
134 | ----
135 | import sys
136 | import os
137 | import pandas as pd
138 | from rdkit import Chem
139 | from rdkit.Chem import rdMMPA
140 | from rdkit.Chem import RDConfig
141 | from rdkit.Chem import rdBase
142 | from rdkit.Chem.Draw import IPythonConsole
143 | from rdkit.Chem import Draw
144 | from rdkit.Chem import SaltRemover
145 | mmpapath = os.path.join(RDConfig.RDContribDir, 'mmpa')
146 | sys.path.append(mmpapath)
147 | df = pd.read_csv('Chembl_FXa.txt', sep='\t')
148 | remover = SaltRemover.SaltRemover()
149 | mols = []
150 | for i, smi in enumerate(df.CANONICAL_SMILES):
151 |     try:
152 |         mol = Chem.MolFromSmiles(smi)
153 |         mol.SetProp('CMPD_CHEMBLID', df.CMPD_CHEMBLID[i])
154 |         mol = remover.StripMol(mol)
155 |         mols.append(mol)
156 |     except:
157 |         print(smi)
158 | ----
159 | 
160 | Then, import the mmpa rfrag registered in RDKit contrib, and divide the molecule into fragments.
161 | 
162 | [source, python]
163 | ----
164 | import rfrag
165 | rfragdata = []
166 | for i, smi in enumerate(df.CANONICAL_SMILES):
167 |     try:
168 |         out = rfrag.fragment_mol(smi, df.CMPD_CHEMBLID[i])
169 |         rfragdata.append(out)
170 |     except:
171 |         print(smi, df.CMPD_CHEMBLID[i])
172 | ----
173 | 
174 | Define a function to create an MMS. The code is almost the same as that described in the UGM document, but I changed the reading destination from a file to a list in order to do all processing on Jupyter.
175 | 
176 | Here is an overview of the MMS creation process.
177 | 
178 | . Cut each molecule according to a certain rule (cut by rotatable bond etc.)
179 | . Create a dictionary whose keys are the cut fragments, and store the fragments of molecules that share the same key in the corresponding dictionary value
180 | 
181 | By repeating the above process, molecules with common scaffold can be organized. Molecules that are grouped in a common scaffold will be molecules that have different non-scaffold substituents.
182 | 
183 | .What is a scaffold?
184 | ****
185 | In drug discovery, there is a structural optimization stage before preclinical studies, in which the parts of a compound other than its main skeleton are modified in small steps to give it a balance of properties suitable for a drug.
186 | 
187 | This main skeleton is called a scaffold. For example, link:https://patentscope2.wipo.int/search/ja/detail.jsf?docId=JP232673446[in this patent], the part except R is fixed and this main skeleton is called a scaffold.
188 | 
189 | image::ch07/scaffold.png[scaffold, width=100, pdfwidth=20%]
190 | ****
191 | 
192 | [source, python]
193 | ----
194 | from collections import namedtuple
195 |  
196 | Frag = namedtuple( 'Frag', ['id', 'scaffold', 'rgroup'] )
197 |  
198 | class Series():  # One matched series: a scaffold plus the rgroups recorded for it
199 |     def __init__( self ):
200 |         self.rgroups = []  # getSeries appends (rgroup, id) tuples here
201 |         self.scaffold = ""  # filled in by getSeries just before the series is yielded
202 | 
203 | def getFrags(rfrags):  # Build Frag(id, scaffold, rgroup) records from fragment lines; returned sorted by (scaffold, rgroup)
204 |     frags = []
205 |     for lines in rfrags:  # each element of rfrags is itself an iterable of comma-separated text lines
206 |         for line in lines:
207 |             broken = line.rstrip().split(",")
208 |             if broken[2]: # skip records whose third field is non-empty; presumably only single-cut records leave it empty - TODO confirm against rfrag output
209 |                 continue
210 |             smiles = broken[-1].split(".")  # last field holds the '.'-separated fragment SMILES
211 |             mols = [Chem.MolFromSmiles( smi ) for smi in smiles]
212 |             numAtoms = [mol.GetNumAtoms() for mol in mols]
213 |             if len(numAtoms) < 2:  # two fragments are needed to form a scaffold/rgroup pair
214 |                 continue
215 |             if numAtoms[0] > 5 and numAtoms[1] < 12:  # fragment 0 as scaffold (>5 atoms), fragment 1 as rgroup (<12 atoms)
216 |                 frags.append(Frag(broken[1], smiles[0], smiles[1]))  # broken[1] becomes the Frag id
217 |             if numAtoms[1] > 5 and numAtoms[0] < 12:  # deliberately not elif: a pair may be recorded in both orientations
218 |                 frags.append(Frag(broken[1], smiles[1], smiles[0]))
219 |     frags.sort(key=lambda x:(x.scaffold, x.rgroup))  # puts equal scaffolds next to each other
220 |     return frags
221 |  
222 | def getSeries(frags):  # Generator: yield one Series per run of consecutive frags sharing a scaffold, if the run has >= 2 rgroups
223 |     oldfrag = Frag(None, None, None)  # placeholder so the first comparison below has a previous frag to look at
224 |     series = Series()
225 |     for frag in frags:
226 |         if frag.scaffold != oldfrag.scaffold:  # scaffold changed: the current run has ended
227 |             if len(series.rgroups) >= 2:
228 |                 series.scaffold = oldfrag.scaffold
229 |                 yield series
230 |             series = Series()  # start a fresh run; a run with fewer than 2 rgroups is simply dropped
231 |         series.rgroups.append((frag.rgroup, frag.id))
232 |         oldfrag = frag
233 |     if len(series.rgroups) >= 2:  # flush the final run once the input is exhausted
234 |         series.scaffold = oldfrag.scaffold
235 |         yield series
236 | ----
237 | 
238 | We are ready to make an MMS. Visualize only the series that have five or more substituents (R-groups) for the same scaffold.
239 | 
240 | [source, python]
241 | ----
242 | frags = getFrags(rfragdata)
243 | series = getSeries(frags)
244 | series =[i for i in series]
245 | from IPython.display import display
246 | for s in series[:50]:
247 |     mols = [Chem.MolFromSmiles(s.scaffold)]
248 |     ids = ['scaffold']
249 |     for r in s.rgroups:
250 |         rg = Chem.MolFromSmiles(r[0])
251 |         mols.append(rg)
252 |         ids.append(r[1])
253 |     if len(mols) > 5:
254 |         display(Draw.MolsToGridImage(mols, molsPerRow=5, legends=ids))
255 |         print("########")
256 | ----
257 | 
258 | image::ch07/mms01.png[MMS]
259 | 
260 | Five scaffolds for MMS were displayed for the scaffold.
261 | 
262 | NOTE: link:https://pubs.acs.org/doi/10.1021/jm500022q[Activity prediction] can also be performed using this MMS.
263 | 
264 | === Visualize MMP networks using Cytoscape
265 | 
266 | WARNING: This content is beyond the content of the introductory, so please skip if you are not interested.
267 | 
268 | MMP can be thought of as a graph structure that uses pre-conversion and post-conversion information as nodes and conversion rules as edges. This graph structure can be intuitively understood by using network visualization tools such as Cytoscape.
269 | 
270 | In addition to the MMPA introduced earlier, RDKit has another project called link:https://github.com/rdkit/mmpdb[mmpdb]. It is provided as a command line tool group and database system, so it has the feature of being easy to manage in the long run. In this section, we introduce the visualization of MMP using link:https://github.com/Mishima-syk/12/tree/master/kzfm[mmpdb and Cytoscape].
271 | 
272 | 
273 | NOTE: link:https://chemrxiv.org/articles/mmpdb_An_Open_Source_Matched_Molecular_Pair_Platform_for_Large_Multi-Property_Datasets/5999375[mmpdb: An Open Source Matched Molecular Pair Platform for Large Multi-Property Datasets]
274 | 
275 | ==== Cytoscape installation
276 | 
277 | link:https://cytoscape.org/[Cytoscape] is an open source network visualization software widely used in various scenes. You can display the structure network by using the compound structure display plug-in.
278 | 
279 | Installation is as easy as downloading the corresponding OS installer from the link:https://cytoscape.org/download.html[download site] and installing according to the instructions.
280 | 
281 | When installation is complete, launch Cytoscape and install the Chemviz2 plug-in for drawing compound structures. The procedure is easy, select chemviz2 from Apps → App Manager and install it.
282 | 
283 | 
284 | image::ch07/chemviz2.png[AppManager, width=400]
285 | 
286 | ==== create a gml file from mmpdb
287 | 
288 | The data used this time are 151 compounds from <Inhibition of recombinant GSK3-beta> J. Med. Chem. (2008) 51: 2062-2077. As a rule, MMPA is applied not to exploratory data such as HTS results but to data built around a common scaffold, such as that from structure optimization.
289 | 
290 | I will put the flow of the command. SMILES text and activity and property data need to be registered separately in the database.
291 | 
292 | [source, bash]
293 | ----
294 | $ mmpdb fragment smiles.txt -o CHEMBL930273.fragments     # fragmentation
295 | $ mmpdb index CHEMBL930273.fragments -o CHEMBL930273.db   # make db
296 | $ mmpdb loadprops -p act.txt CHEMBL930273.db              # load properties
297 | ----
298 | 
299 | After that we will create a gml file for reading by Cytoscape, but this is beyond the scope of this document and will be omitted. If you are interested, you may want to read the link:https://github.com/Mishima-syk/12/tree/master/kzfm[code] directly, but the flow is as follows.
300 | 
301 | . link:https://github.com/Mishima-syk/12/blob/master/kzfm/mmp2gml.py[Make a gml file using mmpdb and python-igraph]
302 | . link:https://github.com/Mishima-syk/12/blob/master/kzfm/CHEMBL930273.gml[Read gml file] by Cytoscape
303 | . Assign attributes to each parameter in Cytoscape to make it easier to understand visually
304 | .. Corresponds to the physical value of the node size
305 | .. Corresponds to the active color of the edge color
306 | .. Draw a structure with chemviz2 plugin and paste it to a node
307 | 
308 | 
309 | ==== Interpretation
310 | 
311 | Let's look at the MMP network. MMPs with little difference in activity are clustered in the upper left. In the lower right, red edges (a large difference in activity) are observed. MMPs in which such a small substituent change produces a large activity difference are called Activity Cliffs. It is important not to overlook such changes in activity, as an Activity Cliff is often a breakthrough in a drug discovery project.
312 | 
313 | image:ch07/mmp01.png[MMPN, width=600, pdfwidth=48%] image:ch07/mmp02.png[MMPN, width=600, pdfwidth=48%]
314 | 
315 | When we check which substitution was actually made, we find that replacing the OH group with a MeO group causes the loss of activity.
316 | 
317 | Since MMP alone can only tell us facts like this, I searched for a complex crystal structure of an analogue in order to examine it a little more deeply. A complex of GSK3β and a similar compound was found as link:https://www.rcsb.org/structure/5OY4[PDBID:5OY4].
318 | 
319 | 
320 | image:ch07/mmp03.png[MMPN, width=600, pdfwidth=48%] image:ch07/mmp04.png[MMPN, width=600, pdfwidth=48%]
321 | 
322 | If you replace the OH group with the MeO group, it will likely hit the wall of the pocket. In other words, this Activity Cliff is considered to be caused by steric hindrance of ligand and protein.
323 | 
324 | <<<
325 | 


--------------------------------------------------------------------------------
/ch08_visualization.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 8: Want to see lots of compounds at once
  2 | :imagesdir: images
  3 | 
  4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch08_visualization.ipynb"]
  5 | 
  6 | In order to see how much data is distributed, it is common to map in an appropriate space. Especially in chemoinformatics the word chemical space is used.
  7 | 
  8 | === What is Chemical Space
  9 | 
 10 | Chemical space refers to the arrangement of compounds in an n-dimensional space according to some measure. In general, two or three dimensions are often used (for human understanding). Various methods have been proposed for the measure, i.e., similarity, and usually a distance that characterizes the compounds well is chosen.
 11 | 
 12 | This time, we will visualize which pharmaceutical company is developing what kind of compound for the antagonist of Orexin Receptor, which is known as a target for sleep medicine. See Chapter 4 for how to download data. This time we used the data of 10 papers in the table.
 13 | 
 14 | There are two main things I want to know this time:
 15 | 
 16 | - Were there companies that developed similar compounds?
 17 | - Has Merck optimized only similar frameworks, or did it optimize multiple frameworks?
 18 | 
 19 | .Orexin Receptor Antagonist
 20 | |===
 21 | |Doc ID|Journal|Pharma
 22 | |CHEMBL3098111|link:https://www.sciencedirect.com/science/article/pii/S0960894X13012511?via%3Dihub[Bioorg. Med. Chem. Lett. (2013) 23:6620-6624]|Merck
 23 | |CHEMBL3867477|link:https://www.sciencedirect.com/science/article/pii/S0960894X16310472?via%3Dihub[Bioorg Med Chem Lett (2016) 26:5809-5814]|Merck
 24 | |CHEMBL2380240|link:https://www.sciencedirect.com/science/article/pii/S0960894X13002801?via%3Dihub[Bioorg. Med. Chem. Lett. (2013) 23:2653-2658]|Rottapharm
 25 | |CHEMBL3352684|link:https://www.sciencedirect.com/science/article/pii/S0960894X14008853?via%3Dihub[Bioorg. Med. Chem. Lett. (2014) 24:4884-4890]|Merck
 26 | |CHEMBL3769367|link:https://pubs.acs.org/doi/10.1021/acs.jmedchem.5b00832[J. Med. Chem. (2016) 59:504-530]|Merck
 27 | |CHEMBL3526050|link:http://dmd.aspetjournals.org/content/41/5/1046[Drug Metab. Dispos. (2013) 41:1046-1059]|Actelion
 28 | |CHEMBL3112474|link:https://www.sciencedirect.com/science/article/pii/S0960894X13014765?via%3Dihub[Bioorg. Med. Chem. Lett. (2014) 24:1201-1208]|Actelion
 29 | |CHEMBL3739366|link:https://pubs.rsc.org/en/Content/ArticleLanding/2015/MD/C5MD00027K#!divAbstract[MedChemComm (2015) 6:947-955]|Heptares
 30 | |CHEMBL3739395|link:https://pubs.rsc.org/en/Content/ArticleLanding/2015/MD/C5MD00074B#!divAbstract[MedChemComm (2015) 6:1054-1064]|Actelion
 31 | |CHEMBL3351489|link:https://www.sciencedirect.com/science/article/pii/S0968089614006300?via%3Dihub[Bioorg. Med. Chem. (2014) 22:6071-6088]|Eisai
 32 | |===
 33 | 
 34 | 
 35 | === Mapping using Euclidean distance
 36 | 
 37 | We use ggplot as the plotting library. Principal component analysis (PCA) is used so that similar compounds are placed close together in the visualization. First, we import the necessary libraries.
 38 | 
 39 | [source, python]
 40 | ----
 41 | from rdkit import Chem, DataStructs
 42 | from rdkit.Chem import AllChem, Draw
 43 | import numpy as np
 44 | import pandas as pd
 45 | from ggplot import *
 46 | from sklearn.decomposition import PCA
 47 | import os
 48 | ----
 49 | 
 50 | Load the downloaded sdf, and create fingerprints for each compound, enabling correspondence between drug companies and document IDs. If you have any questions please check Chapter 6.
 51 | 
 52 | [source, python]
 53 | ----
 54 | oxrs = [("CHEMBL3098111", "Merck" ),("CHEMBL3867477", "Merck" ),
 55 | 　　　　　("CHEMBL2380240", "Rottapharm" ),("CHEMBL3352684", "Merck" ),
 56 | 　　　　　("CHEMBL3769367", "Merck" ),("CHEMBL3526050", "Actelion" ),
 57 | 　　　　　("CHEMBL3112474", "Actelion" ),("CHEMBL3739366", "Heptares" ),
 58 | 　　　　　("CHEMBL3739395", "Actelion" ), ("CHEMBL3351489", "Eisai" )]
 59 | 
 60 | fps = []
 61 | docs = []
 62 | companies = []
 63 | 
 64 | for cid, company in oxrs:
 65 |     sdf_file = os.path.join("ch08", cid + ".sdf")
 66 |     mols = Chem.SDMolSupplier(sdf_file)
 67 |     for mol in mols:
 68 |         if mol is not None:
 69 |             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
 70 |             arr = np.zeros((1,))
 71 |             DataStructs.ConvertToNumpyArray(fp, arr)
 72 |             docs.append(cid)
 73 |             companies.append(company)
 74 |             fps.append(arr)
 75 | fps = np.array(fps)
 76 | companies = np.array(companies)
 77 | docs = np.array(docs)
 78 | ----
 79 | 
 80 | If you check the information of the fingerprint, you can see that data of 293 compounds are obtained from 10 articles.
 81 | 
 82 | [source, python]
 83 | ----
 84 | fps.shape
 85 | # (293, 2048)
 86 | ----
 87 | 
 88 | You are now ready for principal component analysis. The number of principal components can be specified by n_components, but this time, I want to scatter two dimensions, so I set it to 2.
 89 | 
 90 | [source, python]
 91 | ----
 92 | pca = PCA(n_components=2)
 93 | x = pca.fit_transform(fps)
 94 | ----
 95 | 
 96 | Draw. I changed the color option according to each label, so I chose two attributes, COMPANY and DOCID.
 97 | 
 98 | [source, python]
 99 | ----
100 | d = pd.DataFrame(x)
101 | d.columns = ["PCA1", "PCA2"]
102 | d["DOCID"] = docs
103 | d["COMPANY"] = companies
104 | g = ggplot(aes(x="PCA1", y="PCA2", color="COMPANY"), data=d) + geom_point() + xlab("X") + ylab("Y")
105 | g
106 | ----
107 | 
108 | You can now see what compounds each pharmaceutical company has optimized. Merck, Actelion, Eisai and Heptares seem to have optimized similar compounds, as there is an overlapping area in the center of the chemical space. It is interesting to see whether Actelion successfully expanded in a unique direction (lower left) or failed to do so and advanced into the red-ocean center.
109 | 
110 | Also, Merck seems to have optimized various frameworks. I don't know if I'm optimizing at the same time or running ahead for backup, but it's no doubt that there were a lot of skeletal optimizations running, so it's probably an attractive target. In fact, link:https://www.ebi.ac.uk/chembl/beta/compound_report_card/CHEMBL1083659/[SUVOREXANT] was launched.
111 | 
112 | image:ch08/pca01.png[PCA, size=400, pdfwidth=48%] image:ch08/pca02.png[PCA, size=400, pdfwidth=48%]
113 | 
114 | .patinformatics
115 | ****
116 | In this chapter we use data from published papers, but paper data is not used when this kind of analysis is performed in practice. When a company publishes a paper, it usually means that the project is already over (whether it advanced to clinical trials or failed and was closed). In practice, the analysis is performed using patent data.
117 | 
118 | Based on such analysis, together with the link:http://rkakamilan.hatenablog.com/entry/2017/12/17/235417[experience of medicinal chemists] and insight into these companies, a project proceeds with belief in its own success while inferring the situation of other companies.
119 | 
120 | ****
121 | 
122 | === Mapping using tSNE
123 | 
124 | tSNE is said to have better resolution than PCA and to be closer to a medicinal chemist's intuition. With scikit-learn, you only need to replace PCA with TSNE.
125 | 
126 | [source, python]
127 | ----
128 | from sklearn.manifold import TSNE
129 | tsne = TSNE(n_components=2, random_state=0)
130 | tx = tsne.fit_transform(fps)
131 | ----
132 | 
133 | As you can see when drawing, it is separated better than PCA.
134 | 
135 | [source, python]
136 | ----
137 | d = pd.DataFrame(tx)
138 | d.columns = ["PCA1", "PCA2"]
139 | d["DOCID"] = docs
140 | d["COMPANY"] = companies 
141 | g = ggplot(aes(x="PCA1", y="PCA2", color="COMPANY"), data=d) + geom_point() + xlab("X") + ylab("Y")
142 | g
143 | ----
144 | 
145 | image::ch08/tsne01.png[PCA, size=500]
146 | 
147 | There are many other drawing methods besides PCA and tSNE introduced this time, so it is good to check.
148 | 
149 | <<<
150 | 


--------------------------------------------------------------------------------
/ch09_qsar.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 9: Basics of Quantitative Structure-Activity Relationship (QSAR)
  2 | 
  3 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch09_qsar.ipynb"]
  4 | 
  5 | The correlation between chemical structure and biological activity is called Structure Activity Relationship (SAR) or Quantitative SAR (QSAR). In general, **similar compounds** are known to exhibit **similar biological activities**, and it is very important in drug discovery research to understand this correlation and apply it to drug design.
  6 | 
  7 | In addition, there are two types of problems: classification problems, which estimate which class a compound belongs to (for example, whether or not it causes cell death or toxicity), and regression problems, which estimate continuous values such as % inhibition.
  8 | 
  9 | === Consider the cause of no effect (classification problem)
 10 | 
 11 | Label the ones with an IC50 less than 1 uM with hERG inhibition and the others with no hERG inhibition using 73 data from ChEMBL link:https://www.ebi.ac.uk/chembl/assay/inspect/CHEMBL829152[hERG inhibition assay].
 12 | 
 13 | First, import the necessary libraries.
 14 | 
 15 | [source, python]
 16 | ----
 17 | from rdkit import Chem, DataStructs
 18 | from rdkit.Chem import AllChem, Draw
 19 | from rdkit.Chem.Draw import IPythonConsole
 20 | import numpy as np
 21 | from sklearn.model_selection import train_test_split
 22 | from sklearn.metrics import confusion_matrix, f1_score
 23 | from sklearn.ensemble import RandomForestClassifier
 24 | ----
 25 | 
 26 | Processing of the tab-delimited text downloaded from ChEMBL is almost the same as in Chapter 8, but this time we need the activity data, so we look for the column **STANDARD_VALUE** and retrieve its numerical value. If this value is less than 1000 nM, the compound is labeled POS; otherwise it is labeled NEG. Finally, the labels are converted into a numpy array.
 27 | 
 28 | [source, python]
 29 | ----
 30 | mols = []
 31 | labels = []
 32 | with open("ch09_compounds.txt") as f:
 33 |     header = f.readline()
 34 |     smiles_index = -1
 35 |     for i, title in enumerate(header.split("\t")):
 36 |         if title == "CANONICAL_SMILES":
 37 |             smiles_index = i
 38 |         elif title == "STANDARD_VALUE":
 39 |             value_index = i
 40 |     for l in f:
 41 |         ls = l.split("\t")
 42 |         mol = Chem.MolFromSmiles(ls[smiles_index])
 43 |         mols.append(mol)
 44 |         val = float(ls[value_index])
 45 |         if val < 1000:
 46 |             labels.append("POS")
 47 |         else:
 48 |             labels.append("NEG")
 49 | 
 50 | labels = np.array(labels)
 51 | ----
 52 | 
 53 | Then convert the mol object into a fingerprint. From this fingerprint, create a model to predict the presence or absence of hERG inhibition.
 54 | 
 55 | [source, python]
 56 | ----
 57 | fps = []
 58 | for mol in mols:
 59 |     fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
 60 |     arr = np.zeros((1,))
 61 |     DataStructs.ConvertToNumpyArray(fp, arr)
 62 |     fps.append(arr)
 63 | fps = np.array(fps)
 64 | ----
 65 | 
 66 | Divide the data set into two parts: a training set and a test set. The test set will be used later to evaluate the accuracy of the created prediction model.
 67 | 
 68 | [source, python]
 69 | ----
 70 | x_train, x_test, y_train, y_test = train_test_split(fps, labels)
 71 | ----
 72 | 
 73 | To create a predictive model, just create an instance and train it with the fit method.
 74 | 
 75 | [source, python]
 76 | ----
 77 | rf = RandomForestClassifier()
 78 | rf.fit(x_train, y_train)
 79 | ----
 80 | 
 81 | Predict the test set you split up earlier.
 82 | 
 83 | [source, python]
 84 | ----
 85 | y_pred = rf.predict(x_test)
 86 | ----
 87 | 
 88 | Create a Confusion matrix.
 89 | 
 90 | .What is confusion matrix?
 91 | ****
 92 | A confusion matrix is a table that summarizes the results of a classification. It shows clearly whether each class was classified correctly: the more TP and TN and the fewer FP and FN, the better the classifier.
 93 | 
 94 | |===
 95 | || 2+|Actual class
 96 | 
 97 | |
 98 | |
 99 | |Positive
100 | |Negative
101 | 
102 | .2+|Predicted class
103 | |Positive
104 | |True Positive(TP)
105 | |False Positive(FP)
106 | 
107 | |Negative
108 | |False Negative(FN)
109 | |True Negative(TN)
110 | |===
111 | ****
112 | 
113 | [source, python]
114 | ----
115 | confusion_matrix(y_test, y_pred)
116 | #array([[11,  1],[ 5,  2]])
117 | ----
118 | 
119 | |===
120 | |11 |1
121 | 
122 | |5
123 | |2
124 | |===
125 | 
126 | Let's look at the F1 score.
127 | 
128 | [source, python]
129 | ----
130 | f1_score(y_test, y_pred, pos_label="POS")
131 | #0.4
132 | ----
133 | 
134 | It is not very good.
135 | 
136 | NOTE: Because the train_test_split function splits the data into training set and test set randomly, the values of the confusion matrix and the F1 score change with each execution.
137 | 
138 | .What is the F1 score?
139 | ****
140 | 
141 | - The proportion of truly positive samples among those predicted to be positive is called precision: precision = TP / (TP + FP)
142 | - The proportion of actually positive samples that are correctly predicted to be positive is called recall: recall = TP / (TP + FN)
143 | 
144 | The F1 score is the harmonic mean of the precision rate and the recall rate
145 | 
146 | It is calculated by
147 | F1 = 2 * (precision * recall) / (precision + recall)
148 | 
149 | ****
150 | 
151 | === Predict the efficacy of drugs (regression problem)
152 | 
153 | Regression models, as discussed earlier, are models that predict continuous values. This time, create a regression model of RandomForest, and evaluate its accuracy with R2. Let's use the data from hERG's assay data used in classification problems. Import the required libraries first.
154 | 
155 | [source, python]
156 | ----
157 | from sklearn.ensemble import RandomForestRegressor
158 | from sklearn.metrics import r2_score
159 | from math import log10
160 | ----
161 | 
162 | We labeled it for classification problems, but now we want to predict continuous values, so we convert it to pIC50. (We will supplement later on why it is convenient to use pIC50)
163 | 
164 | [source, python]
165 | ----
166 | pIC50s = []
167 | with open("ch09_compounds.txt") as f:
168 |     header = f.readline()
169 |     for i, title in enumerate(header.split("\t")):
170 |         if title == "STANDARD_VALUE":
171 |             value_index = i
172 |     for l in f:
173 |         ls = l.split("\t")
174 |         val = float(ls[value_index])
175 |         pIC50 = 9 - log10(val)
176 |         pIC50s.append(pIC50)
177 | 
178 | pIC50s = np.array(pIC50s)
179 | ----
180 | 
181 | Divide the data set into two: training set and test set. The fingerprint uses what was created at the time of classification model.
182 | 
183 | [source, python]
184 | ----
185 | x_train, x_test, y_train, y_test = train_test_split(fps, pIC50s)
186 | ----
187 | 
188 | Now train the model. With scikit-learn the procedure is almost the same for every method: call fit, then predict.
189 | 
190 | [source, python]
191 | ----
192 | rf = RandomForestRegressor()
193 | rf.fit(x_train, y_train)
194 | ----
195 | 
196 | Let's predict.
197 | 
198 | [source, python]
199 | ----
200 | y_pred = rf.predict(x_test)
201 | ----
202 | 
203 | Let's put out the prediction accuracy with R2.
204 | 
205 | [source, python]
206 | ----
207 | r2_score(y_test, y_pred)
208 | #0.52
209 | ----
210 | 
211 | That seems like a reasonable result.
212 | 
213 | .What is the R2 score?
214 | 
215 | ****
216 | R2, also called the link:https://ja.wikipedia.org/wiki/%E6%B1%BA%E5%AE%9A%E4%BF%82%E6%95%B0[coefficient of determination], is often used as one of the indicators of the goodness of fit of a regression.
217 | ****
218 | 
219 | === Model applicability (applicability domain)
220 | 
221 | The method introduced here is a model generated based on the hypothesis that **similar compounds exhibit similar biological activities**. What is the prediction accuracy if there is no compound that is similar to the training set?
222 | 
223 | Of course, the predicted value is not reliable in that case. In other words, the question of how far a given prediction can be trusted always accompanies a model. The extent to which such models can be trusted or applied is called the applicability domain. This topic is explained in detail in link:https://datachemeng.com/applicabilitydomain/[scope of application and model application] by Dr. Kaneko of Meiji University.
224 | 
225 | ==== (Extra column) How reliable can the applicability domain be?
226 | 
227 | A long time ago, Dr. Hugo Kubinyi posed the question "Do similar compounds show similar activities?" I remember being impressed by his example in which converting the OH group of estradiol into a methoxy group resulted in a loss of activity.
228 | 
229 | The applicability domain is a way of estimating how reliable a prediction is from the similarity to the training set. This raises the question of whose notion of similarity is being used. It is we who judge that two compounds are similar, but whether they really are similar is ultimately decided by the protein. Therefore, activity cannot always be predicted from similarity, and activity often disappears even when the similarity is extremely high. Such an event is called an activity cliff, a term often used in the context of MMP.
230 | 
231 | <<<
232 | 


--------------------------------------------------------------------------------
/ch10_deeplearning.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 10: Introduction to Deep Learning
  2 | :imagesdir: images
  3 | 
  4 | For this chapter, we will use deep learning to create QSAR models and generation models.
  5 | 
  6 | === About deep learning
  7 | 
  8 | Neurons exist in the brain of an organism, and they form a network to transmit information, store and learn. The Artificial Neural Network (ANN) is a mathematical model of this network structure.
  9 | 
 10 | A general ANN consists of three layers: an input layer that receives the information to be learned, an intermediate layer (or hidden layer) that learns a response (corresponding to the firing of a nerve synapse) based on the pattern of the input information, and a final output layer. Deep learning enables highly accurate predictions by stacking multiple hidden layers.
 11 | 
 12 | Although I will not explain this in this book in particular, if you want to write and understand the code from scratch yourself, link:https://www.amazon.co.jp/dp/4873117585/[Deep Learning from scratch] can be helpful. Also, if you want to learn about theory properly, we recommend link:https://www.amazon.co.jp/dp/4048930621/[deep learning].
 13 | 
 14 | === About TensorFlow and Keras
 15 | 
 16 | Tensorflow is a framework for machine learning developed by Google and released as OSS. It is often used mainly as a deep learning framework.
 17 | 
 18 | NOTE: Tensorflow has recently made a major update from 1.x to 2.x, but since the 2.x version has just appeared and there is little reference information, it uses the 1.x system. Also, since the API differs depending on the version of the same 1.x, if there is code you want to run, please be careful about which version is written.
 19 | 
 20 | Keras is a high-level API backed by a low-level framework such as TensorFlow, so you can write code more easily. Keras has been developed independently of TensorFlow, but recent versions of TensorFlow ship with Keras, so you can use Keras without installing it separately. Note that the version of Keras bundled with TensorFlow may not be the latest upstream version.
 21 | 
 22 | It's annoying to decide which Keras to use, but for the sake of convenience, we will use Tensorflow-integrated Keras.
 23 | 
 24 | .Relationship between Keras and Tensorflow
 25 | ****
 26 | I will briefly sort out the relationship between Keras and TensorFlow, referring to the link:https://blog.keras.io/introducing-keras-2.html[official blog]. Keras was originally developed as a project separate from TensorFlow (and, of course, still is), and to use Keras, TensorFlow had to be installed separately. However, around the time of the Keras 2.x major version upgrade in 2017, the TensorFlow project integrated Keras, so it is now possible to call Keras from TensorFlow. The English text below is an excerpt from the above link.
 27 | 
 28 | _TensorFlow integration
 29 | Although Keras has supported TensorFlow as a runtime backend since December 2015, the Keras API had so far been kept separate from the TensorFlow codebase. This is changing: the Keras API will now become available directly as part of TensorFlow, starting with TensorFlow 1.2. This is a big step towards making TensorFlow accessible to its next million users._
 30 | ****
 31 | 
 32 | === Let's install
 33 | 
 34 | Let's install Tensorflow and Keras. When installing with anaconda, the package to be installed differs slightly depending on whether you use a GPU compatible version or a CPU version.
 35 | 
 36 | [source, bash]
 37 | ----
 38 | # CPU only
 39 | $ conda install -c conda-forge tensorflow
 40 | # GPU enabled
 41 | $ conda install -c anaconda tensorflow-gpu
 42 | ----
 43 | 
 44 | NOTE: You can also use the pip command to install TensorFlow. In that case , please refer to the link:https://www.tensorflow.org/install[official document]. But basically, if you make an environment with Conda, it is desirable to put a package with Conda.
 45 | 
 46 | Reference link
 47 | 
 48 | 
 49 | - https://keras.io/#installation
 50 | - https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-pkgs.html
 51 | 
 52 | === About Google colab
 53 | 
 54 | .Google colab
 55 | ****
 56 | link:https://colab.research.google.com/notebooks/welcome.ipynb[Google colaboratory] is a Jupyter notebook environment that runs in the cloud. Deep learning frameworks such as Theano, TensorFlow, Keras and PyTorch are already installed. Usage time is limited, but because a GPU is available, it is very attractive that deep learning can be tried without having a GPU machine at hand.
 57 | 
 58 | ****
 59 | 
 60 | You need a Google account to use it, so if you do not have a Google account, it is a good idea to get an account at this opportunity.
 61 | If you have a Google account, you can also run notebooks hosted on GitHub directly on Colab. Let's open the scikit-learn hands-on notebook that was previously used at Mishima.syk.
 62 | 
 63 | NOTE: This notebook was created by @y__sama, and it covers everything from data preparation to link:https://automl.github.io/auto-sklearn/master/[AutoSklearn].
 64 | 
 65 | First of all, go to link:https://colab.research.google.com/notebooks/welcome.ipynb[Google colaboratory]. If you do not get the screen below, please execute "Open Notebook" from "File" on the top left
 66 | 
 67 | image::ch10/ch10_1.png[GoogleColabTop, width=600, pdfwidth=60%]
 68 | 
 69 | 
 70 | Next, click the tab named GitHub, copy and paste the following URL, and you can move the code from Jupyter Notebook.
 71 | 
 72 | https://github.com/Mishima-syk/sklearn-tutorial
 73 | 
 74 | image::ch10/ch10_2.png[GoogleColab2, width=600, pdfwidth=60%]
 75 | 
 76 | When you open the notebook, you will see a screen similar to the Jupyter Notebook. You can execute the code of the cell with Shift + Return key.
 77 | 
 78 | image::ch10/ch10_3.png[NoteBook, width=600, pdfwidth=60%]
 79 | 
 80 | To see the libraries available by default in Google Colab, type `!pip freeze` in a cell and they will be listed.
 81 | 
 82 | - absl-py==0.7.0
 83 | - alabaster==0.7.12
 84 | - snip ;) 
 85 | - yellowbrick==0.9.1
 86 | - zict==0.1.3
 87 | - zmq==0.0.0
 88 | 
 89 | .Python deep learning framework
 90 | ****
 91 | There are many Python deep learning frameworks. Mainly link:http://deeplearning.net/software/theano/[Theano], 
 92 | link:https://www.tensorflow.org/[Tensorflow], 
 93 | link:https://keras.io/[Keras], 
 94 | link:https://mxnet.apache.org/[MXNet], 
 95 | link:https://chainer.org/[Chainer], 
 96 | link:https://pytorch.org/[PyTorch],
 97 | etc.
 98 | 
 99 | Various deep learning documents often use one of the above frameworks for implementation. You may want to try it and choose a framework that is easy to use.
100 | ****
101 | 
102 | <<<
103 | 


--------------------------------------------------------------------------------
/ch11_dlqsar.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 11: Structure-activity relationship using deep learning
  2 | :imagesdir: images
  3 | 
  4 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch11_simple_dnn.ipynb"]
  5 | 
  6 | In this chapter, structure activity correlation analysis is performed using DNN.
  7 | 
  8 | === Predictive model construction using DNN
  9 | 
 10 | First, let's build a simple prediction model using DNN. Here we use the same data as in Chapter 9. First, create a classification model and label the Positive label as [0, 1] and the Negative label as a [1, 0] two-dimensional OneHot vector. If you create a model using Keras Model object, you can get the expected value of each of the above two dimensions. You can use Numpy's Argmax function to know which class it is likely to belong to.
 11 | 
 12 | NOTE: OneHot vector is a vector in which one value is 1 and the other is 0. When considering a classification problem of 10 classes, such as [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], a vector such that somewhere is 1 and the remaining 9 are 0 I can express a class. In the above example, there are two classes of Positive / Negative, so the OneHot vector is two-dimensional.
 13 | 
 14 | Import the required libraries.
 15 | 
 16 | [source, python]
 17 | ----
 18 | from rdkit import Chem, DataStructs
 19 | from rdkit.Chem import AllChem, Draw
 20 | from rdkit.Chem.Draw import IPythonConsole
 21 | import numpy as np
 22 | from sklearn.model_selection import train_test_split
 23 | from sklearn.metrics import confusion_matrix, f1_score
 24 | from tensorflow.python.keras.layers import Input
 25 | from tensorflow.python.keras.layers import Dense
 26 | from tensorflow.python.keras.layers import Dropout
 27 | from tensorflow.python.keras.layers import Activation
 28 | from tensorflow.python.keras.models import Model
 29 | 
 30 | ----
 31 | 
 32 | Next, read the data. In Chapter 9 we put "POS" / "NEG" in the list of labels, so it was a one-dimensional representation, but this time it is two-dimensional.
 33 | 
 34 | [source, python]
 35 | ----
 36 | mols = []
 37 | labels = []
 38 | with open("ch09_compounds.txt") as f:
 39 |     header = f.readline()
 40 |     smiles_index = -1
 41 |     for i, title in enumerate(header.split("\t")):
 42 |         if title == "CANONICAL_SMILES":
 43 |             smiles_index = i
 44 |         elif title == "STANDARD_VALUE":
 45 |             value_index = i
 46 |     for l in f:
 47 |         ls = l.split("\t")
 48 |         mol = Chem.MolFromSmiles(ls[smiles_index])
 49 |         mols.append(mol)
 50 |         val = float(ls[value_index])
 51 |         if val < 1000:
 52 |             labels.append([0,1]) # Positive
 53 |         else:
 54 |             labels.append([1,0]) # Negative
 55 | labels = np.array(labels)
 56 | ----
 57 | 
 58 | Next, create classification models and regression models sequentially.
 59 | 
 60 | The first is a classification model, and the input uses the same ECFP as in Chapter 9. To construct a DNN, the dimension of the input data must be specified explicitly, so we define the variable nBits.
 61 | 
 62 | TIP: Specifying an appropriate integer in random_state for train_test_split is useful for verification because the same data is obtained each time.
 63 | 
 64 | [source, python]
 65 | ----
 66 | nBits = 2048
 67 | fps = []
 68 | for mol in mols:
 69 |     fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits)
 70 |     arr = np.zeros((1,))
 71 |     DataStructs.ConvertToNumpyArray(fp, arr)
 72 |     fps.append(arr)
 73 | fps = np.array(fps)
 74 | 
 75 | x_train1, x_test1, y_train1, y_test1 = train_test_split(fps, labels, random_state=794)
 76 | ----
 77 | 
 78 | Create a neural network whose input has 2048 dimensions, followed by three fully connected layers of 300 neurons each and a final output layer with two units. We used ReLU as the activation function of the hidden layers and, because this is a two-class classification, Softmax for the output layer.
 79 | 
 80 | The Dropout layer helps to prevent overfitting by randomly dropping neurons.
 81 | 
 82 | Keras constructs a model by defining the model and then calling the compile function. The optimizer and the loss need to be chosen according to the purpose. In this case we used the adam optimizer with 'categorical_crossentropy' as the loss, but there are many link:https://keras.io/ja/optimizers/[optimizers] other than adam, so finding an appropriate one requires trial and error in practice.
 83 | 
 84 | TIP: link:https://en.wikipedia.org/wiki/Rectifier_(neural_networks)[ReLU] is often used because it mitigates the vanishing gradient problem of the link:https://en.wikipedia.org/wiki/Sigmoid_function[Sigmoid] function.
 85 | 
 86 | [source, python]
 87 | ----
 88 | # Define DNN classifier model
 89 | epochs = 10
 90 | inputlayer1 = Input(shape=(nBits, ))
 91 | x1 = Dense(300, activation='relu')(inputlayer1)
 92 | x1 = Dropout(0.2)(x1)
 93 | x1 = Dense(300, activation='relu')(x1)
 94 | x1 = Dropout(0.2)(x1)
 95 | x1 = Dense(300, activation='relu')(x1)
 96 | output1 = Dense(2, activation='softmax')(x1)
 97 | model1 = Model(inputs=[inputlayer1], outputs=[output1])
 98 | 
 99 | model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
100 | ----
101 | 
102 | NOTE: Keras provides a link:https://keras.io/ja/models/sequential/[Sequential] model, which can be used to describe the network more simply than the example above (Functional API).
103 | The reason we defined the model with the Functional API is that, once you get used to it, it makes multiple inputs and more complex models easy to handle. If you are interested in writing Sequential models, please check out the official site and Qiita.
104 | 
105 | NOTE: A DNN optimizes the model by iterating the backpropagation procedure, which compares the actual value with the value predicted from the initial, randomly generated weights and updates the weights so as to minimize the difference (loss). The number of repetitions is specified by epochs. The model may seem to get smarter as you increase the number of epochs, but this raises the computational cost and the risk of overfitting, so more is not always better. Observe the loss, accuracy, etc. and find an appropriate number of epochs.
106 | 
107 | .Why is there a risk of overlearning when increasing Epochs?
108 | ****
109 | Using training data, we will adjust the weight to reduce the error between the correct value and the predicted value for each Epoch. If it is learned using a sufficient amount of training data and it is repeated too much, the generalization performance of the model will be reduced since the same training data will be learned over and over again.
110 | 
111 | To detect overfitting, evaluate and plot the accuracy on the training set and the validation set for each epoch, and check whether the accuracy on the validation set stops improving or deteriorates while the accuracy on the training set keeps improving. Keras has a feature called link:https://keras.io/ja/callbacks/[Early stopping], which stops training when the performance of the model no longer improves after a certain number of epochs.
112 | 
113 | See the introduction and references of https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/[Early stopping] for more information.
114 | 
115 | ****
116 | 
117 | After building the model, you can do fit / predict in the same way as Scikit-learn.
118 | 
119 | [source, python]
120 | ----
121 | hist1 = model1.fit(x_train1, y_train1, epochs=epochs)
122 | ----
123 | 
124 | Finally, let's visualize the result.
125 | 
126 | [source, python]
127 | ----
128 | %matplotlib inline
129 | import matplotlib.pyplot as plt
130 | plt.plot(range(epochs), hist1.history['acc'], label='acc')
131 | plt.legend()
132 | plt.plot(range(epochs), hist1.history['loss'], label='loss')
133 | plt.legend()
134 | ----
135 | 
136 | In this example, the model has good accuracy around 6Epoch.
137 | 
138 | Next, verify with test data.
139 | 
140 | [source, python]
141 | ----
142 | y_pred1 = model1.predict(x_test1)
143 | y_pred_cls1 = np.argmax(y_pred1, axis=1)
144 | y_test_cls1 =np.argmax(y_test1, axis=1)
145 | confusion_matrix(y_test_cls1, y_pred_cls1)
146 | ----
147 | 
148 | The result is somewhat underwhelming...
149 | 
150 | The regression model is basically the same as the classification model above. This time it is a regression, so the final output layer is the value itself, i.e. one-dimensional. Because activation functions such as Sigmoid restrict the output to the range 0-1, we use Linear instead. The training data is prepared with the code from Chapter 9.
151 | 
152 | [source, python]
153 | ----
154 | from math import log10
155 | from sklearn.metrics import r2_score
156 | pIC50s = []
157 | with open("ch09_compounds.txt") as f:
158 |     header = f.readline()
159 |     for i, title in enumerate(header.split("\t")):
160 |         if title == "STANDARD_VALUE":
161 |             value_index = i
162 |     for l in f:
163 |         ls = l.split("\t")
164 |         val = float(ls[value_index])
165 |         pIC50 = 9 - log10(val)
166 |         pIC50s.append(pIC50)
167 | 
168 | pIC50s = np.array(pIC50s)
169 | x_train2, x_test2, y_train2, y_test2 = train_test_split(fps, pIC50s, random_state=794)
170 | ----
171 | 
172 | Next, define the model. Note that the Loss part is MSE, unlike the classification model above.
173 | 
174 | [source, python]
175 | ----
176 | epochs = 50
177 | inputlayer2 = Input(shape=(nBits, ))
178 | x2 = Dense(300, activation='relu')(inputlayer2)
179 | x2 = Dropout(0.2)(x2)
180 | x2 = Dense(300, activation='relu')(x2)
181 | x2 = Dropout(0.2)(x2)
182 | x2 = Dense(300, activation='relu')(x2)
183 | output2 = Dense(1, activation='linear')(x2)
184 | model2 = Model(inputs=[inputlayer2], outputs=[output2])
185 | model2.compile(optimizer='adam', loss='mean_squared_error')
186 | ----
187 | 
188 | If you can do this, the rest is the same.
189 | 
190 | [source, python]
191 | ----
192 | hist = model2.fit(x_train2, y_train2, epochs=epochs)
193 | y_pred2 = model2.predict(x_test2)
194 | r2_score(y_test2, y_pred2)
195 | plt.scatter(y_test2, y_pred2)
196 | plt.xlabel('exp')
197 | plt.ylabel('pred')
198 | plt.plot(np.arange(np.min(y_test2)-0.5, np.max(y_test2)+0.5), np.arange(np.min(y_test2)-0.5, np.max(y_test2)+0.5))
199 | ----
200 | 
201 | What do you think? The prediction model seems to underestimate slightly. A DNN has many parameters to tune, such as the number of stacked layers, the dropout rate, the number of neurons in the hidden layers, and the type of activation function. These were hard-coded in this example, but it is also interesting to compare the performance of models while changing the various parameters.
202 | 
203 | === I will devise a descriptor (neural fingerprint)
204 | 
205 | So far, we have created models of RandomForest and DNN using molecular fingerprints as input. One of the reasons why DNN has received a great deal of attention is that models can recognize feature quantities even if people do not extract feature quantities.
206 | 
207 | For example, in image classification, a human defined the feature quantity called link:https://en.wikipedia.org/wiki/Scale-invariant_feature_transform[SIFT], and a model was created using this as an input, but the current DNN basically uses the pixel information of the image itself.
208 | 
209 | In terms of chemoinformatics, SIFT is equivalent to a molecular fingerprint. So isn't it possible to improve DNN's performance by changing this (input) to a more primitive expression? It is extremely natural to think that. In 2015, Alan Aspuru-Guzik et al's group at Harvard University proposed the link:https://arxiv.org/pdf/1509.09292.pdf[Neural Finger print/NFP] as a challenge.
210 | 
211 | The differences between ECFP and NFP used so far are shown by citing figures in their papers.
212 | 
213 | image::ch11/ch11_nfp.png[Neural Finger Print]
214 | 
215 | ECFP (Circular Fingerprints) takes the information of each atom in the input molecule together with its neighboring atoms within N bonds (N is arbitrary), converts it into an integer with a hash function, and then maps that integer (with Mod in this example) onto a position in a fixed-length vector. Roughly speaking, the presence or absence of each substructure is encoded as 0/1 bit information. On the other hand, NFP introduced here is similar in concept to ECFP, but the hash function is replaced by Sigmoid and the discretization with Mod is replaced by Softmax. Therefore, it is expected that molecular fingerprints adapted to the input data set can be generated more flexibly than with ECFP.
216 | 
217 | A number of implementations have been published on GitHub since this paper appeared, but many of them are surprisingly difficult to handle because they depend on a particular environment: some work only with Keras 1.x, and the backend may be Theano or Keras / Tensorflow depending on the implementation. Unfortunately none of them works in the environment we built this time, so I created one that works with Keras 2.x / Python 3.6 based on this code.
218 | 
219 | .Was it effective to use the classical method with pixel as it is in image classification?
220 | ****
221 | SIFT was proposed in 1999. According to the link:https://www.cs.ubc.ca/~lowe/papers/iccv99.pdf[original paper], the difficulty in dealing with pixels themselves in object (image) recognition seems to be in dealing with objects that differ in position, rotation, size (scale), light intensity, etc. It seems that various methods have been studied to convert these fluctuating values into universal features. There is no way to use the pixels themselves, but the machine learning that I started with link:https://www.oreilly.co.jp/books/9784873117980/[python], which I purchased when studying machine learning , has an example of learning and classifying human face image data. Here, with the pixel data as input, the feature of the face is extracted and classified by principal component analysis. I have not been able to find a document that was clearly valid on this question, but I think it was valid depending on the task. Please comment if you have any details.
222 | 
223 | ****
224 | 
225 | [source, bash]
226 | ----
227 | git clone https://github.com/iwatobipen/keras-neural-graph-fingerprint.git
228 | ----
229 | 
230 | If you look at the code in the example.py file, you will get a rough feel for how it works. In the previous examples the molecular representation (fingerprint) was generated with RDKit, but this time the fingerprint itself is learned by the DNN.
231 | 
232 | The input is therefore each molecule represented as a graph. Three arrays are used: Atom_matrix with shape (max_atoms, num_atom_features), Edge_matrix with shape (max_atoms, max_degree), and bond_tensor with shape (max_atoms, max_degree, num_bond_features). Since each molecule has a different number of atoms, max_atoms defines the maximum number of atoms. This gives every molecule input arrays of the same size, which makes batch learning possible.
233 | 
234 | To run the example, enter the following command.
235 | 
236 | [source, bash]
237 | ----
238 | python example.py
239 | ----
240 | 
241 | Reference link:
242 | - link:https://arxiv.org/abs/1509.09292[NGF-paper]
243 | - link:https://arxiv.org/abs/1611.03199[DeepChem-paper]
244 | - link:http://www.keiserlab.org/[keiserlab]
245 | - link:https://github.com/HIPS/neural-fingerprint[HIPS NFP]
246 | - link:https://github.com/debbiemarkslab/neural-fingerprint-theano[Theano base]
247 | - link:https://github.com/GUR9000/KerasNeuralFingerprint[for keras1.x]
248 | - link:https://github.com/ericmjl/graph-fingerprint[ericmjl/graph_fp]
249 | - link:https://github.com/deepchem/deepchem[DeepChem]
250 | - link:https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/[About Early stopping]
251 | - link:https://www.cs.ubc.ca/~lowe/papers/iccv99.pdf[SIFT original Paper]
252 | 
253 | <<<
254 | 


--------------------------------------------------------------------------------
/ch12_generativemodels.asciidoc:
--------------------------------------------------------------------------------
  1 | == Chapter 12: Let the computer think about the chemical structure
  2 | :imagesdir: images
  3 | image:jupyter.png[link="https://github.com/Mishima-syk/py4chemoinformatics/blob/master/notebooks/ch12_rnn.ipynb"]
  4 | 
  5 | Generative models are one of the areas in which deep learning has had a great impact on medicinal chemistry. In particular, the progress of generative models over the last few years has been remarkable. Here, let's have the computer propose new compounds using link:https://github.com/MarcusOlivecrona/REINVENT[REINVENT developed by Marcus Olivecrona].
  6 | 
  7 | .What is a Generative Model?
  8 | ****
  9 | The prediction model built in Chapter 11 is generally called a discriminative model. In contrast, a model that captures the distribution of the inputs makes it possible to sample from it, that is, to generate new input data. This is called a generative model.
 10 | 
 11 | For more details , we recommend reading link:https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf[PRML 1.5.4]
 12 | 
 13 | ****
 14 | 
 15 | === Preparation
 16 | Install a deep learning library called PyTorch with conda. REINVENT does not work with newer versions of PyTorch, so specify the version explicitly when installing.
 17 | 
 18 | 
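 19 | .What is PyTorch?
 20 | Like Keras, it is a library that makes it convenient to build deep learning models in Python. Unlike Keras, it is a standalone framework and does not run on top of TensorFlow.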
 21 | 
 22 | [source, bash]
 23 | ----
 24 | $ conda install pytorch=0.3.1 -c pytorch
 25 | ----
 26 | 
 27 | Then clone REINVENT itself from GitHub.
 28 | 
 29 | [source, bash]
 30 | ----
 31 | $ cd <path to your working directory>
 32 | $ git clone https://github.com/MarcusOlivecrona/REINVENT.git
 33 | ----
 34 | 
 35 | Next, download a model pre-trained on about 1.1 million ChEMBL compounds and replace the original data directory with it. Training this model took five or six hours on a machine with a GTX 1080 Ti GPU, so a GPU machine is a must if you want to train it yourself.
 36 | 
 37 | 
 38 | [source, bash]
 39 | ----
 40 | $ wget https://github.com/Mishima-syk/13/raw/master/generator_handson/data.zip
 41 | $ unzip data.zip
 42 | $ mv data ./REINVENT/
 43 | ----
 44 | 
 45 | Now you are ready.
 46 | 
 47 | === Illustration
 48 | 
 49 | Here we create a model that produces an analogue of the antidiabetic drug sitagliptin, known commercially as link:https://www.drugbank.ca/drugs/DB01261[Januvia].
 50 | 
 51 | First, train the model to generate structures that are highly similar to sitagliptin, using the Tanimoto coefficient as the score. This time we train for 3000 steps, which takes about 7 or 8 hours on a slightly older MacBook Air. If you cannot wait, please use the trained data link:https://github.com/Mishima-syk/13/tree/master/generator_handson/sitagliptin_agent_3000[here].
 52 | 
 53 | 
 54 | [source, bash]
 55 | ----
 56 | ./main.py --scoring-function tanimoto --scoring-function-kwargs query_structure 'N[C@@H](CC(=O)N1CCn2c(C1)nnc2C(F)(F)F)Cc3cc(F)c(F)cc3F' --num-steps 3000 --sigma 80
 57 | ----
 58 | 
 59 | From here on, we work in a Jupyter notebook, so launch it now.
 60 | 
 61 | Load the necessary libraries. Specify the REINVENT directory for sys.path.append.
 62 | 
 63 | 
 64 | [source, python]
 65 | ----
 66 | %matplotlib inline
 67 | import sys
 68 | sys.path.append("[Your REINVENT DIR]")
 69 | from rdkit import Chem
 70 | from rdkit.Chem import AllChem, DataStructs, Draw
 71 | import torch
 72 | from model import RNN
 73 | from data_structs import Vocabulary
 74 | from utils import seq_to_smiles
 75 | ----
 76 | 
 77 | Next, sample 50 compounds from the trained model.
 78 | 
 79 | [source, python]
 80 | ----
 81 | voc = Vocabulary(init_from_file="/Users/kzfm/mishima_syk/REINVENT/data/Voc")
 82 | Agent = RNN(voc)
 83 | Agent.rnn.load_state_dict(torch.load("sitagliptin_agent_3000/Agent.ckpt"))
 84 | seqs, agent_likelihood, entropy = Agent.sample(50)
 85 | smiles = seq_to_smiles(seqs, voc)
 86 | ----
 87 | 
 88 | Let's see what kind of structure was actually generated.
 89 | 
 90 | [source, python]
 91 | ----
 92 | mols = []
 93 | for smi in smiles:
 94 |     mol = Chem.MolFromSmiles(smi)
 95 |     if mol is not None:
 96 |         mols.append(mol)
 97 | 
 98 | Draw.MolsToGridImage(mols, molsPerRow=3, subImgSize=(500,400))
 99 | ----
100 | 
101 | Do the generated structures look reasonably like sitagliptin?
102 | 
103 | image:ch11/ch11_01.png[Sitagliptin_analogues]
104 | 
105 | .About REINVENT
106 | ****
107 | By all means, please read link:https://arxiv.org/abs/1704.07555[Molecular De Novo Design through Deep Reinforcement Learning]
108 | ****
109 | 
110 | <<<
111 | 


--------------------------------------------------------------------------------
/ch13_beyond.asciidoc:
--------------------------------------------------------------------------------
 1 | == Chapter 13: Conclusion
 2 | :imagesdir: images
 3 | 
 4 | === To learn more
 5 | 
 6 | NOTE: If there is a topic you would like to know more about, open an issue with your request or reply on Twitter. Recommendations of good material are also welcome.
 7 | 
 8 | 
 9 | ==== Those who want to learn more machine learning
10 | 
11 | 
12 | You should aim to be able to read through link:https://www.microsoft.com/en-us/research/people/cmbishop/#!prml-book[Pattern Recognition and Machine Learning(PRML)]. The PDF can be downloaded for free.
13 | 
14 | If you find PRML too tough, searching for phrases such as "before reading PRML" will turn up gentler introductory material, so choose whatever suits you.
15 | 
16 | 
17 | ==== Those who want to learn more about chemoinformatics from the IT side
18 | 
19 | Since this book focuses on AI drug discovery, it mainly explains the basics of machine learning and analysis methods. However, chemoinformatics, like bioinformatics, also covers efficient ways of representing and storing molecules and data, as well as fast search technology. If you are interested in this informatics (IT) side of chemoinformatics, we recommend reading link:https://www.amazon.co.jp/Chemoinformatics-Basic-Concepts-Methods-English-ebook/dp/B07MMWKNSL/[Chemoinformatics: Basic Concepts and Methods] and digging deeper into the topics that interest you.
20 | 
21 | ==== For a deeper understanding of medicinal chemistry and chemoinformatics
22 | 
23 | If you work in pharmacokinetics, toxicology, or pharmacology at a pharmaceutical company or in academia and want to grasp the points made in this book, we recommend that you read link:https://www.amazon.co.jp/Drug-Like-Properties-Concepts-Structure-Optimization-ebook/dp/B019OMDRU4/[Drug-Like Properties: Concepts, Structure Design and Methods from ADME to Toxicity Optimization]. It is a text that is generally read by new employees assigned to the synthesis department of a pharmaceutical company, so anyone who has read this book should enjoy it. If there are parts you cannot follow, look them up in related books; using that text as a starting point for further study is a good approach.
24 | 
25 | In addition, people who are involved in pharmacokinetics should be able to turn the QSAR / QSPR covered in this book into a strength in link:https://www.amazon.co.jp/Physiologically-Based-Pharmacokinetic-PBPK-Modeling-Simulations-ebook/dp/B007BGZKWO/[PBPK modeling] once they understand it. Since optimization of the kinetic profile is very important for drug differentiation strategies, being strong in both QSPR and PBPK may be very useful.
26 | 
27 | ==== If you want to be a drug designer
28 | 
29 | Although this book has introduced informatics methods based on low molecular weight compounds, understanding of the target protein is essential when interpreting the results. In other words, drug design can not be done without understanding the three-dimensional structure of proteins. Therefore, it is good to read and learn books related to SBDD.
30 | 
31 | NOTE: Unfortunately I did not learn SBDD from books, so please let me know if you have good books to recommend.
32 | 
33 | Furthermore, since SBDD deals with proteins, there is no need to draw a line between chemoinformatics and bioinformatics. If you understand both within the framework of drug discovery, you will be able to think in more depth, so aim to be able to do both. It is definitely more fun that way. link:https://www.amazon.co.jp/dp/4780909201/[DRY analysis books] and link:https://www.amazon.co.jp/dp/4297103192[information technology that supports life science data analysis] will surely help your career.
34 | 
35 | As mentioned in Chapter 6, quantum chemical calculation is important for understanding protein-ligand interactions. In particular, the ability to interpret interactions on the basis of quantum chemistry can be said to be essential for SBDD in the future. Please read an introductory quantum chemistry book such as link:https://www.amazon.co.jp/dp/4130625047/[one that approaches chemistry through the orbital concept]. If you are using link:https://www.msg.chem.iastate.edu/gamess/[Gamess], the link:https://www.amazon.co.jp/dp/4061543881/[new version of Quantum Chemical Beginners' Manual] will help you. At the very least, get a grasp of energy decomposition analysis, which will increase your ability to interpret calculations and contribute to your project. Furthermore, needless to say, FMO (the fragment molecular orbital method) is an indispensable tool, so understanding link:https://www.jstage.jst.go.jp/article/jccj/advpub/0/advpub_2014-0039/_pdf[each component] will help your drug design even further.
36 | 
37 | ==== Beyond the "end"
38 | 
39 | Content more advanced than this book can be added as a new chapter, so please send a pull request. You will be added to the contributors, and the author will be credited at the beginning of the chapter.
40 | 
41 | <<<
42 | 
43 | 


--------------------------------------------------------------------------------
/images/by-nc-sa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/by-nc-sa.png


--------------------------------------------------------------------------------
/images/ch02/anaconda01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch02/anaconda01.png


--------------------------------------------------------------------------------
/images/ch04/chembl01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl01.png


--------------------------------------------------------------------------------
/images/ch04/chembl02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl02.png


--------------------------------------------------------------------------------
/images/ch04/chembl03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl03.png


--------------------------------------------------------------------------------
/images/ch04/chembl04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl04.png


--------------------------------------------------------------------------------
/images/ch04/chembl05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl05.png


--------------------------------------------------------------------------------
/images/ch04/chembl06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl06.png


--------------------------------------------------------------------------------
/images/ch04/chembl07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/chembl07.png


--------------------------------------------------------------------------------
/images/ch04/zinc01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch04/zinc01.png


--------------------------------------------------------------------------------
/images/ch05/ch05_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_01.png


--------------------------------------------------------------------------------
/images/ch05/ch05_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_02.png


--------------------------------------------------------------------------------
/images/ch05/ch05_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_03.png


--------------------------------------------------------------------------------
/images/ch05/ch05_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_04.png


--------------------------------------------------------------------------------
/images/ch05/ch05_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_05.png


--------------------------------------------------------------------------------
/images/ch05/ch05_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_06.png


--------------------------------------------------------------------------------
/images/ch05/ch05_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_07.png


--------------------------------------------------------------------------------
/images/ch05/ch05_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch05/ch05_08.png


--------------------------------------------------------------------------------
/images/ch06/apx_rvx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/apx_rvx.png


--------------------------------------------------------------------------------
/images/ch06/apx_rvx_suf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/apx_rvx_suf.png


--------------------------------------------------------------------------------
/images/ch06/cls01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/cls01.png


--------------------------------------------------------------------------------
/images/ch06/vs01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch06/vs01.png


--------------------------------------------------------------------------------
/images/ch07/chemviz2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/chemviz2.png


--------------------------------------------------------------------------------
/images/ch07/mcs01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs01.png


--------------------------------------------------------------------------------
/images/ch07/mcs02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs02.png


--------------------------------------------------------------------------------
/images/ch07/mcs03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs03.png


--------------------------------------------------------------------------------
/images/ch07/mcs04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs04.png


--------------------------------------------------------------------------------
/images/ch07/mcs05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mcs05.png


--------------------------------------------------------------------------------
/images/ch07/mmp01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp01.png


--------------------------------------------------------------------------------
/images/ch07/mmp02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp02.png


--------------------------------------------------------------------------------
/images/ch07/mmp03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp03.png


--------------------------------------------------------------------------------
/images/ch07/mmp04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mmp04.png


--------------------------------------------------------------------------------
/images/ch07/mms01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/mms01.png


--------------------------------------------------------------------------------
/images/ch07/scaffold.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch07/scaffold.png


--------------------------------------------------------------------------------
/images/ch08/pca01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch08/pca01.png


--------------------------------------------------------------------------------
/images/ch08/pca02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch08/pca02.png


--------------------------------------------------------------------------------
/images/ch08/tsne01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch08/tsne01.png


--------------------------------------------------------------------------------
/images/ch10/ch10_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch10/ch10_1.png


--------------------------------------------------------------------------------
/images/ch10/ch10_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch10/ch10_2.png


--------------------------------------------------------------------------------
/images/ch10/ch10_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch10/ch10_3.png


--------------------------------------------------------------------------------
/images/ch11/ch11_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch11/ch11_01.png


--------------------------------------------------------------------------------
/images/ch11/ch11_nfp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/ch11/ch11_nfp.png


--------------------------------------------------------------------------------
/images/jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/jupyter.png


--------------------------------------------------------------------------------
/images/mishimasyk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/mishimasyk.png


--------------------------------------------------------------------------------
/images/python_for_ci.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/python_for_ci.png


--------------------------------------------------------------------------------
/images/souyakuchan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/images/souyakuchan.png


--------------------------------------------------------------------------------
/mkpdf.sh:
--------------------------------------------------------------------------------
 1 | cat \
 2 | ch00_cover.asciidoc \
 3 | ch01_introduction.asciidoc \
 4 | ch02_installation.asciidoc \
 5 | ch03_python.asciidoc \
 6 | ch04_database.asciidoc \
 7 | ch05_rdkit.asciidoc \
 8 | ch06_similarity.asciidoc \
 9 | ch07_graph.asciidoc \
10 | ch08_visualization.asciidoc \
11 | ch09_qsar.asciidoc \
12 | ch10_deeplearning.asciidoc \
13 | ch11_dlqsar.asciidoc \
14 | ch12_generativemodels.asciidoc \
15 | ch13_beyond.asciidoc > py4c.asciidoc
16 | asciidoctor-pdf -r asciidoctor-pdf-cjk -o pdf/py4chemoinformatics.pdf py4c.asciidoc
17 | 


--------------------------------------------------------------------------------
/notebooks/ch09_compounds.txt:
--------------------------------------------------------------------------------
 1 | CMPD_CHEMBLID	MOLREGNO	PARENT_CMPD_CHEMBLID	PARENT_MOLREGNO	MOL_PREF_NAME	COMPOUND_KEY	MOLWEIGHT	ALOGP	PSA	NUM_RO5_VIOLATIONS	CANONICAL_SMILES	ACTIVITY_ID	STANDARD_TYPE	RELATION	STANDARD_VALUE	STANDARD_UNITS	PCHEMBL_VALUE	PUBLISHED_TYPE	PUBLISHED_RELATION	PUBLISHED_VALUE	PUBLISHED_UNITS	ACTIVITY_COMMENT	DATA_VALIDITY_COMMENT	POTENTIAL_DUPLICATE	BAO_ENDPOINT	UO_UNITS	QUDT_UNITS	ASSAY_ID	ASSAY_CHEMBLID	ASSAY_TYPE	DESCRIPTION	ASSAY_SRC_ID	ASSAY_SRC_DESCRIPTION	ASSAY_ORGANISM	ASSAY_STRAIN	ASSAY_TAX_ID	CURATED_BY	BAO_FORMAT	TID	TARGET_CHEMBLID	TARGET_TYPE	PROTEIN_ACCESSION	PREF_NAME	ORGANISM	CONFIDENCE_SCORE	TARGET_MAPPING	APD_NAME	APD_CONFIDENCE	DOC_ID	DOC_CHEMBLID	PUBMED_ID	JOURNAL	YEAR	VOLUME	ISSUE	FIRST_PAGE	CELL_ID	CELL_CHEMBL_ID	CELL_NAME	ACTIVITY_PARAMS	ACTIVITY_PROPS
 2 | CHEMBL549	14367	CHEMBL549	14367	CITALOPRAM	citalopram	324.4	3.81	36.26	0	CN(C)CCCC1(OCc2cc(ccc12)C#N)c3ccc(F)cc3	1523704	IC50	=	3981.07	nM	5.4	pIC50	=	5.4				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 3 | CHEMBL196431	325943	CHEMBL196431	325943		EDDP	277.41	4.6	3.24	0	C\C=C/1\N(C)C(C)CC1(c2ccccc2)c3ccccc3	1523842	IC50	=	50118.72	nM	4.3	pIC50	=	4.3				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 4 | CHEMBL12713	10813	CHEMBL12713	10813	SERTINDOLE	Sertindole	440.95	4.63	40.51	0	Fc1ccc(cc1)n2cc(C3CCN(CCN4CCNC4=O)CC3)c5cc(Cl)ccc25	1523558	IC50	=	10	nM	8	pIC50	=	8				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 5 | CHEMBL473	5638	CHEMBL473	5638	DOFETILIDE	Dofetilide	441.58	1.98	104.81	0	CN(CCOc1ccc(NS(=O)(=O)C)cc1)CCc2ccc(NS(=O)(=O)C)cc2	1523555	IC50	=	10	nM	8	pIC50	=	8				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 6 | CHEMBL607	22528	CHEMBL607	22528	MEPERIDINE	Meperidine	247.34	2.21	29.54	0	CCOC(=O)C1(CCN(C)CC1)c2ccccc2	1523678	IC50	=	323.59	nM	6.49	pIC50	=	6.49				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 7 | CHEMBL11	605	CHEMBL11	605	IMIPRAMINE	Imipramine	280.42	3.88	6.48	0	CN(C)CCCN1c2ccccc2CCc3ccccc13	1523698	IC50	=	3388.44	nM	5.47	pIC50	=	5.47				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 8 | CHEMBL71	6216	CHEMBL71	6216	CHLORPROMAZINE	chlorpromazine	318.87	4.89	6.48	0	CN(C)CCCN1c2ccccc2Sc3ccc(Cl)cc13	1523687	IC50	=	1479.11	nM	5.83	pIC50	=	5.83				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
 9 | CHEMBL708	33664	CHEMBL708	33664	ZIPRASIDONE	Ziprasidone	412.95	3.81	48.47	0	Clc1cc2NC(=O)Cc2cc1CCN3CCN(CC3)c4nsc5ccccc45	1523567	IC50	=	120.23	nM	6.92	pIC50	=	6.92				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
10 | CHEMBL629	27072	CHEMBL629	27072	AMITRIPTYLINE	Amitriptyline	277.41	4.17	3.24	0	CN(C)CCC=C1c2ccccc2CCc3ccccc13	1523711	IC50	=	10000	nM	5	pIC50	=	5				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
11 | CHEMBL1088	148004	CHEMBL1088	148004	MESORIDAZINE	Mesoridazine	386.59	4.9	23.55	0	CN1CCCCC1CCN2c3ccccc3Sc4ccc(cc24)[S+](C)[O-]	1523681	IC50	=	549.54	nM	6.26	pIC50	=	6.26				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
12 | CHEMBL533	12608	CHEMBL533	12608	IBUTILIDE	Ibutilide	384.59	4.16	69.64	0	CCCCCCCN(CC)CCCC(O)c1ccc(NS(=O)(=O)C)cc1	1523559	IC50	=	10	nM	8	pIC50	=	8				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
13 | CHEMBL4	146	CHEMBL4	146	OFLOXACIN	Ofloxacin	361.37	1.54	75.01	0	CC1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O	1523958	IC50	=	1412537.54	nM		pIC50	=	2.85			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
14 | CHEMBL96153	153104	CHEMBL96153	153104	TERIKALANT	Terikalant	381.52	4.84	30.93	0	COc1ccc(cc1OC)C2CCN(CC[C@H]3CCOc4ccccc34)CC2	1523673	IC50	=	251.19	nM	6.6	pIC50	=	6.6				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
15 | CHEMBL364512	326211	CHEMBL364512	326211	RISPERIDON	Risperidon	412.51	2.92	59.39	0	CC1=C(CCN2CCC(CC2)C3NOc4cc(F)ccc34)C(=O)N5CCCCC5=N1	1523572	IC50	=	151.36	nM	6.82	pIC50	=	6.82				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
16 | CHEMBL652	27588	CHEMBL652	27588	FLECAINIDE	Flecainide	414.35	3.44	59.59	0	FC(F)(F)COc1ccc(OCC(F)(F)F)c(c1)C(=O)NCC2CCCCN2	1523703	IC50	=	3890.45	nM	5.41	pIC50	=	5.41				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
17 | CHEMBL633	27185	CHEMBL633	27185	AMIODARONE	Amiodarone	645.32	6.94	42.68	2	CCCCc1oc2ccccc2c1C(=O)c3cc(I)c(OCCN(CC)CC)c(I)c3	1523710	IC50	=	10000	nM	5	pIC50	=	5				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
18 | CHEMBL31	1712	CHEMBL31	1712	GATIFLOXACIN	Gatifloxacin	375.4	1.98	83.8	0	COc1c(N2CCNC(C)C2)c(F)cc3C(=O)C(=CN(C4CC4)c13)C(=O)O	1523948	IC50	=	128824.96	nM		pIC50	=	3.89			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
19 | CHEMBL583	17136	CHEMBL583	17136	GREPAFLOXACIN	Grepafloxacin	359.4	2.28	74.57	0	CC1CN(CCN1)c2cc3N(C=C(C(=O)O)C(=O)c3c(C)c2F)C4CC4	1523840	IC50	=	50118.72	nM	4.3	pIC50	=	4.3				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
20 | CHEMBL1000	111185	CHEMBL1000	111185	CETIRIZINE	cetirizine	388.9	3.15	53.01	0	OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3	1523839	IC50	=	30199.52	nM	4.52	pIC50	=	4.52				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
21 | CHEMBL485	6167	CHEMBL485	6167	CODEINE	codeine	299.37	1.5	41.93	0	COc1ccc2C[C@@H]3[C@@H]4C=C[C@H](O)[C@@H]5Oc1c2[C@]45CCN3C	1523951	IC50	=	301995.17	nM		pIC50	=	3.52			Outside typical range	0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
22 | CHEMBL640	27341	CHEMBL640	27341	PROCAINAMIDE	Procainamide	235.33	1.34	58.36	0	CCN(CC)CCNC(=O)c1ccc(N)cc1	1523949	IC50	=	138038.43	nM		pIC50	=	3.86			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
23 | CHEMBL1110	153342	CHEMBL1110	153342	ALOSETRON	Alosetron	294.36	2.41	53.92	0	Cc1[nH]cnc1CN2CCc3c(C2=O)c4ccccc4n3C	1523696	IC50	=	3235.94	nM	5.49	pIC50	=	5.49				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
24 | CHEMBL517	11143	CHEMBL517	11143	DISOPYRAMIDE	Disopyramide	339.48	3.36	59.22	0	CC(C)N(CCC(C(=O)N)(c1ccccc1)c2ccccn2)C(C)C	1523945	IC50	=	91201.08	nM	4.04	pIC50	=	4.04				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
25 | CHEMBL479	5970	CHEMBL479	5970	THIORIDAZINE	Thioridazine	370.59	5.89	6.48	1	CSc1ccc2Sc3ccccc3N(CCC4CCCCN4C)c2c1	1523670	IC50	=	190.55	nM	6.72	pIC50	=	6.72				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
26 | CHEMBL723	36662	CHEMBL723	36662	CARVEDILOL	Carvedilol	406.48	3.74	75.74	0	COc1ccccc1OCCNCC(O)COc2cccc3[nH]c4ccccc4c23	1523712	IC50	=	10471.29	nM	4.98	pIC50	=	4.98				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
27 | CHEMBL715	34197	CHEMBL715	34197	OLANZAPINE	Olanzapine	312.44	3.44	30.87	0	CN1CCN(CC1)C2=Nc3ccccc3Nc4sc(C)cc24	1523669	IC50	=	181.97	nM	6.74	pIC50	=	6.74				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
28 | CHEMBL92870	150806	CHEMBL92870	150806	LIDOFLAZINE	Lidoflazine	491.63	5.75	35.58	1	Cc1cccc(C)c1NC(=O)CN2CCN(CCCC(c3ccc(F)cc3)c4ccc(F)cc4)CC2	1523560	IC50	=	15.85	nM	7.8	pIC50	=	7.8				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
29 | CHEMBL295698	72036	CHEMBL295698	72036	LEVOKETOCONAZOLE	Ketoconazole	531.44	4.21	69.06	1	CC(=O)N1CCN(CC1)c2ccc(OC[C@@H]3CO[C@](Cn4ccnc4)(O3)c5ccc(Cl)cc5Cl)cc2	1523693	IC50	=	1905.46	nM	5.72	pIC50	=	5.72				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
30 | CHEMBL1294	255477	CHEMBL1294	255477	QUINIDINE	Quinidine	324.42	3.17	45.59	0	COc1ccc2nccc([C@H](O)[C@H]3C[C@@H]4CCN3C[C@@H]4C=C)c2c1	1523674	IC50	=	323.59	nM	6.49	pIC50	=	6.49				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
31 | CHEMBL1107	152728	CHEMBL1107	152728	HALOFANTRINE	Halofantrine	500.43	8.64	23.47	2	CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c3cc(ccc13)C(F)(F)F	1523672	IC50	=	199.53	nM	6.7	pIC50	=	6.7				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
32 | CHEMBL2368925	1543376	CHEMBL2368925	1543376	DOLASETRON	Dolasetron	324.38	2.52	62.4	0	O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c4c[nH]c5ccccc45	1523834	IC50	=	12022.64	nM	4.92	pIC50	=	4.92				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
33 | CHEMBL54	3859	CHEMBL54	3859	HALOPERIDOL	Haloperidol	375.87	4.43	40.54	0	OC1(CCN(CCCC(=O)c2ccc(F)cc2)CC1)c3ccc(Cl)cc3	1523563	IC50	=	30.2	nM	7.52	pIC50	=	7.52				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
34 | CHEMBL45816	72035	CHEMBL45816	72035	MIBEFRADIL	Mibefradil	495.64	5.27	67.45	1	COCC(=O)O[C@]1(CCN(C)CCCc2nc3ccccc3[nH]2)CCc4cc(F)ccc4[C@@H]1C(C)C	1523686	IC50	=	1445.44	nM	5.84	pIC50	=	5.84				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
35 | CHEMBL289469	54547	CHEMBL289469	54547	GRANISETRON	Granisetron	312.42	2.32	50.16	0	CN1C2CCCC1CC(C2)NC(=O)c3nn(C)c4ccccc34	1523699	IC50	=	3715.35	nM	5.43	pIC50	=	5.43				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
36 | CHEMBL46	3183	CHEMBL46	3183	ONDANSETRON	Ondansetron	293.37	3.13	39.82	0	Cc1nccn1CC2CCc3c(C2=O)c4ccccc4n3C	1523683	IC50	=	812.83	nM	6.09	pIC50	=	6.09				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
37 | CHEMBL370805	333370	CHEMBL370805	333370	COCAINE	cocaine	303.36	1.87	55.84	0	COC(=O)[C@H]1[C@H](C[C@@H]2CC[C@H]1N2C)OC(=O)c3ccccc3	1523706	IC50	=	7244.36	nM	5.14	pIC50	=	5.14				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
38 | CHEMBL33	1795	CHEMBL33	1795	LEVOFLOXACIN	Levofloxacin	361.37	1.54	75.01	0	C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O	1523952	IC50	=	912010.84	nM		pIC50	=	3.04			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
39 | CHEMBL651	27576	CHEMBL651	27576	METHADONE	Methadone	309.45	4.29	20.31	0	CCC(=O)C(CC(C)N(C)C)(c1ccccc1)c2ccccc2	1523708	IC50	=	9772.37	nM	5.01	pIC50	=	5.01				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
40 | CHEMBL1108	152751	CHEMBL1108	152751	DROPERIDOL	Droperidol	379.44	3.68	58.1	0	Fc1ccc(cc1)C(=O)CCCN2CCC(=CC2)N3C(=O)Nc4ccccc34	1523564	IC50	=	32.36	nM	7.49	pIC50	=	7.49				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
41 | CHEMBL1729	557741	CHEMBL1729	557741	CISAPRIDE	cisapride	465.95	3.36	86.05	0	COC1CN(CCCOc2ccc(F)cc2)CCC1NC(=O)c3cc(Cl)c(N)cc3OC	1523565	IC50	=	39.81	nM	7.4	pIC50	=	7.4				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
42 | CHEMBL1741	570147	CHEMBL1741	570147	CLARITHROMYCIN	clarithromycin	747.96	2.44	182.91	2	CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]3O[C@H](C)C[C@@H]([C@H]3O)N(C)C)[C@@](C)(C[C@@H](C)C(=O)[C@H](C)[C@@H](O)[C@]1(C)O)OC	1523843	IC50	=	58884.37	nM	4.23	pIC50	=	4.23				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
43 | CHEMBL3	115	CHEMBL3	115	NICOTINE	Nicotine	162.24	1.85	16.13	0	CN1CCC[C@H]1c2cccnc2	1523950	IC50	=	245470.89	nM		pIC50	=	3.61			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
44 | CHEMBL416956	51162	CHEMBL416956	51162	MEFLOQUINE	Mefloquine	378.32	4.45	45.15	0	OC(C1CCCCN1)c2cc(nc3c(cccc23)C(F)(F)F)C(F)(F)F	1523705	IC50	=	5623.41	nM	5.25	pIC50	=	5.25				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
45 | CHEMBL23	1278	CHEMBL23	1278	DILTIAZEM	Diltiazem	414.53	3.37	59.08	0	COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@@H]2OC(=O)C	1523835	IC50	=	17378.01	nM	4.76	pIC50	=	4.76				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
46 | CHEMBL94454	152819	CHEMBL94454	152819	MIZOLASTINE	Mizolastine	432.5	3.41	70.05	0	CN(C1CCN(CC1)c2nc3ccccc3n2Cc4ccc(F)cc4)C5=NC=CC(=O)N5	1523680	IC50	=	436.52	nM	6.36	pIC50	=	6.36				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
47 | CHEMBL657	27629	CHEMBL657	27629	DIPHENHYDRAMINE	Diphenhydramine	255.36	3.35	12.47	0	CN(C)CCOC(c1ccccc1)c2ccccc2	1523838	IC50	=	26915.35	nM	4.57	pIC50	=	4.57				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
48 | CHEMBL2	97	CHEMBL2	97	PRAZOSIN	Prazosine	383.41	1.78	106.95	0	COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4	1523691	IC50	=	1584.89	nM	5.8	pIC50	=	5.8				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
49 | CHEMBL6966	1219	CHEMBL6966	1219	VERAPAMIL	Verapamil	454.61	5.09	63.95	1	COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2)cc1OC	1523571	IC50	=	141.25	nM	6.85	pIC50	=	6.85				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
50 | CHEMBL511142	446784	CHEMBL511142	446784	BUPRENORPHINE	Buprenorphine	467.65	4.41	62.16	0	CO[C@@]12CC[C@@]3(C[C@@H]1[C@](C)(O)C(C)(C)C)[C@H]4Cc5ccc(O)c6O[C@@H]2[C@]3(CCN4CC7CC7)c56	1523707	IC50	=	7585.78	nM	5.12	pIC50	=	5.12				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
51 | CHEMBL72	6243	CHEMBL72	6243	DESIPRAMINE	Desipramine	266.39	3.53	15.27	0	CNCCCN1c2ccccc2CCc3ccccc13	1523684	IC50	=	1380.38	nM	5.86	pIC50	=	5.86				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
52 | CHEMBL42	2261	CHEMBL42	2261	CLOZAPINE	clozapine	326.83	3.72	30.87	0	CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24	1523679	IC50	=	323.59	nM	6.49	pIC50	=	6.49				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
53 | CHEMBL1423	364141	CHEMBL1423	364141	PIMOZIDE	Pimozide	461.56	5.86	41.03	1	Fc1ccc(cc1)C(CCCN2CCC(CC2)N3C(=O)Nc4ccccc34)c5ccc(F)cc5	1523566	IC50	=	50.12	nM	7.3	pIC50	=	7.3				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
54 | CHEMBL327980	152611	CHEMBL327980	152611		E-4031	401.53	2.9	79.37	0	Cc1cccc(CCN2CCC(CC2)C(=O)c3ccc(NS(=O)(=O)C)cc3)n1	1523562	IC50	=	19.95	nM	7.7	pIC50	=	7.7				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
55 | CHEMBL1008	112651	CHEMBL1008	112651	BEPRIDIL	Bepridil	366.55	4.83	15.71	0	CC(C)COCC(CN(Cc1ccccc1)c2ccccc2)N3CCCC3	1523682	IC50	=	549.54	nM	6.26	pIC50	=	6.26				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
56 | CHEMBL41	2223	CHEMBL41	2223	FLUOXETINE	Fluoxetine	309.33	4.44	21.26	0	CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2	1523690	IC50	=	1513.56	nM	5.82	pIC50	=	5.82				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
57 | CHEMBL193	419596	CHEMBL193	419596	NIFEDIPINE	Nifedipine	346.34	2.18	107.77	0	COC(=O)C1=C(C)NC(=C(C1c2ccccc2[N+](=O)[O-])C(=O)OC)C	1523841	IC50	=	50118.72	nM	4.3	pIC50	=	4.3				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
58 | CHEMBL1514	421070	CHEMBL1514	421070	LEVOMETHADYL ACETATE	Laam	353.51	4.65	29.54	0	CC[C@H](OC(=O)C)C(C[C@H](C)N(C)C)(c1ccccc1)c2ccccc2	1523694	IC50	=	2187.76	nM	5.66	pIC50	=	5.66				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
59 | CHEMBL17157	19569	CHEMBL17157	19569	TERFENADINE	Terfenadine	471.69	6.45	43.7	1	CC(C)(C)c1ccc(cc1)C(O)CCCN2CCC(CC2)C(O)(c3ccccc3)c4ccccc4	1523671	IC50	=	199.53	nM	6.7	pIC50	=	6.7				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
60 | CHEMBL296419	65605	CHEMBL296419	65605	ASTEMIZOLE	Astemizole (table 1)	458.58	5.35	42.32	1	COc1ccc(CCN2CCC(CC2)Nc3nc4ccccc4n3Cc5ccc(F)cc5)cc1	1523554	IC50	=	10	nM	8	pIC50	=	8				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
61 | CHEMBL596	19915	CHEMBL596	19915	FENTANYL	Fentanyl	336.48	4.14	23.55	0	CCC(=O)N(C1CCN(CCc2ccccc2)CC1)c3ccccc3	1523692	IC50	=	1819.7	nM	5.74	pIC50	=	5.74				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
62 | CHEMBL998	110803	CHEMBL998	110803	LORATADINE	Loratadine	382.89	4.89	42.43	0	CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc4cccnc24)CC1	1523668	IC50	=	169.82	nM	6.77	pIC50	=	6.77				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
63 | CHEMBL505	10358	CHEMBL505	10358	CHLORPHENIRAMINE	chlorpheniramine	274.8	3.82	16.13	0	CN(C)CCC(c1ccc(Cl)cc1)c2ccccn2	1523837	IC50	=	20892.96	nM	4.68	pIC50	=	4.68				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
64 | CHEMBL70	6005	CHEMBL70	6005	MORPHINE	Morphine	285.34	1.2	52.93	0	CN1CC[C@]23[C@H]4Oc5c(O)ccc(C[C@@H]1[C@@H]2C=C[C@@H]4O)c35	1523954	IC50	=	1000000	nM		pIC50	=	3			Outside typical range	0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
65 | CHEMBL475534	453570	CHEMBL475534	453570	NITRENDIPINE	Nitrendipine	360.37	2.57	107.77	0	CCOC(=O)C1=C(C)NC(=C(C1c2cccc(c2)[N+](=O)[O-])C(=O)OC)C	1523709	IC50	=	10000	nM	5	pIC50	=	5				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
66 | CHEMBL850	65351	CHEMBL850	65351	SPARFLOXACIN	Sparfloxacin	392.41	2.08	100.59	0	C[C@@H]1CN(C[C@H](C)N1)c2c(F)c(N)c3C(=O)C(=CN(C4CC4)c3c2F)C(=O)O	1523836	IC50	=	18197.01	nM	4.74	pIC50	=	4.74				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
67 | CHEMBL192	410802	CHEMBL192	410802	SILDENAFIL	UK-92480, Sildenafil	474.59	1.61	113.42	0	CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4	1523697	IC50	=	3311.31	nM	5.48	pIC50	=	5.48				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
68 | CHEMBL1382	321964	CHEMBL1382	321964	TOLTERODINE	Tolterodine	325.5	5.34	23.47	1	CC(C)N(CC[C@H](c1ccccc1)c2cc(C)ccc2O)C(C)C	1523561	IC50	=	16.98	nM	7.77	pIC50	=	7.77				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
69 | CHEMBL305660	111060	CHEMBL305660	111060	EBASTINE	Ebastine	469.67	7.22	29.54	1	CC(C)(C)c1ccc(cc1)C(=O)CCCN2CCC(CC2)OC(c3ccccc3)c4ccccc4	1523695	IC50	=	3019.95	nM	5.52	pIC50	=	5.52				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
70 | CHEMBL32	1788	CHEMBL32	1788	MOXIFLOXACIN	Moxifloxacin	401.44	2.37	83.8	0	COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc4C(=O)C(=CN(C5CC5)c14)C(=O)O	1523947	IC50	=	128824.96	nM		pIC50	=	3.89			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
71 | CHEMBL1106	152610	CHEMBL1106	152610	EPINASTINE	Epinastine	249.32	2.47	41.62	0	NC1=NCC2N1c3ccccc3Cc4ccccc24	1523946	IC50	=	91201.08	nM	4.04	pIC50	=	4.04				0	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
72 | CHEMBL219916	364745	CHEMBL219916	364745	DOMPERIDONE	Domperidone	425.92	3.35	78.82	0	Clc1ccc2N(C3CCN(CCCN4C(=O)Nc5ccccc45)CC3)C(=O)Nc2c1	1523667	IC50	=	162.18	nM	6.79	pIC50	=	6.79				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
73 | CHEMBL123558	208679	CHEMBL123558	208679	AZIMILIDE	Azimilide	457.96	3.23	72.6	0	CN1CCN(CCCCN2C(=O)CN(\N=C\c3oc(cc3)c4ccc(Cl)cc4)C2=O)CC1	1523685	IC50	=	1412.54	nM	5.85	pIC50	=	5.85				1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
74 | CHEMBL8	241	CHEMBL8	241	CIPROFLOXACIN	Ciprofloxacin	331.35	1.58	74.57	0	OC(=O)C1=CN(C2CC2)c3cc(N4CCNCC4)c(F)cc3C1=O	1523953	IC50	=	954992.59	nM		pIC50	=	3.02			Outside typical range	1	BAO_0000199			307245	CHEMBL829152	B	Inhibitory concentration against potassium channel HERG	1	Scientific Literature				Expert	BAO_0000357	165	CHEMBL240	SINGLE PROTEIN	Q12809	HERG	Homo sapiens	8	Homologous protein			20472	CHEMBL1139598	15911273	Bioorg. Med. Chem. Lett.	2005	15	11	2886					
75 | 


--------------------------------------------------------------------------------
/notebooks/ch09_qsar.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# 構造活性相関（QSAR）の基礎\n",
  8 |     "\n",
  9 |     "## 効果ありなしの原因を考えてみる（分類問題）"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 17,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from rdkit import Chem, DataStructs\n",
 19 |     "from rdkit.Chem import AllChem, Draw\n",
 20 |     "from rdkit.Chem.Draw import IPythonConsole\n",
 21 |     "import numpy as np\n",
 22 |     "from sklearn.model_selection import train_test_split\n",
 23 |     "from sklearn.metrics import confusion_matrix, f1_score\n",
 24 |     "from sklearn.ensemble import RandomForestClassifier"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 2,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "mols = []\n",
 34 |     "labels = []\n",
 35 |     "with open(\"ch09_compounds.txt\") as f:\n",
 36 |     "    header = f.readline()\n",
 37 |     "    # Strip the line ending so the last column name can still be matched.\n",
 38 |     "    titles = header.rstrip(\"\\r\\n\").split(\"\\t\")\n",
 39 |     "    # list.index raises ValueError when a required column is missing.\n",
 40 |     "    smiles_index = titles.index(\"CANONICAL_SMILES\")\n",
 41 |     "    value_index = titles.index(\"STANDARD_VALUE\")\n",
 42 |     "    for l in f:\n",
 43 |     "        ls = l.rstrip(\"\\r\\n\").split(\"\\t\")\n",
 44 |     "        mol = Chem.MolFromSmiles(ls[smiles_index])\n",
 45 |     "        if mol is None:\n",
 46 |     "            # Unparsable SMILES: skip so mols and labels stay aligned.\n",
 47 |     "            continue\n",
 48 |     "        mols.append(mol)\n",
 49 |     "        val = float(ls[value_index])\n",
 50 |     "        # STANDARD_VALUE below 1000 is labelled POS, the rest NEG.\n",
 51 |     "        labels.append(\"POS\" if val < 1000 else \"NEG\")"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 5,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "labels = np.array(labels)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 7,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "fps = []\n",
 70 |     "for mol in mols:\n",
 71 |     "    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)\n",
 72 |     "    arr = np.zeros((1,))\n",
 73 |     "    DataStructs.ConvertToNumpyArray(fp, arr)\n",
 74 |     "    fps.append(arr)\n",
 75 |     "fps = np.array(fps)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 11,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "x_train, x_test, y_train, y_test = train_test_split(fps, labels)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 12,
 90 |    "metadata": {},
 91 |    "outputs": [
 92 |     {
 93 |      "data": {
 94 |       "text/plain": [
 95 |        "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
 96 |        "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
 97 |        "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
 98 |        "            min_samples_leaf=1, min_samples_split=2,\n",
 99 |        "            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
100 |        "            oob_score=False, random_state=None, verbose=0,\n",
101 |        "            warm_start=False)"
102 |       ]
103 |      },
104 |      "execution_count": 12,
105 |      "metadata": {},
106 |      "output_type": "execute_result"
107 |     }
108 |    ],
109 |    "source": [
110 |     "rf = RandomForestClassifier()\n",
111 |     "rf.fit(x_train, y_train)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 23,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "y_pred = rf.predict(x_test)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 24,
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "data": {
130 |       "text/plain": [
131 |        "array([[11,  1],\n",
132 |        "       [ 5,  2]])"
133 |       ]
134 |      },
135 |      "execution_count": 24,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "confusion_matrix(y_test, y_pred)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 26,
147 |    "metadata": {},
148 |    "outputs": [
149 |     {
150 |      "data": {
151 |       "text/plain": [
152 |        "0.4"
153 |       ]
154 |      },
155 |      "execution_count": 26,
156 |      "metadata": {},
157 |      "output_type": "execute_result"
158 |     }
159 |    ],
160 |    "source": [
161 |     "f1_score(y_test, y_pred, pos_label=\"POS\" )"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "   ## 薬の効き目を予測しよう（回帰問題）"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 40,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "from sklearn.ensemble import RandomForestRegressor\n",
178 |     "from sklearn.metrics import r2_score\n",
179 |     "from math import log10"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 35,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "pIC50s = []\n",
189 |     "with open(\"ch09_compounds.txt\") as f:\n",
190 |     "    header = f.readline()\n",
191 |     "    for i, title in enumerate(header.split(\"\\t\")):\n",
192 |     "        if title == \"STANDARD_VALUE\":\n",
193 |     "            value_index = i\n",
194 |     "    for l in f:\n",
195 |     "        ls = l.split(\"\\t\")\n",
196 |     "        val = float(ls[value_index])\n",
197 |     "        pIC50 = 9 - log10(val)\n",
198 |     "        pIC50s.append(pIC50)\n",
199 |     "\n",
200 |     "pIC50s = np.array(pIC50s)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 37,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "x_train, x_test, y_train, y_test = train_test_split(fps, pIC50s)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 38,
215 |    "metadata": {},
216 |    "outputs": [
217 |     {
218 |      "data": {
219 |       "text/plain": [
220 |        "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
221 |        "           max_features='auto', max_leaf_nodes=None,\n",
222 |        "           min_impurity_decrease=0.0, min_impurity_split=None,\n",
223 |        "           min_samples_leaf=1, min_samples_split=2,\n",
224 |        "           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
225 |        "           oob_score=False, random_state=None, verbose=0, warm_start=False)"
226 |       ]
227 |      },
228 |      "execution_count": 38,
229 |      "metadata": {},
230 |      "output_type": "execute_result"
231 |     }
232 |    ],
233 |    "source": [
234 |     "rf = RandomForestRegressor()\n",
235 |     "rf.fit(x_train, y_train)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 39,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "y_pred = rf.predict(x_test)"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 41,
250 |    "metadata": {},
251 |    "outputs": [
252 |     {
253 |      "data": {
254 |       "text/plain": [
255 |        "0.5213586033887229"
256 |       ]
257 |      },
258 |      "execution_count": 41,
259 |      "metadata": {},
260 |      "output_type": "execute_result"
261 |     }
262 |    ],
263 |    "source": [
264 |     "r2_score(y_test, y_pred)"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": []
273 |   }
274 |  ],
275 |  "metadata": {
276 |   "kernelspec": {
277 |    "display_name": "Python 3",
278 |    "language": "python",
279 |    "name": "python3"
280 |   },
281 |   "language_info": {
282 |    "codemirror_mode": {
283 |     "name": "ipython",
284 |     "version": 3
285 |    },
286 |    "file_extension": ".py",
287 |    "mimetype": "text/x-python",
288 |    "name": "python",
289 |    "nbconvert_exporter": "python",
290 |    "pygments_lexer": "ipython3",
291 |    "version": "3.6.8"
292 |   }
293 |  },
294 |  "nbformat": 4,
295 |  "nbformat_minor": 2
296 | }
297 | 


--------------------------------------------------------------------------------
/pdf/py4chemoinformatics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iwatobipen/py4chemoinformatics/8b8c2a2362d6d9cbb7e490e697bb9b8ad6e4223b/pdf/py4chemoinformatics.pdf


--------------------------------------------------------------------------------
/py4c-theme.yml:
--------------------------------------------------------------------------------
  1 | font:
  2 |   catalog:
  3 |     # Each family maps style names to .ttf files. Noto Serif (declared further down) supports Latin, Latin-1 Supplement, Latin Extended-A, Greek, Cyrillic, Vietnamese & an assortment of symbols
  4 |     AozoraMincho:  # no font_family key in this file refers to this family
  5 |       normal: AozoraMinchoRegular.ttf
  6 |       italic: AozoraMinchoMedium.ttf  # the Medium file stands in for italic
  7 |       bold: AozoraMinchoHeavy.ttf
  8 |       bold_italic: AozoraMinchoBlack.ttf  # the Black file stands in for bold italic
  9 |     RictyDiminished:  # no font_family key in this file refers to this family
 10 |       normal: RictyDiminished-Regular.ttf
 11 |       italic: RictyDiminished-Oblique.ttf
 12 |       bold: RictyDiminished-Bold.ttf
 13 |       bold_italic: RictyDiminished-BoldOblique.ttf
 14 |     Noto Serif:  # selected by base.font_family and, through it, by heading.font_family
 15 |       normal: notoserif-regular-subset.ttf
 16 |       bold: notoserif-bold-subset.ttf
 17 |       italic: notoserif-italic-subset.ttf
 18 |       bold_italic: notoserif-bold_italic-subset.ttf
 19 |     # M+ 1mn supports ASCII and the circled numbers used for conums
 20 |     M+ 1mn:  # selected by literal.font_family, code.font_family and conum.font_family
 21 |       normal: mplus1mn-regular-ascii-conums.ttf
 22 |       bold: mplus1mn-bold-ascii.ttf
 23 |       italic: mplus1mn-italic-ascii.ttf
 24 |       bold_italic: mplus1mn-bold_italic-ascii.ttf
 25 |     # M+ 1p supports Latin, Latin-1 Supplement, Latin Extended, Greek, Cyrillic, Vietnamese, Japanese & an assortment of symbols
 26 |     # It also provides arrows for ->, <-, => and <= replacements in case these glyphs are missing from font
 27 |     M+ 1p Fallback:  # all four styles map to the same regular file
 28 |       normal: mplus1p-regular-fallback.ttf
 29 |       bold: mplus1p-regular-fallback.ttf
 30 |       italic: mplus1p-regular-fallback.ttf
 31 |       bold_italic: mplus1p-regular-fallback.ttf
 32 |   fallbacks:  # the only fallback family is the M+ 1p Fallback entry declared above
 33 |     - M+ 1p Fallback
 34 | page:  # page geometry; background_color is reused by table.background_color
 35 |   background_color: ffffff
 36 |   layout: portrait
 37 |   margin: [0.5in, 0.67in, 0.67in, 0.67in]
 38 |   # margin_inner and margin_outer keys are used for recto/verso print margins when media=prepress
 39 |   margin_inner: 0.75in
 40 |   margin_outer: 0.59in
 41 |   size: A4
 42 | base:
 43 |   align: justify
 44 |   # color as hex string (leading # is optional)
 45 |   font_color: 333333
 46 |   # color as RGB array
 47 |   #font_color: [51, 51, 51]
 48 |   # color as CMYK array (approximated)
 49 |   #font_color: [0, 0, 0, 0.92]
 50 |   #font_color: [0, 0, 0, 92%]
 51 |   font_family: Noto Serif
 52 |   # choose one of these font_size/line_height_length combinations; keep exactly one of each key active, since repeating a key makes it a duplicate mapping key
 53 |   #font_size: 10
 54 |   #line_height_length: 15
 55 |   #font_size: 11.25
 56 |   #line_height_length: 18
 57 |   #font_size: 11.2
 58 |   #line_height_length: 16
 59 |   font_size: 10.5
 60 |   #line_height_length: 15
 61 |   # correct line height for Noto Serif metrics
 62 |   line_height_length: 12
 63 |   #font_size: 11.25
 64 |   #line_height_length: 18
 65 |   line_height: $base_line_height_length / $base_font_size  # ratio of the two active values above
 66 |   font_size_large: round($base_font_size * 1.25)
 67 |   font_size_small: round($base_font_size * 0.85)
 68 |   font_size_min: $base_font_size * 0.75
 69 |   font_style: normal
 70 |   border_color: eeeeee
 71 |   border_radius: 4
 72 |   border_width: 0.5
 73 | # FIXME vertical_rhythm is weird; we should think in terms of ems
 74 | #vertical_rhythm: $base_line_height_length * 2 / 3
 75 | # correct line height for Noto Serif metrics (comes with built-in line height)
 76 | vertical_rhythm: $base_line_height_length  # spacing unit that the margins and paddings below are multiples of
 77 | horizontal_rhythm: $base_line_height_length  # same value as vertical_rhythm
 78 | # QUESTION should vertical_spacing be block_spacing instead?
 79 | vertical_spacing: $vertical_rhythm  # copy of vertical_rhythm; no other key in this file references it
 80 | link:  # link text color
 81 |   font_color: 428bca
 82 | # literal is currently used for inline monospaced in prose and table cells
 83 | literal:  # code.font_family and conum.font_color are taken from these values
 84 |   font_color: b12146
 85 |   font_family: M+ 1mn
 86 | menu_caret_content: " <font size=\"1.15em\"><color rgb=\"b12146\">\u203a</color></font> "  # U+203A caret, colored with the same hex value as literal.font_color
 87 | heading:  # bold, left-aligned headings; color and family come from base
 88 |   align: left
 89 |   #font_color: 181818
 90 |   font_color: $base_font_color
 91 |   font_family: $base_font_family
 92 |   font_style: bold
 93 |   # h1 is used for part titles (book doctype only)
 94 |   h1_font_size: floor($base_font_size * 2.6)
 95 |   # h2 is used for chapter titles (book doctype only)
 96 |   h2_font_size: floor($base_font_size * 2.15)
 97 |   h3_font_size: round($base_font_size * 1.7)
 98 |   h4_font_size: $base_font_size_large  # h4, h5 and h6 reuse the base large, normal and small sizes
 99 |   h5_font_size: $base_font_size
100 |   h6_font_size: $base_font_size_small
101 |   #line_height: 1.4
102 |   # correct line height for Noto Serif metrics (comes with built-in line height)
103 |   line_height: 1
104 |   margin_top: $vertical_rhythm * 0.4  # less space above a heading than below it (0.4 vs 0.9 of the rhythm)
105 |   margin_bottom: $vertical_rhythm * 0.9
106 | title_page:  # right-aligned title page with logo, title, subtitle, authors and revision entries
107 |   align: right
108 |   logo:
109 |     top: 30%
110 |   title:
111 |     top: 80%
112 |     font_size: $heading_h1_font_size  # same size as heading.h1_font_size
113 |     font_color: 999999
114 |     line_height: 0.9
115 |   subtitle:
116 |     font_size: $heading_h3_font_size  # same size as heading.h3_font_size
117 |     font_style: bold_italic
118 |     line_height: 1
119 |   authors:
120 |     margin_top: $base_font_size * 1.25  # authors and revision use the same top margin
121 |     font_size: $base_font_size_large
122 |     font_color: 181818
123 |   revision:
124 |     margin_top: $base_font_size * 1.25
125 | block:  # margins referenced as $block_margin_top / $block_margin_bottom by prose and blockquote
126 |   margin_top: 0
127 |   margin_bottom: $vertical_rhythm
128 | caption:  # italic, left-aligned captions at 95% of the base font size
129 |   align: left
130 |   font_size: $base_font_size * 0.95
131 |   font_style: italic
132 |   # FIXME perhaps set line_height instead of / in addition to margins?
133 |   margin_inside: $vertical_rhythm / 3
134 |   #margin_inside: $vertical_rhythm / 4
135 |   margin_outside: 0
136 | lead:  # abstract takes its font size and line height from these two values
137 |   font_size: $base_font_size_large
138 |   line_height: 1.4
139 | abstract:  # italic body with a bold first line
140 |   font_color: 5c6266
141 |   font_size: $lead_font_size
142 |   line_height: $lead_line_height
143 |   font_style: italic
144 |   first_line_font_style: bold
145 |   title:  # centered title styled from the heading values at the h4 size
146 |     align: center
147 |     font_color: $heading_font_color
148 |     font_family: $heading_font_family
149 |     font_size: $heading_h4_font_size
150 |     font_style: $heading_font_style
151 | admonition:  # column rule color and width come from the base border values
152 |   column_rule_color: $base_border_color
153 |   column_rule_width: $base_border_width
154 |   padding: [0, $horizontal_rhythm, 0, $horizontal_rhythm]  # zero in the first and third slots, one horizontal rhythm in the second and fourth
155 |   #icon:
156 |   #  tip:
157 |   #    name: fa-lightbulb-o
158 |   #    stroke_color: 111111
159 |   #    size: 24
160 |   label:  # labels are rendered bold and uppercased
161 |     text_transform: uppercase
162 |     font_style: bold
163 | blockquote:  # large-size quote text with a border 5 units wide
164 |   font_color: $base_font_color
165 |   font_size: $base_font_size_large
166 |   border_color: $base_border_color
167 |   border_width: 5
168 |   # FIXME disable negative padding bottom once margin collapsing is implemented (third entry is -0.75 x $block_margin_bottom; the last entry adds half of border_width)
169 |   padding: [0, $horizontal_rhythm, $block_margin_bottom * -0.75, $horizontal_rhythm + $blockquote_border_width / 2]
170 |   cite_font_size: $base_font_size_small
171 |   cite_font_color: 999999
172 | # code is used for source blocks (perhaps change to source or listing?)
173 | code:
174 |   font_color: $base_font_color
175 |   font_family: $literal_font_family
176 |   font_size: ceil($base_font_size)  # base size rounded up to a whole number
177 |   padding: $code_font_size  # padding equals the font_size computed on the previous line
178 |   line_height: 1.25
179 |   # line_gap is an experimental property to control how a background color is applied to an inline block element
180 |   line_gap: 3.8
181 |   background_color: f5f5f5
182 |   border_color: cccccc
183 |   border_radius: $base_border_radius
184 |   border_width: 0.75
185 | conum:  # callout numbers: M+ 1mn glyphs in the literal color at the base font size
186 |   font_family: M+ 1mn
187 |   font_color: $literal_font_color
188 |   font_size: $base_font_size
189 |   line_height: 4 / 3
190 | example:  # border color and radius come from the base values; border_width is set to 0.75 here
191 |   border_color: $base_border_color
192 |   border_radius: $base_border_radius
193 |   border_width: 0.75
194 |   background_color: ffffff
195 |   # FIXME reenable padding bottom once margin collapsing is implemented (the third entry is the zeroed bottom padding)
196 |   padding: [$vertical_rhythm, $horizontal_rhythm, 0, $horizontal_rhythm]
197 | image:
198 |   align: left  # the only image setting in this theme
199 | prose:  # prose margins mirror the block margins
200 |   margin_top: $block_margin_top
201 |   margin_bottom: $block_margin_bottom
202 | sidebar:  # border radius and width come from the base values; colors are set locally
203 |   background_color: eeeeee
204 |   border_color: e1e1e1
205 |   border_radius: $base_border_radius
206 |   border_width: $base_border_width
207 |   # FIXME reenable padding bottom once margin collapsing is implemented (all non-zero entries, including the second and fourth, are multiples of $vertical_rhythm)
208 |   padding: [$vertical_rhythm, $vertical_rhythm * 1.25, 0, $vertical_rhythm * 1.25]
209 |   title:  # same five title settings as abstract.title
210 |     align: center
211 |     font_color: $heading_font_color
212 |     font_family: $heading_font_family
213 |     font_size: $heading_h4_font_size
214 |     font_style: $heading_font_style
215 | thematic_break:  # solid rule using the base border color and width
216 |   border_color: $base_border_color
217 |   border_style: solid
218 |   border_width: $base_border_width
219 |   margin_top: $vertical_rhythm * 0.5  # more space below the rule than above it (1.5 vs 0.5 of the rhythm)
220 |   margin_bottom: $vertical_rhythm * 1.5
221 | description_list:  # bold terms; spacing and indent are fractions/multiples of the rhythm values
222 |   term_font_style: bold
223 |   term_spacing: $vertical_rhythm / 4
224 |   description_indent: $horizontal_rhythm * 1.25
225 | outline_list:
226 |   indent: $horizontal_rhythm * 1.5  # larger multiple than description_list.description_indent (1.5 vs 1.25)
227 |   #marker_font_color: 404040
228 |   # NOTE outline_list_item_spacing applies to list items that do not have complex content
229 |   item_spacing: $vertical_rhythm / 2
230 | table:  # table background follows the page background; border width follows the base border width
231 |   background_color: $page_background_color
232 |   #head_background_color: <hex value>
233 |   #head_font_color: $base_font_color
234 |   head_font_style: bold
235 |   #body_background_color: <hex value>
236 |   body_stripe_background_color: f9f9f9
237 |   foot_background_color: f0f0f0
238 |   border_color: dddddd
239 |   border_width: $base_border_width
240 |   cell_padding: 3
241 | toc:  # table of contents; the dot leader content and levels are left commented out
242 |   indent: $horizontal_rhythm
243 |   line_height: 1.4
244 |   dot_leader:
245 |     #content: ". "
246 |     font_color: a9a9a9
247 |     #levels: 2 3
248 | # NOTE in addition to footer, header is also supported
249 | footer:  # running footer that shows only the page number: right column on recto, left column on verso
250 |   font_size: $base_font_size_small
251 |   # NOTE if background_color is set, background and border will span width of page
252 |   border_color: dddddd
253 |   border_width: 0.25
254 |   height: $base_line_height_length * 2.5
255 |   line_height: 1
256 |   padding: [$base_line_height_length / 2, 1, 0, 1]
257 |   vertical_align: top
258 |   #image_vertical_align: <alignment> or <number>
259 |   # additional attributes for content:
260 |   # * {page-count}
261 |   # * {page-number}
262 |   # * {document-title}
263 |   # * {document-subtitle}
264 |   # * {chapter-title}
265 |   # * {section-title}
266 |   # * {section-or-chapter-title}
267 |   recto:
268 |     #columns: "<50% =0% >50%"
269 |     right:
270 |       content: '{page-number}'
271 |       #content: '{section-or-chapter-title} | {page-number}'
272 |       #content: '{document-title} | {page-number}'
273 |     #center:
274 |     #  content: '{page-number}'
275 |   verso:
276 |     #columns: $footer_recto_columns
277 |     left:
278 |       content: $footer_recto_right_content  # reuses the recto right-column content defined above
279 |       #content: '{page-number} | {chapter-title}'
280 |     #center:
281 |     #  content: '{page-number}'
282 | 


--------------------------------------------------------------------------------