├── .gitignore ├── LICENSE ├── README.md ├── cdk_pywrapper ├── __init__.py ├── cdk │ └── cdk_bridge.java ├── cdk_pywrapper.py ├── chemlib.py ├── config.py └── tests │ ├── __init__.py │ └── cdk_pywrapper_test.py ├── requirements.txt ├── setup.cfg ├── setup.py └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | venv/ 15 | venv2/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | .gitignore 96 | .idea/ 97 | VERSION 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 
19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. 
This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 
90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 
121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. 
You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 
184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 
217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <https://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <https://www.gnu.org/licenses/>.
662 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python Wrapper for the Chemistry Development Kit 2 | 3 | ### tl;dr 4 | * A Python wrapper for the CDK (which is written in Java) 5 | * Primary purpose: 6 | * Generate diverse chemical compound identifiers (SMILES, InChI) 7 | * Inter-convert between these identifiers 8 | * Fully compatible with Python 3.x 9 | 10 | ### Motivation 11 | The chemistry world only has a small number of open tools, e.g. [OpenBabel](http://openbabel.org) and the 12 | [Chemistry Development Kit](http://cdk.sourceforge.net) ([github](https://github.com/cdk)). 13 | 14 | I have been using OpenBabel for some time now, and although it is a great tool offering many options, 15 | I found several issues which make it hard to use: 16 | * Generating InChI (keys) from SMILES often either does not work or struggles with stereochemistry. 17 | * InChI cannot be used as input format. 18 | 19 | ### Installation 20 | 21 | ```bash 22 | git clone https://github.com/sebotic/cdk_pywrapper.git 23 | cd cdk_pywrapper 24 | 25 | pip install . 26 | 27 | ``` 28 | 29 | This will install the package on your local system, download the CDK, and build cdk_bridge.java. 30 | After that, cdk_pywrapper should be ready to use, as in the example below. 31 | 32 | Don't forget to use e.g. sudo for a global installation or pip3 for Python 3. 33 | 34 | I will also host this on PyPI soon, so no repo cloning will be required. I have tested it on Linux and macOS, but I am not sure whether it works on Windows.
35 | 36 | ### Example 37 | 38 | ```python 39 | from cdk_pywrapper.cdk_pywrapper import Compound 40 | 41 | smiles = 'CCN1C2=CC=CC=C2SC1=CC=CC=CC3=[N+](C4=CC=CC=C4S3)CC.[I-]' 42 | cmpnd = Compound(compound_string=smiles, identifier_type='smiles') 43 | ikey = cmpnd.get_inchi_key() 44 | print(ikey) 45 | 46 | ``` 47 | Output: 'MNQDKWZEUULFPX-UHFFFAOYSA-M' 48 | 49 | 50 | -------------------------------------------------------------------------------- /cdk_pywrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # import cdk_pywrapper.cdk_pywrapper 2 | # import cdk_pywrapper.config 3 | 4 | gw = 'twest' 5 | 6 | ade = 'maose' -------------------------------------------------------------------------------- /cdk_pywrapper/cdk/cdk_bridge.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A py4j bridge for the CDK 3 | * Also has a class for substructure search and SVG xml generation 4 | * Copyright 2018 Sebastian Burgstaller-Muehlbacher 5 | * Licensed under AGPLv3 6 | */ 7 | 8 | import py4j.GatewayServer; 9 | import org.openscience.cdk.*; 10 | import org.openscience.cdk.DefaultChemObjectBuilder; 11 | import org.openscience.cdk.interfaces.IChemObjectBuilder; 12 | import org.openscience.cdk.interfaces.IAtomContainer; 13 | import org.openscience.cdk.interfaces.IChemObject; 14 | 15 | import org.openscience.cdk.smiles.SmilesParser; 16 | import org.openscience.cdk.smiles.SmiFlavor; 17 | import org.openscience.cdk.smiles.SmilesGenerator; 18 | 19 | import org.openscience.cdk.exception.InvalidSmilesException; 20 | import org.openscience.cdk.exception.CDKException; 21 | import org.openscience.cdk.smiles.smarts.SmartsPattern; 22 | import org.openscience.cdk.isomorphism.Pattern; 23 | import org.openscience.cdk.isomorphism.Mappings; 24 | import org.openscience.cdk.depict.DepictionGenerator; 25 | 26 | import org.openscience.cdk.aromaticity.Aromaticity; 27 | import 
org.openscience.cdk.aromaticity.ElectronDonation; 28 | import org.openscience.cdk.graph.Cycles; 29 | import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; 30 | 31 | import java.util.*; 32 | import java.io.IOException; 33 | import java.awt.Color; 34 | import java.util.concurrent.ConcurrentHashMap; 35 | 36 | class CDKBridge { 37 | 38 | public static void main(String[] args) { 39 | CDKBridge app = new CDKBridge(); 40 | GatewayServer server = new GatewayServer(app); 41 | server.start(); 42 | System.out.println("Server process started sucessfully"); 43 | } 44 | } 45 | 46 | class SearchHandler { 47 | 48 | ConcurrentHashMap moleculeContainers; 49 | Pattern pattern; 50 | int totalCount = 0; 51 | 52 | public SearchHandler() { 53 | this.moleculeContainers = new ConcurrentHashMap(); 54 | // this.buildSubstructureIndex(molecules); 55 | } 56 | 57 | 58 | 59 | public String getSVG(IAtomContainer c, Iterable substructures) { 60 | Color color = Color.orange; 61 | 62 | DepictionGenerator dg = new DepictionGenerator() 63 | .withHighlight(substructures, color) 64 | .withAtomColors() 65 | .withOuterGlowHighlight(4.0); 66 | 67 | try { 68 | return dg.depict(c).toSvgStr(); 69 | 70 | } catch (CDKException e) { 71 | System.err.println(e.getMessage()); 72 | return ""; 73 | } 74 | } 75 | 76 | public ArrayList searchPattern(String p, HashMap mols) { 77 | 78 | ConcurrentHashMap molecules = new ConcurrentHashMap<>(mols); 79 | IChemObjectBuilder builder = DefaultChemObjectBuilder.getInstance(); 80 | SmilesParser parser = new SmilesParser(builder); 81 | 82 | 83 | try { 84 | IAtomContainer patternMol = parser.parseSmiles(p); 85 | Aromaticity aromaticity = new Aromaticity(ElectronDonation.daylight(), 86 | Cycles.all()); 87 | 88 | aromaticity.apply(patternMol); 89 | 90 | patternMol = AtomContainerManipulator.copyAndSuppressedHydrogens(patternMol); 91 | 92 | SmilesGenerator sg = new SmilesGenerator(SmiFlavor.UseAromaticSymbols); 93 | p = sg.create(patternMol); 94 | 95 | 96 | } catch 
(InvalidSmilesException e) { 97 | System.err.println(e.getMessage()); 98 | } catch (CDKException e) { 99 | 100 | } 101 | 102 | try { 103 | this.pattern = SmartsPattern.create(p); 104 | } catch (IOException e) { 105 | System.err.println(e.getMessage()); 106 | 107 | return new ArrayList(); 108 | } 109 | 110 | ConcurrentHashMap ma = new ConcurrentHashMap<>(); 111 | 112 | molecules.forEach(1, (k, v) -> { 113 | if (this.totalCount < 200) { 114 | try { 115 | 116 | IAtomContainer ac = parser.parseSmiles(v); 117 | 118 | Mappings mappings = pattern.matchAll(ac); 119 | int match_count = mappings.countUnique(); 120 | 121 | if (match_count > 0) { 122 | Iterable substructures = mappings.toChemObjects(); 123 | String svg = this.getSVG(ac, substructures); 124 | 125 | ArrayList tmp = new ArrayList(3); 126 | tmp.add(0, k); 127 | tmp.add(1, String.valueOf(match_count)); 128 | tmp.add(2, svg); 129 | ma.put(k, tmp); 130 | 131 | this.totalCount += 1; 132 | } 133 | 134 | 135 | } catch (InvalidSmilesException e) { 136 | System.err.println(e.getMessage()); 137 | } 138 | } 139 | } 140 | ); 141 | 142 | 143 | ArrayList matches = new ArrayList(500); 144 | for (Map.Entry entry : ma.entrySet()) { 145 | matches.add(entry.getValue()); 146 | 147 | } 148 | 149 | // try { 150 | // Pattern pattern = SmartsPattern.create(p); 151 | // 152 | // int totalCount = 0; 153 | // for (Map.Entry entry : moleculeContainers.entrySet()) { 154 | // String key = entry.getKey(); 155 | // IAtomContainer ac = entry.getValue(); 156 | // Mappings mappings = pattern.matchAll(ac); 157 | // int match_count = mappings.countUnique(); 158 | // 159 | // if (match_count > 0) { 160 | // Iterable substructures = mappings.toChemObjects(); 161 | // String svg = this.getSVG(ac, substructures); 162 | // 163 | // ArrayList tmp = new ArrayList(3); 164 | // tmp.add(0, key); 165 | // tmp.add(1, String.valueOf(match_count)); 166 | // tmp.add(2, svg); 167 | // matches.add(tmp); 168 | // 169 | // totalCount += 1; 170 | // } 171 | // 172 | // 
if (totalCount > 200) { 173 | // return matches; 174 | // } 175 | // } 176 | // 177 | // } catch (IOException e) { 178 | // System.err.println(e.getMessage()); 179 | // } 180 | 181 | return matches; 182 | } 183 | } 184 | 185 | -------------------------------------------------------------------------------- /cdk_pywrapper/cdk_pywrapper.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | import os 5 | import atexit 6 | import platform 7 | import copy 8 | import psutil 9 | 10 | import py4j 11 | from py4j.java_gateway import JavaGateway, GatewayParameters 12 | from py4j.java_collections import SetConverter, MapConverter, ListConverter 13 | from py4j.protocol import Py4JJavaError 14 | 15 | # import cdk_pywrapper.config as config 16 | import cdk_pywrapper 17 | print(cdk_pywrapper.__path__) 18 | 19 | # make sure host paths are set correctly, 20 | # TODO: test if this can reasonably be replace by finding full path using 'which' shell command 21 | host_os = platform.system() 22 | ps_path = 'ps' 23 | java_path = 'java' 24 | grep_path = 'grep' 25 | 26 | cp_sep = ':' 27 | 28 | if host_os == 'Darwin': 29 | cdk_path = os.path.join(*cdk_pywrapper.__path__[0].split('/')[:-4]) 30 | cdk_jar_path = os.path.join('/', cdk_path, 'share', 'cdk') 31 | 32 | py4j_path = os.path.join(*py4j.__path__[0].split('/')[:-4]) 33 | py4j_jar_path = os.path.join('/', py4j_path, 'share', 'py4j', 'py4j' + py4j.__version__ + '.jar') 34 | 35 | ps_path = '/bin/ps' 36 | java_path = '/usr/bin/java' 37 | grep_path = '/usr/bin/grep' 38 | elif host_os == 'Linux': 39 | cdk_path = os.path.join(*cdk_pywrapper.__path__[0].split('/')[:-4]) 40 | cdk_jar_path = os.path.join('/', cdk_path, 'share', 'cdk') 41 | 42 | py4j_path = os.path.join(*py4j.__path__[0].split('/')[:-4]) 43 | py4j_jar_path = os.path.join('/', py4j_path, 'share', 'py4j', 'py4j' + py4j.__version__ + '.jar') 44 | 45 | ps_path = '/usr/bin/ps' 46 | java_path = 
'/usr/bin/java' 47 | grep_path = '/usr/bin/grep' 48 | elif host_os == 'Windows': 49 | cp_sep = ';' 50 | drive, path = os.path.splitdrive(cdk_pywrapper.__path__[0]) 51 | cdk_path = os.path.join(drive + '\\', *path.split('\\')[:-3]) 52 | cdk_jar_path = os.path.join(cdk_path, 'share', 'cdk') 53 | 54 | drive, path = os.path.splitdrive(py4j.__path__[0]) 55 | py4j_path = os.path.join(drive + '\\', *path.split('\\')[:-3]) 56 | py4j_jar_path = os.path.join(py4j_path, 'share', 'py4j', 'py4j' + py4j.__version__ + '.jar') 57 | 58 | print(cdk_path) 59 | print(cdk_jar_path) 60 | print(py4j_path) 61 | print(py4j_jar_path) 62 | 63 | # from py4j.clientserver import ClientServer, JavaParameters, PythonParameters 64 | 65 | # set dev classpaths 66 | if not __debug__: 67 | cdk_jar_path = os.path.join('.', 'cdk') 68 | 69 | __author__ = 'Sebastian Burgstaller-Muehlbacher' 70 | __license__ = 'AGPLv3' 71 | __copyright__ = 'Sebastian Burgstaller-Muehlbacher' 72 | 73 | 74 | server_process_running = False 75 | # with subprocess.Popen(['{} aux | {} CDK'.format(ps_path, grep_path)], shell=True, stdout=subprocess.PIPE) as proc: 76 | # line = proc.stdout.read() 77 | # print(line) 78 | # if 'CDKBridge' in str(line): 79 | # print('process running') 80 | # server_process_running = True 81 | 82 | for proc in psutil.process_iter(): 83 | pinfo = proc.as_dict(attrs=['pid', 'name', 'username', 'cmdline']) 84 | if 'cmdline' in pinfo and pinfo['cmdline'] and 'CDKBridge' in pinfo['cmdline']: 85 | server_process_running = True 86 | print('Server process already running:', server_process_running) 87 | 88 | 89 | # if not any([True if 'CDKBridge' in p.cmdline() else False for p in psutil.process_iter()]): 90 | if not server_process_running: 91 | # compile and start py4j server 92 | # print(os.getcwd()) 93 | # subprocess.check_call(["javac -cp '{}:.{}' ../cdk/cdk_bridge.java".format(py4j_path, cdk_path)], shell=True) 94 | 95 | # subprocess.check_call(["javac -cp '{}:{}' 
../cdk_pywrapper/cdk/cdk_bridge.java".format(py4j_jar_path, 96 | # '../cdk_pywrapper/cdk/cdk-2.1.1.jar')], shell=True) 97 | # # print('compiled sucessfully') 98 | print('starting server process') 99 | # p = subprocess.Popen(['{} -cp {};{};{}\\ CDKBridge'.format(java_path, py4j_jar_path, 100 | # os.path.join(cdk_jar_path, 'cdk-2.2.jar'), 101 | # cdk_jar_path)], shell=True) 102 | 103 | if host_os == 'Linux' or host_os == 'Darwin': 104 | p = subprocess.Popen([java_path + 105 | ' -cp ' + 106 | ' {}:{}:{} '.format(py4j_jar_path, 107 | os.path.join(cdk_jar_path, 'cdk-2.2.jar'), 108 | cdk_jar_path) + 109 | ' CDKBridge'], 110 | shell=True) 111 | 112 | elif host_os == 'Windows': 113 | p = subprocess.Popen([java_path, 114 | '-cp', 115 | '{}{}{}{}{}\\'.format(py4j_jar_path, 116 | cp_sep, 117 | os.path.join(cdk_jar_path, 'cdk-2.2.jar'), 118 | cp_sep, 119 | cdk_jar_path), 120 | 'CDKBridge'], 121 | shell=True) 122 | 123 | # wait 5 sec to start up JVM and server 124 | time.sleep(5) 125 | 126 | # connect to the JVM 127 | # gateway = JavaGateway(gateway_parameters=GatewayParameters(auto_convert=True)) 128 | # gateway = ClientServer( 129 | # java_parameters=JavaParameters(), 130 | # python_parameters=PythonParameters()) 131 | 132 | 133 | # shorten paths 134 | # cdk = gateway.jvm.org.openscience.cdk 135 | # java = gateway.jvm.java 136 | # javax = gateway.jvm.javax 137 | 138 | # map exceptions 139 | # InvalidSmilesException = cdk.exception.InvalidSmilesException 140 | # CDKException = cdk.exception.CDKException 141 | # NullPointerException = java.lang.NullPointerException 142 | 143 | gateway = JavaGateway(gateway_parameters=GatewayParameters(auto_convert=True)) 144 | 145 | 146 | # make sure the Java gateway server is shut down at exit of Python, but don't shut down if it has already been running 147 | @atexit.register 148 | def cleanup_gateway(): 149 | if not server_process_running: 150 | gateway.shutdown() 151 | 152 | 153 | def search_substructure(pattern, molecules): 154 | if 
host_os == 'Linux' or host_os == 'Darwin': 155 | g = JavaGateway.launch_gateway(classpath="{}:{}:{}/".format(py4j_jar_path, 156 | os.path.join(cdk_jar_path, 'cdk-2.2.jar'), 157 | cdk_jar_path), java_path=java_path) 158 | elif host_os == 'Windows': 159 | g = JavaGateway.launch_gateway(classpath="{};{};{}\\".format(py4j_jar_path, 160 | os.path.join(cdk_jar_path, 'cdk-2.2.jar'), 161 | cdk_jar_path), java_path='java') 162 | 163 | # search_handler = g.jvm.SearchHandler(MapConverter().convert(molecules, g._gateway_client)) 164 | search_handler = g.jvm.SearchHandler() 165 | 166 | matches = search_handler.searchPattern(pattern, MapConverter().convert(molecules, g._gateway_client)) 167 | 168 | results = copy.deepcopy([{'id': copy.deepcopy(str(compound_id)), 'match_count': copy.deepcopy(int(match_count)), 169 | 'svg': copy.deepcopy(str(svg))} 170 | for compound_id, match_count, svg in matches]) 171 | g.shutdown() 172 | return results 173 | 174 | 175 | class Compound(object): 176 | def __init__(self, compound_string, identifier_type, suppress_hydrogens=False, add_explicit_hydrogens=False): 177 | allowed_types = ['smiles', 'inchi', 'atom_container'] 178 | assert(identifier_type in allowed_types) 179 | 180 | self.cdk = gateway.jvm.org.openscience.cdk 181 | self.java = gateway.jvm.java 182 | 183 | self.identifier_type = identifier_type 184 | self.mol_container = None 185 | self.inchi_factory = self.cdk.inchi.InChIGeneratorFactory.getInstance() 186 | 187 | if self.identifier_type not in allowed_types: 188 | raise ValueError('Not a valid identifier type') 189 | try: 190 | if identifier_type == 'atom_container': 191 | self.compound_string = compound_string 192 | self.mol_container = self.compound_string 193 | else: 194 | self.compound_string = compound_string.strip() 195 | builder = self.cdk.DefaultChemObjectBuilder.getInstance() 196 | if self.identifier_type == 'inchi': 197 | s = self.inchi_factory.getInChIToStructure(self.compound_string, builder) 198 | self.mol_container = 
s.getAtomContainer() 199 | elif self.identifier_type == 'smiles': 200 | 201 | smiles_parser = self.cdk.smiles.SmilesParser(builder) 202 | self.mol_container = smiles_parser.parseSmiles(self.compound_string) 203 | 204 | if suppress_hydrogens: 205 | self.mol_container = self.cdk.tools.manipulator.AtomContainerManipulator.copyAndSuppressedHydrogens( 206 | self.mol_container) 207 | 208 | if add_explicit_hydrogens: 209 | self.cdk.tools.manipulator.AtomContainerManipulator\ 210 | .percieveAtomTypesAndConfigureAtoms(self.mol_container) 211 | self.cdk.tools.CDKHydrogenAdder.getInstance(builder).addImplicitHydrogens(self.mol_container) 212 | self.cdk.tools.manipulator.AtomContainerManipulator\ 213 | .convertImplicitToExplicitHydrogens(self.mol_container) 214 | 215 | except Py4JJavaError as e: 216 | print(e) 217 | raise ValueError('Invalid {} provided!'.format(self.identifier_type)) 218 | 219 | def get_smiles(self, smiles_type='isomeric'): 220 | if smiles_type == 'isomeric': 221 | smiles_flavor = self.cdk.smiles.SmiFlavor.Isomeric 222 | smiles_generator = self.cdk.smiles.SmilesGenerator(smiles_flavor) 223 | elif smiles_type == 'unique': 224 | smiles_generator = self.cdk.smiles.SmilesGenerator.unique() 225 | elif smiles_type == 'generic': 226 | smiles_generator = self.cdk.smiles.SmilesGenerator.generic() 227 | 228 | elif smiles_type == 'use_aromatic_symbols': 229 | # need to add aromaticity information first before generating aromatic smiles 230 | aromaticity = self.cdk.aromaticity.Aromaticity(self.cdk.aromaticity.ElectronDonation.daylight(), 231 | self.cdk.graph.Cycles.all()) 232 | try: 233 | aromaticity.apply(self.mol_container) 234 | except Exception as e: 235 | print(e) 236 | 237 | smiles_flavor = self.cdk.smiles.SmiFlavor.UseAromaticSymbols 238 | smiles_generator = self.cdk.smiles.SmilesGenerator(smiles_flavor) 239 | else: 240 | smiles_generator = self.cdk.smiles.SmilesGenerator.absolute() 241 | 242 | return smiles_generator.create(self.mol_container) 243 | 244 | def 
get_inchi_key(self): 245 | gen = self.inchi_factory.getInChIGenerator(self.mol_container) 246 | return gen.getInchiKey() 247 | 248 | def get_inchi(self): 249 | gen = self.inchi_factory.getInChIGenerator(self.mol_container) 250 | return gen.getInchi() 251 | 252 | def get_tautomers(self): 253 | tautomer_generator = self.cdk.tautomers.InChITautomerGenerator() 254 | tautomers = tautomer_generator.getTautomers(self.mol_container) 255 | # py4j.java_collections.JavaList('o16', gateway) 256 | # mol1 = tautomers[0] 257 | t_obj = [Compound(compound_string=x, identifier_type='atom_container') for x in tautomers] 258 | print([t.get_inchi_key() for t in t_obj]) 259 | print(*[t.get_inchi() for t in t_obj], sep='\n') 260 | print(*[t.get_smiles() for t in t_obj], sep='\n') 261 | return list(tautomers) 262 | 263 | def get_stereocenters(self): 264 | stereocenters = self.cdk.stereo.Stereocenters.of(self.mol_container) 265 | sc = [] 266 | 267 | for x in range(self.mol_container.getAtomCount()): 268 | if stereocenters.isStereocenter(x): 269 | sc.append(( 270 | str(stereocenters.elementType(x)), 271 | str(stereocenters.stereocenterType(x)), 272 | x, 273 | self.mol_container.getAtom(x).getSymbol()) 274 | ) 275 | # print(str(stereocenters.stereocenterType(x))) 276 | # print(self.mol_container.getAtom(x).getSymbol()) 277 | 278 | return sc 279 | 280 | def get_configuration_class(self): 281 | 282 | for se in self.mol_container.stereoElements(): 283 | config_class = se.getConfigClass() 284 | print(config_class) 285 | 286 | print(se.getStereo()) 287 | 288 | if config_class == self.cdk.interfaces.IStereoElement.TH: 289 | print('tetrahedral') 290 | elif config_class == self.cdk.interfaces.IStereoElement.CT: 291 | print('cis-trans') 292 | elif config_class == self.cdk.interfaces.IStereoElement.Octahedral: 293 | print('octaheral') 294 | elif config_class == self.cdk.interfaces.IStereoElement.AL: 295 | print('extended tetrahedral') 296 | elif config_class == self.cdk.interfaces.IStereoElement.AT: 
297 | print('atropisomeric') 298 | elif config_class == self.cdk.interfaces.IStereoElement.SP: 299 | print('square planar') 300 | elif config_class == self.cdk.interfaces.IStereoElement.SPY: 301 | print('square pyramidal') 302 | elif config_class == self.cdk.interfaces.IStereoElement.TBPY: 303 | print('trigonal bipyramidal') 304 | elif config_class == self.cdk.interfaces.IStereoElement.PBPY: 305 | print('pentagonal bipyramidal') 306 | elif config_class == self.cdk.interfaces.IStereoElement.HBPY8: 307 | print('hexagonal bipyramidal') 308 | elif config_class == self.cdk.interfaces.IStereoElement.HBPY9: 309 | print('heptagonal bipyramidal') 310 | 311 | configuration = se.getConfigOrder() 312 | if configuration == self.cdk.interfaces.IStereoElement.LEFT: 313 | print('left') 314 | elif configuration == self.cdk.interfaces.IStereoElement.RIGHT: 315 | print('right') 316 | elif configuration == self.cdk.interfaces.IStereoElement.OPPOSITE: 317 | print('opposite') 318 | elif configuration == self.cdk.interfaces.IStereoElement.TOGETHER: 319 | print('together') 320 | print(configuration) 321 | print('---------------------------------') 322 | 323 | def get_chirality(self): 324 | configurations = [x[0] for x in self.get_configuration_order()] 325 | raw_stereocenters = [element_type for (element_type, sterecenter_type, atom_number, element_symbol) in 326 | self.get_stereocenters() if element_type == 'Tetracoordinate' and element_symbol == 'C'] 327 | 328 | # print(len(configurations), configurations) 329 | # print(self.get_configuration_order()) 330 | # print(len(raw_stereocenters), raw_stereocenters) 331 | # print(self.get_stereocenters()) 332 | 333 | if len(configurations) != len(raw_stereocenters): 334 | return 'racemate' 335 | elif len(raw_stereocenters) == 0 or (len(set(configurations).intersection(set(['R', 'S']))) 336 | == 2 and self.has_point_symmetry()): 337 | return 'achiral' 338 | else: 339 | return 'chiral' 340 | 341 | def get_configuration_order(self): 342 | 
configurations = [] 343 | for se in self.mol_container.stereoElements(): 344 | conf = str(self.cdk.geometry.cip.CIPTool.getCIPChirality(self.mol_container, se)) 345 | 346 | # that is not the IUPAC naming convention atom number but a CDK internal representation 347 | focus_atom_number = se.getFocus().getIndex() 348 | 349 | configurations.append((conf, focus_atom_number)) 350 | 351 | return configurations 352 | 353 | def has_point_symmetry(self): 354 | atom_count = self.mol_container.getAtomCount() 355 | qr = self.cdk.signature.SignatureQuotientGraph(self.mol_container) 356 | if atom_count % 2 == 0 and qr.getVertexCount() <= atom_count / 2 and qr.getVertexCount() == qr.getEdgeCount(): 357 | return True 358 | elif (atom_count - 1) % 2 == 0 and (atom_count - 1) / 2 >= qr.getVertexCount() > qr.getEdgeCount(): 359 | return True 360 | else: 361 | return False 362 | 363 | def get_monoisotopic_mass(self): 364 | weight = self.cdk.qsar.descriptors.molecular.WeightDescriptor() 365 | # print(weight.getDescriptorNames()) 366 | 367 | return weight.calculate(self.mol_container).getValue().toString() 368 | 369 | def get_natural_mass(self): 370 | mass = self.cdk.tools.manipulator.AtomContainerManipulator() 371 | return mass.getNaturalExactMass(self.mol_container) 372 | 373 | def get_mw(self): 374 | return self.cdk.tools.manipulator.AtomContainerManipulator().getMolecularWeight(self.mol_container) 375 | 376 | def get_tpsa(self): 377 | return self.cdk.qsar.descriptors.molecular.TPSADescriptor().calculate(self.mol_container).getValue().toString() 378 | 379 | def get_rotable_bond_count(self): 380 | return self.cdk.qsar.descriptors.molecular.RotatableBondsCountDescriptor()\ 381 | .calculate(self.mol_container).getValue().toString() 382 | 383 | def get_hbond_acceptor_count(self): 384 | return self.cdk.qsar.descriptors.molecular.HBondAcceptorCountDescriptor() \ 385 | .calculate(self.mol_container).getValue().toString() 386 | 387 | def get_hbond_donor_count(self): 388 | return 
self.cdk.qsar.descriptors.molecular.HBondDonorCountDescriptor() \ 389 | .calculate(self.mol_container).getValue().toString() 390 | 391 | def get_xlogp(self): 392 | return self.cdk.qsar.descriptors.molecular.XLogPDescriptor() \ 393 | .calculate(self.mol_container).getValue().toString() 394 | 395 | def get_ro5_failures(self): 396 | return self.cdk.qsar.descriptors.molecular.RuleOfFiveDescriptor() \ 397 | .calculate(self.mol_container).getValue().toString() 398 | 399 | def get_acidic_group_count(self): 400 | agcd = self.cdk.qsar.descriptors.molecular.AcidicGroupCountDescriptor() 401 | agcd.initialise(self.cdk.DefaultChemObjectBuilder.getInstance()) 402 | return agcd.calculate(self.mol_container).getValue().toString() 403 | 404 | def get_mol2(self, filename=''): 405 | """ 406 | A method to convert a molecule to the mol2 format and optionally write it to a file 407 | :param filename: the filename, the mol2 file should be written to. 408 | :type filename: str 409 | :return: A mol2 file in string format 410 | """ 411 | sdg = self.cdk.layout.StructureDiagramGenerator(self.mol_container) 412 | sdg.generateCoordinates() 413 | 414 | writer = self.java.io.StringWriter() 415 | mol2writer = self.cdk.io.Mol2Writer(writer) 416 | 417 | mol2writer.writeMolecule(self.mol_container) 418 | mol2writer.close() 419 | 420 | mol2string = writer.toString() 421 | 422 | if filename: 423 | with open(filename, "w") as text_file: 424 | text_file.write(mol2string) 425 | 426 | return mol2string 427 | 428 | def get_molfile(self, filename=''): 429 | """ 430 | A method to convert a molecule to molfile V2000 (MDLV2000) format and optionally write it to a file 431 | :param filename: the filename, the molfile V2000 (MDLV2000) file should be written to. 
def get_fingerprint(self):
    """
    Compute the bit fingerprint of the molecule with CDK's Fingerprinter.

    The fingerprint size and its bit set are printed to stdout before the
    fingerprint is returned.

    :return: the object returned by ``Fingerprinter.getBitFingerprint``
    """
    bit_fp = self.cdk.fingerprint.Fingerprinter().getBitFingerprint(self.mol_container)

    # diagnostic output kept as-is; callers such as get_tanimoto trigger it too
    print('Fingerprint size:', bit_fp.size())
    print(bit_fp.asBitSet())

    return bit_fp
@staticmethod
def search_substructure(search_string, molecules, svg_return_count=10):
    """
    A slow substructure search going back and forth between Java and Python.

    Every SMILES in ``molecules`` is parsed into a Compound and matched
    against the SMARTS pattern. Entries whose Compound construction raises
    ValueError are skipped silently.

    :param search_string: SMARTS pattern handed to CDK's SmartsPattern.create
    :type search_string: str
    :param molecules: iterable of ``(compound_id, smiles)`` pairs
    :param svg_return_count: maximum number of hits that get a rendered SVG
        with the matched substructures highlighted; later hits carry an
        empty ``'svg'`` string.
    :type svg_return_count: int
    :return: list of dicts with the keys 'compound_id', 'smiles' and 'svg',
        one per molecule with at least one unique match
    """
    cdk = gateway.jvm.org.openscience.cdk
    pattern = cdk.smiles.smarts.SmartsPattern.create(search_string)
    results = []

    for compound_id, smiles in molecules:
        try:
            mol = Compound(compound_string=smiles, identifier_type='smiles')
        except ValueError:
            continue

        mappings = pattern.matchAll(mol.mol_container)
        if mappings.countUnique() <= 0:
            continue

        svg = ''
        # strict '<' so that at most svg_return_count depictions are rendered
        # (the previous '<=' produced one more than requested)
        if len(results) < svg_return_count:
            # only fetch the matched chem objects when they are actually drawn
            svg = mol.get_svg(substructures=mappings.toChemObjects())

        results.append({
            'compound_id': compound_id,
            'smiles': smiles,
            'svg': svg
        })

    return results
main(): 545 | test_inchi = 'InChI=1S/C23H18ClF2N3O3S/c1-2-9-33(31,32)29-19-8-7-18(25)20(21(19)26)22(30)17-12-28-23-16(17)10-14(11-27-23)13-3-5-15(24)6-4-13/h3-8,10-12,29H,2,9H2,1H3,(H,27,28)' 546 | cmpnd = Compound(compound_string=test_inchi, identifier_type='inchi') 547 | print(cmpnd.get_smiles()) 548 | print(cmpnd.get_inchi_key()) 549 | print(cmpnd.get_inchi()) 550 | 551 | mol = 'C[BH]1H[BH](C)H1' 552 | mol = "CC(=O)Cl" 553 | cmpnd = Compound(compound_string=mol, identifier_type='smiles') 554 | print(cmpnd.get_smiles()) 555 | print(cmpnd.get_inchi_key()) 556 | print(cmpnd.get_inchi()) 557 | 558 | print('ran through') 559 | time.sleep(5) 560 | 561 | if __name__ == '__main__': 562 | sys.exit(main()) 563 | -------------------------------------------------------------------------------- /cdk_pywrapper/chemlib.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import sys 3 | import simplejson 4 | import time 5 | import re 6 | import wikidataintegrator.wdi_core as wdi_core 7 | import wikidataintegrator.wdi_login as wdi_login 8 | import pprint 9 | import pandas as pd 10 | import numpy as np 11 | import os 12 | 13 | import chemspipy 14 | # sys.path.append('/home/sebastian/PycharmProjects/cdk_pywrapper/') 15 | from cdk_pywrapper.cdk_pywrapper import Compound 16 | 17 | """ 18 | A Python library for PubChem RDF 19 | """ 20 | 21 | __author__ = 'Sebastian Burgstaller-Muehlbacher' 22 | __license__ = 'AGPLv3' 23 | __copyright__ = 'Sebastian Burgstaller-Muehlbacher' 24 | 25 | 26 | class UNIIMolecule(object): 27 | unii_data = pd.read_csv('./unii_data/unii_data_ndfrt.csv', low_memory=False, index_col=0, 28 | dtype={ 29 | 'UNII': np.str, 30 | 'RXCUI': np.str, 31 | 'INN_ID': np.str, 32 | 'ITIS': np.str, 33 | 'NCBI': np.str, 34 | 'RxNorm_CUI': np.str, # same as RXCUI, but from NDF-RT 35 | }) 36 | 37 | for count, row in unii_data.iterrows(): 38 | smiles = row['SMILES'] 39 | ikey = row['INCHIKEY'] 40 | if pd.notnull(smiles) and 
@property
def stdinchikey(self):
    """
    InChI key of this UNII record.

    When the record has a SMILES, the key is computed from it through
    Compound; otherwise the value of the 'INCHIKEY' column is used.

    :return: the InChI key, or None when the record has neither a SMILES
        nor a stored InChI key
    """
    stored_key = self.data.loc[self.data_index, 'INCHIKEY']
    structure = self.smiles

    # a SMILES always takes precedence over the stored key
    if pd.notnull(structure):
        return Compound(compound_string=structure, identifier_type='smiles').get_inchi_key()

    if pd.isnull(stored_key):
        return None
    return stored_key
self.data.loc[self.data_index, 'RN'] 108 | return cas if pd.notnull(cas) else None 109 | 110 | @property 111 | def einecs(self): 112 | einecs = self.data.loc[self.data_index, 'EC'] 113 | return einecs if pd.notnull(einecs) else None 114 | 115 | @property 116 | def rxnorm(self): 117 | rxnorm = self.data.loc[self.data_index, 'RXCUI'] 118 | return rxnorm if pd.notnull(rxnorm) else None 119 | 120 | @property 121 | def ndfrt(self): 122 | ndfrt = self.data.loc[self.data_index, 'NUI'] 123 | return ndfrt if pd.notnull(ndfrt) else None 124 | 125 | @property 126 | def umls(self): 127 | umls_cui = self.data.loc[self.data_index, 'UMLS_CUI'] 128 | return umls_cui if pd.notnull(umls_cui) else None 129 | 130 | 131 | def to_wikidata(self): 132 | item_label = self.preferred_name if self.preferred_name else self.unii 133 | 134 | refs = [[ 135 | wdi_core.WDItemID(value='Q6593799', prop_nr='P248', is_reference=True), # stated in 136 | wdi_core.WDExternalID(value=self.unii, prop_nr='P652', is_reference=True), # source element 137 | wdi_core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True), # language of work 138 | wdi_core.WDMonolingualText(value=item_label[0:400], language='en', prop_nr='P1476', is_reference=True), 139 | wdi_core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'), prop_nr='P813', is_reference=True) # retrieved 140 | ]] 141 | print('UNII Main label is', item_label) 142 | 143 | cmpnd = Compound(compound_string=self.smiles, identifier_type='smiles') 144 | isomeric_smiles = cmpnd.get_smiles(smiles_type='isomeric') 145 | canonical_smiles = cmpnd.get_smiles(smiles_type='generic') 146 | 147 | elements = { 148 | 'P652': self.unii, 149 | 'P233': canonical_smiles, 150 | 'P2017': isomeric_smiles, 151 | 'P235': self.stdinchikey, 152 | 'P234': self.stdinchi[6:], 153 | 'P231': self.cas, 154 | 'P232': self.einecs, 155 | 'P2892': self.umls, 156 | 'P2115': self.ndfrt, 157 | 'P3345': self.rxnorm 158 | } 159 | 160 | dtypes = { 161 | 'P652': wdi_core.WDExternalID, 162 | 'P683': 
@staticmethod
def label_converter(label):
    """
    Convert a UNII preferred term into a display label.

    Steps, in order:

    1. lower-case the whole label;
    2. replace dotted Greek letter codes such as ``.alpha.`` with the
       corresponding Unicode letter;
    3. upper-case the one-letter descriptors e, z, d, l, n, h, r, o, s and
       the pair ``dl`` when they are not preceded by a letter a-z and are
       followed by a character that is not a letter a-z;
    4. split on ``', '``, reverse the parts and join them without a
       separator (``'glucose, d-'`` becomes ``'D-glucose'``).

    :param label: the raw label
    :type label: str
    :return: the converted label
    :rtype: str
    """
    label = label.lower()

    greek_codes = {
        '.alpha.': '\u03B1',
        '.beta.': '\u03B2',
        '.gamma.': '\u03B3',
        '.delta.': '\u03B4',
        '.epsilon.': '\u03B5',
        '.zeta.': '\u03B6',  # previously carried a stray trailing space
        '.eta.': '\u03B7',
        '.theta.': '\u03B8',
        '.iota.': '\u03B9',
        '.kappa.': '\u03BA',
        '.lambda.': '\u03BB',
        '.mu.': '\u03BC',
        '.nu.': '\u03BD',
        '.xi.': '\u03BE',
        '.omicron.': '\u03BF',
        '.pi.': '\u03C0',
        '.rho.': '\u03C1',
        '.sigma.': '\u03C3',
        '.tau.': '\u03C4',
        '.upsilon.': '\u03C5',
        '.phi.': '\u03C6',
        '.chi.': '\u03C7',
        '.psi.': '\u03C8',
        '.omega.': '\u03C9',
    }
    for code, letter in greek_codes.items():
        label = label.replace(code, letter)

    # Lookarounds keep the neighbouring characters out of the match, so only
    # the descriptor itself is upper-cased (a Greek letter next to it stays
    # lower case). The callable replacement means a backslash in the label
    # is never interpreted as a regex replacement escape.
    label = re.sub(r'(?<![a-z])([ezdlnhros]|dl)(?=[^a-z])',
                   lambda found: found.group(1).upper(),
                   label)

    return ''.join(reversed(label.split(', ')))
@property
def accession_numbers(self):
    """
    Accession numbers of this DrugBank record.

    The 'Accession Numbers' cell is split on ``'|'`` and every part is
    stripped of surrounding whitespace, so both ``'A|B'`` and ``'A | B'``
    (the separator the sibling ``synonyms`` property splits on) yield clean
    identifiers.

    :return: list of accession numbers; empty when the cell is null
    :rtype: list
    """
    raw = self.data.loc[self.data_index, 'Accession Numbers']
    # a missing cell is NaN (a float), which has no .split()
    if pd.isnull(raw):
        return []

    return [part.strip() for part in raw.split('|') if part.strip()]
'P652': self.unii, 347 | 'P715': self.drugbank, 348 | #'P233': canonical_smiles, 349 | #'P2017': isomeric_smiles, 350 | #'P235': self.stdinchikey, 351 | #'P234': self.stdinchi[6:], 352 | 'P231': self.cas, 353 | } 354 | 355 | dtypes = { 356 | 'P652': wdi_core.WDExternalID, 357 | 'P715': wdi_core.WDExternalID, 358 | 'P683': wdi_core.WDExternalID, 359 | 'P661': wdi_core.WDExternalID, 360 | 'P2153': wdi_core.WDExternalID, 361 | 'P233': wdi_core.WDString, 362 | 'P2017': wdi_core.WDString, 363 | 'P235': wdi_core.WDExternalID, 364 | 'P234': wdi_core.WDExternalID, 365 | 'P274': wdi_core.WDString, 366 | 'P231': wdi_core.WDExternalID, 367 | 'P232': wdi_core.WDExternalID 368 | } 369 | 370 | # # do not add isomeric smiles if no isomeric info is available 371 | # if canonical_smiles == isomeric_smiles or len(self.smiles) > 400: 372 | # del elements['P2017'] 373 | # 374 | # # do not try to add InChI longer than 400 chars 375 | # if len(self.stdinchi[6:]) > 400: 376 | # del elements['P234'] 377 | # 378 | # if len(self.smiles) > 400: 379 | # del elements['P233'] 380 | 381 | data = [] 382 | 383 | for k, v in elements.items(): 384 | if not v: 385 | continue 386 | 387 | print('{}:'.format(k), v) 388 | if isinstance(v, list) or isinstance(v, set): 389 | for x in v: 390 | data.append(dtypes[k](prop_nr=k, value=x, references=refs)) 391 | else: 392 | data.append(dtypes[k](prop_nr=k, value=v, references=refs)) 393 | 394 | return data 395 | 396 | 397 | class ChEBIMolecule(object): 398 | chebi_data_path = './chebi_data/' 399 | chebi_data = pd.read_csv(os.path.join(chebi_data_path, 'chebiId_inchi.tsv'), low_memory=False, index_col=0, sep='\t') 400 | 401 | 'ID COMPOUND_ID TYPE SOURCE NAME ADAPTED LANGUAGE' 402 | chebi_names = pd.read_csv(os.path.join(chebi_data_path, 'names.tsv'), low_memory=False, index_col=None, sep='\t', 403 | dtype={'ID': np.str, 'COMPOUND_ID': np.str}, na_filter=False) 404 | 405 | zwitterion_id_list = set() 406 | for zz in chebi_names.iterrows(): 407 | data = zz[1] 408 | 
if 'zwitterion' in data['NAME']: 409 | zwitterion_id_list.add(np.int64(data['COMPOUND_ID'])) 410 | 411 | compounds = pd.read_csv(os.path.join(chebi_data_path, 'compounds.tsv'), low_memory=False, index_col=0, sep='\t') 412 | 413 | for c, zz in compounds.iterrows(): 414 | # pd.NaN is handled as a float datatype so it needs extra treatment, what a nonsense. 415 | if pd.isnull(zz['NAME']): 416 | zwitterion_id_list.add(c) 417 | continue 418 | if 'zwitterion' in zz['NAME']: 419 | zwitterion_id_list.add(c) 420 | 421 | chebi_data = chebi_data.drop(list(zwitterion_id_list)) 422 | 423 | if 'InChI key' not in chebi_data: 424 | print('Generating InChI keys ...') 425 | for row in chebi_data.iterrows(): 426 | index = row[0] 427 | data = row[1] 428 | 429 | inchi = data['InChI'] 430 | cmpnd = Compound(compound_string=inchi, identifier_type='inchi') 431 | chebi_data.loc[index, 'InChI key'] = cmpnd.get_inchi_key() 432 | 433 | if index % 1000 == 0: 434 | print('processed to ChEBI ID', index) 435 | 436 | chebi_data.to_csv(os.path.join(chebi_data_path, 'chebiId_inchi.tsv'), sep='\t') 437 | 438 | 439 | 'ID COMPOUND_ID SOURCE TYPE ACCESSION_NUMBER' 440 | db_accessions = pd.read_csv(os.path.join(chebi_data_path, 'database_accession.tsv'), low_memory=False, index_col=None, sep='\t', 441 | dtype={'ID': np.str, 'COMPOUND_ID': np.str}, na_filter=False) 442 | 443 | # remove CAS numbers provided by KEGG, as they are frequently incorrect 444 | db_accessions = db_accessions.loc[~(db_accessions['SOURCE'].isin(['KEGG COMPOUND']) & 445 | db_accessions['TYPE'].isin(['CAS Registry Number'])), :] 446 | 447 | def __init__(self, chebi_id=None, inchi_key=None): 448 | 449 | if chebi_id: 450 | ind = ChEBIMolecule.chebi_data.index == np.int64(chebi_id) 451 | else: 452 | ind = ChEBIMolecule.chebi_data['InChI key'].values == inchi_key 453 | 454 | self._canonical_smiles = None 455 | self._isomeric_smiles = None 456 | 457 | self.data = ChEBIMolecule.chebi_data.loc[ind, :] 458 | 459 | if len(self.data.index) != 
@property
def synonyms(self):
    """
    English synonyms of this ChEBI compound.

    :return: the 'NAME' values of all rows in ``self.all_names`` whose
        'TYPE' is 'SYNONYM' and whose 'LANGUAGE' is 'en', in row order
    :rtype: list
    """
    synonym_rows = self.all_names.loc[self.all_names['TYPE'] == 'SYNONYM', :]

    english = []
    for _, row in synonym_rows.iterrows():
        if row['LANGUAGE'] == 'en':
            english.append(row['NAME'])
    return english
@property
def kegg(self):
    """
    KEGG accession numbers recorded for this ChEBI compound.

    :return: the distinct 'ACCESSION_NUMBER' values of all rows in
        ``self.accessions`` whose 'TYPE' is 'KEGG COMPOUND accession' or
        'KEGG DRUG accession'
    :rtype: set
    """
    kegg_types = ['KEGG COMPOUND accession', 'KEGG DRUG accession']
    is_kegg = self.accessions['TYPE'].isin(kegg_types)
    matching = self.accessions.loc[is_kegg, :]

    return {row['ACCESSION_NUMBER'] for _, row in matching.iterrows()}
'P235': self.stdinchikey, 560 | 'P234': self.stdinchi[6:], 561 | 'P231': self.cas, 562 | 'P665': self.kegg, 563 | 'P2057': self.hmdb, 564 | 'P1579': self.beilstein, 565 | 'P2064': self.knapsack, 566 | 'P2275': self.who_inn 567 | } 568 | 569 | dtypes = { 570 | 'P652': wdi_core.WDExternalID, 571 | 'P683': wdi_core.WDExternalID, 572 | 'P661': wdi_core.WDExternalID, 573 | 'P2153': wdi_core.WDExternalID, 574 | 'P233': wdi_core.WDString, 575 | 'P2017': wdi_core.WDString, 576 | 'P235': wdi_core.WDExternalID, 577 | 'P234': wdi_core.WDExternalID, 578 | 'P274': wdi_core.WDString, 579 | 'P231': wdi_core.WDExternalID, 580 | 'P232': wdi_core.WDExternalID, 581 | 'P665': wdi_core.WDExternalID, 582 | 'P2057': wdi_core.WDExternalID, 583 | 'P1579': wdi_core.WDExternalID, 584 | 'P2064': wdi_core.WDExternalID, 585 | 'P2275': wdi_core.WDMonolingualText 586 | } 587 | 588 | # do not add isomeric smiles if no isomeric info is available 589 | if self.canonical_smiles == self.isomeric_smiles or len(self.isomeric_smiles) > 400: 590 | del elements['P2017'] 591 | 592 | # do not try to add InChI longer than 400 chars 593 | if len(self.stdinchi[6:]) > 400: 594 | del elements['P234'] 595 | 596 | if len(self.canonical_smiles) > 400: 597 | del elements['P233'] 598 | 599 | data = [] 600 | 601 | for k, v in elements.items(): 602 | if not v: 603 | continue 604 | 605 | print('{}:'.format(k), v) 606 | if isinstance(v, list) or isinstance(v, set): 607 | for x in v: 608 | data.append(dtypes[k](prop_nr=k, value=x, references=refs)) 609 | else: 610 | data.append(dtypes[k](prop_nr=k, value=v, references=refs)) 611 | 612 | return data 613 | 614 | 615 | class GTPLMolecule(object): 616 | def __init__(self, gtpl_id=None, cid=None, sid=None, inchi_key=None): 617 | gtp_data = pd.read_csv('./iuphar/ligands.csv', low_memory=False, 618 | dtype={'PubChem SID': np.str, 'PubChem CID': np.str, 'Ligand id': np.str}) 619 | 620 | # remove all labelled or radioactive compounds as they have the same inchi key as unlabelled 
compounds 621 | gtp_data = gtp_data.loc[pd.isnull(gtp_data['Labelled'].values), :] 622 | 623 | print('gtpl inchi', inchi_key) 624 | if gtpl_id: 625 | ind = gtp_data['Ligand id'].values == gtpl_id 626 | elif cid: 627 | ind = gtp_data['PubChem CID'].values == cid 628 | elif sid: 629 | ind = gtp_data['PubChem CID'].values == sid 630 | else: 631 | ind = gtp_data['InChIKey'].values == inchi_key 632 | 633 | 634 | self.data = gtp_data.loc[ind, :] 635 | 636 | if len(self.data.index) != 1: 637 | raise ValueError('Provided ID did not return a unique GTPL ID') 638 | 639 | self.data_index = self.data.index[0] 640 | 641 | 642 | @property 643 | def stdinchikey(self): 644 | return self.data.loc[self.data_index, 'InChIKey'] 645 | 646 | @property 647 | def stdinchi(self): 648 | return self.data.loc[self.data_index, 'InChI'] 649 | 650 | @property 651 | def preferred_name(self): 652 | return GTPLMolecule.label_converter(self.data.loc[self.data_index, 'Name']) 653 | 654 | @property 655 | def synonyms(self): 656 | synonyms = self.data.loc[self.data_index, 'Synonyms'] 657 | synonyms = synonyms.split('|') if pd.notnull(synonyms) else [] 658 | return [GTPLMolecule.label_converter(x) for x in synonyms] 659 | 660 | @property 661 | def smiles(self): 662 | return self.data.loc[self.data_index, 'SMILES'] 663 | 664 | @property 665 | def molecule_type(self): 666 | return self.data.loc[self.data_index, 'Type'] 667 | 668 | @property 669 | def gtpl_id(self): 670 | return self.data.loc[self.data_index, 'Ligand id'] 671 | 672 | def to_wikidata(self): 673 | item_label = self.preferred_name if self.preferred_name else 'GTPL' + self.gtpl_id 674 | 675 | refs = [[ 676 | wdi_core.WDItemID(value='Q17091219', prop_nr='P248', is_reference=True), # stated in 677 | wdi_core.WDExternalID(value=self.gtpl_id, prop_nr='P595', is_reference=True), # source element 678 | wdi_core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True), # language of work 679 | wdi_core.WDMonolingualText(value=item_label[0:400], 
@staticmethod
def label_converter(label):
    """
    Turn a ligand name containing HTML markup into plain Unicode text.

    Inline ``<sub>``, ``<sup>`` and ``<i>`` tags (opening and closing) are
    removed, then HTML character references such as ``&alpha;``, ``&reg;``
    or ``&plusmn;`` are decoded with the standard library. Text without
    markup is returned unchanged.

    :param label: the raw name or synonym
    :type label: str
    :return: the label with tags removed and character references decoded
    :rtype: str
    """
    import html  # function-scope import: html is not imported at file level

    # NOTE(review): the previous tag list contained only empty strings, so
    # its replace() loop was a no-op. sub/sup/i is an assumption about what
    # it was meant to strip - confirm against the ligand data.
    label = re.sub(r'</?(?:sub|sup|i)>', '', label)

    # Decoding after tag removal keeps an escaped '&lt;i&gt;' as literal text.
    # html.unescape covers the whole former lookup table and fixes the zeta
    # entry, which mapped to the letter plus a stray trailing space.
    return html.unescape(label)
@property
def chebi(self):
    """
    ChEBI identifier stored in the ChEMBL record.

    :return: the value of the 'chebi_par_id' key of ``self.compound``, or
        None when the record has no such key
    """
    record = self.compound
    if 'chebi_par_id' not in record:
        return None
    return record['chebi_par_id']
isomeric smiles if no isomeric info is available 879 | if canonical_smiles == isomeric_smiles or len(self.smiles) > 400: 880 | del elements['P2017'] 881 | 882 | # do not try to add InChI longer than 400 chars 883 | if len(self.stdinchi[6:]) > 400: 884 | del elements['P234'] 885 | 886 | if len(self.smiles) > 400: 887 | del elements['P233'] 888 | 889 | data = [ 890 | wdi_core.WDQuantity(value=self.monoisotopic_mass, prop_nr='P2067', upper_bound=self.monoisotopic_mass, 891 | lower_bound=self.monoisotopic_mass, unit='http://www.wikidata.org/entity/Q483261', 892 | references=refs) 893 | ] 894 | 895 | for k, v in elements.items(): 896 | if not v: 897 | continue 898 | 899 | print('{}:'.format(k), v) 900 | if isinstance(v, list) or isinstance(v, set): 901 | for x in v: 902 | data.append(dtypes[k](prop_nr=k, value=x, references=refs)) 903 | else: 904 | data.append(dtypes[k](prop_nr=k, value=v, references=refs)) 905 | 906 | return data 907 | 908 | class ChemSpiderMolecule(object): 909 | token = '' 910 | 911 | def __init__(self, csid=None, mol=None): 912 | if csid: 913 | cs = chemspipy.ChemSpider(ChemSpiderMolecule.token) 914 | self.compound = cs.get_compound(csid) 915 | else: 916 | self.compound = mol 917 | 918 | # self._inchikey = self.compound.inchikey 919 | # self._inchi = self.compound.inchi 920 | # self._common_name = self.compound.common_name 921 | # self._smiles = self.compound.smiles 922 | 923 | 924 | # ikey = 'HGCGQDMQKGRJNO-UHFFFAOYSA-N' 925 | # ikey = 'MTNISTQLDNOGTM-UHFFFAOYSA-N' 926 | # ikey = 'ZWAWYSBJNBVQHP-UHFFFAOYSA-N' 927 | 928 | 929 | @property 930 | def stdinchikey(self): 931 | return self.compound.stdinchikey 932 | 933 | @property 934 | def stdinchi(self): 935 | return self.compound.stdinchi 936 | 937 | @property 938 | def common_name(self): 939 | try: 940 | return self.compound.common_name 941 | except KeyError: 942 | return None 943 | 944 | @property 945 | def smiles(self): 946 | return self.compound.smiles 947 | 948 | @property 949 | def csid(self): 
def to_wikidata(self):
    """Translate this ChemSpider record into a list of wdi_core statements.

    All statements share one reference block. The SMILES values are
    regenerated through ``Compound``; when that conversion raises a
    ValueError both SMILES values become None and are therefore left out.

    :return: list of wdi_core data type objects; the mass quantity (P2067)
        comes first and is only present when the mass is non-zero
    """
    label = self.common_name or self.csid

    references = [[
        wdi_core.WDItemID(value='Q2311683', prop_nr='P248', is_reference=True),  # stated in
        wdi_core.WDExternalID(value=self.csid, prop_nr='P661', is_reference=True),  # source element
        wdi_core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True),  # language of work
        wdi_core.WDMonolingualText(value=label[0:200], language='en', prop_nr='P1476', is_reference=True),
        wdi_core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'), prop_nr='P813', is_reference=True)  # retrieved
    ]]
    print('Main label is', label)

    try:
        converted = Compound(compound_string=self.smiles, identifier_type='smiles')
        isomeric = converted.get_smiles(smiles_type='isomeric')
        generic = converted.get_smiles(smiles_type='generic')
    except ValueError as err:
        print(err)
        print('Error when trying to convert ChemSpider SMILES')
        generic = None
        isomeric = None

    # property number -> value to be written
    elements = {
        'P661': self.csid,
        'P233': generic,
        'P2017': isomeric,
        'P235': self.stdinchikey,
        'P234': self.stdinchi[6:],
    }

    # property number -> wdi_core class used to wrap the value
    dtypes = {
        'P661': wdi_core.WDExternalID,
        'P2153': wdi_core.WDExternalID,
        'P233': wdi_core.WDString,
        'P2017': wdi_core.WDString,
        'P235': wdi_core.WDExternalID,
        'P234': wdi_core.WDExternalID,
        'P274': wdi_core.WDString
    }

    # isomeric SMILES is dropped when it equals the generic one or the
    # source SMILES exceeds 400 characters
    if generic == isomeric or len(self.smiles) > 400:
        elements.pop('P2017')

    # InChI values longer than 400 characters are dropped
    if len(self.stdinchi[6:]) > 400:
        elements.pop('P234')

    if len(self.smiles) > 400:
        elements.pop('P233')

    data = []
    mass = self.monoisotopic_mass
    if float(mass) != 0:
        data.append(
            wdi_core.WDQuantity(value=mass, prop_nr='P2067', upper_bound=mass, lower_bound=mass,
                                unit='http://www.wikidata.org/entity/Q483261', references=references)
        )

    for prop_nr, value in elements.items():
        if not value:
            continue

        print('{}:'.format(prop_nr), value)
        statement_type = dtypes[prop_nr]
        values = value if isinstance(value, (list, set)) else [value]
        for single in values:
            data.append(statement_type(prop_nr=prop_nr, value=single, references=references))

    return data
@property
def canonical_smiles(self):
    """Return the canonical SMILES, fetching it on first access.

    A truthy cached value is returned as is; otherwise the
    'Canonical_SMILES' descriptor is requested for ``self.cid`` and cached.
    """
    cached = self._canonical_smiles
    if cached:
        return cached
    self._canonical_smiles = PubChemMolecule._get_descriptors(self.cid, 'Canonical_SMILES')
    return self._canonical_smiles

@canonical_smiles.setter
def canonical_smiles(self, value):
    """Store ``value`` as the cached canonical SMILES."""
    self._canonical_smiles = value
@property
def cid(self):
    """Return the stored compound id, as assigned by the setter."""
    return self._cid

@cid.setter
def cid(self, value):
    """Store the compound id and load its cross-references from PubChem RDF.

    A truthy value that does not start with 'cid' (case-insensitive) must be
    an integer or integer string and is stored as ``'CID<value>'``. Any other
    value is stored unchanged. When the stored id is truthy, the compound's
    RDF document is fetched and used to set ``stdinchikey``, ``sids`` and the
    database identifier attributes (``cas``, ``einecs``, ``chembl``,
    ``kegg``, ``chebi``, ``unii``, ``dtxsid``, ``zinc``).

    :param value: compound id, e.g. ``2244``, ``'2244'`` or ``'CID2244'``
    :raises ValueError: if ``value`` lacks the 'cid' prefix and is not an
        integer string
    """
    # Accept plain integers as well; they used to fail on ``value.lower()``.
    if value and isinstance(value, int):
        value = str(value)

    if value and not value.lower().startswith('cid'):
        # make sure that the provided cid is an integer, will raise a ValueError if not
        int(value)

        self._cid = 'CID{}'.format(value)
    else:
        self._cid = value

    if not self._cid:
        return

    base_data = PubChemMolecule._retrieve_basic_compound_info(self.cid)

    # Triples with this compound as subject. These sets are collected but
    # not stored on the instance anywhere in this method.
    active_ingredient_of = set()
    has_roles = set()
    has_parent = set()

    subj_data = base_data['compound/' + self._cid]
    del base_data['compound/' + self._cid]

    subj_mapping = {
        'vocabulary#FDAApprovedDrugs': has_roles,
        'vocabulary#is_active_ingredient_of': active_ingredient_of,
        'http://purl.obolibrary.org/obo/has-role': has_roles,
        'vocabulary#has_parent': has_parent
    }

    for k, v in subj_data.items():
        if k not in subj_mapping:
            continue

        # only the first object of each predicate is looked at
        target = v[0]['value']
        if target.startswith('compound/CID'):
            target = target.split('/')[-1]
        subj_mapping[k].add(target)

    # Triples with this compound as object, grouped by predicate. Only
    # ``sids`` is stored on the instance at the end of this method.
    isotopologues = set()
    stereoisomers = set()
    same_connectivity = set()
    sids = set()
    parent_of = set()
    part_of = set()

    # predicate -> set collecting the subjects that use it. Kept separate
    # from the type table below: the original single dict mixed sets and
    # attribute names, so a key arriving on the wrong path raised on
    # ``.add`` or ``setattr``.
    predicate_targets = {
        'vocabulary#has_parent': parent_of,
        'http://semanticscience.org/resource/CHEMINF_000455': isotopologues,
        'http://semanticscience.org/resource/CHEMINF_000461': stereoisomers,
        'http://semanticscience.org/resource/CHEMINF_000462': same_connectivity,
        'http://semanticscience.org/resource/CHEMINF_000477': sids,
        'http://semanticscience.org/resource/CHEMINF_000478': part_of,
        'http://semanticscience.org/resource/has-attribute': set(),
    }

    # rdf:type of a synonym -> instance attribute receiving its values
    type_attributes = {
        'http://semanticscience.org/resource/CHEMINF_000446': 'cas',
        'http://semanticscience.org/resource/CHEMINF_000447': 'einecs',
        'http://semanticscience.org/resource/CHEMINF_000412': 'chembl',
        'http://semanticscience.org/resource/CHEMINF_000409': 'kegg',
        'http://semanticscience.org/resource/CHEMINF_000407': 'chebi',
        'http://semanticscience.org/resource/CHEMINF_000563': 'unii',
    }

    # identifier prefix -> instance attribute
    prefix_mapping = (
        ('DTXSID', 'dtxsid'),
        ('ZINC', 'zinc'),
    )

    for k, v in base_data.items():
        if k.startswith('inchikey'):
            self.stdinchikey = k.split('/')[-1]
            continue

        if k.startswith('synonym/MD5_'):
            # one HTTP request per synonym entry
            res = requests.get(url=self.base_url.format(k + '.json'), headers=self.headers).json()

            identifier = [entry['value'] for entry in res[k]['http://semanticscience.org/resource/has-value']]

            types = [entry['value'] for entry in res[k]['http://www.w3.org/1999/02/22-rdf-syntax-ns#type']]

            # retrieve database identifiers
            if len(types) == 1 and types[0] == 'http://semanticscience.org/resource/CHEMINF_000467':
                for pref, prop in prefix_mapping:
                    if identifier[0].startswith(pref):
                        setattr(self, prop, identifier[0])

            for type_uri in types:
                if type_uri not in type_attributes:
                    continue

                # process identifier strings from PubChem, if needed.
                # The comprehension variables get their own name so the
                # loop variable is never rebound (comprehension variables
                # leak into the enclosing scope on Python 2).
                # EINECS
                if type_uri == 'http://semanticscience.org/resource/CHEMINF_000447':
                    identifier = [ident.split(' ').pop() for ident in identifier]
                # ChEBI
                if type_uri == 'http://semanticscience.org/resource/CHEMINF_000407':
                    identifier = list(set([ident.split(':').pop() for ident in identifier]))
                # UNII
                if type_uri == 'http://semanticscience.org/resource/CHEMINF_000563':
                    identifier = list(set([ident.upper() for ident in identifier]))

                setattr(self, type_attributes[type_uri], identifier)

        for kk in v:
            if kk not in predicate_targets:
                continue

            predicate_targets[kk].add(k.split('/')[-1])

    self.sids = list(sids)
1320 | if zwitterion_charge_count.count(charge) > 1: 1321 | x = [len(simplejson.dumps(PubChemMolecule._retrieve_basic_compound_info(cids[c]))) 1322 | if z == charge else 0 for c, z in enumerate(zwitterion_charge_count)] 1323 | return cids[x.index(max(x))] 1324 | else: 1325 | return cids[zwitterion_charge_count.index(charge)] 1326 | 1327 | @staticmethod 1328 | def _get_synonyms(cid): 1329 | url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/synonyms/json'.format(cid[3:]) 1330 | # reply = PubChemMolecule.s.get(url, headers=PubChemMolecule.headers) 1331 | # reply = PubChemMolecule.s.get(url, headers=PubChemMolecule.headers) 1332 | reply = requests.get(url, headers=PubChemMolecule.headers) 1333 | if 'Fault' in reply.json(): 1334 | return [] 1335 | return reply.json()['InformationList']['Information'][0]['Synonym'] 1336 | 1337 | @staticmethod 1338 | def _retrieve_basic_compound_info(cid): 1339 | cmpnd_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/rdf/compound/{}.json'.format(cid) 1340 | print(cmpnd_url) 1341 | 1342 | # r = PubChemMolecule.s.get(cmpnd_url, headers=PubChemMolecule.headers).json() 1343 | r = requests.get(cmpnd_url, headers=PubChemMolecule.headers).json() 1344 | 1345 | return r 1346 | 1347 | @staticmethod 1348 | def _get_descriptors(cid, descr_type): 1349 | url = 'https://pubchem.ncbi.nlm.nih.gov/rest/rdf/descriptor/{}_{}.json'.format(cid, descr_type) 1350 | 1351 | # descr_json = PubChemMolecule.s.get(url, headers=PubChemMolecule.headers).json() 1352 | descr_json = requests.get(url, headers=PubChemMolecule.headers).json() 1353 | return descr_json['descriptor/{}_{}' 1354 | .format(cid, descr_type)]['http://semanticscience.org/resource/has-value'][0]['value'] 1355 | 1356 | @staticmethod 1357 | def _get_assay_ids(sids): 1358 | url = 'http://pubchem.ncbi.nlm.nih.gov/rest/rdf/query' 1359 | assay_ids = dict() 1360 | 1361 | for sid_block in [sids[c : c + 20] for c in range(0, len(sids), 20)]: 1362 | r = dict() 1363 | 1364 | params = { 1365 | 'graph': 
'substance', 1366 | 'pred': 'obo:BFO_0000056', 1367 | 'subj': ','.join(['substance:{}'.format(x) for x in sid_block]), 1368 | 'format': 'json' 1369 | } 1370 | 1371 | try: 1372 | response = requests.get(url, params=params, headers=PubChemMolecule.headers) 1373 | print(response.url) 1374 | r = response.json()['results']['bindings'] 1375 | print('length response items', len(r)) 1376 | 1377 | except simplejson.JSONDecodeError as e: 1378 | print(e) 1379 | print('Error retrieving PubChem Assay Ids') 1380 | 1381 | for x in r: 1382 | if 'subject' not in x: 1383 | continue 1384 | 1385 | assay_id = x['object']['value'].split('/')[-1].split('_')[0] 1386 | sid = x['subject']['value'].split('/')[-1] 1387 | 1388 | if sid in assay_ids: 1389 | 1390 | assay_ids[sid].add(assay_id) 1391 | else: 1392 | assay_ids.update({sid: {assay_id}}) 1393 | print(assay_ids) 1394 | return assay_ids 1395 | 1396 | @staticmethod 1397 | def _retrieve_pubchem_cids(ikey): 1398 | url = 'http://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/{}.json'.format(ikey) 1399 | 1400 | try: 1401 | # r = PubChemMolecule.s.get(url, headers=PubChemMolecule.headers).json() 1402 | r = requests.get(url, headers=PubChemMolecule.headers).json() 1403 | except simplejson.JSONDecodeError as e: 1404 | # print(e.__str__()) 1405 | print('PubChem does not have this InChI key', ikey) 1406 | return [] 1407 | 1408 | cids = list() 1409 | if 'http://semanticscience.org/resource/is-attribute-of' in r['inchikey/{}'.format(ikey)]: 1410 | for x in r['inchikey/{}'.format(ikey)]['http://semanticscience.org/resource/is-attribute-of']: 1411 | cids.append(x['value'].split('/')[-1]) 1412 | 1413 | return cids 1414 | 1415 | # def __del__(self): 1416 | # self.s.close() 1417 | 1418 | def to_wikidata(self): 1419 | item_label = self.cid if self.main_label == '' else self.main_label 1420 | 1421 | pubchem_ref = [[ 1422 | wdi_core.WDItemID(value='Q278487', prop_nr='P248', is_reference=True), # stated in 1423 | wdi_core.WDExternalID(value=self.cid[3:], 
prop_nr='P662', is_reference=True), # source element 1424 | wdi_core.WDItemID(value='Q1860', prop_nr='P407', is_reference=True), # language of work 1425 | wdi_core.WDMonolingualText(value=item_label[0:400], language='en', prop_nr='P1476', is_reference=True), 1426 | wdi_core.WDTime(time=time.strftime('+%Y-%m-%dT00:00:00Z'), prop_nr='P813', is_reference=True) # publication date 1427 | ]] 1428 | print('Main label is', self.main_label) 1429 | 1430 | elements = { 1431 | 'P662': self.cid[3:], 1432 | #'P2153': self.sid[3:], 1433 | 'P233': self.canonical_smiles, 1434 | 'P2017': self.isomeric_smiles, 1435 | 'P235': self.stdinchikey, 1436 | 'P234': self.inchi[6:], 1437 | 'P274': PubChemMolecule.convert_to_index_numbers(self.molecular_formula), 1438 | 'P3117': self.dtxsid, 1439 | 'P231': self.cas, 1440 | 'P232': self.einecs, 1441 | 'P2084': self.zinc, 1442 | 'P592': self.chembl, 1443 | 'P665': self.kegg, 1444 | 'P683': self.chebi, 1445 | 'P652': self.unii, 1446 | 1447 | } 1448 | 1449 | dtypes = { 1450 | 'P662': wdi_core.WDExternalID, 1451 | 'P2153': wdi_core.WDExternalID, 1452 | 'P233': wdi_core.WDString, 1453 | 'P2017': wdi_core.WDString, 1454 | 'P235': wdi_core.WDExternalID, 1455 | 'P234': wdi_core.WDExternalID, 1456 | 'P274': wdi_core.WDString, 1457 | 'P232': wdi_core.WDExternalID, 1458 | 'P231': wdi_core.WDExternalID, 1459 | 'P3117': wdi_core.WDExternalID, 1460 | 'P2084': wdi_core.WDExternalID, 1461 | 'P592': wdi_core.WDExternalID, 1462 | 'P665': wdi_core.WDExternalID, 1463 | 'P683': wdi_core.WDExternalID, 1464 | 'P652': wdi_core.WDExternalID, 1465 | 1466 | 1467 | } 1468 | 1469 | # do not add isomeric smiles if canonical smiles is the same 1470 | if self.canonical_smiles == self.isomeric_smiles or len(self.isomeric_smiles) > 400: 1471 | del elements['P2017'] 1472 | 1473 | # do not try to add InChI longer than 400 chars 1474 | if len(self.inchi[6:]) > 400: 1475 | del elements['P234'] 1476 | 1477 | if len(self.canonical_smiles) > 400: 1478 | del elements['P233'] 1479 | 1480 
def main():
    """Add PubChem data to Wikidata items that have an InChI key but no CID.

    Runs a SPARQL query for items carrying P235 without P662, looks every
    returned InChI key up through ``PubChemMolecule`` and writes the
    statements from ``to_wikidata`` back to Wikidata. InChI keys unknown to
    PubChem are counted; any other failure is printed and logged, and the
    loop moves on to the next item.
    """
    login_obj = wdi_login.WDLogin(user='', pwd='')

    query = '''
    SELECT * WHERE {
        ?cmpnd wdt:P235 ?pc .
        FILTER NOT EXISTS{
            #{?cmpnd wdt:P279 wd:Q11173 .} UNION
            #{?cmpnd wdt:P31 wd:Q11173 .} UNION
            {?cmpnd wdt:P662 ?x .}
        }
    }
    '''

    results = wdi_core.WDItemEngine.execute_sparql_query(query=query)

    cid_not_found_count = 0
    for item in results['results']['bindings']:
        start = time.time()
        ikey = item['pc']['value']
        try:
            print('--' * 10)
            print(ikey)
            cmpnd = PubChemMolecule(inchi_key=ikey)
            print(cmpnd.cid)
            print(cmpnd.canonical_smiles)
            print(cmpnd.isomeric_smiles)
            print(cmpnd.inchi)
            print(cmpnd.exact_mass)
            print(cmpnd.molecular_formula)
            print(cmpnd.main_label)
            print(cmpnd.sids)
            # The former ``cmpnd.s.close()`` call was removed here: the
            # session attribute ``s`` only exists in commented-out code, so
            # the call raised AttributeError, the handler below swallowed
            # it, and no item was ever written.

            wd_item = wdi_core.WDItemEngine(item_name='ddk', domain='drugs', data=cmpnd.to_wikidata(),
                                            append_value=['P31'])
            print(wd_item.wd_item_id)
            pprint.pprint(wd_item.entity_metadata)
            wd_item.write(login_obj)

        except InChIKeyMissingError as e:
            print(ikey, e)
            cid_not_found_count += 1
            continue
        except Exception as e:
            # deliberate catch-all: one failing item must not stop the run
            print(e)

            wdi_core.WDItemEngine.log(
                'ERROR', '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(
                    main_data_id='{}'.format(ikey),
                    exception_type=type(e),
                    message=e.__str__(),
                    wd_id='',
                    duration=time.time() - start
                ))

    print('not found count', cid_not_found_count)
def main():
    """Run every test structure through ``Compound`` and print the results.

    InChI strings are converted first, then SMILES strings (a ValueError
    for a single SMILES is printed and skipped), then one extra InChI.
    """
    test_inchis = [
        'InChI=1S/C23H18ClF2N3O3S/c1-2-9-33(31,32)29-19-8-7-18(25)20(21(19)26)22(30)17-12-28-23-16(17)10-14(11-27-23)13-3-5-15(24)6-4-13/h3-8,10-12,29H,2,9H2,1H3,(H,27,28)',
        'InChI=1S/C33H42N4O6/c1-7-20-19(6)32(42)37-27(20)14-25-18(5)23(10-12-31(40)41)29(35-25)15-28-22(9-11-30(38)39)17(4)24(34-28)13-26-16(3)21(8-2)33(43)36-26/h15,26-27,35H,7-14H2,1-6H3,(H,36,43)(H,37,42)(H,38,39)(H,40,41)/b28-15-/t26-,27-/m0/s1',
        'InChI=1S/C21H25ClFN3O3/c1-2-28-20-10-19(24)18(22)9-17(20)21(27)25-11-16-13-26(7-8-29-16)12-14-3-5-15(23)6-4-14/h3-6,9-10,16H,2,7-8,11-13,24H2,1H3,(H,25,27)',
        'InChI=1S/C16H12FN3O3/c1-19-14-7-6-10(20(22)23)8-12(14)16(18-9-15(19)21)11-4-2-3-5-13(11)17/h2-8H,9H2,1H3',
        'InChI=1S/C10H17N3O6S/c11-5(10(18)19)1-2-7(14)13-6(4-20)9(17)12-3-8(15)16/h5-6,20H,1-4,11H2,(H,12,17)(H,13,14)(H,15,16)(H,18,19)/t5-,6-/m0/s1',
        'InChI=1S/C13H16N2O/c1-8-13-11(5-6-14-8)10-4-3-9(16-2)7-12(10)15-13/h3-4,7-8,14-15H,5-6H2,1-2H3',
        'InChI=1S/C27H44O2/c1-19-10-13-23(28)18-22(19)12-11-21-9-7-17-27(5)24(14-15-25(21)27)20(2)8-6-16-26(3,4)29/h11-12,20,23-25,28-29H,1,6-10,13-18H2,2-5H3/b21-11+,22-12-/t20-,23+,24-,25+,27-/m1/s1',
        'InChI=1S/C40H56/c1-31(19-13-21-33(3)25-27-37-35(5)23-15-29-39(37,7)8)17-11-12-18-32(2)20-14-22-34(4)26-28-38-36(6)24-16-30-40(38,9)10/h11-14,17-23,25-28,37H,15-16,24,29-30H2,1-10H3/b12-11+,19-13+,20-14+,27-25+,28-26+,31-17+,32-18+,33-21+,34-22+/t37-/m0/s1',
        'InChI=1S/C11H14N4O5/c1-14-3-13-9-6(10(14)19)12-4-15(9)11-8(18)7(17)5(2-16)20-11/h3-5,7-8,11,16-18H,2H2,1H3',
        'InChI=1S/C27H44O2/c1-18(2)8-6-9-19(3)24-13-14-25-21(10-7-15-27(24,25)5)11-12-22-16-23(28)17-26(29)20(22)4/h11-12,18-19,23-26,28-29H,4,6-10,13-17H2,1-3,5H3/b21-11+,22-12-/t19-,23-,24-,25+,26+,27-/m1/s1',
        'InChI=1S/C9H14N5O4P/c1-6(18-5-19(15,16)17)2-14-4-13-7-8(10)11-3-12-9(7)14/h3-4,6H,2,5H2,1H3,(H2,10,11,12)(H2,15,16,17)/t6-/m1/s1',
        'InChI=1S/C51H79NO13/c1-30-16-12-11-13-17-31(2)42(61-8)28-38-21-19-36(7)51(60,65-38)48(57)49(58)52-23-15-14-18-39(52)50(59)64-43(33(4)26-37-20-22-40(53)44(27-37)62-9)29-41(54)32(3)25-35(6)46(56)47(63-10)45(55)34(5)24-30/h11-13,16-17,25,30,32-34,36-40,42-44,46-47,53,56,60H,14-15,18-24,26-29H2,1-10H3/b13-11+,16-12+,31-17+,35-25+/t30-,32-,33-,34-,36-,37+,38+,39+,40-,42+,43+,44-,46-,47+,51-/m1/s1',
    ]

    for inchi in test_inchis:

        cmpnd = Compound(compound_string=inchi, identifier_type='inchi')
        print(cmpnd.get_smiles())
        print(cmpnd.get_inchi_key())
        print(cmpnd.get_inchi())
        print(cmpnd.get_mol2())
        print(cmpnd.get_fingerprint())
        print('----------------------------')

    # group of compounds with same connectivity but different configuration:
    # https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/MNQDKWZEUULFPX-UHFFFAOYSA-M.html
    #
    # Every backslash is written as '\\' so that no literal relies on an
    # invalid escape sequence such as '\C'; the string values are unchanged.
    smiles = [
        '[Ba++].[O-][Fe]([O-])(=O)=O',
        'CCN1C2=CC=CC=C2SC1=CC=CC=CC3=[N+](C4=CC=CC=C4S3)CC.[I-]',
        'CCN\\1C2=CC=CC=C2S/C1=C\\C=C\\C=C\\C3=[N+](C4=CC=CC=C4S3)CC.[I-]',
        'CCN\\1C2=CC=CC=C2S/C1=C/C=C/C=C/C3=[N+](C4=CC=CC=C4S3)CC.[I-]',
        'CCN\\1C2=CC=CC=C2S/C1=C\\C=C\\C=C/C3=[N+](C4=CC=CC=C4S3)CC.[I-]',
        'CCN\\1C2=CC=CC=C2S/C1=C/C=C/C=CC3=[N+](C4=CC=CC=C4S3)CC.[I-]',
        'CC1=CC=CC=C1OCC2=CC=CC=C2/C(=N\\OC)/C(=O)OC',
        'CCCCCC/C=C\\CCCCCCCC(=O)O',
        'CC(C)(C)c1nc(c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2F)c(s1)c4ccnc(N)n4',
        'CC(C)(C)C1=NC(C2=CC=CC(NS(=O)(=O)N(C)(CC))=C2F)=C(S1)C4=CC=NC(N)=N4',
        'C1=CC2=C(C=C1O)C(=CN2)C[C@@H](C(=O)[O-])[NH3+]',
        'CN/C(=C\\[N+](=O)[O-])/NCCSCC1=CC=C(O1)CN(C)C',
        'CN/C(=C/[N+](=O)[O-])/NCCSCC1=CC=C(O1)CN(C)C',
        'COCCOC[C@H](CC1(CCCC1)C(=O)N[C@@H]2CC[C@@H](CC2)C(=O)O)C(=O)Oc3ccc4CCCc4c3',
        'C1=C(N=C(S1)N=C(N)N)CSCC/C(=N/S(=O)(=O)N)/N',
        'C[C@]([C@H]1C[C@@]23CC[C@@]1([C@H]4[C@@]25CCN([C@@H]3CC6=C5C(=C(C=C6)O)O4)CC7CC7)OC)(C(C)(C)CC)O',
        'CC(=O)O[Hg]c1cc(ccc1O)C(CC(C)(C)C)(C)C',
        'CC(=O)O.CC(C)(C)CC(C)(C)[C]1C=CC(=C=C1)[O-].[Hg+]',
        'N/C(N)=C([N+]([O-])=O)\\[N+]([O-])=O',
        'CC(C)C1=C(C(=C(N1CC[C@H](C[C@H](CC(=O)O)O)O)C2=CC=C(C=C2)F)C3=CC=CC=C3)C(=O)NC4=CC=CC=C4',
        'c1cc(ccc1/N=N/c2ccc(c(c2)OS(=O)O)N)OS(=O)O.[Na+].[Na+]',
        'Clc1ccc2Nc4ccccc4C(=N\\c2c1)/N3CCNCC3',
        '[Yb][Yb][Yb][Ag][Ag]',
        # the comma after the next entry was missing, which silently merged
        # it with the following SMILES into one invalid string
        'N[C@@H](CSSC[C@H](N)C(O)=O)C(O)=O',
        'CC1(C\\2CCC1(C(=O)/C2=C/c3ccc(cc3)C=O)CS(=O)(=O)[O-])C.[Na+]',
        'CNC(=O)C1=CC=CC=C1NC2=NC(=NC=C2Cl)NC3=CC=C(C=C3)N4CCN(CCCN)CC4',
        'OC(=O)CN/C(=N\\c1ccc(C#N)cc1)NC2CCCCCCCC2',
        'N[C@@]12C[C@]3(O[N+]([O-])=O)C[C@@](C2)(CC)C[C@@](C1)(CC)C3',
        'C1C2CC3CC1(ON(OO))CC(C2)(C3)N',
        '[N+](=O)([O-])OC12CC3(CC(CC(C1)(C3)N)(C2)CC)CC',
        'COc1cc(c(cc1C(=O)N[C@@H]2CC[N@@]3CCC[C@H]2C3)Cl)N',
        'OCN(C(=O)N(CO)C)',
        '[O-][n+]1cc[n+](c2c1cccc2)[O-]',
        '[2H]C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C(O)=O',
        'CS(O)(=O)=O.[H][C@@]12CC(C)C(C(=O)CN3CCN(CC3)c3cc(nc(n3)N3CCCC3)N3CCCC3)[C@@]1(C)CC=C1[C@@]2([H])CCC2=CC(=O)C=C[C@]12C',
        'OCCCC(O)=O',
        'Cc1nnc(s1)SCC2=C(N3[C@@H]([C@@H](C3=O)NC(=O)Cn4cnnn4)SC2)C(=O)[O-]',
        'CC(=O)Oc1ccc(cc1)C(c1ccc(OC(C)=O)cc1)c1ccccn1',
    ]

    for smile in smiles:
        try:
            cmpnd = Compound(compound_string=smile, identifier_type='smiles')
            print(cmpnd.get_smiles(smiles_type='isomeric'))
            print(cmpnd.get_smiles(smiles_type='unique'))
            print(cmpnd.get_smiles(smiles_type='absolute'))
            print(cmpnd.get_smiles(smiles_type='generic'))
            print(cmpnd.get_inchi_key())
            print(cmpnd.get_inchi())
            print(cmpnd.get_mol2())
            print(cmpnd.get_fingerprint())
            print(cmpnd.get_tanimoto(Compound(compound_string='C1C2CC3CC1(ON(OO))CC(C2)(C3)N', identifier_type='smiles')))
            print(cmpnd.get_tanimoto_from_bitset(Compound(compound_string='C1C2CC3CC1(ON(OO))CC(C2)(C3)N', identifier_type='smiles')))
            print(cmpnd.get_molfile())
            print('----------------------------')

        except ValueError as e:
            print(e)

    cmpnd = Compound(compound_string='InChI=1S/C5H10N2O3/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H2,7,8)(H,9,10)/p-1',
                     identifier_type='inchi')
    print(cmpnd.get_smiles(smiles_type='generic'))
    print(cmpnd.get_inchi_key())
    print(cmpnd.get_inchi())

    # cdk_pywrapper.gateway.shutdown()
os.path.join(*py4j.__path__[0].split('/')[:-4]) 20 | py4j_jar_path = os.path.join('/', py4j_path, 'share', 'py4j', 'py4j' + py4j.__version__ + '.jar') 21 | cp_sep = ':' 22 | 23 | subprocess.check_call([ 24 | 'javac ' + 25 | ' -cp ' + 26 | ' {}{}{} '.format(py4j_jar_path, 27 | cp_sep, 28 | cdk_jar) + 29 | os.path.join('.', 'cdk_pywrapper', 'cdk', 'cdk_bridge.java') 30 | ], 31 | shell=True) 32 | 33 | if host_os == 'Windows': 34 | cp_sep = ';' 35 | drive, path = os.path.splitdrive(py4j.__path__[0]) 36 | py4j_path = os.path.join(drive + '\\', *path.split('\\')[:-3]) 37 | py4j_jar_path = os.path.join(py4j_path, 'share', 'py4j', 'py4j' + py4j.__version__ + '.jar') 38 | 39 | subprocess.check_call([ 40 | 'javac', 41 | '-cp', 42 | '{}{}{}'.format(py4j_jar_path, 43 | cp_sep, 44 | cdk_jar), 45 | os.path.join('.', 'cdk_pywrapper', 'cdk', 'cdk_bridge.java') 46 | ], 47 | shell=True) 48 | 49 | MAJOR_VERSION = 0 50 | MINOR_VERSION = 0 51 | MICRO_VERSION = 1 52 | 53 | REPO_URL = 'https://github.com/sebotic/cdk_pywrapper' 54 | 55 | setup( 56 | name='cdk_pywrapper', 57 | version="{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION), 58 | data_files=[("share/cdk", [cdk_jar, './cdk_pywrapper/cdk/CDKBridge.class', 59 | './cdk_pywrapper/cdk/SearchHandler.class'])], 60 | author='Sebastian Burgstaller-Muehlbacher', 61 | author_email='sburgs@scripps.edu', 62 | description='Python wrapper for the CDK (Chemistry Development Kit)', 63 | license='AGPLv3', 64 | keywords='chemistry, CDK, Chemistry Development Kit', 65 | url=REPO_URL, 66 | # packages=find_packages(), 67 | packages=['cdk_pywrapper'], 68 | # include_package_data=True, 69 | # long_description=read('README.md'), 70 | classifiers=[ 71 | "Programming Language :: Python", 72 | "Programming Language :: Python :: 3", 73 | "Programming Language :: Python :: 2.7", 74 | "Development Status :: 4 - Beta", 75 | "Operating System :: POSIX", 76 | "Operating System :: MacOS :: MacOS X", 77 | "Operating System :: Microsoft :: Windows", 78 | 
"Intended Audience :: Science/Research", 79 | "Topic :: Utilities", 80 | "Topic :: Scientific/Engineering :: Bio-Informatics", 81 | ], 82 | install_requires=[ 83 | 'py4j' 84 | ], 85 | ) 86 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.0.2 2 | --------------------------------------------------------------------------------