├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── pubchem_utils ├── __init__.py ├── pug.py ├── scripts │ ├── __init__.py │ ├── download_records.py │ ├── id_exchange.py │ └── test │ │ ├── __init__.py │ │ └── test_download_records.py └── test │ └── __init__.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 2.7 3 | 4 | before_install: 5 | 6 | # install code analysis tools 7 | - pip install pep8 pyflakes 8 | 9 | # install other packages 10 | - pip install joblib 11 | 12 | # install the package 13 | install: python setup.py install 14 | 15 | # run tests 16 | script: 17 | - nosetests 18 | - pep8 pubchem_utils 19 | - pyflakes pubchem_utils 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Stanford University 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pubchem-utils 2 | ============= 3 | 4 | Utilities for interacting with [PubChem](https://pubchem.ncbi.nlm.nih.gov) 5 | 6 | __Note:__ sometimes one or more of the tests fail but then pass when re-run. 
Until I can 7 | write better tests to capture this behavior, I have taken down the Travis indicator 8 | so as not to give a false impression. Please double-check your results when using this 9 | code in case of sporadic failures. 10 | 11 | Quick Start 12 | ----------- 13 | 14 | ```python 15 | from pubchem_utils import PubChem 16 | pc = PubChem() 17 | ``` 18 | 19 | Download 3D structures for a batch of CIDs: 20 | 21 | ```python 22 | pc.get_records([2244, 3672], filename='painkillers.sdf.gz', use_3d=True) 23 | ``` 24 | 25 | Retrieve SIDs active in a PubChem BioAssay experiment: 26 | 27 | ```python 28 | sids = pc.get_ids_from_assay(466, sids=True, activity_outcome='active') 29 | ``` 30 | 31 | Download the data table for a PubChem BioAssay experiment: 32 | 33 | ```python 34 | pc.get_assay_data(466, filename='AID466.csv.gz') 35 | ``` 36 | 37 | Get the PubChem CID for a compound in [ChEMBL](https://www.ebi.ac.uk/chembl): 38 | 39 | ```python 40 | id_map = pc.id_exchange('CHEMBL25') # source is inferred from ID string 41 | ``` 42 | 43 | Search PubChem for the CID matching a SMILES string: 44 | 45 | ```python 46 | cid = pc.structure_search('CC(=O)OC1=CC=CC=C1C(=O)O') 47 | ``` 48 | -------------------------------------------------------------------------------- /pubchem_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for interacting with PubChem. 3 | """ 4 | import json 5 | import numpy as np 6 | import shutil 7 | import re 8 | import time 9 | import urllib 10 | import urllib2 11 | 12 | from joblib import delayed, Parallel 13 | 14 | from .pug import PugQuery 15 | 16 | __author__ = "Steven Kearnes" 17 | __copyright__ = "Copyright 2014-2015, Stanford University" 18 | __license__ = "3-clause BSD" 19 | 20 | 21 | class PubChem(object): 22 | """ 23 | Submit queries to PUG and return PUGQuery objects. 
24 | 25 | Parameters 26 | ---------- 27 | submit : bool, optional (default True) 28 | Whether to automatically submit PugQuery queries. 29 | delay : int, optional (default 10) 30 | Number of seconds for PugQuery objects to wait between status 31 | checks. 32 | verbose : bool, optional (default False) 33 | Whether to create PUG queries in verbose mode. 34 | """ 35 | def __init__(self, submit=True, delay=10, verbose=False): 36 | self.submit = submit 37 | self.delay = delay 38 | self.verbose = verbose 39 | 40 | def get_query(self, query): 41 | """ 42 | Create a PUG request. 43 | 44 | Parameters 45 | ---------- 46 | query : str 47 | PUG query XML. 48 | """ 49 | return PugQuery(query, submit=self.submit, delay=self.delay, 50 | verbose=self.verbose) 51 | 52 | def get_records(self, ids, filename=None, sids=False, 53 | download_format='sdf', compression='gzip', use_3d=False, 54 | n_conformers=1): 55 | """ 56 | Download records for substances or compounds identified by 57 | PubChem substance IDs (SIDs) or compound IDs (CIDs). 58 | 59 | Parameters 60 | ---------- 61 | ids : iterable 62 | PubChem substance or compound IDs. 63 | filename : str, optional 64 | Output filename. If not provided, the data is read into memory. 65 | sids : bool, optional (default False) 66 | Whether ids are SIDs. If False, IDs are assumed to be CIDs. 67 | download_format : str, optional (default 'sdf') 68 | Download file format. 69 | compression : str, optional (default 'gzip') 70 | Compression type for downloaded structures. 71 | use_3d : bool, optional (default False) 72 | Whether to query 3D information. If False, 2D information is 73 | retrieved. 74 | n_conformers : int, optional (default 1) 75 | Number of conformers to download if retrieving 3D structures.
76 | """ 77 | query_template = """ 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | %(database)s 88 | 89 | %(uids)s 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | %(n_conformers)s 100 | 101 | 102 | 103 | 104 | 105 | 106 | """ 107 | mapping = {} 108 | 109 | # database 110 | if sids: 111 | mapping['database'] = 'pcsubstance' 112 | else: 113 | mapping['database'] = 'pccompound' 114 | 115 | # download format 116 | download_formats = ['text-asn', 'binary-asn', 'xml', 'sdf', 'image', 117 | 'image-small', 'smiles', 'inchi'] 118 | assert download_format in download_formats, ( 119 | 'download_format must be one of ' + str(download_formats)) 120 | mapping['download_format'] = download_format 121 | 122 | # compression 123 | if compression is None: 124 | compression = 'none' 125 | compressions = ['none', 'gzip', 'bzip2'] 126 | assert compression in compressions, ( 127 | 'compression must be one of ' + str(compressions)) 128 | mapping['compression'] = compression 129 | 130 | # 3D 131 | if use_3d: 132 | mapping['use_3d'] = 'true' 133 | else: 134 | mapping['use_3d'] = 'false' 135 | 136 | # conformers 137 | mapping['n_conformers'] = n_conformers 138 | 139 | # create XML for each ID 140 | xml_uids = '' 141 | for uid in ids: 142 | xml_uids += ('{}'.format(uid) + 143 | '\n') 144 | mapping['uids'] = xml_uids 145 | 146 | # construct query 147 | query = self.get_query(query_template % mapping) 148 | rval = query.fetch(filename, compression=compression) 149 | return rval 150 | 151 | def get_record(self, id, filename=None, sid=False, use_3d=False): 152 | """ 153 | Download a single record for a substance or compound identified by 154 | PubChem substance ID (SID) or compound ID (CID). 155 | 156 | Parameters 157 | ---------- 158 | id : str 159 | PubChem substance or compound ID. 160 | filename : str, optional 161 | Output filename. If not provided, the output is returned as a 162 | string 163 | sid : bool, optional (default False) 164 | Whether id is a SID. 
If False, ID is assumed to be a CID. 165 | use_3d : bool, optional (default False) 166 | Whether to query 3D information. If False, 2D information is 167 | retrieved. 168 | 169 | Returns 170 | ------- 171 | val : {str, None} 172 | The requested substance or compound, in an SDF-format string, or 173 | None if `filename` output is specified 174 | 175 | Notes 176 | ----- 177 | Requests for multiple substances, compounds or conformers can be 178 | batched together _much_ more efficiently by using `PubChem.get_records` 179 | 180 | Raises 181 | ------ 182 | Invalid CID or SID requests will result in a urllib2.HTTPError (400). 183 | Certain PubChem compounds and substances may not have available 3D 184 | structures, in which case this method, when called with use_3d=True, 185 | will throw a urllib2.HTTPError (404). 186 | """ 187 | 188 | base = 'http://pubchem.ncbi.nlm.nih.gov/rest/pug/%s?%s' 189 | if sid: 190 | specialization = 'substance/sid/%s/SDF' % id 191 | else: 192 | specialization = 'compound/cid/%s/SDF' % id 193 | 194 | if use_3d: 195 | params = {'record_type': '3d'} 196 | else: 197 | params = {} 198 | 199 | url = base % (specialization, urllib.urlencode(params)) 200 | comm = urllib2.urlopen(url) 201 | 202 | if filename is None: 203 | return comm.read() 204 | else: 205 | with open(filename, 'wb') as f: 206 | shutil.copyfileobj(comm, f) 207 | 208 | def get_parent_cids(self, cids): 209 | """ 210 | Get IDs of parent compounds. Note that the parent IDs are not 211 | guaranteed to be returned in the same order as the child IDs, so we 212 | return a set. 213 | 214 | Parameters 215 | ---------- 216 | cids : iterable 217 | PubChem compound IDs (CIDs). They are joined with commas into 218 | a single request URL, and every nonzero CID in the response is 219 | added to the returned set.
220 | """ 221 | url_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' + 222 | '/cid/%(cids)s/cids/TXT?cids_type=parent') 223 | mapping = {'cids': ','.join([str(cid) for cid in cids])} 224 | response = urllib2.urlopen(url_template % mapping) 225 | parents = set() 226 | for line in response.readlines(): 227 | cid = int(line) 228 | if cid: # 0 is not a valid ID 229 | parents.add(cid) 230 | return parents 231 | 232 | def get_ids_from_assay(self, aid, sids=False, activity_outcome=None): 233 | """ 234 | Retrieve substance or compound IDs tested in a PubChem BioAssay 235 | assay. 236 | 237 | Parameters 238 | ---------- 239 | aid : int 240 | PubChem BioAssay assay ID (AID). 241 | sids : bool, optional (default False) 242 | Whether ids are SIDs. If False, IDs are assumed to be CIDs. 243 | activity_outcome : str, optional 244 | If provided, only retrieve records with this activity outcome, 245 | such as 'active'. 246 | """ 247 | url_template = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid' + 248 | '/%(aid)s/%(database)s/txt') 249 | mapping = {'aid': aid} 250 | if sids: 251 | mapping['database'] = 'sids' 252 | else: 253 | mapping['database'] = 'cids' 254 | if activity_outcome is not None: 255 | url_template += '?{}_type={}'.format(mapping['database'], 256 | activity_outcome.lower()) 257 | url = url_template % mapping 258 | response = urllib2.urlopen(url) 259 | ids = [] 260 | for this in response.readlines(): 261 | this = this.strip() 262 | if int(this): # 0 is not a valid ID 263 | ids.append(this) 264 | ids = np.asarray(ids, dtype=int) 265 | return ids 266 | 267 | def get_assay_data(self, aids, filename=None, substance_view=True, 268 | concise=False, compression='gzip'): 269 | """ 270 | Download PubChem BioAssay data table. 271 | 272 | Parameters 273 | ---------- 274 | aids : array_like 275 | PubChem BioAssay IDs (AIDs). 276 | filename : str, optional 277 | Output filename. If not provided, a temporary file is created. 
278 | substance_view : bool, optional (default True) 279 | Whether to group results by substance. If False, results will be 280 | grouped by compound. The default (True) is recommended when 281 | retrieving data from a single assay. 282 | compression : str, optional (default 'gzip') 283 | Compression type for assay data. 284 | concise : bool, optional (default False) 285 | Whether to return the concise data table. If False, the complete 286 | data table is retrieved. 287 | """ 288 | query_template = """ 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 4 299 | 300 | 301 | 302 | 303 | pcassay 304 | 305 | %(aids)s 306 | 307 | 308 | 309 | 310 | 311 | %(dataset)s 312 | 313 | 314 | %(group_by)s 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | """ 328 | group_by = ('{}' + 329 | '') 330 | if substance_view: 331 | group_by = group_by.format('substance', 4) 332 | else: 333 | group_by = group_by.format('compound', 0) 334 | 335 | dataset = ('{}' + 336 | '') 337 | if concise: 338 | dataset = dataset.format('concise', 1) 339 | else: 340 | dataset = dataset.format('complete', 0) 341 | aid_xml = '' 342 | for aid in np.atleast_1d(aids): 343 | aid_xml += ('{}'.format(aid) + 344 | '') 345 | mapping = {'group_by': group_by, 'dataset': dataset, 'aids': aid_xml, 346 | 'compression': compression} 347 | query = self.get_query(query_template % mapping) 348 | rval = query.fetch(filename, compression=compression) 349 | return rval 350 | 351 | def get_assay_descriptions(self, aids, output_format='json', 352 | batch_size=500, n_jobs=1, max_attempts=3): 353 | """ 354 | Get assay descriptions. 355 | 356 | Parameters 357 | ---------- 358 | aids : list 359 | List of assay IDs. 360 | output_format : str (default='json') 361 | Output format. 
362 | """ 363 | results = Parallel(n_jobs=n_jobs, verbose=5)( 364 | delayed(_get_assay_descriptions) 365 | (this_aids, output_format, batch_size, max_attempts) 366 | for this_aids in np.array_split(aids, n_jobs)) 367 | descriptions = [] 368 | if output_format == 'json': 369 | for result in results: 370 | for this in result: 371 | data = json.loads(this) 372 | assert len(data) == 1 373 | assert data.keys()[0] == 'PC_AssayContainer' 374 | for description in data['PC_AssayContainer']: 375 | descriptions.append(description['assay']['descr']) 376 | else: 377 | raise NotImplementedError(output_format) 378 | return descriptions 379 | 380 | def id_exchange(self, ids, source=None, operation_type='same', 381 | output_type='cid'): 382 | """ 383 | Use the PubChem Identifier exchange service. 384 | 385 | Currently only supports mapping from Registry IDs (e.g. ChEMBL IDs) to 386 | PubChem IDs. 387 | 388 | Parameters 389 | ---------- 390 | ids : iterable 391 | Input identifiers. 392 | source : str, optional 393 | Input source. If None, it will be inferred from ids (if possible). 394 | operation_type : str, optional (default 'same') 395 | Operation type. Defaults to exact matches. 396 | output_type : str, optional (default 'cid') 397 | Output type. Defaults to PubChem CIDs. 
398 | """ 399 | query_template = """ 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | %(source)s 414 | 415 | %(source_ids)s 416 | 417 | 418 | 419 | 420 | 421 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | """ 436 | ids = np.atleast_1d(ids) 437 | if np.unique(ids).size != len(ids): 438 | raise ValueError('Source IDs must be unique.') 439 | if source is None: 440 | source = self.guess_source(ids[0]) 441 | if source is None: 442 | raise ValueError('Cannot guess identifier source.') 443 | mapping = {'source': source, 'operation_type': operation_type, 444 | 'output_type': output_type} 445 | source_ids = [] 446 | for source_id in ids: 447 | id_xml = ('{}'.format(source_id) + 448 | '\n') 449 | source_ids.append(id_xml) 450 | mapping['source_ids'] = ''.join(source_ids) 451 | 452 | # construct query 453 | query = self.get_query(query_template % mapping) 454 | rval = query.fetch(compression='gzip') 455 | 456 | # identify matched and unmatched IDs 457 | id_map = {} 458 | for line in rval.splitlines(): 459 | source, dest = line.split() 460 | try: 461 | dest = int(dest) # try to convert to an int 462 | except ValueError: 463 | pass 464 | if source in id_map and id_map[source] != dest: 465 | raise ValueError('Nonidentical duplicate mapping.') 466 | id_map[source] = dest 467 | for source_id in ids: 468 | if source_id not in id_map: 469 | id_map[source_id] = None 470 | return id_map 471 | 472 | @staticmethod 473 | def guess_source(identifier): 474 | """ 475 | Guess the source for an identifier. 476 | 477 | Parameters 478 | ---------- 479 | identifier : str 480 | Identifier. 
481 | """ 482 | source = None 483 | if str(identifier).startswith('CHEMBL'): 484 | source = 'ChEMBL' 485 | elif str(identifier).startswith('ZINC'): 486 | source = 'ZINC' 487 | return source 488 | 489 | def structure_search(self, structure, structure_format='smiles'): 490 | """ 491 | Search PubChem for identical structure and return matching CID. 492 | 493 | Parameters 494 | ---------- 495 | structure : str 496 | SMILES or SDF query. 497 | structure_format : str, optional (default 'smiles') 498 | Structure format. Can be either 'smiles' or 'sdf'. 499 | """ 500 | query_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' + 501 | '/identity/{}/XML') 502 | status_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug' + 503 | '/compound/listkey/{}/cids/XML') 504 | request_id = None 505 | post_data = urllib.urlencode({structure_format: structure}) 506 | req = urllib2.Request(query_template.format(structure_format)) 507 | req.add_header('Content-Type', 'application/x-www-form-urlencoded') 508 | response = urllib2.urlopen(req, data=post_data) 509 | for line in response.readlines(): 510 | search = re.search('(\d+)', line) 511 | if search is not None: 512 | request_id = search.groups()[0] 513 | if request_id is None: 514 | return None 515 | cid = None 516 | while True: 517 | try: 518 | response = urllib2.urlopen( 519 | status_template.format(request_id)) 520 | except urllib2.HTTPError: 521 | break 522 | for line in response.readlines(): 523 | search = re.search('(\d+)', line) 524 | if search is not None: 525 | cid = int(search.groups()[0]) 526 | if cid is not None: 527 | break 528 | time.sleep(self.delay) 529 | return cid 530 | 531 | 532 | def _get_assay_descriptions(aids, output_format='json', batch_size=500, 533 | max_attempts=3): 534 | """ 535 | Parallel worker for PubChem.get_assay_descriptions. 536 | 537 | Parameters 538 | ---------- 539 | aids : list 540 | List of assay IDs. 541 | output_format : str (default='json') 542 | Output format. 
543 | batch_size : int (default 500) 544 | Number of descriptions per request. 545 | max_attempts : int (default 3) 546 | Maximum number of query attempts. The batch_size is halved after each 547 | failure. 548 | """ 549 | url = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/' + 550 | '{aids}/description/{format}') 551 | descriptions = [] 552 | failures = 0 553 | start = 0 554 | while True: 555 | if start >= len(aids): 556 | break # stop when we are out of AIDs 557 | query_aids = aids[start:start+batch_size] 558 | query = url.format(aids=','.join([str(aid) for aid in query_aids]), 559 | format=output_format) 560 | try: 561 | response = urllib2.urlopen(query) 562 | except urllib2.HTTPError as e: 563 | failures += 1 564 | batch_size /= 2 # halve the batch size and try again 565 | if failures >= max_attempts: 566 | raise e 567 | continue 568 | descriptions.append(response.read()) 569 | failures = 0 # reset the failure count 570 | start += batch_size # move the start index 571 | return descriptions 572 | -------------------------------------------------------------------------------- /pubchem_utils/pug.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for interacting with the PubChem Power User Gateway (PUG). 3 | 4 | The PUG XML schema is located at 5 | https://pubchem.ncbi.nlm.nih.gov/pug/pug.xsd. 6 | 7 | See also https://pubchem.ncbi.nlm.nih.gov/pug/pughelp.html. 8 | """ 9 | import gzip 10 | import re 11 | from StringIO import StringIO 12 | import time 13 | import urllib 14 | import urllib2 15 | import warnings 16 | 17 | __author__ = "Steven Kearnes" 18 | __copyright__ = "Copyright 2014, Stanford University" 19 | __license__ = "3-clause BSD" 20 | 21 | 22 | class PugQuery(object): 23 | """ 24 | Submit a PUG query and store the download URL when it becomes 25 | available. 26 | 27 | Parameters 28 | ---------- 29 | query : str 30 | PUG query XML. 
31 | submit : bool, optional (default True) 32 | Whether to automatically submit the query. 33 | delay : int, optional (default 10) 34 | Number of seconds to wait between status checks. 35 | n_attempts : int, optional (default 3) 36 | Number of times to attempt query submission. 37 | verbose : bool, optional (default False) 38 | Whether to be verbose. 39 | """ 40 | cancel_template = """ 41 | 42 | 43 | 44 | 45 | 46 | %(id)s 47 | 48 | 49 | 50 | 51 | 52 | 53 | """ 54 | status_template = """ 55 | 56 | 57 | 58 | 59 | 60 | %(id)s 61 | 62 | 63 | 64 | 65 | 66 | 67 | """ 68 | url = 'https://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi' 69 | 70 | def __init__(self, query, submit=True, delay=10, n_attempts=3, 71 | verbose=False): 72 | self.query = query 73 | self.delay = delay 74 | self.n_attemps = n_attempts 75 | self.verbose = verbose 76 | 77 | self.id = None 78 | self.download_url = None 79 | self.filename = None 80 | self.data = None 81 | self.alive = False 82 | 83 | if submit: 84 | self.submit() 85 | 86 | def __del__(self): 87 | """ 88 | Cancel uncompleted queries. 89 | """ 90 | self.cancel() 91 | 92 | def request(self, query): 93 | """ 94 | Submit a query to PUG and extract either the query ID or the 95 | download URL. 96 | 97 | Parameters 98 | ---------- 99 | query : str 100 | PUG query XML. 
101 | """ 102 | q = None 103 | for i in xrange(self.n_attemps): 104 | try: 105 | q = urllib2.urlopen(self.url, query) 106 | break 107 | except urllib2.HTTPError as e: 108 | if i + 1 < self.n_attemps: 109 | continue 110 | else: 111 | raise e 112 | response = q.read() 113 | 114 | # check for errors 115 | status_re = re.search('', response) 116 | status = status_re.groups()[0] 117 | if status not in ['success', 'queued', 'running', 'stopped']: 118 | msg = 'Original Query:\n------\n{}\n'.format( 119 | '\n'.join(self.query.splitlines()[:100])) 120 | if query != self.query: 121 | msg += 'Current Query:\n--------------\n{}\n'.format( 122 | '\n'.join(query.splitlines()[:100])) 123 | msg += 'Response:\n---------\n{}'.format(response) 124 | raise PUGError(msg) 125 | 126 | # check for a download URL 127 | download_url_re = re.search( 128 | '\s*(.*?)\s*', 129 | response) 130 | if download_url_re is not None: 131 | self.download_url = download_url_re.groups()[0] 132 | 133 | # otherwise, extract the request ID 134 | elif self.id is None: 135 | reqid_re = re.search( 136 | '\s*(.*?)\s*', response) 137 | self.id = reqid_re.groups()[0] 138 | 139 | def cancel(self): 140 | """ 141 | Cancel a pending request. 142 | """ 143 | if self.alive: 144 | assert self.id is not None 145 | warnings.warn('Canceling PUG request.') 146 | query = self.cancel_template % {'id': self.id} 147 | self.request(query) 148 | self.alive = False 149 | 150 | def check_status(self): 151 | """ 152 | Check the status of the query. 153 | """ 154 | assert self.id is not None 155 | query = self.status_template % {'id': self.id} 156 | self.request(query) 157 | 158 | def submit(self): 159 | """ 160 | Submit the query and monitor its progress. 
161 | """ 162 | if self.alive: 163 | warnings.warn('This request is already active.') 164 | return 165 | self.alive = True 166 | self.request(self.query) 167 | if self.verbose: 168 | print self.id, 169 | while self.download_url is None: 170 | time.sleep(self.delay) 171 | self.check_status() 172 | self.alive = False 173 | 174 | def fetch(self, filename=None, compression=None): 175 | """ 176 | Fetch the result of the query. 177 | 178 | Parameters 179 | ---------- 180 | filename : str, optional 181 | Output filename. If not provided, the data is read into memory. 182 | compression : str, optional 183 | Compression type used to decode data. 184 | """ 185 | if not self.alive: 186 | self.submit() 187 | if self.download_url is None: 188 | raise PUGError('No download URL.') 189 | 190 | # fetch 191 | if filename is not None: 192 | filename, _ = urllib.urlretrieve(self.download_url, filename) 193 | self.filename = filename 194 | return filename 195 | else: 196 | data = urllib2.urlopen(self.download_url).read() 197 | if compression is not None: 198 | if compression == 'gzip': 199 | with gzip.GzipFile(fileobj=StringIO(data)) as f: 200 | data = f.read() 201 | else: 202 | raise NotImplementedError(compression) 203 | self.data = data 204 | return data 205 | 206 | 207 | class PUGError(Exception): 208 | """ 209 | PUG exception class. 210 | """ 211 | -------------------------------------------------------------------------------- /pubchem_utils/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scripting utilities. 3 | """ 4 | import gzip 5 | 6 | __author__ = "Steven Kearnes" 7 | __copyright__ = "Copyright 2014, Stanford University" 8 | __license__ = "3-clause BSD" 9 | 10 | 11 | def read_ids(filename): 12 | """ 13 | Read record IDs from a file. 14 | 15 | Parameters 16 | ---------- 17 | filename : str 18 | Filename containing record IDs. 
19 | """ 20 | if filename.endswith('.gz'): 21 | f = gzip.open(filename) 22 | else: 23 | f = open(filename) 24 | ids = [line.strip() for line in f] 25 | f.close() 26 | return ids 27 | -------------------------------------------------------------------------------- /pubchem_utils/scripts/download_records.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download records from PubChem by ID. 3 | """ 4 | import argparse 5 | 6 | from pubchem_utils import PubChem 7 | from pubchem_utils.scripts import read_ids 8 | 9 | __author__ = "Steven Kearnes" 10 | __copyright__ = "Copyright 2014, Stanford University" 11 | __license__ = "3-clause BSD" 12 | 13 | 14 | def parse_args(input_args=None): 15 | """ 16 | Parse command-line arguments. 17 | 18 | Parameters 19 | ---------- 20 | input_args : list, optional 21 | Input arguments. If not provided, defaults to sys.argv[1:]. 22 | """ 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('input', 25 | help='Input filename containing record IDs.') 26 | parser.add_argument('output', 27 | help='Output filename.') 28 | parser.add_argument('--sids', action='store_true', 29 | help='Whether IDs are substance IDs (if False, IDs ' + 30 | 'are assumed to be compound IDs).') 31 | parser.add_argument('-f', '--format', dest='download_format', 32 | default='sdf', help='Download format.') 33 | parser.add_argument('-c', '--compression', default='gzip', 34 | help='Compression type.') 35 | parser.add_argument('--3d', action='store_true', dest='use_3d', 36 | help='Whether to download 3D structures.') 37 | parser.add_argument('-n', '--n-conformers', type=int, default=1, 38 | help='Number of conformers to download if ' + 39 | 'retrieving 3D structures.') 40 | parser.add_argument('-d', '--delay', type=int, default=10, 41 | help='Number of seconds to wait between status ' + 42 | 'checks.') 43 | rval = parser.parse_args(input_args) 44 | return rval 45 | 46 | 47 | def main(ids, filename=None, sids=False, 
download_format='sdf', 48 | compression='gzip', use_3d=False, n_conformers=1, delay=10): 49 | """ 50 | Download records from PubChem by ID. 51 | 52 | Parameters 53 | ---------- 54 | ids : iterable 55 | PubChem substance or compound IDs. 56 | filename : str, optional 57 | Output filename. If not provided, the download is not saved to a file. 58 | sids : bool, optional (default False) 59 | Whether ids are SIDs. If False, IDs are assumed to be CIDs. 60 | download_format : str, optional (default 'sdf') 61 | Download file format. 62 | compression : str, optional (default 'gzip') 63 | Compression type for downloaded structures. 64 | use_3d : bool, optional (default False) 65 | Whether to query 3D information. If False, 2D information is 66 | retrieved. 67 | n_conformers : int, optional (default 1) 68 | Number of conformers to download if retrieving 3D structures. 69 | delay : int, optional (default 10) 70 | Number of seconds to wait between status checks. 71 | """ 72 | engine = PubChem(delay=delay) 73 | engine.get_records(ids, filename, sids, download_format, compression, 74 | use_3d, n_conformers) 75 | 76 | if __name__ == '__main__': 77 | args = parse_args() 78 | record_ids = read_ids(args.input) 79 | main(record_ids, args.output, args.sids, args.download_format, 80 | args.compression, args.use_3d, args.n_conformers, args.delay) 81 | -------------------------------------------------------------------------------- /pubchem_utils/scripts/id_exchange.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Use the PubChem Identifier Exchange service. 4 | """ 5 | import argparse 6 | import numpy as np 7 | 8 | from pubchem_utils import PubChem 9 | from pubchem_utils.scripts import read_ids 10 | 11 | __author__ = "Steven Kearnes" 12 | __copyright__ = "Copyright 2014, Stanford University" 13 | __license__ = "3-clause BSD" 14 | 15 | 16 | def parse_args(input_args=None): 17 | """ 18 | Parse command-line arguments.
19 | 20 | Parameters 21 | ---------- 22 | input_args : list, optional 23 | Input arguments. If not provided, defaults to sys.argv[1:]. 24 | """ 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('input', 27 | help='Input filename containing record IDs.') 28 | parser.add_argument('-s', '--source', 29 | help='Source for input IDs. If not provided, the ' + 30 | 'source will be inferred from the input IDs.') 31 | parser.add_argument('-m', '--mapping', action='store_true', 32 | help='Whether to write ID mapping. If false, only ' + 33 | 'result IDs will be saved.') 34 | parser.add_argument('-p', '--prefix', 35 | help='Prefix for output files.') 36 | parser.add_argument('--sids', action='store_true', 37 | help='Whether returned IDs are substance IDs ' 38 | '(if False, returned IDs will be compound IDs).') 39 | parser.add_argument('-d', '--delay', type=int, default=10, 40 | help='Number of seconds to wait between status ' + 41 | 'checks.') 42 | return parser.parse_args(input_args) 43 | 44 | 45 | def main(ids, source=None, prefix=None, sids=False, mapping=False, delay=10): 46 | """ 47 | Download records from PubChem by ID. 48 | 49 | Parameters 50 | ---------- 51 | ids : iterable 52 | Source IDs. 53 | source : str, optional 54 | Input source. If None, it will be inferred from ids (if possible). 55 | prefix : str, optional 56 | Prefix for output files. 57 | sids : bool, optional (default False) 58 | Whether ids are SIDs. If False, IDs are assumed to be CIDs. 59 | mapping : bool, optional (default False) 60 | delay : int, optional (default 10) 61 | Number of seconds to wait between status checks. 
62 | """ 63 | engine = PubChem(delay=delay) 64 | if sids: 65 | output_type = 'sid' 66 | else: 67 | output_type = 'cid' 68 | matched, unmatched = engine.id_exchange(np.unique(ids), source, 69 | output_type=output_type) 70 | if mapping: 71 | with open('{}-mapping.txt'.format(prefix), 'wb') as f: 72 | for key, value in matched.items(): 73 | f.write('{}\t{}\n'.format(key, value)) 74 | else: 75 | with open('{}-matched.txt'.format(prefix), 'wb') as f: 76 | for value in matched.values(): 77 | f.write('{}\n'.format(value)) 78 | if len(unmatched): 79 | with open('{}-unmatched.txt'.format(prefix), 'wb') as f: 80 | for value in unmatched: 81 | f.write('{}\n'.format(value)) 82 | 83 | if __name__ == '__main__': 84 | args = parse_args() 85 | record_ids = read_ids(args.input) 86 | main(record_ids, args.source, args.prefix, args.sids, args.mapping, 87 | args.delay) 88 | -------------------------------------------------------------------------------- /pubchem_utils/scripts/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skearnes/pubchem-utils/6dd796d4e0ef65641547c429ed39ce39d0742510/pubchem_utils/scripts/test/__init__.py -------------------------------------------------------------------------------- /pubchem_utils/scripts/test/test_download_records.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for download_records.py. 3 | """ 4 | import numpy as np 5 | import shutil 6 | import tempfile 7 | import unittest 8 | 9 | from .. import read_ids 10 | from ..download_records import main, parse_args 11 | 12 | 13 | class TestDownloadIds(unittest.TestCase): 14 | """ 15 | Tests for download_records.py. 16 | """ 17 | def setUp(self): 18 | """ 19 | Set up tests. 
class TestDownloadIds(unittest.TestCase):
    """
    Tests for download_records.py.
    """
    def setUp(self):
        """
        Set up tests.

        Creates a scratch directory containing an empty output file, a file
        of CIDs, and a file of SIDs (one ID per line).
        """
        self.temp_dir = tempfile.mkdtemp()
        self.filename = self._make_file()

        # write CIDs
        self.cids = np.asarray([2244])
        self.cid_filename = self._make_file(self.cids, suffix='.txt')

        # write SIDs
        self.sids = [179038559]
        self.sid_filename = self._make_file(self.sids, suffix='.txt')

    def _make_file(self, lines=(), suffix=''):
        """
        Create a file in the scratch directory and return its path.

        The file is written through the handle returned by
        NamedTemporaryFile, so no OS-level descriptor is left open (a bare
        mkstemp() call returns a descriptor that must be closed by the
        caller). Text mode is used because the lines are str.

        Parameters
        ----------
        lines : iterable, optional
            Items to write, one per line.
        suffix : str, optional
            Filename suffix.
        """
        with tempfile.NamedTemporaryFile(mode='w', suffix=suffix,
                                         dir=self.temp_dir,
                                         delete=False) as f:
            f.writelines('{}\n'.format(line) for line in lines)
        return f.name

    def tearDown(self):
        """
        Clean up tests.
        """
        shutil.rmtree(self.temp_dir)

    def run_script(self, ids, args):
        """
        Run main loop of script.

        Parameters
        ----------
        ids : iterable
            Record IDs to download.
        args : argparse.Namespace
            Parsed arguments from parse_args.
        """
        main(ids, args.output, args.sids, args.download_format,
             args.compression, args.use_3d, args.n_conformers, args.delay)

    def test_read_ids(self):
        """
        Test read_ids.
        """
        ids = read_ids(self.cid_filename)
        assert np.array_equal(np.asarray(ids, dtype=self.cids.dtype),
                              self.cids)

    def test_download_cid(self):
        """
        Download a CID.
        """
        ids = read_ids(self.cid_filename)
        args = parse_args([self.cid_filename, self.filename])
        self.run_script(ids, args)
class TestPubChem(unittest.TestCase):
    """
    Tests for PubChem.

    Reference comparisons are made when possible to records retrieved using the
    PubChem PUG REST interface via http://pubchem.ncbi.nlm.nih.gov/rest/pug.
    These tests require network access.
    """
    def setUp(self):
        """
        Set up tests.
        """
        self.engine = PubChem(delay=3)  # shorten delay for tests
        self.rest_url = 'http://pubchem.ncbi.nlm.nih.gov/rest/pug'

    def _fetch_reference(self, path):
        """
        Download reference data from the PUG REST interface.

        The URL is assembled with '/' explicitly; os.path.join uses the
        platform path separator and is not suitable for URLs.

        Parameters
        ----------
        path : str
            Request path relative to the REST base URL.
        """
        url = '/'.join((self.rest_url, path))
        return urllib2.urlopen(url).read()

    def _fetch_reference_ids(self, path):
        """
        Download a whitespace-separated ID list as an integer array.

        Parameters
        ----------
        path : str
            Request path relative to the REST base URL.
        """
        return np.asarray(self._fetch_reference(path).split(), dtype=int)

    def identical_sdf(self, a, b):
        """
        Compare SDF records.

        SDF records downloaded from PubChem have a timestamp that should not be
        considered in the comparison.

        Explicit comparisons are used instead of assert statements so the
        result does not change when assertions are disabled (python -O).

        Parameters
        ----------
        a, b : str
            SDF records to compare.

        Returns
        -------
        bool
            True if the records match on every line except line index 1.
        """
        if a == b:  # sometimes the timestamps match
            return True

        a_lines = a.split('\n')
        b_lines = b.split('\n')
        if len(a_lines) != len(b_lines):
            return False
        for i, (a_line, b_line) in enumerate(zip(a_lines, b_lines)):
            if i == 1:
                # Line 1 is excluded from the comparison (presumably it
                # carries the timestamp); both must still be OEChem headers.
                if not a_line.strip().startswith('-OEChem'):
                    return False
                if not b_line.strip().startswith('-OEChem'):
                    return False
                continue
            if a_line != b_line:
                return False
        return True

    def test_get_records_cid(self):
        """
        2D CID request with get_records().
        """
        ref = self._fetch_reference('compound/cid/2244/SDF')
        data = self.engine.get_records([2244])
        assert self.identical_sdf(data, ref)

    def test_get_record_cid(self):
        """
        2D CID request with get_record().
        """
        ref = self._fetch_reference('compound/cid/2244/SDF')
        data = self.engine.get_record(2244)
        assert self.identical_sdf(data, ref)

    def test_get_records_sid(self):
        """
        SID request with get_records().
        """
        ref = self._fetch_reference('substance/sid/179038559/SDF')
        data = self.engine.get_records([179038559], sids=True)
        assert self.identical_sdf(data, ref)

    def test_get_record_sid(self):
        """
        SID request with get_record().
        """
        ref = self._fetch_reference('substance/sid/179038559/SDF')
        data = self.engine.get_record(179038559, sid=True)
        assert self.identical_sdf(data, ref)

    def test_get_records_3d(self):
        """
        3D structure request with get_records().
        """
        ref = self._fetch_reference('compound/cid/2244/SDF?record_type=3d')
        data = self.engine.get_records([2244], use_3d=True)
        assert self.identical_sdf(data, ref)

    def test_get_record_3d(self):
        """
        3D structure request with get_record().
        """
        ref = self._fetch_reference('compound/cid/2244/SDF?record_type=3d')
        data = self.engine.get_record(2244, use_3d=True)
        assert self.identical_sdf(data, ref)

    def test_aid_cids(self):
        """
        Fetch CIDs from an AID.
        """
        ref = self._fetch_reference_ids('assay/aid/466/cids/TXT')
        data = self.engine.get_ids_from_assay(466)
        assert np.array_equal(data, ref)

    def test_aid_sids(self):
        """
        Fetch SIDs from an AID.
        """
        ref = self._fetch_reference_ids('assay/aid/466/sids/TXT')
        data = self.engine.get_ids_from_assay(466, sids=True)
        assert np.array_equal(data, ref)

    def test_aid_active_cids(self):
        """
        Fetch active CIDs from an AID.
        """
        ref = self._fetch_reference_ids(
            'assay/aid/466/cids/TXT?cids_type=active')
        data = self.engine.get_ids_from_assay(466, activity_outcome='active')
        assert np.array_equal(data, ref)

    def test_aid_inactive_cids(self):
        """
        Fetch inactive CIDs from an AID.
        """
        ref = self._fetch_reference_ids(
            'assay/aid/466/cids/TXT?cids_type=inactive')
        data = self.engine.get_ids_from_assay(466, activity_outcome='inactive')
        assert np.array_equal(data, ref)

    def test_aid_active_sids(self):
        """
        Fetch active SIDs from an AID.
        """
        ref = self._fetch_reference_ids(
            'assay/aid/466/sids/TXT?sids_type=active')
        data = self.engine.get_ids_from_assay(466, sids=True,
                                              activity_outcome='active')
        assert np.array_equal(data, ref)

    def test_aid_inactive_sids(self):
        """
        Fetch inactive SIDs from an AID.
        """
        ref = self._fetch_reference_ids(
            'assay/aid/466/sids/TXT?sids_type=inactive')
        data = self.engine.get_ids_from_assay(466, sids=True,
                                              activity_outcome='inactive')
        assert np.array_equal(data, ref)

    def test_get_assay_data(self):
        """
        Test PubChem.get_assay_data.
        """
        data = self.engine.get_assay_data(504772)
        assert len(data.splitlines()) == 332  # 331 records plus header

    def test_get_assay_descriptions(self):
        """
        Test PubChem.get_assay_descriptions.
        """
        data = self.engine.get_assay_descriptions([490])
        assert len(data) == 1
        assert data[0]['aid']['id'] == 490  # check AID

    def test_get_assay_descriptions_parallel(self):
        """
        Test PubChem.get_assay_descriptions with n_jobs > 1.
        """
        aids = [490, 466, 9, 548, 851]
        data = self.engine.get_assay_descriptions(aids, n_jobs=2)
        assert len(data) == 5

        # check AIDs are all present (order is not guaranteed)
        desc_aids = [desc['aid']['id'] for desc in data]
        assert np.array_equal(np.sort(aids), np.sort(desc_aids))

    def test_id_exchange(self):
        """
        Test PubChem.id_exchange.
        """
        # NOTE(review): scripts/id_exchange.py unpacks the return value of
        # id_exchange as (matched, unmatched), while this test indexes it
        # like a dict -- confirm which shape PubChem.id_exchange returns.
        data = self.engine.id_exchange('CHEMBL25')
        assert data['CHEMBL25'] == 2244

    def test_structure_search_smiles(self):
        """
        Test PubChem.structure_search with SMILES queries.
        """
        smiles = self.engine.get_records([2244], download_format='smiles')
        # the second whitespace-separated field is used as the query
        smiles = smiles.split()[1]
        assert self.engine.structure_search(smiles) == 2244

    def test_structure_search_sdf(self):
        """
        Test PubChem.structure_search with SDF queries.
        """
        sdf = self.engine.get_records([2244])
        assert self.engine.structure_search(
            sdf, structure_format='sdf') == 2244

    def test_get_parent_cids(self):
        """
        Test PubChem.get_parent_cids.
        """
        same = self.engine.get_parent_cids([2244])
        assert same == {2244}, same
        parents = self.engine.get_parent_cids([23666729, 5338317])
        assert parents == {2244, 3672}, parents

    def test_get_record_filename(self):
        """
        Test PubChem.get_record()'s filename kwarg.
        """
        fd, fn = tempfile.mkstemp()
        try:
            ref = self.engine.get_record(2244)
            self.engine.get_record(2244, filename=fn)
            with open(fn) as f:
                data = f.read()
            assert self.identical_sdf(data, ref)
        finally:
            # release the descriptor from mkstemp and remove the file
            os.close(fd)
            os.unlink(fn)
def main():
    """
    Run setuptools' setup() with the pubchem_utils package metadata.
    """
    metadata = {
        'name': 'pubchem_utils',
        'version': '0.1',
        'license': '3-clause BSD',
        'url': 'https://github.com/skearnes/pubchem-utils',
        'description': 'Utilities for interacting with PubChem',
        # packages are discovered at call time
        'packages': find_packages(),
    }
    setup(**metadata)

if __name__ == '__main__':
    main()