├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── pubchem_utils
├── __init__.py
├── pug.py
├── scripts
│ ├── __init__.py
│ ├── download_records.py
│ ├── id_exchange.py
│ └── test
│ │ ├── __init__.py
│ │ └── test_download_records.py
└── test
│ └── __init__.py
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 |
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 |
37 | # Translations
38 | *.mo
39 |
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 |
45 | # Rope
46 | .ropeproject
47 |
48 | # Django stuff:
49 | *.log
50 | *.pot
51 |
52 | # Sphinx documentation
53 | docs/_build/
54 |
55 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python: 2.7
3 |
4 | before_install:
5 |
6 | # install code analysis tools
7 | - pip install pep8 pyflakes
8 |
9 | # install other packages
10 | - pip install joblib
11 |
12 | # install the package
13 | install: python setup.py install
14 |
15 | # run tests
16 | script:
17 | - nosetests
18 | - pep8 pubchem_utils
19 | - pyflakes pubchem_utils
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014, Stanford University
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * Neither the name of the copyright holder nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | pubchem-utils
2 | =============
3 |
4 | Utilities for interacting with [PubChem](https://pubchem.ncbi.nlm.nih.gov)
5 |
6 | __Note:__ sometimes one or more of the tests fail but then pass when re-run. Until I can
7 | write better tests to capture this behavior, I have taken down the Travis indicator
8 | so as not to give a false impression. Please double-check your results when using this
9 | code in case of sporadic failures.
10 |
11 | Quick Start
12 | -----------
13 |
14 | ```python
15 | from pubchem_utils import PubChem
16 | pc = PubChem()
17 | ```
18 |
19 | Download 3D structures for a batch of CIDs:
20 |
21 | ```python
22 | pc.get_records([2244, 3672], filename='painkillers.sdf.gz', use_3d=True)
23 | ```
24 |
25 | Retrieve SIDs active in a PubChem BioAssay experiment:
26 |
27 | ```python
28 | sids = pc.get_ids_from_assay(466, sids=True, activity_outcome='active')
29 | ```
30 |
31 | Download the data table for a PubChem BioAssay experiment:
32 |
33 | ```python
34 | pc.get_assay_data(466, filename='AID466.csv.gz')
35 | ```
36 |
37 | Get the PubChem CID for a compound in [ChEMBL](https://www.ebi.ac.uk/chembl):
38 |
39 | ```python
40 | id_map = pc.id_exchange('CHEMBL25') # source is inferred from ID string
41 | ```
42 |
43 | Search PubChem for the CID matching a SMILES string:
44 |
45 | ```python
46 | cid = pc.structure_search('CC(=O)OC1=CC=CC=C1C(=O)O')
47 | ```
48 |
--------------------------------------------------------------------------------
/pubchem_utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for interacting with PubChem.
3 | """
4 | import json
5 | import numpy as np
6 | import shutil
7 | import re
8 | import time
9 | import urllib
10 | import urllib2
11 |
12 | from joblib import delayed, Parallel
13 |
14 | from .pug import PugQuery
15 |
16 | __author__ = "Steven Kearnes"
17 | __copyright__ = "Copyright 2014-2015, Stanford University"
18 | __license__ = "3-clause BSD"
19 |
20 |
21 | class PubChem(object):
22 | """
23 | Submit queries to PUG and return PUGQuery objects.
24 |
25 | Parameters
26 | ----------
27 | submit : bool, optional (default True)
28 | Whether to automatically submit PUGQuery queries.
29 | delay : int, optional (default 10)
30 | Number of seconds for PUGQuery objects to wait between status
31 | checks.
32 | verbose : bool, optional (default False)
33 | Whether to create PUG queries in verbose mode.
34 | """
35 | def __init__(self, submit=True, delay=10, verbose=False):
36 | self.submit = submit
37 | self.delay = delay
38 | self.verbose = verbose
39 |
40 | def get_query(self, query):
41 | """
42 | Create a PUG request.
43 |
44 | Parameters
45 | ----------
46 | query : str
47 | PUG query XML.
48 | """
49 | return PugQuery(query, submit=self.submit, delay=self.delay,
50 | verbose=self.verbose)
51 |
52 | def get_records(self, ids, filename=None, sids=False,
53 | download_format='sdf', compression='gzip', use_3d=False,
54 | n_conformers=1):
55 | """
56 | Download records for substances or compounds identified by
57 | PubChem substance IDs (SIDs) or compound IDs (CIDs).
58 |
59 | Parameters
60 | ----------
61 | ids : iterable
62 | PubChem substance or compound IDs.
63 | filename : str, optional
64 | Output filename. If not provided, a temporary file is created.
65 | sids : bool, optional (default False)
66 | Whether ids are SIDs. If False, IDs are assumed to be CIDs.
67 | download_format : str, optional (default 'sdf')
68 | Download file format.
69 | compression : str, optional (default 'gzip')
70 | Compression type for downloaded structures.
71 | use_3d : bool, optional (default False)
72 | Whether to query 3D information. If False, 2D information is
73 | retrieved.
74 | n_conformers : int, optional (default 1)
75 | Number of conformers to download if retrieving 3D structures.
76 | """
77 | query_template = """
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | %(database)s
88 |
89 | %(uids)s
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 | %(n_conformers)s
100 |
101 |
102 |
103 |
104 |
105 |
106 | """
107 | mapping = {}
108 |
109 | # database
110 | if sids:
111 | mapping['database'] = 'pcsubstance'
112 | else:
113 | mapping['database'] = 'pccompound'
114 |
115 | # download format
116 | download_formats = ['text-asn', 'binary-asn', 'xml', 'sdf', 'image',
117 | 'image-small', 'smiles', 'inchi']
118 | assert download_format in download_formats, (
119 | 'download_format must be one of ' + str(download_formats))
120 | mapping['download_format'] = download_format
121 |
122 | # compression
123 | if compression is None:
124 | compression = 'none'
125 | compressions = ['none', 'gzip', 'bzip2']
126 | assert compression in compressions, (
127 | 'compression must be one of ' + str(compressions))
128 | mapping['compression'] = compression
129 |
130 | # 3D
131 | if use_3d:
132 | mapping['use_3d'] = 'true'
133 | else:
134 | mapping['use_3d'] = 'false'
135 |
136 | # conformers
137 | mapping['n_conformers'] = n_conformers
138 |
139 | # create XML for each ID
140 | xml_uids = ''
141 | for uid in ids:
142 | xml_uids += ('{}'.format(uid) +
143 | '\n')
144 | mapping['uids'] = xml_uids
145 |
146 | # construct query
147 | query = self.get_query(query_template % mapping)
148 | rval = query.fetch(filename, compression=compression)
149 | return rval
150 |
def get_record(self, id, filename=None, sid=False, use_3d=False):
    """
    Download a single record for a substance or compound identified by
    PubChem substance ID (SID) or compound ID (CID).

    Parameters
    ----------
    id : str
        PubChem substance or compound ID.
    filename : str, optional
        Output filename. If not provided, the output is returned as a
        string.
    sid : bool, optional (default False)
        Whether id is a SID. If False, ID is assumed to be a CID.
    use_3d : bool, optional (default False)
        Whether to query 3D information. If False, 2D information is
        retrieved.

    Returns
    -------
    val : {str, None}
        The requested substance or compound, in an SDF-format string, or
        None if `filename` output is specified.

    Notes
    -----
    Requests for multiple substances, compounds or conformers can be
    batched together _much_ more efficiently by using
    `PubChem.get_records`.

    Raises
    ------
    Invalid CID or SID requests will result in a urllib2.HTTPError (400).
    Certain PubChem compounds and substances may not have available 3D
    structures, in which case this method, when called with use_3d=True,
    will throw a urllib2.HTTPError (404).
    """
    base = 'http://pubchem.ncbi.nlm.nih.gov/rest/pug/%s?%s'
    if sid:
        specialization = 'substance/sid/%s/SDF' % id
    else:
        specialization = 'compound/cid/%s/SDF' % id

    # An empty parameter dict encodes to an empty query string.
    params = {'record_type': '3d'} if use_3d else {}
    url = base % (specialization, urllib.urlencode(params))

    comm = urllib2.urlopen(url)
    try:
        if filename is None:
            return comm.read()
        with open(filename, 'wb') as f:
            shutil.copyfileobj(comm, f)
        return None
    finally:
        # Always release the HTTP connection, even if reading or writing
        # the record fails part-way through.
        comm.close()
207 |
def get_parent_cids(self, cids):
    """
    Get IDs of parent compounds.

    Note that the parent IDs are not guaranteed to be returned in the
    same order as the child IDs, so a set is returned.

    Parameters
    ----------
    cids : iterable
        PubChem compound IDs (CIDs) whose parents are requested.

    Returns
    -------
    parents : set
        Parent CIDs as ints. Entries equal to 0 are dropped because 0 is
        not a valid ID.
    """
    url_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' +
                    '/cid/%(cids)s/cids/TXT?cids_type=parent')
    mapping = {'cids': ','.join(str(cid) for cid in cids)}
    response = urllib2.urlopen(url_template % mapping)
    try:
        lines = response.readlines()
    finally:
        response.close()  # do not leak the HTTP connection

    parents = set()
    for line in lines:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in the response
        cid = int(line)
        if cid:  # 0 is not a valid ID
            parents.add(cid)
    return parents
231 |
def get_ids_from_assay(self, aid, sids=False, activity_outcome=None):
    """
    Retrieve substance or compound IDs tested in a PubChem BioAssay
    assay.

    Parameters
    ----------
    aid : int
        PubChem BioAssay assay ID (AID).
    sids : bool, optional (default False)
        Whether to retrieve SIDs. If False, CIDs are retrieved.
    activity_outcome : str, optional
        If provided, only retrieve records with this activity outcome,
        such as 'active'.

    Returns
    -------
    ids : ndarray
        Integer array of the retrieved IDs. Entries equal to 0 are
        dropped because 0 is not a valid ID.
    """
    database = 'sids' if sids else 'cids'
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid' +
           '/%(aid)s/%(database)s/txt') % {'aid': aid,
                                           'database': database}
    if activity_outcome is not None:
        # Appended after %-formatting so that a literal '%' in the
        # outcome cannot be misread as a format directive.
        url += '?{}_type={}'.format(database, activity_outcome.lower())

    response = urllib2.urlopen(url)
    try:
        lines = response.readlines()
    finally:
        response.close()  # do not leak the HTTP connection

    ids = []
    for this in lines:
        this = this.strip()
        if not this:
            continue  # tolerate blank lines in the response
        value = int(this)
        if value:  # 0 is not a valid ID
            ids.append(value)
    return np.asarray(ids, dtype=int)
266 |
267 | def get_assay_data(self, aids, filename=None, substance_view=True,
268 | concise=False, compression='gzip'):
269 | """
270 | Download PubChem BioAssay data table.
271 |
272 | Parameters
273 | ----------
274 | aids : array_like
275 | PubChem BioAssay IDs (AIDs).
276 | filename : str, optional
277 | Output filename. If not provided, a temporary file is created.
278 | substance_view : bool, optional (default True)
279 | Whether to group results by substance. If False, results will be
280 | grouped by compound. The default (True) is recommended when
281 | retrieving data from a single assay.
282 | compression : str, optional (default 'gzip')
283 | Compression type for assay data.
284 | concise : bool, optional (default False)
285 | Whether to return the concise data table. If False, the complete
286 | data table is retrieved.
287 | """
288 | query_template = """
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 | 4
299 |
300 |
301 |
302 |
303 | pcassay
304 |
305 | %(aids)s
306 |
307 |
308 |
309 |
310 |
311 | %(dataset)s
312 |
313 |
314 | %(group_by)s
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 | """
328 | group_by = ('{}' +
329 | '')
330 | if substance_view:
331 | group_by = group_by.format('substance', 4)
332 | else:
333 | group_by = group_by.format('compound', 0)
334 |
335 | dataset = ('{}' +
336 | '')
337 | if concise:
338 | dataset = dataset.format('concise', 1)
339 | else:
340 | dataset = dataset.format('complete', 0)
341 | aid_xml = ''
342 | for aid in np.atleast_1d(aids):
343 | aid_xml += ('{}'.format(aid) +
344 | '')
345 | mapping = {'group_by': group_by, 'dataset': dataset, 'aids': aid_xml,
346 | 'compression': compression}
347 | query = self.get_query(query_template % mapping)
348 | rval = query.fetch(filename, compression=compression)
349 | return rval
350 |
def get_assay_descriptions(self, aids, output_format='json',
                           batch_size=500, n_jobs=1, max_attempts=3):
    """
    Get assay descriptions.

    Parameters
    ----------
    aids : list
        List of assay IDs.
    output_format : str (default='json')
        Output format. Only 'json' is currently supported.
    batch_size : int (default 500)
        Number of descriptions per request.
    n_jobs : int (default 1)
        Number of parallel jobs. The AIDs are split into this many
        chunks, one per job.
    max_attempts : int (default 3)
        Maximum number of query attempts passed to each worker.

    Returns
    -------
    descriptions : list
        The 'descr' entry of every assay in the parsed responses.

    Raises
    ------
    NotImplementedError
        If `output_format` is not 'json'.
    """
    # Reject unsupported formats before spending time on downloads.
    if output_format != 'json':
        raise NotImplementedError(output_format)

    results = Parallel(n_jobs=n_jobs, verbose=5)(
        delayed(_get_assay_descriptions)
        (this_aids, output_format, batch_size, max_attempts)
        for this_aids in np.array_split(aids, n_jobs))

    descriptions = []
    for result in results:
        for this in result:
            data = json.loads(this)
            # Each response is expected to be a single-key container.
            assert len(data) == 1
            assert list(data.keys())[0] == 'PC_AssayContainer'
            for description in data['PC_AssayContainer']:
                descriptions.append(description['assay']['descr'])
    return descriptions
379 |
380 | def id_exchange(self, ids, source=None, operation_type='same',
381 | output_type='cid'):
382 | """
383 | Use the PubChem Identifier exchange service.
384 |
385 | Currently only supports mapping from Registry IDs (e.g. ChEMBL IDs) to
386 | PubChem IDs.
387 |
388 | Parameters
389 | ----------
390 | ids : iterable
391 | Input identifiers.
392 | source : str, optional
393 | Input source. If None, it will be inferred from ids (if possible).
394 | operation_type : str, optional (default 'same')
395 | Operation type. Defaults to exact matches.
396 | output_type : str, optional (default 'cid')
397 | Output type. Defaults to PubChem CIDs.
398 | """
399 | query_template = """
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 | %(source)s
414 |
415 | %(source_ids)s
416 |
417 |
418 |
419 |
420 |
421 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 | """
436 | ids = np.atleast_1d(ids)
437 | if np.unique(ids).size != len(ids):
438 | raise ValueError('Source IDs must be unique.')
439 | if source is None:
440 | source = self.guess_source(ids[0])
441 | if source is None:
442 | raise ValueError('Cannot guess identifier source.')
443 | mapping = {'source': source, 'operation_type': operation_type,
444 | 'output_type': output_type}
445 | source_ids = []
446 | for source_id in ids:
447 | id_xml = ('{}'.format(source_id) +
448 | '\n')
449 | source_ids.append(id_xml)
450 | mapping['source_ids'] = ''.join(source_ids)
451 |
452 | # construct query
453 | query = self.get_query(query_template % mapping)
454 | rval = query.fetch(compression='gzip')
455 |
456 | # identify matched and unmatched IDs
457 | id_map = {}
458 | for line in rval.splitlines():
459 | source, dest = line.split()
460 | try:
461 | dest = int(dest) # try to convert to an int
462 | except ValueError:
463 | pass
464 | if source in id_map and id_map[source] != dest:
465 | raise ValueError('Nonidentical duplicate mapping.')
466 | id_map[source] = dest
467 | for source_id in ids:
468 | if source_id not in id_map:
469 | id_map[source_id] = None
470 | return id_map
471 |
472 | @staticmethod
473 | def guess_source(identifier):
474 | """
475 | Guess the source for an identifier.
476 |
477 | Parameters
478 | ----------
479 | identifier : str
480 | Identifier.
481 | """
482 | source = None
483 | if str(identifier).startswith('CHEMBL'):
484 | source = 'ChEMBL'
485 | elif str(identifier).startswith('ZINC'):
486 | source = 'ZINC'
487 | return source
488 |
489 | def structure_search(self, structure, structure_format='smiles'):
490 | """
491 | Search PubChem for identical structure and return matching CID.
492 |
493 | Parameters
494 | ----------
495 | structure : str
496 | SMILES or SDF query.
497 | structure_format : str, optional (default 'smiles')
498 | Structure format. Can be either 'smiles' or 'sdf'.
499 | """
500 | query_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' +
501 | '/identity/{}/XML')
502 | status_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug' +
503 | '/compound/listkey/{}/cids/XML')
504 | request_id = None
505 | post_data = urllib.urlencode({structure_format: structure})
506 | req = urllib2.Request(query_template.format(structure_format))
507 | req.add_header('Content-Type', 'application/x-www-form-urlencoded')
508 | response = urllib2.urlopen(req, data=post_data)
509 | for line in response.readlines():
510 | search = re.search('(\d+)', line)
511 | if search is not None:
512 | request_id = search.groups()[0]
513 | if request_id is None:
514 | return None
515 | cid = None
516 | while True:
517 | try:
518 | response = urllib2.urlopen(
519 | status_template.format(request_id))
520 | except urllib2.HTTPError:
521 | break
522 | for line in response.readlines():
523 | search = re.search('(\d+)', line)
524 | if search is not None:
525 | cid = int(search.groups()[0])
526 | if cid is not None:
527 | break
528 | time.sleep(self.delay)
529 | return cid
530 |
531 |
532 | def _get_assay_descriptions(aids, output_format='json', batch_size=500,
533 | max_attempts=3):
534 | """
535 | Parallel worker for PubChem.get_assay_descriptions.
536 |
537 | Parameters
538 | ----------
539 | aids : list
540 | List of assay IDs.
541 | output_format : str (default='json')
542 | Output format.
543 | batch_size : int (default 500)
544 | Number of descriptions per request.
545 | max_attempts : int (default 3)
546 | Maximum number of query attempts. The batch_size is halved after each
547 | failure.
548 | """
549 | url = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/' +
550 | '{aids}/description/{format}')
551 | descriptions = []
552 | failures = 0
553 | start = 0
554 | while True:
555 | if start >= len(aids):
556 | break # stop when we are out of AIDs
557 | query_aids = aids[start:start+batch_size]
558 | query = url.format(aids=','.join([str(aid) for aid in query_aids]),
559 | format=output_format)
560 | try:
561 | response = urllib2.urlopen(query)
562 | except urllib2.HTTPError as e:
563 | failures += 1
564 | batch_size /= 2 # halve the batch size and try again
565 | if failures >= max_attempts:
566 | raise e
567 | continue
568 | descriptions.append(response.read())
569 | failures = 0 # reset the failure count
570 | start += batch_size # move the start index
571 | return descriptions
572 |
--------------------------------------------------------------------------------
/pubchem_utils/pug.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for interacting with the PubChem Power User Gateway (PUG).
3 |
4 | The PUG XML schema is located at
5 | https://pubchem.ncbi.nlm.nih.gov/pug/pug.xsd.
6 |
7 | See also https://pubchem.ncbi.nlm.nih.gov/pug/pughelp.html.
8 | """
9 | import gzip
10 | import re
11 | from StringIO import StringIO
12 | import time
13 | import urllib
14 | import urllib2
15 | import warnings
16 |
17 | __author__ = "Steven Kearnes"
18 | __copyright__ = "Copyright 2014, Stanford University"
19 | __license__ = "3-clause BSD"
20 |
21 |
22 | class PugQuery(object):
23 | """
24 | Submit a PUG query and store the download URL when it becomes
25 | available.
26 |
27 | Parameters
28 | ----------
29 | query : str
30 | PUG query XML.
31 | submit : bool, optional (default True)
32 | Whether to automatically submit the query.
33 | delay : int, optional (default 10)
34 | Number of seconds to wait between status checks.
35 | n_attempts : int, optional (default 3)
36 | Number of times to attempt query submission.
37 | verbose : bool, optional (default False)
38 | Whether to be verbose.
39 | """
40 | cancel_template = """
41 |
42 |
43 |
44 |
45 |
46 | %(id)s
47 |
48 |
49 |
50 |
51 |
52 |
53 | """
54 | status_template = """
55 |
56 |
57 |
58 |
59 |
60 | %(id)s
61 |
62 |
63 |
64 |
65 |
66 |
67 | """
68 | url = 'https://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi'
69 |
70 | def __init__(self, query, submit=True, delay=10, n_attempts=3,
71 | verbose=False):
72 | self.query = query
73 | self.delay = delay
74 | self.n_attemps = n_attempts
75 | self.verbose = verbose
76 |
77 | self.id = None
78 | self.download_url = None
79 | self.filename = None
80 | self.data = None
81 | self.alive = False
82 |
83 | if submit:
84 | self.submit()
85 |
86 | def __del__(self):
87 | """
88 | Cancel uncompleted queries.
89 | """
90 | self.cancel()
91 |
92 | def request(self, query):
93 | """
94 | Submit a query to PUG and extract either the query ID or the
95 | download URL.
96 |
97 | Parameters
98 | ----------
99 | query : str
100 | PUG query XML.
101 | """
102 | q = None
103 | for i in xrange(self.n_attemps):
104 | try:
105 | q = urllib2.urlopen(self.url, query)
106 | break
107 | except urllib2.HTTPError as e:
108 | if i + 1 < self.n_attemps:
109 | continue
110 | else:
111 | raise e
112 | response = q.read()
113 |
114 | # check for errors
115 | status_re = re.search('', response)
116 | status = status_re.groups()[0]
117 | if status not in ['success', 'queued', 'running', 'stopped']:
118 | msg = 'Original Query:\n------\n{}\n'.format(
119 | '\n'.join(self.query.splitlines()[:100]))
120 | if query != self.query:
121 | msg += 'Current Query:\n--------------\n{}\n'.format(
122 | '\n'.join(query.splitlines()[:100]))
123 | msg += 'Response:\n---------\n{}'.format(response)
124 | raise PUGError(msg)
125 |
126 | # check for a download URL
127 | download_url_re = re.search(
128 | '\s*(.*?)\s*',
129 | response)
130 | if download_url_re is not None:
131 | self.download_url = download_url_re.groups()[0]
132 |
133 | # otherwise, extract the request ID
134 | elif self.id is None:
135 | reqid_re = re.search(
136 | '\s*(.*?)\s*', response)
137 | self.id = reqid_re.groups()[0]
138 |
139 | def cancel(self):
140 | """
141 | Cancel a pending request.
142 | """
143 | if self.alive:
144 | assert self.id is not None
145 | warnings.warn('Canceling PUG request.')
146 | query = self.cancel_template % {'id': self.id}
147 | self.request(query)
148 | self.alive = False
149 |
150 | def check_status(self):
151 | """
152 | Check the status of the query.
153 | """
154 | assert self.id is not None
155 | query = self.status_template % {'id': self.id}
156 | self.request(query)
157 |
158 | def submit(self):
159 | """
160 | Submit the query and monitor its progress.
161 | """
162 | if self.alive:
163 | warnings.warn('This request is already active.')
164 | return
165 | self.alive = True
166 | self.request(self.query)
167 | if self.verbose:
168 | print self.id,
169 | while self.download_url is None:
170 | time.sleep(self.delay)
171 | self.check_status()
172 | self.alive = False
173 |
def fetch(self, filename=None, compression=None):
    """
    Fetch the result of the query.

    Parameters
    ----------
    filename : str, optional
        Output filename. If not provided, the data is read into memory.
    compression : str, optional
        Compression type used to decode data that is read into memory.
        One of None, 'none', 'gzip' or 'bzip2'. Ignored when `filename`
        is given (the file is saved as downloaded).

    Returns
    -------
    rval : str
        The output filename if `filename` was given, otherwise the
        downloaded (and decompressed) data.

    Raises
    ------
    PUGError
        If no download URL is available.
    NotImplementedError
        If `compression` is not a supported type.
    """
    # Only submit when there is no result yet. A completed query has
    # alive == False, so checking `alive` alone would re-send a query
    # that has already finished.
    if self.download_url is None and not self.alive:
        self.submit()
    if self.download_url is None:
        raise PUGError('No download URL.')

    # save straight to disk
    if filename is not None:
        filename, _ = urllib.urlretrieve(self.download_url, filename)
        self.filename = filename
        return filename

    # read into memory
    response = urllib2.urlopen(self.download_url)
    try:
        data = response.read()
    finally:
        response.close()  # do not leak the HTTP connection

    # 'none' is the spelling PubChem.get_records uses for "uncompressed".
    if compression is not None and compression != 'none':
        if compression == 'gzip':
            with gzip.GzipFile(fileobj=StringIO(data)) as f:
                data = f.read()
        elif compression == 'bzip2':
            import bz2  # local import: only needed for this branch
            data = bz2.decompress(data)
        else:
            raise NotImplementedError(compression)
    self.data = data
    return data
205 |
206 |
207 | class PUGError(Exception):
208 | """
209 | PUG exception class.
210 | """
211 |
--------------------------------------------------------------------------------
/pubchem_utils/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Scripting utilities.
3 | """
4 | import gzip
5 |
6 | __author__ = "Steven Kearnes"
7 | __copyright__ = "Copyright 2014, Stanford University"
8 | __license__ = "3-clause BSD"
9 |
10 |
def read_ids(filename):
    """
    Read record IDs from a file, one ID per line.

    Parameters
    ----------
    filename : str
        Filename containing record IDs. Files ending in '.gz' are opened
        with gzip.

    Returns
    -------
    ids : list
        One entry per line, with surrounding whitespace stripped. Blank
        lines are kept as empty strings.
    """
    opener = gzip.open if filename.endswith('.gz') else open
    # The context manager closes the file even if reading raises.
    with opener(filename) as f:
        return [line.strip() for line in f]
27 |
--------------------------------------------------------------------------------
/pubchem_utils/scripts/download_records.py:
--------------------------------------------------------------------------------
1 | """
2 | Download records from PubChem by ID.
3 | """
4 | import argparse
5 |
6 | from pubchem_utils import PubChem
7 | from pubchem_utils.scripts import read_ids
8 |
9 | __author__ = "Steven Kearnes"
10 | __copyright__ = "Copyright 2014, Stanford University"
11 | __license__ = "3-clause BSD"
12 |
13 |
def parse_args(input_args=None):
    """
    Build the command-line parser for the record download script and
    parse the given arguments.

    Parameters
    ----------
    input_args : list, optional
        Input arguments. If not provided, defaults to sys.argv[1:].

    Returns
    -------
    rval : argparse.Namespace
        Parsed arguments: input, output, sids, download_format,
        compression, use_3d, n_conformers and delay.
    """
    parser = argparse.ArgumentParser()

    # positional arguments
    for name, description in (
            ('input', 'Input filename containing record IDs.'),
            ('output', 'Output filename.')):
        parser.add_argument(name, help=description)

    # optional arguments
    parser.add_argument(
        '--sids', action='store_true',
        help='Whether IDs are substance IDs (if False, IDs '
             'are assumed to be compound IDs).')
    parser.add_argument(
        '-f', '--format', dest='download_format', default='sdf',
        help='Download format.')
    parser.add_argument(
        '-c', '--compression', default='gzip',
        help='Compression type.')
    parser.add_argument(
        '--3d', action='store_true', dest='use_3d',
        help='Whether to download 3D structures.')
    parser.add_argument(
        '-n', '--n-conformers', type=int, default=1,
        help='Number of conformers to download if '
             'retrieving 3D structures.')
    parser.add_argument(
        '-d', '--delay', type=int, default=10,
        help='Number of seconds to wait between status '
             'checks.')
    return parser.parse_args(input_args)
45 |
46 |
def main(ids, filename=None, sids=False, download_format='sdf',
         compression='gzip', use_3d=False, n_conformers=1, delay=10):
    """
    Download records from PubChem by ID.

    Parameters
    ----------
    ids : iterable
        PubChem substance or compound IDs.
    filename : str, optional
        Output filename. If not provided, a temporary file is created.
    sids : bool, optional (default False)
        Whether ids are SIDs. If False, IDs are assumed to be CIDs.
    download_format : str, optional (default 'sdf')
        Download file format.
    compression : str, optional (default 'gzip')
        Compression type for downloaded structures.
    use_3d : bool, optional (default False)
        Whether to query 3D information. If False, 2D information is
        retrieved.
    n_conformers : int, optional (default 1)
        Number of conformers to download if retrieving 3D structures.
    delay : int, optional (default 10)
        Number of seconds to wait between status checks.

    Returns
    -------
    rval
        Whatever PubChem.get_records returns for this request.
    """
    engine = PubChem(delay=delay)
    # Keyword arguments keep this call correct even if the parameter
    # order of get_records changes.
    return engine.get_records(
        ids, filename=filename, sids=sids,
        download_format=download_format, compression=compression,
        use_3d=use_3d, n_conformers=n_conformers)
75 |
76 | if __name__ == '__main__':
77 | args = parse_args()
78 | record_ids = read_ids(args.input)
79 | main(record_ids, args.output, args.sids, args.download_format,
80 | args.compression, args.use_3d, args.n_conformers, args.delay)
81 |
--------------------------------------------------------------------------------
/pubchem_utils/scripts/id_exchange.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Use the PubChem Identifier Exchange service.
4 | """
5 | import argparse
6 | import numpy as np
7 |
8 | from pubchem_utils import PubChem
9 | from pubchem_utils.scripts import read_ids
10 |
11 | __author__ = "Steven Kearnes"
12 | __copyright__ = "Copyright 2014, Stanford University"
13 | __license__ = "3-clause BSD"
14 |
15 |
def parse_args(input_args=None):
    """
    Build the command-line parser for the identifier exchange script and
    parse the given arguments.

    Parameters
    ----------
    input_args : list, optional
        Input arguments. If not provided, defaults to sys.argv[1:].

    Returns
    -------
    rval : argparse.Namespace
        Parsed arguments: input, source, mapping, prefix, sids and delay.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('input',
        help='Input filename containing record IDs.')
    add('-s', '--source',
        help='Source for input IDs. If not provided, the '
             'source will be inferred from the input IDs.')
    add('-m', '--mapping', action='store_true',
        help='Whether to write ID mapping. If false, only '
             'result IDs will be saved.')
    add('-p', '--prefix',
        help='Prefix for output files.')
    add('--sids', action='store_true',
        help='Whether returned IDs are substance IDs '
             '(if False, returned IDs will be compound IDs).')
    add('-d', '--delay', type=int, default=10,
        help='Number of seconds to wait between status '
             'checks.')
    rval = parser.parse_args(input_args)
    return rval
43 |
44 |
def main(ids, source=None, prefix=None, sids=False, mapping=False, delay=10):
    """
    Map identifiers to PubChem IDs with the Identifier Exchange service
    and write the results to text files.

    Writes '<prefix>-mapping.txt' (source ID and result ID per line) when
    `mapping` is True, otherwise '<prefix>-matched.txt' (result IDs
    only). Source IDs without a match are written to
    '<prefix>-unmatched.txt' if there are any.

    Parameters
    ----------
    ids : iterable
        Source IDs.
    source : str, optional
        Input source. If None, it will be inferred from ids (if possible).
    prefix : str, optional
        Prefix for output files.
    sids : bool, optional (default False)
        Whether returned IDs are SIDs. If False, CIDs are returned.
    mapping : bool, optional (default False)
        Whether to write the ID mapping. If False, only result IDs are
        saved.
    delay : int, optional (default 10)
        Number of seconds to wait between status checks.
    """
    engine = PubChem(delay=delay)
    output_type = 'sid' if sids else 'cid'

    # id_exchange returns one dict mapping every source ID to its result
    # ID, with None marking source IDs that had no match.
    id_map = engine.id_exchange(np.unique(ids), source,
                                output_type=output_type)
    matched = {}
    unmatched = []
    for key, value in id_map.items():
        if value is None:
            unmatched.append(key)
        else:
            matched[key] = value

    if mapping:
        with open('{}-mapping.txt'.format(prefix), 'wb') as f:
            for key, value in matched.items():
                f.write('{}\t{}\n'.format(key, value))
    else:
        with open('{}-matched.txt'.format(prefix), 'wb') as f:
            for value in matched.values():
                f.write('{}\n'.format(value))
    if unmatched:
        with open('{}-unmatched.txt'.format(prefix), 'wb') as f:
            for value in unmatched:
                f.write('{}\n'.format(value))
82 |
# Script entry point: read the IDs listed in the input file and run the
# exchange with the options given on the command line.
if __name__ == '__main__':
    cli_args = parse_args()
    main(read_ids(cli_args.input),
         source=cli_args.source,
         prefix=cli_args.prefix,
         sids=cli_args.sids,
         mapping=cli_args.mapping,
         delay=cli_args.delay)
88 |
--------------------------------------------------------------------------------
/pubchem_utils/scripts/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skearnes/pubchem-utils/6dd796d4e0ef65641547c429ed39ce39d0742510/pubchem_utils/scripts/test/__init__.py
--------------------------------------------------------------------------------
/pubchem_utils/scripts/test/test_download_records.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for download_records.py.
3 | """
4 | import numpy as np
5 | import shutil
6 | import tempfile
7 | import unittest
8 |
9 | from .. import read_ids
10 | from ..download_records import main, parse_args
11 |
12 |
class TestDownloadIds(unittest.TestCase):
    """
    Tests for download_records.py.

    Each test runs against a fresh temporary directory containing an empty
    output file, a file of CIDs, and a file of SIDs.
    """
    def setUp(self):
        """
        Set up tests.

        Temporary files are created with NamedTemporaryFile(delete=False)
        inside a ``with`` block so that their handles are closed right away;
        the files themselves are removed with the directory in tearDown.
        """
        self.temp_dir = tempfile.mkdtemp()

        # empty file used as the download output path
        with tempfile.NamedTemporaryFile(dir=self.temp_dir,
                                         delete=False) as f:
            self.filename = f.name

        # write CIDs
        self.cids = np.asarray([2244])
        self.cid_filename = self.write_ids(self.cids)

        # write SIDs
        self.sids = [179038559]
        self.sid_filename = self.write_ids(self.sids)

    def tearDown(self):
        """
        Clean up tests.
        """
        shutil.rmtree(self.temp_dir)

    def write_ids(self, ids):
        """
        Write IDs (one per line) to a new .txt file in the temp directory.

        Parameters
        ----------
        ids : iterable
            Record IDs to write.

        Returns
        -------
        filename : str
            Path of the file that was written.
        """
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                         dir=self.temp_dir,
                                         delete=False) as f:
            for record_id in ids:
                f.write('{}\n'.format(record_id))
        return f.name

    def run_script(self, ids, args):
        """
        Run main loop of script.

        Parameters
        ----------
        ids : iterable
            Record IDs to download.
        args : argparse.Namespace
            Parsed arguments from download_records.parse_args.
        """
        main(ids, args.output, args.sids, args.download_format,
             args.compression, args.use_3d, args.n_conformers, args.delay)

    def test_read_ids(self):
        """
        Test read_ids.
        """
        ids = read_ids(self.cid_filename)
        assert np.array_equal(np.asarray(ids, dtype=self.cids.dtype),
                              self.cids)

    def test_download_cid(self):
        """
        Download a CID.
        """
        ids = read_ids(self.cid_filename)
        args = parse_args([self.cid_filename, self.filename])
        self.run_script(ids, args)
68 |
--------------------------------------------------------------------------------
/pubchem_utils/test/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for PubChem PUG interface.
3 | """
4 | import numpy as np
5 | import os
6 | import unittest
7 | import urllib2
8 | import tempfile
9 |
10 | from .. import PubChem
11 |
12 |
class TestPubChem(unittest.TestCase):
    """
    Tests for PubChem.

    Reference comparisons are made when possible to records retrieved using the
    PubChem PUG REST interface via http://pubchem.ncbi.nlm.nih.gov/rest/pug.
    """
    def setUp(self):
        """
        Set up tests.
        """
        self.engine = PubChem(delay=3)  # shorten delay for tests
        self.rest_url = 'http://pubchem.ncbi.nlm.nih.gov/rest/pug'

    def get_reference(self, path):
        """
        Fetch reference data from the PUG REST interface.

        Parameters
        ----------
        path : str
            Request path relative to the REST base URL.

        Returns
        -------
        The raw response body.
        """
        # URLs always use forward slashes; os.path.join would insert the
        # platform path separator instead.
        url = '/'.join([self.rest_url, path])
        return urllib2.urlopen(url).read()

    def get_reference_ids(self, path):
        """
        Fetch a whitespace-separated ID listing as an integer array.

        Parameters
        ----------
        path : str
            Request path relative to the REST base URL.
        """
        return np.asarray(self.get_reference(path).split(), dtype=int)

    def identical_sdf(self, a, b):
        """
        Compare SDF records.

        SDF records downloaded from PubChem have a timestamp that should not be
        considered in the comparison. The second line of each record is
        therefore only required to start with '-OEChem'; every other line
        must match exactly.

        Parameters
        ----------
        a, b : str
            SDF records to compare.

        Returns
        -------
        bool
            True if the records are identical apart from the second line.
        """
        if a == b:  # sometimes the timestamps match
            return True

        # Explicit comparisons instead of assert statements, which are
        # stripped when Python runs with -O.
        a_lines = a.split('\n')
        b_lines = b.split('\n')
        if len(a_lines) != len(b_lines):
            return False
        for i, (a_line, b_line) in enumerate(zip(a_lines, b_lines)):
            if i == 1:
                if not a_line.strip().startswith('-OEChem'):
                    return False
                if not b_line.strip().startswith('-OEChem'):
                    return False
            elif a_line != b_line:
                return False
        return True

    def test_get_records_cid(self):
        """
        2D CID request with get_records().
        """
        ref = self.get_reference('compound/cid/2244/SDF')
        data = self.engine.get_records([2244])
        assert self.identical_sdf(data, ref)

    def test_get_record_cid(self):
        """
        2D CID request with get_record().
        """
        ref = self.get_reference('compound/cid/2244/SDF')
        data = self.engine.get_record(2244)
        assert self.identical_sdf(data, ref)

    def test_get_records_sid(self):
        """
        SID request with get_records().
        """
        ref = self.get_reference('substance/sid/179038559/SDF')
        data = self.engine.get_records([179038559], sids=True)
        assert self.identical_sdf(data, ref)

    def test_get_record_sid(self):
        """
        SID request with get_record().
        """
        ref = self.get_reference('substance/sid/179038559/SDF')
        data = self.engine.get_record(179038559, sid=True)
        assert self.identical_sdf(data, ref)

    def test_get_records_3d(self):
        """
        3D structure request with get_records().
        """
        ref = self.get_reference('compound/cid/2244/SDF?record_type=3d')
        data = self.engine.get_records([2244], use_3d=True)
        assert self.identical_sdf(data, ref)

    def test_get_record_3d(self):
        """
        3D structure request with get_record().
        """
        ref = self.get_reference('compound/cid/2244/SDF?record_type=3d')
        data = self.engine.get_record(2244, use_3d=True)
        assert self.identical_sdf(data, ref)

    def test_aid_cids(self):
        """
        Fetch CIDs from an AID.
        """
        ref = self.get_reference_ids('assay/aid/466/cids/TXT')
        data = self.engine.get_ids_from_assay(466)
        assert np.array_equal(data, ref)

    def test_aid_sids(self):
        """
        Fetch SIDs from an AID.
        """
        ref = self.get_reference_ids('assay/aid/466/sids/TXT')
        data = self.engine.get_ids_from_assay(466, sids=True)
        assert np.array_equal(data, ref)

    def test_aid_active_cids(self):
        """
        Fetch active CIDs from an AID.
        """
        ref = self.get_reference_ids(
            'assay/aid/466/cids/TXT?cids_type=active')
        data = self.engine.get_ids_from_assay(466, activity_outcome='active')
        assert np.array_equal(data, ref)

    def test_aid_inactive_cids(self):
        """
        Fetch inactive CIDs from an AID.
        """
        ref = self.get_reference_ids(
            'assay/aid/466/cids/TXT?cids_type=inactive')
        data = self.engine.get_ids_from_assay(466, activity_outcome='inactive')
        assert np.array_equal(data, ref)

    def test_aid_active_sids(self):
        """
        Fetch active SIDs from an AID.
        """
        ref = self.get_reference_ids(
            'assay/aid/466/sids/TXT?sids_type=active')
        data = self.engine.get_ids_from_assay(466, sids=True,
                                              activity_outcome='active')
        assert np.array_equal(data, ref)

    def test_aid_inactive_sids(self):
        """
        Fetch inactive SIDs from an AID.
        """
        ref = self.get_reference_ids(
            'assay/aid/466/sids/TXT?sids_type=inactive')
        data = self.engine.get_ids_from_assay(466, sids=True,
                                              activity_outcome='inactive')
        assert np.array_equal(data, ref)

    def test_get_assay_data(self):
        """
        Test PubChem.get_assay_data.
        """
        data = self.engine.get_assay_data(504772)
        assert len(data.splitlines()) == 332  # 331 records plus header

    def test_get_assay_descriptions(self):
        """
        Test PubChem.get_assay_descriptions.
        """
        data = self.engine.get_assay_descriptions([490])
        assert len(data) == 1
        assert data[0]['aid']['id'] == 490  # check AID

    def test_get_assay_descriptions_parallel(self):
        """
        Test PubChem.get_assay_descriptions with n_jobs > 1.
        """
        aids = [490, 466, 9, 548, 851]
        data = self.engine.get_assay_descriptions(aids, n_jobs=2)
        assert len(data) == 5

        # check AIDs are all present (order is not guaranteed)
        desc_aids = []
        for desc in data:
            desc_aids.append(desc['aid']['id'])
        assert np.array_equal(np.sort(aids), np.sort(desc_aids))

    def test_id_exchange(self):
        """
        Test PubChem.id_exchange.
        """
        # NOTE(review): the id_exchange script unpacks a (matched, unmatched)
        # pair from PubChem.id_exchange, while this test indexes the result
        # directly; confirm which return shape is current.
        data = self.engine.id_exchange('CHEMBL25')
        assert data['CHEMBL25'] == 2244

    def test_structure_search_smiles(self):
        """
        Test PubChem.structure_search with SMILES queries.
        """
        smiles = self.engine.get_records([2244], download_format='smiles')
        smiles = smiles.split()[1]
        assert self.engine.structure_search(smiles) == 2244

    def test_structure_search_sdf(self):
        """
        Test PubChem.structure_search with SDF queries.
        """
        sdf = self.engine.get_records([2244])
        assert self.engine.structure_search(
            sdf, structure_format='sdf') == 2244

    def test_get_parent_cids(self):
        """
        Test PubChem.get_parent_cids.
        """
        same = self.engine.get_parent_cids([2244])
        assert same == {2244}, same
        parents = self.engine.get_parent_cids([23666729, 5338317])
        assert parents == {2244, 3672}, parents

    def test_get_record_filename(self):
        """
        Test PubChem.get_record()'s filename kwarg.
        """
        fd, fn = tempfile.mkstemp()
        try:
            ref = self.engine.get_record(2244)
            self.engine.get_record(2244, filename=fn)
            with open(fn) as f:
                data = f.read()
            assert self.identical_sdf(data, ref)
        finally:
            # mkstemp leaves both the descriptor and the file to the caller
            os.close(fd)
            os.unlink(fn)
247 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | numpy
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
def main():
    """
    Run the setuptools configuration for the pubchem_utils package.
    """
    metadata = {
        'name': 'pubchem_utils',
        'version': '0.1',
        'license': '3-clause BSD',
        'url': 'https://github.com/skearnes/pubchem-utils',
        'description': 'Utilities for interacting with PubChem',
        # discover every package under the project root
        'packages': find_packages(),
    }
    setup(**metadata)
13 |
# Invoke setup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
16 |
--------------------------------------------------------------------------------