├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── pubchem_utils
    ├── __init__.py
    ├── pug.py
    ├── scripts
    │   ├── __init__.py
    │   ├── download_records.py
    │   ├── id_exchange.py
    │   └── test
    │   │   ├── __init__.py
    │   │   └── test_download_records.py
    └── test
    │   └── __init__.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | 
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 | 
37 | # Translations
38 | *.mo
39 | 
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | 
45 | # Rope
46 | .ropeproject
47 | 
48 | # Django stuff:
49 | *.log
50 | *.pot
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | 
55 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python: 2.7
 3 | 
 4 | before_install:
 5 | 
 6 |  # install code analysis tools
 7 |  - pip install pep8 pyflakes
 8 | 
 9 |  # install other packages
10 |  - pip install joblib
11 | 
12 | # install the package
13 | install: python setup.py install
14 | 
15 | # run tests
16 | script:
17 |  - nosetests
18 |  - pep8 pubchem_utils
19 |  - pyflakes pubchem_utils
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, Stanford University
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of the copyright holder nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | pubchem-utils
 2 | =============
 3 | 
 4 | Utilities for interacting with [PubChem](https://pubchem.ncbi.nlm.nih.gov)
 5 | 
 6 | __Note:__ sometimes one or more of the tests fail but then pass when re-run. Until I can
 7 | write better tests to capture this behavior, I have taken down the Travis indicator
 8 | so as not to give a false impression. Please double-check your results when using this
 9 | code in case of sporadic failures.
10 | 
11 | Quick Start
12 | -----------
13 | 
14 | ```python
15 | from pubchem_utils import PubChem
16 | pc = PubChem()
17 | ```
18 | 
19 | Download 3D structures for a batch of CIDs:
20 | 
21 | ```python
22 | pc.get_records([2244, 3672], filename='painkillers.sdf.gz', use_3d=True)
23 | ```
24 | 
25 | Retrieve SIDs active in a PubChem BioAssay experiment:
26 | 
27 | ```python
28 | sids = pc.get_ids_from_assay(466, sids=True, activity_outcome='active')
29 | ```
30 | 
31 | Download the data table for a PubChem BioAssay experiment:
32 | 
33 | ```python
34 | pc.get_assay_data(466, filename='AID466.csv.gz')
35 | ```
36 | 
37 | Get the PubChem CID for a compound in [ChEMBL](https://www.ebi.ac.uk/chembl):
38 | 
39 | ```python
40 | id_map = pc.id_exchange('CHEMBL25')  # source is inferred from ID string
41 | ```
42 | 
43 | Search PubChem for the CID matching a SMILES string:
44 | 
45 | ```python
46 | cid = pc.structure_search('CC(=O)OC1=CC=CC=C1C(=O)O')
47 | ```
48 | 


--------------------------------------------------------------------------------
/pubchem_utils/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utilities for interacting with PubChem.
  3 | """
  4 | import json
  5 | import numpy as np
  6 | import shutil
  7 | import re
  8 | import time
  9 | import urllib
 10 | import urllib2
 11 | 
 12 | from joblib import delayed, Parallel
 13 | 
 14 | from .pug import PugQuery
 15 | 
 16 | __author__ = "Steven Kearnes"
 17 | __copyright__ = "Copyright 2014-2015, Stanford University"
 18 | __license__ = "3-clause BSD"
 19 | 
 20 | 
 21 | class PubChem(object):
 22 |     """
 23 |     Submit queries to PUG and return PugQuery objects.
 24 | 
 25 |     Parameters
 26 |     ----------
 27 |     submit : bool, optional (default True)
 28 |         Whether to automatically submit PugQuery queries.
 29 |     delay : int, optional (default 10)
 30 |         Number of seconds for PugQuery objects to wait between status
 31 |         checks.
 32 |     verbose : bool, optional (default False)
 33 |         Whether to create PugQuery objects in verbose mode.
 34 |     """
 35 |     def __init__(self, submit=True, delay=10, verbose=False):
 36 |         self.submit = submit
 37 |         self.delay = delay
 38 |         self.verbose = verbose
 39 | 
 40 |     def get_query(self, query):
 41 |         """
 42 |         Create a PUG request.
 43 | 
 44 |         Parameters
 45 |         ----------
 46 |         query : str
 47 |             PUG query XML.
 48 |         """
 49 |         return PugQuery(query, submit=self.submit, delay=self.delay,
 50 |                         verbose=self.verbose)
 51 | 
 52 |     def get_records(self, ids, filename=None, sids=False,
 53 |                     download_format='sdf', compression='gzip', use_3d=False,
 54 |                     n_conformers=1):
 55 |         """
 56 |         Download records for substances or compounds identified by
 57 |         PubChem substance IDs (SIDs) or compound IDs (CIDs).
 58 | 
 59 |         Parameters
 60 |         ----------
 61 |         ids : iterable
 62 |             PubChem substance or compound IDs.
 63 |         filename : str, optional
 64 |             Output filename. If not provided, a temporary file is created.
 65 |         sids : bool, optional (default False)
 66 |             Whether ids are SIDs. If False, IDs are assumed to be CIDs.
 67 |         download_format : str, optional (default 'sdf')
 68 |             Download file format.
 69 |         compression : str, optional (default 'gzip')
 70 |             Compression type for downloaded structures.
 71 |         use_3d : bool, optional (default False)
 72 |             Whether to query 3D information. If False, 2D information is
 73 |             retrieved.
 74 |         n_conformers : int, optional (default 1)
 75 |             Number of conformers to download if retrieving 3D structures.
 76 |         """
 77 |         query_template = """
 78 |         <PCT-Data>
 79 |          <PCT-Data_input>
 80 |           <PCT-InputData>
 81 |            <PCT-InputData_download>
 82 |             <PCT-Download>
 83 |              <PCT-Download_uids>
 84 |               <PCT-QueryUids>
 85 |                <PCT-QueryUids_ids>
 86 |                 <PCT-ID-List>
 87 |                  <PCT-ID-List_db>%(database)s</PCT-ID-List_db>
 88 |                  <PCT-ID-List_uids>
 89 |                   %(uids)s
 90 |                  </PCT-ID-List_uids>
 91 |                 </PCT-ID-List>
 92 |                </PCT-QueryUids_ids>
 93 |               </PCT-QueryUids>
 94 |              </PCT-Download_uids>
 95 |              <PCT-Download_format value="%(download_format)s"/>
 96 |              <PCT-Download_compression value="%(compression)s"/>
 97 |              <PCT-Download_use-3d value="%(use_3d)s"/>
 98 |              <PCT-Download_n-3d-conformers>
 99 |               %(n_conformers)s
100 |              </PCT-Download_n-3d-conformers>
101 |             </PCT-Download>
102 |            </PCT-InputData_download>
103 |           </PCT-InputData>
104 |          </PCT-Data_input>
105 |         </PCT-Data>
106 |         """
107 |         mapping = {}
108 | 
109 |         # database
110 |         if sids:
111 |             mapping['database'] = 'pcsubstance'
112 |         else:
113 |             mapping['database'] = 'pccompound'
114 | 
115 |         # download format
116 |         download_formats = ['text-asn', 'binary-asn', 'xml', 'sdf', 'image',
117 |                             'image-small', 'smiles', 'inchi']
118 |         assert download_format in download_formats, (
119 |             'download_format must be one of ' + str(download_formats))
120 |         mapping['download_format'] = download_format
121 | 
122 |         # compression
123 |         if compression is None:
124 |             compression = 'none'
125 |         compressions = ['none', 'gzip', 'bzip2']
126 |         assert compression in compressions, (
127 |             'compression must be one of ' + str(compressions))
128 |         mapping['compression'] = compression
129 | 
130 |         # 3D
131 |         if use_3d:
132 |             mapping['use_3d'] = 'true'
133 |         else:
134 |             mapping['use_3d'] = 'false'
135 | 
136 |         # conformers
137 |         mapping['n_conformers'] = n_conformers
138 | 
139 |         # create XML for each ID
140 |         xml_uids = ''
141 |         for uid in ids:
142 |             xml_uids += ('<PCT-ID-List_uids_E>{}'.format(uid) +
143 |                          '</PCT-ID-List_uids_E>\n')
144 |         mapping['uids'] = xml_uids
145 | 
146 |         # construct query
147 |         query = self.get_query(query_template % mapping)
148 |         rval = query.fetch(filename, compression=compression)
149 |         return rval
150 | 
151 |     def get_record(self, id, filename=None, sid=False, use_3d=False):
152 |         """
153 |         Download a single record for a substance or compound identified by
154 |         PubChem substance ID (SID) or compound ID (CID).
155 | 
156 |         Parameters
157 |         ----------
158 |         id : str
159 |             PubChem substance or compound ID.
160 |         filename : str, optional
161 |             Output filename. If not provided, the output is returned as a
162 |             string.
163 |         sid : bool, optional (default False)
164 |             Whether id is a SID. If False, ID is assumed to be a CID.
165 |         use_3d : bool, optional (default False)
166 |             Whether to query 3D information. If False, 2D information is
167 |             retrieved.
168 | 
169 |         Returns
170 |         -------
171 |         val : {str, None}
172 |             The requested substance or compound, in an SDF-format string, or
173 |             None if `filename` output is specified
174 | 
175 |         Notes
176 |         -----
177 |         Requests for multiple substances, compounds or conformers can be
178 |         batched together _much_ more efficiently by using `PubChem.get_records`.
179 | 
180 |         Raises
181 |         ------
182 |         Invalid CID or SID requests will result in a urllib2.HTTPError (400).
183 |         Certain PubChem compounds and substances may not have available 3D
184 |         structures, in which case this method, when called with use_3d=True,
185 |         will throw a urllib2.HTTPError (404).
186 |         """
187 | 
188 |         base = 'http://pubchem.ncbi.nlm.nih.gov/rest/pug/%s?%s'
189 |         if sid:
190 |             specialization = 'substance/sid/%s/SDF' % id
191 |         else:
192 |             specialization = 'compound/cid/%s/SDF' % id
193 | 
194 |         if use_3d:
195 |             params = {'record_type': '3d'}
196 |         else:
197 |             params = {}
198 | 
199 |         url = base % (specialization, urllib.urlencode(params))
200 |         comm = urllib2.urlopen(url)
201 | 
202 |         if filename is None:
203 |             return comm.read()
204 |         else:
205 |             with open(filename, 'wb') as f:
206 |                 shutil.copyfileobj(comm, f)
207 | 
208 |     def get_parent_cids(self, cids):
209 |         """
210 |         Get IDs of parent compounds. Note that the parent IDs are not
211 |         guaranteed to be returned in the same order as the child IDs, so we
212 |         return a set.
213 | 
214 |         Parameters
215 |         ----------
216 |         cids : iterable
217 |             PubChem compound IDs (CIDs) whose parent compounds are
218 |             requested. They are joined with commas into a single PUG REST
219 |             request URL.
220 |         """
221 |         url_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' +
222 |                         '/cid/%(cids)s/cids/TXT?cids_type=parent')
223 |         mapping = {'cids': ','.join([str(cid) for cid in cids])}
224 |         response = urllib2.urlopen(url_template % mapping)
225 |         parents = set()
226 |         for line in response.readlines():
227 |             cid = int(line)
228 |             if cid:  # 0 is not a valid ID
229 |                 parents.add(cid)
230 |         return parents
231 | 
232 |     def get_ids_from_assay(self, aid, sids=False, activity_outcome=None):
233 |         """
234 |         Retrieve substance or compound IDs tested in a PubChem BioAssay
235 |         assay.
236 | 
237 |         Parameters
238 |         ----------
239 |         aid : int
240 |             PubChem BioAssay assay ID (AID).
241 |         sids : bool, optional (default False)
242 |             Whether ids are SIDs. If False, IDs are assumed to be CIDs.
243 |         activity_outcome : str, optional
244 |             If provided, only retrieve records with this activity outcome,
245 |             such as 'active'.
246 |         """
247 |         url_template = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid' +
248 |                         '/%(aid)s/%(database)s/txt')
249 |         mapping = {'aid': aid}
250 |         if sids:
251 |             mapping['database'] = 'sids'
252 |         else:
253 |             mapping['database'] = 'cids'
254 |         if activity_outcome is not None:
255 |             url_template += '?{}_type={}'.format(mapping['database'],
256 |                                                  activity_outcome.lower())
257 |         url = url_template % mapping
258 |         response = urllib2.urlopen(url)
259 |         ids = []
260 |         for this in response.readlines():
261 |             this = this.strip()
262 |             if int(this):  # 0 is not a valid ID
263 |                 ids.append(this)
264 |         ids = np.asarray(ids, dtype=int)
265 |         return ids
266 | 
267 |     def get_assay_data(self, aids, filename=None, substance_view=True,
268 |                        concise=False, compression='gzip'):
269 |         """
270 |         Download PubChem BioAssay data table.
271 | 
272 |         Parameters
273 |         ----------
274 |         aids : array_like
275 |             PubChem BioAssay IDs (AIDs).
276 |         filename : str, optional
277 |             Output filename. If not provided, a temporary file is created.
278 |         substance_view : bool, optional (default True)
279 |             Whether to group results by substance. If False, results will be
280 |             grouped by compound. The default (True) is recommended when
281 |             retrieving data from a single assay.
282 |         compression : str, optional (default 'gzip')
283 |             Compression type for assay data.
284 |         concise : bool, optional (default False)
285 |             Whether to return the concise data table. If False, the complete
286 |             data table is retrieved.
287 |         """
288 |         query_template = """
289 | <PCT-Data>
290 |   <PCT-Data_input>
291 |     <PCT-InputData>
292 |       <PCT-InputData_query>
293 |         <PCT-Query>
294 |           <PCT-Query_type>
295 |             <PCT-QueryType>
296 |               <PCT-QueryType_bas>
297 |                 <PCT-QueryAssayData>
298 |     <PCT-QueryAssayData_output value="csv">4</PCT-QueryAssayData_output>
299 |                   <PCT-QueryAssayData_aids>
300 |                     <PCT-QueryUids>
301 |                       <PCT-QueryUids_ids>
302 |                         <PCT-ID-List>
303 |                           <PCT-ID-List_db>pcassay</PCT-ID-List_db>
304 |                           <PCT-ID-List_uids>
305 |                             %(aids)s
306 |                           </PCT-ID-List_uids>
307 |                         </PCT-ID-List>
308 |                       </PCT-QueryUids_ids>
309 |                     </PCT-QueryUids>
310 |                   </PCT-QueryAssayData_aids>
311 |                     %(dataset)s
312 |                   <PCT-QueryAssayData_focus>
313 |                     <PCT-Assay-FocusOption>
314 |                     %(group_by)s
315 |                     </PCT-Assay-FocusOption>
316 |                   </PCT-QueryAssayData_focus>
317 |                   <PCT-QueryAssayData_compression value="%(compression)s"/>
318 |                 </PCT-QueryAssayData>
319 |               </PCT-QueryType_bas>
320 |             </PCT-QueryType>
321 |           </PCT-Query_type>
322 |         </PCT-Query>
323 |       </PCT-InputData_query>
324 |     </PCT-InputData>
325 |   </PCT-Data_input>
326 | </PCT-Data>
327 | """
328 |         group_by = ('<PCT-Assay-FocusOption_group-results-by value="{}">{}' +
329 |                     '</PCT-Assay-FocusOption_group-results-by>')
330 |         if substance_view:
331 |             group_by = group_by.format('substance', 4)
332 |         else:
333 |             group_by = group_by.format('compound', 0)
334 | 
335 |         dataset = ('<PCT-QueryAssayData_dataset value="{}">{}' +
336 |                    '</PCT-QueryAssayData_dataset>')
337 |         if concise:
338 |             dataset = dataset.format('concise', 1)
339 |         else:
340 |             dataset = dataset.format('complete', 0)
341 |         aid_xml = ''
342 |         for aid in np.atleast_1d(aids):
343 |             aid_xml += ('<PCT-ID-List_uids_E>{}'.format(aid) +
344 |                         '</PCT-ID-List_uids_E>')
345 |         mapping = {'group_by': group_by, 'dataset': dataset, 'aids': aid_xml,
346 |                    'compression': compression}
347 |         query = self.get_query(query_template % mapping)
348 |         rval = query.fetch(filename, compression=compression)
349 |         return rval
350 | 
351 |     def get_assay_descriptions(self, aids, output_format='json',
352 |                                batch_size=500, n_jobs=1, max_attempts=3):
353 |         """
354 |         Get assay descriptions.
355 | 
356 |         Parameters
357 |         ----------
358 |         aids : list
359 |             List of assay IDs.
360 |         output_format : str (default='json')
361 |             Output format.
362 |         """
363 |         results = Parallel(n_jobs=n_jobs, verbose=5)(
364 |             delayed(_get_assay_descriptions)
365 |             (this_aids, output_format, batch_size, max_attempts)
366 |             for this_aids in np.array_split(aids, n_jobs))
367 |         descriptions = []
368 |         if output_format == 'json':
369 |             for result in results:
370 |                 for this in result:
371 |                     data = json.loads(this)
372 |                     assert len(data) == 1
373 |                     assert data.keys()[0] == 'PC_AssayContainer'
374 |                     for description in data['PC_AssayContainer']:
375 |                         descriptions.append(description['assay']['descr'])
376 |         else:
377 |             raise NotImplementedError(output_format)
378 |         return descriptions
379 | 
380 |     def id_exchange(self, ids, source=None, operation_type='same',
381 |                     output_type='cid'):
382 |         """
383 |         Use the PubChem Identifier exchange service.
384 | 
385 |         Currently only supports mapping from Registry IDs (e.g. ChEMBL IDs) to
386 |         PubChem IDs.
387 | 
388 |         Parameters
389 |         ----------
390 |         ids : iterable
391 |             Input identifiers.
392 |         source : str, optional
393 |             Input source. If None, it will be inferred from ids (if possible).
394 |         operation_type : str, optional (default 'same')
395 |             Operation type. Defaults to exact matches.
396 |         output_type : str, optional (default 'cid')
397 |             Output type. Defaults to PubChem CIDs.
398 |         """
399 |         query_template = """
400 | <PCT-Data>
401 |   <PCT-Data_input>
402 |     <PCT-InputData>
403 |       <PCT-InputData_query>
404 |         <PCT-Query>
405 |           <PCT-Query_type>
406 |             <PCT-QueryType>
407 |               <PCT-QueryType_id-exchange>
408 |                 <PCT-QueryIDExchange>
409 |                   <PCT-QueryIDExchange_input>
410 |                     <PCT-QueryUids>
411 |                       <PCT-QueryUids_source-ids>
412 |                         <PCT-RegistryIDs>
413 |         <PCT-RegistryIDs_source-name>%(source)s</PCT-RegistryIDs_source-name>
414 |                           <PCT-RegistryIDs_source-ids>
415 |                             %(source_ids)s
416 |                           </PCT-RegistryIDs_source-ids>
417 |                         </PCT-RegistryIDs>
418 |                       </PCT-QueryUids_source-ids>
419 |                     </PCT-QueryUids>
420 |                   </PCT-QueryIDExchange_input>
421 |                   <PCT-QueryIDExchange_operation-type
422 |                     value="%(operation_type)s"/>
423 |                   <PCT-QueryIDExchange_output-type value="%(output_type)s"/>
424 |                   <PCT-QueryIDExchange_output-method value="file-pair"/>
425 |                   <PCT-QueryIDExchange_compression value="gzip"/>
426 |                 </PCT-QueryIDExchange>
427 |               </PCT-QueryType_id-exchange>
428 |             </PCT-QueryType>
429 |           </PCT-Query_type>
430 |         </PCT-Query>
431 |       </PCT-InputData_query>
432 |     </PCT-InputData>
433 |   </PCT-Data_input>
434 | </PCT-Data>
435 | """
436 |         ids = np.atleast_1d(ids)
437 |         if np.unique(ids).size != len(ids):
438 |             raise ValueError('Source IDs must be unique.')
439 |         if source is None:
440 |             source = self.guess_source(ids[0])
441 |             if source is None:
442 |                 raise ValueError('Cannot guess identifier source.')
443 |         mapping = {'source': source, 'operation_type': operation_type,
444 |                    'output_type': output_type}
445 |         source_ids = []
446 |         for source_id in ids:
447 |             id_xml = ('<PCT-RegistryIDs_source-ids_E>{}'.format(source_id) +
448 |                       '</PCT-RegistryIDs_source-ids_E>\n')
449 |             source_ids.append(id_xml)
450 |         mapping['source_ids'] = ''.join(source_ids)
451 | 
452 |         # construct query
453 |         query = self.get_query(query_template % mapping)
454 |         rval = query.fetch(compression='gzip')
455 | 
456 |         # identify matched and unmatched IDs
457 |         id_map = {}
458 |         for line in rval.splitlines():
459 |             source, dest = line.split()
460 |             try:
461 |                 dest = int(dest)  # try to convert to an int
462 |             except ValueError:
463 |                 pass
464 |             if source in id_map and id_map[source] != dest:
465 |                 raise ValueError('Nonidentical duplicate mapping.')
466 |             id_map[source] = dest
467 |         for source_id in ids:
468 |             if source_id not in id_map:
469 |                 id_map[source_id] = None
470 |         return id_map
471 | 
472 |     @staticmethod
473 |     def guess_source(identifier):
474 |         """
475 |         Guess the source for an identifier.
476 | 
477 |         Parameters
478 |         ----------
479 |         identifier : str
480 |             Identifier.
481 |         """
482 |         source = None
483 |         if str(identifier).startswith('CHEMBL'):
484 |             source = 'ChEMBL'
485 |         elif str(identifier).startswith('ZINC'):
486 |             source = 'ZINC'
487 |         return source
488 | 
489 |     def structure_search(self, structure, structure_format='smiles'):
490 |         """
491 |         Search PubChem for identical structure and return matching CID.
492 | 
493 |         Parameters
494 |         ----------
495 |         structure : str
496 |             SMILES or SDF query.
497 |         structure_format : str, optional (default 'smiles')
498 |             Structure format. Can be either 'smiles' or 'sdf'.
499 |         """
500 |         query_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' +
501 |                           '/identity/{}/XML')
502 |         status_template = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug' +
503 |                            '/compound/listkey/{}/cids/XML')
504 |         request_id = None
505 |         post_data = urllib.urlencode({structure_format: structure})
506 |         req = urllib2.Request(query_template.format(structure_format))
507 |         req.add_header('Content-Type', 'application/x-www-form-urlencoded')
508 |         response = urllib2.urlopen(req, data=post_data)
509 |         for line in response.readlines():
510 |             search = re.search('<ListKey>(\d+)</ListKey>', line)
511 |             if search is not None:
512 |                 request_id = search.groups()[0]
513 |         if request_id is None:
514 |             return None
515 |         cid = None
516 |         while True:
517 |             try:
518 |                 response = urllib2.urlopen(
519 |                     status_template.format(request_id))
520 |             except urllib2.HTTPError:
521 |                 break
522 |             for line in response.readlines():
523 |                 search = re.search('<CID>(\d+)</CID>', line)
524 |                 if search is not None:
525 |                     cid = int(search.groups()[0])
526 |             if cid is not None:
527 |                 break
528 |             time.sleep(self.delay)
529 |         return cid
530 | 
531 | 
532 | def _get_assay_descriptions(aids, output_format='json', batch_size=500,
533 |                             max_attempts=3):
534 |     """
535 |     Parallel worker for PubChem.get_assay_descriptions.
536 | 
537 |     Parameters
538 |     ----------
539 |     aids : list
540 |         List of assay IDs.
541 |     output_format : str (default='json')
542 |         Output format.
543 |     batch_size : int (default 500)
544 |         Number of descriptions per request.
545 |     max_attempts : int (default 3)
546 |         Maximum number of query attempts. The batch_size is halved after each
547 |         failure.
548 |     """
549 |     url = ('http://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/' +
550 |            '{aids}/description/{format}')
551 |     descriptions = []
552 |     failures = 0
553 |     start = 0
554 |     while True:
555 |         if start >= len(aids):
556 |             break  # stop when we are out of AIDs
557 |         query_aids = aids[start:start+batch_size]
558 |         query = url.format(aids=','.join([str(aid) for aid in query_aids]),
559 |                            format=output_format)
560 |         try:
561 |             response = urllib2.urlopen(query)
562 |         except urllib2.HTTPError as e:
563 |             failures += 1
564 |             batch_size /= 2  # halve the batch size and try again
565 |             if failures >= max_attempts:
566 |                 raise e
567 |             continue
568 |         descriptions.append(response.read())
569 |         failures = 0  # reset the failure count
570 |         start += batch_size  # move the start index
571 |     return descriptions
572 | 


--------------------------------------------------------------------------------
/pubchem_utils/pug.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utilities for interacting with the PubChem Power User Gateway (PUG).
  3 | 
  4 | The PUG XML schema is located at
  5 | https://pubchem.ncbi.nlm.nih.gov/pug/pug.xsd.
  6 | 
  7 | See also https://pubchem.ncbi.nlm.nih.gov/pug/pughelp.html.
  8 | """
  9 | import gzip
 10 | import re
 11 | from StringIO import StringIO
 12 | import time
 13 | import urllib
 14 | import urllib2
 15 | import warnings
 16 | 
 17 | __author__ = "Steven Kearnes"
 18 | __copyright__ = "Copyright 2014, Stanford University"
 19 | __license__ = "3-clause BSD"
 20 | 
 21 | 
 22 | class PugQuery(object):
 23 |     """
 24 |     Submit a PUG query and store the download URL when it becomes
 25 |     available.
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     query : str
 30 |         PUG query XML.
 31 |     submit : bool, optional (default True)
 32 |         Whether to automatically submit the query.
 33 |     delay : int, optional (default 10)
 34 |         Number of seconds to wait between status checks.
 35 |     n_attempts : int, optional (default 3)
 36 |         Number of times to attempt query submission.
 37 |     verbose : bool, optional (default False)
 38 |         Whether to be verbose.
 39 |     """
 40 |     cancel_template = """
 41 |     <PCT-Data>
 42 |       <PCT-Data_input>
 43 |         <PCT-InputData>
 44 |           <PCT-InputData_request>
 45 |             <PCT-Request>
 46 |               <PCT-Request_reqid>%(id)s</PCT-Request_reqid>
 47 |               <PCT-Request_type value="cancel"/>
 48 |             </PCT-Request>
 49 |           </PCT-InputData_request>
 50 |         </PCT-InputData>
 51 |       </PCT-Data_input>
 52 |     </PCT-Data>
 53 |     """
 54 |     status_template = """
 55 |     <PCT-Data>
 56 |      <PCT-Data_input>
 57 |       <PCT-InputData>
 58 |        <PCT-InputData_request>
 59 |         <PCT-Request>
 60 |          <PCT-Request_reqid>%(id)s</PCT-Request_reqid>
 61 |          <PCT-Request_type value="status"/>
 62 |         </PCT-Request>
 63 |        </PCT-InputData_request>
 64 |       </PCT-InputData>
 65 |      </PCT-Data_input>
 66 |     </PCT-Data>
 67 |     """
 68 |     url = 'https://pubchem.ncbi.nlm.nih.gov/pug/pug.cgi'
 69 | 
 70 |     def __init__(self, query, submit=True, delay=10, n_attempts=3,
 71 |                  verbose=False):
 72 |         self.query = query
 73 |         self.delay = delay
 74 |         self.n_attemps = n_attempts
 75 |         self.verbose = verbose
 76 | 
 77 |         self.id = None
 78 |         self.download_url = None
 79 |         self.filename = None
 80 |         self.data = None
 81 |         self.alive = False
 82 | 
 83 |         if submit:
 84 |             self.submit()
 85 | 
 86 |     def __del__(self):
 87 |         """
 88 |         Cancel uncompleted queries.
 89 |         """
 90 |         self.cancel()
 91 | 
 92 |     def request(self, query):
 93 |         """
 94 |         Submit a query to PUG and extract either the query ID or the
 95 |         download URL.
 96 | 
 97 |         Parameters
 98 |         ----------
 99 |         query : str
100 |             PUG query XML.
101 |         """
102 |         q = None
103 |         for i in xrange(self.n_attemps):
104 |             try:
105 |                 q = urllib2.urlopen(self.url, query)
106 |                 break
107 |             except urllib2.HTTPError as e:
108 |                 if i + 1 < self.n_attemps:
109 |                     continue
110 |                 else:
111 |                     raise e
112 |         response = q.read()
113 | 
114 |         # check for errors
115 |         status_re = re.search('<PCT-Status value="(.*?)"/>', response)
116 |         status = status_re.groups()[0]
117 |         if status not in ['success', 'queued', 'running', 'stopped']:
118 |             msg = 'Original Query:\n------\n{}\n'.format(
119 |                 '\n'.join(self.query.splitlines()[:100]))
120 |             if query != self.query:
121 |                 msg += 'Current Query:\n--------------\n{}\n'.format(
122 |                     '\n'.join(query.splitlines()[:100]))
123 |             msg += 'Response:\n---------\n{}'.format(response)
124 |             raise PUGError(msg)
125 | 
126 |         # check for a download URL
127 |         download_url_re = re.search(
128 |             '<PCT-Download-URL_url>\s*(.*?)\s*</PCT-Download-URL_url>',
129 |             response)
130 |         if download_url_re is not None:
131 |             self.download_url = download_url_re.groups()[0]
132 | 
133 |         # otherwise, extract the request ID
134 |         elif self.id is None:
135 |             reqid_re = re.search(
136 |                 '<PCT-Waiting_reqid>\s*(.*?)\s*</PCT-Waiting_reqid>', response)
137 |             self.id = reqid_re.groups()[0]
138 | 
139 |     def cancel(self):
140 |         """
141 |         Cancel a pending request.
142 |         """
143 |         if self.alive:
144 |             assert self.id is not None
145 |             warnings.warn('Canceling PUG request.')
146 |             query = self.cancel_template % {'id': self.id}
147 |             self.request(query)
148 |             self.alive = False
149 | 
150 |     def check_status(self):
151 |         """
152 |         Check the status of the query.
153 |         """
154 |         assert self.id is not None
155 |         query = self.status_template % {'id': self.id}
156 |         self.request(query)
157 | 
158 |     def submit(self):
159 |         """
160 |         Submit the query and monitor its progress.
161 |         """
162 |         if self.alive:
163 |             warnings.warn('This request is already active.')
164 |             return
165 |         self.alive = True
166 |         self.request(self.query)
167 |         if self.verbose:
168 |             print self.id,
169 |         while self.download_url is None:
170 |             time.sleep(self.delay)
171 |             self.check_status()
172 |         self.alive = False
173 | 
174 |     def fetch(self, filename=None, compression=None):
175 |         """
176 |         Fetch the result of the query.
177 | 
178 |         Parameters
179 |         ----------
180 |         filename : str, optional
181 |             Output filename. If not provided, the data is read into memory.
182 |         compression : str, optional
183 |             Compression type used to decode data.
184 |         """
185 |         if not self.alive:
186 |             self.submit()
187 |         if self.download_url is None:
188 |             raise PUGError('No download URL.')
189 | 
190 |         # fetch
191 |         if filename is not None:
192 |             filename, _ = urllib.urlretrieve(self.download_url, filename)
193 |             self.filename = filename
194 |             return filename
195 |         else:
196 |             data = urllib2.urlopen(self.download_url).read()
197 |             if compression is not None:
198 |                 if compression == 'gzip':
199 |                     with gzip.GzipFile(fileobj=StringIO(data)) as f:
200 |                         data = f.read()
201 |                 else:
202 |                     raise NotImplementedError(compression)
203 |             self.data = data
204 |             return data
205 | 
206 | 
class PUGError(Exception):
    """
    Exception type raised for PUG-related failures.
    """
    pass
211 | 


--------------------------------------------------------------------------------
/pubchem_utils/scripts/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Scripting utilities.
 3 | """
 4 | import gzip
 5 | 
 6 | __author__ = "Steven Kearnes"
 7 | __copyright__ = "Copyright 2014, Stanford University"
 8 | __license__ = "3-clause BSD"
 9 | 
10 | 
def read_ids(filename):
    """
    Read record IDs from a file.

    The file is expected to contain one ID per line. Surrounding
    whitespace is stripped and blank lines are skipped, so a trailing
    empty line does not produce an empty ID.

    Parameters
    ----------
    filename : str
        Filename containing record IDs. Files ending in '.gz' are read
        with gzip.

    Returns
    -------
    ids : list
        Record IDs, in file order.
    """
    opener = gzip.open if filename.endswith('.gz') else open
    ids = []
    # the context manager closes the file even if reading fails
    with opener(filename) as f:
        for line in f:
            record_id = line.strip()
            if record_id:
                ids.append(record_id)
    return ids
27 | 


--------------------------------------------------------------------------------
/pubchem_utils/scripts/download_records.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Download records from PubChem by ID.
 3 | """
 4 | import argparse
 5 | 
 6 | from pubchem_utils import PubChem
 7 | from pubchem_utils.scripts import read_ids
 8 | 
 9 | __author__ = "Steven Kearnes"
10 | __copyright__ = "Copyright 2014, Stanford University"
11 | __license__ = "3-clause BSD"
12 | 
13 | 
def parse_args(input_args=None):
    """
    Build the command-line parser for this script and parse arguments.

    Parameters
    ----------
    input_args : list, optional
        Input arguments. If not provided, defaults to sys.argv[1:].

    Returns
    -------
    The argparse namespace holding the parsed options.
    """
    cli = argparse.ArgumentParser()

    # positional arguments
    cli.add_argument('input', help='Input filename containing record IDs.')
    cli.add_argument('output', help='Output filename.')

    # flags
    cli.add_argument(
        '--sids', action='store_true',
        help='Whether IDs are substance IDs (if False, IDs '
             'are assumed to be compound IDs).')
    cli.add_argument(
        '--3d', action='store_true', dest='use_3d',
        help='Whether to download 3D structures.')

    # options with values
    cli.add_argument(
        '-f', '--format', dest='download_format', default='sdf',
        help='Download format.')
    cli.add_argument(
        '-c', '--compression', default='gzip', help='Compression type.')
    cli.add_argument(
        '-n', '--n-conformers', type=int, default=1,
        help='Number of conformers to download if '
             'retrieving 3D structures.')
    cli.add_argument(
        '-d', '--delay', type=int, default=10,
        help='Number of seconds to wait between status '
             'checks.')

    return cli.parse_args(input_args)
45 | 
46 | 
def main(ids, filename=None, sids=False, download_format='sdf',
         compression='gzip', use_3d=False, n_conformers=1, delay=10):
    """
    Fetch PubChem records for a collection of IDs.

    All arguments except `delay` are forwarded positionally to
    PubChem.get_records; `delay` configures the PubChem engine.

    Parameters
    ----------
    ids : iterable
        PubChem substance or compound IDs.
    filename : str, optional
        Output filename, passed through to PubChem.get_records.
    sids : bool, optional (default False)
        Whether ids are SIDs. If False, IDs are assumed to be CIDs.
    download_format : str, optional (default 'sdf')
        Download file format.
    compression : str, optional (default 'gzip')
        Compression type for downloaded structures.
    use_3d : bool, optional (default False)
        Whether to query 3D information. If False, 2D information is
        retrieved.
    n_conformers : int, optional (default 1)
        Number of conformers to download if retrieving 3D structures.
    delay : int, optional (default 10)
        Number of seconds to wait between status checks.
    """
    PubChem(delay=delay).get_records(
        ids, filename, sids, download_format, compression, use_3d,
        n_conformers)
75 | 
if __name__ == '__main__':
    # script entry point: parse options, load the IDs, run the download
    options = parse_args()
    main(read_ids(options.input), options.output, options.sids,
         options.download_format, options.compression, options.use_3d,
         options.n_conformers, options.delay)
81 | 


--------------------------------------------------------------------------------
/pubchem_utils/scripts/id_exchange.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Use the PubChem Identifier Exchange service.
 4 | """
 5 | import argparse
 6 | import numpy as np
 7 | 
 8 | from pubchem_utils import PubChem
 9 | from pubchem_utils.scripts import read_ids
10 | 
11 | __author__ = "Steven Kearnes"
12 | __copyright__ = "Copyright 2014, Stanford University"
13 | __license__ = "3-clause BSD"
14 | 
15 | 
def parse_args(input_args=None):
    """
    Build the command-line parser for this script and parse arguments.

    Parameters
    ----------
    input_args : list, optional
        Input arguments. If not provided, defaults to sys.argv[1:].

    Returns
    -------
    The argparse namespace holding the parsed options.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('input', help='Input filename containing record IDs.')
    cli.add_argument(
        '-s', '--source',
        help='Source for input IDs. If not provided, the '
             'source will be inferred from the input IDs.')
    cli.add_argument(
        '-m', '--mapping', action='store_true',
        help='Whether to write ID mapping. If false, only '
             'result IDs will be saved.')
    cli.add_argument('-p', '--prefix', help='Prefix for output files.')
    cli.add_argument(
        '--sids', action='store_true',
        help='Whether returned IDs are substance IDs '
             '(if False, returned IDs will be compound IDs).')
    cli.add_argument(
        '-d', '--delay', type=int, default=10,
        help='Number of seconds to wait between status '
             'checks.')
    parsed = cli.parse_args(input_args)
    return parsed
43 | 
44 | 
def main(ids, source=None, prefix=None, sids=False, mapping=False, delay=10):
    """
    Map IDs with the PubChem Identifier Exchange and write the results.

    The unique input IDs are passed to PubChem.id_exchange, and the
    results are written to text files in the current directory:

    * '<prefix>-mapping.txt' (if `mapping`): one tab-separated
      key/value pair of the matched results per line.
    * '<prefix>-matched.txt' (otherwise): one matched value per line.
    * '<prefix>-unmatched.txt': one unmatched ID per line; only written
      when there are unmatched IDs.

    When no prefix is given the files are named 'mapping.txt',
    'matched.txt' and 'unmatched.txt'.

    Parameters
    ----------
    ids : iterable
        Source IDs.
    source : str, optional
        Input source. If None, it will be inferred from ids (if possible).
    prefix : str, optional
        Prefix for output files.
    sids : bool, optional (default False)
        Whether returned IDs are SIDs. If False, CIDs are requested.
    mapping : bool, optional (default False)
        Whether to write the ID mapping instead of only the result IDs.
    delay : int, optional (default 10)
        Number of seconds to wait between status checks.
    """
    def output_name(kind):
        # avoid literal 'None-...' filenames when no prefix was requested
        if prefix is None:
            return '{}.txt'.format(kind)
        return '{}-{}.txt'.format(prefix, kind)

    engine = PubChem(delay=delay)
    output_type = 'sid' if sids else 'cid'
    matched, unmatched = engine.id_exchange(np.unique(ids), source,
                                            output_type=output_type)
    if mapping:
        with open(output_name('mapping'), 'wb') as f:
            for key, value in matched.items():
                f.write('{}\t{}\n'.format(key, value))
    else:
        with open(output_name('matched'), 'wb') as f:
            for value in matched.values():
                f.write('{}\n'.format(value))
    # len() rather than truthiness: unmatched may be an array-like
    if len(unmatched):
        with open(output_name('unmatched'), 'wb') as f:
            for value in unmatched:
                f.write('{}\n'.format(value))
82 | 
if __name__ == '__main__':
    # script entry point: parse options, load the IDs, run the exchange
    options = parse_args()
    main(read_ids(options.input), options.source, options.prefix,
         options.sids, options.mapping, options.delay)
88 | 


--------------------------------------------------------------------------------
/pubchem_utils/scripts/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skearnes/pubchem-utils/6dd796d4e0ef65641547c429ed39ce39d0742510/pubchem_utils/scripts/test/__init__.py


--------------------------------------------------------------------------------
/pubchem_utils/scripts/test/test_download_records.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tests for download_records.py.
 3 | """
 4 | import numpy as np
 5 | import shutil
 6 | import tempfile
 7 | import unittest
 8 | 
 9 | from .. import read_ids
10 | from ..download_records import main, parse_args
11 | 
12 | 
class TestDownloadIds(unittest.TestCase):
    """
    Tests for download_records.py.
    """
    def setUp(self):
        """
        Set up tests.

        Creates a temporary directory holding an (empty) output file, a
        file listing CIDs and a file listing SIDs, one ID per line.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.filename = self.make_temp_file()

        # write CIDs
        self.cids = np.asarray([2244])
        self.cid_filename = self.make_temp_file(suffix='.txt')
        with open(self.cid_filename, 'wb') as f:
            for cid in self.cids:
                f.write('{}\n'.format(cid))

        # write SIDs
        self.sids = [179038559]
        self.sid_filename = self.make_temp_file(suffix='.txt')
        with open(self.sid_filename, 'wb') as f:
            for sid in self.sids:
                f.write('{}\n'.format(sid))

    def tearDown(self):
        """
        Clean up tests.
        """
        shutil.rmtree(self.temp_dir)

    def make_temp_file(self, suffix=''):
        """
        Create an empty file in the temporary directory.

        mkstemp returns an open file descriptor along with the filename;
        the descriptor is closed here so that it is not leaked (an open
        handle can also prevent the directory from being removed).

        Parameters
        ----------
        suffix : str, optional
            Filename suffix.
        """
        import os  # local import: os is not imported at module level
        fd, filename = tempfile.mkstemp(suffix=suffix, dir=self.temp_dir)
        os.close(fd)
        return filename

    def run_script(self, ids, args):
        """
        Run main loop of script.

        Parameters
        ----------
        ids : iterable
            Record IDs.
        args : argparse.Namespace
            Parsed command-line arguments, as returned by parse_args.
        """
        main(ids, args.output, args.sids, args.download_format,
             args.compression, args.use_3d, args.n_conformers, args.delay)

    def test_read_ids(self):
        """
        Test read_ids.
        """
        ids = read_ids(self.cid_filename)
        assert np.array_equal(np.asarray(ids, dtype=self.cids.dtype),
                              self.cids)

    def test_download_cid(self):
        """
        Download a CID.
        """
        ids = read_ids(self.cid_filename)
        args = parse_args([self.cid_filename, self.filename])
        self.run_script(ids, args)
68 | 


--------------------------------------------------------------------------------
/pubchem_utils/test/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tests for PubChem PUG interface.
  3 | """
  4 | import numpy as np
  5 | import os
  6 | import unittest
  7 | import urllib2
  8 | import tempfile
  9 | 
 10 | from .. import PubChem
 11 | 
 12 | 
 13 | class TestPubChem(unittest.TestCase):
 14 |     """
 15 |     Tests for PubChem.
 16 | 
 17 |     Reference comparisons are made when possible to records retrieved using the
 18 |     PubChem PUG REST interface via http://pubchem.ncbi.nlm.nih.gov/rest/pug.
 19 |     """
 20 |     def setUp(self):
 21 |         """
 22 |         Set up tests.
 23 |         """
 24 |         self.engine = PubChem(delay=3)  # shorten delay for tests
 25 |         self.rest_url = 'http://pubchem.ncbi.nlm.nih.gov/rest/pug'
 26 | 
 27 |     def identical_sdf(self, a, b):
 28 |         """
 29 |         Compare SDF records.
 30 | 
 31 |         SDF records downloaded from PubChem have a timestamp that should not be
 32 |         considered in the comparison.
 33 | 
 34 |         Parameters
 35 |         ----------
 36 |         a, b : str
 37 |             SDF records to compare.
 38 |         """
 39 |         if a == b:  # sometimes the timestamps match
 40 |             return True
 41 | 
 42 |         try:
 43 |             a_lines = a.split('\n')
 44 |             b_lines = b.split('\n')
 45 |             assert len(a_lines) == len(b_lines)
 46 |             for i in xrange(len(a_lines)):
 47 |                 if i == 1:
 48 |                     assert a_lines[i].strip().startswith('-OEChem')
 49 |                     assert b_lines[i].strip().startswith('-OEChem')
 50 |                     continue
 51 |                 assert a_lines[i] == b_lines[i]
 52 |             return True
 53 |         except AssertionError:
 54 |             return False
 55 | 
 56 |     def test_get_records_cid(self):
 57 |         """
 58 |         2D CID request with get_records().
 59 |         """
 60 |         url = os.path.join(self.rest_url, 'compound/cid/2244/SDF')
 61 |         ref = urllib2.urlopen(url).read()
 62 |         data = self.engine.get_records([2244])
 63 |         assert self.identical_sdf(data, ref)
 64 | 
 65 |     def test_get_record_cid(self):
 66 |         """
 67 |         2D CID request with get_record().
 68 |         """
 69 |         url = os.path.join(self.rest_url, 'compound/cid/2244/SDF')
 70 |         ref = urllib2.urlopen(url).read()
 71 |         data = self.engine.get_record(2244)
 72 |         assert self.identical_sdf(data, ref)
 73 | 
 74 |     def test_get_records_sid(self):
 75 |         """
 76 |         SID request with get_records().
 77 |         """
 78 |         url = os.path.join(self.rest_url, 'substance/sid/179038559/SDF')
 79 |         ref = urllib2.urlopen(url).read()
 80 |         data = self.engine.get_records([179038559], sids=True)
 81 |         assert self.identical_sdf(data, ref)
 82 | 
 83 |     def test_get_record_sid(self):
 84 |         """
 85 |         SID request with get_record().
 86 |         """
 87 |         url = os.path.join(self.rest_url, 'substance/sid/179038559/SDF')
 88 |         ref = urllib2.urlopen(url).read()
 89 |         data = self.engine.get_record(179038559, sid=True)
 90 |         assert self.identical_sdf(data, ref)
 91 | 
 92 |     def test_get_records_3d(self):
 93 |         """
 94 |         3D structure request with get_records().
 95 |         """
 96 |         url = os.path.join(self.rest_url,
 97 |                            'compound/cid/2244/SDF?record_type=3d')
 98 |         ref = urllib2.urlopen(url).read()
 99 |         data = self.engine.get_records([2244], use_3d=True)
100 |         assert self.identical_sdf(data, ref)
101 | 
102 |     def test_get_record_3d(self):
103 |         """
104 |         3D structure request with get_record().
105 |         """
106 |         url = os.path.join(self.rest_url,
107 |                            'compound/cid/2244/SDF?record_type=3d')
108 |         ref = urllib2.urlopen(url).read()
109 |         data = self.engine.get_record(2244, use_3d=True)
110 |         assert self.identical_sdf(data, ref)
111 | 
112 |     def test_aid_cids(self):
113 |         """
114 |         Fetch CIDs from an AID.
115 |         """
116 |         url = os.path.join(self.rest_url, 'assay/aid/466/cids/TXT')
117 |         ref = np.asarray(urllib2.urlopen(url).read().split(), dtype=int)
118 |         data = self.engine.get_ids_from_assay(466)
119 |         assert np.array_equal(data, ref)
120 | 
121 |     def test_aid_sids(self):
122 |         """
123 |         Fetch SIDs from an AID.
124 |         """
125 |         url = os.path.join(self.rest_url, 'assay/aid/466/sids/TXT')
126 |         ref = np.asarray(urllib2.urlopen(url).read().split(), dtype=int)
127 |         data = self.engine.get_ids_from_assay(466, sids=True)
128 |         assert np.array_equal(data, ref)
129 | 
130 |     def test_aid_active_cids(self):
131 |         """
132 |         Fetch active CIDs from an AID.
133 |         """
134 |         url = os.path.join(self.rest_url,
135 |                            'assay/aid/466/cids/TXT?cids_type=active')
136 |         ref = np.asarray(urllib2.urlopen(url).read().split(), dtype=int)
137 |         data = self.engine.get_ids_from_assay(466, activity_outcome='active')
138 |         assert np.array_equal(data, ref)
139 | 
140 |     def test_aid_inactive_cids(self):
141 |         """
142 |         Fetch inactive CIDs from an AID.
143 |         """
144 |         url = os.path.join(self.rest_url,
145 |                            'assay/aid/466/cids/TXT?cids_type=inactive')
146 |         ref = np.asarray(urllib2.urlopen(url).read().split(), dtype=int)
147 |         data = self.engine.get_ids_from_assay(466, activity_outcome='inactive')
148 |         assert np.array_equal(data, ref)
149 | 
150 |     def test_aid_active_sids(self):
151 |         """
152 |         Fetch active SIDs from an AID.
153 |         """
154 |         url = os.path.join(self.rest_url,
155 |                            'assay/aid/466/sids/TXT?sids_type=active')
156 |         ref = np.asarray(urllib2.urlopen(url).read().split(), dtype=int)
157 |         data = self.engine.get_ids_from_assay(466, sids=True,
158 |                                               activity_outcome='active')
159 |         assert np.array_equal(data, ref)
160 | 
161 |     def test_aid_inactive_sids(self):
162 |         """
163 |         Fetch inactive SIDs from an AID.
164 |         """
165 |         url = os.path.join(self.rest_url,
166 |                            'assay/aid/466/sids/TXT?sids_type=inactive')
167 |         ref = np.asarray(urllib2.urlopen(url).read().split(), dtype=int)
168 |         data = self.engine.get_ids_from_assay(466, sids=True,
169 |                                               activity_outcome='inactive')
170 |         assert np.array_equal(data, ref)
171 | 
172 |     def test_get_assay_data(self):
173 |         """
174 |         Test PubChem.get_assay_data.
175 |         """
176 |         data = self.engine.get_assay_data(504772)
177 |         assert len(data.splitlines()) == 332  # 331 records plus header
178 | 
179 |     def test_get_assay_descriptions(self):
180 |         """
181 |         Test PubChem.get_assay_descriptions.
182 |         """
183 |         data = self.engine.get_assay_descriptions([490])
184 |         assert len(data) == 1
185 |         assert data[0]['aid']['id'] == 490  # check AID
186 | 
187 |     def test_get_assay_descriptions_parallel(self):
188 |         """
189 |         Test PubChem.get_assay_descriptions with n_jobs > 1.
190 |         """
191 |         aids = [490, 466, 9, 548, 851]
192 |         data = self.engine.get_assay_descriptions(aids, n_jobs=2)
193 |         assert len(data) == 5
194 | 
195 |         # check AIDs are all present (order is not guaranteed)
196 |         desc_aids = []
197 |         for desc in data:
198 |             desc_aids.append(desc['aid']['id'])
199 |         assert np.array_equal(np.sort(aids), np.sort(desc_aids))
200 | 
201 |     def test_id_exchange(self):
202 |         """
203 |         Test PubChem.id_exchange.
204 |         """
205 |         data = self.engine.id_exchange('CHEMBL25')
206 |         assert data['CHEMBL25'] == 2244
207 | 
208 |     def test_structure_search_smiles(self):
209 |         """
210 |         Test PubChem.structure_search with SMILES queries.
211 |         """
212 |         smiles = self.engine.get_records([2244], download_format='smiles')
213 |         smiles = smiles.split()[1]
214 |         assert self.engine.structure_search(smiles) == 2244
215 | 
216 |     def test_structure_search_sdf(self):
217 |         """
218 |         Test PubChem.structure_search with SDF queries.
219 |         """
220 |         sdf = self.engine.get_records([2244])
221 |         assert self.engine.structure_search(
222 |             sdf, structure_format='sdf') == 2244
223 | 
224 |     def test_get_parent_cids(self):
225 |         """
226 |         Test PubChem.get_parent_cids.
227 |         """
228 |         same = self.engine.get_parent_cids([2244])
229 |         assert same == {2244}, same
230 |         parents = self.engine.get_parent_cids([23666729, 5338317])
231 |         assert parents == {2244, 3672}, parents
232 | 
233 |     def test_get_record_filename(self):
234 |         """
235 |         Test PubChem.get_record()'s filename kwarg.
236 |         """
237 |         fd, fn = tempfile.mkstemp()
238 |         try:
239 |             ref = self.engine.get_record(2244)
240 |             self.engine.get_record(2244, filename=fn)
241 |             with open(fn) as f:
242 |                 data = f.read()
243 |             assert self.identical_sdf(data, ref)
244 |         finally:
245 |             os.close(fd)
246 |             os.unlink(fn)
247 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | numpy
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
 4 | def main():
 5 |     setup(
 6 |         name='pubchem_utils',
 7 |         version='0.1',
 8 |         license='3-clause BSD',
 9 |         url='https://github.com/skearnes/pubchem-utils',
10 |         description='Utilities for interacting with PubChem',
11 |         packages=find_packages(),
12 |     )
13 | 
# run setup only when this file is executed as a script, not when imported
if __name__ == '__main__':
    main()
16 | 


--------------------------------------------------------------------------------