├── .gitignore ├── LICENSE ├── MANIFEST ├── README.md ├── demos └── demos.ipynb ├── devtools └── conda-recipe │ ├── README.md │ ├── build.sh │ ├── conda_build_config.yaml │ └── meta.yaml ├── pypdb ├── __init__.py ├── clients │ ├── __init__.py │ ├── data │ │ ├── EXAMPLES.md │ │ ├── __init__.py │ │ ├── data_types.py │ │ ├── graphql │ │ │ ├── __init__.py │ │ │ ├── graphql.py │ │ │ └── test_graphql.py │ │ └── test_data_types.py │ ├── fasta │ │ ├── __init__.py │ │ ├── fasta_client.py │ │ └── fasta_client_test.py │ ├── pdb │ │ ├── __init__.py │ │ ├── pdb_client.py │ │ └── pdb_client_test.py │ └── search │ │ ├── EXAMPLES.md │ │ ├── __init__.py │ │ ├── operators │ │ ├── __init__.py │ │ ├── chemical_operators.py │ │ ├── chemical_operators_test.py │ │ ├── seqmotif_operators.py │ │ ├── seqmotif_operators_test.py │ │ ├── sequence_operators.py │ │ ├── sequence_operators_test.py │ │ ├── structure_operators.py │ │ ├── structure_operators_test.py │ │ ├── text_operators.py │ │ └── text_operators_test.py │ │ ├── search_client.py │ │ └── search_client_test.py ├── conftest.py ├── pypdb.py └── util │ ├── __init__.py │ ├── http_requests.py │ └── test_http_requests.py ├── setup.cfg ├── setup.py └── tests └── test_pypdb.py /.gitignore: -------------------------------------------------------------------------------- 1 | ###Temporary Files created by text editor### 2 | *~ 3 | .vscode 4 | .vscode/ 5 | 6 | ### Remove DS STORE 7 | *.DS_Store 8 | 9 | ### Remove autosave files 10 | \#* 11 | 12 | ### Local development files 13 | scratch* 14 | 15 | ### Python ### 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | 20 | # Setup code for pypi 21 | \#setup\## 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | env/ 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these 
files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | ### IPythonNotebook ### 74 | # Temporary data 75 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 William Gilpin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.cfg 3 | setup.py 4 | pypdb/pypdb.py 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyPDB 2 | 3 | A Python 3 toolkit for performing searches with the RCSB Protein Data Bank (PDB). This can be used to perform advanced searches for PDB IDs matching various criteria, as well as to look up information associated with specific PDB IDs. This tool allows standard operations that can be performed from within the PDB website (BLAST, PFAM lookup, etc.) to be performed from within Python scripts. 4 | 5 | If you use this module for any published work, please consider citing the accompanying paper 6 | 7 | Gilpin, W. "PyPDB: A Python API for the Protein Data Bank." 8 | Bioinformatics, Oxford Journals, 2016. 9 | 10 | ## Installation 11 | 12 | Install using pip: 13 | 14 | $ pip install pypdb 15 | 16 | To install the development version, which contains the latest features and fixes, install directly from GitHub using 17 | 18 | $ pip install git+https://github.com/williamgilpin/pypdb 19 | 20 | If you need to install directly from setup.py, 21 | 22 | $ python setup.py install 23 | 24 | To test the installation and check that the code successfully connects to the PDB, navigate to the root directory and run 25 | 26 | $ pytest 27 | 28 | This code has been designed and tested for Python 3. 29 | 30 | ## Usage 31 | 32 | ### PDB Text Search 33 | This package can be used to get lists of PDB IDs associated with specific search terms, experiment types, structures, and other common criteria. To use the simple API, see the examples in [`demos/demos.ipynb`](demos/demos.ipynb). 
For advanced search and query logic, see the examples in [`search/EXAMPLES.md`](pypdb/clients/search/EXAMPLES.md). 34 | 35 | ### PDB Data Fetch 36 | Given a list of PDBs, this package can be used to fetch data associated with those PDBs, including their dates of deposition, lists of authors and associated publications, their sequences or structures, their top BLAST matches, and other query-specific attributes like lists of ligands or chemical structures. To use the simple API, see the examples in [`demos/demos.ipynb`](demos/demos.ipynb). For advanced search and query logic, see the examples in [`data/EXAMPLES.md`](pypdb/clients/data/EXAMPLES.md). 37 | 38 | ## Issues and Feature Requests 39 | 40 | If you run into an issue, or if you find a workaround for an existing issue, please post your question or code as a GitHub issue. 41 | 42 | If posting a feature request, please check that your request is possible using [the current GUI on the RCSB website](https://www.rcsb.org/search/advanced). If so, please perform your search, and then click the link that says `JSON` in the upper right hand corner of the Advanced Search box. Please post that JSON code with your feature request. 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /devtools/conda-recipe/README.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | ## Required 4 | 5 | ```bash 6 | conda install conda-build anaconda-client 7 | ``` 8 | 9 | ## Building and pushing to https://anaconda.org/williamgilpin 10 | 11 | ```bash 12 | conda build . --no-anaconda-upload 13 | PACKAGE_OUTPUT=`conda build . 
--output` 14 | anaconda login 15 | anaconda upload --user williamgilpin $PACKAGE_OUTPUT 16 | conda build purge 17 | anaconda logout 18 | ``` 19 | 20 | ## Install 21 | 22 | ``` 23 | conda install -c williamgilpin pypdb 24 | ``` 25 | 26 | ## Additional Info 27 | https://docs.anaconda.com/anaconda-cloud/user-guide/tasks/work-with-packages 28 | -------------------------------------------------------------------------------- /devtools/conda-recipe/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 2 | 3 | -------------------------------------------------------------------------------- /devtools/conda-recipe/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.7 3 | 4 | -------------------------------------------------------------------------------- /devtools/conda-recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: pypdb 3 | version: 1.310 4 | 5 | source: 6 | path: ../../ 7 | 8 | build: 9 | number: 0 10 | 11 | requirements: 12 | build: 13 | - python {{ python }} 14 | - setuptools 15 | - numpy 16 | - pytest 17 | 18 | run: 19 | - python {{ python }} 20 | - numpy 21 | - xmltodict 22 | - beautifulsoup4 23 | - matplotlib 24 | - urllib3 25 | - jsonschema 26 | 27 | about: 28 | home: http://www.wgilpin.com/pypdb_docs/html/ 29 | license: 'https://github.com/williamgilpin/pypdb/blob/master/LICENSE' 30 | summary: "This is a copy of https://github.com/williamgilpin/pypdb, developed by William Gilpin, built here to be installed with conda." 31 | description: | 32 | This is a copy of https://github.com/williamgilpin/pypdb, developed by William Gilpin. 
33 | dev_url: https://github.com/williamgilpin/pypdb 34 | doc_url: http://www.wgilpin.com/pypdb_docs/html/ 35 | -------------------------------------------------------------------------------- /pypdb/__init__.py: -------------------------------------------------------------------------------- 1 | from .pypdb import * 2 | # from .pypdb.util import * 3 | -------------------------------------------------------------------------------- /pypdb/clients/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/__init__.py -------------------------------------------------------------------------------- /pypdb/clients/data/EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # PyPDB Data Fetch from the PDB 2 | 3 | ## Helpful Links 4 | 5 | The data fetch module here is a Python wrapper for the [graphQL API](https://data.rcsb.org/#fetch-data-graphql). 6 | PDB's data API [organizes the data in the following way](https://data.rcsb.org/#data-organization): 7 | 8 | * `entry` 9 | * `entity` 10 | * `polymer_entity` 11 | * `branched_entity` 12 | * `nonpolymer_entity` 13 | * `entity_instance` 14 | * `polymer_entity_instance` 15 | * `branched_entity_instance` 16 | * `nonpolymer_entity_instance` 17 | * `assembly` 18 | * `chemical_component` 19 | 20 | In addition to these, the following are also options in the PDB, but are currently not implemented in PyPDB: 21 | 22 | * `PubMed` 23 | * `UniProt` 24 | * `DrugBank` 25 | 26 | The data schemas for all of these data types can be viewed [here](https://data.rcsb.org/#data-schema). 27 | These schemas allow the user to determine what keywords to ask for. 
28 | The queries can be tested in-browser using the [GraphiQL tool](https://data.rcsb.org/graphql/index.html?query=%7B%0A%20%20entries(entry_ids%3A%20%5B%224HHB%22%5D)%20%7B%0A%20%20%20%20rcsb_id%0A%20%20%20%20struct%20%7B%0A%20%20%20%20%20%20title%0A%20%20%20%20%7D%0A%20%20%20%20exptl%20%7B%0A%20%20%20%20%20%20method%0A%20%20%20%20%7D%0A%20%20%7D%0A%7D). 29 | 30 | ## Examples 31 | 32 | All of the functionality, and thus the examples below, require the following imports: 33 | 34 | ```python 35 | from pypdb.clients.data.data_types import DataFetcher, DataType 36 | ``` 37 | 38 | ### Fetch entries using PDB IDs 39 | 40 | If we want to fetch some information about the PDB entries `4HHB`, `12CA`, and `3PQR`, we first create an instance of `DataFetcher`: 41 | 42 | ```python 43 | entry = DataFetcher(["4HHB", "12CA", "3PQR"], DataType.ENTRY) 44 | ``` 45 | 46 | The properties to fetch need to be given as a Python dictionary, commensurate with the [data schemas](https://data.rcsb.org/#data-schema): 47 | 48 | ```python 49 | property = {"exptl": ["method", "details"], "cell":["length_a", "length_b", "length_c"]} 50 | entry.add_property(property) 51 | ``` 52 | 53 | Then we fetch the data 54 | 55 | ```python 56 | entry.fetch_data() 57 | ``` 58 | 59 | where `entry.response` now contains a Python dictionary generated from the JSON formatted information fetched from the PDB. 60 | It is possible to flatten this into a dictionary (keyed by ID) that can be passed to pandas or polars: 61 | 62 | ```python 63 | data = entry.return_data_as_df_dict() 64 | ``` 65 | 66 | ### Fetch Assemblies 67 | 68 | Similarly to the `entry` case: 69 | 70 | ```python 71 | assembly = DataFetcher(["4HHB-1", "12CA-1", "3PQR-1"], DataType.ASSEMBLY) 72 | property = {"rcsb_assembly_info": ["entry_id", "assembly_id", "polymer_entity_instance_count"]} 73 | 74 | assembly.add_property(property) 75 | assembly.fetch_data() 76 | ``` 77 | 78 | Note that the IDs provided must be of the form `[entry_id]-[assembly_id]`. 
79 | 80 | ### Fetch Polymer Entities 81 | 82 | ```python 83 | fetcher = DataFetcher(["2CPK_1","3WHM_1","2D5Z_1"], DataType.POLYMER_ENTITY) 84 | property = {"rcsb_id": [], 85 | "rcsb_entity_source_organism": ["ncbi_taxonomy_id", "ncbi_scientific_name"], 86 | "rcsb_cluster_membership": ["cluster_id", "identity"]} 87 | 88 | fetcher.add_property(property) 89 | fetcher.fetch_data() 90 | ``` 91 | 92 | The IDs provided must be of the form `[entry_id]_[entity_id]`. 93 | 94 | ### Fetch Polymer Entity Instance 95 | 96 | ```python 97 | fetcher = DataFetcher(["4HHB.A", "12CA.A", "3PQR.A"], DataType.POLYMER_ENTITY_INSTANCE) 98 | property = {"rcsb_id": [], 99 | "rcsb_polymer_instance_annotation": ["annotation_id", "name", "type"]} 100 | fetcher.add_property(property) 101 | fetcher.fetch_data() 102 | ``` 103 | 104 | In this case, IDs are of the form `[entry_id].[asym_id]`, i.e. the entry ID followed by the chain (instance) ID, e.g. `4HHB.A`. 105 | 106 | ### Fetch Branched Entity 107 | 108 | ```python 109 | fetcher = DataFetcher(["5FMB_2", "6L63_3"], DataType.BRANCHED_ENTITY) 110 | property = {"pdbx_entity_branch": ["type"], 111 | "pdbx_entity_branch_descriptor": ["type", "descriptor"]} 112 | 113 | fetcher.add_property(property) 114 | fetcher.fetch_data() 115 | ``` 116 | 117 | ### Fetch Chemical Components 118 | 119 | ```python 120 | fetcher = DataFetcher(["NAG","EBW"], DataType.CHEMICAL_COMPONENT) 121 | property = {"rcsb_id":[], "chem_comp": ["type", "formula_weight","name","formula"], 122 | "rcsb_chem_comp_info":["initial_release_date"]} 123 | fetcher.add_property(property) 124 | fetcher.fetch_data() 125 | ``` -------------------------------------------------------------------------------- /pypdb/clients/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/data/__init__.py -------------------------------------------------------------------------------- /pypdb/clients/data/data_types.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Class for data types that can be accessed in the PDB DATA-API 3 | https://data.rcsb.org/#data-organization 4 | 5 | Namely: 6 | - entry 7 | - polymer entity 8 | - branched entity 9 | - non-polymer entity 10 | - polymer instance 11 | - branched instance 12 | - non-polymer instance 13 | - assembly 14 | - chemical component 15 | (currently not implemented:) 16 | - PubMed integrated data 17 | - UniProt integrated data 18 | - DrugBank integrated data 19 | """ 20 | from dataclasses import dataclass, field 21 | from enum import Enum 22 | 23 | #TODO: handle batch requests 24 | 25 | from pypdb.clients.data.graphql.graphql import search_graphql 26 | 27 | class DataType(Enum): 28 | ENTRY = "entries" 29 | POLYMER_ENTITY = "polymer_entities" 30 | BRANCHED_ENTITY = "branched_entities" 31 | NONPOLYMER_ENTITY = "nonpolymer_entities" 32 | POLYMER_ENTITY_INSTANCE = "polymer_entity_instances" 33 | BRANCHED_ENTITY_INSTANCE = "branched_entity_instances" 34 | NONPOLYMER_ENTITY_INSTANCE = "nonpolymer_entity_instances" 35 | ASSEMBLY = "assemblies" 36 | CHEMICAL_COMPONENT = "chem_comps" 37 | 38 | @dataclass 39 | class DataFetcher: 40 | """ 41 | General class that will host various data types, as detailed above. 42 | """ 43 | 44 | id: str | list 45 | data_type: DataType 46 | 47 | properties: dict = field(default_factory=dict) 48 | json_query: dict = field(default_factory=dict) 49 | response: dict = field(default_factory=dict) 50 | 51 | def __post_init__(self): 52 | """ 53 | Check types of IDs given, format accordingly. 54 | """ 55 | 56 | if isinstance(self.id, str): 57 | self.id = [self.id] 58 | 59 | if "entit" in self.data_type.value and "instance" not in self.data_type.value: 60 | for id in self.id: 61 | if '_' not in id: 62 | print(f"WARNING: {id} not valid for {self.data_type.value}.") 63 | elif "instance" in self.data_type.value: 64 | for id in self.id: 65 | if '.' 
not in id: 66 | print(f"WARNING: {id} not valid for {self.data_type.value}.") 67 | elif self.data_type == DataType.ASSEMBLY: 68 | for id in self.id: 69 | if '-' not in id: 70 | print(f"WARNING: {id} not valid for {self.data_type.value}.") 71 | 72 | def add_property(self, property): 73 | """ 74 | Add property to the list of data to fetch from the PDB. 75 | 76 | property is a python dict, with keys as properties, and 77 | values as subproperties. 78 | 79 | e.g.: 80 | 81 | {"cell": ["volume", "angle_beta"], "exptl": ["method"]} 82 | 83 | If the user is trying to add a property that already exists, 84 | the subproperties are merged. 85 | """ 86 | # check input data type 87 | if not isinstance(property, dict): 88 | raise TypeError 89 | # check data types of keys in dict 90 | if not all([isinstance(key, str) for key in property.keys()]): 91 | raise TypeError 92 | # check that values are lists of strings 93 | for key, value in property.items(): 94 | if isinstance(value, str): 95 | property[key] = [value] 96 | elif not isinstance(value, list): 97 | raise TypeError 98 | else: 99 | if not all([isinstance(val, str) for val in value]): 100 | raise TypeError 101 | 102 | # add properties to the dict 103 | for key, value in property.items(): 104 | if key not in self.properties: 105 | self.properties[key] = value 106 | else: 107 | self.properties[key] += value 108 | self.properties[key] = list(set(self.properties[key])) 109 | 110 | def generate_json_query(self): 111 | """ 112 | Given IDs, data type, and properties to fetch, create JSON query that 113 | will utilize graphql. 
114 | """ 115 | if not self.properties: 116 | print("ERROR: no properties given to generate JSON query.") 117 | raise ValueError 118 | 119 | if self.data_type == DataType.ENTRY: 120 | q_str = "entry_ids" 121 | elif "entit" in self.data_type.value: 122 | if "instance" in self.data_type.value: 123 | q_str = "instance_ids" 124 | else: 125 | q_str = "entity_ids" 126 | elif self.data_type == DataType.ASSEMBLY: 127 | q_str = "assembly_ids" 128 | elif self.data_type == DataType.CHEMICAL_COMPONENT: 129 | q_str = "comp_ids" 130 | 131 | data_str = f"{self.data_type.value}({q_str}: [" + ",".join(f"\"{w}\"" for w in self.id) + "])" 132 | 133 | props_string = "" 134 | for key, val in self.properties.items(): 135 | if len(val) == 0: 136 | props_string += f"{key}," 137 | else: 138 | props_string += f"{key} {{" + ",".join(val) + "}" 139 | 140 | self.json_query = {'query': "{" + data_str + "{" + props_string + "}}"} 141 | 142 | 143 | def fetch_data(self): 144 | """ 145 | Once the JSON query is created, fetch data from the PDB, using graphql. 146 | """ 147 | if not self.json_query: 148 | self.generate_json_query() 149 | 150 | response = search_graphql(self.json_query) 151 | 152 | if "errors" in response: 153 | print("ERROR encountered in fetch_data().") 154 | for error in response['errors']: 155 | print(error['message']) 156 | 157 | return 158 | 159 | self.response = response 160 | 161 | if len(self.response['data'][self.data_type.value]) != len(self.id): 162 | print("WARNING: one or more IDs not found in the PDB.") 163 | 164 | def return_data_as_df_dict(self): 165 | """ 166 | Return the fetched data as a dict usable by pandas or polars. 
167 | """ 168 | if not self.response: 169 | return None 170 | 171 | data = self.response['data'][self.data_type.value] 172 | 173 | # flatten data dictionary by joining property and subproperty names 174 | data_flat = {} 175 | for i, entry in enumerate(data): 176 | id = self.id[i] 177 | curr_dict = {} 178 | for key, values in entry.items(): 179 | if isinstance(values, list): 180 | v = values[0] 181 | else: 182 | v = values 183 | if isinstance(v, str): 184 | new_key = f"{key}" 185 | curr_dict[new_key] = v 186 | else: 187 | for subprop, val in v.items(): 188 | new_key = f"{key}.{subprop}" 189 | curr_dict[new_key] = val 190 | data_flat[id] = curr_dict 191 | 192 | return data_flat -------------------------------------------------------------------------------- /pypdb/clients/data/graphql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/data/graphql/__init__.py -------------------------------------------------------------------------------- /pypdb/clients/data/graphql/graphql.py: -------------------------------------------------------------------------------- 1 | """Contains logic to perform arbitrary GraphQL searches against RCSB. 2 | 3 | For the differences between the GraphQL and RESTful searches, see: 4 | https://data.rcsb.org/index.html#gql-vs-rest 5 | """ 6 | import requests 7 | import warnings 8 | from typing import Any # DO NOT APPROVE: fix this to actual type 9 | 10 | RSCB_GRAPHQL_URL = "https://data.rcsb.org/graphql?query=" 11 | 12 | 13 | def search_graphql(graphql_json_query: str) -> Any: 14 | """Performs RCSB search with JSON query using GraphQL. 
class TestGraphQL(unittest.TestCase):
    """Checks that `search_graphql` posts the query and returns the decoded JSON."""

    @mock.patch.object(requests, "post")
    def test_simple_search(self, mock_post):
        query = {'query': '{ entry(entry_id: "4HHB"){struct {title}} }'}
        canned_payload = {
            'data': {
                'entry': {
                    'struct': {
                        'title': 'THE CRYSTAL STRUCTURE OF HUMAN DEOXYHAEMOGLOBIN AT 1.74 ANGSTROMS RESOLUTION'
                    }
                }
            }
        }

        # Stand-in for the HTTP response so no network call is made.
        fake_response = mock.create_autospec(requests.Response, instance=True)
        fake_response.json.return_value = canned_payload
        mock_post.return_value = fake_response

        returned = graphql.search_graphql(query)

        mock_post.assert_called_once_with(url=graphql.RSCB_GRAPHQL_URL, json=query)
        self.assertEqual(returned, canned_payload)
class TestEntry(unittest.TestCase):
    """Tests for `DataFetcher`.

    Every test that calls `fetch_data()` queries the RCSB GraphQL endpoint
    without mocking, so those tests need network access.

    The flattening tests call `return_data_as_df_dict()`, the method that
    `DataFetcher` actually defines (there is no `return_data_as_pandas_df`).
    """

    def test_create(self):
        entry = DataFetcher("4HHB", DataType.ENTRY)

        self.assertIsInstance(entry.properties, dict)
        self.assertFalse(entry.properties)

        self.assertIsInstance(entry.json_query, dict)
        self.assertFalse(entry.json_query)

        self.assertIsInstance(entry.response, dict)
        self.assertFalse(entry.response)

        # A single string ID is normalised to a one-element list.
        self.assertEqual(entry.id, ["4HHB"])

    def test_generate_json_query(self):
        entry = DataFetcher("4HHB", DataType.ENTRY)

        property = {"exptl": ["method", "details"]}

        entry.add_property(property)

        self.assertIsNotNone(entry.properties)

        entry.generate_json_query()

        self.assertIsInstance(entry.json_query, dict)
        self.assertIn("query", entry.json_query)

    def test_fetch_entry(self):
        entry = DataFetcher("4HHB", DataType.ENTRY)
        property = {"exptl": ["method", "details"]}

        entry.add_property(property)

        entry.fetch_data()

        self.assertTrue(entry.response)

    def test_return_data_as_df_dict(self):
        entry = DataFetcher(["4HHB", "12CA", "3PQR"], DataType.ENTRY)
        property = {"exptl": ["method", "details"]}

        entry.add_property(property)

        entry.fetch_data()
        data = entry.return_data_as_df_dict()

        self.assertIsNotNone(data)

    def test_assembly_fetch(self):
        assembly = DataFetcher(["4HHB-1", "12CA-1", "3PQR-1"], DataType.ASSEMBLY)
        property = {"rcsb_assembly_info": ["entry_id", "assembly_id", "polymer_entity_instance_count"]}

        assembly.add_property(property)
        assembly.fetch_data()

        self.assertTrue(assembly.response)

    def test_polymer_entity_fetch(self):
        fetcher = DataFetcher(["2CPK_1", "3WHM_1", "2D5Z_1"], DataType.POLYMER_ENTITY)

        property = {"rcsb_id": [],
                    "rcsb_entity_source_organism": ["ncbi_taxonomy_id", "ncbi_scientific_name"],
                    "rcsb_cluster_membership": ["cluster_id", "identity"]}

        fetcher.add_property(property)
        fetcher.fetch_data()

        self.assertTrue(fetcher.response)

        data = fetcher.return_data_as_df_dict()
        self.assertIsNotNone(data)

    def test_polymer_instance_fetch(self):
        fetcher = DataFetcher(["4HHB.A", "12CA.A", "3PQR.A"], DataType.POLYMER_ENTITY_INSTANCE)
        property = {"rcsb_id": [],
                    "rcsb_polymer_instance_annotation": ["annotation_id", "name", "type"]}
        fetcher.add_property(property)
        fetcher.fetch_data()

        self.assertTrue(fetcher.response)

        data = fetcher.return_data_as_df_dict()
        self.assertIsNotNone(data)

    def test_branched_entity_fetch(self):
        fetcher = DataFetcher(["5FMB_2", "6L63_3"], DataType.BRANCHED_ENTITY)
        property = {"pdbx_entity_branch": ["type"],
                    "pdbx_entity_branch_descriptor": ["type", "descriptor"]}

        fetcher.add_property(property)
        fetcher.fetch_data()

        self.assertTrue(fetcher.response)

        data = fetcher.return_data_as_df_dict()
        self.assertIsNotNone(data)

    def test_chem_comps_fetch(self):
        fetcher = DataFetcher(["NAG", "EBW"], DataType.CHEMICAL_COMPONENT)
        property = {"rcsb_id": [], "chem_comp": ["type", "formula_weight", "name", "formula"],
                    "rcsb_chem_comp_info": ["initial_release_date"]}
        fetcher.add_property(property)
        fetcher.fetch_data()
        self.assertTrue(fetcher.response)

        data = fetcher.return_data_as_df_dict()
        self.assertIsNotNone(data)
`5RU3_1|Chains A,B|Non-structural protein 3|Severe acute respiratory syndrome coronavirus 2 (2697049)`) 27 | fasta_header: str 28 | 29 | 30 | def _parse_fasta_text_to_list(raw_fasta_text: str) -> List[FastaSequence]: 31 | """Parses raw FASTA response into easy-to-use dict representation.""" 32 | # Gets list of FASTA chunks (one per sequence) 33 | fasta_sequence_chunks = raw_fasta_text.strip().split(">")[1:] 34 | 35 | fasta_list = [] 36 | for fasta_sequence_chunk in fasta_sequence_chunks: 37 | chunk_lines = fasta_sequence_chunk.split("\n") 38 | fasta_header = chunk_lines[0] 39 | fasta_sequence = "".join(chunk_lines[1:]) 40 | 41 | header_segments = fasta_header.split("|") 42 | entity_id = header_segments[0] 43 | # Derives associated chains from header 44 | chains = re.sub("Chains? ", "", header_segments[1]).split(",") 45 | 46 | fasta_list.append( 47 | FastaSequence(entity_id=entity_id, 48 | chains=chains, 49 | sequence=fasta_sequence, 50 | fasta_header=fasta_header)) 51 | return fasta_list 52 | 53 | 54 | def get_fasta_from_rcsb_entry(rcsb_id: str, 55 | verbosity: bool = True, 56 | ) -> List[FastaSequence]: 57 | """Fetches FASTA sequence associated with PDB structure from RCSB. 58 | 59 | Args: 60 | rcsb_id: RCSB accession code of the structure of interest. E.g. `"5RU3"` 61 | verbosity: Print out the search query to the console (default: True) 62 | 63 | Returns: 64 | Dictionary containing FASTA result, from polymer entity id to the 65 | `FastaSequence` object associated with that entity. 
class TestFastaLogic(unittest.TestCase):
    """Unit tests for fetching and parsing RCSB FASTA responses."""

    @mock.patch.object(requests, "get")
    @mock.patch.object(fasta_client, "_parse_fasta_text_to_list")
    def test_get_fasta_file(self, mock_parse_fasta, mock_get):
        # Both the HTTP call and the parser are stubbed; only the wiring
        # between them is checked here.
        mock_get.return_value = mock.Mock(ok=True, text="fake_fasta_response")

        fasta_client.get_fasta_from_rcsb_entry("6TML", verbosity=True)

        mock_get.assert_called_once_with(
            "https://www.rcsb.org/fasta/entry/6TML")
        mock_parse_fasta.assert_called_once_with("fake_fasta_response")

    def test_parse_fasta_file(self):
        raw_text = """
>6TML_1|Chains Q7,Q8,Q9,q7,q8,q9|ATPTG11|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)
MVRNQRYPASPVQEIFLPEPVPFVQFDQTAPSPNSPPAPLPSPSLSQCEEQKDRYR
>6TML_2|Chain i9|ATPTG7|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)
MPSSSSEDAQGGNRFECVSNSTSPRRKNATKDEAACLQPRRSAVSGPREDVLCIR
>6TML_32|Chains H1,H2,H3,H4|subunit c|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)
MFFSRLSLSALKAAPAREAL"""

        # One expected record per ">" header, in file order.
        expected = [
            fasta_client.FastaSequence(
                entity_id="6TML_1",
                chains=["Q7", "Q8", "Q9", "q7", "q8", "q9"],
                sequence="MVRNQRYPASPVQEIFLPEPVPFVQFDQTAPSPNSPPAPLPSPSLSQCEEQKDRYR",
                fasta_header="6TML_1|Chains Q7,Q8,Q9,q7,q8,q9|ATPTG11|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)",
            ),
            fasta_client.FastaSequence(
                entity_id="6TML_2",
                chains=["i9"],
                sequence="MPSSSSEDAQGGNRFECVSNSTSPRRKNATKDEAACLQPRRSAVSGPREDVLCIR",
                fasta_header="6TML_2|Chain i9|ATPTG7|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)",
            ),
            fasta_client.FastaSequence(
                entity_id="6TML_32",
                chains=["H1", "H2", "H3", "H4"],
                sequence="MFFSRLSLSALKAAPAREAL",
                fasta_header="6TML_32|Chains H1,H2,H3,H4|subunit c|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)",
            ),
        ]

        parsed = fasta_client._parse_fasta_text_to_list(raw_text)
        self.assertEqual(parsed, expected)
def get_pdb_file(pdb_id: str,
                 filetype=PDBFileType.PDB,
                 compression=False) -> Optional[str]:
    '''Get the full PDB file associated with a PDB_ID

    Parameters
    ----------

    pdb_id : A 4 character string giving a pdb entry of interest

    filetype: The file type, given either as a `PDBFileType` member or as its
        string value (e.g. 'cif').
        PDB is the older file format,
        CIF is the newer replacement.
        XML can also be obtained and parsed using the various xml tools included in PyPDB
        STRUCTFACT retrieves structure factors (only available for certain PDB entries)

    compression : Whether or not to request the data as a compressed (gz) version of the file
        (note that the compression is undone by this function)

    Returns
    -------

    result : string
        The full, uncompressed file contents
        (returns None if the request to RCSB failed).
        NOTE: with compression=True the value is whatever `gzip.decompress`
        returns for the response content, i.e. not yet decoded to text.

    Raises
    ------

    ValueError : if `filetype` is a string that names no `PDBFileType`

    Examples
    --------
    >>> pdb_file = get_pdb_file('4lza', filetype='cif')
    >>> print(pdb_file[:200])
    data_4LZA
    #
    _entry.id 4LZA
    #
    _audit_conform.dict_name mmcif_pdbx.dic
    _audit_conform.dict_version 4.032
    _audit_conform.dict_location http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx

    '''

    # Accepts plain strings such as 'cif' in addition to enum members
    if isinstance(filetype, str):
        filetype = PDBFileType(filetype.lower())

    if filetype is PDBFileType.CIF and not compression:
        warnings.warn("Consider using `get_pdb_file` with compression=True "
                      "for CIF files (it makes the file download faster!)")

    pdb_url_builder = [PDB_DOWNLOAD_BASE_URL, pdb_id]

    # Structure factors live under "<id>-sf.cif"; every other type is
    # "<id>.<extension>"
    if filetype is PDBFileType.STRUCTFACT:
        pdb_url_builder.append("-sf.cif")
    else:
        pdb_url_builder += [".", filetype.value]

    if compression:
        # `append`, not `+=`: extending a list with a str adds one element
        # per character.
        pdb_url_builder.append(".gz")

    pdb_url = "".join(pdb_url_builder)

    print(
        "Sending GET request to {} to fetch {}'s {} file as a string.".format(
            pdb_url, pdb_id, filetype.value))

    response = http_requests.request_limited(pdb_url)

    if response is None or not response.ok:
        warnings.warn("Retrieval failed, returning None")
        return None

    if compression:
        return gzip.decompress(response.content)
    return response.text
test_umcompressed_pdb(self, mock_http_requests): 41 | mock_return_value_pdb = mock.Mock() 42 | mock_return_value_pdb.text = "fake_uncompressed_pdb" 43 | mock_return_value_pdb.ok = True 44 | mock_http_requests.return_value = mock_return_value_pdb 45 | 46 | self.assertEqual("fake_uncompressed_pdb", 47 | pdb_client.get_pdb_file("1234")) 48 | mock_http_requests.assert_called_once_with( 49 | "https://files.rcsb.org/download/1234.pdb") 50 | 51 | @mock.patch.object(http_requests, "request_limited", autospec=True) 52 | @mock.patch.object(gzip, "decompress") 53 | def test_compressed_structfact(self, mock_decompress, mock_http_requests): 54 | mock_return_value_pdb = mock.Mock() 55 | mock_return_value_pdb.content = "fake_compressed_structfact" 56 | mock_return_value_pdb.ok = True 57 | mock_http_requests.return_value = mock_return_value_pdb 58 | mock_decompress.return_value = "fake_decompressed_structfact" 59 | 60 | self.assertEqual( 61 | "fake_decompressed_structfact", 62 | pdb_client.get_pdb_file("HK97", 63 | pdb_client.PDBFileType.STRUCTFACT, 64 | compression=True)) 65 | mock_http_requests.assert_called_once_with( 66 | "https://files.rcsb.org/download/HK97-sf.cif.gz") 67 | mock_decompress.assert_called_once_with("fake_compressed_structfact") 68 | 69 | @mock.patch.object(http_requests, "request_limited", autospec=True) 70 | def test_uncompressed_xml(self, mock_http_requests): 71 | mock_return_value_pdb = mock.Mock() 72 | mock_return_value_pdb.text = "fake_uncompressed_xml" 73 | mock_return_value_pdb.ok = True 74 | mock_http_requests.return_value = mock_return_value_pdb 75 | 76 | self.assertEqual( 77 | "fake_uncompressed_xml", 78 | pdb_client.get_pdb_file("MI17", 79 | pdb_client.PDBFileType.XML, 80 | compression=False)) 81 | mock_http_requests.assert_called_once_with( 82 | "https://files.rcsb.org/download/MI17.xml") 83 | 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- 
/pypdb/clients/search/EXAMPLES.md: -------------------------------------------------------------------------------- 1 | # PyPDB Text Search 2 | 3 | ## Helpful Links 4 | 5 | The Search logic here is a Python wrapper around the RCSB's search logic. 6 | For in-the-weeds details on how each operator works, refer to the 7 | [RCSB Search API documentation](https://search.rcsb.org/index.html). 8 | 9 | The search operators defined within the `operators` directory support querying 10 | RCSB attributes against the appropriate search service. For example, if 11 | you are querying the RCSB Text Search Service, all 12 | operators within `text_operators.py` should be supported. 13 | 14 | For a list of RCSB attributes associated with structures you can search, see 15 | [RCSB's List of Structure Attributes to Search](https://search.rcsb.org/structure-search-attributes.html) and [RCSB's List of Chemical Attributes to Search](https://search.rcsb.org/chemical-search-attributes.html) 16 | Note that not every structure will have every attribute. 17 | 18 | Two querying functions are currently supported by PyPDB: 19 | 20 | * `perform_search`: This function is good for simple queries 21 | * `perform_search_with_graph`: This function allows building complicated queries using RCSB's query node syntax.
22 | 23 | ## `perform_search` Examples 24 | 25 | ### Search for all entries that mention the word 'ribosome' 26 | 27 | ```python 28 | from pypdb.clients.search.search_client import perform_search 29 | from pypdb.clients.search.search_client import ReturnType 30 | from pypdb.clients.search.operators import text_operators 31 | 32 | 33 | search_operator = text_operators.DefaultOperator(value="ribosome") 34 | return_type = ReturnType.ENTRY 35 | 36 | results = perform_search(search_operator, return_type) 37 | ``` 38 | 39 | ### Search for polymers from 'Mus musculus' 40 | 41 | ```python 42 | from pypdb.clients.search.search_client import perform_search 43 | from pypdb.clients.search.search_client import ReturnType 44 | from pypdb.clients.search.operators import text_operators 45 | 46 | 47 | search_operator = text_operators.ExactMatchOperator(value="Mus musculus", 48 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 49 | return_type = ReturnType.POLYMER_ENTITY 50 | 51 | results = perform_search(search_operator, return_type) 52 | ``` 53 | 54 | ### Search for non-polymers from 'Mus musculus' or 'Homo sapiens' 55 | 56 | ```python 57 | from pypdb.clients.search.search_client import perform_search 58 | from pypdb.clients.search.search_client import ReturnType 59 | from pypdb.clients.search.operators import text_operators 60 | 61 | search_operator = text_operators.InOperator(values=["Mus musculus", "Homo sapiens"], 62 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 63 | return_type = ReturnType.NON_POLYMER_ENTITY 64 | 65 | results = perform_search(search_operator, return_type) 66 | ``` 67 | 68 | ### Search for polymer instances whose titles contain "actin" or "binding" or "protein" 69 | 70 | ```python 71 | from pypdb.clients.search.search_client import perform_search 72 | from pypdb.clients.search.search_client import ReturnType 73 | from pypdb.clients.search.operators import text_operators 74 | 75 | 76 | search_operator =
text_operators.ContainsWordsOperator(value="actin-binding protein", 77 | attribute="struct.title") 78 | return_type = ReturnType.POLYMER_INSTANCE 79 | 80 | results = perform_search(search_operator, return_type) 81 | ``` 82 | 83 | ### Search for assemblies that contain the words "actin binding protein" 84 | 85 | (must be in that order). 86 | 87 | For example, "actin-binding protein" and "actin binding protein" will match, 88 | but "protein binding actin" will not. 89 | 90 | ```python 91 | from pypdb.clients.search.search_client import perform_search 92 | from pypdb.clients.search.search_client import ReturnType 93 | from pypdb.clients.search.operators import text_operators 94 | 95 | 96 | search_operator = text_operators.ContainsPhraseOperator(value="actin-binding protein", 97 | attribute="struct.title") 98 | return_type = ReturnType.ASSEMBLY 99 | 100 | results = perform_search(search_operator, return_type) 101 | ``` 102 | 103 | ### Search for entries released in 2019 or later 104 | 105 | ```python 106 | from pypdb.clients.search.search_client import perform_search 107 | from pypdb.clients.search.search_client import ReturnType 108 | from pypdb.clients.search.operators import text_operators 109 | 110 | 111 | search_operator = text_operators.ComparisonOperator( 112 | value="2019-01-01T00:00:00Z", 113 | attribute="rcsb_accession_info.initial_release_date", 114 | comparison_type=text_operators.ComparisonType.GREATER) 115 | return_type = ReturnType.ENTRY 116 | 117 | results = perform_search(search_operator, return_type) 118 | ``` 119 | 120 | ### Search for entries released only in 2019 121 | 122 | ```python 123 | from pypdb.clients.search.search_client import perform_search 124 | from pypdb.clients.search.search_client import ReturnType 125 | from pypdb.clients.search.operators import text_operators 126 | 127 | 128 | search_operator = text_operators.RangeOperator( 129 | from_value="2019-01-01T00:00:00Z", 130 | to_value="2020-01-01T00:00:00Z", 131 |
include_lower=True, 132 | include_upper=False, 133 | attribute="rcsb_accession_info.initial_release_date") 134 | return_type = ReturnType.ENTRY 135 | 136 | results = perform_search(search_operator, return_type) 137 | ``` 138 | 139 | ### Search for structures under 4 angstroms of resolution 140 | 141 | ```python 142 | from pypdb.clients.search.search_client import perform_search 143 | from pypdb.clients.search.search_client import ReturnType 144 | from pypdb.clients.search.operators import text_operators 145 | 146 | 147 | search_operator = text_operators.ComparisonOperator( 148 | value=4, 149 | attribute="rcsb_entry_info.resolution_combined", 150 | comparison_type=text_operators.ComparisonType.LESS) 151 | return_type = ReturnType.ENTRY 152 | 153 | results = perform_search(search_operator, return_type) 154 | ``` 155 | 156 | ### Search for structures with a given attribute 157 | 158 | (Admittedly every structure has a release date, but the same logic would 159 | apply for a more sparse RCSB attribute). 
160 | 161 | ```python 162 | from pypdb.clients.search.search_client import perform_search 163 | from pypdb.clients.search.search_client import ReturnType 164 | from pypdb.clients.search.operators import text_operators 165 | 166 | 167 | search_operator = text_operators.ExistsOperator( 168 | attribute="rcsb_accession_info.initial_release_date") 169 | return_type = ReturnType.ENTRY 170 | 171 | results = perform_search(search_operator, return_type) 172 | ``` 173 | 174 | ### Search for top 100 structures matching the given protein sequence, by date 175 | 176 | (this sequence matches the SARS-CoV-2 NSP3 macrodomain) 177 | 178 | ```python 179 | from pypdb.clients.search.search_client import perform_search, RequestOptions 180 | from pypdb.clients.search.search_client import ReturnType 181 | from pypdb.clients.search.operators.sequence_operators import SequenceOperator 182 | from pypdb.clients.search.operators.sequence_operators import SequenceType 183 | 184 | results = perform_search( 185 | return_type=ReturnType.ENTRY, 186 | search_operator=SequenceOperator( 187 | sequence_type=SequenceType.PROTEIN, # if not explicitly specified, this will autoresolve 188 | sequence=( 189 | "SMVNSFSGYLKLTDNVYIKNADIVEEAKKVKPTVVVNAANVYLKHGGGVAGALNKATNNAMQVESDDY" 190 | "IATNGPLKVGGSCVLSGHNLAKHCLHVVGPNVNKGEDIQLLKSAYENFNQHEVLLAPLLSAGIFGADP" 191 | "IHSLRVCVDTVRTNVYLAVFDKNLYDKLVSSFL"), 192 | identity_cutoff=0.99, 193 | evalue_cutoff=1000 194 | ), 195 | request_options=RequestOptions( 196 | result_start_index=0, 197 | num_results=100, 198 | sort_by="rcsb_accession_info.initial_release_date", 199 | desc=False 200 | ), 201 | return_with_scores=True 202 | ) 203 | ``` 204 | 205 | ### Search for structures that match the sequence of an existing RCSB entry 206 | 207 | ```python 208 | from pypdb.clients.fasta.fasta_client import get_fasta_from_rcsb_entry 209 | from pypdb.clients.search.search_client import perform_search 210 | from pypdb.clients.search.search_client import ReturnType 211 | from 
pypdb.clients.search.operators.sequence_operators import SequenceOperator 212 | 213 | # Fetches the first sequence in the "6TML" fasta file 214 | fasta_sequence = get_fasta_from_rcsb_entry("6TML", verbosity=True)[0].sequence 215 | 216 | # Performs sequence search ('BLAST'-like) using the FASTA sequence 217 | results = perform_search( 218 | return_type=ReturnType.ENTRY, 219 | search_operator=SequenceOperator( 220 | sequence=fasta_sequence, 221 | identity_cutoff=0.99, 222 | evalue_cutoff=1000 223 | ), 224 | return_with_scores=True 225 | ) 226 | ``` 227 | 228 | ## `perform_search_with_graph` Example 229 | 230 | ### Search for 'Mus musculus' or 'Homo sapiens' structures under 4 Angstroms of resolution 231 | 232 | ```python 233 | from pypdb.clients.search.search_client import perform_search_with_graph 234 | from pypdb.clients.search.search_client import ReturnType 235 | from pypdb.clients.search.search_client import QueryGroup, LogicalOperator 236 | from pypdb.clients.search.operators import text_operators 237 | 238 | # SearchOperator associated with structures with under 4 Angstroms of resolution 239 | under_4A_resolution_operator = text_operators.ComparisonOperator( 240 | value=4, 241 | attribute="rcsb_entry_info.resolution_combined", 242 | comparison_type=text_operators.ComparisonType.LESS) 243 | 244 | # SearchOperator associated with entities containing 'Mus musculus' lineage 245 | is_mus_operator = text_operators.ExactMatchOperator( 246 | value="Mus musculus", 247 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 248 | 249 | # SearchOperator associated with entities containing 'Homo sapiens' lineage 250 | is_human_operator = text_operators.ExactMatchOperator( 251 | value="Homo sapiens", 252 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 253 | 254 | # QueryGroup associated with being either human or `Mus musculus` 255 | is_human_or_mus_group = QueryGroup( 256 | queries = [is_mus_operator, is_human_operator], 257 | logical_operator = LogicalOperator.OR
258 | ) 259 | 260 | # QueryGroup associated with being ((Human OR Mus) AND (Under 4 Angstroms)) 261 | is_under_4A_and_human_or_mus_group = QueryGroup( 262 | queries = [is_human_or_mus_group, under_4A_resolution_operator], 263 | logical_operator = LogicalOperator.AND 264 | ) 265 | 266 | results = perform_search_with_graph( 267 | query_object=is_under_4A_and_human_or_mus_group, 268 | return_type=ReturnType.ENTRY) 269 | ``` 270 | 271 | ### Search for Calcium-Bound Calmodulin Structures 272 | 273 | Note that "1CLL" corresponds to a Calmodulin structure bound to Ca2+. 274 | 275 | Also, searching for `rcsb_chem_comp_container_identifiers.comp_id` with 276 | an exact match to `"CA"` yields only structures in complex with Ca2+ 277 | (filtering out structures in complex with other metals like strontium). 278 | 279 | ```python 280 | from pypdb.clients.search.search_client import perform_search_with_graph 281 | from pypdb.clients.search.search_client import ReturnType 282 | from pypdb.clients.search.search_client import QueryGroup, LogicalOperator 283 | from pypdb.clients.search.operators import text_operators, structure_operators 284 | 285 | is_similar_to_1CLL = structure_operators.StructureOperator( 286 | pdb_entry_id="1CLL", 287 | assembly_id=1, 288 | search_mode=structure_operators.StructureSearchMode.STRICT_SHAPE_MATCH 289 | ) 290 | 291 | is_in_complex_with_calcium = text_operators.ExactMatchOperator( 292 | attribute="rcsb_chem_comp_container_identifiers.comp_id", 293 | value="CA" 294 | ) 295 | 296 | results = perform_search_with_graph( 297 | query_object=QueryGroup( 298 | logical_operator=LogicalOperator.AND, 299 | queries=[is_similar_to_1CLL, is_in_complex_with_calcium] 300 | ), 301 | return_type=ReturnType.ENTRY 302 | ) 303 | ``` 304 | -------------------------------------------------------------------------------- /pypdb/clients/search/__init__.py: --------------------------------------------------------------------------------
class DescriptorMatchingCriterion(Enum):
    """How strictly a chemical descriptor has to match in an RCSB search.

    Each member's value is the string `ChemicalOperator` emits as
    `match_type`. For definitions of these criteria, see:
    https://search.rcsb.org/#search-services
    """
    GRAPH_STRICT = "graph-strict"
    GRAPH_RELAXED = "graph-relaxed"
    GRAPH_RELAXED_STEREO = "graph-relaxed-stereo"
    FINGERPRINT_SIMILARITY = "fingerprint-similarity"


@dataclass
class ChemicalOperator:
    """Chemical search operator built from a SMILES or InChI descriptor."""
    # The descriptor to match (a valid SMILES or InChI string)
    descriptor: str
    # What counts as a match ("graph-strict" unless stated otherwise)
    matching_criterion: DescriptorMatchingCriterion = (
        DescriptorMatchingCriterion.GRAPH_STRICT)

    def __post_init__(self):
        """Sets `descriptor_type` based on the descriptor's prefix."""
        # InChI strings always begin with "InChI="; anything else is taken
        # to be a SMILES string.
        is_inchi = self.descriptor.startswith("InChI=")
        self.descriptor_type = "InChI" if is_inchi else "SMILES"

    def _to_dict(self) -> Dict[str, Any]:
        """Returns this operator's query parameters as a dictionary."""
        query: Dict[str, Any] = {
            "value": self.descriptor,
            "type": "descriptor",
        }
        query["descriptor_type"] = self.descriptor_type
        query["match_type"] = self.matching_criterion.value
        return query
class SequenceType(Enum):
    """Kind of sequence a motif search is run against."""
    DNA = "pdb_dna_sequence"
    RNA = "pdb_rna_sequence"
    PROTEIN = "pdb_protein_sequence"


class PatternType(Enum):
    """Syntax in which a SeqMotif search pattern is written."""
    SIMPLE = "simple"
    PROSITE = "prosite"
    REGEX = "regex"


@dataclass
class SeqMotifOperator:
    """Search operator for sequence-motif (SeqMotif) queries."""
    # Motif pattern to look for, written in the syntax named by `pattern_type`
    pattern: str
    # Kind of sequence the pattern is matched against
    sequence_type: SequenceType
    # Syntax of `pattern`
    pattern_type: PatternType

    def _to_dict(self) -> Dict[str, Any]:
        """Returns this operator's query parameters as a dictionary."""
        query: Dict[str, Any] = {"value": self.pattern}
        query["pattern_type"] = self.pattern_type.value
        query["target"] = self.sequence_type.value
        return query
class SequenceType(Enum):
    """Kind of sequence a sequence search targets."""
    DNA = "pdb_dna_sequence"
    RNA = "pdb_rna_sequence"
    PROTEIN = "pdb_protein_sequence"


class CannotAutoresolveSequenceTypeError(Exception):
    """Raised when a sequence's `SequenceType` cannot be told from its letters."""


@dataclass
class SequenceOperator:
    """Search operator for sequence-similarity (MMseqs2, BLAST-like) search.

    When `sequence_type` is left as None it is inferred from the letters of
    `sequence`; `CannotAutoresolveSequenceTypeError` is raised if that is not
    possible.
    """
    sequence: str
    # Left as None, the type is inferred from the sequence itself
    sequence_type: Optional[SequenceType] = None
    # Maximum E Value allowed for results
    # (see: https://www.ncbi.nlm.nih.gov/BLAST/tutorial/Altschul-1.html)
    evalue_cutoff: float = 100
    # Minimum identity cutoff allowed for results
    # (see: https://www.ncbi.nlm.nih.gov/books/NBK62051/def-item/identity/)
    identity_cutoff: float = 0.95

    def __post_init__(self):
        """Infers the sequence type when the caller did not supply one."""
        if self.sequence_type is not None:
            return
        self._autoresolve_sequence_type()

    def _autoresolve_sequence_type(self):
        """Sets `sequence_type` from the set of letters in `sequence`.

        Raises:
            CannotAutoresolveSequenceTypeError: if the letters fit no type
                unambiguously (e.g. a sequence made only of "A").
        """
        letters = set(self.sequence)

        # DNA / RNA need their distinguishing base (T / U) to be present;
        # a protein needs at least one letter that is not also a nucleotide.
        if letters <= set("ATCG") and "T" in letters:
            resolved = SequenceType.DNA
        elif letters <= set("AUCG") and "U" in letters:
            resolved = SequenceType.RNA
        elif (letters <= set("ABCDEFGHIKLMNPQRSTVWXYZ")
              and letters & set("BDEFHIKLMNPQRSVWXYZ")):
            resolved = SequenceType.PROTEIN
        else:
            raise CannotAutoresolveSequenceTypeError(
                "Sequence is ambiguous as to its SequenceType: `{}`".format(
                    self.sequence))
        self.sequence_type = resolved

    def _to_dict(self) -> Dict[str, Any]:
        """Returns this operator's query parameters as a dictionary."""
        query: Dict[str, Any] = {
            "evalue_cutoff": self.evalue_cutoff,
            "identity_cutoff": self.identity_cutoff,
        }
        query["target"] = self.sequence_type.value  # type: ignore
        query["value"] = self.sequence
        return query
class StructureSearchMode(Enum):
    """Shape-matching mode used for a structure search. See:
    https://github.com/biocryst/biozernike/
    """
    STRICT_SHAPE_MATCH = "strict_shape_match"
    RELAXED_SHAPE_MATCH = "relaxed_shape_match"


@dataclass
class StructureOperator:
    """Operator for 3D structural similarity search. See:
    https://github.com/biocryst/biozernike/

    The query structure is identified by a PDB entry id and an assembly id.
    """
    # Entry and assembly number of the structure to search with
    # (results will show other PDB entities with similar 3D structures)
    pdb_entry_id: str
    assembly_id: int = 1
    # Shape-matching mode
    search_mode: StructureSearchMode = StructureSearchMode.STRICT_SHAPE_MATCH

    def _to_dict(self) -> Dict[str, Any]:
        """Returns this operator's query parameters as a dictionary."""
        # The assembly id is emitted as a string, not an int
        structure_ref = {
            "entry_id": self.pdb_entry_id,
            "assembly_id": str(self.assembly_id),
        }
        return {"value": structure_ref, "operator": self.search_mode.value}
@dataclass
class DefaultOperator:
    """Default (full-text) search operator.

    Searches across the available fields and reports a hit when `value`
    matches in any of them.
    """
    value: str

    def _to_dict(self) -> Dict[str, str]:
        """Return the `parameters` payload: just the searched value."""
        return dict(value=self.value)


@dataclass
class ExactMatchOperator:
    """Exact-match operator.

    The input value has to equal the field value exactly, including
    whitespace, special characters and case.
    """
    attribute: str
    value: Any

    def _to_dict(self) -> Dict[str, Any]:
        """Return the `parameters` payload for an `exact_match` query."""
        # Built key by key so the serialized key order stays
        # attribute -> operator -> value.
        parameters: Dict[str, Any] = {}
        parameters["attribute"] = self.attribute
        parameters["operator"] = "exact_match"
        parameters["value"] = self.value
        return parameters
@dataclass
class ContainsWordsOperator:
    """Match an attribute field that contains any word of `value`.

    For example, "actin-binding protein" will return results containing
    "actin" OR "binding" OR "protein" within the attribute.
    """
    attribute: str
    value: str

    def _to_dict(self) -> Dict[str, str]:
        """Return the `parameters` payload for a `contains_words` query."""
        return dict(
            attribute=self.attribute,
            operator="contains_words",
            value=self.value,
        )


@dataclass
class ContainsPhraseOperator:
    """Match an attribute field that contains every word of `value`, in order.

    For example, "actin-binding protein" will be interpreted as
    "actin" AND "binding" AND "protein" occurring in that given order.
    """
    attribute: str
    value: str

    def _to_dict(self) -> Dict[str, str]:
        """Return the `parameters` payload for a `contains_phrase` query."""
        parameters = {"attribute": self.attribute}
        parameters["operator"] = "contains_phrase"
        parameters["value"] = self.value
        return parameters


class ComparisonType(Enum):
    """Kinds of comparison between an attribute field and a value."""
    GREATER = "greater"
    GREATER_OR_EQUAL = "greater_or_equal"
    EQUAL = "equals"
    NOT_EQUAL = "not_equal"
    LESS_OR_EQUAL = "less_or_equal"
    LESS = "less"
@dataclass
class RangeOperator:
    """Returns results whose attribute lies within a range.

    Attributes:
        attribute: RCSB search attribute to filter on.
        from_value: Lower bound of the range.
        to_value: Upper bound of the range.
        include_lower: Whether `from_value` itself counts as inside the
            range (default: inclusive).
        include_upper: Whether `to_value` itself counts as inside the
            range (default: inclusive).
        negation: If True, the condition is negated.
    """
    attribute: str
    from_value: Any
    to_value: Any
    include_lower: bool = True  # Default inclusive
    include_upper: bool = True  # Default inclusive
    negation: bool = False

    def _to_dict(self) -> Dict[str, Any]:
        """Return the `parameters` payload for a `range` query."""
        return {
            "operator": "range",
            "attribute": self.attribute,
            "negation": self.negation,
            "value": {
                "from": self.from_value,
                "to": self.to_value,
                # Send the bounds' inclusivity explicitly; otherwise the
                # `include_lower` / `include_upper` fields would be
                # accepted by the constructor but never reach the query.
                "include_lower": self.include_lower,
                "include_upper": self.include_upper,
            },
        }
ExistsOperator: 179 | attribute: str 180 | 181 | def _to_dict(self) -> Dict[str, str]: 182 | return {"operator": "exists", "attribute": self.attribute} 183 | 184 | 185 | # An object of type `TextSearchOperator` can be any of the following classes: 186 | TextSearchOperator = Union[DefaultOperator, ExactMatchOperator, InOperator, 187 | ContainsWordsOperator, ContainsPhraseOperator, 188 | ComparisonOperator, RangeOperator, ExistsOperator] 189 | 190 | # List of all TextSearchOperator-associated classes, for backwards compatability 191 | # in terms of checking SearchOperator validity 192 | # (please change this when you change the `Union` definition) 193 | TEXT_SEARCH_OPERATORS = [ 194 | DefaultOperator, ExactMatchOperator, InOperator, ContainsWordsOperator, 195 | ContainsPhraseOperator, ComparisonOperator, RangeOperator, ExistsOperator 196 | ] 197 | -------------------------------------------------------------------------------- /pypdb/clients/search/operators/text_operators_test.py: -------------------------------------------------------------------------------- 1 | """Tests for RCSB Text Search Service Operators 2 | (admittedly, a lot is tested in `search_client_test.py` too) 3 | """ 4 | 5 | import unittest 6 | 7 | from pypdb.clients.search.operators import text_operators 8 | 9 | 10 | class TestTextOperators(unittest.TestCase): 11 | def test_not_equals_operator(self): 12 | not_equals_operator = text_operators.ComparisonOperator( 13 | attribute="struct.favourite_marvel_movie", 14 | value="Thor: Ragnarok", 15 | comparison_type=text_operators.ComparisonType.NOT_EQUAL) 16 | 17 | self.assertEqual( 18 | not_equals_operator._to_dict(), { 19 | "attribute": "struct.favourite_marvel_movie", 20 | "value": "Thor: Ragnarok", 21 | "operator": "equals", 22 | "negation": True 23 | }) 24 | -------------------------------------------------------------------------------- /pypdb/clients/search/search_client.py: 
-------------------------------------------------------------------------------- 1 | """Barebones Python API Wrapper implementation around RCSB Search API. 2 | 3 | This file contains Python dataclasses that formalize the API within native 4 | Python objects. This aims to completely implement the Search API in Python. 5 | 6 | For RCSB API docs, see: https://search.rcsb.org/index.html 7 | """ 8 | 9 | # TODO(lacoperon): Implement request options 10 | 11 | from dataclasses import dataclass 12 | from enum import Enum 13 | import json 14 | import requests 15 | from typing import Any, Dict, List, Optional, Union 16 | import warnings 17 | 18 | from pypdb.clients.search.operators import sequence_operators 19 | from pypdb.clients.search.operators import text_operators 20 | from pypdb.clients.search.operators.chemical_operators import ChemicalOperator 21 | from pypdb.clients.search.operators.seqmotif_operators import SeqMotifOperator 22 | from pypdb.clients.search.operators.sequence_operators import SequenceOperator 23 | from pypdb.clients.search.operators.structure_operators import StructureOperator 24 | from pypdb.clients.search.operators.text_operators import TextSearchOperator 25 | 26 | SEARCH_URL_ENDPOINT: str = "https://search.rcsb.org/rcsbsearch/v2/query" 27 | """SearchOperators correspond to individual search operations. 28 | 29 | These can be used to search on their own using `perform_search`, or they can be 30 | aggregated together into a `QueryGroup` to search using multiple operators at 31 | once using `perform_search_with_graph`. 32 | """ 33 | SearchOperator = Union[TextSearchOperator, SequenceOperator, StructureOperator, 34 | SeqMotifOperator] 35 | 36 | 37 | class LogicalOperator(Enum): 38 | """Operation used to combine `QueryGroup` results.""" 39 | AND = "and" 40 | OR = "or" 41 | 42 | 43 | @dataclass 44 | class QueryGroup: 45 | """Group of search operators against RCSB Search API, 46 | whose independent results are aggregated with `logical_operator`. 
class ReturnType(Enum):
    """For details, see: https://search.rcsb.org/index.html#return-type"""
    ENTRY = "entry"
    ASSEMBLY = "assembly"
    POLYMER_ENTITY = "polymer_entity"
    NON_POLYMER_ENTITY = "non_polymer_entity"
    POLYMER_INSTANCE = "polymer_instance"


@dataclass
class RequestOptions:
    """Options to configure which results are returned, and in what order."""
    # Returns `num_results` results starting at `result_start_index`
    # (pagination). `result_start_index` defaults to 0 when only
    # `num_results` is given.
    # If `num_results` is not defined, all results are requested.
    # (returning all results can be slow for compute-intensive searches)
    result_start_index: Optional[int] = None
    num_results: Optional[int] = None
    # What attribute to sort by.
    # This should either be "score" (to sort by score),
    # or a valid RCSB attribute value
    # (e.g. "rcsb_accession_info.initial_release_date")
    sort_by: Optional[str] = "score"
    # Whether to sort descending (True) or ascending (False).
    desc: Optional[bool] = True

    def _to_dict(self) -> Dict[str, Any]:
        """Return the `request_options` payload for the RCSB Search API.

        Returns:
            A dict with either a "paginate" entry (when `num_results` is
            set) or "return_all_hits": True (when it is not), plus a "sort"
            entry when both `sort_by` and `desc` are set.
        """
        result_dict: Dict[str, Any] = {}

        if self.num_results is not None:
            start_index = (0 if self.result_start_index is None else
                           self.result_start_index)
            result_dict["paginate"] = {
                "start": start_index,
                "rows": self.num_results
            }
        else:
            # Without a row count there is no page to request; ask for every
            # hit, which is what the field comments above promise and what
            # `perform_search_with_graph` sends when no options are given.
            result_dict["return_all_hits"] = True

        if self.sort_by is not None and self.desc is not None:
            result_dict["sort"] = [{
                "sort_by": self.sort_by,
                "direction": "desc" if self.desc else "asc"
            }]

        return result_dict


@dataclass
class ScoredResult:
    """Search hit annotated with its score."""
    entity_id: str  # PDB Entity ID (e.g. 5JUP for the entry return type)
    score: float


# Raw (decoded) JSON response body, as returned by the RCSB Search API.
RawJSONDictResponse = Dict[str, Any]
# Operator classes that describe a single query; instances of these are
# wrapped into a terminal `_QueryNode` before being serialized.
# `ChemicalOperator` is listed because `_infer_search_service` supports it.
_SEARCH_OPERATORS = text_operators.TEXT_SEARCH_OPERATORS + [
    SequenceOperator, StructureOperator, SeqMotifOperator, ChemicalOperator
]


def perform_search_with_graph(
    query_object: Union[SearchOperator, QueryGroup],
    return_type: ReturnType = ReturnType.ENTRY,
    request_options: Optional[RequestOptions] = None,
    return_with_scores: bool = False,
    return_raw_json_dict: bool = False,
    verbosity: bool = True,
) -> Union[List[str], RawJSONDictResponse, List[ScoredResult]]:
    """Performs specified search using RCSB's search node logic.

    Essentially, this allows you to ask multiple questions in one RCSB query.

    For example, you can ask for structures that satisfy all of the following
    conditions at once:
        * Are either from Mus musculus or from Homo sapiens lineage
        * Are both under 4 angstroms of resolution, and published after 2019
        * Are labelled as "actin-binding protein" OR
            contain "actin" AND "calmodulin" in their titles.

    See https://search.rcsb.org/index.html#building-search-request under
    "Terminal node" and "Group node" for more details.

    Args:
        query_object: Fully-specified SearchOperator or QueryGroup
            object corresponding to the desired search.
        return_type: Type of entities to return.
        request_options: Object containing information for result pagination
            and sorting functionality. If None, all hits are requested.
        return_with_scores: Whether or not to return the entity results with
            their associated scores. For example, you might want to do this to
            get the top X hits that are similar to a certain protein sequence.
        return_raw_json_dict: Whether to return raw JSON response.
            (for example, to analyze the scores of various matches)
        verbosity: Print out the search query to the console (default: True)

    Returns:
        List of strings, corresponding to hits in the database. Will be of the
        format specified by the `return_type`.

        If `return_with_scores=True`, returns a list of ScoredResult instead.
        If `return_raw_json_dict=True`, returns the raw JSON response from RCSB.

        When the response has an empty body (no hits), returns an empty list
        (or an empty dict if `return_raw_json_dict=True`).

    Raises:
        requests.HTTPError: If RCSB answers with an error status.
    """

    # Single operators become terminal nodes; anything else (e.g. a
    # `QueryGroup`) is expected to serialize itself via `_to_dict`.
    if isinstance(query_object, tuple(_SEARCH_OPERATORS)):
        cast_query_object = _QueryNode(query_object)
    else:
        cast_query_object = query_object

    if request_options is not None:
        request_options_dict = request_options._to_dict()
    else:
        request_options_dict = {'return_all_hits': True}

    rcsb_query_dict = {
        "query": cast_query_object._to_dict(),
        "request_options": request_options_dict,
        "return_type": return_type.value
    }
    serialized_query = json.dumps(rcsb_query_dict)

    if verbosity:
        print("Querying RCSB Search using the following parameters:\n %s \n" %
              serialized_query)

    response = requests.post(url=SEARCH_URL_ENDPOINT,
                             data=serialized_query,
                             headers={"Content-Type": "application/json"})

    # If your search queries are failing here, it could be that your attribute
    # doesn't support the SearchOperator you're using.
    # See: https://search.rcsb.org/search-attributes.html
    if not response.ok:
        warnings.warn("It appears request failed with:" + response.text)
        response.raise_for_status()

    # A successful response without a body (RCSB presumably answers
    # "204 No Content" when nothing matched) cannot be decoded as JSON;
    # report it as "no hits" instead of failing inside `response.json()`.
    if not response.content:
        return {} if return_raw_json_dict else []

    response_json = response.json()

    # If specified, returns raw JSON response from RCSB as Dict
    # (rather than entity IDs as a string list)
    if return_raw_json_dict:
        return response_json

    # Converts RCSB result to list of identifiers corresponding to
    # the `return_type`. Annotated with score if `return_with_scores`.
    query_hits = response_json["result_set"]
    if return_with_scores:
        return [
            ScoredResult(entity_id=query_hit["identifier"],
                         score=query_hit["score"])
            for query_hit in query_hits
        ]
    return [query_hit["identifier"] for query_hit in query_hits]
class CannotInferSearchServiceException(Exception):
    """Raised when the RCSB Search API Service cannot be inferred."""


def _infer_search_service(search_operator: SearchOperator) -> SearchService:
    """Return the RCSB search service that handles `search_operator`.

    Args:
        search_operator: Operator instance to classify. Instances of
            subclasses are routed like their base operator class.

    Returns:
        The `SearchService` matching the operator's class.

    Raises:
        CannotInferSearchServiceException: If the operator is not an
            instance of any supported operator class.
    """
    # `DefaultOperator` is itself part of `TEXT_SEARCH_OPERATORS`, so it has
    # to be tested before the generic text-operator check.
    if isinstance(search_operator, text_operators.DefaultOperator):
        return SearchService.BASIC_SEARCH
    if isinstance(search_operator,
                  tuple(text_operators.TEXT_SEARCH_OPERATORS)):
        return SearchService.TEXT
    if isinstance(search_operator, SequenceOperator):
        return SearchService.SEQUENCE
    if isinstance(search_operator, StructureOperator):
        return SearchService.STRUCTURE
    if isinstance(search_operator, SeqMotifOperator):
        return SearchService.SEQMOTIF
    if isinstance(search_operator, ChemicalOperator):
        return SearchService.CHEMICAL

    raise CannotInferSearchServiceException(
        "Cannot infer Search Service for {}".format(type(search_operator)))


@dataclass
class _QueryNode:
    """Individual (terminal) query node, performing a query defined by the
    provided `search_operator`.
    """
    search_operator: SearchOperator

    def _to_dict(self):
        """Return the terminal-node payload for the wrapped operator."""
        return {
            "type": "terminal",
            "service": _infer_search_service(self.search_operator).value,
            "parameters": self.search_operator._to_dict()
        }
| from pypdb.clients.search.operators import sequence_operators, text_operators 10 | 11 | 12 | class TestHTTPRequests(unittest.TestCase): 13 | @mock.patch.object(requests, "post") 14 | def test_default_operator_with_entry_return_value(self, mock_post): 15 | # Creates a mock HTTP response, as wrapped by `requests` 16 | canned_json_return_as_dict = { 17 | "result_set": [{ 18 | "identifier": "5JUP" 19 | }, { 20 | "identifier": "5JUS" 21 | }, { 22 | "identifier": "5JUO" 23 | }] 24 | } 25 | mock_response = mock.create_autospec(requests.Response, instance=True) 26 | mock_response.json.return_value = canned_json_return_as_dict 27 | mock_post.return_value = mock_response 28 | 29 | search_operator = text_operators.DefaultOperator(value="ribosome") 30 | return_type = search_client.ReturnType.ENTRY 31 | 32 | results = search_client.perform_search(search_operator, return_type) 33 | 34 | expected_json_dict = { 35 | 'query': { 36 | 'type': 'terminal', 37 | 'service': 'full_text', 38 | 'parameters': { 39 | 'value': 'ribosome' 40 | } 41 | }, 42 | 'request_options': { 43 | 'return_all_hits': True 44 | }, 45 | 'return_type': 'entry' 46 | } 47 | 48 | mock_post.assert_called_once_with( 49 | url=search_client.SEARCH_URL_ENDPOINT, 50 | data=json.dumps(expected_json_dict)) 51 | self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 52 | 53 | @mock.patch.object(requests, "post") 54 | def test_exact_match_operator_with_polymer_return(self, mock_post): 55 | # Creates a mock HTTP response, as wrapped by `requests` 56 | canned_json_return_as_dict = { 57 | "result_set": [{ 58 | "identifier": "5JUP" 59 | }, { 60 | "identifier": "5JUS" 61 | }, { 62 | "identifier": "5JUO" 63 | }] 64 | } 65 | mock_response = mock.create_autospec(requests.Response, instance=True) 66 | mock_response.json.return_value = canned_json_return_as_dict 67 | mock_post.return_value = mock_response 68 | 69 | search_operator = text_operators.ExactMatchOperator( 70 | value="Mus musculus", 71 | 
attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 72 | return_type = search_client.ReturnType.POLYMER_ENTITY 73 | 74 | results = search_client.perform_search(search_operator, return_type) 75 | 76 | expected_json_dict = { 77 | 'query': { 78 | 'type': 'terminal', 79 | 'service': 'text', 80 | 'parameters': { 81 | 'attribute': 82 | 'rcsb_entity_source_organism.taxonomy_lineage.name', 83 | 'operator': 'exact_match', 84 | 'value': 'Mus musculus' 85 | } 86 | }, 87 | 'request_options': { 88 | 'return_all_hits': True 89 | }, 90 | 'return_type': 'polymer_entity' 91 | } 92 | 93 | mock_post.assert_called_once_with( 94 | url=search_client.SEARCH_URL_ENDPOINT, 95 | data=json.dumps(expected_json_dict)) 96 | self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 97 | 98 | @mock.patch.object(requests, "post") 99 | def test_in_operator_with_non_polymer_return(self, mock_post): 100 | # Creates a mock HTTP response, as wrapped by `requests` 101 | canned_json_return_as_dict = { 102 | "result_set": [{ 103 | "identifier": "5JUP" 104 | }, { 105 | "identifier": "5JUS" 106 | }, { 107 | "identifier": "5JUO" 108 | }] 109 | } 110 | mock_response = mock.create_autospec(requests.Response, instance=True) 111 | mock_response.json.return_value = canned_json_return_as_dict 112 | mock_post.return_value = mock_response 113 | 114 | search_operator = text_operators.InOperator( 115 | values=["Mus musculus", "Homo sapiens"], 116 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 117 | return_type = search_client.ReturnType.NON_POLYMER_ENTITY 118 | 119 | results = search_client.perform_search(search_operator, return_type) 120 | 121 | expected_json_dict = { 122 | 'query': { 123 | 'type': 'terminal', 124 | 'service': 'text', 125 | 'parameters': { 126 | 'attribute': 127 | 'rcsb_entity_source_organism.taxonomy_lineage.name', 128 | 'operator': 'in', 129 | 'value': ['Mus musculus', 'Homo sapiens'] 130 | } 131 | }, 132 | 'request_options': { 133 | 'return_all_hits': True 134 | }, 135 | 
'return_type': 'non_polymer_entity' 136 | } 137 | 138 | mock_post.assert_called_once_with( 139 | url=search_client.SEARCH_URL_ENDPOINT, 140 | data=json.dumps(expected_json_dict)) 141 | self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 142 | 143 | @mock.patch.object(requests, "post") 144 | def test_contains_words_operator_with_polymer_instance_return( 145 | self, mock_post): 146 | # Creates a mock HTTP response, as wrapped by `requests` 147 | canned_json_return_as_dict = { 148 | "result_set": [{ 149 | "identifier": "5JUP" 150 | }, { 151 | "identifier": "5JUS" 152 | }, { 153 | "identifier": "5JUO" 154 | }] 155 | } 156 | mock_response = mock.create_autospec(requests.Response, instance=True) 157 | mock_response.json.return_value = canned_json_return_as_dict 158 | mock_post.return_value = mock_response 159 | 160 | search_operator = text_operators.ContainsWordsOperator( 161 | value="actin-binding protein", attribute="struct.title") 162 | return_type = search_client.ReturnType.POLYMER_INSTANCE 163 | 164 | results = search_client.perform_search(search_operator, return_type) 165 | 166 | expected_json_dict = { 167 | 'query': { 168 | 'type': 'terminal', 169 | 'service': 'text', 170 | 'parameters': { 171 | 'attribute': 'struct.title', 172 | 'operator': 'contains_words', 173 | 'value': 'actin-binding protein' 174 | } 175 | }, 176 | 'request_options': { 177 | 'return_all_hits': True 178 | }, 179 | 'return_type': 'polymer_instance' 180 | } 181 | 182 | mock_post.assert_called_once_with( 183 | url=search_client.SEARCH_URL_ENDPOINT, 184 | data=json.dumps(expected_json_dict)) 185 | self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 186 | 187 | @mock.patch.object(requests, "post") 188 | def test_contains_phrase_operator_with_assembly_return(self, mock_post): 189 | # Creates a mock HTTP response, as wrapped by `requests` 190 | canned_json_return_as_dict = { 191 | "result_set": [{ 192 | "identifier": "5JUP" 193 | }, { 194 | "identifier": "5JUS" 195 | }, { 196 | "identifier": "5JUO" 
197 | }] 198 | } 199 | mock_response = mock.create_autospec(requests.Response, instance=True) 200 | mock_response.json.return_value = canned_json_return_as_dict 201 | mock_post.return_value = mock_response 202 | 203 | search_operator = text_operators.ContainsPhraseOperator( 204 | value="actin-binding protein", attribute="struct.title") 205 | return_type = search_client.ReturnType.ASSEMBLY 206 | 207 | results = search_client.perform_search(search_operator, return_type) 208 | 209 | expected_json_dict = { 210 | 'query': { 211 | 'type': 'terminal', 212 | 'service': 'text', 213 | 'parameters': { 214 | 'attribute': 'struct.title', 215 | 'operator': 'contains_phrase', 216 | 'value': 'actin-binding protein' 217 | } 218 | }, 219 | 'request_options': { 220 | 'return_all_hits': True 221 | }, 222 | 'return_type': 'assembly' 223 | } 224 | 225 | mock_post.assert_called_once_with( 226 | url=search_client.SEARCH_URL_ENDPOINT, 227 | data=json.dumps(expected_json_dict)) 228 | self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 229 | 230 | @mock.patch.object(requests, "post") 231 | def test_comparison_operator_with_entry_return(self, mock_post): 232 | # Creates a mock HTTP response, as wrapped by `requests` 233 | canned_json_return_as_dict = { 234 | "result_set": [{ 235 | "identifier": "5JUP" 236 | }, { 237 | "identifier": "5JUS" 238 | }, { 239 | "identifier": "5JUO" 240 | }] 241 | } 242 | mock_response = mock.create_autospec(requests.Response, instance=True) 243 | mock_response.json.return_value = canned_json_return_as_dict 244 | mock_post.return_value = mock_response 245 | 246 | search_operator = text_operators.ComparisonOperator( 247 | value="2019-01-01T00:00:00Z", 248 | attribute="rcsb_accession_info.initial_release_date", 249 | comparison_type=text_operators.ComparisonType.GREATER) 250 | return_type = search_client.ReturnType.ENTRY 251 | 252 | results = search_client.perform_search(search_operator, return_type) 253 | 254 | expected_json_dict = { 255 | 'query': { 256 | 'type': 
'terminal', 257 | 'service': 'text', 258 | 'parameters': { 259 | 'operator': 'greater', 260 | 'attribute': 'rcsb_accession_info.initial_release_date', 261 | 'value': '2019-01-01T00:00:00Z' 262 | } 263 | }, 264 | 'request_options': { 265 | 'return_all_hits': True 266 | }, 267 | 'return_type': 'entry' 268 | } 269 | 270 | mock_post.assert_called_once_with( 271 | url=search_client.SEARCH_URL_ENDPOINT, 272 | data=json.dumps(expected_json_dict)) 273 | self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 274 | 275 | @mock.patch.object(requests, "post") 276 | def test_range_operator_with_entry_return(self, mock_post): 277 | # Creates a mock HTTP response, as wrapped by `requests` 278 | canned_json_return_as_dict = { 279 | "result_set": [{ 280 | "identifier": "5JUP" 281 | }, { 282 | "identifier": "5JUS" 283 | }, { 284 | "identifier": "5JUO" 285 | }] 286 | } 287 | mock_response = mock.create_autospec(requests.Response, instance=True) 288 | mock_response.json.return_value = canned_json_return_as_dict 289 | mock_post.return_value = mock_response 290 | 291 | search_operator = text_operators.RangeOperator( 292 | from_value="2019-01-01T00:00:00Z", 293 | to_value="2019-06-30T00:00:00Z", 294 | include_lower=False, 295 | include_upper=True, 296 | attribute="rcsb_accession_info.initial_release_date") 297 | return_type = search_client.ReturnType.ENTRY 298 | 299 | results = search_client.perform_search(search_operator, return_type) 300 | 301 | expected_json_dict = { 302 | "query": { 303 | "type": "terminal", 304 | "service": "text", 305 | "parameters": { 306 | "operator": "range", 307 | "attribute": "rcsb_accession_info.initial_release_date", 308 | "negation": False, 309 | "value": ["2019-01-01T00:00:00Z", "2019-06-30T00:00:00Z"], 310 | } 311 | }, 312 | "request_options": { 313 | "return_all_hits": True 314 | }, 315 | "return_type": "entry" 316 | } 317 | 318 | mock_post.assert_called_once_with( 319 | url=search_client.SEARCH_URL_ENDPOINT, 320 | data=json.dumps(expected_json_dict)) 321 | 
self.assertEqual(results, ["5JUP", "5JUS", "5JUO"]) 322 | 323 | @mock.patch.object(requests, "post") 324 | def test_exists_operator_with_entry_raw_json_response(self, mock_post): 325 | # Creates a mock HTTP response, as wrapped by `requests` 326 | canned_json_return_as_dict = { 327 | "result_set": [{ 328 | "identifier": "5JUP" 329 | }, { 330 | "identifier": "5JUS" 331 | }, { 332 | "identifier": "5JUO" 333 | }] 334 | } 335 | mock_response = mock.create_autospec(requests.Response, instance=True) 336 | mock_response.json.return_value = canned_json_return_as_dict 337 | mock_post.return_value = mock_response 338 | 339 | search_operator = text_operators.ExistsOperator( 340 | attribute="rcsb_accession_info.initial_release_date") 341 | return_type = search_client.ReturnType.ENTRY 342 | 343 | results = search_client.perform_search(search_operator, 344 | return_type, 345 | return_raw_json_dict=True) 346 | 347 | expected_json_dict = { 348 | "query": { 349 | "type": "terminal", 350 | "service": "text", 351 | "parameters": { 352 | "operator": "exists", 353 | "attribute": "rcsb_accession_info.initial_release_date", 354 | } 355 | }, 356 | "request_options": { 357 | "return_all_hits": True 358 | }, 359 | "return_type": "entry" 360 | } 361 | 362 | mock_post.assert_called_once_with( 363 | url=search_client.SEARCH_URL_ENDPOINT, 364 | data=json.dumps(expected_json_dict)) 365 | self.assertEqual(results, canned_json_return_as_dict) 366 | 367 | @mock.patch.object(requests, "post") 368 | def test_query_group_after_2019_and_either_musculus_or_human( 369 | self, mock_post): 370 | # Creates a mock HTTP response, as wrapped by `requests` 371 | canned_json_return_as_dict = { 372 | "result_set": [{ 373 | "identifier": "5JUP" 374 | }, { 375 | "identifier": "5JUS" 376 | }, { 377 | "identifier": "5JUO" 378 | }] 379 | } 380 | mock_response = mock.create_autospec(requests.Response, instance=True) 381 | mock_response.json.return_value = canned_json_return_as_dict 382 | mock_post.return_value = 
mock_response 383 | 384 | after_2019_query_node = text_operators.ComparisonOperator( 385 | value="2019-01-01T00:00:00Z", 386 | attribute="rcsb_accession_info.initial_release_date", 387 | comparison_type=text_operators.ComparisonType.GREATER) 388 | 389 | is_mus_query_node = text_operators.ExactMatchOperator( 390 | value="Mus musculus", 391 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 392 | 393 | is_human_query_node = text_operators.ExactMatchOperator( 394 | value="Homo sapiens", 395 | attribute="rcsb_entity_source_organism.taxonomy_lineage.name") 396 | 397 | is_human_or_mus_group = search_client.QueryGroup( 398 | queries=[is_mus_query_node, is_human_query_node], 399 | logical_operator=search_client.LogicalOperator.OR) 400 | 401 | is_after_2019_and_human_or_mus_group = search_client.QueryGroup( 402 | queries=[is_human_or_mus_group, after_2019_query_node], 403 | logical_operator=search_client.LogicalOperator.AND) 404 | 405 | return_type = search_client.ReturnType.ENTRY 406 | 407 | results = search_client.perform_search_with_graph( 408 | query_object=is_after_2019_and_human_or_mus_group, 409 | return_type=return_type) 410 | 411 | expected_json_dict = { 412 | "query": { 413 | "type": 414 | "group", 415 | "logical_operator": 416 | "and", 417 | "nodes": [{ 418 | 'type': 419 | 'group', 420 | 'logical_operator': 421 | 'or', 422 | 'nodes': [ 423 | { 424 | 'type': 'terminal', 425 | 'service': 'text', 426 | 'parameters': { 427 | 'attribute': 428 | 'rcsb_entity_source_organism.taxonomy_lineage.name', 429 | 'operator': 'exact_match', 430 | 'value': 'Mus musculus' 431 | } 432 | }, 433 | { 434 | 'type': 'terminal', 435 | 'service': 'text', 436 | 'parameters': { 437 | 'attribute': 438 | 'rcsb_entity_source_organism.taxonomy_lineage.name', 439 | 'operator': 'exact_match', 440 | 'value': 'Homo sapiens' 441 | } 442 | }, 443 | ] 444 | }, { 445 | 'type': 'terminal', 446 | 'service': 'text', 447 | 'parameters': { 448 | 'operator': 'greater', 449 | 'attribute': 450 | 
@mock.patch.object(requests, "post")
def test_query_structure_resolution(self, mock_post):
    """A `LESS` comparison on resolution posts the expected JSON body."""
    # Canned payload that the mocked HTTP response hands back from `.json()`.
    raw_payload = {
        "result_set": [{"identifier": pdb_id}
                       for pdb_id in ("5JUP", "5JUS", "5JUO")]
    }
    fake_response = mock.create_autospec(requests.Response, instance=True)
    fake_response.json.return_value = raw_payload
    mock_post.return_value = fake_response

    resolution_below_4 = text_operators.ComparisonOperator(
        value=4,
        attribute="rcsb_entry_info.resolution_combined",
        comparison_type=text_operators.ComparisonType.LESS)

    results = search_client.perform_search(resolution_below_4,
                                           search_client.ReturnType.ENTRY,
                                           return_raw_json_dict=True)

    # Key order matters: the request body is compared as a JSON string.
    expected_request = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "operator": "less",
                "attribute": "rcsb_entry_info.resolution_combined",
                "value": 4
            }
        },
        "request_options": {
            "return_all_hits": True
        },
        "return_type": "entry"
    }
    mock_post.assert_called_once_with(
        url=search_client.SEARCH_URL_ENDPOINT,
        data=json.dumps(expected_request))

    # With `return_raw_json_dict=True` the payload comes back untouched.
    self.assertEqual(results, raw_payload)
def test_request_options_to_dict(self):
    """`RequestOptions._to_dict` emits the pager and sort sections."""
    options = search_client.RequestOptions(result_start_index=42,
                                           num_results=8675309,
                                           sort_by="fake.rcsb.attribute",
                                           desc=False)

    # `desc=False` is expected to be rendered as an ascending sort.
    expected_dict = {
        "pager": {
            "start": 42,
            "rows": 8675309
        },
        "sort": [{
            "sort_by": "fake.rcsb.attribute",
            "direction": "asc"
        }]
    }

    self.assertEqual(options._to_dict(), expected_dict)
class Query(object):
    """Build and run a query against the RCSB PDB search service.

    The constructor turns a search term and a query type into the JSON
    payload stored in ``self.scan_params``; `search` posts that payload to
    ``self.url``.

    Parameters
    ----------
    search_term : str
        The term to search for. Its meaning depends on `query_type`.

    query_type : str
        Either the name of a search service, used as-is:

        'full_text', 'text' : the term is sent as the query value

        'sequence' : the term is a sequence matched against
                     `pdb_protein_sequence`

        'structure' : the term is an entry ID; assembly "1" of that entry is
                      matched with `relaxed_shape_match`

        'seqmotif', 'chemical' : accepted, but no query parameters are
                      generated for them here; pass `scan_params` instead

        or one of the aliases below, each of which is converted into an
        attribute search on the 'text' service:

        'PubmedIdQuery' : `rcsb_pubmed_container_identifiers.pubmed_id`

        'TreeEntityQuery' : `rcsb_entity_source_organism.taxonomy_lineage.id`

        'ExpTypeQuery' : `exptl.method`, such as 'X-RAY DIFFRACTION' or
                         'SOLID-STATE NMR' (the term is upper-cased)

        'AdvancedAuthorQuery' : `rcsb_primary_citation.rcsb_authors`

        'OrganismQuery' : `rcsb_entity_source_organism.taxonomy_lineage.name`

        'pfam' : `rcsb_polymer_entity_annotation.annotation_id`

        'uniprot' : `rcsb_polymer_entity_container_identifiers.`
                    `reference_sequence_identifiers.database_accession`

    return_type : str
        The type of search result to return, "entry" or "polymer_entity".
        With the default "entry", `search` returns a list of identifiers.

    scan_params (optional) : dict()
        A dictionary containing an explicit nested search term. Use this
        option if you want to use pypdb's rate handling and other functions,
        but need to structure a complex JSON query not covered in the
        existing python package. When given (and non-empty) it is used as
        the request payload unchanged.

    Examples
    --------

    >>> found_pdbs = Query('actin network').search()
    >>> print(found_pdbs)
    ['1D7M', '3W3D', '4A7H', '4A7L', '4A7N']

    >>> human_entries = Query('Homo sapiens', query_type='OrganismQuery').search()

    """

    # Query-type aliases -> internal subtype; all of them run on the "text"
    # search service.
    _SUBTYPE_BY_ALIAS = {
        "PubmedIdQuery": "pmid",
        "TreeEntityQuery": "taxid",
        "ExpTypeQuery": "experiment_type",
        "AdvancedAuthorQuery": "author",
        "OrganismQuery": "organism",
        "pfam": "pfam",
        "uniprot": "uniprot",
    }

    # Internal subtype -> (search operator, attribute searched).
    _ATTRIBUTE_SEARCHES = {
        "pmid": ("in", "rcsb_pubmed_container_identifiers.pubmed_id"),
        "taxid": ("exact_match",
                  "rcsb_entity_source_organism.taxonomy_lineage.id"),
        "experiment_type": ("exact_match", "exptl.method"),
        "author": ("exact_match", "rcsb_primary_citation.rcsb_authors"),
        "organism": ("contains_words",
                     "rcsb_entity_source_organism.taxonomy_lineage.name"),
        "pfam": ("exact_match",
                 "rcsb_polymer_entity_annotation.annotation_id"),
        "uniprot": ("exact_match",
                    "rcsb_polymer_entity_container_identifiers."
                    "reference_sequence_identifiers.database_accession"),
    }

    # Experiment types that do not trigger the "not recognized" warning.
    _KNOWN_EXPERIMENT_TYPES = frozenset([
        "X-RAY DIFFRACTION", "ELECTRON MICROSCOPY", "SOLID-STATE NMR",
        "SOLUTION NMR", "NEUTRON DIFFRACTION", "ELECTRON CRYSTALLOGRAPHY",
        "POWDER DIFFRACTION", "FIBER DIFFRACTION", "SOLUTION SCATTERING",
        "EPR", "FLUORESCENCE TRANSFER", "INFRARED SPECTROSCOPY",
        "THEORETICAL MODEL"
    ])

    def __init__(self,
                 search_term,
                 query_type="full_text",
                 return_type="entry",
                 scan_params=None):
        """See help(Query) for documentation"""

        # Resolve aliases first: every alias is an attribute search on the
        # "text" service.
        query_subtype = self._SUBTYPE_BY_ALIAS.get(query_type)
        if query_subtype is not None:
            query_type = "text"

        if query_subtype == "experiment_type":
            search_term = search_term.upper()
            if search_term not in self._KNOWN_EXPERIMENT_TYPES:
                warnings.warn(
                    "Experimental type not recognized, search may fail .")

        # NOTE: kept as assertions so that existing callers keep seeing the
        # same AssertionError for unsupported values.
        assert query_type in {
            "full_text", "text", "structure", "sequence", "seqmotif", "chemical"
        }, "Query type %s not recognized." % query_type

        assert return_type in {"entry", "polymer_entity"
                               }, "Return type %s not supported." % return_type

        self.query_type = query_type
        self.search_term = search_term
        self.return_type = return_type
        self.url = "https://search.rcsb.org/rcsbsearch/v2/query?json="

        if scan_params:
            # Caller supplied the whole payload; use it untouched.
            self.scan_params = scan_params
            return

        query_params = {"type": "terminal", "service": query_type}
        parameters = self._build_parameters(query_type, query_subtype,
                                            search_term)
        if parameters is not None:
            query_params["parameters"] = parameters

        if return_type == "entry":
            request_options = {"return_all_hits": True}
        else:
            request_options = {"results_verbosity": "verbose"}  # v2

        self.scan_params = {
            "query": query_params,
            "return_type": return_type,
            "request_options": request_options,
        }

    @classmethod
    def _build_parameters(cls, query_type, query_subtype, search_term):
        """Return the `parameters` dict of a terminal query node, or None."""
        if query_subtype is not None:
            operator, attribute = cls._ATTRIBUTE_SEARCHES[query_subtype]
            # The PubMed search uses the "in" operator, which takes a list;
            # every other attribute search takes the term as a string.
            if query_subtype == "pmid":
                value = [search_term]
            else:
                value = str(search_term)
            return {
                "operator": operator,
                "negation": False,
                "value": value,
                "attribute": attribute
            }

        if query_type in ("full_text", "text"):
            return {"value": search_term}

        if query_type == "sequence":
            return {"target": "pdb_protein_sequence", "value": search_term}

        if query_type == "structure":
            return {
                "operator": "relaxed_shape_match",
                "value": {
                    "entry_id": search_term,
                    "assembly_id": "1"
                }
            }

        # "seqmotif" and "chemical" have no generated parameters.
        return None

    def search(self, num_attempts=1, sleep_time=0.5):
        """
        Perform a search of the Protein Data Bank using the REST API

        Parameters
        ----------

        num_attempts : int
            The maximum number of times the request is sent. Further
            attempts are only made when the previous one failed (no
            response, or a status code other than 200). At least one
            attempt is always made.
        sleep_time : float
            The amount of time, in seconds, to wait between attempts, in
            case of API rate limits

        Returns
        -------

        A list of the "identifier" values found in the response when
        `return_type` is "entry", otherwise the decoded JSON response.
        Returns None (after a warning) if every attempt failed.
        """

        query_text = json.dumps(self.scan_params, indent=4)

        response = None
        for attempt in range(max(1, num_attempts)):
            if attempt > 0:
                time.sleep(sleep_time)
            response = http_requests.request_limited(
                self.url,
                rtype="POST",
                headers={"Content-Type": "application/json"},
                data=query_text)
            if response is not None and response.status_code == 200:
                break
        else:
            # Loop ended without a successful response.
            warnings.warn("Retrieval failed, returning None")
            return None

        response_val = json.loads(response.text)

        if self.return_type == "entry":
            idlist = walk_nested_dict(response_val,
                                      "identifier",
                                      maxdepth=25,
                                      outputs=[])
            return idlist
        else:
            return response_val
Parameters 327 | # ---------- 328 | 329 | # scan_params : dict 330 | # A dictionary of query attributes to use for 331 | # the search of the PDB 332 | 333 | # Returns 334 | # ------- 335 | 336 | # idlist : list 337 | # A list of PDB ids returned by the search 338 | 339 | # Examples 340 | # -------- 341 | # This method usually gets used in tandem with make_query 342 | 343 | # >>> a = make_query('actin network') 344 | # >>> print (a) 345 | # {'orgPdbQuery': {'description': 'Text Search for: actin', 346 | # 'keywords': 'actin', 347 | # 'queryType': 'AdvancedKeywordQuery'}} 348 | 349 | # >>> search_dict = make_query('actin network') 350 | # >>> found_pdbs = do_search(search_dict) 351 | # >>> print(found_pdbs) 352 | # ['1D7M', '3W3D', '4A7H', '4A7L', '4A7N'] 353 | 354 | # >>> search_dict = make_query('T[AG]AGGY',querytype='MotifQuery') 355 | # >>> found_pdbs = do_search(search_dict) 356 | # >>> print(found_pdbs) 357 | # ['3LEZ', '3SGH', '4F47'] 358 | # ''' 359 | # q = Query('search_term', 'HoldingsQuery', scan_params=scan_params) 360 | # return q.search() 361 | 362 | # def do_protsym_search(point_group, min_rmsd=0.0, max_rmsd=7.0): 363 | # '''Performs a protein symmetry search of the PDB 364 | 365 | # This function can search the Protein Data Bank based on how closely entries 366 | # match the user-specified symmetry group 367 | 368 | # Parameters 369 | # ---------- 370 | 371 | # point_group : str 372 | # The name of the symmetry point group to search. 
This includes all the standard 373 | # abbreviations for symmetry point groups (e.g., C1, C2, D2, T, O, I, H, A1) 374 | 375 | # min_rmsd : float 376 | # The smallest allowed total deviation (in Angstroms) for a result 377 | # to be classified as having a matching symmetry 378 | 379 | # max_rmsd : float 380 | # The largest allowed total deviation (in Angstroms) for a result 381 | # to be classified as having a matching symmetry 382 | 383 | # Returns 384 | # ------- 385 | 386 | # idlist : list of strings 387 | # A list of PDB IDs resulting from the search 388 | 389 | # Examples 390 | # -------- 391 | 392 | # >>> kk = do_protsym_search('C9', min_rmsd=0.0, max_rmsd=1.0) 393 | # >>> print(kk[:5]) 394 | # ['1KZU', '1NKZ', '2FKW', '3B8M', '3B8N'] 395 | 396 | # ''' 397 | # query_params = dict() 398 | # query_params['queryType'] = 'PointGroupQuery' 399 | # query_params['rMSDComparator'] = 'between' 400 | 401 | # query_params['pointGroup'] = point_group 402 | # query_params['rMSDMin'] = min_rmsd 403 | # query_params['rMSDMax'] = max_rmsd 404 | 405 | # scan_params = dict() 406 | # scan_params['orgPdbQuery'] = query_params 407 | # idlist = do_search(scan_params) 408 | # return idlist 409 | 410 | # def get_all(): 411 | # """Return a list of all PDB entries currently in the RCSB Protein Data Bank 412 | 413 | # Returns 414 | # ------- 415 | 416 | # out : list of str 417 | # A list of all of the PDB IDs currently in the RCSB PDB 418 | 419 | # Examples 420 | # -------- 421 | 422 | # >>> print(get_all()[:10]) 423 | # ['100D', '101D', '101M', '102D', '102L', '102M', '103D', '103L', '103M', '104D'] 424 | 425 | # """ 426 | 427 | # url = 'http://www.rcsb.org/pdb/rest/getCurrent' 428 | # #response = requests.get(url) 429 | # response = http_requests.request_limited(url) 430 | 431 | # if response.status_code == 200: 432 | # pass 433 | # else: 434 | # warnings.warn("Retrieval failed, returning None") 435 | # return None 436 | 437 | # result = str(response.text) 438 | 439 | # p = 
def get_pdb_file(pdb_id: str, filetype='pdb', compression=False):
    """Deprecated wrapper for fetching PDB files from RCSB Database.

    For new uses, please use `pypdb/clients/pdb/pdb_client.py`

    Parameters
    ----------

    pdb_id : string
        The PDB entry to fetch; passed through to `pdb_client.get_pdb_file`

    filetype : string
        One of 'pdb', 'cif', 'xml' or 'structfact'

    compression : bool
        Passed through to `pdb_client.get_pdb_file`

    Returns
    -------

    Whatever `pdb_client.get_pdb_file` returns, or None (after a warning)
    when `filetype` is not one of the supported values.
    """

    warnings.warn(
        "The `get_pdb_file` function within pypdb.py is deprecated. "
        "See `pypdb/clients/pdb/pdb_client.py` for a near-identical "
        "function to use", DeprecationWarning)

    if filetype == 'pdb':
        filetype_enum = pdb_client.PDBFileType.PDB
    elif filetype == 'cif':
        filetype_enum = pdb_client.PDBFileType.CIF
    elif filetype == 'xml':
        filetype_enum = pdb_client.PDBFileType.XML
    elif filetype == 'structfact':
        filetype_enum = pdb_client.PDBFileType.STRUCTFACT
    else:
        warnings.warn(
            "Filetype specified to `get_pdb_file` appears to be invalid")
        # Without a valid file type there is nothing to request; previously
        # execution fell through and crashed on the unbound `filetype_enum`.
        return None

    return pdb_client.get_pdb_file(pdb_id, filetype_enum, compression)
{'ligand': {'@chemicalID': 'SPM', 566 | # '@molecularWeight': '202.34', 567 | # '@structureId': '100D', 568 | # '@type': 'non-polymer', 569 | # 'InChI': 'InChI=1S/C10H26N4/c11-5-3-9-13-7-1-2-8-14-10-4-6-12/h13-14H,1-12H2', 570 | # 'InChIKey': 'PFNFFQXMRSDOHW-UHFFFAOYSA-N', 571 | # 'chemicalName': 'SPERMINE', 572 | # 'formula': 'C10 H26 N4', 573 | # 'smiles': 'C(CCNCCCN)CNCCCN'}}} 574 | 575 | # """ 576 | # out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/ligandInfo?structureId=') 577 | # out = to_dict(out) 578 | # return remove_at_sign(out['structureId']) 579 | 580 | # def get_gene_onto(pdb_id): 581 | # """Return ligands of given PDB_ID 582 | 583 | # Parameters 584 | # ---------- 585 | 586 | # pdb_id : string 587 | # A 4 character string giving a pdb entry of interest 588 | 589 | # Returns 590 | # ------- 591 | 592 | # out : dict 593 | # A dictionary containing the gene ontology information associated with the entry 594 | 595 | # Examples 596 | # -------- 597 | 598 | # >>> gene_info = get_gene_onto('4Z0L') 599 | # >>> print(gene_info['term'][0]) 600 | # {'@chainId': 'A', 601 | # '@id': 'GO:0001516', 602 | # '@structureId': '4Z0L', 603 | # 'detail': {'@definition': 'The chemical reactions and pathways resulting ' 604 | # 'in the formation of prostaglandins, any of a ' 605 | # 'group of biologically active metabolites which ' 606 | # 'contain a cyclopentane ring.', 607 | # '@name': 'prostaglandin biosynthetic process', 608 | # '@ontology': 'B', 609 | # '@synonyms': 'prostaglandin anabolism, prostaglandin ' 610 | # 'biosynthesis, prostaglandin formation, ' 611 | # 'prostaglandin synthesis'}} 612 | # """ 613 | # out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/goTerms?structureId=') 614 | # out = to_dict(out) 615 | # if not out['goTerms']: 616 | # return None 617 | # out = remove_at_sign(out['goTerms']) 618 | # return out 619 | 620 | # def get_seq_cluster(pdb_id_chain): 621 | # """Get the sequence cluster of a PDB ID plus a pdb_id plus a 
def get_blast(pdb_id, chain_id='A', identity_cutoff=0.99, verbosity=True):
    """
    ---
    WARNING: this function is deprecated and slated to be deleted due to RCSB
    API changes.

    See `pypdb/clients/search/EXAMPLES.md` for examples to use a
    `SequenceOperator` search to similar effect
    ---

    Search the PDB for sequences similar to one chain of a given PDB ID.

    The FASTA entries of `pdb_id` are fetched, the sequences whose chains
    include `chain_id` are selected, and a single OR-combined sequence
    search is run over them.

    Parameters
    ----------
    pdb_id : string
        A 4 character string giving a pdb entry of interest

    chain_id : string
        A single character designating the chain ID of interest

    identity_cutoff : float
        Identity cutoff passed to each `SequenceOperator`

    verbosity : bool
        Unused; kept so that existing calls keep working.

    Returns
    -------

    out : the result of `search_client.perform_search_with_graph` called
        with `return_raw_json_dict=True` (presumably the raw JSON response
        of the search service rather than a list of IDs -- confirm against
        `search_client`).
    """

    warnings.warn(
        "The `get_blast` function is slated for deprecation. "
        "See `pypdb/clients/search/EXAMPLES.md` for examples to use a "
        "`SequenceOperator` search to similar effect", DeprecationWarning)

    fasta_entries = fasta_client.get_fasta_from_rcsb_entry(pdb_id)

    # One sequence search per FASTA entry that contains the requested chain.
    sequence_queries = [
        sequence_operators.SequenceOperator(sequence=fasta_entry.sequence,
                                            identity_cutoff=identity_cutoff,
                                            evalue_cutoff=1000)
        for fasta_entry in fasta_entries if chain_id in fasta_entry.chains
    ]

    # A hit on any of the chain's sequences counts as a match.
    matches_any_sequence_in_chain_query = search_client.QueryGroup(
        logical_operator=search_client.LogicalOperator.OR,
        queries=sequence_queries)

    return search_client.perform_search_with_graph(
        query_object=matches_any_sequence_in_chain_query,
        return_raw_json_dict=True)
'2LME', '@pdbResNumEnd': '105', '@pdbResNumStart': '28', 746 | # '@pfamDesc': 'YadA-like C-terminal region', '@eValue': '5.0E-22', '@chainId': 'A'}} 747 | 748 | # """ 749 | # out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/hmmer?structureId=') 750 | # out = to_dict(out) 751 | # if not out['hmmer3']: 752 | # return dict() 753 | # return remove_at_sign(out['hmmer3']) 754 | 755 | # def get_clusters(pdb_id): 756 | # """Return cluster related web services of given PDB_ID 757 | 758 | # Parameters 759 | # ---------- 760 | 761 | # pdb_id : string 762 | # A 4 character string giving a pdb entry of interest 763 | 764 | # Returns 765 | # ------- 766 | 767 | # out : dict 768 | # A dictionary containing the representative clusters for the specified PDB ID 769 | 770 | # Examples 771 | # -------- 772 | 773 | # >>> clusts = get_clusters('4hhb.A') 774 | # >>> print(clusts) 775 | # {'pdbChain': {'@name': '2W72.A'}} 776 | 777 | # """ 778 | # out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/representatives?structureId=') 779 | # out = to_dict(out) 780 | # return remove_at_sign(out['representatives']) 781 | 782 | 783 | def find_results_gen(search_term, field='title'): 784 | ''' 785 | Return a generator of the results returned by a search of 786 | the protein data bank. This generator is used internally. 
787 | 788 | Parameters 789 | ---------- 790 | 791 | search_term : str 792 | The search keyword 793 | 794 | field : str 795 | The type of information to record about each entry 796 | 797 | Examples 798 | -------- 799 | 800 | >>> result_gen = find_results_gen('bleb') 801 | >>> pprint.pprint([item for item in result_gen][:5]) 802 | ['MYOSIN II DICTYOSTELIUM DISCOIDEUM MOTOR DOMAIN S456Y BOUND WITH MGADP-BEFX', 803 | 'MYOSIN II DICTYOSTELIUM DISCOIDEUM MOTOR DOMAIN S456Y BOUND WITH MGADP-ALF4', 804 | 'DICTYOSTELIUM DISCOIDEUM MYOSIN II MOTOR DOMAIN S456E WITH BOUND MGADP-BEFX', 805 | 'MYOSIN II DICTYOSTELIUM DISCOIDEUM MOTOR DOMAIN S456E BOUND WITH MGADP-ALF4', 806 | 'The structural basis of blebbistatin inhibition and specificity for myosin ' 807 | 'II'] 808 | 809 | ''' 810 | search_result_ids = Query(search_term).search() 811 | 812 | all_titles = [] 813 | for pdb_id in search_result_ids: 814 | result = get_info(pdb_id) 815 | if field in result.keys(): 816 | yield result[field] 817 | 818 | 819 | def find_papers(search_term, max_results=10, **kwargs): 820 | ''' 821 | Return an ordered list of the top papers returned by a keyword search of 822 | the RCSB PDB 823 | 824 | Parameters 825 | ---------- 826 | 827 | search_term : str 828 | The search keyword 829 | 830 | max_results : int 831 | The maximum number of results to return 832 | 833 | Returns 834 | ------- 835 | 836 | all_papers : list of strings 837 | A descending-order list containing the top papers associated with 838 | the search term in the PDB 839 | 840 | Examples 841 | -------- 842 | 843 | >>> matching_papers = find_papers('crispr',max_results=3) 844 | >>> print(matching_papers) 845 | ['Crystal structure of a CRISPR-associated protein from thermus thermophilus', 846 | 'CRYSTAL STRUCTURE OF HYPOTHETICAL PROTEIN SSO1404 FROM SULFOLOBUS SOLFATARICUS P2', 847 | 'NMR solution structure of a CRISPR repeat binding protein'] 848 | 849 | ''' 850 | all_papers = list() 851 | id_list = Query(search_term).search() 852 | 
for pdb_id in id_list[:max_results]: 853 | pdb_info = get_info(pdb_id) 854 | all_papers += [item["title"] for item in pdb_info["citation"]] 855 | return remove_dupes(all_papers) 856 | 857 | 858 | # def find_authors(search_term, **kwargs): 859 | # '''Return an ordered list of the top authors returned by a keyword search of 860 | # the RCSB PDB 861 | 862 | # This function is based on the number of unique PDB entries a given author has 863 | # his or her name associated with, and not author order or the ranking of the 864 | # entry in the keyword search results. So if an author tends to publish on topics 865 | # related to the search_term a lot, even if those papers are not the best match for 866 | # the exact search, he or she will have priority in this function over an author 867 | # who wrote the one paper that is most relevant to the search term. For the latter 868 | # option, just do a standard keyword search using do_search. 869 | 870 | # Parameters 871 | # ---------- 872 | 873 | # search_term : str 874 | # The search keyword 875 | 876 | # max_results : int 877 | # The maximum number of results to return 878 | 879 | # Returns 880 | # ------- 881 | 882 | # out : list of str 883 | 884 | # Examples 885 | # -------- 886 | 887 | # >>> top_authors = find_authors('crispr',max_results=100) 888 | # >>> print(top_authors[:10]) 889 | # ['Doudna, J.A.', 'Jinek, M.', 'Ke, A.', 'Li, H.', 'Nam, K.H.'] 890 | 891 | # ''' 892 | 893 | # all_individuals = parse_results_gen(search_term, field='citation_authors', **kwargs) 894 | 895 | # full_author_list = [] 896 | # for individual in all_individuals: 897 | # individual = individual.replace('.,', '.;') 898 | # author_list_clean = [x.strip() for x in individual.split(';')] 899 | # full_author_list+=author_list_clean 900 | 901 | # out = list(chain.from_iterable(repeat(ii, c) for ii,c in Counter(full_author_list).most_common())) 902 | 903 | # return remove_dupes(out) 904 | 905 | # def find_dates(search_term, **kwargs): 906 | # ''' 907 | 
# =================
# Helper Functions
# =================


def to_dict(odict):
    '''Convert a (possibly nested) OrderedDict into plain built-in dicts

    The conversion is a JSON round trip (``json.dumps`` followed by
    ``json.loads``): the nesting is preserved and every mapping in the
    result is a plain ``dict``. As with any JSON round trip, non-string
    keys come back as strings and tuples come back as lists.

    Parameters
    ----------

    odict : OrderedDict
        A JSON-serializable, possibly nested, mapping

    Returns
    -------

    out : dict
        A plain dictionary with the same nested content as the input

    '''
    return json.loads(json.dumps(odict))


def remove_at_sign(kk):
    '''Remove the '@' character from the beginning of key names in a dict()

    Only top-level string keys are renamed; keys of any other type are
    left untouched.

    Parameters
    ----------

    kk : dict
        A dictionary containing keys with the @ character
        (this pops up a lot in converted XML)

    Returns
    -------

    kk : dict (modified in place)
        A dictionary where the @ character has been removed

    '''
    # Collect the keys first: the dict must not change size while iterating.
    tagged_keys = [
        key for key in kk if isinstance(key, str) and key.startswith('@')
    ]
    for tag_key in tagged_keys:
        kk[tag_key[1:]] = kk.pop(tag_key)

    return kk


def remove_dupes(list_with_dupes):
    '''Remove duplicate entries from a list while preserving order

    This function uses Python's standard equivalence testing methods in
    order to determine if two elements of a list are identical. So if in the list [a,b,c]
    the condition a == b is True, then regardless of whether a and b are strings, ints,
    or other, then b will be removed from the list: [a, c]

    Hashable entries are tracked in a set; unhashable entries (lists,
    dicts, ...) fall back to a linear ``==`` scan.

    Parameters
    ----------

    list_with_dupes : list
        A list containing duplicate elements

    Returns
    -------
    out : list
        The list with the duplicate entries removed but the order preserved


    Examples
    --------
    >>> a = [1,3,2,4,2]
    >>> print(remove_dupes(a))
    [1, 3, 2, 4]

    '''
    seen_hashable = set()
    seen_unhashable = []
    out = []
    for entry in list_with_dupes:
        try:
            if entry in seen_hashable:
                continue
            seen_hashable.add(entry)
        except TypeError:
            # Set membership hashes the entry; unhashable values land here.
            if entry in seen_unhashable:
                continue
            seen_unhashable.append(entry)
        out.append(entry)
    return out


def walk_nested_dict(my_result, term, outputs=None, depth=0, maxdepth=25):
    '''
    For a nested dictionary that may itself comprise lists of
    dictionaries of unknown length, determine if a key is anywhere
    in any of the dictionaries using a depth-first search

    A dictionary that contains `term` contributes its value and is not
    searched any deeper.

    Parameters
    ----------

    my_result : dict
        A nested dict containing lists, dicts, and other objects as vals

    term : str
        The name of the key stored somewhere in the tree

    maxdepth : int
        The maximum depth to search the results tree

    depth : int
        The depth of the search so far.
        Users don't usually access this.

    outputs : list
        All of the positive search results collected so far.
        Users don't usually access this. A fresh list is created
        when omitted.

    Returns
    -------

    outputs : list or None
        All of the search results, or None if nothing was found.

    '''
    # A mutable default would be shared between calls and leak results
    # from one search into the next, so the list is created per call.
    if outputs is None:
        outputs = []

    if depth > maxdepth:
        warnings.warn(
            'Maximum recursion depth exceeded. Returned None for the search results,'
            + ' try increasing the maxdepth keyword argument.')
        return None

    depth = depth + 1

    if isinstance(my_result, dict):
        if term in my_result:
            outputs.append(my_result[term])
        else:
            # Descend through the values as a list, which costs one extra
            # depth level per dictionary (kept for maxdepth compatibility).
            walk_nested_dict(list(my_result.values()),
                             term,
                             outputs=outputs,
                             depth=depth,
                             maxdepth=maxdepth)

    elif isinstance(my_result, list):
        for item in my_result:
            walk_nested_dict(item,
                             term,
                             outputs=outputs,
                             depth=depth,
                             maxdepth=maxdepth)

    # Anything else is a leaf with nothing to search.
    return outputs if outputs else None
def request_limited(url: str,
                    rtype: str = "GET",
                    num_attempts: int = 3,
                    sleep_time: float = 0.5,
                    **kwargs) -> Optional[requests.models.Response]:
    """
    HTTP request with rate-limiting based on the response code


    Parameters
    ----------
    url : str
        The url for the request
    rtype : str
        The request type (oneof ["GET", "POST"])
    num_attempts : int
        In case of a failed retrieval, the number of attempts to try again
        (so at most ``num_attempts + 1`` requests are sent)
    sleep_time : float
        The base amount of time, in seconds, to wait after a
        "too many requests" (429) response. The wait grows linearly with
        the number of attempts already made.
    **kwargs : dict
        The keyword arguments to pass to the request

    Returns
    -------

    response : requests.models.Response
        The server response object. Only returned if request was successful
        (status code 200), otherwise returns None.

    """

    if rtype not in ["GET", "POST"]:
        warnings.warn("Request type not recognized")
        return None

    for total_attempts in range(num_attempts + 1):
        # Resolve the sender at call time so patched `requests.get` /
        # `requests.post` attributes are honoured.
        if rtype == "GET":
            response = requests.get(url, **kwargs)
        else:
            response = requests.post(url, **kwargs)

        status = response.status_code
        if status == 200:
            return response

        if status == 429:
            # Throttled: back off a little longer on every attempt.
            curr_sleep = (1 + total_attempts) * sleep_time
            warnings.warn("Too many requests, waiting " + str(curr_sleep) +
                          " s")
            time.sleep(curr_sleep)
        elif 500 <= status < 600:
            # Server errors are retried immediately, without sleeping.
            warnings.warn("Server error encountered. Retrying")
        else:
            # Say why the attempt failed instead of retrying silently.
            warnings.warn("Unexpected status code " + str(status) +
                          " encountered. Retrying")

    warnings.warn("Too many failures on requests. Exiting...")
    return None
import time
import requests
import unittest
from unittest import mock
import warnings

from pypdb.util import http_requests


def _fake_response(status_code):
    """Build an autospecced `requests` response reporting `status_code`."""
    fake = mock.create_autospec(requests.models.Response)
    fake.status_code = status_code
    return fake


class TestHTTPRequests(unittest.TestCase):
    """Tests for `http_requests.request_limited`.

    HTTP calls, sleeping and warnings are replaced by mocks, so no
    network traffic or real delay occurs.
    """

    @mock.patch.object(warnings, "warn", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_fails_with_invalid_request(self, mock_sleep, mock_warnings):
        # An unsupported request type is rejected before any sleep happens.
        outcome = http_requests.request_limited(
            url="http://protein_data_bank.com", rtype="MAIL")

        self.assertIsNone(outcome)
        mock_warnings.assert_called_once_with("Request type not recognized")
        self.assertEqual(len(mock_sleep.mock_calls), 0)

    @mock.patch.object(requests, "get", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_get__first_try_success(self, mock_sleep, mock_get):
        ok_response = _fake_response(200)  # A-OK!
        mock_get.return_value = ok_response

        outcome = http_requests.request_limited(
            url="http://get_your_proteins.com", rtype="GET")

        self.assertEqual(outcome, ok_response)
        mock_get.assert_called_once_with("http://get_your_proteins.com")
        self.assertEqual(len(mock_sleep.mock_calls), 0)

    @mock.patch.object(requests, "post", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_post__first_try_success(self, mock_sleep, mock_post):
        ok_response = _fake_response(200)  # A-OK!
        mock_post.return_value = ok_response

        outcome = http_requests.request_limited(
            url="http://get_your_proteins.com", rtype="POST")

        self.assertEqual(outcome, ok_response)
        mock_post.assert_called_once_with("http://get_your_proteins.com")
        self.assertEqual(len(mock_sleep.mock_calls), 0)

    @mock.patch.object(requests, "get", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_get__succeeds_third_try(self, mock_sleep, mock_get):
        busy_response = _fake_response(429)   # throttled
        error_response = _fake_response(504)  # server error
        ok_response = _fake_response(200)     # all good

        # `requests.get` answers Busy, then Server Error, then OK
        mock_get.side_effect = [busy_response, error_response, ok_response]

        outcome = http_requests.request_limited(
            url="http://get_your_proteins.com", rtype="GET")

        self.assertEqual(outcome, ok_response)
        self.assertEqual(len(mock_get.mock_calls), 3)
        mock_get.assert_called_with("http://get_your_proteins.com")
        # Should only sleep on being throttled (not server error)
        self.assertEqual(len(mock_sleep.mock_calls), 1)

    @mock.patch.object(warnings, "warn", autospec=True)
    @mock.patch.object(requests, "post", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_post__repeatedly_fails_return_nothing(self, mock_sleep, mock_post,
                                                   mock_warn):
        # Every attempt is answered with a Busy response
        mock_post.return_value = _fake_response(429)

        outcome = http_requests.request_limited(
            url="http://protein_data_bank.com", rtype="POST")

        self.assertIsNone(outcome)
        mock_warn.assert_called_with(
            "Too many failures on requests. Exiting...")

        self.assertEqual(len(mock_post.mock_calls), 4)
        mock_post.assert_called_with("http://protein_data_bank.com")
        self.assertEqual(len(mock_sleep.mock_calls), 4)
from pathlib import Path

from setuptools import setup

# Read the contents of the README file so that PyPI can use it as the long
# description. The encoding is given explicitly so the build does not depend
# on the platform's default locale encoding.
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text(encoding="utf-8")

modules_list = [
    "pypdb",
    "pypdb.util",
    "pypdb.clients",
    "pypdb.clients.search",
    "pypdb.clients.search.operators",
    "pypdb.clients.data",
    "pypdb.clients.data.graphql",
    "pypdb.clients.fasta",
    "pypdb.clients.pdb",
]

setup(
    name='pypdb',
    packages=modules_list,  # the top-level package and all of its subpackages
    # NOTE(review): these names are packages, not single-file modules;
    # passing them as py_modules as well looks redundant -- confirm.
    py_modules=modules_list,
    version='2.04',
    install_requires=['requests'],
    description='A Python wrapper for the RCSB Protein Data Bank (PDB) API',
    author='William Gilpin',
    author_email='firstnamelastname@gmail.com',
    url='https://github.com/williamgilpin/pypdb',
    # NOTE(review): the tarball tag (0.6) does not match `version` -- confirm.
    download_url='https://github.com/williamgilpin/pypdb/tarball/0.6',
    keywords=['protein', 'data', 'RESTful', 'api'],
    classifiers=[],
    long_description=long_description,
    long_description_content_type='text/markdown'
)
import unittest

## Import from local directory
import sys
sys.path.insert(0, '../pypdb')
from pypdb import *

# TODO(ejwilliams): Write generic logic, to execute `test_*.py` files
# within the pypdb directory (removing need for sys.path hack)


class TestSearchFunctions(unittest.TestCase):
    '''Smoke tests for `Query(...).search()`.

    Each test only checks that the search returns at least one result.
    NOTE(review): these presumably query the live RCSB service, so they
    need network access -- confirm.
    '''

    def _assert_has_results(self, found_pdbs):
        '''Fail with a readable message when a search returned nothing.'''
        self.assertIsNotNone(found_pdbs)
        self.assertGreater(len(found_pdbs), 0)

    def test_searchterm(self):
        found_pdbs = Query('ribosome').search()
        self._assert_has_results(found_pdbs)
        self.assertIsInstance(found_pdbs[0], str)

        # an error page would be a longer string
        self.assertLess(len(found_pdbs[0]), 10)

    def test_pubmed(self):
        found_pdbs = Query(27499440, "PubmedIdQuery").search()
        self._assert_has_results(found_pdbs)

    def test_treeentity(self):
        found_pdbs = Query('6239', 'TreeEntityQuery').search()
        self._assert_has_results(found_pdbs)

    def test_exptype(self):
        found_pdbs = Query('SOLID-STATE NMR', 'ExpTypeQuery').search()
        self._assert_has_results(found_pdbs)

    def test_structure(self):
        found_pdbs = Query('2E8D', 'structure').search()
        self._assert_has_results(found_pdbs)

    def test_advancedauthor(self):
        found_pdbs = Query('Perutz, M.F.', 'AdvancedAuthorQuery').search()
        self._assert_has_results(found_pdbs)

    def test_organism(self):
        found_pdbs = Query('Dictyostelium', 'OrganismQuery').search()
        self._assert_has_results(found_pdbs)

    # def test_blast(self):
    #     found_pdbs = blast_from_sequence(
    #         'MTKIANKYEVIDNVEKLEKALKRLREAQSVYATYTQEQVDKIFFEAAMAANKMRIPLAKMAVE'
    #         + 'ETGMGVVEDKVIKNHYASEYIYNAYKNTKTCGVIEEDPAFGIKKIAEPLGVIAAVIPTTNP'
    #         + 'TSTAIFKTLIALKTRNAIIISPHPRAKNSTIEAAKIVLEAAVKAGAPEGIIGWIDVPSLEL'
    #         + 'TNLVMREADVILATGGPGLVKAAYSSGKPAIGVGAGNTPAIIDDSADIVLAVNSIIHSKTF'
    #         + 'DNGMICASEQSVIVLDGVYKEVKKEFEKRGCYFLNEDETEKVRKTIIINGALNAKIVGQKA'
    #         + 'HTIANLAGFEVPETTKILIGEVTSVDISEEFAHEKLCPVLAMYRAKDFDDALDKAERLVAD'
    #         + 'GGFGHTSSLYIDTVTQKEKLQKFSERMKTCRILVNTPSSQGGIGDLYNFKLAPSL',
    #         1e-20)
    #     self.assertTrue(len(found_pdbs) > 0)
    #     self.assertTrue(type(found_pdbs[0][0]) == str)

    #     # an error page would be a longer string
    #     self.assertTrue(len(found_pdbs[0][0]) < 10)


if __name__ == '__main__':
    unittest.main()