├── .gitignore
├── LICENSE
├── MANIFEST
├── README.md
├── demos
    └── demos.ipynb
├── devtools
    └── conda-recipe
    │   ├── README.md
    │   ├── build.sh
    │   ├── conda_build_config.yaml
    │   └── meta.yaml
├── pypdb
    ├── __init__.py
    ├── clients
    │   ├── __init__.py
    │   ├── data
    │   │   ├── EXAMPLES.md
    │   │   ├── __init__.py
    │   │   ├── data_types.py
    │   │   ├── graphql
    │   │   │   ├── __init__.py
    │   │   │   ├── graphql.py
    │   │   │   └── test_graphql.py
    │   │   └── test_data_types.py
    │   ├── fasta
    │   │   ├── __init__.py
    │   │   ├── fasta_client.py
    │   │   └── fasta_client_test.py
    │   ├── pdb
    │   │   ├── __init__.py
    │   │   ├── pdb_client.py
    │   │   └── pdb_client_test.py
    │   └── search
    │   │   ├── EXAMPLES.md
    │   │   ├── __init__.py
    │   │   ├── operators
    │   │       ├── __init__.py
    │   │       ├── chemical_operators.py
    │   │       ├── chemical_operators_test.py
    │   │       ├── seqmotif_operators.py
    │   │       ├── seqmotif_operators_test.py
    │   │       ├── sequence_operators.py
    │   │       ├── sequence_operators_test.py
    │   │       ├── structure_operators.py
    │   │       ├── structure_operators_test.py
    │   │       ├── text_operators.py
    │   │       └── text_operators_test.py
    │   │   ├── search_client.py
    │   │   └── search_client_test.py
    ├── conftest.py
    ├── pypdb.py
    └── util
    │   ├── __init__.py
    │   ├── http_requests.py
    │   └── test_http_requests.py
├── setup.cfg
├── setup.py
└── tests
    └── test_pypdb.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | ###Temporary Files created by text editor###
 2 | *~
 3 | .vscode
 4 | .vscode/
 5 | 
 6 | ### Remove DS STORE
 7 | *.DS_Store
 8 | 
 9 | ### Remove autosave files
10 | \#*
11 | 
12 | ### Local development files
13 | scratch*
14 | 
15 | ### Python ###
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 | 
20 | # Setup code for pypi
21 | \#setup\##
22 | 
23 | # C extensions
24 | *.so
25 | 
26 | # Distribution / packaging
27 | .Python
28 | env/
29 | build/
30 | develop-eggs/
31 | dist/
32 | downloads/
33 | eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | *.egg-info/
40 | .installed.cfg
41 | *.egg
42 | 
43 | # PyInstaller
44 | #  Usually these files are written by a python script from a template
45 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
46 | *.manifest
47 | *.spec
48 | 
49 | # Installer logs
50 | pip-log.txt
51 | pip-delete-this-directory.txt
52 | 
53 | # Unit test / coverage reports
54 | htmlcov/
55 | .tox/
56 | .coverage
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | 
61 | # Translations
62 | *.mo
63 | *.pot
64 | 
65 | # Django stuff:
66 | *.log
67 | 
68 | # Sphinx documentation
69 | docs/_build/
70 | 
71 | # PyBuilder
72 | target/
73 | ### IPythonNotebook ###
74 | # Temporary data
75 | .ipynb_checkpoints/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 William Gilpin
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | setup.cfg
3 | setup.py
4 | pypdb/pypdb.py
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PyPDB
 2 | 
 3 | A Python 3 toolkit for performing searches with the RCSB Protein Data Bank (PDB). This can be used to perform advanced searches for PDB IDs matching various criteria, as well as to look up information associated with specific PDB IDs. This tool allows standard operations that can be performed from within the PDB website (BLAST, PFAM lookup, etc.) to be performed from within Python scripts.
 4 | 
 5 | If you use this module for any published work, please consider citing the accompanying paper
 6 | 
 7 |       Gilpin, W. "PyPDB: A Python API for the Protein Data Bank."
 8 |       Bioinformatics, Oxford Journals, 2016.
 9 | 
10 | ## Installation
11 | 
12 | Install using pip:
13 | 
14 |     $ pip install pypdb
15 | 
16 | To install the development version, which contains the latest features and fixes, install directly from GitHub using
17 | 
18 |     $ pip install git+https://github.com/williamgilpin/pypdb
19 | 
20 | If you need to  install directly from setup.py,
21 | 
22 |     $ python setup.py install
23 | 
24 | To test the installation and check that the code successfully connects to the PDB, navigate to the root directory and run
25 | 
26 |     $ pytest
27 | 
28 | This code has been designed and tested for Python 3.
29 | 
30 | ## Usage
31 | 
32 | ### PDB Text Search
33 | This package can be used to get lists of PDB IDs associated with specific search terms, experiment types, structures, and other common criteria. To use the simple API, see the examples in [`demos/demos.ipynb`](demos/demos.ipynb). For advanced search and query logic, see the examples in [`search/EXAMPLES.md`](pypdb/clients/search/EXAMPLES.md).
34 | 
35 | ### PDB Data Fetch
36 | Given a list of PDBs, this package can be used to fetch data associated with those PDBs, including their dates of deposition, lists of authors and associated publications, their sequences or structures, their top BLAST matches, and other query-specific attributes like lists of ligands or chemical structures. To use the simple API, see the examples in [`demos/demos.ipynb`](demos/demos.ipynb). For advanced search and query logic, see the examples in [`data/EXAMPLES.md`](pypdb/clients/data/EXAMPLES.md).
37 | 
38 | ## Issues and Feature Requests
39 | 
40 | If you run into an issue, or if you find a workaround for an existing issue, please post your question or code as a GitHub issue.
41 | 
42 | If posting a feature request, please check that your request is possible using [the current GUI on the RCSB website](https://www.rcsb.org/search/advanced). If so, please perform your search, and then click the link that says `JSON` in the upper right hand corner of the Advanced Search box. Please post that JSON code with your feature request.
43 | 
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/devtools/conda-recipe/README.md:
--------------------------------------------------------------------------------
 1 | # Instructions
 2 | 
 3 | ## Required
 4 | 
 5 | ```bash
 6 | conda install conda-build anaconda-client
 7 | ```
 8 | 
 9 | ## Building and pushing to https://anaconda.org/williamgilpin
10 | 
11 | ```bash
12 | conda build . --no-anaconda-upload
13 | PACKAGE_OUTPUT=`conda build . --output`
14 | anaconda login
15 | anaconda upload --user williamgilpin $PACKAGE_OUTPUT
16 | conda build purge
17 | anaconda logout
18 | ```
19 | 
20 | ## Install
21 | 
22 | ```
23 | conda install -c williamgilpin pypdb
24 | ```
25 | 
26 | ## Additional Info
27 | https://docs.anaconda.com/anaconda-cloud/user-guide/tasks/work-with-packages
28 | 


--------------------------------------------------------------------------------
/devtools/conda-recipe/build.sh:
--------------------------------------------------------------------------------
1 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 
2 | 
3 | 


--------------------------------------------------------------------------------
/devtools/conda-recipe/conda_build_config.yaml:
--------------------------------------------------------------------------------
# Python version(s) substituted for `{{ python }}` in meta.yaml.
# NOTE(review): pypdb/clients/data/data_types.py annotates a dataclass field
# as `str | list`, which needs Python 3.10+ when the class is created —
# confirm that 3.7 is still an intended build target.
python:
    - 3.7
3 | 
4 | 


--------------------------------------------------------------------------------
/devtools/conda-recipe/meta.yaml:
--------------------------------------------------------------------------------
 1 | package:
 2 |   name: pypdb
 3 |   version: 1.310
 4 | 
 5 | source:
 6 |   path: ../../
 7 | 
 8 | build:
 9 |   number: 0
10 | 
11 | requirements:
12 |   build:
13 |     - python {{ python }}
14 |     - setuptools
15 |     - numpy
16 |     - pytest
17 | 
18 |   run:
19 |     - python  {{ python }}
20 |     - numpy
21 |     - xmltodict
22 |     - beautifulsoup4
23 |     - matplotlib
24 |     - urllib3
25 |     - jsonschema
26 | 
27 | about:
28 |   home: http://www.wgilpin.com/pypdb_docs/html/
29 |   license: 'https://github.com/williamgilpin/pypdb/blob/master/LICENSE'
30 |   summary: "This is a copy of https://github.com/williamgilpin/pypdb, developed by William Gilpin, built here to be installed with conda."
31 |   description: |
32 |     This is a copy of https://github.com/williamgilpin/pypdb, developed by William Gilpin.
33 |   dev_url: https://github.com/williamgilpin/pypdb
34 |   doc_url: http://www.wgilpin.com/pypdb_docs/html/
35 | 


--------------------------------------------------------------------------------
/pypdb/__init__.py:
--------------------------------------------------------------------------------
1 | from .pypdb import *
2 | # from .pypdb.util import *
3 | 


--------------------------------------------------------------------------------
/pypdb/clients/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/data/EXAMPLES.md:
--------------------------------------------------------------------------------
  1 | # PyPDB Data Fetch from the PDB
  2 | 
  3 | ## Helpful Links
  4 | 
  5 | The data fetch module here is a Python wrapper for the [graphQL API](https://data.rcsb.org/#fetch-data-graphql).
  6 | PDB's data API [organizes the data in the following way](https://data.rcsb.org/#data-organization):
  7 | 
  8 | * `entry`
  9 | * `entity`
 10 |     * `polymer_entity`
 11 |     * `branched_entity`
 12 |     * `nonpolymer_entity`
 13 | * `entity_instance`
 14 |     * `polymer_entity_instance`
 15 |     * `branched_entity_instance`
 16 |     * `nonpolymer_entity_instance`
 17 | * `assembly`
 18 | * `chemical_component`
 19 | 
 20 | In addition to these, the following are also options in the PDB, but are currently not implemented in PyPDB:
 21 | 
 22 | * `PubMed`
 23 | * `UniProt`
 24 | * `DrugBank`
 25 | 
 26 | The data schemas for all of these data types can be viewed [here](https://data.rcsb.org/#data-schema).
 27 | These schemas allow the user to determine what keywords to ask for.
 28 | The queries can be tested in-browser using the [GraphiQL tool](https://data.rcsb.org/graphql/index.html?query=%7B%0A%20%20entries(entry_ids%3A%20%5B%224HHB%22%5D)%20%7B%0A%20%20%20%20rcsb_id%0A%20%20%20%20struct%20%7B%0A%20%20%20%20%20%20title%0A%20%20%20%20%7D%0A%20%20%20%20exptl%20%7B%0A%20%20%20%20%20%20method%0A%20%20%20%20%7D%0A%20%20%7D%0A%7D).
 29 | 
 30 | ## Examples
 31 | 
 32 | All of the functionality, and thus all of the examples below, require the following import:
 33 | 
 34 | ```python
 35 | from pypdb.clients.data.data_types import DataFetcher, DataType
 36 | ```
 37 | 
 38 | ### Fetch entries using PDB IDs
 39 | 
 40 | If we want to fetch some information about the PDB entries `4HHB`, `12CA`, and `3PQR`, we first create an instance of `DataFetcher`:
 41 | 
 42 | ```python
 43 | entry = DataFetcher(["4HHB", "12CA", "3PQR"], DataType.ENTRY)
 44 | ```
 45 | 
 46 | The properties we want to fetch need to be given as a Python dictionary, commensurate with the [data schemas](https://data.rcsb.org/#data-schema):
 47 | 
 48 | ```python
 49 | property = {"exptl": ["method", "details"], "cell":["length_a", "length_b", "length_c"]}
 50 | entry.add_property(property)
 51 | ```
 52 | 
 53 | Then we fetch the data
 54 | 
 55 | ```python
 56 | entry.fetch_data()
 57 | ```
 58 | 
 59 | where `entry.response` now contains a Python dictionary generated from the JSON formatted information fetched from the PDB.
 60 | It is possible to flatten this into a dictionary keyed by ID, which can then be loaded into a Pandas (or Polars) dataframe:
 61 | 
 62 | ```python
 63 | data = entry.return_data_as_df_dict()
 64 | ```
 65 | 
 66 | ### Fetch Assemblies
 67 | 
 68 | Similarly to the `entry` case:
 69 | 
 70 | ```python
 71 | assembly = DataFetcher(["4HHB-1", "12CA-1", "3PQR-1"], DataType.ASSEMBLY)
 72 | property = {"rcsb_assembly_info": ["entry_id", "assembly_id", "polymer_entity_instance_count"]}
 73 | 
 74 | assembly.add_property(property)
 75 | assembly.fetch_data()
 76 | ```
 77 | 
 78 | Note that the IDs provided must be of the form `[entry_id]-[assembly_id]`. 
 79 | 
 80 | ### Fetch Polymer Entities
 81 | 
 82 | ```python
 83 | fetcher = DataFetcher(["2CPK_1","3WHM_1","2D5Z_1"], DataType.POLYMER_ENTITY)
 84 | property = {"rcsb_id": [], 
 85 |             "rcsb_entity_source_organism": ["ncbi_taxonomy_id", "ncbi_scientific_name"],
 86 |             "rcsb_cluster_membership": ["cluster_id", "identity"]}
 87 | 
 88 | fetcher.add_property(property)
 89 | fetcher.fetch_data()
 90 | ```
 91 | 
 92 | The IDs provided must be of the form `[entry_id]_[entity_id]`.
 93 | 
 94 | ### Fetch Polymer Entity Instance
 95 | 
 96 | ```python
 97 | fetcher = DataFetcher(["4HHB.A", "12CA.A", "3PQR.A"], DataType.POLYMER_ENTITY_INSTANCE)
 98 | property = {"rcsb_id": [],
 99 |             "rcsb_polymer_instance_annotation": ["annotation_id", "name", "type"]}
100 | fetcher.add_property(property)
101 | fetcher.fetch_data()
102 | ```
103 | 
104 | In this case, IDs are of the form `[entry_id].[asym_id]`, i.e. the entry ID followed by the chain label (e.g. `4HHB.A`).
105 | 
106 | ### Fetch Branched Entity
107 | 
108 | ```python
109 | fetcher = DataFetcher(["5FMB_2", "6L63_3"], DataType.BRANCHED_ENTITY)
110 | property = {"pdbx_entity_branch": ["type"],
111 |             "pdbx_entity_branch_descriptor": ["type", "descriptor"]}
112 | 
113 | fetcher.add_property(property)
114 | fetcher.fetch_data()
115 | ```
116 | 
117 | ### Fetch Chemical Components
118 | 
119 | ```python
120 | fetcher = DataFetcher(["NAG","EBW"], DataType.CHEMICAL_COMPONENT)
121 | property = {"rcsb_id":[], "chem_comp": ["type", "formula_weight","name","formula"],
122 |             "rcsb_chem_comp_info":["initial_release_date"]}
123 | fetcher.add_property(property)
124 | fetcher.fetch_data()
125 | ```


--------------------------------------------------------------------------------
/pypdb/clients/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/data/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/data/data_types.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Class for data types that can be accessed in the PDB DATA-API
  3 | https://data.rcsb.org/#data-organization
  4 | 
  5 | Namely:
  6 | - entry
  7 | - polymer entity
  8 | - branched entity
  9 | - non-polymer entity
 10 | - polymer instance
 11 | - branched instance
 12 | - non-polymer instance
 13 | - assembly
 14 | - chemical component
 15 | (currently not implemented:)
 16 | - PubMed integrated data
 17 | - UniProt integrated data
 18 | - DrugBank integrated data
 19 | """
 20 | from dataclasses import dataclass, field
 21 | from enum import Enum
 22 | 
 23 | #TODO: handle batch requests
 24 | 
 25 | from pypdb.clients.data.graphql.graphql import search_graphql
 26 | 
 27 | class DataType(Enum):
 28 |     ENTRY = "entries"
 29 |     POLYMER_ENTITY = "polymer_entities"
 30 |     BRANCHED_ENTITY = "branched_entities"
 31 |     NONPOLYMER_ENTITY = "nonpolymer_entities"
 32 |     POLYMER_ENTITY_INSTANCE = "polymer_entity_instances"
 33 |     BRANCHED_ENTITY_INSTANCE = "branched_entity_instances"
 34 |     NONPOLYMER_ENTITY_INSTANCE = "nonpolymer_entity_instances"
 35 |     ASSEMBLY = "assemblies"
 36 |     CHEMICAL_COMPONENT = "chem_comps"
 37 | 
 38 | @dataclass
 39 | class DataFetcher:
 40 |     """
 41 |     General class that will host various data types, as detailed above.
 42 |     """
 43 | 
 44 |     id: str | list
 45 |     data_type: DataType
 46 | 
 47 |     properties: dict = field(default_factory=dict)
 48 |     json_query: dict = field(default_factory=dict)
 49 |     response: dict = field(default_factory=dict)
 50 | 
 51 |     def __post_init__(self):
 52 |         """
 53 |         Check types of IDs given, format accordingly.
 54 |         """
 55 | 
 56 |         if isinstance(self.id, str):
 57 |             self.id = [self.id]
 58 | 
 59 |         if "entit" in self.data_type.value and "instance" not in self.data_type.value:
 60 |             for id in self.id:
 61 |                 if '_' not in id:
 62 |                     print(f"WARNING: {id} not valid for {self.data_type.value}.")
 63 |         elif "instance" in self.data_type.value:
 64 |             for id in self.id:
 65 |                 if '.' not in id:
 66 |                     print(f"WARNING: {id} not valid for {self.data_type.value}.")
 67 |         elif self.data_type == DataType.ASSEMBLY:
 68 |             for id in self.id:
 69 |                 if '-' not in id:
 70 |                     print(f"WARNING: {id} not valid for {self.data_type.value}.")
 71 | 
 72 |     def add_property(self, property):
 73 |         """
 74 |         Add property to the list of data to fetch from the PDB.
 75 | 
 76 |         property is a python dict, with keys as properties, and
 77 |         values as subproperties.
 78 | 
 79 |         e.g.:
 80 | 
 81 |         {"cell": ["volume", "angle_beta"], "exptl": ["method"]}
 82 | 
 83 |         If the user is trying to add a property that already exists,
 84 |         the subproperties are merged.
 85 |         """
 86 |         # check input data type
 87 |         if not isinstance(property, dict):
 88 |             raise TypeError
 89 |         # check data types of keys in dict
 90 |         if not all([isinstance(key, str) for key in property.keys()]):
 91 |             raise TypeError
 92 |         # check that values are lists of strings
 93 |         for key, value in property.items():
 94 |             if isinstance(value, str):
 95 |                 property[key] = [value]
 96 |             elif not isinstance(value, list):
 97 |                 raise TypeError
 98 |             else:
 99 |                 if not all([isinstance(val, str) for val in value]):
100 |                     raise TypeError
101 | 
102 |         # add properties to the dict
103 |         for key, value in property.items():
104 |             if key not in self.properties:
105 |                 self.properties[key] = value
106 |             else:
107 |                 self.properties[key] += value
108 |                 self.properties[key] = list(set(self.properties[key]))
109 | 
110 |     def generate_json_query(self):
111 |         """
112 |         Given IDs, data type, and properties to fetch, create JSON query that
113 |         will utilize graphql.
114 |         """
115 |         if not self.properties:
116 |             print("ERROR: no properties given to generate JSON query.")
117 |             raise ValueError
118 | 
119 |         if self.data_type == DataType.ENTRY:
120 |             q_str = "entry_ids"
121 |         elif "entit" in self.data_type.value:
122 |             if "instance" in self.data_type.value:
123 |                 q_str = "instance_ids"
124 |             else:
125 |                 q_str = "entity_ids"
126 |         elif self.data_type == DataType.ASSEMBLY:
127 |             q_str = "assembly_ids"
128 |         elif self.data_type == DataType.CHEMICAL_COMPONENT:
129 |             q_str = "comp_ids"
130 | 
131 |         data_str = f"{self.data_type.value}({q_str}: [" + ",".join(f"\"{w}\"" for w in self.id) + "])"
132 | 
133 |         props_string = ""
134 |         for key, val in self.properties.items():
135 |             if len(val) == 0:
136 |                 props_string += f"{key},"
137 |             else:
138 |                 props_string += f"{key} {{" + ",".join(val) + "}"
139 | 
140 |         self.json_query = {'query': "{" + data_str + "{" + props_string + "}}"}
141 | 
142 | 
143 |     def fetch_data(self):
144 |         """
145 |         Once the JSON query is created, fetch data from the PDB, using graphql.
146 |         """
147 |         if not self.json_query:
148 |             self.generate_json_query()
149 | 
150 |         response = search_graphql(self.json_query)
151 | 
152 |         if "errors" in response:
153 |             print("ERROR encountered in fetch_data().")
154 |             for error in response['errors']:
155 |                 print(error['message'])
156 | 
157 |             return
158 | 
159 |         self.response = response
160 | 
161 |         if len(self.response['data'][self.data_type.value]) != len(self.id):
162 |             print("WARNING: one or more IDs not found in the PDB.")
163 | 
164 |     def return_data_as_df_dict(self):
165 |         """
166 |         Return the fetched data as a dict usable by pandas or polars.
167 |         """
168 |         if not self.response:
169 |             return None
170 | 
171 |         data = self.response['data'][self.data_type.value]
172 | 
173 |         # flatten data dictionary by joining property and subproperty names
174 |         data_flat = {}
175 |         for i, entry in enumerate(data):
176 |             id = self.id[i]
177 |             curr_dict = {}
178 |             for key, values in entry.items():
179 |                 if isinstance(values, list):
180 |                     v = values[0]
181 |                 else:
182 |                     v = values
183 |                 if isinstance(v, str):
184 |                     new_key = f"{key}"
185 |                     curr_dict[new_key] = v
186 |                 else:
187 |                     for subprop, val in v.items():
188 |                         new_key = f"{key}.{subprop}"
189 |                         curr_dict[new_key] = val
190 |             data_flat[id] = curr_dict
191 | 
192 |         return data_flat


--------------------------------------------------------------------------------
/pypdb/clients/data/graphql/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/data/graphql/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/data/graphql/graphql.py:
--------------------------------------------------------------------------------
 1 | """Contains logic to perform arbitrary GraphQL searches against RCSB.
 2 | 
 3 | For the differences between the GraphQL and RESTful searches, see:
 4 | https://data.rcsb.org/index.html#gql-vs-rest
 5 | """
 6 | import requests
 7 | import  warnings
 8 | from typing import Any  # DO NOT APPROVE: fix this to actual type
 9 | 
# URL that search_graphql() POSTs its query payload to.
# NOTE: the constant's name spells "RCSB" as "RSCB"; the test modules refer
# to it under this spelling, so it is left unchanged.
RSCB_GRAPHQL_URL = "https://data.rcsb.org/graphql?query="
11 | 
12 | 
def search_graphql(graphql_json_query: dict) -> dict:
    """Performs RCSB search with JSON query using GraphQL.

    For details on what the RCSB GraphQL interface is, see:
        https://data.rcsb.org/index.html#gql-api

    This function should return the equivalent information as this site:
        https://data.rcsb.org/graphql/index.html

    Args:
        graphql_json_query: Request payload as a dict holding the GraphQL
            query text under the "query" key; whitespace inside the query
            text doesn't matter. e.g.
            {"query": '{entry(entry_id:"4HHB"){exptl{method}}}'}
            The dict is sent as the JSON body of a POST request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: raised by `raise_for_status()` when the response
            status is not OK (a warning with the response text is issued
            first).
    """

    response = requests.post(url=RSCB_GRAPHQL_URL,
                             json=graphql_json_query)

    if not response.ok:
        warnings.warn(f"It appears request failed with: {response.text}")
        response.raise_for_status()


    return response.json()


--------------------------------------------------------------------------------
/pypdb/clients/data/graphql/test_graphql.py:
--------------------------------------------------------------------------------
 1 | """Unit tests for RCSB DATA API Python wrapper."""
 2 | import unittest
 3 | from unittest import mock
 4 | import requests
 5 | 
 6 | from pypdb.clients.data.graphql import graphql
 7 | 
 8 | class TestGraphQL(unittest.TestCase):
 9 |     @mock.patch.object(requests, "post")
10 |     def test_simple_search(self, mock_post):
11 |         json_query = {'query': '{ entry(entry_id: "4HHB"){struct {title}} }'}
12 |         expected_return_json_as_dict = {'data': {'entry': {'struct': {'title': 'THE CRYSTAL STRUCTURE OF HUMAN DEOXYHAEMOGLOBIN AT 1.74 ANGSTROMS RESOLUTION'}}}}
13 | 
14 |         mock_response = mock.create_autospec(requests.Response, instance=True)
15 |         mock_response.json.return_value = expected_return_json_as_dict
16 |         mock_post.return_value = mock_response
17 | 
18 |         results = graphql.search_graphql(json_query)
19 | 
20 |         mock_post.assert_called_once_with(url=graphql.RSCB_GRAPHQL_URL, json=json_query)
21 |         self.assertEqual(results, expected_return_json_as_dict)
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     unittest.main()


--------------------------------------------------------------------------------
/pypdb/clients/data/test_data_types.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Unit tests for the data_types classes.
  3 | """
  4 | import unittest
  5 | from unittest import mock
  6 | import requests
  7 | from pypdb.clients.data.graphql.graphql import RSCB_GRAPHQL_URL
  8 | 
  9 | from pypdb.clients.data.data_types import DataFetcher, DataType
 10 | 
 11 | class TestEntry(unittest.TestCase):
 12 |     def test_create(self):
 13 |         entry = DataFetcher("4HHB", DataType.ENTRY)
 14 | 
 15 |         self.assertTrue(isinstance(entry.properties, dict))
 16 |         self.assertTrue(not entry.properties)
 17 | 
 18 |         self.assertTrue(isinstance(entry.json_query, dict))
 19 |         self.assertTrue(not entry.json_query)
 20 | 
 21 |         self.assertTrue(isinstance(entry.response, dict))
 22 |         self.assertTrue(not entry.response)
 23 | 
 24 |         self.assertEqual(entry.id, ["4HHB"])
 25 | 
 26 |     def test_generate_json_query(self):
 27 |         entry = DataFetcher("4HHB", DataType.ENTRY)
 28 | 
 29 |         property = {"exptl":["method", "details"]}
 30 | 
 31 |         entry.add_property(property)
 32 | 
 33 |         self.assertIsNotNone(entry.properties)
 34 | 
 35 |         entry.generate_json_query()
 36 | 
 37 |         self.assertTrue(isinstance(entry.json_query, dict))
 38 |         self.assertTrue("query" in entry.json_query)
 39 | 
 40 |     def test_fetch_entry(self):
 41 |         entry = DataFetcher("4HHB", DataType.ENTRY)
 42 |         property = {"exptl":["method", "details"]}
 43 | 
 44 |         entry.add_property(property)
 45 | 
 46 |         entry.fetch_data()
 47 | 
 48 |         self.assertTrue(entry.response)
 49 | 
 50 |     def test_return_data_as_pandas_df(self):
 51 |         entry = DataFetcher(["4HHB", "12CA", "3PQR"], DataType.ENTRY)
 52 |         property = {"exptl":["method", "details"]}
 53 | 
 54 |         entry.add_property(property)
 55 | 
 56 |         entry.fetch_data()
 57 |         df = entry.return_data_as_pandas_df()
 58 | 
 59 |         self.assertTrue(df is not None)
 60 | 
 61 |     def test_assembly_fetch(self):
 62 |         assembly = DataFetcher(["4HHB-1", "12CA-1", "3PQR-1"], DataType.ASSEMBLY)
 63 |         property = {"rcsb_assembly_info": ["entry_id", "assembly_id", "polymer_entity_instance_count"]}
 64 | 
 65 |         assembly.add_property(property)
 66 |         assembly.fetch_data()
 67 | 
 68 |         self.assertFalse(not assembly.response)
 69 | 
 70 |     def test_polymer_entity_fetch(self):
 71 |         fetcher = DataFetcher(["2CPK_1","3WHM_1","2D5Z_1"], DataType.POLYMER_ENTITY)
 72 | 
 73 |         property = {"rcsb_id": [], 
 74 |                     "rcsb_entity_source_organism": ["ncbi_taxonomy_id", "ncbi_scientific_name"],
 75 |                     "rcsb_cluster_membership": ["cluster_id", "identity"]}
 76 | 
 77 |         fetcher.add_property(property)
 78 |         fetcher.fetch_data()
 79 | 
 80 |         self.assertFalse(not fetcher.response)
 81 | 
 82 |         df = fetcher.return_data_as_pandas_df()
 83 |         self.assertFalse(df is None)
 84 | 
 85 |     def test_polymer_instance_fetch(self):
 86 |         fetcher = DataFetcher(["4HHB.A", "12CA.A", "3PQR.A"], DataType.POLYMER_ENTITY_INSTANCE)
 87 |         property = {"rcsb_id": [],
 88 |                     "rcsb_polymer_instance_annotation": ["annotation_id", "name", "type"]}
 89 |         fetcher.add_property(property)
 90 |         fetcher.fetch_data()
 91 | 
 92 |         self.assertFalse(not fetcher.response)
 93 | 
 94 |         df = fetcher.return_data_as_pandas_df()
 95 |         self.assertFalse(df is None)
 96 | 
 97 |     def test_branched_entity_fetch(self):
 98 |         fetcher = DataFetcher(["5FMB_2", "6L63_3"], DataType.BRANCHED_ENTITY)
 99 |         property = {"pdbx_entity_branch": ["type"],
100 |                     "pdbx_entity_branch_descriptor": ["type", "descriptor"]}
101 | 
102 |         fetcher.add_property(property)
103 |         fetcher.fetch_data()
104 | 
105 |         self.assertFalse(not fetcher.response)
106 | 
107 |         df = fetcher.return_data_as_pandas_df()
108 |         self.assertFalse(df is None)
109 | 
110 |     def test_chem_comps_fetch(self):
111 |         fetcher = DataFetcher(["NAG","EBW"], DataType.CHEMICAL_COMPONENT)
112 |         property = {"rcsb_id":[], "chem_comp": ["type", "formula_weight","name","formula"],
113 |                     "rcsb_chem_comp_info":["initial_release_date"]}
114 |         fetcher.add_property(property)
115 |         fetcher.fetch_data()
116 |         self.assertFalse(not fetcher.response)
117 | 
118 |         df = fetcher.return_data_as_pandas_df()
119 |         self.assertFalse(df is None)
120 | 
121 | if __name__ == '__main__':
122 |     unittest.main()


--------------------------------------------------------------------------------
/pypdb/clients/fasta/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/fasta/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/fasta/fasta_client.py:
--------------------------------------------------------------------------------
 1 | """Client to fetch FASTA files associated with structures from RCSB."""
 2 | 
 3 | from dataclasses import dataclass
 4 | import re
 5 | import requests
 6 | from typing import Dict, List
 7 | import warnings
 8 | 
 9 | FASTA_BASE_URL = "https://www.rcsb.org/fasta/entry/"
10 | 
11 | # Fasta Sequences are uniquely identified by a polymeric entity ID that looks
12 | # like `${ENTRY_ID}_{SEQUENCE_NUMBER}` (e.g. `5JUP_1` or `6TML_10`)
13 | PolymerEntity = str  # Defines type-alias (Polymer entity IDs are strings)
14 | 
15 | 
@dataclass
class FastaSequence:
    """Class containing data for one FASTA sequence (one of many in a file).

    Instances are built by `_parse_fasta_text_to_list`, one per `>` record of
    the FASTA text.
    """
    # Polymeric entity ID uniquely identifying this sequence; this is the
    # first `|`-separated segment of the FASTA header.
    entity_id: PolymerEntity  # e.g. `"5RU3_1"`
    # Chains associated with this sequence, derived from the second
    # `|`-separated header segment (e.g. `Chains A,B`).
    chains: List[str]  # e.g. `["A", "B"]`
    # Sequence associated with this entity (the record's sequence lines
    # joined into a single string).
    sequence: str
    # Un-processed FASTA header for a sequence, without the leading `>`
    # (e.g. `5RU3_1|Chains A,B|Non-structural protein 3|Severe acute respiratory syndrome coronavirus 2 (2697049)`)
    fasta_header: str
28 | 
29 | 
def _parse_fasta_text_to_list(raw_fasta_text: str) -> List[FastaSequence]:
    """Parses a raw FASTA response into a list of `FastaSequence` objects.

    A record starts at a line beginning with `>` (the header) and continues
    until the next such line. Headers are expected to look like
    `ENTITY_ID|Chains A,B|description|organism`.

    Args:
      raw_fasta_text: Full text of a FASTA file (any line-ending style).

    Returns:
      One `FastaSequence` per record, in file order. Empty input, or input
      without any header line, yields an empty list.
    """
    # Collects (header, [sequence lines]) pairs. Working line by line means a
    # ">" inside a header's description no longer splits a record in two, and
    # `splitlines` + `strip` removes "\r" left over from CRLF line endings.
    records = []
    for raw_line in raw_fasta_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            records.append((line[1:], []))
        elif records:
            records[-1][1].append(line)
        # Any text before the first header is ignored.

    fasta_list = []
    for fasta_header, sequence_lines in records:
        header_segments = fasta_header.split("|")
        entity_id = header_segments[0]

        # Derives associated chains from the header. A header without a chain
        # segment gives an empty chain list instead of raising IndexError.
        chains = []
        if len(header_segments) > 1:
            chain_text = re.sub("Chains? ", "", header_segments[1])
            # Strips each id so "Chains A, B" yields ["A", "B"], not ["A", " B"].
            chains = [
                chain.strip() for chain in chain_text.split(",")
                if chain.strip()
            ]

        fasta_list.append(
            FastaSequence(entity_id=entity_id,
                          chains=chains,
                          sequence="".join(sequence_lines),
                          fasta_header=fasta_header))
    return fasta_list
52 | 
53 | 
def get_fasta_from_rcsb_entry(rcsb_id: str,
                              verbosity: bool = True,
                              ) -> List[FastaSequence]:
    """Downloads and parses the FASTA file of one RCSB entry.

    Args:
      rcsb_id: RCSB accession code of the structure of interest. E.g. `"5RU3"`
      verbosity: When True, prints a message naming the entry being queried
        (default: True)

    Returns:
      List of `FastaSequence` objects, as produced by
      `_parse_fasta_text_to_list` from the response text.

    Raises:
      Whatever `response.raise_for_status()` raises when the response is not
      OK (a warning containing the response text is emitted first).
    """
    if verbosity:
        print("Querying RCSB for the '{}' FASTA file.".format(rcsb_id))

    fasta_url = FASTA_BASE_URL + rcsb_id
    rcsb_response = requests.get(fasta_url)

    request_failed = not rcsb_response.ok
    if request_failed:
        failure_message = "It appears request failed with:" + rcsb_response.text
        warnings.warn(failure_message)
        rcsb_response.raise_for_status()

    fasta_text = rcsb_response.text
    return _parse_fasta_text_to_list(fasta_text)
77 | 


--------------------------------------------------------------------------------
/pypdb/clients/fasta/fasta_client_test.py:
--------------------------------------------------------------------------------
 1 | """Tests for RCSB FASTA fetching logic."""
 2 | import pytest
 3 | import requests
 4 | import unittest
 5 | from unittest import mock
 6 | 
 7 | from pypdb.clients.fasta import fasta_client
 8 | 
 9 | 
class TestFastaLogic(unittest.TestCase):
    """Checks FASTA downloading (with HTTP mocked out) and FASTA text parsing."""

    @mock.patch.object(requests, "get")
    @mock.patch.object(fasta_client, "_parse_fasta_text_to_list")
    def test_get_fasta_file(self, mock_parse_fasta, mock_get):
        """The entry URL is requested once and its text is handed to the parser."""
        # Stand-in for a successful HTTP response.
        mock_get.return_value = mock.Mock(ok=True, text="fake_fasta_response")

        fasta_client.get_fasta_from_rcsb_entry("6TML", verbosity=True)

        expected_url = "https://www.rcsb.org/fasta/entry/6TML"
        mock_get.assert_called_once_with(expected_url)
        mock_parse_fasta.assert_called_once_with("fake_fasta_response")

    def test_parse_fasta_file(self):
        """Three records (plural and singular chain headers) parse as expected."""
        headers = [
            "6TML_1|Chains Q7,Q8,Q9,q7,q8,q9|ATPTG11|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)",
            "6TML_2|Chain i9|ATPTG7|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)",
            "6TML_32|Chains H1,H2,H3,H4|subunit c|Toxoplasma gondii (strain ATCC 50853 / GT1) (507601)",
        ]
        sequences = [
            "MVRNQRYPASPVQEIFLPEPVPFVQFDQTAPSPNSPPAPLPSPSLSQCEEQKDRYR",
            "MPSSSSEDAQGGNRFECVSNSTSPRRKNATKDEAACLQPRRSAVSGPREDVLCIR",
            "MFFSRLSLSALKAAPAREAL",
        ]
        entity_ids = ["6TML_1", "6TML_2", "6TML_32"]
        chain_lists = [
            ["Q7", "Q8", "Q9", "q7", "q8", "q9"],
            ["i9"],
            ["H1", "H2", "H3", "H4"],
        ]

        # Raw text starts with a newline; every record is ">header\nsequence".
        test_fasta_raw_text = "\n" + "\n".join(
            ">{}\n{}".format(header, sequence)
            for header, sequence in zip(headers, sequences))

        expected_sequences = [
            fasta_client.FastaSequence(entity_id=entity_id,
                                       chains=chains,
                                       sequence=sequence,
                                       fasta_header=header)
            for entity_id, chains, sequence, header in zip(
                entity_ids, chain_lists, sequences, headers)
        ]

        parsed_sequences = fasta_client._parse_fasta_text_to_list(
            test_fasta_raw_text)
        self.assertEqual(parsed_sequences, expected_sequences)
60 | 


--------------------------------------------------------------------------------
/pypdb/clients/pdb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/pdb/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/pdb/pdb_client.py:
--------------------------------------------------------------------------------
 1 | """File containing logic to download PDB file entries from the RCSB Database."""
 2 | 
 3 | from enum import Enum
 4 | import gzip
 5 | from typing import Optional
 6 | import warnings
 7 | 
 8 | from pypdb.util import http_requests
 9 | 
10 | PDB_DOWNLOAD_BASE_URL = "https://files.rcsb.org/download/"
11 | 
12 | 
class PDBFileType(Enum):
    """File formats that `get_pdb_file` can request.

    Each value is used as the file extension in the download URL, except
    STRUCTFACT, for which `get_pdb_file` appends "-sf.cif" instead.
    """
    PDB = "pdb"  # Older file format.
    CIF = "cif"  # Newer file format (replacing PDB file type)
    XML = "xml"  # Another alternative representation.
    STRUCTFACT = "structfact"  # For structural factors (only populated for some entries)
18 | 
19 | 
def get_pdb_file(pdb_id: str,
                 filetype=PDBFileType.PDB,
                 compression=False) -> Optional[str]:
    '''Get the full PDB file associated with a PDB_ID

    Parameters
    ----------

    pdb_id : A 4 character string giving a pdb entry of interest

    filetype: The file type, given either as a `PDBFileType` member or as its
        string value (e.g. `PDBFileType.CIF` or `'cif'`).
        PDB is the older file format,
        CIF is the newer replacement.
        XML can also be obtained and parsed using the various xml tools included in PyPDB
        STRUCTFACT retrieves structure factors (only available for certain PDB entries)

    compression : Whether or not to request the data as a compressed (gz) version of the file
        (note that the compression is undone by this function)

    Returns
    -------

    result : string
        The string representing the full PDB file as an uncompressed string.
        (returns None if the request to RCSB failed)

    Raises
    ------

    ValueError
        If `filetype` is a string that is not the value of any `PDBFileType`.

    Examples
    --------
    >>> pdb_file = get_pdb_file('4lza', filetype='cif', compression=True)
    >>> print(pdb_file[:200])
    data_4LZA
    #
    _entry.id   4LZA
    #
    _audit_conform.dict_name       mmcif_pdbx.dic
    _audit_conform.dict_version    4.032
    _audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx

    '''

    # Accepts plain strings such as 'cif' (as used in the example above);
    # previously a string crashed on `filetype.value`.
    if isinstance(filetype, str):
        filetype = PDBFileType(filetype.lower())

    if filetype is PDBFileType.CIF and not compression:
        warnings.warn("Consider using `get_pdb_file` with compression=True "
                      "for CIF files (it makes the file download faster!)")

    # Structure factors use a "-sf.cif" suffix; every other type uses its
    # enum value as the file extension.
    if filetype is PDBFileType.STRUCTFACT:
        file_suffix = "-sf.cif"
    else:
        file_suffix = "." + filetype.value

    if compression:
        file_suffix += ".gz"

    pdb_url = PDB_DOWNLOAD_BASE_URL + pdb_id + file_suffix

    print(
        "Sending GET request to {} to fetch {}'s {} file as a string.".format(
            pdb_url, pdb_id, filetype.value))

    response = http_requests.request_limited(pdb_url)

    if response is None or not response.ok:
        warnings.warn("Retrieval failed, returning None")
        return None

    if compression:
        decompressed = gzip.decompress(response.content)
        # `gzip.decompress` yields bytes; decode so that both code paths
        # return `str`, as documented. The isinstance guard leaves values
        # that are already text (e.g. from a test double) untouched.
        if isinstance(decompressed, bytes):
            return decompressed.decode("utf-8", errors="replace")
        return decompressed
    return response.text
89 | 


--------------------------------------------------------------------------------
/pypdb/clients/pdb/pdb_client_test.py:
--------------------------------------------------------------------------------
 1 | import gzip
 2 | import unittest
 3 | from unittest import mock
 4 | 
 5 | from pypdb.clients.pdb import pdb_client
 6 | from pypdb.util import http_requests
 7 | 
 8 | 
 9 | class TestPDBFileDownloading(unittest.TestCase):
10 |     @mock.patch.object(http_requests, "request_limited", autospec=True)
11 |     def test_unsuccessful_test_returns_none(self, mock_http_requests):
12 | 
13 |         mock_return_value = mock.Mock()
14 |         mock_return_value.ok = False
15 |         mock_http_requests.return_value = mock_return_value
16 | 
17 |         self.assertIsNone(pdb_client.get_pdb_file("5TML"))
18 |         mock_http_requests.assert_called_once_with(
19 |             "https://files.rcsb.org/download/5TML.pdb")
20 | 
21 |     @mock.patch.object(http_requests, "request_limited", autospec=True)
22 |     @mock.patch.object(gzip, "decompress")
23 |     def test_compressed_cif_file(self, mock_decompress, mock_http_requests):
24 |         mock_return_value_cif = mock.Mock()
25 |         mock_return_value_cif.ok = True
26 |         mock_return_value_cif.content = "fake_compressed_cif"
27 |         mock_http_requests.return_value = mock_return_value_cif
28 |         mock_decompress.return_value = "fake_decompressed_cif"
29 | 
30 |         self.assertEqual(
31 |             "fake_decompressed_cif",
32 |             pdb_client.get_pdb_file("1A2B",
33 |                                     pdb_client.PDBFileType.CIF,
34 |                                     compression=True))
35 |         mock_http_requests.assert_called_once_with(
36 |             "https://files.rcsb.org/download/1A2B.cif.gz")
37 |         mock_decompress.assert_called_once_with("fake_compressed_cif")
38 | 
39 |     @mock.patch.object(http_requests, "request_limited", autospec=True)
40 |     def test_umcompressed_pdb(self, mock_http_requests):
41 |         mock_return_value_pdb = mock.Mock()
42 |         mock_return_value_pdb.text = "fake_uncompressed_pdb"
43 |         mock_return_value_pdb.ok = True
44 |         mock_http_requests.return_value = mock_return_value_pdb
45 | 
46 |         self.assertEqual("fake_uncompressed_pdb",
47 |                          pdb_client.get_pdb_file("1234"))
48 |         mock_http_requests.assert_called_once_with(
49 |             "https://files.rcsb.org/download/1234.pdb")
50 | 
51 |     @mock.patch.object(http_requests, "request_limited", autospec=True)
52 |     @mock.patch.object(gzip, "decompress")
53 |     def test_compressed_structfact(self, mock_decompress, mock_http_requests):
54 |         mock_return_value_pdb = mock.Mock()
55 |         mock_return_value_pdb.content = "fake_compressed_structfact"
56 |         mock_return_value_pdb.ok = True
57 |         mock_http_requests.return_value = mock_return_value_pdb
58 |         mock_decompress.return_value = "fake_decompressed_structfact"
59 | 
60 |         self.assertEqual(
61 |             "fake_decompressed_structfact",
62 |             pdb_client.get_pdb_file("HK97",
63 |                                     pdb_client.PDBFileType.STRUCTFACT,
64 |                                     compression=True))
65 |         mock_http_requests.assert_called_once_with(
66 |             "https://files.rcsb.org/download/HK97-sf.cif.gz")
67 |         mock_decompress.assert_called_once_with("fake_compressed_structfact")
68 | 
69 |     @mock.patch.object(http_requests, "request_limited", autospec=True)
70 |     def test_uncompressed_xml(self, mock_http_requests):
71 |         mock_return_value_pdb = mock.Mock()
72 |         mock_return_value_pdb.text = "fake_uncompressed_xml"
73 |         mock_return_value_pdb.ok = True
74 |         mock_http_requests.return_value = mock_return_value_pdb
75 | 
76 |         self.assertEqual(
77 |             "fake_uncompressed_xml",
78 |             pdb_client.get_pdb_file("MI17",
79 |                                     pdb_client.PDBFileType.XML,
80 |                                     compression=False))
81 |         mock_http_requests.assert_called_once_with(
82 |             "https://files.rcsb.org/download/MI17.xml")
83 | 
84 | 
# Runs the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
87 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/EXAMPLES.md:
--------------------------------------------------------------------------------
  1 | # PyPDB Text Search
  2 | 
  3 | ## Helpful Links
  4 | 
  5 | The Search logic here is a Python wrapper around the RCSB's search logic.
  6 | For in-the-weeds details on how each operator works, prefer to look at the
  7 | [RCSB Search API documentation](https://search.rcsb.org/index.html)
  8 | 
The search operators defined within the `operators` directory support querying
RCSB attributes against the appropriate search service. For example, if
you are querying the RCSB Text Search Service, all
operators within `text_operators.py` should be supported.
 13 | 
 14 | For a list of RCSB attributes associated with structures you can search, see
 15 | [RCSB's List of Structure Attributes to Search](https://search.rcsb.org/structure-search-attributes.html) and [RCSB's List of Chemical Attributes to Search](https://search.rcsb.org/chemical-search-attributes.html)
 16 | Note that not every structure will have every attribute.
 17 | 
 18 | Two querying functions are currently supported by PyPDB:
 19 | 
 20 | * `perform_search`: This function is good for simple queries
 21 | * `perform_search_with_graph`: This function allows building complicated queries using RCSB's query node syntax.
 22 | 
 23 | ## `perform_search` Examples
 24 | 
 25 | ### Search for all entries that mention the word 'ribosome'
 26 | 
 27 | ```python
 28 | from pypdb.clients.search.search_client import perform_search
 29 | from pypdb.clients.search.search_client import ReturnType
 30 | from pypdb.clients.search.operators import text_operators
 31 | 
 32 | 
 33 | search_operator = text_operators.DefaultOperator(value="ribosome")
 34 | return_type = ReturnType.ENTRY
 35 | 
results = perform_search(search_operator, return_type)
 37 | ```
 38 | 
 39 | ### Search for polymers from 'Mus musculus'
 40 | 
 41 | ```python
 42 | from pypdb.clients.search.search_client import perform_search
 43 | from pypdb.clients.search.search_client import ReturnType
 44 | from pypdb.clients.search.operators import text_operators
 45 | 
 46 | 
 47 | search_operator = text_operators.ExactMatchOperator(value="Mus musculus",
 48 |                                                     attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
 49 | return_type = ReturnType.POLYMER_ENTITY
 50 | 
 51 | results = perform_search(search_operator, return_type)
 52 | ```
 53 | 
 54 | ### Search for non-polymers from 'Mus musculus' or 'Homo sapiens'
 55 | 
 56 | ```python
 57 | from pypdb.clients.search.search_client import perform_search
 58 | from pypdb.clients.search.search_client import ReturnType
 59 | from pypdb.clients.search.operators import text_operators
 60 | 
 61 | search_operator = text_operators.InOperator(values=["Mus musculus", "Homo sapiens"],
 62 |                                             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
 63 | return_type = ReturnType.NON_POLYMER_ENTITY
 64 | 
 65 | results = perform_search(search_operator, return_type)
 66 | ```
 67 | 
 68 | ### Search for polymer instances whose titles contain "actin" or "binding" or "protein"
 69 | 
 70 | ```python
 71 | from pypdb.clients.search.search_client import perform_search
 72 | from pypdb.clients.search.search_client import ReturnType
 73 | from pypdb.clients.search.operators import text_operators
 74 | 
 75 | 
 76 | search_operator = text_operators.ContainsWordsOperator(value="actin-binding protein",
 77 |                                             attribute="struct.title")
 78 | return_type = ReturnType.POLYMER_INSTANCE
 79 | 
 80 | results = perform_search(search_operator, return_type)
 81 | ```
 82 | 
 83 | ### Search for assemblies that contain the words "actin binding protein"
 84 | 
 85 | (must be in that order).
 86 | 
 87 | For example, "actin-binding protein" and "actin binding protein" will match,
 88 | but "protein binding actin" will not.
 89 | 
 90 | ```python
 91 | from pypdb.clients.search.search_client import perform_search
 92 | from pypdb.clients.search.search_client import ReturnType
 93 | from pypdb.clients.search.operators import text_operators
 94 | 
 95 | 
 96 | search_operator = text_operators.ContainsPhraseOperator(value="actin-binding protein",
 97 |                                             attribute="struct.title")
 98 | return_type = ReturnType.ASSEMBLY
 99 | 
100 | results = perform_search(search_operator, return_type)
101 | ```
102 | 
103 | ### Search for entries released in 2019 or later
104 | 
105 | ```python
106 | from pypdb.clients.search.search_client import perform_search
107 | from pypdb.clients.search.search_client import ReturnType
108 | from pypdb.clients.search.operators import text_operators
109 | 
110 | 
111 | search_operator = text_operators.ComparisonOperator(
112 |        value="2019-01-01T00:00:00Z",
113 |        attribute="rcsb_accession_info.initial_release_date",
114 |        comparison_type=text_operators.ComparisonType.GREATER)
115 | return_type = ReturnType.ENTRY
116 | 
117 | results = perform_search(search_operator, return_type)
118 | ```
119 | 
### Search for entries released only in 2019
121 | 
122 | ```python
123 | from pypdb.clients.search.search_client import perform_search
124 | from pypdb.clients.search.search_client import ReturnType
125 | from pypdb.clients.search.operators import text_operators
126 | 
127 | 
128 | search_operator = text_operators.RangeOperator(
129 |     from_value="2019-01-01T00:00:00Z",
130 |     to_value="2020-01-01T00:00:00Z",
131 |     include_lower=True,
132 |     include_upper=False,
133 |     attribute="rcsb_accession_info.initial_release_date")
134 | return_type = ReturnType.ENTRY
135 | 
136 | results = perform_search(search_operator, return_type)
137 | ```
138 | 
139 | ### Search for structures under 4 angstroms of resolution
140 | 
141 | ```python
142 | from pypdb.clients.search.search_client import perform_search
143 | from pypdb.clients.search.search_client import ReturnType
144 | from pypdb.clients.search.operators import text_operators
145 | 
146 | 
147 | search_operator = text_operators.ComparisonOperator(
148 |            value=4,
149 |            attribute="rcsb_entry_info.resolution_combined",
150 |            comparison_type=text_operators.ComparisonType.LESS)
151 | return_type = ReturnType.ENTRY
152 | 
153 | results = perform_search(search_operator, return_type)
154 | ```
155 | 
156 | ### Search for structures with a given attribute
157 | 
158 | (Admittedly every structure has a release date, but the same logic would
159 |  apply for a more sparse RCSB attribute).
160 | 
161 | ```python
162 | from pypdb.clients.search.search_client import perform_search
163 | from pypdb.clients.search.search_client import ReturnType
164 | from pypdb.clients.search.operators import text_operators
165 | 
166 | 
167 | search_operator = text_operators.ExistsOperator(
168 |     attribute="rcsb_accession_info.initial_release_date")
169 | return_type = ReturnType.ENTRY
170 | 
171 | results = perform_search(search_operator, return_type)
172 | ```
173 | 
174 | ### Search for top 100 structures matching the given protein sequence, by date
175 | 
176 | (this sequence matches the SARS-CoV-2 NSP3 macrodomain)
177 | 
178 | ```python
179 | from pypdb.clients.search.search_client import perform_search, RequestOptions
180 | from pypdb.clients.search.search_client import ReturnType
181 | from pypdb.clients.search.operators.sequence_operators import SequenceOperator
182 | from pypdb.clients.search.operators.sequence_operators import SequenceType
183 | 
184 | results = perform_search(
185 |         return_type=ReturnType.ENTRY,
186 |     search_operator=SequenceOperator(
187 |         sequence_type=SequenceType.PROTEIN, # if not explicitly specified, this will autoresolve
188 |         sequence=(
189 |           "SMVNSFSGYLKLTDNVYIKNADIVEEAKKVKPTVVVNAANVYLKHGGGVAGALNKATNNAMQVESDDY"
190 |           "IATNGPLKVGGSCVLSGHNLAKHCLHVVGPNVNKGEDIQLLKSAYENFNQHEVLLAPLLSAGIFGADP"
191 |           "IHSLRVCVDTVRTNVYLAVFDKNLYDKLVSSFL"),
192 |         identity_cutoff=0.99,
193 |         evalue_cutoff=1000
194 |       ),
195 |     request_options=RequestOptions(
196 |         result_start_index=0,
197 |         num_results=100,
198 |         sort_by="rcsb_accession_info.initial_release_date",
199 |         desc=False
200 |       ),
201 |     return_with_scores=True
202 | )
203 | ```
204 | 
205 | ### Search for structures that match the sequence of an existing RCSB entry
206 | 
207 | ```python
208 | from pypdb.clients.fasta.fasta_client import get_fasta_from_rcsb_entry
209 | from pypdb.clients.search.search_client import perform_search
210 | from pypdb.clients.search.search_client import ReturnType
211 | from pypdb.clients.search.operators.sequence_operators import SequenceOperator
212 | 
213 | # Fetches the first sequence in the "6TML" fasta file
214 | fasta_sequence = get_fasta_from_rcsb_entry("6TML", verbosity=True)[0].sequence
215 | 
216 | # Performs sequence search ('BLAST'-like) using the FASTA sequence
217 | results = perform_search(
218 |         return_type=ReturnType.ENTRY,
219 |     search_operator=SequenceOperator(
220 |         sequence=fasta_sequence,
221 |         identity_cutoff=0.99,
222 |         evalue_cutoff=1000
223 |       ),
224 |     return_with_scores=True
225 | )
226 | ```
227 | 
228 | ## `perform_search_with_graph` Example
229 | 
### Search for 'Mus musculus' or 'Homo sapiens' structures with under 4 Angstroms of resolution
231 | 
232 | ```python
233 | from pypdb.clients.search.search_client import perform_search_with_graph
234 | from pypdb.clients.search.search_client import ReturnType
235 | from pypdb.clients.search.search_client import QueryGroup, LogicalOperator
236 | from pypdb.clients.search.operators import text_operators
237 | 
238 | # SearchOperator associated with structures with under 4 Angstroms of resolution
239 | under_4A_resolution_operator = text_operators.ComparisonOperator(
240 |        value=4,
241 |        attribute="rcsb_entry_info.resolution_combined",
       comparison_type=text_operators.ComparisonType.LESS)
243 | 
244 | # SearchOperator associated with entities containing 'Mus musculus' lineage
245 | is_mus_operator = text_operators.ExactMatchOperator(
246 |             value="Mus musculus",
247 |             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
248 | 
249 | # SearchOperator associated with entities containing 'Homo sapiens' lineage
250 | is_human_operator = text_operators.ExactMatchOperator(
251 |             value="Homo sapiens",
252 |             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
253 | 
254 | # QueryGroup associated with being either human or `Mus musculus`
255 | is_human_or_mus_group = QueryGroup(
256 |     queries = [is_mus_operator, is_human_operator],
257 |     logical_operator = LogicalOperator.OR
258 | )
259 | 
260 | # QueryGroup associated with being ((Human OR Mus) AND (Under 4 Angstroms))
261 | is_under_4A_and_human_or_mus_group = QueryGroup(
262 |     queries = [is_human_or_mus_group, under_4A_resolution_operator],
263 |     logical_operator = LogicalOperator.AND
264 | )
265 | 
266 | results = perform_search_with_graph(
267 |   query_object=is_under_4A_and_human_or_mus_group,
268 |   return_type=ReturnType.ENTRY)
269 | ```
270 | 
### Search for Calcium-Bound Calmodulin Structures
272 | 
273 | Note that "1CLL" corresponds to a Calmodulin structure bound to Ca2+.
274 | 
275 | Also, searching for `rcsb_chem_comp_container_identifiers.comp_id` with
276 | an exact match to `"CA"` yields only structures in complex with Ca2+
277 | (filtering out structures in complex with other metals like strontium).
278 | 
279 | ```python
280 | from pypdb.clients.search.search_client import perform_search_with_graph
281 | from pypdb.clients.search.search_client import ReturnType
282 | from pypdb.clients.search.search_client import QueryGroup, LogicalOperator
283 | from pypdb.clients.search.operators import text_operators, structure_operators
284 | 
285 | is_similar_to_1CLL = structure_operators.StructureOperator(
286 |     pdb_entry_id="1CLL",
287 |     assembly_id=1,
288 |     search_mode=structure_operators.StructureSearchMode.STRICT_SHAPE_MATCH
289 | )
290 | 
291 | is_in_complex_with_calcium = text_operators.ExactMatchOperator(
292 |     attribute="rcsb_chem_comp_container_identifiers.comp_id",
293 |     value="CA"
294 | )
295 | 
296 | results = perform_search_with_graph(
297 |   query_object=QueryGroup(
298 |     logical_operator=LogicalOperator.AND,
299 |     queries=[is_similar_to_1CLL, is_in_complex_with_calcium]
300 |   ),
301 |   return_type=ReturnType.ENTRY
)
```


--------------------------------------------------------------------------------
/pypdb/clients/search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/search/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/clients/search/operators/__init__.py


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/chemical_operators.py:
--------------------------------------------------------------------------------
 1 | """Search operators corresponding to Chemical search using SMILES or InChI."""
 2 | 
 3 | from dataclasses import dataclass
 4 | from enum import Enum
 5 | from typing import Any, Dict
 6 | 
 7 | 
class DescriptorMatchingCriterion(Enum):
    """Criterion describing what constitutes a chemical 'match' in RCSB search.

    For definitions of these criteria, see:
    https://search.rcsb.org/#search-services
    """
    # Each value is the string that `ChemicalOperator._to_dict` emits under
    # the "match_type" key.
    GRAPH_STRICT = "graph-strict"
    GRAPH_RELAXED = "graph-relaxed"
    GRAPH_RELAXED_STEREO = "graph-relaxed-stereo"
    FINGERPRINT_SIMILARITY = "fingerprint-similarity"
18 | 
19 | 
@dataclass
class ChemicalOperator:
    """Operator for chemical searches driven by a SMILES or InChI descriptor."""
    # The SMILES or InChI string to match against
    descriptor: str
    # How a match is judged; defaults to the "graph-strict" criterion
    matching_criterion: DescriptorMatchingCriterion = DescriptorMatchingCriterion.GRAPH_STRICT

    def __post_init__(self):
        """Sets `descriptor_type` to "InChI" or "SMILES" from the descriptor prefix."""
        # InChI strings start with "InChI="; anything else is treated as SMILES.
        is_inchi = self.descriptor.startswith("InChI=")
        self.descriptor_type = "InChI" if is_inchi else "SMILES"

    def _to_dict(self) -> Dict[str, Any]:
        """Returns this operator as a plain dictionary of its search parameters."""
        return dict(
            value=self.descriptor,
            type="descriptor",
            descriptor_type=self.descriptor_type,
            match_type=self.matching_criterion.value,
        )
44 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/chemical_operators_test.py:
--------------------------------------------------------------------------------
"""Tests for RCSB Chemical Search Service Operators."""
 2 | 
 3 | import unittest
 4 | 
 5 | from pypdb.clients.search.operators import chemical_operators
 6 | 
 7 | 
 8 | class TestChemicalOperators(unittest.TestCase):
 9 |     def test_chemical_operator_to_dict(self):
10 |         # InChI
11 |         inchi_operator = chemical_operators.ChemicalOperator(
12 |             # Panadol
13 |             descriptor=
14 |             "InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)",
15 |             matching_criterion=chemical_operators.DescriptorMatchingCriterion.
16 |             GRAPH_RELAXED_STEREO)
17 |         self.assertEqual(inchi_operator.descriptor_type, "InChI")
18 |         self.assertEqual(
19 |             inchi_operator._to_dict(), {
20 |                 "value":
21 |                 "InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)",
22 |                 "type": "descriptor",
23 |                 "descriptor_type": "InChI",
24 |                 "match_type": "graph-relaxed-stereo"
25 |             })
26 | 
27 |         # SMILES
28 |         smiles_operator = chemical_operators.ChemicalOperator(
29 |             descriptor=
30 |             "CC(C)C[C@H](NC(=O)OCC1CCC(F)(F)CC1)C(=O)N[C@@H](C[C@@H]2CCNC2=O)[C@@H](O)[S](O)(=O)=O"
31 |         )
32 |         self.assertEqual(
33 |             smiles_operator._to_dict(), {
34 |                 "value":
35 |                 "CC(C)C[C@H](NC(=O)OCC1CCC(F)(F)CC1)C(=O)N[C@@H](C[C@@H]2CCNC2=O)[C@@H](O)[S](O)(=O)=O",
36 |                 "type": "descriptor",
37 |                 "descriptor_type": "SMILES",
38 |                 "match_type": "graph-strict"
39 |             })
40 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/seqmotif_operators.py:
--------------------------------------------------------------------------------
 1 | """Operators associated with SeqMotif searching using RCSB Search API."""
 2 | 
 3 | from dataclasses import dataclass
 4 | from enum import Enum
 5 | from typing import Any, Dict
 6 | 
 7 | 
class SequenceType(Enum):
    """Type of sequence being searched for motifs.

    Each member's value is the string `SeqMotifOperator._to_dict` emits under
    the "target" key.
    """
    DNA = "pdb_dna_sequence"
    RNA = "pdb_rna_sequence"
    PROTEIN = "pdb_protein_sequence"
13 | 
14 | 
class PatternType(Enum):
    """Type of pattern being used for SeqMotif search.

    Each member's value is the string `SeqMotifOperator._to_dict` emits under
    the "pattern_type" key.
    """
    SIMPLE = "simple"
    PROSITE = "prosite"
    REGEX = "regex"
20 | 
21 | 
@dataclass
class SeqMotifOperator:
    """Search operator describing a sequence-motif (SeqMotif) query."""
    # Motif pattern to search with, written in the syntax named by
    # `pattern_type`.
    pattern: str
    # Kind of sequence the pattern is matched against.
    sequence_type: SequenceType
    # Syntax the pattern is written in.
    pattern_type: PatternType

    def _to_dict(self) -> Dict[str, Any]:
        """Returns the query parameters for this operator as a dict."""
        parameters: Dict[str, Any] = {}
        parameters["value"] = self.pattern
        parameters["pattern_type"] = self.pattern_type.value
        parameters["target"] = self.sequence_type.value
        return parameters
35 | 
36 | 
37 | # DO NOT APPROVE: DO NOT APPROVE THIS CL UNTIL ADDED TO VALIDATION
38 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/seqmotif_operators_test.py:
--------------------------------------------------------------------------------
 1 | """Tests for RCSB SeqMotif Search Service Operators."""
 2 | 
 3 | import unittest
 4 | 
 5 | from pypdb.clients.search.operators import seqmotif_operators
 6 | 
 7 | 
 8 | class TestSeqMotifOperators(unittest.TestCase):
 9 |     def test_seqmotif_operator_to_dict(self):
10 |         seqmotif_operator = seqmotif_operators.SeqMotifOperator(
11 |             pattern_type=seqmotif_operators.PatternType.PROSITE,
12 |             sequence_type=seqmotif_operators.SequenceType.PROTEIN,
13 |             pattern="C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H.")
14 | 
15 |         self.assertEqual(
16 |             seqmotif_operator._to_dict(), {
17 |                 "value": "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H.",
18 |                 "pattern_type": "prosite",
19 |                 "target": "pdb_protein_sequence"
20 |             })
21 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/sequence_operators.py:
--------------------------------------------------------------------------------
 1 | """Search operator for searching sequences using MMseqs2 (BLAST-like)."""
 2 | from dataclasses import dataclass
 3 | from enum import Enum
 4 | from typing import Any, Dict, Optional, Union
 5 | 
 6 | 
class SequenceType(Enum):
    """Type of sequence being searched.

    Each member's value is the string `SequenceOperator._to_dict` emits under
    the "target" key.
    """
    DNA = "pdb_dna_sequence"
    RNA = "pdb_rna_sequence"
    PROTEIN = "pdb_protein_sequence"
12 | 
13 | 
class CannotAutoresolveSequenceTypeError(Exception):
    """Raised when a sequence is ambiguous as to its `SequenceType`.

    `SequenceOperator` raises this when it is constructed without a
    `sequence_type` and the letters of the sequence match none of its
    DNA / RNA / protein rules.
    """
16 | 
17 | 
@dataclass
class SequenceOperator:
    """Search operator for sequence-similarity (MMseqs2, BLAST-like) queries.

    If `sequence_type` is not supplied, it is inferred from the letters of
    `sequence` when the instance is created.

    Raises:
        CannotAutoresolveSequenceTypeError: if `sequence_type` is omitted and
            the sequence cannot be classified as DNA, RNA or protein.
    """
    # Sequence to search with.
    sequence: str
    # If the sequence type is not specified, tries to autoresolve the type from
    # the sequence itself
    sequence_type: Optional[SequenceType] = None
    # Maximum E Value allowed for results
    # (see: https://www.ncbi.nlm.nih.gov/BLAST/tutorial/Altschul-1.html)
    evalue_cutoff: float = 100
    # Minimum identity cutoff allowed for results
    # (see: https://www.ncbi.nlm.nih.gov/books/NBK62051/def-item/identity/)
    identity_cutoff: float = 0.95

    def __post_init__(self):
        """Infers `sequence_type` when the caller did not provide one."""
        if self.sequence_type is None:
            self._autoresolve_sequence_type()

    def _autoresolve_sequence_type(self):
        """Sets `sequence_type` from the set of letters used in `sequence`.

        A sequence is DNA if it uses only A/T/C/G and contains "T", RNA if it
        uses only A/U/C/G and contains "U", and protein if it uses only
        protein letters and contains at least one letter from the protein
        fingerprint set below. Anything else (e.g. "AAAAAAAA" or an empty
        string) is ambiguous.

        Raises:
            CannotAutoresolveSequenceTypeError: if no rule matches.
        """
        unique_letters = set(self.sequence)

        dna_letter_set = set("ATCG")
        rna_letter_set = set("AUCG")
        protein_letter_set = set("ABCDEFGHIKLMNPQRSTVWXYZ")
        # Protein letters other than A, C, G and T; at least one of these is
        # required before a sequence is classified as protein.
        protein_fingerprint_set = set("BDEFHIKLMNPQRSVWXYZ")

        if unique_letters.issubset(dna_letter_set) and "T" in unique_letters:
            self.sequence_type = SequenceType.DNA
        elif unique_letters.issubset(rna_letter_set) and "U" in unique_letters:
            self.sequence_type = SequenceType.RNA
        elif (unique_letters.issubset(protein_letter_set)
              and protein_fingerprint_set & unique_letters):
            self.sequence_type = SequenceType.PROTEIN
        else:
            raise CannotAutoresolveSequenceTypeError(
                "Sequence is ambiguous as to its SequenceType: `{}`".format(
                    self.sequence))

    def _to_dict(self) -> Dict[str, Any]:
        """Returns the query parameters for this operator as a dict."""
        return {
            "evalue_cutoff": self.evalue_cutoff,
            "identity_cutoff": self.identity_cutoff,
            "target": self.sequence_type.value,  # type: ignore
            "value": self.sequence
        }
63 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/sequence_operators_test.py:
--------------------------------------------------------------------------------
"""Tests for RCSB Sequence Search Service Operators
 2 | (admittedly, a lot is tested in `search_client_test.py` too)
 3 | """
 4 | 
 5 | import pytest
 6 | import unittest
 7 | 
 8 | from pypdb.clients.search.operators import sequence_operators
 9 | 
10 | 
class TestSequenceOperators(unittest.TestCase):
    """Checks `SequenceOperator` serialization and type auto-resolution."""

    def test_sequence_operator(self):
        """An explicitly typed RNA query serializes all four parameters."""
        rna_sequence = "AUGAUUCGGCGCUAAAAAAAA"
        search_operator = sequence_operators.SequenceOperator(
            sequence=rna_sequence,
            sequence_type=sequence_operators.SequenceType.RNA,
            evalue_cutoff=100,
            identity_cutoff=0.95)

        expected_parameters = {
            "evalue_cutoff": 100,
            "identity_cutoff": 0.95,
            "target": "pdb_rna_sequence",
            "value": rna_sequence,
        }
        self.assertEqual(search_operator._to_dict(), expected_parameters)

    def test_autoresolve_sequence_type(self):
        """DNA, RNA and protein are inferred; an all-"A" sequence is rejected."""
        resolvable_cases = (
            ("ATGGGGTAA", sequence_operators.SequenceType.DNA),
            ("AUGGGGCCCUAA", sequence_operators.SequenceType.RNA),
            ("MAETREGGQSGAAS", sequence_operators.SequenceType.PROTEIN),
        )
        for sequence, expected_type in resolvable_cases:
            resolved_type = sequence_operators.SequenceOperator(
                sequence).sequence_type
            self.assertEqual(resolved_type, expected_type)

        ambiguity_error = sequence_operators.CannotAutoresolveSequenceTypeError
        with pytest.raises(ambiguity_error):
            sequence_operators.SequenceOperator("AAAAAAAA")
41 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/structure_operators.py:
--------------------------------------------------------------------------------
 1 | """Operators associated with RCSB structural search."""
 2 | 
 3 | from dataclasses import dataclass
 4 | from enum import Enum
 5 | from typing import Any, Dict
 6 | 
 7 | 
class StructureSearchMode(Enum):
    """Mode to search structures with. See:
    https://github.com/biocryst/biozernike/

    Each member's value is the string `StructureOperator._to_dict` emits
    under the "operator" key.
    """
    STRICT_SHAPE_MATCH = "strict_shape_match"
    RELAXED_SHAPE_MATCH = "relaxed_shape_match"
14 | 
15 | 
@dataclass
class StructureOperator:
    """Operator for 3D structural-similarity search, using:
    https://github.com/biocryst/biozernike/

    Returns structures similar in 3D shape to the referenced one, using
    default search options.
    """
    # Entry id and assembly number of the structure to use as the search
    # reference (results will show other PDB entities with similar 3D
    # structures).
    pdb_entry_id: str
    assembly_id: int = 1
    # How strictly shapes have to match.
    search_mode: StructureSearchMode = StructureSearchMode.STRICT_SHAPE_MATCH

    def _to_dict(self) -> Dict[str, Any]:
        """Returns the query parameters for this operator as a dict."""
        # The assembly id is serialized as a string, not an int.
        structure_reference = {
            "entry_id": self.pdb_entry_id,
            "assembly_id": str(self.assembly_id)
        }
        shape_match_mode = self.search_mode.value
        return {"value": structure_reference, "operator": shape_match_mode}
38 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/structure_operators_test.py:
--------------------------------------------------------------------------------
 1 | """Tests for Structural searches against RCSB Search API."""
 2 | 
 3 | import unittest
 4 | 
 5 | from pypdb.clients.search.operators import structure_operators
 6 | 
 7 | 
 8 | class TestStructureOperators(unittest.TestCase):
 9 |     def test_not_equals_operator(self):
10 |         structure_operator = structure_operators.StructureOperator(
11 |             pdb_entry_id="HK97",
12 |             assembly_id=4,
13 |             search_mode=structure_operators.StructureSearchMode.
14 |             STRICT_SHAPE_MATCH)
15 | 
16 |         self.assertEqual(
17 |             structure_operator._to_dict(), {
18 |                 "value": {
19 |                     "entry_id": "HK97",
20 |                     "assembly_id": "4"
21 |                 },
22 |                 "operator": "strict_shape_match"
23 |             })
24 | 
25 |         structure_operator_two = structure_operators.StructureOperator(
26 |             pdb_entry_id="CP77",
27 |             assembly_id=7,
28 |             search_mode=structure_operators.StructureSearchMode.
29 |             RELAXED_SHAPE_MATCH)
30 | 
31 |         self.assertEqual(
32 |             structure_operator_two._to_dict(), {
33 |                 "value": {
34 |                     "entry_id": "CP77",
35 |                     "assembly_id": "7"
36 |                 },
37 |                 "operator": "relaxed_shape_match"
38 |             })
39 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/text_operators.py:
--------------------------------------------------------------------------------
  1 | """Implementation of SearchOperators for text queries against RCSB API."""
  2 | from dataclasses import dataclass
  3 | from enum import Enum
  4 | from typing import Any, Dict, Union, List
  5 | 
  6 | # --- Implementations of RCSB Queries for each SearchOperators ---
  7 | # See: https://search.rcsb.org/index.html#search-operators for details
  8 | 
  9 | # For information on available RCSB search attributes, see:
 10 | # https://search.rcsb.org/search-attributes.html
 11 | 
 12 | 
 13 | @dataclass
 14 | class DefaultOperator:
 15 |     """Default search operator; searches across available fields search,
 16 |     and returns a hit if a match happens in any field."""
 17 |     value: str
 18 | 
 19 |     def _to_dict(self) -> Dict[str, str]:
 20 |         return {"value": self.value}
 21 | 
 22 | 
 23 | @dataclass
 24 | class ExactMatchOperator:
 25 |     """Exact match operator indicates that the input value should match a field
 26 |     value exactly (including whitespaces, special characters and case)."""
 27 |     attribute: str
 28 |     value: Any
 29 | 
 30 |     def _to_dict(self) -> Dict[str, Any]:
 31 |         return {
 32 |             "attribute": self.attribute,
 33 |             "operator": "exact_match",
 34 |             "value": self.value
 35 |         }
 36 | 
 37 | 
 38 | @dataclass
 39 | class InOperator:
 40 |     """The in operator allows you to specify multiple values in a single search
 41 |     expression. It returns results if any value in a list of input values
 42 |     matches. It can be used instead of multiple OR conditions."""
 43 |     attribute: str
 44 |     values: List[Any]  # List of strings, numbers or date strings
 45 | 
 46 |     def _to_dict(self) -> Dict[str, Any]:
 47 |         return {
 48 |             "attribute": self.attribute,
 49 |             "operator": "in",
 50 |             "value": self.values
 51 |         }
 52 | 
 53 | 
 54 | @dataclass
 55 | class ContainsWordsOperator:
 56 |     """Searches attribute field to check if any words within `value` are found.
 57 | 
 58 |     For example, "actin-binding protein" will return results containing
 59 |     "actin" OR "binding" OR "protein" within the attribute.
 60 |     """
 61 |     attribute: str
 62 |     value: str
 63 | 
 64 |     def _to_dict(self) -> Dict[str, str]:
 65 |         return {
 66 |             "attribute": self.attribute,
 67 |             "operator": "contains_words",
 68 |             "value": self.value
 69 |         }
 70 | 
 71 | 
 72 | @dataclass
 73 | class ContainsPhraseOperator:
 74 |     """Searches attribute, and returns hits if-and-only-if all words in the
 75 |     value are in the attribute field, in that order.
 76 | 
 77 |     For example, "actin-binding protein" will be interpreted as
 78 |     "actin" AND "binding" AND "protein" occurring in a given order."""
 79 |     attribute: str
 80 |     value: str
 81 | 
 82 |     def _to_dict(self) -> Dict[str, str]:
 83 |         return {
 84 |             "attribute": self.attribute,
 85 |             "operator": "contains_phrase",
 86 |             "value": self.value
 87 |         }
 88 | 
 89 | 
class ComparisonType(Enum):
    """Kind of comparison performed by `ComparisonOperator`.

    Member values are emitted under the "operator" key by
    `ComparisonOperator._to_dict`, except for `NOT_EQUAL`, which that method
    serializes as "equals" with "negation" set to True.
    """
    GREATER = "greater"
    GREATER_OR_EQUAL = "greater_or_equal"
    EQUAL = "equals"
    NOT_EQUAL = "not_equal"
    LESS_OR_EQUAL = "less_or_equal"
    LESS = "less"
 97 | 
 98 | 
 99 | # TODO(lacoperon): Add support for initializing this, and RangeOperator, from
100 | #                  datetime.datetime objects for ease of use.
101 | 
102 | 
@dataclass
class ComparisonOperator:
    """Operator matching when comparing the attribute to `value` holds.

    For example, to get structures released after a certain date:

    ```
    date_filter_operator = ComparisonOperator(
          value="2019-01-01T00:00:00Z",
          attribute="rcsb_accession_info.initial_release_date",
          comparison_type=ComparisonType.GREATER)
    ```
    """

    # Name of the RCSB attribute to compare.
    attribute: str
    # Value the attribute is compared against.
    value: Any
    # Which comparison to perform.
    comparison_type: ComparisonType

    def _to_dict(self) -> Dict[str, Any]:
        """Returns the query parameters for this operator as a dict.

        `ComparisonType.NOT_EQUAL` is expressed as a negated "equals"; every
        other comparison type is sent under its own value.
        """
        is_negated_equality = self.comparison_type is ComparisonType.NOT_EQUAL
        if is_negated_equality:
            comparison_part = {"operator": "equals", "negation": True}
        else:
            comparison_part = {"operator": self.comparison_type.value}

        return {
            **comparison_part,
            "attribute": self.attribute,
            "value": self.value,
        }
133 | 
134 | 
135 | # @dataclass
136 | # class RangeOperator:
137 | #     """Returns results with attributes within range."""
138 | #     attribute: str
139 | #     from_value: Any
140 | #     to_value: Any
141 | #     include_lower: bool = True  # Default inclusive
142 | #     include_upper: bool = True  # Default inclusive
143 | 
144 | #     def _to_dict(self) -> Dict[str, Any]:
145 | #         return {
146 | #             "operator": "range",
147 | #             "attribute": self.attribute,
148 | #             "value": {
149 | #                 "from": self.from_value,
150 | #                 "to": self.to_value,
151 | #                 "include_lower": self.include_lower,
152 | #                 "include_upper": self.include_upper
153 | #             }
154 | #         }
155 | 
156 | 
@dataclass
class RangeOperator:
    """Returns results whose attribute lies within a range.

    `include_lower` / `include_upper` control whether the `from_value` and
    `to_value` bounds themselves count as matches; both default to inclusive.
    Setting `negation` sends the query with "negation" set to True.
    """
    # Name of the RCSB attribute to test.
    attribute: str
    # Lower and upper bounds of the range.
    from_value: Any
    to_value: Any
    include_lower: bool = True  # Default inclusive
    include_upper: bool = True  # Default inclusive
    negation: bool = False

    def _to_dict(self) -> Dict[str, Any]:
        """Returns the query parameters for this operator as a dict."""
        return {
            "operator": "range",
            "attribute": self.attribute,
            "negation": self.negation,
            "value": {
                "from": self.from_value,
                "to": self.to_value,
                # Forward the inclusivity flags: without them the fields
                # above would be accepted but silently ignored.
                "include_lower": self.include_lower,
                "include_upper": self.include_upper,
            },
        }
175 | 
176 | 
@dataclass
class ExistsOperator:
    """Operator built from an attribute name alone, serialized with the
    "exists" operator."""
    # Name of the RCSB attribute to test.
    attribute: str

    def _to_dict(self) -> Dict[str, str]:
        """Returns the query parameters for this operator as a dict."""
        parameters = {"operator": "exists"}
        parameters["attribute"] = self.attribute
        return parameters
183 | 
184 | 
# An object of type `TextSearchOperator` can be any of the following classes:
TextSearchOperator = Union[DefaultOperator, ExactMatchOperator, InOperator,
                           ContainsWordsOperator, ContainsPhraseOperator,
                           ComparisonOperator, RangeOperator, ExistsOperator]

# List of all TextSearchOperator-associated classes, for backwards
# compatibility in terms of checking SearchOperator validity at runtime
# (please change this when you change the `Union` definition, so the two
# stay in sync)
TEXT_SEARCH_OPERATORS = [
    DefaultOperator, ExactMatchOperator, InOperator, ContainsWordsOperator,
    ContainsPhraseOperator, ComparisonOperator, RangeOperator, ExistsOperator
]
197 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/operators/text_operators_test.py:
--------------------------------------------------------------------------------
 1 | """Tests for RCSB Text Search Service Operators
 2 | (admittedly, a lot is tested in `search_client_test.py` too)
 3 | """
 4 | 
 5 | import unittest
 6 | 
 7 | from pypdb.clients.search.operators import text_operators
 8 | 
 9 | 
class TestTextOperators(unittest.TestCase):
    """Checks the request parameters built by text search operators."""

    def test_not_equals_operator(self):
        """NOT_EQUAL is serialized as a negated "equals" comparison."""
        attribute_name = "struct.favourite_marvel_movie"
        movie_title = "Thor: Ragnarok"
        not_equals_operator = text_operators.ComparisonOperator(
            attribute=attribute_name,
            value=movie_title,
            comparison_type=text_operators.ComparisonType.NOT_EQUAL)

        expected_parameters = {
            "attribute": attribute_name,
            "value": movie_title,
            "operator": "equals",
            "negation": True
        }
        self.assertEqual(not_equals_operator._to_dict(), expected_parameters)
24 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/search_client.py:
--------------------------------------------------------------------------------
  1 | """Barebones Python API Wrapper implementation around RCSB Search API.
  2 | 
  3 | This file contains Python dataclasses that formalize the API within native
  4 | Python objects. This aims to completely implement the Search API in Python.
  5 | 
  6 | For RCSB API docs, see: https://search.rcsb.org/index.html
  7 | """
  8 | 
  9 | # TODO(lacoperon): Implement request options
 10 | 
 11 | from dataclasses import dataclass
 12 | from enum import Enum
 13 | import json
 14 | import requests
 15 | from typing import Any, Dict, List, Optional, Union
 16 | import warnings
 17 | 
 18 | from pypdb.clients.search.operators import sequence_operators
 19 | from pypdb.clients.search.operators import text_operators
 20 | from pypdb.clients.search.operators.chemical_operators import ChemicalOperator
 21 | from pypdb.clients.search.operators.seqmotif_operators import SeqMotifOperator
 22 | from pypdb.clients.search.operators.sequence_operators import SequenceOperator
 23 | from pypdb.clients.search.operators.structure_operators import StructureOperator
 24 | from pypdb.clients.search.operators.text_operators import TextSearchOperator
 25 | 
 26 | SEARCH_URL_ENDPOINT: str = "https://search.rcsb.org/rcsbsearch/v2/query"
 27 | """SearchOperators correspond to individual search operations.
 28 | 
 29 | These can be used to search on their own using `perform_search`, or they can be
 30 | aggregated together into a `QueryGroup` to search using multiple operators at
 31 | once using `perform_search_with_graph`.
 32 | """
 33 | SearchOperator = Union[TextSearchOperator, SequenceOperator, StructureOperator,
 34 |                        SeqMotifOperator]
 35 | 
 36 | 
class LogicalOperator(Enum):
    """Operation used to combine `QueryGroup` results."""
    # The member's value is emitted by `QueryGroup._to_dict` under the
    # "logical_operator" key.
    AND = "and"
    OR = "or"
 41 | 
 42 | 
 43 | @dataclass
 44 | class QueryGroup:
 45 |     """Group of search operators against RCSB Search API,
 46 |     whose independent results are aggregated with `logical_operator`.
 47 | 
 48 |     For example, for searches with `query_nodes=[n1,n2,n3]`,
 49 |     and `logical_operator=LogicalOperator.AND`, results will only be
 50 |     returned for hits that match all of n1, n2 and n3's queries.
 51 | 
 52 |     `logical_operator=LogicalOperator.OR` would return results that match any
 53 |     of n1, n2 or n3's queries.
 54 |     """
 55 |     # Elements within the list of `queries` can either be `SearchOperator`
 56 |     # instances (corresponding to individual queries)
 57 |     # or `QueryGroup` instances (corresponding to groups of queries).
 58 |     #
 59 |     # This allows building arbitrarily complex query logic in the search tree.
 60 |     queries: List[Union[SearchOperator, "QueryGroup"]]
 61 | 
 62 |     # Boolean to aggregate the results of `queries`.
 63 |     logical_operator: LogicalOperator
 64 | 
 65 |     def _to_dict(self):
 66 |         return {
 67 |             "type":
 68 |             "group",
 69 |             "logical_operator":
 70 |             self.logical_operator.value,
 71 |             "nodes": [
 72 |                 _QueryNode(query)._to_dict()
 73 |                 if type(query) is not QueryGroup else query._to_dict()
 74 |                 for query in self.queries
 75 |             ]
 76 |         }
 77 | 
 78 | 
class ReturnType(Enum):
    """Type of RCSB entity that search results are returned as.

    For details, see: https://search.rcsb.org/index.html#return-type
    """
    # The member's value is sent as the "return_type" field of the query.
    ENTRY = "entry"
    ASSEMBLY = "assembly"
    POLYMER_ENTITY = "polymer_entity"
    NON_POLYMER_ENTITY = "non_polymer_entity"
    POLYMER_INSTANCE = "polymer_instance"
 86 | 
 87 | 
 88 | @dataclass
 89 | class RequestOptions:
 90 |     """Options to configure which results are returned, and in what order."""
 91 |     # Returns `num_results` results starting at`result_start_index` (pagination)
 92 |     # If these indices are not defined, defaults to return all results.
 93 |     # (returning all results can be slow for compute-intensive searches)
 94 |     result_start_index: Optional[int] = None
 95 |     num_results: Optional[int] = None
 96 |     # What attribute to sort by.
 97 |     # This should either be "score"  (to sort by score),
 98 |     # or a valid RCSB attribute value
 99 |     # (e.g. "rcsb_accession_info.initial_release_date")
100 |     sort_by: Optional[str] = "score"
101 |     # Whether to sort by score ascending, or score descending
102 |     desc: Optional[bool] = True
103 | 
104 |     def _to_dict(self):
105 |         result_dict = {}
106 |         if self.result_start_index != None and self.num_results != None:
107 |             result_dict["paginate"] = {
108 |                 "start": self.result_start_index,
109 |                 "rows": self.num_results
110 |             }
111 | 
112 |         if self.sort_by != None and self.desc != None:
113 |             result_dict["sort"] = [{
114 |                 "sort_by": self.sort_by,
115 |                 "direction": "desc" if self.desc else "asc"
116 |             }]
117 | 
118 |         return result_dict
119 | 
120 | 
@dataclass
class ScoredResult:
    """A search hit paired with its score.

    `perform_search_with_graph` builds these from each hit's "identifier" and
    "score" fields when called with `return_with_scores=True`.
    """
    entity_id: str  # PDB Entity ID (e.g. 5JUP for the entry return type)
    score: float
125 | 
126 | 
# Decoded JSON body of a search response, as returned by the search functions
# when `return_raw_json_dict=True`.
RawJSONDictResponse = Dict[str, Any]
128 | 
129 | 
def perform_search(
    search_operator: SearchOperator,
    return_type: ReturnType = ReturnType.ENTRY,
    request_options: Optional[RequestOptions] = None,
    return_with_scores: bool = False,
    return_raw_json_dict: bool = False,
    verbosity: bool = True,
) -> Union[List[str], List[ScoredResult], RawJSONDictResponse]:
    """Performs search specified by `search_operator`.
    Returns entity strings of type `return_type` that match the resulting hits.

    Strictly a subset of the functionality exposed in
    `perform_search_with_graph`, this function does not support searching on
    multiple conditions at once.

    If you require this functionality, please use `perform_search_with_graph`
    instead.

    Args:
        search_operator: Parameters defining the search condition.
        return_type: What type of RCSB entity to return.
        request_options: Object containing information for result pagination
            and sorting functionality.
        return_with_scores: Whether or not to return the entity results with
            their associated scores. For example, you might want to do this to
            get the top X hits that are similar to a certain protein sequence.
            (if this is true, returns List[ScoredResult] instead of List[str])
        return_raw_json_dict: If True, this function returns the raw JSON
            response from RCSB, instead of a list of entity ids.
        verbosity: Print out the search query to the console (default: True)

    Returns:
        List of entity ids, corresponding to entities that match the given
        query.

        If `return_with_scores=True`, returns a list of ScoredResult instead.
        If `return_raw_json_dict=True`, returns the raw JSON response from RCSB.

    Example usage to search for PDB entries that are from 'Mus musculus':
    ```
    from pypdb.clients.search.search_client import perform_search
    from pypdb.clients.search.search_client import ReturnType
    from pypdb.clients.search.operators.text_operators import ExactMatchOperator
    pdb_ids = perform_search(
               search_operator=ExactMatchOperator(
                 attribute="rcsb_entity_source_organism.taxonomy_lineage.name",
                 value="Mus musculus"
               ),
               return_type=ReturnType.ENTRY)
    print(pdb_ids)
    ```
    """

    # Thin wrapper: every argument is forwarded unchanged.
    return perform_search_with_graph(query_object=search_operator,
                                     return_type=return_type,
                                     request_options=request_options,
                                     return_with_scores=return_with_scores,
                                     return_raw_json_dict=return_raw_json_dict,
                                     verbosity=verbosity)
191 | 
192 | 
# Every class that counts as a single search operator. Objects of these types
# are wrapped in a `_QueryNode` by `perform_search_with_graph`.
# `ChemicalOperator` is included so that a top-level chemical query is wrapped
# like the others (`_infer_search_service` already supports it).
_SEARCH_OPERATORS = text_operators.TEXT_SEARCH_OPERATORS + [
    SequenceOperator, StructureOperator, SeqMotifOperator, ChemicalOperator
]
196 | 
197 | 
def perform_search_with_graph(
    query_object: Union[SearchOperator, QueryGroup],
    return_type: ReturnType = ReturnType.ENTRY,
    request_options: Optional[RequestOptions] = None,
    return_with_scores: bool = False,
    return_raw_json_dict: bool = False,
    verbosity: bool = True,
) -> Union[List[str], RawJSONDictResponse, List[ScoredResult]]:
    """Performs specified search using RCSB's search node logic.

    Essentially, this allows you to ask multiple questions in one RCSB query.

    For example, you can ask for structures that satisfy all of the following
    conditions at once:
        * Are either from Mus musculus or from Homo sapiens lineage
        * Are both under 4 angstroms of resolution, and published after 2019
        * Are labelled as "actin-binding protein" OR
            contain "actin" AND "calmodulin" in their titles.

    See https://search.rcsb.org/index.html#building-search-request under
    "Terminal node" and "Group node" for more details.

    Args:
        query_object: Fully-specified SearchOperator or QueryGroup
            object corresponding to the desired search.
        return_type: Type of entities to return.
        request_options: Pagination and sorting options. If None, the request
            is sent with `return_all_hits` set to True instead.
        return_with_scores: Whether or not to return the entity results with
            their associated scores. For example, you might want to do this to
            get the top X hits that are similar to a certain protein sequence.
        return_raw_json_dict: Whether to return raw JSON response.
            (for example, to analyze the scores of various matches)
        verbosity: Print out the search query to the console (default: True)

    Returns:
        List of strings, corresponding to hits in the database. Will be of the
        format specified by the `return_type`.

        If `return_with_scores=True`, returns a list of ScoredResult instead.
        If `return_raw_json_dict=True`, returns the raw JSON response from RCSB.

        If the service answers with HTTP 204 (no content), an empty list is
        returned (or an empty dict when `return_raw_json_dict=True`).

    Raises:
        requests.HTTPError: If the service responds with an error status.
    """

    # A bare search operator has to be wrapped in a terminal node; anything
    # else (e.g. a `QueryGroup`) is expected to serialize itself.
    # `ChemicalOperator` is checked explicitly so that a top-level chemical
    # query is wrapped too.
    is_single_operator = (type(query_object) in _SEARCH_OPERATORS
                          or isinstance(query_object, ChemicalOperator))
    if is_single_operator:
        cast_query_object = _QueryNode(query_object)  # type: ignore
    else:
        cast_query_object = query_object  # type: ignore

    if request_options is not None:
        request_options_dict = request_options._to_dict()
    else:
        request_options_dict = {'return_all_hits': True}

    rcsb_query_dict = {
        "query": cast_query_object._to_dict(),
        "request_options": request_options_dict,
        "return_type": return_type.value
    }
    # Serialize once; the same payload is printed and sent.
    rcsb_query_json = json.dumps(rcsb_query_dict)

    if verbosity:
        print("Querying RCSB Search using the following parameters:\n %s \n" %
              rcsb_query_json)

    response = requests.post(url=SEARCH_URL_ENDPOINT,
                             data=rcsb_query_json,
                             headers={"Content-Type": "application/json"})

    # If your search queries are failing here, it could be that your attribute
    # doesn't support the SearchOperator you're using.
    # See: https://search.rcsb.org/search-attributes.html
    if not response.ok:
        warnings.warn("It appears request failed with:" + response.text)
        response.raise_for_status()

    # A successful query without any hits is answered with an empty
    # "204 No Content" body, which cannot be parsed as JSON.
    if response.status_code == 204:
        return {} if return_raw_json_dict else []

    response_dict = response.json()

    # If specified, returns raw JSON response from RCSB as Dict
    # (rather than entity IDs as a string list)
    if return_raw_json_dict:
        return response_dict

    # Converts RCSB result to list of identifiers corresponding to
    # the `return_type`. Annotated with score if `return_with_scores`.
    query_hits = response_dict["result_set"]
    if return_with_scores:
        return [
            ScoredResult(entity_id=query_hit["identifier"],
                         score=query_hit["score"])
            for query_hit in query_hits
        ]
    return [query_hit["identifier"] for query_hit in query_hits]
288 | 
289 | 
class SearchService(Enum):
    """Which type of field is being searched.

    Auto-inferred from search operator (see `_infer_search_service`); the
    member's value is emitted under the "service" key of a terminal node."""
    # Used for `DefaultOperator`, which carries no attribute.
    BASIC_SEARCH = "full_text"
    # Used for every other text search operator.
    TEXT = "text"
    SEQUENCE = "sequence"
    SEQMOTIF = "seqmotif"
    STRUCTURE = "structure"
    CHEMICAL = "chemical"
300 | 
301 | 
class CannotInferSearchServiceException(Exception):
    """Signals that no RCSB Search API service matches a search operator.

    `_infer_search_service` raises this when the operator's type is not one
    of the operator classes it knows how to route.
    """
304 | 
305 | 
def _infer_search_service(search_operator: SearchOperator) -> SearchService:
    """Pick the RCSB search service that handles `search_operator`.

    Args:
        search_operator: operator instance describing a single query.

    Returns:
        The `SearchService` member associated with the operator's type.

    Raises:
        CannotInferSearchServiceException: if the operator's type matches
            none of the known operator classes.
    """
    # Only this first check uses `isinstance`, so subclasses of
    # `DefaultOperator` are routed to the full-text service as well.
    if isinstance(search_operator, text_operators.DefaultOperator):
        return SearchService.BASIC_SEARCH

    # All remaining checks compare the exact type (subclasses do not match).
    operator_type = type(search_operator)
    if operator_type in text_operators.TEXT_SEARCH_OPERATORS:
        return SearchService.TEXT
    if operator_type is SequenceOperator:
        return SearchService.SEQUENCE
    if operator_type is StructureOperator:
        return SearchService.STRUCTURE
    if operator_type is SeqMotifOperator:
        return SearchService.SEQMOTIF
    if operator_type is ChemicalOperator:
        return SearchService.CHEMICAL

    raise CannotInferSearchServiceException(
        "Cannot infer Search Service for {}".format(operator_type))
323 | 
324 | 
@dataclass
class _QueryNode:
    """Leaf of a query graph: one query described by `search_operator`."""
    search_operator: SearchOperator

    def _to_dict(self):
        """Return this node as a "terminal" query dictionary.

        The "service" entry is inferred from the operator's type and the
        "parameters" entry is the operator's own `_to_dict()` output.
        """
        operator = self.search_operator
        # Service inference runs first, so an unsupported operator raises
        # before its parameters are serialised.
        service_name = _infer_search_service(operator).value
        parameters = operator._to_dict()
        return {
            "type": "terminal",
            "service": service_name,
            "parameters": parameters,
        }
338 | 


--------------------------------------------------------------------------------
/pypdb/clients/search/search_client_test.py:
--------------------------------------------------------------------------------
  1 | """Tests for RCSB Search API Python wrapper."""
  2 | import json
  3 | import pytest
  4 | import requests
  5 | import unittest
  6 | from unittest import mock
  7 | 
  8 | from pypdb.clients.search import search_client
  9 | from pypdb.clients.search.operators import sequence_operators, text_operators
 10 | 
 11 | 
 12 | class TestHTTPRequests(unittest.TestCase):
 13 |     @mock.patch.object(requests, "post")
 14 |     def test_default_operator_with_entry_return_value(self, mock_post):
 15 |         # Creates a mock HTTP response, as wrapped by `requests`
 16 |         canned_json_return_as_dict = {
 17 |             "result_set": [{
 18 |                 "identifier": "5JUP"
 19 |             }, {
 20 |                 "identifier": "5JUS"
 21 |             }, {
 22 |                 "identifier": "5JUO"
 23 |             }]
 24 |         }
 25 |         mock_response = mock.create_autospec(requests.Response, instance=True)
 26 |         mock_response.json.return_value = canned_json_return_as_dict
 27 |         mock_post.return_value = mock_response
 28 | 
 29 |         search_operator = text_operators.DefaultOperator(value="ribosome")
 30 |         return_type = search_client.ReturnType.ENTRY
 31 | 
 32 |         results = search_client.perform_search(search_operator, return_type)
 33 | 
 34 |         expected_json_dict = {
 35 |             'query': {
 36 |                 'type': 'terminal',
 37 |                 'service': 'full_text',
 38 |                 'parameters': {
 39 |                     'value': 'ribosome'
 40 |                 }
 41 |             },
 42 |             'request_options': {
 43 |                 'return_all_hits': True
 44 |             },
 45 |             'return_type': 'entry'
 46 |         }
 47 | 
 48 |         mock_post.assert_called_once_with(
 49 |             url=search_client.SEARCH_URL_ENDPOINT,
 50 |             data=json.dumps(expected_json_dict))
 51 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
 52 | 
 53 |     @mock.patch.object(requests, "post")
 54 |     def test_exact_match_operator_with_polymer_return(self, mock_post):
 55 |         # Creates a mock HTTP response, as wrapped by `requests`
 56 |         canned_json_return_as_dict = {
 57 |             "result_set": [{
 58 |                 "identifier": "5JUP"
 59 |             }, {
 60 |                 "identifier": "5JUS"
 61 |             }, {
 62 |                 "identifier": "5JUO"
 63 |             }]
 64 |         }
 65 |         mock_response = mock.create_autospec(requests.Response, instance=True)
 66 |         mock_response.json.return_value = canned_json_return_as_dict
 67 |         mock_post.return_value = mock_response
 68 | 
 69 |         search_operator = text_operators.ExactMatchOperator(
 70 |             value="Mus musculus",
 71 |             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
 72 |         return_type = search_client.ReturnType.POLYMER_ENTITY
 73 | 
 74 |         results = search_client.perform_search(search_operator, return_type)
 75 | 
 76 |         expected_json_dict = {
 77 |             'query': {
 78 |                 'type': 'terminal',
 79 |                 'service': 'text',
 80 |                 'parameters': {
 81 |                     'attribute':
 82 |                     'rcsb_entity_source_organism.taxonomy_lineage.name',
 83 |                     'operator': 'exact_match',
 84 |                     'value': 'Mus musculus'
 85 |                 }
 86 |             },
 87 |             'request_options': {
 88 |                 'return_all_hits': True
 89 |             },
 90 |             'return_type': 'polymer_entity'
 91 |         }
 92 | 
 93 |         mock_post.assert_called_once_with(
 94 |             url=search_client.SEARCH_URL_ENDPOINT,
 95 |             data=json.dumps(expected_json_dict))
 96 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
 97 | 
 98 |     @mock.patch.object(requests, "post")
 99 |     def test_in_operator_with_non_polymer_return(self, mock_post):
100 |         # Creates a mock HTTP response, as wrapped by `requests`
101 |         canned_json_return_as_dict = {
102 |             "result_set": [{
103 |                 "identifier": "5JUP"
104 |             }, {
105 |                 "identifier": "5JUS"
106 |             }, {
107 |                 "identifier": "5JUO"
108 |             }]
109 |         }
110 |         mock_response = mock.create_autospec(requests.Response, instance=True)
111 |         mock_response.json.return_value = canned_json_return_as_dict
112 |         mock_post.return_value = mock_response
113 | 
114 |         search_operator = text_operators.InOperator(
115 |             values=["Mus musculus", "Homo sapiens"],
116 |             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
117 |         return_type = search_client.ReturnType.NON_POLYMER_ENTITY
118 | 
119 |         results = search_client.perform_search(search_operator, return_type)
120 | 
121 |         expected_json_dict = {
122 |             'query': {
123 |                 'type': 'terminal',
124 |                 'service': 'text',
125 |                 'parameters': {
126 |                     'attribute':
127 |                     'rcsb_entity_source_organism.taxonomy_lineage.name',
128 |                     'operator': 'in',
129 |                     'value': ['Mus musculus', 'Homo sapiens']
130 |                 }
131 |             },
132 |             'request_options': {
133 |                 'return_all_hits': True
134 |             },
135 |             'return_type': 'non_polymer_entity'
136 |         }
137 | 
138 |         mock_post.assert_called_once_with(
139 |             url=search_client.SEARCH_URL_ENDPOINT,
140 |             data=json.dumps(expected_json_dict))
141 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
142 | 
143 |     @mock.patch.object(requests, "post")
144 |     def test_contains_words_operator_with_polymer_instance_return(
145 |             self, mock_post):
146 |         # Creates a mock HTTP response, as wrapped by `requests`
147 |         canned_json_return_as_dict = {
148 |             "result_set": [{
149 |                 "identifier": "5JUP"
150 |             }, {
151 |                 "identifier": "5JUS"
152 |             }, {
153 |                 "identifier": "5JUO"
154 |             }]
155 |         }
156 |         mock_response = mock.create_autospec(requests.Response, instance=True)
157 |         mock_response.json.return_value = canned_json_return_as_dict
158 |         mock_post.return_value = mock_response
159 | 
160 |         search_operator = text_operators.ContainsWordsOperator(
161 |             value="actin-binding protein", attribute="struct.title")
162 |         return_type = search_client.ReturnType.POLYMER_INSTANCE
163 | 
164 |         results = search_client.perform_search(search_operator, return_type)
165 | 
166 |         expected_json_dict = {
167 |             'query': {
168 |                 'type': 'terminal',
169 |                 'service': 'text',
170 |                 'parameters': {
171 |                     'attribute': 'struct.title',
172 |                     'operator': 'contains_words',
173 |                     'value': 'actin-binding protein'
174 |                 }
175 |             },
176 |             'request_options': {
177 |                 'return_all_hits': True
178 |             },
179 |             'return_type': 'polymer_instance'
180 |         }
181 | 
182 |         mock_post.assert_called_once_with(
183 |             url=search_client.SEARCH_URL_ENDPOINT,
184 |             data=json.dumps(expected_json_dict))
185 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
186 | 
187 |     @mock.patch.object(requests, "post")
188 |     def test_contains_phrase_operator_with_assembly_return(self, mock_post):
189 |         # Creates a mock HTTP response, as wrapped by `requests`
190 |         canned_json_return_as_dict = {
191 |             "result_set": [{
192 |                 "identifier": "5JUP"
193 |             }, {
194 |                 "identifier": "5JUS"
195 |             }, {
196 |                 "identifier": "5JUO"
197 |             }]
198 |         }
199 |         mock_response = mock.create_autospec(requests.Response, instance=True)
200 |         mock_response.json.return_value = canned_json_return_as_dict
201 |         mock_post.return_value = mock_response
202 | 
203 |         search_operator = text_operators.ContainsPhraseOperator(
204 |             value="actin-binding protein", attribute="struct.title")
205 |         return_type = search_client.ReturnType.ASSEMBLY
206 | 
207 |         results = search_client.perform_search(search_operator, return_type)
208 | 
209 |         expected_json_dict = {
210 |             'query': {
211 |                 'type': 'terminal',
212 |                 'service': 'text',
213 |                 'parameters': {
214 |                     'attribute': 'struct.title',
215 |                     'operator': 'contains_phrase',
216 |                     'value': 'actin-binding protein'
217 |                 }
218 |             },
219 |             'request_options': {
220 |                 'return_all_hits': True
221 |             },
222 |             'return_type': 'assembly'
223 |         }
224 | 
225 |         mock_post.assert_called_once_with(
226 |             url=search_client.SEARCH_URL_ENDPOINT,
227 |             data=json.dumps(expected_json_dict))
228 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
229 | 
230 |     @mock.patch.object(requests, "post")
231 |     def test_comparison_operator_with_entry_return(self, mock_post):
232 |         # Creates a mock HTTP response, as wrapped by `requests`
233 |         canned_json_return_as_dict = {
234 |             "result_set": [{
235 |                 "identifier": "5JUP"
236 |             }, {
237 |                 "identifier": "5JUS"
238 |             }, {
239 |                 "identifier": "5JUO"
240 |             }]
241 |         }
242 |         mock_response = mock.create_autospec(requests.Response, instance=True)
243 |         mock_response.json.return_value = canned_json_return_as_dict
244 |         mock_post.return_value = mock_response
245 | 
246 |         search_operator = text_operators.ComparisonOperator(
247 |             value="2019-01-01T00:00:00Z",
248 |             attribute="rcsb_accession_info.initial_release_date",
249 |             comparison_type=text_operators.ComparisonType.GREATER)
250 |         return_type = search_client.ReturnType.ENTRY
251 | 
252 |         results = search_client.perform_search(search_operator, return_type)
253 | 
254 |         expected_json_dict = {
255 |             'query': {
256 |                 'type': 'terminal',
257 |                 'service': 'text',
258 |                 'parameters': {
259 |                     'operator': 'greater',
260 |                     'attribute': 'rcsb_accession_info.initial_release_date',
261 |                     'value': '2019-01-01T00:00:00Z'
262 |                 }
263 |             },
264 |             'request_options': {
265 |                 'return_all_hits': True
266 |             },
267 |             'return_type': 'entry'
268 |         }
269 | 
270 |         mock_post.assert_called_once_with(
271 |             url=search_client.SEARCH_URL_ENDPOINT,
272 |             data=json.dumps(expected_json_dict))
273 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
274 | 
275 |     @mock.patch.object(requests, "post")
276 |     def test_range_operator_with_entry_return(self, mock_post):
277 |         # Creates a mock HTTP response, as wrapped by `requests`
278 |         canned_json_return_as_dict = {
279 |             "result_set": [{
280 |                 "identifier": "5JUP"
281 |             }, {
282 |                 "identifier": "5JUS"
283 |             }, {
284 |                 "identifier": "5JUO"
285 |             }]
286 |         }
287 |         mock_response = mock.create_autospec(requests.Response, instance=True)
288 |         mock_response.json.return_value = canned_json_return_as_dict
289 |         mock_post.return_value = mock_response
290 | 
291 |         search_operator = text_operators.RangeOperator(
292 |             from_value="2019-01-01T00:00:00Z",
293 |             to_value="2019-06-30T00:00:00Z",
294 |             include_lower=False,
295 |             include_upper=True,
296 |             attribute="rcsb_accession_info.initial_release_date")
297 |         return_type = search_client.ReturnType.ENTRY
298 | 
299 |         results = search_client.perform_search(search_operator, return_type)
300 | 
301 |         expected_json_dict = {
302 |             "query": {
303 |                 "type": "terminal",
304 |                 "service": "text",
305 |                 "parameters": {
306 |                     "operator": "range",
307 |                     "attribute": "rcsb_accession_info.initial_release_date",
308 |                     "negation": False,
309 |                     "value": ["2019-01-01T00:00:00Z", "2019-06-30T00:00:00Z"],
310 |                 }
311 |             },
312 |             "request_options": {
313 |                 "return_all_hits": True
314 |             },
315 |             "return_type": "entry"
316 |         }
317 | 
318 |         mock_post.assert_called_once_with(
319 |             url=search_client.SEARCH_URL_ENDPOINT,
320 |             data=json.dumps(expected_json_dict))
321 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
322 | 
323 |     @mock.patch.object(requests, "post")
324 |     def test_exists_operator_with_entry_raw_json_response(self, mock_post):
325 |         # Creates a mock HTTP response, as wrapped by `requests`
326 |         canned_json_return_as_dict = {
327 |             "result_set": [{
328 |                 "identifier": "5JUP"
329 |             }, {
330 |                 "identifier": "5JUS"
331 |             }, {
332 |                 "identifier": "5JUO"
333 |             }]
334 |         }
335 |         mock_response = mock.create_autospec(requests.Response, instance=True)
336 |         mock_response.json.return_value = canned_json_return_as_dict
337 |         mock_post.return_value = mock_response
338 | 
339 |         search_operator = text_operators.ExistsOperator(
340 |             attribute="rcsb_accession_info.initial_release_date")
341 |         return_type = search_client.ReturnType.ENTRY
342 | 
343 |         results = search_client.perform_search(search_operator,
344 |                                                return_type,
345 |                                                return_raw_json_dict=True)
346 | 
347 |         expected_json_dict = {
348 |             "query": {
349 |                 "type": "terminal",
350 |                 "service": "text",
351 |                 "parameters": {
352 |                     "operator": "exists",
353 |                     "attribute": "rcsb_accession_info.initial_release_date",
354 |                 }
355 |             },
356 |             "request_options": {
357 |                 "return_all_hits": True
358 |             },
359 |             "return_type": "entry"
360 |         }
361 | 
362 |         mock_post.assert_called_once_with(
363 |             url=search_client.SEARCH_URL_ENDPOINT,
364 |             data=json.dumps(expected_json_dict))
365 |         self.assertEqual(results, canned_json_return_as_dict)
366 | 
367 |     @mock.patch.object(requests, "post")
368 |     def test_query_group_after_2019_and_either_musculus_or_human(
369 |             self, mock_post):
370 |         # Creates a mock HTTP response, as wrapped by `requests`
371 |         canned_json_return_as_dict = {
372 |             "result_set": [{
373 |                 "identifier": "5JUP"
374 |             }, {
375 |                 "identifier": "5JUS"
376 |             }, {
377 |                 "identifier": "5JUO"
378 |             }]
379 |         }
380 |         mock_response = mock.create_autospec(requests.Response, instance=True)
381 |         mock_response.json.return_value = canned_json_return_as_dict
382 |         mock_post.return_value = mock_response
383 | 
384 |         after_2019_query_node = text_operators.ComparisonOperator(
385 |             value="2019-01-01T00:00:00Z",
386 |             attribute="rcsb_accession_info.initial_release_date",
387 |             comparison_type=text_operators.ComparisonType.GREATER)
388 | 
389 |         is_mus_query_node = text_operators.ExactMatchOperator(
390 |             value="Mus musculus",
391 |             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
392 | 
393 |         is_human_query_node = text_operators.ExactMatchOperator(
394 |             value="Homo sapiens",
395 |             attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
396 | 
397 |         is_human_or_mus_group = search_client.QueryGroup(
398 |             queries=[is_mus_query_node, is_human_query_node],
399 |             logical_operator=search_client.LogicalOperator.OR)
400 | 
401 |         is_after_2019_and_human_or_mus_group = search_client.QueryGroup(
402 |             queries=[is_human_or_mus_group, after_2019_query_node],
403 |             logical_operator=search_client.LogicalOperator.AND)
404 | 
405 |         return_type = search_client.ReturnType.ENTRY
406 | 
407 |         results = search_client.perform_search_with_graph(
408 |             query_object=is_after_2019_and_human_or_mus_group,
409 |             return_type=return_type)
410 | 
411 |         expected_json_dict = {
412 |             "query": {
413 |                 "type":
414 |                 "group",
415 |                 "logical_operator":
416 |                 "and",
417 |                 "nodes": [{
418 |                     'type':
419 |                     'group',
420 |                     'logical_operator':
421 |                     'or',
422 |                     'nodes': [
423 |                         {
424 |                             'type': 'terminal',
425 |                             'service': 'text',
426 |                             'parameters': {
427 |                                 'attribute':
428 |                                 'rcsb_entity_source_organism.taxonomy_lineage.name',
429 |                                 'operator': 'exact_match',
430 |                                 'value': 'Mus musculus'
431 |                             }
432 |                         },
433 |                         {
434 |                             'type': 'terminal',
435 |                             'service': 'text',
436 |                             'parameters': {
437 |                                 'attribute':
438 |                                 'rcsb_entity_source_organism.taxonomy_lineage.name',
439 |                                 'operator': 'exact_match',
440 |                                 'value': 'Homo sapiens'
441 |                             }
442 |                         },
443 |                     ]
444 |                 }, {
445 |                     'type': 'terminal',
446 |                     'service': 'text',
447 |                     'parameters': {
448 |                         'operator': 'greater',
449 |                         'attribute':
450 |                         'rcsb_accession_info.initial_release_date',
451 |                         'value': '2019-01-01T00:00:00Z'
452 |                     }
453 |                 }]
454 |             },
455 |             "request_options": {
456 |                 "return_all_hits": True
457 |             },
458 |             "return_type": "entry"
459 |         }
460 | 
461 |         mock_post.assert_called_once_with(
462 |             url=search_client.SEARCH_URL_ENDPOINT,
463 |             data=json.dumps(expected_json_dict))
464 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
465 | 
466 |     @mock.patch.object(requests, "post")
467 |     def test_query_structure_resolution(self, mock_post):
468 |         # Creates a mock HTTP response, as wrapped by `requests`
469 |         canned_json_return_as_dict = {
470 |             "result_set": [{
471 |                 "identifier": "5JUP"
472 |             }, {
473 |                 "identifier": "5JUS"
474 |             }, {
475 |                 "identifier": "5JUO"
476 |             }]
477 |         }
478 |         mock_response = mock.create_autospec(requests.Response, instance=True)
479 |         mock_response.json.return_value = canned_json_return_as_dict
480 |         mock_post.return_value = mock_response
481 | 
482 |         search_operator = text_operators.ComparisonOperator(
483 |             value=4,
484 |             attribute="rcsb_entry_info.resolution_combined",
485 |             comparison_type=text_operators.ComparisonType.LESS)
486 |         return_type = search_client.ReturnType.ENTRY
487 | 
488 |         results = search_client.perform_search(search_operator,
489 |                                                return_type,
490 |                                                return_raw_json_dict=True)
491 | 
492 |         expected_json_dict = {
493 |             "query": {
494 |                 "type": "terminal",
495 |                 "service": "text",
496 |                 "parameters": {
497 |                     "operator": "less",
498 |                     "attribute": "rcsb_entry_info.resolution_combined",
499 |                     "value": 4
500 |                 }
501 |             },
502 |             "request_options": {
503 |                 "return_all_hits": True
504 |             },
505 |             "return_type": "entry"
506 |         }
507 | 
508 |         mock_post.assert_called_once_with(
509 |             url=search_client.SEARCH_URL_ENDPOINT,
510 |             data=json.dumps(expected_json_dict))
511 |         self.assertEqual(results, canned_json_return_as_dict)
512 | 
513 |     @mock.patch.object(requests, "post")
514 |     def test_sequence_operator_search(self, mock_post):
515 |         # Creates a mock HTTP response, as wrapped by `requests`
516 |         canned_json_return_as_dict = {
517 |             "result_set": [{
518 |                 "identifier": "5JUP"
519 |             }, {
520 |                 "identifier": "5JUS"
521 |             }, {
522 |                 "identifier": "5JUO"
523 |             }]
524 |         }
525 |         mock_response = mock.create_autospec(requests.Response, instance=True)
526 |         mock_response.json.return_value = canned_json_return_as_dict
527 |         mock_post.return_value = mock_response
528 | 
529 |         results = search_client.perform_search(
530 |             search_operator=sequence_operators.SequenceOperator(
531 |                 sequence="ATGAGGTAA",
532 |                 sequence_type=sequence_operators.SequenceType.DNA,
533 |                 evalue_cutoff=100,
534 |                 identity_cutoff=0.90),
535 |             return_type=search_client.ReturnType.ENTRY)
536 | 
537 |         expected_json_dict = {
538 |             'query': {
539 |                 'type': 'terminal',
540 |                 'service': 'sequence',
541 |                 'parameters': {
542 |                     'evalue_cutoff': 100,
543 |                     'identity_cutoff': 0.90,
544 |                     'target': 'pdb_dna_sequence',
545 |                     'value': 'ATGAGGTAA'
546 |                 }
547 |             },
548 |             'request_options': {
549 |                 'return_all_hits': True
550 |             },
551 |             'return_type': 'entry'
552 |         }
553 | 
554 |         mock_post.assert_called_once_with(
555 |             url=search_client.SEARCH_URL_ENDPOINT,
556 |             data=json.dumps(expected_json_dict))
557 |         self.assertEqual(results, ["5JUP", "5JUS", "5JUO"])
558 | 
559 |     def test_request_options_to_dict(self):
560 |         request_options = search_client.RequestOptions(
561 |             result_start_index=42,
562 |             num_results=8675309,
563 |             sort_by="fake.rcsb.attribute",
564 |             desc=False)
565 | 
566 |         self.assertEqual(
567 |             request_options._to_dict(), {
568 |                 "pager": {
569 |                     "start": 42,
570 |                     "rows": 8675309
571 |                 },
572 |                 "sort": [{
573 |                     "sort_by": "fake.rcsb.attribute",
574 |                     "direction": "asc"
575 |                 }]
576 |             })
577 | 
578 | 
# Run the tests with unittest's own runner when this file is executed
# directly rather than imported.
if __name__ == '__main__':
    unittest.main()
581 | 


--------------------------------------------------------------------------------
/pypdb/conftest.py:
--------------------------------------------------------------------------------
1 | # Sentinel file for `pytest` (to allow testing of PyPDB)
2 | 


--------------------------------------------------------------------------------
/pypdb/pypdb.py:
--------------------------------------------------------------------------------
   1 | '''
   2 | PyPDB: A Python API for the RCSB Protein Data Bank
   3 | 
   4 | -----
   5 | 
   6 | GitHub: https://github.com/williamgilpin/pypdb
   7 | 
   8 | PyPI: https://pypi.python.org/pypi/pypdb
   9 | 
  10 | -----
  11 | 
  12 | If you find this code useful, please consider citing the paper:
  13 | 
  14 |     Gilpin, W. "PyPDB: A Python API for the Protein Data Bank."
  15 |     Bioinformatics, Oxford Journals, 2015.
  16 | 
  17 | '''
  18 | from collections import OrderedDict, Counter
  19 | from itertools import repeat, chain
  20 | import time
  21 | import re
  22 | import json
  23 | import warnings
  24 | 
  25 | from pypdb.util import http_requests
  26 | from pypdb.clients.fasta import fasta_client
  27 | from pypdb.clients.pdb import pdb_client
  28 | from pypdb.clients.search import search_client
  29 | from pypdb.clients.search.operators import sequence_operators
  30 | 
  31 | warnings.simplefilter('always', DeprecationWarning)
  32 | 
  33 | 
  34 | # New imports needed for the updated API
  35 | from pypdb.clients.search.search_client import perform_search
  36 | from pypdb.clients.search.search_client import ReturnType
  37 | from pypdb.clients.search.operators import text_operators
  38 | 
  39 | 
  40 | '''
  41 | =================
  42 | Functions for searching the RCSB PDB for lists of PDB IDs
  43 | =================
  44 | '''
  45 | 
  46 | 
  47 | class Query(object):
  48 |     """
  49 | 
  50 |     This object takes search terms and specifications and creates an object that
  51 |     can be used to query the Protein Data Bank
  52 | 
  53 |     Parameters
  54 |     ----------
  55 |     search_term : str
  56 | 
  57 |         The specific term to search in the database. For specific query types,
  58 |         the strings that will yield valid results are limited to:
  59 | 
  60 |         'HoldingsQuery' : A general search of the metadata associated with PDB IDs
  61 | 
  62 |         'ExpTypeQuery' : Experimental Method such as 'X-RAY', 'SOLID-STATE NMR', etc
  63 | 
  64 |         'AdvancedKeywordQuery' : Any string that appears in the title or abstract
  65 | 
  66 |         'StructureIdQuery' :  Perform a search for a specific Structure ID
  67 | 
  68 |         'ModifiedStructuresQuery' : Search for related structures
  69 | 
  70 |         'AdvancedAuthorQuery' : Search by the names of authors associated with entries
  71 | 
  72 |         'MotifQuery' : Search for a specific motif
  73 | 
  74 |         'NoLigandQuery' : Find full list of PDB IDs without free ligands
  75 | 
  76 |     query_type : str
  77 | 
  78 |         The type of query to perform, the easiest is an AdvancedKeywordQuery but more
  79 |         specific types of searches may also be performed
  80 | 
  81 |     return_type : str
  82 |         The type of search result to return. Default "entry" returns a list of PDB IDs
  83 | 
  84 |     scan_params (optional) : dict()
  85 |             A dictionary containing an explicit nested search term. Use this option if you want to
  86 |             use pypdb's rate handling and other functions, but need to structure a complex JSON
  87 |             query not covered in the existing python package
  88 | 
  89 |     Examples
  90 |     --------
  91 | 
  92 |     >>> found_pdbs = Query('actin network').search()
  93 |     >>> print(found_pdbs)
  94 |     ['1D7M', '3W3D', '4A7H', '4A7L', '4A7N']
  95 | 
  96 |     >>> found_pdbs = Query('3W3D', query_type='ModifiedStructuresQuery').search()
  97 |     >>> print(found_pdbs[:5])
  98 |     ['1A2N', '1ACF', '1AGX', '1APM', '1ARL']
  99 | 
 100 |     >>> found_pdbs = Query('T[AG]AGGY', query_type='MotifQuery').search()
 101 |     >>> print(found_pdbs)
 102 |     ['3LEZ', '3SGH', '4F47']
 103 | 
 104 |     """
 105 |     def __init__(self,
 106 |                  search_term,
 107 |                  query_type="full_text",
 108 |                  return_type="entry",
 109 |                  scan_params=None):
 110 |         """See help(Query) for documentation"""
 111 | 
 112 |         if query_type == "PubmedIdQuery":
 113 |             query_type = "text"
 114 |             query_subtype = "pmid"
 115 |         elif query_type == "TreeEntityQuery":
 116 |             query_type = "text"
 117 |             query_subtype = "taxid"
 118 |         elif query_type == "ExpTypeQuery":
 119 |             query_type = "text"
 120 |             query_subtype = "experiment_type"
 121 |             search_term = search_term.upper()
 122 |             if search_term not in [
 123 |                     "X-RAY DIFFRACTION", "ELECTRON MICROSCOPY",
 124 |                     "SOLID-STATE NMR", "SOLUTION NMR", "NEUTRON DIFFRACTION",
 125 |                     "ELECTRON CRYSTALLOGRAPHY", "POWDER DIFFRACTION",
 126 |                     "FIBER DIFFRACTION", "SOLUTION SCATTERING", "EPR",
 127 |                     "FLUORESCENCE TRANSFER", "INFRARED SPECTROSCOPY",
 128 |                     "THEORETICAL MODEL"
 129 |             ]:
 130 |                 warnings.warn(
 131 |                     "Experimental type not recognized, search may fail .")
 132 |         elif query_type == "AdvancedAuthorQuery":
 133 |             query_type = "text"
 134 |             query_subtype = "author"
 135 |         elif query_type == "OrganismQuery":
 136 |             query_type = "text"
 137 |             query_subtype = "organism"
 138 |         elif query_type == "pfam":
 139 |             query_type = "text"
 140 |             query_subtype = "pfam"
 141 |         elif query_type == "uniprot":
 142 |             query_type = "text"
 143 |             query_subtype = "uniprot"
 144 |         else:
 145 |             query_subtype = None
 146 | 
 147 |         assert query_type in {
 148 |             "full_text", "text", "structure", "sequence", "seqmotif", "chemical"
 149 |         }, "Query type %s not recognized." % query_type
 150 | 
 151 |         assert return_type in {"entry", "polymer_entity"
 152 |                                }, "Return type %s not supported." % return_type
 153 | 
 154 |         self.query_type = query_type
 155 |         self.search_term = search_term
 156 |         self.return_type = return_type
 157 |         self.url = "https://search.rcsb.org/rcsbsearch/v2/query?json="
 158 |         composite_query = False
 159 |         if not scan_params:
 160 |             query_params = dict()
 161 |             query_params["type"] = "terminal"
 162 |             query_params["service"] = query_type
 163 | 
 164 |             if query_type in ["full_text", "text"]:
 165 |                 query_params['parameters'] = {"value": search_term}
 166 | 
 167 |             elif query_type == "sequence":
 168 |                 query_params['parameters'] = {
 169 |                     "target": "pdb_protein_sequence",
 170 |                     "value": search_term
 171 |                 }
 172 |             elif query_type == "structure":
 173 |                 query_params['parameters'] = {
 174 |                     "operator": "relaxed_shape_match",
 175 |                     "value": {
 176 |                         "entry_id": search_term,
 177 |                         "assembly_id": "1"
 178 |                     }
 179 |                 }
 180 | 
 181 | #             elif query_type=='AdvancedAuthorQuery':
 182 | #                 query_params['description'] = 'Author Name: '+ search_term
 183 | #                 query_params['searchType'] = 'All Authors'
 184 | #                 query_params['audit_author.name'] = search_term
 185 | #                 query_params['exactMatch'] = 'false'
 186 | 
 187 | #             elif query_type=='MotifQuery':
 188 | #                 query_params['description'] = 'Motif Query For: '+ search_term
 189 | #                 query_params['motif'] = search_term
 190 | 
 191 | #             elif query_type=='OrganismQuery':
 192 | # #                 query_params['version'] = "B0905"
 193 | #                 query_params['description'] = 'Organism Search: Organism Name='+ search_term
 194 | #                 query_params['organismName'] = search_term
 195 | # #                 composite_query = True
 196 | #             elif query_type=='TreeEntityQuery':
 197 | #                 query_params['t'] = "1"
 198 | #                 query_params['description'] = 'TaxonomyTree Search for OTHER SEQUENCES'
 199 | #                 query_params['n'] = search_term
 200 | #                 query_params['nodeDesc'] = "OTHER SEQUENCES"
 201 | 
 202 | #             elif query_type in ['StructureIdQuery','ModifiedStructuresQuery']:
 203 | #                 query_params['structureIdList'] = search_term
 204 | 
 205 | #             elif query_type=='ExpTypeQuery':
 206 | #                 query_params['experimentalMethod'] = search_term
 207 | #                 query_params['description'] = 'Experimental Method Search : Experimental Method='+ search_term
 208 | #                 query_params['mvStructure.expMethod.value']= search_term
 209 |             if query_subtype:
 210 | 
 211 |                 if query_subtype == "pmid":
 212 |                     query_params['parameters'] = {
 213 |                         "operator": "in",
 214 |                         "negation": False,
 215 |                         "value": [search_term],
 216 |                         "attribute":
 217 |                         "rcsb_pubmed_container_identifiers.pubmed_id"
 218 |                     }
 219 |                 if query_subtype == "taxid":
 220 |                     query_params['parameters'] = {
 221 |                         "operator":
 222 |                         "exact_match",
 223 |                         "negation":
 224 |                         False,
 225 |                         "value":
 226 |                         str(search_term),
 227 |                         "attribute":
 228 |                         "rcsb_entity_source_organism.taxonomy_lineage.id"
 229 |                     }
 230 |                 if query_subtype == "experiment_type":
 231 |                     query_params['parameters'] = {
 232 |                         "operator": "exact_match",
 233 |                         "negation": False,
 234 |                         "value": str(search_term),
 235 |                         "attribute": "exptl.method"
 236 |                     }
 237 |                 if query_subtype == "author":
 238 |                     query_params['parameters'] = {
 239 |                         "operator": "exact_match",
 240 |                         "negation": False,
 241 |                         "value": str(search_term),
 242 |                         "attribute": "rcsb_primary_citation.rcsb_authors"
 243 |                     }
 244 |                 if query_subtype == "organism":
 245 |                     query_params['parameters'] = {
 246 |                         "operator":
 247 |                         "contains_words",
 248 |                         "negation":
 249 |                         False,
 250 |                         "value":
 251 |                         str(search_term),
 252 |                         "attribute":
 253 |                         "rcsb_entity_source_organism.taxonomy_lineage.name"
 254 |                     }
 255 |                 if query_subtype == "pfam":
 256 |                     query_params['parameters'] = {
 257 |                         "operator": "exact_match",
 258 |                         "negation": False,
 259 |                         "value": str(search_term),
 260 |                         "attribute":
 261 |                         "rcsb_polymer_entity_annotation.annotation_id"
 262 |                     }
 263 |                 if query_subtype == "uniprot":
 264 |                     query_params['parameters'] = {
 265 |                         "operator": "exact_match",
 266 |                         "negation": False,
 267 |                         "value": str(search_term),
 268 |                         "attribute": 
 269 |                         "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession"
 270 |                         }
 271 | 
 272 |             self.scan_params = dict()
 273 |             self.scan_params["query"] = query_params
 274 |             self.scan_params["return_type"] = return_type
 275 |             self.scan_params["request_options"] = {"results_verbosity": "verbose"} # v2
 276 | 
 277 |             if return_type == "entry":
 278 |                 self.scan_params["request_options"] = {"return_all_hits": True}
 279 | 
 280 |         else:
 281 |             self.scan_params = scan_params
 282 | 
 283 |     def search(self, num_attempts=1, sleep_time=0.5):
 284 |         """
 285 |         Perform a search of the Protein Data Bank using the REST API
 286 | 
 287 |         Parameters
 288 |         ----------
 289 | 
 290 |         num_attempts : int
 291 |             In case of a failed retrieval, the number of attempts to try again
 292 |         sleep_time : int
 293 |             The amount of time to wait between requests, in case of
 294 |             API rate limits
 295 |         """
 296 | 
 297 |         query_text = json.dumps(self.scan_params, indent=4)
 298 |         response = http_requests.request_limited(self.url,
 299 |                                                  rtype="POST",
 300 |                                                  headers={"Content-Type": "application/json"},
 301 |                                                  data=query_text)
 302 | 
 303 |         if response is None or response.status_code != 200:
 304 |             warnings.warn("Retrieval failed, returning None")
 305 |             return None
 306 | 
 307 |         response_val = json.loads(response.text)
 308 | 
 309 |         if self.return_type == "entry":
 310 |             idlist = walk_nested_dict(response_val,
 311 |                                       "identifier",
 312 |                                       maxdepth=25,
 313 |                                       outputs=[])
 314 |             return idlist
 315 |         else:
 316 |             return response_val
 317 | 
 318 | 
 319 | # def do_search(scan_params):
 320 | #     '''Convert dict() to XML object an then send query to the RCSB PDB
 321 | 
 322 | #     This function takes a valid query dict() object, converts it to XML,
 323 | #     and then sends a request to the PDB for a list of IDs corresponding
 324 | #     to search results
 325 | 
 326 | #     Parameters
 327 | #     ----------
 328 | 
 329 | #     scan_params : dict
 330 | #         A dictionary of query attributes to use for
 331 | #         the search of the PDB
 332 | 
 333 | #     Returns
 334 | #     -------
 335 | 
 336 | #     idlist : list
 337 | #         A list of PDB ids returned by the search
 338 | 
 339 | #     Examples
 340 | #     --------
 341 | #     This method usually gets used in tandem with make_query
 342 | 
 343 | #     >>> a = make_query('actin network')
 344 | #     >>> print (a)
 345 | #     {'orgPdbQuery': {'description': 'Text Search for: actin',
 346 | #     'keywords': 'actin',
 347 | #     'queryType': 'AdvancedKeywordQuery'}}
 348 | 
 349 | #     >>> search_dict = make_query('actin network')
 350 | #     >>> found_pdbs = do_search(search_dict)
 351 | #     >>> print(found_pdbs)
 352 | #     ['1D7M', '3W3D', '4A7H', '4A7L', '4A7N']
 353 | 
 354 | #     >>> search_dict = make_query('T[AG]AGGY',querytype='MotifQuery')
 355 | #     >>> found_pdbs = do_search(search_dict)
 356 | #     >>> print(found_pdbs)
 357 | #     ['3LEZ', '3SGH', '4F47']
 358 | #     '''
 359 | #     q = Query('search_term', 'HoldingsQuery', scan_params=scan_params)
 360 | #     return q.search()
 361 | 
 362 | # def do_protsym_search(point_group, min_rmsd=0.0, max_rmsd=7.0):
 363 | #     '''Performs a protein symmetry search of the PDB
 364 | 
 365 | #     This function can search the Protein Data Bank based on how closely entries
 366 | #     match the user-specified symmetry group
 367 | 
 368 | #     Parameters
 369 | #     ----------
 370 | 
 371 | #     point_group : str
 372 | #         The name of the symmetry point group to search. This includes all the standard
 373 | #         abbreviations for symmetry point groups (e.g., C1, C2, D2, T, O, I, H, A1)
 374 | 
 375 | #     min_rmsd : float
 376 | #         The smallest allowed total deviation (in Angstroms) for a result
 377 | #         to be classified as having a matching symmetry
 378 | 
 379 | #     max_rmsd : float
 380 | #         The largest allowed total deviation (in Angstroms) for a result
 381 | #         to be classified as having a matching symmetry
 382 | 
 383 | #     Returns
 384 | #     -------
 385 | 
 386 | #     idlist : list of strings
 387 | #         A list of PDB IDs resulting from the search
 388 | 
 389 | #     Examples
 390 | #     --------
 391 | 
 392 | #     >>> kk = do_protsym_search('C9', min_rmsd=0.0, max_rmsd=1.0)
 393 | #     >>> print(kk[:5])
 394 | #     ['1KZU', '1NKZ', '2FKW', '3B8M', '3B8N']
 395 | 
 396 | #     '''
 397 | #     query_params = dict()
 398 | #     query_params['queryType'] = 'PointGroupQuery'
 399 | #     query_params['rMSDComparator'] = 'between'
 400 | 
 401 | #     query_params['pointGroup'] = point_group
 402 | #     query_params['rMSDMin'] = min_rmsd
 403 | #     query_params['rMSDMax'] = max_rmsd
 404 | 
 405 | #     scan_params = dict()
 406 | #     scan_params['orgPdbQuery'] = query_params
 407 | #     idlist =  do_search(scan_params)
 408 | #     return idlist
 409 | 
 410 | # def get_all():
 411 | #     """Return a list of all PDB entries currently in the RCSB Protein Data Bank
 412 | 
 413 | #     Returns
 414 | #     -------
 415 | 
 416 | #     out : list of str
 417 | #         A list of all of the PDB IDs currently in the RCSB PDB
 418 | 
 419 | #     Examples
 420 | #     --------
 421 | 
 422 | #     >>> print(get_all()[:10])
 423 | #     ['100D', '101D', '101M', '102D', '102L', '102M', '103D', '103L', '103M', '104D']
 424 | 
 425 | #     """
 426 | 
 427 | #     url = 'http://www.rcsb.org/pdb/rest/getCurrent'
 428 | #     #response = requests.get(url)
 429 | #     response = http_requests.request_limited(url)
 430 | 
 431 | #     if response.status_code == 200:
 432 | #         pass
 433 | #     else:
 434 | #         warnings.warn("Retrieval failed, returning None")
 435 | #         return None
 436 | 
 437 | #     result  = str(response.text)
 438 | 
 439 | #     p = re.compile('structureId=\"...."')
 440 | #     matches = p.findall(str(result))
 441 | #     out = list()
 442 | #     for item in matches:
 443 | #         out.append(item[-5:-1])
 444 | 
 445 | #     return out
 446 | '''
 447 | =================
 448 | Functions for looking up information given PDB ID
 449 | =================
 450 | '''
 451 | 
 452 | 
 453 | def get_info(pdb_id, url_root='https://data.rcsb.org/rest/v1/core/entry/'):
 454 |     '''Look up all information about a given PDB ID
 455 | 
 456 |     Parameters
 457 |     ----------
 458 | 
 459 |     pdb_id : string
 460 |         A 4 character string giving a pdb entry of interest
 461 | 
 462 |     url_root : string
 463 |         The string root of the specific url for the request type
 464 | 
 465 |     Returns
 466 |     -------
 467 | 
 468 |     out : dict()
 469 |         An ordered dictionary object corresponding to entry information
 470 | 
 471 |     '''
 472 |     pdb_id = pdb_id.replace(":", "/")  # replace old entry identifier
 473 |     url = url_root + pdb_id
 474 |     response = http_requests.request_limited(url)
 475 | 
 476 |     if response is None or response.status_code != 200:
 477 |         warnings.warn("Retrieval failed, returning None")
 478 |         return None
 479 | 
 480 |     result = str(response.text)
 481 | 
 482 |     out = json.loads(result)
 483 | 
 484 |     return out
 485 | 
 486 | 
# Alternative public names for `get_info`: each one is bound to the very same
# function object, so they all accept the same arguments.
get_all_info = get_info  # Alias
describe_pdb = get_info  # Alias for now; eventually make this point to the Graph search https://data.rcsb.org/migration-guide.html#pdb-file-description
get_entity_info = get_info  # Alias
 490 | 
 491 | 
 492 | def get_pdb_file(pdb_id: str, filetype='pdb', compression=False):
 493 |     """Deprecated wrapper for fetching PDB files from RCSB Database.
 494 | 
 495 |     For new uses, please use `pypdb/clients/pdb/pdb_client.py`
 496 |     """
 497 | 
 498 |     warnings.warn(
 499 |         "The `get_pdb_file` function within pypdb.py is deprecated."
 500 |         "See `pypdb/clients/pdb/pdb_client.py` for a near-identical "
 501 |         "function to use", DeprecationWarning)
 502 | 
 503 |     if filetype == 'pdb':
 504 |         filetype_enum = pdb_client.PDBFileType.PDB
 505 |     elif filetype == 'cif':
 506 |         filetype_enum = pdb_client.PDBFileType.CIF
 507 |     elif filetype == 'xml':
 508 |         filetype_enum = pdb_client.PDBFileType.XML
 509 |     elif filetype == 'structfact':
 510 |         filetype_enum = pdb_client.PDBFileType.STRUCTFACT
 511 |     else:
 512 |         warnings.warn(
 513 |             "Filetype specified to `get_pdb_file` appears to be invalid")
 514 | 
 515 |     return pdb_client.get_pdb_file(pdb_id, filetype_enum, compression)
 516 | 
 517 | 
 518 | # https://data.rcsb.org/migration-guide.html#chem-comp-description
 519 | def describe_chemical(chem_id):
 520 | #     """
 521 | 
 522 | #     Parameters
 523 | #     ----------
 524 | 
 525 | #     chem_id : string
 526 | #         A 3 character string representing the full chemical sequence of interest (ie, NAG)
 527 | 
 528 | #     Returns
 529 | #     -------
 530 | 
 531 | #     out : dict
 532 | #         A dictionary containing the chemical description associated with the PDB ID
 533 | 
 534 | #     Examples
 535 | #     --------
 536 | #     >>> chem_desc = describe_chemical('NAG')
 537 | #     >>> print(chem_desc["rcsb_chem_comp_descriptor"]["smiles"])
 538 | #     'CC(=O)NC1C(C(C(OC1O)CO)O)O'
 539 | #     """
 540 |     if (len(chem_id) > 3):
 541 |         raise Exception("Ligand id with more than 3 characters provided")
 542 | 
 543 |     return get_info(chem_id, url_root = 'https://data.rcsb.org/rest/v1/core/chemcomp/')
 544 | 
 545 | # def get_ligands(pdb_id):
 546 | #     """Return ligands of given PDB ID
 547 | 
 548 | #     Parameters
 549 | #     ----------
 550 | 
 551 | #     pdb_id : string
 552 | #         A 4 character string giving a pdb entry of interest
 553 | 
 554 | #     Returns
 555 | #     -------
 556 | 
 557 | #     out : dict
 558 | #         A dictionary containing a list of ligands associated with the entry
 559 | 
 560 | #     Examples
 561 | #     --------
 562 | #     >>> ligand_dict = get_ligands('100D')
 563 | #     >>> print(ligand_dict)
 564 | #     {'id': '100D',
 565 | #     'ligandInfo': {'ligand': {'@chemicalID': 'SPM',
 566 | #                            '@molecularWeight': '202.34',
 567 | #                            '@structureId': '100D',
 568 | #                            '@type': 'non-polymer',
 569 | #                            'InChI': 'InChI=1S/C10H26N4/c11-5-3-9-13-7-1-2-8-14-10-4-6-12/h13-14H,1-12H2',
 570 | #                            'InChIKey': 'PFNFFQXMRSDOHW-UHFFFAOYSA-N',
 571 | #                            'chemicalName': 'SPERMINE',
 572 | #                            'formula': 'C10 H26 N4',
 573 | #                            'smiles': 'C(CCNCCCN)CNCCCN'}}}
 574 | 
 575 | #     """
 576 | #     out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/ligandInfo?structureId=')
 577 | #     out = to_dict(out)
 578 | #     return remove_at_sign(out['structureId'])
 579 | 
 580 | # def get_gene_onto(pdb_id):
 581 | #     """Return ligands of given PDB_ID
 582 | 
 583 | #     Parameters
 584 | #     ----------
 585 | 
 586 | #     pdb_id : string
 587 | #         A 4 character string giving a pdb entry of interest
 588 | 
 589 | #     Returns
 590 | #     -------
 591 | 
 592 | #     out : dict
 593 | #         A dictionary containing the gene ontology information associated with the entry
 594 | 
 595 | #     Examples
 596 | #     --------
 597 | 
 598 | #     >>> gene_info = get_gene_onto('4Z0L')
 599 | #     >>> print(gene_info['term'][0])
 600 | #     {'@chainId': 'A',
 601 | #      '@id': 'GO:0001516',
 602 | #      '@structureId': '4Z0L',
 603 | #      'detail': {'@definition': 'The chemical reactions and pathways resulting '
 604 | #                                'in the formation of prostaglandins, any of a '
 605 | #                                'group of biologically active metabolites which '
 606 | #                                'contain a cyclopentane ring.',
 607 | #                 '@name': 'prostaglandin biosynthetic process',
 608 | #                 '@ontology': 'B',
 609 | #                 '@synonyms': 'prostaglandin anabolism, prostaglandin '
 610 | #                              'biosynthesis, prostaglandin formation, '
 611 | #                              'prostaglandin synthesis'}}
 612 | #     """
 613 | #     out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/goTerms?structureId=')
 614 | #     out = to_dict(out)
 615 | #     if not out['goTerms']:
 616 | #         return None
 617 | #     out = remove_at_sign(out['goTerms'])
 618 | #     return out
 619 | 
 620 | # def get_seq_cluster(pdb_id_chain):
 621 | #     """Get the sequence cluster of a PDB ID plus a pdb_id plus a chain,
 622 | 
 623 | #     Parameters
 624 | #     ----------
 625 | 
 626 | #     pdb_id_chain : string
 627 | #         A string denoting a 4 character PDB ID plus a one character chain
 628 | #         offset with a dot: XXXX.X, as in 2F5N.A
 629 | 
 630 | #     Returns
 631 | #     -------
 632 | 
 633 | #     out : dict
 634 | #         A dictionary containing the sequence cluster associated with the PDB
 635 | #         entry and chain
 636 | 
 637 | #     Examples
 638 | #     --------
 639 | 
 640 | #     >>> sclust = get_seq_cluster('2F5N.A')
 641 | #     >>> print(sclust['pdbChain'][:10])
 642 | #     [{'@name': '4PD2.A', '@rank': '1'},
 643 | #      {'@name': '3U6P.A', '@rank': '2'},
 644 | #      {'@name': '4PCZ.A', '@rank': '3'},
 645 | #      {'@name': '3GPU.A', '@rank': '4'},
 646 | #      {'@name': '3JR5.A', '@rank': '5'},
 647 | #      {'@name': '3SAU.A', '@rank': '6'},
 648 | #      {'@name': '3GQ4.A', '@rank': '7'},
 649 | #      {'@name': '1R2Z.A', '@rank': '8'},
 650 | #      {'@name': '3U6E.A', '@rank': '9'},
 651 | #      {'@name': '2XZF.A', '@rank': '10'}]
 652 | 
 653 | #     """
 654 | 
 655 | #     url_root = 'http://www.rcsb.org/pdb/rest/sequenceCluster?structureId='
 656 | #     out = get_info(pdb_id_chain, url_root = url_root)
 657 | #     out = to_dict(out)
 658 | #     return remove_at_sign(out['sequenceCluster'])
 659 | 
 660 | 
 661 | def get_blast(pdb_id, chain_id='A', identity_cutoff=0.99, verbosity=True):
 662 |     """
 663 |     ---
 664 |     WARNING: this function is deprecated and slated to be deleted due to RCSB
 665 |     API changes.
 666 | 
 667 |     See `pypdb/clients/search/EXAMPLES.md` for examples to use a
 668 |     `SequenceOperator` search to similar effect
 669 |     ---
 670 | 
 671 |     Return BLAST search results for a given PDB ID.
 672 | 
 673 |     Parameters
 674 |     ----------
 675 |     pdb_id : string
 676 |         A 4 character string giving a pdb entry of interest
 677 | 
 678 |     chain_id : string
 679 |         A single character designating the chain ID of interest
 680 |     identity_cutoff: float
 681 |         Identity % at which to cut off results.
 682 | 
 683 | 
 684 |     Returns
 685 |     -------
 686 | 
 687 |     out : List of PDB IDs that match the given search.
 688 | 
 689 |     Examples
 690 |     --------
 691 | 
 692 |     >>> blast_results = get_blast('2F5N', chain_id='A')
 693 |     >>> print(blast_results[50])
 694 |     PELPEVETVRRELEKRIVGQKIISIEATYPRMVL--TGFEQLKKELTGKTIQGISRRGKYLIFEIGDDFRLISHLRMEGKYRLATLDAPREKHDHL
 695 |     TMKFADG-QLIYADVRKFGTWELISTDQVLPYFLKKKIGPEPTYEDFDEKLFREKLRKSTKKIKPYLLEQTLVAGLGNIYVDEVLWLAKIHPEKET
 696 |     NQLIESSIHLLHDSIIEILQKAIKLGGSSIRTY-SALGSTGKMQNELQVYGKTGEKCSRCGAEIQKIKVAGRGTHFCPVCQQ
 697 |     """
 698 | 
 699 |     warnings.warn(
 700 |         "The `get_blast` function is slated for deprecation."
 701 |         "See `pypdb/clients/search/EXAMPLES.md` for examples to use a"
 702 |         "`SequenceOperator` search to similar effect", DeprecationWarning)
 703 | 
 704 |     fasta_entries = fasta_client.get_fasta_from_rcsb_entry(pdb_id)
 705 |     valid_sequences = [
 706 |         fasta_entry.sequence for fasta_entry in fasta_entries
 707 |         if chain_id in fasta_entry.chains
 708 |     ]
 709 | 
 710 |     matches_any_sequence_in_chain_query = search_client.QueryGroup(
 711 |         logical_operator=search_client.LogicalOperator.OR, queries=[])
 712 |     for valid_sequence in valid_sequences:
 713 |         matches_any_sequence_in_chain_query.queries.append(
 714 |             sequence_operators.SequenceOperator(
 715 |                 sequence=valid_sequence,
 716 |                 identity_cutoff=identity_cutoff,
 717 |                 evalue_cutoff=1000))
 718 | 
 719 |     return search_client.perform_search_with_graph(
 720 |         query_object=matches_any_sequence_in_chain_query,
 721 |         return_raw_json_dict=True)
 722 | 
 723 | 
 724 | # def get_pfam(pdb_id):
 725 | #     """Return PFAM annotations of given PDB_ID
 726 | 
 727 | #     Parameters
 728 | #     ----------
 729 | 
 730 | #     pdb_id : string
 731 | #         A 4 character string giving a pdb entry of interest
 732 | 
 733 | #     Returns
 734 | #     -------
 735 | 
 736 | #     out : dict
 737 | #         A dictionary containing the PFAM annotations for the specified PDB ID
 738 | 
 739 | #     Examples
 740 | #     --------
 741 | 
 742 | #     >>> pfam_info = get_pfam('2LME')
 743 | #     >>> print(pfam_info)
 744 | #     {'pfamHit': {'@pfamAcc': 'PF03895.10', '@pfamName': 'YadA_anchor',
 745 | #     '@structureId': '2LME', '@pdbResNumEnd': '105', '@pdbResNumStart': '28',
 746 | #     '@pfamDesc': 'YadA-like C-terminal region', '@eValue': '5.0E-22', '@chainId': 'A'}}
 747 | 
 748 | #     """
 749 | #     out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/hmmer?structureId=')
 750 | #     out = to_dict(out)
 751 | #     if not out['hmmer3']:
 752 | #         return dict()
 753 | #     return remove_at_sign(out['hmmer3'])
 754 | 
 755 | # def get_clusters(pdb_id):
 756 | #     """Return cluster related web services of given PDB_ID
 757 | 
 758 | #     Parameters
 759 | #     ----------
 760 | 
 761 | #     pdb_id : string
 762 | #         A 4 character string giving a pdb entry of interest
 763 | 
 764 | #     Returns
 765 | #     -------
 766 | 
 767 | #     out : dict
 768 | #         A dictionary containing the representative clusters for the specified PDB ID
 769 | 
 770 | #     Examples
 771 | #     --------
 772 | 
 773 | #     >>> clusts = get_clusters('4hhb.A')
 774 | #     >>> print(clusts)
 775 | #     {'pdbChain': {'@name': '2W72.A'}}
 776 | 
 777 | #     """
 778 | #     out = get_info(pdb_id, url_root = 'http://www.rcsb.org/pdb/rest/representatives?structureId=')
 779 | #     out = to_dict(out)
 780 | #     return remove_at_sign(out['representatives'])
 781 | 
 782 | 
 783 | def find_results_gen(search_term, field='title'):
 784 |     '''
 785 |     Return a generator of the results returned by a search of
 786 |     the protein data bank. This generator is used internally.
 787 | 
 788 |     Parameters
 789 |     ----------
 790 | 
 791 |     search_term : str
 792 |         The search keyword
 793 | 
 794 |     field : str
 795 |         The type of information to record about each entry
 796 | 
 797 |     Examples
 798 |     --------
 799 | 
 800 |     >>> result_gen = find_results_gen('bleb')
 801 |     >>> pprint.pprint([item for item in result_gen][:5])
 802 |     ['MYOSIN II DICTYOSTELIUM DISCOIDEUM MOTOR DOMAIN S456Y BOUND WITH MGADP-BEFX',
 803 |      'MYOSIN II DICTYOSTELIUM DISCOIDEUM MOTOR DOMAIN S456Y BOUND WITH MGADP-ALF4',
 804 |      'DICTYOSTELIUM DISCOIDEUM MYOSIN II MOTOR DOMAIN S456E WITH BOUND MGADP-BEFX',
 805 |      'MYOSIN II DICTYOSTELIUM DISCOIDEUM MOTOR DOMAIN S456E BOUND WITH MGADP-ALF4',
 806 |      'The structural basis of blebbistatin inhibition and specificity for myosin '
 807 |      'II']
 808 | 
 809 |     '''
 810 |     search_result_ids = Query(search_term).search()
 811 | 
 812 |     all_titles = []
 813 |     for pdb_id in search_result_ids:
 814 |         result = get_info(pdb_id)
 815 |         if field in result.keys():
 816 |             yield result[field]
 817 | 
 818 | 
 819 | def find_papers(search_term, max_results=10, **kwargs):
 820 |     '''
 821 |     Return an ordered list of the top papers returned by a keyword search of
 822 |     the RCSB PDB
 823 | 
 824 |     Parameters
 825 |     ----------
 826 | 
 827 |     search_term : str
 828 |         The search keyword
 829 | 
 830 |     max_results : int
 831 |         The maximum number of results to return
 832 | 
 833 |     Returns
 834 |     -------
 835 | 
 836 |     all_papers : list of strings
 837 |         A descending-order list containing the top papers associated with
 838 |         the search term in the PDB
 839 | 
 840 |     Examples
 841 |     --------
 842 | 
 843 |     >>> matching_papers = find_papers('crispr',max_results=3)
 844 |     >>> print(matching_papers)
 845 |     ['Crystal structure of a CRISPR-associated protein from thermus thermophilus',
 846 |     'CRYSTAL STRUCTURE OF HYPOTHETICAL PROTEIN SSO1404 FROM SULFOLOBUS SOLFATARICUS P2',
 847 |     'NMR solution structure of a CRISPR repeat binding protein']
 848 | 
 849 |     '''
 850 |     all_papers = list()
 851 |     id_list = Query(search_term).search()
 852 |     for pdb_id in id_list[:max_results]:
 853 |         pdb_info = get_info(pdb_id)
 854 |         all_papers += [item["title"] for item in pdb_info["citation"]]
 855 |     return remove_dupes(all_papers)
 856 | 
 857 | 
 858 | # def find_authors(search_term, **kwargs):
 859 | #     '''Return an ordered list of the top authors returned by a keyword search of
 860 | #     the RCSB PDB
 861 | 
 862 | #     This function is based on the number of unique PDB entries a given author has
 863 | #     his or her name associated with, and not author order or the ranking of the
 864 | #     entry in the keyword search results. So if an author tends to publish on topics
 865 | #     related to the search_term a lot, even if those papers are not the best match for
 866 | #     the exact search, he or she will have priority in this function over an author
 867 | #     who wrote the one paper that is most relevant to the search term. For the latter
 868 | #     option, just do a standard keyword search using do_search.
 869 | 
 870 | #     Parameters
 871 | #     ----------
 872 | 
 873 | #     search_term : str
 874 | #         The search keyword
 875 | 
 876 | #     max_results : int
 877 | #         The maximum number of results to return
 878 | 
 879 | #     Returns
 880 | #     -------
 881 | 
 882 | #     out : list of str
 883 | 
 884 | #     Examples
 885 | #     --------
 886 | 
 887 | #     >>> top_authors = find_authors('crispr',max_results=100)
 888 | #     >>> print(top_authors[:10])
 889 | #     ['Doudna, J.A.', 'Jinek, M.', 'Ke, A.', 'Li, H.', 'Nam, K.H.']
 890 | 
 891 | #     '''
 892 | 
 893 | #     all_individuals = parse_results_gen(search_term, field='citation_authors', **kwargs)
 894 | 
 895 | #     full_author_list = []
 896 | #     for individual in all_individuals:
 897 | #         individual = individual.replace('.,', '.;')
 898 | #         author_list_clean = [x.strip() for x in individual.split(';')]
 899 | #         full_author_list+=author_list_clean
 900 | 
 901 | #     out = list(chain.from_iterable(repeat(ii, c) for ii,c in Counter(full_author_list).most_common()))
 902 | 
 903 | #     return remove_dupes(out)
 904 | 
 905 | # def find_dates(search_term, **kwargs):
 906 | #     '''
 907 | #     Return an ordered list of the PDB submission dates returned by a
 908 | #     keyword search of the RCSB PDB. This can be used to assess the
 909 | #     popularity of a gievne keyword or topic
 910 | 
 911 | #     Parameters
 912 | #     ----------
 913 | 
 914 | #     search_term : str
 915 | #         The search keyword
 916 | 
 917 | #     max_results : int
 918 | #         The maximum number of results to return
 919 | 
 920 | #     Returns
 921 | #     -------
 922 | 
 923 | #     all_dates : list of str
 924 | #         A list of calendar strings associated with the search term, these can
 925 | #         be converted directly into time or datetime objects
 926 | 
 927 | #     '''
 928 | #     all_dates = parse_results_gen(search_term, field='deposition_date', **kwargs)
 929 | #     return all_dates
 930 | '''
 931 | =================
 932 | Helper Functions
 933 | =================
 934 | '''
 935 | 
 936 | 
 937 | def to_dict(odict):
 938 |     '''Convert OrderedDict to dict
 939 | 
 940 |     Takes a nested, OrderedDict() object and outputs a
 941 |     normal dictionary of the lowest-level key:val pairs
 942 | 
 943 |     Parameters
 944 |     ----------
 945 | 
 946 |     odict : OrderedDict
 947 | 
 948 |     Returns
 949 |     -------
 950 | 
 951 |     out : dict
 952 | 
 953 |         A dictionary corresponding to the flattened form of
 954 |         the input OrderedDict
 955 | 
 956 |     '''
 957 | 
 958 |     out = json.loads(json.dumps(odict))
 959 |     return out
 960 | 
 961 | 
 962 | def remove_at_sign(kk):
 963 |     '''Remove the '@' character from the beginning of key names in a dict()
 964 | 
 965 |     Parameters
 966 |     ----------
 967 | 
 968 |     kk : dict
 969 |         A dictionary containing keys with the @ character
 970 |         (this pops up a lot in converted XML)
 971 | 
 972 |     Returns
 973 |     -------
 974 | 
 975 |     kk : dict (modified in place)
 976 |         A dictionary where the @ character has been removed
 977 | 
 978 |     '''
 979 |     tagged_keys = [thing for thing in kk.keys() if thing.startswith('@')]
 980 |     for tag_key in tagged_keys:
 981 |         kk[tag_key[1:]] = kk.pop(tag_key)
 982 | 
 983 |     return kk
 984 | 
 985 | 
 986 | def remove_dupes(list_with_dupes):
 987 |     '''Remove duplicate entries from a list while preserving order
 988 | 
 989 |     This function uses Python's standard equivalence testing methods in
 990 |     order to determine if two elements of a list are identical. So if in the list [a,b,c]
 991 |     the condition a == b is True, then regardless of whether a and b are strings, ints,
 992 |     or other, then b will be removed from the list: [a, c]
 993 | 
 994 |     Parameters
 995 |     ----------
 996 | 
 997 |     list_with_dupes : list
 998 |         A list containing duplicate elements
 999 | 
1000 |     Returns
1001 |     -------
1002 |     out : list
1003 |         The list with the duplicate entries removed by the order preserved
1004 | 
1005 | 
1006 |     Examples
1007 |     --------
1008 |     >>> a = [1,3,2,4,2]
1009 |     >>> print(remove_dupes(a))
1010 |     [1,3,2,4]
1011 | 
1012 |     '''
1013 |     visited = set()
1014 |     visited_add = visited.add
1015 |     out = [
1016 |         entry for entry in list_with_dupes
1017 |         if not (entry in visited or visited_add(entry))
1018 |     ]
1019 |     return out
1020 | 
1021 | 
def walk_nested_dict(my_result, term, outputs=None, depth=0, maxdepth=25):
    '''
    For a nested dictionary that may itself comprise lists of
    dictionaries of unknown length, collect the values stored under a
    given key anywhere in the structure using a depth-first search

    When a dictionary contains the key, its value is collected and the
    other values of that same dictionary are not searched any further.

    Parameters
    ----------

    my_result : dict
        A nested dict containing lists, dicts, and other objects as vals.
        Subclasses of dict and list (such as OrderedDict) are walked too.

    term : str
        The name of the key stored somewhere in the tree

    maxdepth : int
        The maximum depth to search the results tree. Branches deeper
        than this are skipped with a warning.

    depth : int
        The depth of the search so far.
        Users don't usually access this.

    outputs : list
        All of the positive search results collected so far; matches
        are appended to it in place. Defaults to a new empty list for
        every top-level call.
        Users don't usually access this.

    Returns
    -------

    outputs : list
        All of the search results, or None if nothing was found.

    '''

    # A fresh list per top-level call: a mutable default argument would be
    # shared by every call and leak matches from one search into the next.
    if outputs is None:
        outputs = []

    if depth > maxdepth:
        warnings.warn(
            'Maximum recursion depth exceeded. Returned None for the search results,'
            + ' try increasing the maxdepth keyword argument.')
        return None

    depth = depth + 1

    if isinstance(my_result, dict):
        if term in my_result:
            outputs.append(my_result[term])
        else:
            # No match at this level: keep looking inside the values.
            walk_nested_dict(list(my_result.values()),
                             term,
                             outputs=outputs,
                             depth=depth,
                             maxdepth=maxdepth)

    elif isinstance(my_result, list):
        for item in my_result:
            walk_nested_dict(item,
                             term,
                             outputs=outputs,
                             depth=depth,
                             maxdepth=maxdepth)

    # Anything else is a leaf with nothing to descend into.

    # An empty result list is reported as None.
    return outputs if outputs else None
1093 | 


--------------------------------------------------------------------------------
/pypdb/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/williamgilpin/pypdb/444caf0ad0cfb50cd13d4fd6490c35ee97972ad5/pypdb/util/__init__.py


--------------------------------------------------------------------------------
/pypdb/util/http_requests.py:
--------------------------------------------------------------------------------
 1 | """Utility functions for requesting URLs over HTTP"""
 2 | 
 3 | from typing import Optional
 4 | 
 5 | import time
 6 | import requests
 7 | import warnings
 8 | 
 9 | 
def request_limited(url: str,
                    rtype: str = "GET",
                    num_attempts: int = 3,
                    sleep_time=0.5,
                    **kwargs) -> Optional[requests.models.Response]:
    """
    HTTP request with retries and rate-limiting based on response code

    The request is sent once and then repeated up to `num_attempts` more
    times until a response with status code 200 arrives. A 429 response
    makes the function sleep before the next attempt, and the wait grows
    linearly with the number of attempts already made. Any other non-200
    response is retried without waiting.

    Parameters
    ----------
    url : str
        The url for the request
    rtype : str
        The request type (oneof ["GET", "POST"])
    num_attempts : int
        In case of a failed retrieval, the number of attempts to try again
    sleep_time : float
        The base amount of time (in seconds) to wait after a 429
        response; the n-th failed attempt waits n * sleep_time
    **kwargs : dict
        The keyword arguments to pass to the request

    Returns
    -------

    response : requests.models.Response
        The server response object. Only returned if request was successful,
        otherwise returns None.

    """

    if rtype not in ("GET", "POST"):
        warnings.warn("Request type not recognized")
        return None

    send = requests.get if rtype == "GET" else requests.post

    # One initial attempt plus up to `num_attempts` retries.
    for attempt in range(num_attempts + 1):
        response = send(url, **kwargs)
        status = response.status_code

        if status == 200:
            return response

        if status == 429:
            # Throttled: back off a little longer after every failure.
            delay = (1 + attempt) * sleep_time
            warnings.warn("Too many requests, waiting " + str(delay) + " s")
            time.sleep(delay)
        elif 500 <= status < 600:
            warnings.warn("Server error encountered. Retrying")

    warnings.warn("Too many failures on requests. Exiting...")
    return None
67 | 


--------------------------------------------------------------------------------
/pypdb/util/test_http_requests.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import requests
 3 | import unittest
 4 | from unittest import mock
 5 | import warnings
 6 | 
 7 | from pypdb.util import http_requests
 8 | 
 9 | 
class TestHTTPRequests(unittest.TestCase):
    """Tests for `http_requests.request_limited` with `requests` mocked."""

    @staticmethod
    def _fake_response(status_code):
        """Return an autospec'd `requests` response with `status_code` set."""
        response = mock.create_autospec(requests.models.Response)
        response.status_code = status_code
        return response

    @mock.patch.object(warnings, "warn", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_fails_with_invalid_request(self, mock_sleep, mock_warnings):
        # An unsupported request type warns and gives up immediately.
        result = http_requests.request_limited(
            url="http://protein_data_bank.com", rtype="MAIL")

        self.assertIsNone(result)
        mock_warnings.assert_called_once_with("Request type not recognized")
        mock_sleep.assert_not_called()

    @mock.patch.object(requests, "get", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_get__first_try_success(self, mock_sleep, mock_get):
        ok_response = self._fake_response(200)  # A-OK!
        mock_get.return_value = ok_response

        result = http_requests.request_limited(
            url="http://get_your_proteins.com", rtype="GET")

        self.assertEqual(result, ok_response)
        mock_get.assert_called_once_with("http://get_your_proteins.com")
        mock_sleep.assert_not_called()

    @mock.patch.object(requests, "post", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_post__first_try_success(self, mock_sleep, mock_post):
        ok_response = self._fake_response(200)  # A-OK!
        mock_post.return_value = ok_response

        result = http_requests.request_limited(
            url="http://get_your_proteins.com", rtype="POST")

        self.assertEqual(result, ok_response)
        mock_post.assert_called_once_with("http://get_your_proteins.com")
        mock_sleep.assert_not_called()

    @mock.patch.object(requests, "get", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_get__succeeds_third_try(self, mock_sleep, mock_get):
        busy_response = self._fake_response(429)
        server_error_response = self._fake_response(504)
        ok_response = self._fake_response(200)

        # Mocks `requests.get` to return Busy, then Server Error, then OK
        mock_get.side_effect = [
            busy_response, server_error_response, ok_response
        ]

        result = http_requests.request_limited(
            url="http://get_your_proteins.com", rtype="GET")

        self.assertEqual(result, ok_response)
        self.assertEqual(mock_get.call_count, 3)
        mock_get.assert_called_with("http://get_your_proteins.com")
        # Should only sleep on being throttled (not server error)
        self.assertEqual(mock_sleep.call_count, 1)

    @mock.patch.object(warnings, "warn", autospec=True)
    @mock.patch.object(requests, "post", autospec=True)
    @mock.patch.object(time, "sleep", autospec=True)
    def test_post__repeatedly_fails_return_nothing(self, mock_sleep, mock_post,
                                                   mock_warn):
        # Every attempt is answered with a Busy response
        mock_post.return_value = self._fake_response(429)

        result = http_requests.request_limited(
            url="http://protein_data_bank.com", rtype="POST")

        self.assertIsNone(result)
        mock_warn.assert_called_with(
            "Too many failures on requests. Exiting...")

        # The initial attempt plus the default three retries, each followed
        # by a sleep because every response was a 429.
        self.assertEqual(mock_post.call_count, 4)
        mock_post.assert_called_with("http://protein_data_bank.com")
        self.assertEqual(mock_sleep.call_count, 4)
92 | 
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
95 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
# setuptools deprecates dash-separated option names in setup.cfg;
# the underscore spelling avoids the deprecation warning.
description_file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | # read the contents of the README file so that PyPI can use it as the long description                               
 4 | from pathlib import Path
 5 | this_directory = Path(__file__).parent
 6 | long_description = (this_directory / "README.md").read_text()
 7 | 
 8 | modules_list = [
 9 |     "pypdb",
10 |     "pypdb.util",
11 |     "pypdb.clients",
12 |     "pypdb.clients.search",
13 |     "pypdb.clients.search.operators",
14 |     "pypdb.clients.data",
15 |     "pypdb.clients.data.graphql",
16 |     "pypdb.clients.fasta",
17 |     "pypdb.clients.pdb",
18 | ]
19 | 
20 | setup(
21 |     name='pypdb',
22 |     packages=modules_list,  # same as 'name'
23 |     py_modules=modules_list,
24 |     version='2.04',
25 |     install_requires=['requests'],
26 |     description='A Python wrapper for the RCSB Protein Data Bank (PDB) API',
27 |     author='William Gilpin',
28 |     author_email='firstnamelastname@gmail.com',
29 |     url='https://github.com/williamgilpin/pypdb',
30 |     download_url='https://github.com/williamgilpin/pypdb/tarball/0.6',
31 |     keywords=['protein', 'data', 'RESTful', 'api'],
32 |     classifiers=[],
33 |     long_description=long_description,
34 |     long_description_content_type='text/markdown'
35 | )
36 | 


--------------------------------------------------------------------------------
/tests/test_pypdb.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | ## Import from local directory
 4 | import sys
 5 | sys.path.insert(0, '../pypdb')
 6 | from pypdb import *
 7 | 
 8 | # TODO(ejwilliams): Write generic logic, to execute `test_*.py` files
 9 | # within the pypdb directory (removing need for sys.path hack)
10 | 
11 | # aa_index[s] for s in seq_dict[k] if s in aa_index.keys()]
12 | 
class TestSearchFunctions(unittest.TestCase):
    """Smoke tests: each query type should return at least one result.

    NOTE(review): `Query` comes from the star import above; presumably its
    `search()` contacts the live RCSB PDB service - confirm.
    """

    def _search_with_hits(self, *query_args):
        # Build the query, run it, and require a non-empty result.
        found_pdbs = Query(*query_args).search()
        self.assertGreater(len(found_pdbs), 0)
        return found_pdbs

    def test_searchterm(self):
        first_hit = self._search_with_hits('ribosome')[0]
        self.assertIs(type(first_hit), str)

        # an error page would be a longer string
        self.assertLess(len(first_hit), 10)

    def test_pubmed(self):
        self._search_with_hits(27499440, "PubmedIdQuery")

    def test_treeentity(self):
        self._search_with_hits('6239', 'TreeEntityQuery')

    def test_exptype(self):
        self._search_with_hits('SOLID-STATE NMR', 'ExpTypeQuery')

    def test_structure(self):
        self._search_with_hits('2E8D', 'structure')

    def test_advancedauthor(self):
        self._search_with_hits('Perutz, M.F.', 'AdvancedAuthorQuery')

    def test_organism(self):
        self._search_with_hits('Dictyostelium', 'OrganismQuery')
46 | 
47 |     
48 | 
49 |     # def test_blast(self):
50 |     #     found_pdbs = blast_from_sequence(
51 |     #         'MTKIANKYEVIDNVEKLEKALKRLREAQSVYATYTQEQVDKIFFEAAMAANKMRIPLAKMAVE'
52 |     #         + 'ETGMGVVEDKVIKNHYASEYIYNAYKNTKTCGVIEEDPAFGIKKIAEPLGVIAAVIPTTNP'
53 |     #         + 'TSTAIFKTLIALKTRNAIIISPHPRAKNSTIEAAKIVLEAAVKAGAPEGIIGWIDVPSLEL'
54 |     #         + 'TNLVMREADVILATGGPGLVKAAYSSGKPAIGVGAGNTPAIIDDSADIVLAVNSIIHSKTF'
55 |     #         + 'DNGMICASEQSVIVLDGVYKEVKKEFEKRGCYFLNEDETEKVRKTIIINGALNAKIVGQKA'
56 |     #         + 'HTIANLAGFEVPETTKILIGEVTSVDISEEFAHEKLCPVLAMYRAKDFDDALDKAERLVAD'
57 |     #         + 'GGFGHTSSLYIDTVTQKEKLQKFSERMKTCRILVNTPSSQGGIGDLYNFKLAPSL',
58 |     #         1e-20)
59 |     #     self.assertTrue(len(found_pdbs) > 0)
60 |     #     self.assertTrue(type(found_pdbs[0][0]) == str)
61 | 
62 |     #     # an error page would be a longer string
63 |     #     self.assertTrue(len(found_pdbs[0][0]) < 10)
64 | 
65 | 
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
68 | 


--------------------------------------------------------------------------------