├── .gitignore
├── .readthedocs.yml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── NOTICE.md
├── README.md
├── azure-pipelines.yml
├── azure-template-publish-job.yml
├── azure-template-tox-job.yml
├── codemeta.json
├── docs
├── Makefile
├── conf.py
├── data_api
│ ├── additional_examples.md
│ ├── api.rst
│ ├── implementation_details.md
│ ├── query_construction.md
│ └── quickstart.md
├── index.rst
├── make.bat
├── requirements.txt
└── search_api
│ ├── additional_examples.md
│ ├── api.rst
│ ├── attributes.md
│ ├── query_construction.md
│ └── quickstart.md
├── notebooks
├── covid.ipynb
├── data_quickstart.ipynb
├── multisearch.ipynb
├── search_data_workflow.ipynb
├── search_examples.ipynb
└── search_quickstart.ipynb
├── pylintrc
├── rcsbapi
├── __init__.py
├── config.py
├── const.py
├── data
│ ├── __init__.py
│ ├── data_query.py
│ ├── data_schema.py
│ └── resources
│ │ ├── assembly.json
│ │ ├── branched_entity.json
│ │ ├── branched_entity_instance.json
│ │ ├── chem_comp.json
│ │ ├── data_api_schema.json
│ │ ├── drugbank.json
│ │ ├── entry.json
│ │ ├── nonpolymer_entity.json
│ │ ├── nonpolymer_entity_instance.json
│ │ ├── polymer_entity.json
│ │ ├── polymer_entity_instance.json
│ │ ├── pubmed.json
│ │ └── uniprot.json
├── dev_tools
│ └── update_schema.py
└── search
│ ├── __init__.py
│ ├── resources
│ ├── chemical_schema.json
│ └── structure_schema.json
│ ├── search_query.py
│ └── search_schema.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
├── __init__.py
├── test-data
│ ├── 2mnr.cif
│ ├── 4hhb-assembly1.cif.gz
│ ├── 4hhb.bcif
│ ├── 4hhb.cif
│ ├── 4hhb.pdb
│ ├── 4hhb.pdb1
│ ├── 4hhb.pdb1.gz
│ ├── 7n0r.cif.gz
│ ├── 7n0r.pdb.gz
│ └── invalid.txt
├── test_data_query.py
├── test_data_schema.py
├── test_search_query.py
└── test_search_schema.py
└── tox.ini
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # VS Code settings
163 | .vscode/
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | build:
9 | os: "ubuntu-22.04"
10 | tools:
11 | python: "3.9"
12 |
13 | # Build documentation in the docs/ directory with Sphinx
14 | sphinx:
15 | configuration: docs/conf.py
16 |
17 | # Optionally build your docs in additional formats such as PDF
18 | formats:
19 | - pdf
20 |
21 | python:
22 | install:
23 | - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## v1.1.3 (2025-05-05)
4 |
5 | - Fix: Update regex pattern for instances in `const.py` to support suffixes longer than one character (e.g., "1S5L.AA")
6 |
7 | ## v1.1.2 (2025-03-20)
8 |
9 | - Update how `dataclass` attributes are created in `const.py`
10 |
11 | ## v1.1.1 (2025-03-13)
12 |
13 | - Add missing dependency for building documentation
14 | - Add docstrings
15 |
16 | ## v1.1.0 (2025-03-12)
17 |
18 | - Add `ALL_STRUCTURES` object, allowing Data API queries for all PDB structures and chemical components
19 | - Add `progress_bar` and `batch_size` parameters to Data API package's `.exec`
20 | - Add `group` function to Search API package to enforce nested grouping
21 | - Update README with new citation information
22 | - Update search schemas: 1.48.0 -> 1.49.0
23 | - Update data schemas:
24 | - entry schema 9.0.3 -> 9.0.4
25 | - polymer_entity_instance schema 10.0.2 -> 10.0.3
26 | - nonpolymer_entity_instance schema 10.0.0 -> 10.0.1
27 |
28 | ## v1.0.1 (2025-01-17)
29 |
30 | - Add import to `const.py` for compatibility with Python 3.8
31 | - Update search schemas: 1.47.7 -> 1.48.0
32 |
33 | ## v1.0.0 (2024-11-06)
34 |
35 | - Release version 1.0.0 of package
36 | - Update search schemas: 1.47.6 -> 1.47.7
37 | - Update data schemas:
38 | - entry schema 9.0.2 -> 9.0.3
39 | - chem_comp schema 7.1.3 -> 7.1.4
40 | - Update documentation
41 |
42 | ## v0.5.0 (2024-10-28)
43 |
44 | - Separate out package-wide settings into immutable constants (`const.py`) and configurable parameters (`config.py`)
45 | - Renamed `rcsb_attributes` -> `search_attributes`
46 | - Automatically capitalize input_ids
47 | - Added `dev_tools` directory and updated `update_schema.py`
48 | - Search API `chemical_schema` and `structure_schema` at v1.47.6
49 | - Update documentation
50 |
51 | ## v0.4.0 (2024-10-15)
52 |
53 | - Merge [rcsbsearchapi package](https://github.com/rcsb/py-rcsbsearchapi/tree/2ba4d82ed1ff23c4ba5d07d4dec63f6f4030207d) into package as separate `rcsbapi.search` module
54 | - Renamed several classes and methods in this process:
55 | - `SequenceQuery` -> `SeqSimilarityQuery`
56 | - `StructureMotifResidue` -> `StructMotifResidue`
57 | - `Range` -> `FacetRange`
58 | - `rcsb_query_editor_url` -> `get_editor_link`
59 | - `rcsb_query_builder_url` -> `get_query_builder_link`
60 | - Renamed several files and classes to prevent overlap with future developments:
61 | - `data/query.py` -> `data/data_query.py`
62 |   - `data/schema.py` -> `data/data_schema.py`
63 | - `Query()` Data API class -> `DataQuery()`
64 | - `Schema()` Data API class -> `DataSchema()`
65 | - `search/search.py` -> `search/search_query.py`
66 | - `search/schema.py` -> `search/search_schema.py`
67 | - Automatically change singular "input_type" to plural when possible
68 | - Add warning message if fully qualified field path not provided
69 | - Update documentation
70 |
71 | ## v0.3.0 (2024-08-23)
72 |
73 | - Falls back to local schema file when fetch fails
74 | - Supports dot separated field names for requesting data
75 | - `get_unique_fields` deleted and replaced with `find_paths`
76 | - `find_field_names` changed to return only field names, no descriptions
77 | - Executing queries called with `.exec()`
78 | - Updates to documentation
79 | - See [PR #31](https://github.com/rcsb/py-rcsb-api/pull/31) for full details
80 | - Updated data_api_schema.json and added all schema files on https://data.rcsb.org/#data-schema
81 |
82 | ## v0.2.0 (2024-07-25)
83 |
84 | - Updates to Query methods
85 | - Added GraphQL query validation
86 | - Updates to documentation
87 |
88 | ## v0.1.0 (2024-07-22)
89 |
90 | - First release!
91 | - Provides Pythonic interface for interacting with RCSB.org Data API
92 | - Automated Data API schema parsing via Schema.py
93 | - Enables query building and execution via Query.py
94 | - Documentation and example notebooks
95 | - See [PR #23](https://github.com/rcsb/py-rcsb-api/pull/23) for full details
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 RCSB PDB
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | #
2 | # File: py-rcsb-api/MANIFEST.in
3 | #
4 | include CHANGELOG.md
5 | include requirements.txt
6 | include README.md
7 | #
8 |
--------------------------------------------------------------------------------
/NOTICE.md:
--------------------------------------------------------------------------------
1 | # Third-Party Copyright Notices
2 | `rcsb-api` uses third-party libraries or other resources that may
3 | be distributed under licenses different than the `rcsb-api` software.
4 |
5 | In the event that we accidentally failed to list a required notice,
6 | please bring it to our attention through the creation of a [GitHub issue](https://github.com/rcsb/py-rcsb-api/issues).
7 |
8 | The attached notices are provided for information only.
9 |
10 | ## [rcsbsearchapi](https://github.com/rcsb/py-rcsbsearchapi)
11 |
12 | BSD 3-Clause License
13 | --------------------
14 |
15 | Copyright 2024 rcsbsearchapi Contributors
16 |
17 | Redistribution and use in source and binary forms, with or without
18 | modification, are permitted provided that the following conditions are met:
19 |
20 | 1. Redistributions of source code must retain the above copyright notice,
21 | this list of conditions and the following disclaimer.
22 |
23 | 2. Redistributions in binary form must reproduce the above copyright notice,
24 | this list of conditions and the following disclaimer in the documentation
25 | and/or other materials provided with the distribution.
26 |
27 | 3. Neither the name of the copyright holder nor the names of its contributors
28 | may be used to endorse or promote products derived from this software
29 | without specific prior written permission.
30 |
31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
32 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
34 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
35 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
37 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
39 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://pypi.org/project/rcsb-api/)
2 | [](https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_build/latest?definitionId=40&branchName=master)
3 | [](https://rcsbapi.readthedocs.io/en/latest/?badge=latest)
4 | [](https://doi.org/10.5281/zenodo.14052470)
5 | [](https://www.bestpractices.dev/projects/10424)
6 | [](https://fairsoftwarechecklist.net/v0.2?f=31&a=30112&i=32111&r=133)
7 | [](https://fair-software.eu)
8 |
9 | # rcsb-api
10 | Python interface for RCSB PDB API services at RCSB.org.
11 |
12 | This package requires Python 3.8 or later.
13 |
14 |
15 | ## Installation
16 | Get it from PyPI:
17 |
18 | pip install rcsb-api
19 |
20 | Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api/)
21 |
22 |
23 | ## Getting Started
24 | Full documentation available at [readthedocs](https://rcsbapi.readthedocs.io/en/latest/).
25 |
26 | The [RCSB PDB Search API](https://search.rcsb.org) supports RESTful requests according to a defined [schema](https://search.rcsb.org/redoc/index.html). This package provides an `rcsbapi.search` module that simplifies generating complex search queries.
27 |
28 | The [RCSB PDB Data API](https://data.rcsb.org) supports requests using [GraphQL](https://graphql.org/), a language for API queries. This package provides an `rcsbapi.data` module that simplifies generating queries in GraphQL syntax.
29 |
30 | ### Search API
31 | The `rcsbapi.search` module supports all available [Advanced Search](https://www.rcsb.org/search/advanced) services, as listed below. For more details on their usage, see [Search Service Types](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#search-service-types).
32 |
33 | |Search service |QueryType |
34 | |----------------------------------|--------------------------|
35 | |Full-text |`TextQuery()` |
36 | |Attribute (structure or chemical) |`AttributeQuery()` |
37 | |Sequence similarity |`SeqSimilarityQuery()` |
38 | |Sequence motif |`SeqMotifQuery()` |
39 | |Structure similarity |`StructSimilarityQuery()` |
40 | |Structure motif |`StructMotifQuery()` |
41 | |Chemical similarity |`ChemSimilarityQuery()` |
42 |
43 | #### Search API Examples
44 | To perform a search for all structures from humans associated with the term "Hemoglobin", you can combine a "full-text" query (`TextQuery`) with an "attribute" query (`AttributeQuery`):
45 |
46 | ```python
47 | from rcsbapi.search import AttributeQuery, TextQuery
48 | from rcsbapi.search import search_attributes as attrs
49 |
50 | # Construct a "full-text" sub-query for structures associated with the term "Hemoglobin"
51 | q1 = TextQuery(value="Hemoglobin")
52 |
53 | # Construct an "attribute" sub-query to search for structures from humans
54 | q2 = AttributeQuery(
55 | attribute="rcsb_entity_source_organism.scientific_name",
56 | operator="exact_match", # Other operators include "contains_phrase", "exists", and more
57 | value="Homo sapiens"
58 | )
59 | # OR, do so by using Python bitwise operators:
60 | q2 = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens"
61 |
62 | # Combine the sub-queries (can sub-group using parentheses and standard operators, "&", "|", etc.)
63 | query = q1 & q2
64 |
65 | # Fetch the results by iterating over the query execution
66 | for rId in query():
67 | print(rId)
68 |
69 | # OR, capture them into a variable
70 | results = list(query())
71 | ```
72 |
73 | These examples are in `operator` syntax. You can also make queries in `fluent` syntax. Learn more about both syntaxes and implementation details in [Query Syntax and Execution](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#query-syntax-and-execution).
74 |
75 |
76 | ### Data API
77 | The `rcsbapi.data` module allows you to easily construct GraphQL queries to the RCSB.org Data API.
78 |
79 | This is done by specifying the following input:
80 | - "input_type": the data hierarchy level you are starting from (e.g., "entry", "polymer_entity", etc.) (See full list [here](https://rcsbapi.readthedocs.io/en/latest/data_api/query_construction.html#input-type)).
81 | - "input_ids": the list of IDs for which to fetch data (corresponding to the specified "input_type")
82 | - "return_data_list": the list of data items ("fields") to retrieve. (Available fields can be explored [here](https://data.rcsb.org/data-attributes.html) or via the [GraphiQL editor's Documentation Explorer panel](https://data.rcsb.org/graphql/index.html).)
83 |
84 | #### Data API Examples
85 | This is a [simple query](https://data.rcsb.org/graphql/index.html?query=%7B%0A%20%20entry(entry_id%3A%20%224HHB%22)%20%7B%0A%20%20%20%20exptl%20%7B%0A%20%20%20%20%20%20method%0A%20%20%20%20%7D%0A%20%20%7D%0A%7D) requesting the experimental method of a structure with PDB ID 4HHB (Hemoglobin).
86 |
87 | The query must be executed using the `.exec()` method, which will return the JSON response as well as store the response as an attribute of the `DataQuery` object. From the object, you can access the Data API response, get an interactive editor link, or access the arguments used to create the query.
88 | The package is able to automatically build queries based on the "input_type" and path segment passed into "return_data_list". If using this package in code intended for long-term use, it's recommended to use fully qualified paths. When autocompletion is being used, a WARNING message will be printed as a reminder.
89 |
90 | ```python
91 | from rcsbapi.data import DataQuery as Query
92 | query = Query(
93 | input_type="entries",
94 | input_ids=["4HHB"],
95 | return_data_list=["exptl.method"]
96 | )
97 | print(query.exec())
98 | ```
99 | Data is returned in JSON format
100 | ```json
101 | {
102 | "data": {
103 | "entries": [
104 | {
105 | "rcsb_id": "4HHB",
106 | "exptl": [
107 | {
108 | "method": "X-RAY DIFFRACTION"
109 | }
110 | ]
111 | }
112 | ]
113 | }
114 | }
115 | ```
116 |
117 | Here is a [more complex query](https://data.rcsb.org/graphql/index.html?query=%7B%0A%20%20polymer_entities(entity_ids%3A%5B%222CPK_1%22%2C%223WHM_1%22%2C%222D5Z_1%22%5D)%20%7B%0A%20%20%20%20rcsb_id%0A%20%20%20%20rcsb_entity_source_organism%20%7B%0A%20%20%20%20%20%20ncbi_taxonomy_id%0A%20%20%20%20%20%20ncbi_scientific_name%0A%20%20%20%20%7D%0A%20%20%20%20rcsb_cluster_membership%20%7B%0A%20%20%20%20%20%20cluster_id%0A%20%20%20%20%20%20identity%0A%20%20%20%20%7D%0A%20%20%7D%0A%7D). Note that periods can be used to further specify requested data in return_data_list. Also note multiple return data items and ids can be requested in one query.
118 | ```python
119 | from rcsbapi.data import DataQuery as Query
120 | query = Query(
121 | input_type="polymer_entities",
122 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"],
123 | return_data_list=[
124 | "polymer_entities.rcsb_id",
125 | "rcsb_entity_source_organism.ncbi_taxonomy_id",
126 | "rcsb_entity_source_organism.ncbi_scientific_name",
127 | "cluster_id",
128 | "identity"
129 | ]
130 | )
131 | print(query.exec())
132 | ```
133 |
134 | ## Jupyter Notebooks
135 | Several Jupyter notebooks with example use cases and workflows for all package modules are provided under [notebooks](notebooks/).
136 |
137 | For example, one notebook using both Search and Data API packages for a COVID-19 related example is available in [notebooks/search_data_workflow.ipynb](notebooks/search_data_workflow.ipynb) or online through Google Colab.
138 |
139 |
140 | ## Citing
141 | Please cite the ``rcsb-api`` package with the following reference:
142 |
143 | > Dennis W. Piehl, Brinda Vallat, Ivana Truong, Habiba Morsy, Rusham Bhatt,
144 | > Santiago Blaumann, Pratyoy Biswas, Yana Rose, Sebastian Bittrich, Jose M. Duarte,
145 | > Joan Segura, Chunxiao Bi, Douglas Myers-Turnbull, Brian P. Hudson, Christine Zardecki,
146 | > Stephen K. Burley. rcsb-api: Python Toolkit for Streamlining Access to RCSB Protein
147 | > Data Bank APIs, Journal of Molecular Biology, 2025.
148 | > DOI: [10.1016/j.jmb.2025.168970](https://doi.org/10.1016/j.jmb.2025.168970)
149 |
150 | You should also cite the RCSB.org API services this package utilizes:
151 |
152 | > Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi
153 | > Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley,
154 | > John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards
155 | > Integrated Searching and Efficient Access to Macromolecular Structure Data
156 | > from the PDB Archive, Journal of Molecular Biology, 2020.
157 | > DOI: [10.1016/j.jmb.2020.11.003](https://doi.org/10.1016/j.jmb.2020.11.003)
158 |
159 |
160 | ## Documentation and Support
161 | Please refer to the [readthedocs page](https://rcsbapi.readthedocs.io/en/latest/index.html) to learn more about package usage and other available features as well as to see more examples.
162 |
163 | If you experience any issues installing or using the package, please submit an issue on [GitHub](https://github.com/rcsb/py-rcsb-api/issues) and we will try to respond in a timely manner.
164 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # File: azure-pipelines.yml
2 | # Date: 30-May-2024
3 | #
4 | name: $(BuildDefinitionName)_$(Date:yyyyMMdd)$(Rev:.rr)
5 |
6 | trigger:
7 | - master
8 |
9 | pr:
10 | - master
11 | - staging
12 |
13 | schedules:
14 | - cron: "0 12 * * 0"
15 | displayName: Weekly Sunday build
16 | branches:
17 | include:
18 | - master
19 | always: true
20 |
21 | jobs:
22 | - template: azure-template-tox-job.yml
23 | parameters: {tox: 'format_pep8', python: '3.9', os: 'linux'}
24 | - template: azure-template-tox-job.yml
25 | parameters: {tox: 'lint_pylint', python: '3.9', os: 'linux'}
26 | - template: azure-template-tox-job.yml
27 | parameters: {tox: 'test_coverage', python: '3.9', os: 'linux'}
28 | #
29 | - template: azure-template-tox-job.yml
30 | parameters: {tox: 'py39', python: '3.9', os: 'linux'}
31 | - template: azure-template-tox-job.yml
32 | parameters: {tox: 'py39', python: '3.9', os: 'macos'}
33 | #
34 | - template: azure-template-publish-job.yml
35 | parameters: {tox: 'py39', python: '3.9', os: 'macos'}
36 | - template: azure-template-publish-job.yml
37 | parameters: {tox: 'py39', python: '3.9', os: 'linux'}
38 | #
39 |
--------------------------------------------------------------------------------
/azure-template-publish-job.yml:
--------------------------------------------------------------------------------
1 | # File: azure-template-publish-job.yml
2 | # Date: 8-Jun-2023
3 | #
4 | ##
5 | parameters:
6 | tox: ""
7 | python: ""
8 | os: "linux"
9 | fixtures: ""
10 |
11 | jobs:
12 | - job: ${{ format('publish_{0}_{1}', parameters.tox, parameters.os) }}
13 | pool:
14 | ${{ if eq(parameters.os, 'macos') }}:
15 | vmImage: 'macOS-15'
16 | ${{ if eq(parameters.os, 'linux') }}:
17 | vmImage: 'ubuntu-latest'
18 | dependsOn:
19 | - ${{ format('build_test_{0}_{1}', parameters.tox, parameters.os) }}
20 | condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'), eq(variables['Build.SourceBranch'], 'refs/heads/master'))
21 | #
22 | steps:
23 | - task: UsePythonVersion@0
24 | inputs:
25 | versionSpec: ${{ parameters.python }}
26 | addToPath: true
27 | displayName: setup python
28 | #
29 | #- checkout: self
30 | # submodules: true
31 | #
32 | - download: current
33 | artifact: ${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }}
34 |
35 | - download: current
36 | artifact: ${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }}
37 | #
38 | - script: ls -lR $(Pipeline.Workspace)/${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }}
39 | displayName: "Listing of downloaded artifacts"
40 | #
41 | - script: python -m pip install --upgrade pip twine setuptools wheel
42 | displayName: 'Install packaging tools'
43 | #
44 | - task: DownloadSecureFile@1
45 | name: pypicred
46 | displayName: 'Download PyPI credentials'
47 | inputs:
48 | secureFile: 'PYPIRC-AZURE'
49 |
50 | - ${{ if startsWith(parameters.os, 'linux') }}:
51 | - script: twine upload --verbose --skip-existing -r pypi --config-file $(pypicred.secureFilePath) $(Pipeline.Workspace)/${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }}/*
52 | displayName: "Linux upload sdist and source wheel to PyPi ..."
53 | continueOnError: true
54 | #
55 | - ${{ if startsWith(parameters.os, 'macos') }}:
56 | - script: twine upload --verbose --skip-existing -r pypi --config-file $(pypicred.secureFilePath) $(Pipeline.Workspace)/${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }}/*
57 | displayName: "Mac upload sdist and binary wheel to PyPi ..."
58 | continueOnError: true
--------------------------------------------------------------------------------
/azure-template-tox-job.yml:
--------------------------------------------------------------------------------
1 | # File: azure-template-tox-job.yml
2 | # Date: 30-May-2024
3 | #
4 | ##
5 | parameters:
6 | tox: ""
7 | python: ""
8 | os: "linux"
9 | fixtures: ""
10 |
11 | jobs:
12 | - job: ${{ format('build_test_{0}_{1}', parameters.tox, parameters.os) }}
13 | timeoutInMinutes: 0
14 | pool:
15 | ${{ if eq(parameters.os, 'macos') }}:
16 | vmImage: 'macOS-15'
17 | ${{ if eq(parameters.os, 'linux') }}:
18 | vmImage: 'ubuntu-latest'
19 |
20 | variables:
21 | - group: py-shared-variables
22 |
23 | steps:
24 | #
25 | # ensure the required Python versions are available
26 | - task: UsePythonVersion@0
27 | inputs:
28 | versionSpec: ${{ parameters.python }}
29 | addToPath: true
30 | displayName: setup python
31 | #
32 | - checkout: self
33 | submodules: true
34 | #
35 | - ${{ if startsWith(parameters.os, 'macos') }}:
36 | - bash: |
37 | set -e
38 | ls -la /Applications/Xcode*
39 | sudo xcode-select --switch /Applications/Xcode_16.app/Contents/Developer
40 | which g++
41 | c++ --version
42 | displayName: "setup Xcode"
43 | # ----------------------------------------------
44 | - ${{ if startsWith(parameters.os, 'linux') }}:
45 | - script: which apt
46 | displayName: 'Installing OS dependencies'
47 | - script: apt-cache policy | grep http | awk '{print $2 $3}' | sort -u
48 | displayName: 'Checking for repos'
49 | #
50 | - script: "python -c \"import sys; print(sys.version); print(sys.executable)\""
51 | displayName: show python information
52 | #
53 | - script: python -m pip install --upgrade pip tox
54 | displayName: 'Install tools'
55 | #
56 | - script: pip install -r requirements.txt
57 | displayName: 'Install dependencies'
58 | #
59 | - ${{ if startsWith(parameters.tox, 'py') }}:
60 | - script: |
61 | export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
62 | ${{ format('python -m tox -e {0}', parameters.tox) }}
63 | displayName: 'Running tox task'
64 | - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.9')) }}:
65 | - script: |
66 | export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
67 | ${{ format('python -m tox -e {0}-py39', parameters.tox) }}
68 | displayName: 'Running tox task'
69 | - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.8')) }}:
70 | - script: |
71 | export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV)
72 | ${{ format('python -m tox -e {0}-py38', parameters.tox) }}
73 | displayName: 'Running tox task'
74 | #
75 | # Build artifacts if this is a test target (i.e. labeled as py##)
76 | #
77 | - ${{ if startsWith(parameters.tox, 'py') }}:
78 | - script: pip install --upgrade pip twine setuptools wheel
79 | displayName: "Acquire build tools"
80 | - script: python setup.py sdist --dist-dir "$(System.DefaultWorkingDirectory)/dist"
81 | displayName: "Build source dist"
82 | - script: python setup.py bdist_wheel --dist-dir "$(System.DefaultWorkingDirectory)/dist"
83 | displayName: "Build wheel"
84 | #
85 | - script: python setup.py sdist --dist-dir "$(System.DefaultWorkingDirectory)/udist"
86 | displayName: "Build source dist"
87 | #
88 | # Check the install artifacts
89 | - script: ls -lR "$(System.DefaultWorkingDirectory)/dist" "$(System.DefaultWorkingDirectory)/udist"
90 | displayName: "Listing of installed software"
91 | #
92 | - publish: $(System.DefaultWorkingDirectory)/dist
93 | artifact: ${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }}
94 | #
95 | - publish: $(System.DefaultWorkingDirectory)/udist
96 | artifact: ${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }}
97 | #
--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
1 | {
2 | "@context": "https://w3id.org/codemeta/3.0",
3 | "type": "SoftwareSourceCode",
4 | "applicationCategory": "Structural Biology, Bioinformatics",
5 | "codeRepository": "https://github.com/rcsb/py-rcsb-api",
6 | "dateCreated": "2024-04-15",
7 | "dateModified": "2025-03-20",
8 | "datePublished": "2024-07-22",
9 | "description": "Python interface for RCSB PDB API services at RCSB.org.",
10 | "downloadUrl": "https://github.com/rcsb/py-rcsb-api/archive/refs/heads/master.zip",
11 | "funder": {
12 | "type": "Organization",
13 | "name": "US National Science Foundation (DBI-2321666), US Department of Energy (DE-SC0019749), National Cancer Institute, National Institute of Allergy and Infectious Diseases, and National Institute of General Medical Sciences of the National Institutes of Health (R01GM157729)"
14 | },
15 | "keywords": [
16 | "structural biology",
17 | "bioinformatics",
18 | "protein structure",
19 | "application programming interface",
20 | "APIs"
21 | ],
22 | "license": "https://spdx.org/licenses/MIT",
23 | "name": "rcsb-api",
24 | "operatingSystem": [
25 | "Linux",
26 | "Windows",
27 | "MacOS"
28 | ],
29 | "programmingLanguage": "Python 3",
30 | "relatedLink": [
31 | "https://pypi.org/project/rcsb-api/",
32 | "https://rcsbapi.readthedocs.io/en/latest/index.html"
33 | ],
34 | "softwareRequirements": "https://github.com/rcsb/py-rcsb-api/blob/master/requirements.txt",
35 | "version": "1.1.2",
36 | "codemeta:contIntegration": {
37 | "id": "https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_build/latest?definitionId=40&branchName=master"
38 | },
39 | "continuousIntegration": "https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_build/latest?definitionId=40&branchName=master",
40 | "developmentStatus": "active",
41 | "funding": "DBI-2321666, DE-SC0019749, R01GM157729",
42 | "issueTracker": "https://github.com/rcsb/py-rcsb-api/issues",
43 | "referencePublication": "https://doi.org/10.1016/j.jmb.2025.168970"
44 | }
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Live-reload preview via sphinx-autobuild: serves the built HTML and
# rebuilds on change. The -z flag additionally watches the ../rcsbapi
# package directory so docstring edits also trigger a rebuild.
livehtml:
	sphinx-autobuild -b html -z "$(SOURCEDIR)/../rcsbapi" "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

# Prepend the repository root (one level above docs/) so the local
# `rcsbapi` package is importable without an install step.
sys.path.insert(0, os.path.abspath(".."))
import rcsbapi  # noqa: E402  # must follow the sys.path tweak above

# -- Project information -----------------------------------------------------

project = "rcsb-api"
copyright = "2024, RCSB PDB"
author = "RCSB PDB"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# Strips any pre-release suffix after the first "-" from the package version.
version = rcsbapi.__version__.split("-")[0]
# The full version, including alpha/beta/rc tags
release = rcsbapi.__version__


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.coverage",
    "sphinx.ext.napoleon",
    # myst_parser allows the Markdown (.md) pages under docs/ to be built
    # alongside the reStructuredText sources.
    "myst_parser",
]
# source_suffix = [".rst", ".md"] # Redundant with newer sphinx versions

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# Napoleon settings
# Only Google-style docstrings are used; NumPy-style parsing is disabled.
# napoleon_google_docstring = True
napoleon_numpy_docstring = False
# napoleon_include_init_with_doc = False
# napoleon_include_private_with_doc = False
# napoleon_include_special_with_doc = True
# napoleon_use_admonition_for_examples = False
# napoleon_use_admonition_for_notes = False
# napoleon_use_admonition_for_references = False
# napoleon_use_ivar = False
# napoleon_use_param = True
# napoleon_use_rtype = True


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "sphinx_rtd_theme"


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# Empty: no custom static assets are currently shipped with the docs.
html_static_path = []
--------------------------------------------------------------------------------
/docs/data_api/additional_examples.md:
--------------------------------------------------------------------------------
1 | # Additional Examples
2 | Most examples come from [RCSB PDB Data API documentation](https://data.rcsb.org/#examples)
3 |
4 | ### Entries
5 | Fetch information about structure title and experimental method for PDB entries:
6 | ```python
7 | from rcsbapi.data import DataQuery as Query
8 |
9 | query = Query(
10 | input_type="entries",
11 | input_ids=["1STP", "2JEF", "1CDG"],
12 | return_data_list=["entries.rcsb_id", "struct.title", "exptl.method"]
13 | )
14 | result_dict = query.exec()
15 | print(result_dict)
16 | ```
17 | Performs the following GraphQL query:
18 | ```
19 | {
20 | entries(entry_ids: ["1STP", "2JEF", "1CDG"]) {
21 | rcsb_id
22 | struct {
23 | title
24 | }
25 | exptl {
26 | method
27 | }
28 | }
29 | }
30 | ```
31 | To find more about the return_data_list dot notation, see [ValueError: Not a unique field](query_construction.md#valueerror-not-a-unique-field)
32 |
33 | ### Primary Citation
34 | Fetch primary citation information (structure authors, PubMed ID, DOI) and release date for PDB entries:
35 |
36 | ```python
37 | from rcsbapi.data import DataQuery as Query
38 |
39 | query = Query(
40 | input_type="entries",
41 | input_ids=["1STP", "2JEF", "1CDG"],
42 | return_data_list=[
43 | "entries.rcsb_id",
44 | "rcsb_accession_info.initial_release_date",
45 | "audit_author.name",
46 | "rcsb_primary_citation.pdbx_database_id_PubMed",
47 | "rcsb_primary_citation.pdbx_database_id_DOI"
48 | ]
49 | )
50 | result_dict = query.exec()
51 | print(result_dict)
52 | ```
53 | Performs the following GraphQL query:
54 | ```
55 | {
56 | entries(entry_ids: ["1STP", "2JEF", "1CDG"]) {
57 | rcsb_id
58 | rcsb_accession_info {
59 | initial_release_date
60 | }
61 | audit_author {
62 | name
63 | }
64 | rcsb_primary_citation {
65 | pdbx_database_id_PubMed
66 | pdbx_database_id_DOI
67 | }
68 | }
69 | }
70 | ```
71 |
72 | ### Polymer Entities
73 | Fetch taxonomy information and information about membership in the sequence clusters for polymer entities:
74 |
75 | ```python
76 | from rcsbapi.data import DataQuery as Query
77 |
78 | query = Query(
79 | input_type="polymer_entities",
80 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"],
81 | return_data_list=[
82 | "polymer_entities.rcsb_id",
83 | "rcsb_entity_source_organism.ncbi_taxonomy_id",
84 | "rcsb_entity_source_organism.ncbi_scientific_name",
85 | "cluster_id",
86 | "identity"
87 | ]
88 | )
89 | result_dict = query.exec()
90 | print(result_dict)
91 | ```
92 | Performs the following GraphQL query:
93 | ```
94 | {
95 | polymer_entities(entity_ids: ["2CPK_1", "3WHM_1", "2D5Z_1"]) {
96 | rcsb_id
97 | rcsb_entity_source_organism {
98 | ncbi_taxonomy_id
99 | ncbi_scientific_name
100 | }
101 | rcsb_cluster_membership {
102 | cluster_id
103 | identity
104 | }
105 | }
106 | }
107 | ```
108 |
109 | ### Polymer Instances
110 | Fetch information about the domain assignments for polymer entity instances:
111 |
112 | ```python
113 | from rcsbapi.data import DataQuery as Query
114 |
115 | query = Query(
116 | input_type="polymer_entity_instances",
117 | input_ids=["4HHB.A", "12CA.A", "3PQR.A"],
118 | return_data_list=[
119 | "polymer_entity_instances.rcsb_id",
120 | "rcsb_polymer_instance_annotation.annotation_id",
121 | "rcsb_polymer_instance_annotation.name",
122 | "rcsb_polymer_instance_annotation.type"
123 | ]
124 | )
125 | result_dict = query.exec()
126 | print(result_dict)
127 | ```
128 | Performs the following GraphQL query:
129 | ```
130 | {
131 | polymer_entity_instances(instance_ids: ["4HHB.A", "12CA.A", "3PQR.A"]) {
132 | rcsb_id
133 | rcsb_polymer_instance_annotation {
134 | annotation_id
135 | name
136 | type
137 | }
138 | }
139 | }
140 | ```
141 |
142 | ### Carbohydrates
143 | Query branched entities (sugars or oligosaccharides) for commonly used linear descriptors:
144 |
145 | ```python
146 | from rcsbapi.data import DataQuery as Query
147 |
148 | query = Query(
149 | input_type="branched_entities",
150 | input_ids=["5FMB_2", "6L63_3"],
151 | return_data_list=[
152 | "rcsb_id",
153 | "pdbx_entity_branch.type",
154 | "pdbx_entity_branch_descriptor.type",
155 | "pdbx_entity_branch_descriptor.descriptor"
156 | ]
157 | )
158 | result_dict = query.exec()
159 | print(result_dict)
160 | ```
161 | Performs the following GraphQL query:
162 | ```
163 | {
164 | branched_entities(entity_ids: ["5FMB_2", "6L63_3"]) {
165 | rcsb_id
166 | pdbx_entity_branch {
167 | type
168 | }
169 | pdbx_entity_branch_descriptor {
170 | type
171 | descriptor
172 | }
173 | }
174 | }
175 | ```
176 |
177 | ### Sequence Positional Features
178 |
179 | Sequence positional features describe regions or sites of interest in the PDB sequences, such as binding sites, active sites, linear motifs, local secondary structure, structural and functional domains, etc. Positional annotations include depositor-provided information available in the PDB archive as well as annotations integrated from external resources (e.g. UniProtKB).
180 |
181 | This example queries `polymer_entity_instances` positional features. The query returns features of different types: for example, CATH and SCOP classifications assignments integrated from UniProtKB data, or the secondary structure annotations from the PDB archive data calculated by the data-processing program called MAXIT (Macromolecular Exchange and Input Tool) that is based on an earlier ProMotif implementation.
182 |
183 | ```python
184 | from rcsbapi.data import DataQuery as Query
185 |
186 | query = Query(
187 | input_type="polymer_entity_instances",
188 | input_ids=["1NDO.A"],
189 | return_data_list=[
190 | "polymer_entity_instances.rcsb_id",
191 | "rcsb_polymer_instance_feature.type",
192 | "rcsb_polymer_instance_feature.feature_positions.beg_seq_id",
193 | "rcsb_polymer_instance_feature.feature_positions.end_seq_id"
194 | ]
195 | )
196 | result_dict = query.exec()
197 | print(result_dict)
198 | ```
199 | Performs the following GraphQL query:
200 | ```
201 | {
202 | polymer_entity_instances(instance_ids: ["1NDO.A"]) {
203 | rcsb_id
204 | rcsb_polymer_instance_feature {
205 | type
206 | feature_positions {
207 | beg_seq_id
208 | end_seq_id
209 | }
210 | }
211 | }
212 | }
213 | ```
214 |
215 | ### Reference Sequence Identifiers
216 | This example shows how to access identifiers related to entries (cross-references) and found in data collections other than PDB. Each cross-reference is described by the database name and the database accession. A single entry can have cross-references to several databases, e.g. UniProt and GenBank in 7NHM, or no cross-references, e.g. 5L2G:
217 | ```python
218 | from rcsbapi.data import DataQuery as Query
219 |
220 | query = Query(
221 | input_type="entries",
222 | input_ids=["7NHM", "5L2G"],
223 | return_data_list=[
224 | "polymer_entities.rcsb_id",
225 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
226 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name"
227 | ]
228 | )
229 | result_dict = query.exec()
230 | print(result_dict)
231 | ```
232 | Performs the following GraphQL query:
233 | ```
234 | {
235 | entries(entry_ids: ["7NHM", "5L2G"]){
236 | polymer_entities {
237 | rcsb_id
238 | rcsb_polymer_entity_container_identifiers {
239 | reference_sequence_identifiers {
240 | database_accession
241 | database_name
242 | }
243 | }
244 | }
245 | }
246 | }
247 | ```
248 |
249 | ### Chemical Components
250 | Query for specific items in the chemical component dictionary based on a given list of CCD ids:
251 |
252 | ```python
253 | from rcsbapi.data import DataQuery as Query
254 |
255 | query = Query(
256 | input_type="chem_comps",
257 | input_ids=["NAG", "EBW"],
258 | return_data_list=[
259 | "chem_comps.rcsb_id",
260 | "chem_comp.type",
261 | "chem_comp.formula_weight",
262 | "chem_comp.name",
263 | "chem_comp.formula",
264 | "rcsb_chem_comp_info.initial_release_date"
265 | ]
266 | )
267 | result_dict = query.exec()
268 | print(result_dict)
269 | ```
270 | Performs the following GraphQL query:
271 | ```
272 | {
273 | chem_comps(comp_ids: ["NAG", "EBW"]) {
274 | rcsb_id
275 | chem_comp {
276 | type
277 | formula_weight
278 | name
279 | formula
280 | }
281 | rcsb_chem_comp_info {
282 | initial_release_date
283 | }
284 | }
285 | }
286 | ```
287 |
288 | ### Computed Structure Models
289 | This example shows how to get a list of global Model Quality Assessment metrics for AlphaFold structure of Hemoglobin subunit beta:
290 |
291 | ```python
292 | from rcsbapi.data import DataQuery as Query
293 |
294 | query = Query(
295 | input_type="entries",
296 | input_ids=["AF_AFP68871F1"],
297 | return_data_list=["rcsb_id", "ma_qa_metric_global.type", "ma_qa_metric_global.value"]
298 | )
299 | result_dict = query.exec()
300 | print(result_dict)
301 | ```
302 | Performs the following GraphQL query:
303 | ```
304 | {
305 | entries(entry_ids: ["AF_AFP68871F1"]) {
306 | rcsb_id
307 | rcsb_ma_qa_metric_global {
308 | ma_qa_metric_global {
309 | type
310 | value
311 | }
312 | }
313 | }
314 | }
315 | ```
316 |
317 | ### PubMed
318 | This example gets the abstract text of the paper with the specified PubMed ID.
319 |
320 | ```python
321 | from rcsbapi.data import DataQuery as Query
322 |
323 | query = Query(
324 | input_type="pubmed",
325 | return_data_list=["rcsb_pubmed_abstract_text"],
326 | input_ids=["6726807"]
327 | )
328 |
329 | result_dict = query.exec()
330 | print(result_dict)
331 | ```
332 |
333 | Performs the following GraphQL query:
334 | ```
335 | {
336 | pubmed(pubmed_id: 6726807) {
337 | rcsb_pubmed_abstract_text
338 | }
339 | }
340 | ```
341 |
342 | ### UniProt
343 | This example gets a description of the function of a protein based on the UniProt ID.
344 |
345 | ```python
346 | from rcsbapi.data import DataQuery as Query
347 |
348 | query = Query(
349 | input_type="uniprot",
350 | return_data_list=["function.details"],
351 | input_ids=["P68871"]
352 | )
353 |
354 | result_dict = query.exec()
355 | print(result_dict)
356 | ```
357 |
358 | Performs the following GraphQL query:
359 | ```
360 | {
361 | uniprot(uniprot_id: "P68871") {
362 | rcsb_uniprot_protein {
363 | function {
364 | details
365 | }
366 | }
367 | }
368 | }
369 | ```
370 |
--------------------------------------------------------------------------------
/docs/data_api/api.rst:
--------------------------------------------------------------------------------
1 | API Documentation
2 | *****************
3 |
4 | .. automodule:: rcsbapi.data
5 | :members:
6 | :special-members: __init__
7 |
--------------------------------------------------------------------------------
/docs/data_api/implementation_details.md:
--------------------------------------------------------------------------------
1 | # Implementation Details
2 | ### Parsing Schema
3 | Upon initialization of the package, the GraphQL schema is fetched from the GraphQL Data API endpoint. After fetching the schema, the Python package parses the schema and creates a graph object to represent it within the package. This graph representation of how fields and types connect is key to how queries are automatically constructed using a path finding algorithm. The graph is constructed as a directed graph in [rustworkx](https://www.rustworkx.org/), so `rustworkx` must be able to be installed on your machine to use this. If you experience installation or usage issues, please create an issue on [GitHub](https://github.com/rcsb/py-rcsb-api/issues) and we will consider implementing alternative support.
4 |
5 | ### Constructing queries
6 | Queries are constructed by finding every [simple path](https://en.wikipedia.org/wiki/Simple_path#:~:text=Simple%20path%20(graph%20theory)%2C,does%20not%20have%20repeating%20vertices) from the `input_type` to each final requested field in `return_data_list`. The simple paths are searched for path(s) matching the given path in `return_data_list`. The given path must be sufficiently specific to allow for only one possible path. If there are multiple possible paths, a [ValueError](query_construction.md#valueerror-not-a-unique-field) is raised.
7 |
8 | ### Error Handling
9 | In GraphQL, all requests return HTTP status code 200 and instead, errors appear in the returned JSON. The package will parse these errors, throwing a `ValueError` and displaying the corresponding error message or messages. To access the full query and return JSON in an interactive editor, you can use the `get_editor_link()` method on the DataQuery object. (see [Helpful Methods](query_construction.md#get_editor_link))
--------------------------------------------------------------------------------
/docs/data_api/query_construction.md:
--------------------------------------------------------------------------------
1 | # Query Construction
2 |
3 | ## Query Objects
4 | Constructing a query object requires three inputs. The JSON response to a query is stored in the `response` attribute of a Query object and can be accessed using the `get_response()` method.
5 | ```python
6 | from rcsbapi.data import DataQuery as Query
7 |
8 | # Constructing the Query object
9 | query = Query(
10 | input_type="entries",
11 | input_ids=["4HHB"],
12 | return_data_list=["exptl.method"]
13 | )
14 |
15 | # Executing the query
16 | query.exec()
17 |
18 | # Accessing the response
19 | # Can also print using print(query.exec())
20 | print(query.get_response())
21 | ```
22 |
23 | ### input_type
Specifies the data hierarchy level from which you are starting your query (e.g., `"entry"`, `"polymer_entity"`, etc.).
25 |
26 | Also called "root fields", these represent designated points from which you can begin querying. This includes `"entries"`, `"polymer_entities"`, `"polymer_entity_instances"`, etc. Singular input_types are automatically converted to their plural form when possible to allow for more flexibility in `input_ids`. For the full list of `input_type`s see below:
27 |
28 |
29 | Full list of input_types
30 |
31 | - `"entry"`
32 | - `"entries"`
33 | - `"polymer_entity"`
34 | - `"polymer_entities"`
35 | - `"branched_entity"`
36 | - `"branched_entities"`
37 | - `"nonpolymer_entity"`
38 | - `"nonpolymer_entities"`
39 | - `"polymer_entity_instance"`
40 | - `"polymer_entity_instances"`
41 | - `"nonpolymer_entity_instance"`
42 | - `"nonpolymer_entity_instances"`
43 | - `"branched_entity_instance"`
44 | - `"branched_entity_instances"`
45 | - `"assembly"`
46 | - `"assemblies"`
47 | - `"interface"`
48 | - `"interfaces"`
49 | - `"uniprot"`
50 | - `"pubmed"`
51 | - `"chem_comp"`
52 | - `"chem_comps"`
53 | - `"entry_group"`
54 | - `"entry_groups"`
55 | - `"polymer_entity_group"`
56 | - `"polymer_entity_groups"`
57 | - `"group_provenance"`
58 |
59 |
60 |
61 | ### input_ids
Specifies which entries, entities, etc. you would like to request data for.
63 |
64 | This can be a dictionary or a list. Dictionaries must be passed with specific keys corresponding to the input_type. You can find the key names by using the `get_input_id_dict(input_type)` method (see [Helpful Methods](query_construction.md#get-input-id-dict)) or by looking in the [GraphiQL editor](https://data.rcsb.org/graphql/index.html) Docs menu. Lists must be passed in PDB identifier format.
65 |
66 |
67 |
68 | |Type|PDB ID Format|Example|
69 | |---|---|---|
70 | |entries|entry id|4HHB|
71 | |polymer, branched, or non-polymer entities|[entry_id]_[entity_id]|4HHB_1|
72 | |polymer, branched, or non-polymer entity instances|[entry_id].[asym_id]|4HHB.A|
73 | |biological assemblies|[entry_id]-[assembly_id]|4HHB-1|
74 | |interface|[entry_id]-[assembly_id].[interface_id]|4HHB-1.1|
75 |
76 |
77 |
78 | Dictionaries and Lists will be treated equivalently for the `input_ids` argument. For example, these `input_ids` arguments are equivalent.
79 |
80 | ```python
81 | # input_type is polymer_entity_instance
82 | input_ids=["4HHB.A"]
83 | input_ids={"entry_id": "4HHB", "asym_id": "A"}
84 | ```
85 | ```python
86 | # input_type is polymer_entity_instances (plural)
87 | input_ids=["4HHB.A", "4HHB.B"]
88 | input_ids={"instance_ids": ["4HHB.A", "4HHB.B"]}
89 | ```
90 |
91 | While it is generally more efficient and easier to interpret results if you use a refined list of IDs, if you would like to request a set of data for all IDs within an `input_type`, you can use the `ALL_STRUCTURES` variable. This will set `input_ids` to all IDs for the given `input_type` if supported.
92 |
93 | ```python
94 | from rcsbapi.data import DataQuery as Query
95 | from rcsbapi.data import ALL_STRUCTURES
96 |
97 | # Using `ALL_STRUCTURES` with `input_type` "entries"
98 | # will use all experimentally-determined entry IDs
99 | query = Query(
100 | input_type="entries",
101 | input_ids=ALL_STRUCTURES,
102 | return_data_list=["exptl.method"]
103 | )
104 |
105 | # Executing the query with a progress bar
106 | query.exec(progress_bar=True)
107 |
108 | print(query.get_response())
109 | ```
110 |
111 | ### return_data_list
112 | These are the data that you are requesting (or "fields").
113 |
114 | In GraphQL syntax, the final requested data must be a "scalar" type (string, integer, boolean). However, if you request non-scalar data, the package will auto-populate the query to include all fields under the specified data until scalars are reached. Once you receive the query response and understand what specific data you would like to request, you can refine your query by requesting more specific fields.
115 |
116 | The "rcsb_id" field will automatically be added to all queries allowing for easier parsing of the returned JSON. You can turn this off by setting the optional `add_rcsb_id` argument to False.
117 |
118 | ```python
119 | from rcsbapi.data import DataQuery as Query
120 |
121 | query = Query(
122 | input_type="entries",
123 | input_ids=["4HHB"],
124 | return_data_list=["exptl"]
125 | )
126 | result_dict = query.exec()
127 | print(result_dict)
128 | ```
129 | ```json
130 | {
131 | "data": {
132 | "entries": [
133 | {
134 | "rcsb_id": "4HHB",
135 | "exptl": [
136 | {
137 | "method_details": null,
138 | "method": "X-RAY DIFFRACTION",
139 | "crystals_number": null,
140 | "details": null
141 | }
142 | ]
143 | }
144 | ]
145 | }
146 | }
147 | ```
148 | This query can be made more concise by specifying a field, like "method". In this case, the field name "method" is redundant because it appears under other types and must be further specified using dot notation. For more details see [ValueError: Not a unique field](query_construction.md#valueerror-not-a-unique-field)
149 | ```python
150 | from rcsbapi.data import DataQuery as Query
151 |
152 | query = Query(
153 | input_type="entries",
154 | input_ids=["4HHB"],
155 | return_data_list=["exptl.method"]
156 | )
157 | result_dict = query.exec()
158 | print(result_dict)
159 | ```
160 | ```json
161 | {
162 | "data": {
163 | "entries": [
164 | {
165 | "rcsb_id": "4HHB",
166 | "exptl": [
167 | {
168 | "method": "X-RAY DIFFRACTION"
169 | }
170 | ]
171 | }
172 | ]
173 | }
174 | }
175 | ```
176 |
177 | ### Executing Large Queries
178 | When executing large queries, the package will batch the `input_ids` before requesting and merge the responses into one JSON object. The default batch size is 5,000, but this value can be adjusted in the `exec` method. To see a progress bar that tracks which batches have been completed, you can set `progress_bar` to `True`.
179 |
180 | ```python
181 | from rcsbapi.data import DataQuery as Query
182 | from rcsbapi.data import ALL_STRUCTURES
183 |
184 | query = Query(
185 | input_type="entries",
186 | input_ids=ALL_STRUCTURES,
187 | return_data_list=["exptl.method"]
188 | )
189 |
190 | # Executing query with larger batch size
191 | # and progress bar
192 | query.exec(
193 | batch_size=7000,
194 | progress_bar=True
195 | )
196 |
197 | print(query.get_response())
198 | ```
199 |
200 | ## Helpful Methods
201 | There are several methods included to make working with query objects easier. These methods can help you refine your queries to request exactly and only what you want, as well as further understand the GraphQL syntax.
202 |
203 | ### get_editor_link()
204 | This method returns the link to a [GraphiQL](https://data.rcsb.org/graphql/index.html) window with the query. From the window, you can use the user interface to explore other fields and refine your query. Method of the `DataQuery` class.
205 |
206 | ```python
207 | from rcsbapi.data import DataQuery as Query
208 |
209 | query = Query(
210 | input_type="entries",
211 | input_ids=["4HHB"],
212 | return_data_list=["exptl"]
213 | )
214 | editor_link = query.get_editor_link()
215 | print(editor_link)
216 | ```
217 |
218 | ### find_paths()
219 | Given a redundant field, this method finds all paths from an `input_type` to nodes named as `return_data_name`. Method of the `DataSchema` class.
220 |
221 | ```python
222 | from rcsbapi.data import DataSchema
223 |
224 | schema = DataSchema()
225 | schema.find_paths(input_type="entries", return_data_name="id")
226 | ```
227 |
228 | To return a dictionary with descriptions for each path, set `descriptions` to true.
229 | ```python
230 | schema.find_paths(input_type="entries", return_data_name="id", descriptions=True)
231 | ```
232 |
233 | ### find_field_names()
234 | Given a string, this method will return all fields containing that string.
235 |
236 | ```python
237 | from rcsbapi.data import DataSchema
238 |
239 | schema = DataSchema()
240 | schema.find_field_names("exptl")
241 | ```
242 |
243 | ### get_input_id_dict()
244 | Given an `input_type`, returns a dictionary with the corresponding input keys and descriptions of each key. Method of the `DataSchema` class.
245 |
246 | ```python
247 | from rcsbapi.data import DataSchema
248 |
249 | schema = DataSchema()
250 | schema.get_input_id_dict("polymer_entity_instance")
251 | ```
252 |
253 | ## Troubleshooting
254 | ### ValueError: Not a unique field
Some fields are redundant within our GraphQL Data API schema. For example, "id" appears over 50 times. To allow for specific querying, redundant fields are identified by dot notation (e.g., `entry.id`). If you request a redundant field without this syntax, a `ValueError` will be returned stating that the field exists, but is not unique. You can then use `find_paths(input_type, return_data_name)` to find a path that would specify the desired field.
256 |
257 | ```python
258 | from rcsbapi.data import DataQuery as Query
259 |
260 | # querying a redundant field
261 | query = Query(
262 | input_type="entries",
263 | input_ids=["4HHB"],
264 | return_data_list=["id"]
265 | )
266 | result_dict = query.exec()
267 | print(result_dict)
268 | ```
269 | ```
270 | ValueError: "id" exists, but is not a unique field, must specify further.
271 | 10 of 118 possible paths:
272 | entries.assemblies.branched_entity_instances.branched_entity.chem_comp_monomers.chem_comp.id
273 | entries.assemblies.branched_entity_instances.branched_entity.chem_comp_monomers.rcsb_bird_citation.id
274 | ...
275 |
276 | For all paths run:
277 | from rcsbapi.data import DataSchema
278 | schema = DataSchema()
279 | schema.find_paths("entries", "id")
280 | ```
281 | ```python
282 | from rcsbapi.data import DataSchema
283 |
284 | # run find_paths(input_type, return_data_name)
285 | schema = DataSchema()
286 | print(schema.find_paths(input_type="entries", return_data_name="id"))
287 | ```
288 |
289 | ```python
290 | # select desired field from the returned list
291 | ['citation.id',
292 | 'diffrn.id'
293 | 'entry.id'
294 | ...
295 | 'polymer_entities.prd.chem_comp.id',
296 | 'polymer_entities.prd.rcsb_bird_citation.id',
297 | 'polymer_entities.prd.rcsb_chem_comp_annotation.annotation_lineage.id']
298 | ```
299 | ```python
300 | from rcsbapi.data import DataQuery as Query
301 |
302 | # valid query
303 | query = Query(
304 | input_type="entries",
305 | input_ids=["4HHB"],
306 | return_data_list=["entry.id"]
307 | )
308 | result_dict = query.exec()
309 | print(result_dict)
310 | ```
--------------------------------------------------------------------------------
/docs/data_api/quickstart.md:
--------------------------------------------------------------------------------
1 | # Quickstart
2 |
3 | ## Installation
4 | Get it from PyPI:
5 |
6 | pip install rcsb-api
7 |
8 | Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api)
9 |
10 | ## Import
11 | To import this package, use:
12 | ```python
13 | from rcsbapi.data import DataSchema, DataQuery
14 | ```
15 |
16 | ## Getting Started
17 | The [RCSB PDB Data API](https://data.rcsb.org) supports requests using [GraphQL](https://graphql.org/), a language for API queries. This package simplifies generating queries in GraphQL syntax.
18 |
19 | To generate a query in this package, you would create a `DataQuery` object. The query must be executed using the `.exec()` method, which will return the JSON response as well as store the response as an attribute of the `DataQuery` object. From the object, you can access the Data API response, get an interactive editor link, or access the arguments used to create the query.
20 |
21 | The package is able to automatically build queries based on the "input_type" and path segment passed into "return_data_list". If using this package in code intended for long-term use, it's recommended to use the fully qualified path (a complete path from input type to the final data field). When autocompletion is being used, a WARNING message will be printed out as a reminder.
22 |
23 | To suppress the warning, either use the fully qualified path (as in the below example) or set the `suppress_autocomplete_warning` argument to True. To suppress the warning for all queries, change the SUPPRESS_AUTOCOMPLETE_WARNING flag in config.py.
24 |
25 |
26 | ```python
27 | from rcsbapi.data import DataQuery as Query
28 |
29 | query = Query(
30 | input_type="entries",
31 | input_ids=["4HHB"],
32 | return_data_list=["exptl.method"]
33 | )
34 |
35 | result_dict = query.exec()
36 | print(result_dict)
37 | # print(query.get_response()) would be equivalent
38 | ```
39 | Data is returned in JSON format
40 | ```json
41 | {
42 | "data": {
43 | "entries": [
44 | {
45 | "rcsb_id": "4HHB",
46 | "exptl": [
47 | {
48 | "method": "X-RAY DIFFRACTION"
49 | }
50 | ]
51 | }
52 | ]
53 | }
54 | }
55 | ```
56 |
57 | ### GraphQL
58 | This is the equivalent query in GraphQL syntax.
59 | ```
60 | {
61 | entries(entry_ids: ["4HHB"]) { # returns type "CoreEntry"
62 | exptl { # returns type "Exptl"
63 | method # returns a scalar (string)
64 | }
65 | }
66 | }
67 | ```
68 | GraphQL is built on "types" and their associated "fields". All types and their fields are defined in a "schema". An example of a type in our schema is "CoreEntry" and a field under CoreEntry is "exptl" (experimental). Upon initialization, the Data API package fetches the schema from the RCSB PDB website (See [Implementation Details](implementation_details.md) for more).
69 |
70 | In GraphQL, you must begin your query at specific fields. These are fields like `entries`, `polymer_entities`, and `polymer_entity_instances` (see full list [here](query_construction.md#input-type)). Each field can return a scalar (e.g. string, integer) or a type. Every query must ultimately request scalar value(s), which can be seen in the example query below. As shown in the example, fields are explicitly included in queries while types are implicit. Types are named in CamelCase (CoreEntry) while fields are in snake case (exptl or audit_author).
71 |
72 | ### Autocompletion of Queries
73 | One way this package simplifies making requests is by adding fields that return scalars into the generated query if you request a field that returns a type.
74 | ```python
75 | from rcsbapi.data import DataQuery as Query
76 |
77 | query = Query(
78 | input_type="entries",
79 | input_ids=["4HHB"],
80 | # Requesting "exptl" will return a query requesting exptl.method, exptl.details, etc
81 | return_data_list=["exptl"]
82 | )
83 | result_dict = query.exec()
84 | print(result_dict)
85 | ```
86 | This creates a valid query even though "exptl" doesn't return a scalar. However, the resulting query will be more verbose, requesting all scalar fields under "exptl" (see [return_data_list](query_construction.md#return-data-list)).
87 |
88 | ## Jupyter Notebooks
89 | A notebook briefly summarizing the [readthedocs](https://rcsbapi.readthedocs.io/en/latest/index.html) is available in [notebooks/data_quickstart.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/data_quickstart.ipynb) or online through Google Colab
90 |
91 | Another notebook using both Search and Data API packages for a COVID-19 related example is available in [notebooks/search_data_workflow.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/search_data_workflow.ipynb) or online through Google Colab
.
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | rcsb-api - Query RCSB PDB data from Python
2 | ===============================================
3 |
4 | The ``rcsb-api`` package provides a Python interface to
5 | `RCSB PDB API services <https://www.rcsb.org/docs/programmatic-access>`_.
6 | Use it to search and fetch macromolecular structure data from RCSB PDB `at RCSB.org <https://www.rcsb.org>`_.
7 |
8 | Availability
9 | ------------
10 |
11 | Get it from PyPI:
12 |
13 | .. code-block:: bash
14 |
15 | pip install rcsb-api
16 |
17 | Or, download from `GitHub <https://github.com/rcsb/py-rcsb-api>`_
18 |
19 |
20 | Contents
21 | --------
22 |
23 | .. toctree::
24 | :caption: Search API
25 | :maxdepth: 2
26 |
27 | search_api/quickstart.md
28 | search_api/query_construction.md
29 | search_api/attributes.md
30 | search_api/additional_examples.md
31 | search_api/api.rst
32 |
33 | .. toctree::
34 | :caption: Data API
35 | :maxdepth: 2
36 |
37 | data_api/quickstart.md
38 | data_api/query_construction.md
39 | data_api/implementation_details.md
40 | data_api/additional_examples.md
41 | data_api/api.rst
42 |
43 |
44 | License
45 | -------
46 |
47 | Code is licensed under the MIT license. See the
48 | `LICENSE <https://github.com/rcsb/py-rcsb-api/blob/master/LICENSE>`_ for details.
49 |
50 |
51 | Citing
52 | ------
53 |
54 | Please cite the ``rcsb-api`` package with the following reference:
55 |
56 | Dennis W. Piehl, Brinda Vallat, Ivana Truong, Habiba Morsy, Rusham Bhatt, Santiago Blaumann, Pratyoy Biswas, Yana Rose, Sebastian Bittrich, Jose M. Duarte, Joan Segura, Chunxiao Bi, Douglas Myers-Turnbull, Brian P. Hudson, Christine Zardecki, Stephen K. Burley. rcsb-api: Python Toolkit for Streamlining Access to RCSB Protein Data Bank APIs, Journal of Molecular Biology, 2025. DOI: https://doi.org/10.1016/j.jmb.2025.168970
57 |
58 | You should also cite the RCSB.org API services this package utilizes:
59 |
60 | Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley, John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards Integrated Searching and Efficient Access to Macromolecular Structure Data from the PDB Archive, Journal of Molecular Biology, 2020. DOI: https://doi.org/10.1016/j.jmb.2020.11.003
61 |
62 |
63 | Support
64 | -------
65 |
66 | If you experience any issues installing or using the package, please submit an issue on
67 | `GitHub <https://github.com/rcsb/py-rcsb-api/issues>`_ and we will try to respond in a timely manner.
68 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Pin dependencies for the docs
2 | # Should be kept up-to-date with setup.py
3 | sphinx==5.3.0
4 | sphinx-rtd-theme==0.5.2
5 | typing-extensions==3.7.4.3
6 | myst-parser==1.0.0
7 | jinja2<3.1
8 | requests >= 2.0.0
9 | rustworkx
10 | graphql-core
11 | tqdm
12 |
--------------------------------------------------------------------------------
/docs/search_api/api.rst:
--------------------------------------------------------------------------------
1 | API Documentation
2 | *****************
3 |
4 | .. automodule:: rcsbapi.search
5 | :members:
6 | :special-members: __init__
7 |
8 | .. autoclass:: rcsbapi.search.search_schema.SearchSchemaGroup
9 | :members: search, get_attribute_details, get_attribute_type
10 |
11 | .. autoclass:: rcsbapi.search.search_query.Session
12 | :members: get_editor_link, get_query_builder_link
13 |
--------------------------------------------------------------------------------
/docs/search_api/attributes.md:
--------------------------------------------------------------------------------
1 | # Exploring Schema Attributes
2 |
3 | Attributes are pieces of information associated with a PDB structure that can be searched for or compared to a value using an [`AttributeQuery`](quickstart.md#getting-started). There are [structure attributes](https://search.rcsb.org/structure-search-attributes.html) and [chemical attributes](https://search.rcsb.org/chemical-search-attributes.html), which are both stored in `search_attributes`. This can be imported as shown below:
4 |
5 | ```python
6 | # import search_attributes as attrs for a shorter name
7 | from rcsbapi.search import search_attributes as attrs
8 | ```
9 |
10 | There are several helpful methods to search for attribute names or explore other information related to attributes.
11 |
12 | ### search()
13 | Given a string, this method will return an iterable of `Attr` objects with names that contain the given string. You can also use [regular expression (regex)](https://en.wikipedia.org/wiki/Regular_expression) strings.
14 |
15 | ```python
16 | matching_attrs = attrs.search("author")
17 |
18 | for attr in matching_attrs:
19 | print(attr)
20 | ```
21 |
22 | ### get_attribute_details()
23 | Given a full or partial attribute name, return a set containing the matching `Attr` (or, for a partial name, all associated `Attr`s), along with their attribute names, search service types, and descriptions.
24 |
25 | ```python
26 | from rcsbapi.search import search_attributes as attrs
27 |
28 | # Use a full name to get details for a specific attribute
29 | print(attrs.get_attribute_details("rcsb_entity_source_organism.scientific_name"))
30 |
31 | # Use a partial name to get the details of all attributes associated with that name
32 | # Below code gets details for ".common_name", ".source_type", etc in addition to ".scientific_name"
33 | print(attrs.get_attribute_details("rcsb_entity_source_organism"))
34 | ```
35 |
36 | ### get_attribute_type()
37 | Given a full attribute name, return the search service type (`text` for structure attributes and `text_chem` for chemical attributes).
38 |
39 | ```python
40 | from rcsbapi.search import search_attributes as attrs
41 |
42 | print(attrs.get_attribute_type("rcsb_entity_source_organism.scientific_name"))
43 | ```
--------------------------------------------------------------------------------
/docs/search_api/quickstart.md:
--------------------------------------------------------------------------------
1 | # Quickstart
2 |
3 | ## Installation
4 |
5 | Get it from PyPI:
6 |
7 | pip install rcsb-api
8 |
9 | Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api)
10 |
11 | ## Getting Started
12 | ### Basic Query Construction
13 |
14 | #### Full-text search
15 | To perform a "full-text" search for structures associated with the term "Hemoglobin", you can create a `TextQuery`:
16 |
17 | ```python
18 | from rcsbapi.search import TextQuery
19 |
20 | # Search for structures associated with the phrase "Hemoglobin"
21 | query = TextQuery(value="Hemoglobin")
22 |
23 | # Execute the query by running it as a function
24 | results = query()
25 |
26 | # Results are returned as an iterator of result identifiers.
27 | for rid in results:
28 | print(rid)
29 | ```
30 |
31 | #### Attribute search
32 | To perform a search for specific structure or chemical attributes, you can create an `AttributeQuery`.
33 |
34 | ```python
35 | from rcsbapi.search import AttributeQuery
36 |
37 | # Construct a query searching for structures from humans
38 | query = AttributeQuery(
39 | attribute="rcsb_entity_source_organism.scientific_name",
40 | operator="exact_match", # Other operators include "contains_phrase", "exists", and more
41 | value="Homo sapiens"
42 | )
43 |
44 | # Execute query and construct a list from results
45 | results = list(query())
46 | print(results)
47 | ```
48 |
49 | Refer to the [Search Attributes](https://search.rcsb.org/structure-search-attributes.html) and [Chemical Attributes](https://search.rcsb.org/chemical-search-attributes.html) documentation for a full list of attributes and applicable operators.
50 |
51 | Alternatively, you can construct attribute queries with comparative operators using the `search_attributes` object (which also allows for names to be tab-completed):
52 |
53 | ```python
54 | from rcsbapi.search import search_attributes as attrs
55 |
56 | # Search for structures from humans
57 | query = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens"
58 |
59 | # Run query and construct a list from results
60 | results = list(query())
61 | print(results)
62 | ```
63 |
64 | #### Grouping sub-queries
65 |
66 | You can combine multiple queries using Python bitwise operators.
67 |
68 | ```python
69 | from rcsbapi.search import search_attributes as attrs
70 |
71 | # Query for human epidermal growth factor receptor (EGFR) structures (UniProt ID P00533)
72 | # with investigational or experimental drugs bound
73 | q1 = attrs.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession == "P00533"
74 | q2 = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens"
75 | q3 = attrs.drugbank_info.drug_groups == "investigational"
76 | q4 = attrs.drugbank_info.drug_groups == "experimental"
77 |
78 | # Structures matching UniProt ID P00533 AND from humans
79 | # AND (investigational OR experimental drug group)
80 | query = q1 & q2 & (q3 | q4)
81 |
82 | # Execute query and print first 10 ids
83 | results = list(query())
84 | print(results[:10])
85 | ```
86 |
87 | These examples are in "operator" syntax. You can also make queries in "fluent" syntax. Learn more about both syntaxes and implementation details in [Query Syntax and Execution](query_construction.md#query-syntax-and-execution).
88 |
89 | ### Supported Search Services
90 | The list of supported search service types are listed in the table below. For more details on their usage, see [Search Service Types](query_construction.md#search-service-types).
91 |
92 | |Search service |QueryType |
93 | |----------------------------------|--------------------------|
94 | |Full-text |`TextQuery()` |
95 | |Attribute (structure or chemical) |`AttributeQuery()` |
96 | |Sequence similarity |`SeqSimilarityQuery()` |
97 | |Sequence motif |`SeqMotifQuery()` |
98 | |Structure similarity |`StructSimilarityQuery()` |
99 | |Structure motif |`StructMotifQuery()` |
100 | |Chemical similarity |`ChemSimilarityQuery()` |
101 |
102 | Learn more about available search services on the [RCSB PDB Search API docs](https://search.rcsb.org/#search-services).
103 |
104 | ## Jupyter Notebooks
105 | A runnable jupyter notebook is available in [notebooks/search_quickstart.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/search_quickstart.ipynb), or can be run online using Google Colab:
106 |
107 |
108 | An additional Covid-19 related example is in [notebooks/covid.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/covid.ipynb):
109 |
110 |
--------------------------------------------------------------------------------
/notebooks/covid.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "46b8b87a",
6 | "metadata": {},
7 | "source": [
8 | "
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "deb1fbf6",
14 | "metadata": {},
15 | "source": [
16 | "# RCSB PDB Search API: Covid-19 Use-Case\n",
17 | "\n",
18 | "\n",
19 | "Start by installing the package:\n",
20 | "\n",
21 | " pip install rcsb-api\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "0e3979a2",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "%pip install rcsb-api"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "id": "married-burden",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from rcsbapi.search import search_attributes as attrs, TextQuery"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "id": "266c28ab",
47 | "metadata": {},
48 | "source": [
49 | "## Demo\n",
50 | "\n",
51 | "We are interested in how the antiviral drug boceprevir interacts with the Covid-19 virus, so we'll construct a query with the following specifications:\n",
52 | "- Source Organism is \"COVID-19 virus\"\n",
53 | "- Associated with the word \"protease\"\n",
54 | "- Bound to ligand \"Boceprevir\"\n",
55 | "\n",
56 | "[RCSB Query](http://www.rcsb.org/search?request=%7B%22query%22%3A%7B%22type%22%3A%22group%22%2C%22logical_operator%22%3A%22and%22%2C%22nodes%22%3A%5B%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22rcsb_entity_source_organism.taxonomy_lineage.name%22%2C%22operator%22%3A%22exact_match%22%2C%22value%22%3A%22COVID-19%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A0%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22value%22%3A%22protease%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A1%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22chem_comp.name%22%2C%22operator%22%3A%22contains_words%22%2C%22value%22%3A%22Boceprevir%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A2%7D%5D%7D%2C%22return_type%22%3A%22entry%22%2C%22request_info%22%3A%7B%22query_id%22%3A%2270e677a6376b4c5eba8b4f2b73866c92%22%2C%22src%22%3A%22ui%22%7D%7D)"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "id": "collectible-thread",
62 | "metadata": {},
63 | "source": [
64 | "## Operator syntax\n",
65 | "- Uses python comparison operators to compare attributes to a value (`==`, `<`, `<=`, etc)\n",
66 | "- Combine using set operators (`&`, `|`, `~`, etc)\n",
67 | "- Execute queries as functions"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "id": "confidential-behavior",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == \"COVID-19 virus\"\n",
78 | "q2 = TextQuery(\"protease\")\n",
79 | "q3 = attrs.chem_comp.name.contains_words(\"Boceprevir\")\n",
80 | "query = q1 & q2 & q3\n",
81 | "\n",
82 | "list(query())"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "id": "uniform-allen",
88 | "metadata": {},
89 | "source": [
90 | "## Fluent syntax\n",
91 | "\n",
92 | "A second syntax is available with a [fluent interface](https://en.wikipedia.org/wiki/Fluent_interface), similar to popular data science packages like tidyverse and Apache Spark. Function calls are chained together.\n",
93 | "\n",
94 | "Here's an example around a second antiviral, remdesivir. The drug interferes with RNA polymerase, replacing an adenine and causing early chain termination. When integrated into RNA, the nucleotide formed from remdesivir has residue code F86."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "id": "irish-navigator",
101 | "metadata": {
102 | "scrolled": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "query = attrs.struct.title.contains_phrase(\"RNA polymerase\")\\\n",
107 | " .or_(attrs.struct.title).contains_words(\"RdRp\")\\\n",
108 | " .and_(attrs.rcsb_entity_source_organism.taxonomy_lineage.name).exact_match(\"COVID-19 virus\")\\\n",
109 | " .and_(attrs.rcsb_chem_comp_container_identifiers.comp_id).exact_match(\"F86\")\\\n",
110 | " \n",
111 | "list(query.exec())\n"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "distant-graduate",
117 | "metadata": {},
118 | "source": [
119 | "## Try it!\n",
120 | "\n",
121 | "[rcsbapi.readthedocs.io](https://rcsbapi.readthedocs.io/en/latest/)"
122 | ]
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.12.6"
142 | },
143 | "toc": {
144 | "base_numbering": 1,
145 | "nav_menu": {},
146 | "number_sections": true,
147 | "sideBar": true,
148 | "skip_h1_title": false,
149 | "title_cell": "Table of Contents",
150 | "title_sidebar": "Contents",
151 | "toc_cell": false,
152 | "toc_position": {},
153 | "toc_section_display": true,
154 | "toc_window_display": false
155 | },
156 | "varInspector": {
157 | "cols": {
158 | "lenName": 16,
159 | "lenType": 16,
160 | "lenVar": 40
161 | },
162 | "kernels_config": {
163 | "python": {
164 | "delete_cmd_postfix": "",
165 | "delete_cmd_prefix": "del ",
166 | "library": "var_list.py",
167 | "varRefreshCmd": "print(var_dic_list())"
168 | },
169 | "r": {
170 | "delete_cmd_postfix": ") ",
171 | "delete_cmd_prefix": "rm(",
172 | "library": "var_list.r",
173 | "varRefreshCmd": "cat(var_dic_list()) "
174 | }
175 | },
176 | "types_to_exclude": [
177 | "module",
178 | "function",
179 | "builtin_function_or_method",
180 | "instance",
181 | "_Feature"
182 | ],
183 | "window_display": false
184 | }
185 | },
186 | "nbformat": 4,
187 | "nbformat_minor": 5
188 | }
189 |
--------------------------------------------------------------------------------
/notebooks/data_quickstart.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# RCSB PDB Data API: Quickstart\n",
15 | "\n",
16 | "This Quickstart notebook will walk through the basics of creating and executing queries using the `rcsbapi.data` module of the `rcsb-api` package. For more in-depth documentation, reference the [readthedocs page](https://rcsbapi.readthedocs.io/en/latest/data_api/quickstart.html).\n",
17 | "\n",
18 | "\\\n",
19 | "Before beginning, you must install the package:\n",
20 | "\n",
21 | "```pip install rcsb-api```"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "scrolled": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "%pip install rcsb-api"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 25,
38 | "metadata": {
39 | "scrolled": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "from rcsbapi.data import DataQuery as Query\n",
44 | "import json # for easy-to-read output"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "## Creating and executing queries"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "To create a `Query` object, you need to provide three arguments:\n",
59 | "- `input_type`: input_types are points where you can begin your query. Some examples are `entries`, `polymer_entities`, and `polymer_entity_instances`. For a full list of input_types see the [readthedocs](https://rcsbapi.readthedocs.io/en/latest/data_api/query_construction.html#input-type).\n",
60 | "- `input_ids`: input_ids are accepted as a list or dictionary of PDB-formatted IDs.\n",
61 | "- `return_data_list`: list of data items to return. These must be unique path segments (using dots to separate each name). Further explained [below](#Providing-specific-and-unique-field-names/paths).\n",
62 | "\n",
63 | "(More details on input arguments can be found in [readthedocs: Query Construction](https://rcsbapi.readthedocs.io/en/latest/data_api/query_construction.html).)\n",
64 | "\n",
65 | "For example, to create a `Query` object requesting all non-polymer components of a structure (ions, cofactors, etc):"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "query = Query(\n",
75 | " input_type=\"entries\",\n",
76 | " input_ids=[\"4HHB\"],\n",
77 | " return_data_list=[\"nonpolymer_bound_components\"] # must be unique field or unique path segment\n",
78 | ")\n",
79 | "\n",
80 | "# Note: When the package autocompletes a path, it prints a Warning message\n",
81 | "# To suppress this warning, either use the fully qualified path (\"rcsb_entry_info.nonpolymer_bound_components\"),\n",
82 | "# or set the `suppress_autocomplete_warning` argument to True.\n"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "After creating a `Query` object, you can run it with `.exec()` or view the GraphQL query with `.get_editor_link()`:"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "# Execute the query and print the results\n",
99 | "return_data = query.exec()\n",
100 | "print(json.dumps(return_data, indent=2)) # prints return_data with easy-to-read formatting\n",
101 | "\n",
102 | "## Expected Output:\n",
103 | "# {\n",
104 | "# \"data\": {\n",
105 | "# \"entries\": [\n",
106 | "# {\n",
107 | "# \"rcsb_id\": \"4HHB\",\n",
108 | "# \"rcsb_entry_info\": {\n",
109 | "# \"nonpolymer_bound_components\": [\n",
110 | "# \"HEM\"\n",
111 | "# ]\n",
112 | "# }\n",
113 | "# }\n",
114 | "# ]\n",
115 | "# }\n",
116 | "# }"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "# Print the GraphQL editor URL\n",
126 | "query.get_editor_link()"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "### Querying multiple IDs\n",
134 | "You can search multiple entries by starting from `input_type` \"entries\" and passing in a list of `input_ids`."
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "query = Query(\n",
144 | " input_type=\"entries\",\n",
145 | " input_ids=[\"4HHB\", \"12CA\", \"3PQR\"],\n",
146 | " return_data_list=[\"nonpolymer_bound_components\"]\n",
147 | ")\n",
148 | "return_data = query.exec()\n",
149 | "print(json.dumps(return_data, indent=2))"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "### Querying multiple data items\n",
157 | "You can also request multiple data items by adding to the `return_data_list`."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "# Query multiple fields in return_data_list\n",
167 | "query = Query(\n",
168 | " input_type=\"entries\",\n",
169 | " input_ids=[\"4HHB\", \"12CA\", \"3PQR\"],\n",
170 | " return_data_list=[\n",
171 | " \"nonpolymer_bound_components\",\n",
172 | " \"citation.title\",\n",
173 | " \"rcsb_entry_info.polymer_composition\"\n",
174 | " ]\n",
175 | ")\n",
176 | "return_data = query.exec()\n",
177 | "print(json.dumps(return_data, indent=2))"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "### Autocompletion of nested fields\n",
185 | "If there are fields nested under a requested data item in `return_data_list`, the package will add all sub-fields to the query. This allows you to make more general requests to get all information under that field (e.g., `\"exptl\"`). If you would like a more precise query, you can request specific fields (e.g., `\"exptl.method\"`)."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "# Requesting \"exptl\" gets all fields underneath that field\n",
195 | "query = Query(\n",
196 | " input_type=\"entries\",\n",
197 | " input_ids=[\"4HHB\"],\n",
198 | " return_data_list=[\"exptl\"] # requests exptl.crystals_number, exptl.method, etc.\n",
199 | ")\n",
200 | "return_data = query.exec()\n",
201 | "print(json.dumps(return_data, indent=2))"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "# To view the generated GraphQL query:\n",
211 | "query.get_editor_link()"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "### Querying different `input_types`\n",
219 | "You can also start queries from various `input_types` (e.g., `polymer_entities`, `polymer_entity_instances`, `uniprot`). (For more examples, see [readthedocs: Additional Examples](https://rcsbapi.readthedocs.io/en/latest/data_api/additional_examples.html))"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "# Search from input_type \"polymer_entities\"\n",
229 | "query = Query(\n",
230 | " input_type=\"polymer_entities\",\n",
231 | " input_ids=[\"2CPK_1\", \"3WHM_1\", \"2D5Z_1\"],\n",
232 | " return_data_list=[\n",
233 | " \"polymer_entities.rcsb_id\",\n",
234 | " \"rcsb_entity_source_organism.ncbi_taxonomy_id\",\n",
235 | " \"rcsb_entity_source_organism.ncbi_scientific_name\",\n",
236 | " \"cluster_id\",\n",
237 | " \"identity\"\n",
238 | " ]\n",
239 | ")\n",
240 | "return_data = query.exec()\n",
241 | "print(json.dumps(return_data, indent=2))"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "# Search from input_type \"polymer_entity_instances\"\n",
251 | "query = Query(\n",
252 | " input_type=\"polymer_entity_instances\",\n",
253 | " input_ids=[\"4HHB.A\", \"12CA.A\", \"3PQR.A\"],\n",
254 | " return_data_list=[\n",
255 | " \"polymer_entity_instances.rcsb_id\",\n",
256 | " \"rcsb_polymer_instance_annotation.annotation_id\",\n",
257 | " \"rcsb_polymer_instance_annotation.name\",\n",
258 | " \"rcsb_polymer_instance_annotation.type\"\n",
259 | " ]\n",
260 | ")\n",
261 | "return_data = query.exec()\n",
262 | "print(json.dumps(return_data, indent=2))"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "# Search from input_type \"uniprot\"\n",
272 | "query = Query(\n",
273 | " input_type=\"uniprot\",\n",
274 | " input_ids=[\"P68871\"],\n",
275 | " return_data_list=[\n",
276 | " \"rcsb_uniprot_annotation\"\n",
277 | " ]\n",
278 | ")\n",
279 | "return_data = query.exec()\n",
280 | "print(json.dumps(return_data, indent=2))"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {},
286 | "source": [
287 | "## Determining fields for `return_data_list`"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "### Providing specific and unique field names/paths\n",
295 | "There are some fields that must be further specified using multiple fields separated by dots. This is because some fields are redundant within our GraphQL Data API schema. For example, “id” appears over 50 times.\n",
296 | "\n",
297 | "For example, the field, `\"polymer_composition\"`, is redundant between several nodes: "
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "# The field \"polymer_composition\" isn't specific enough\n",
307 | "query = Query(\n",
308 | " input_type=\"entries\",\n",
309 | " input_ids=[\"4HHB\"],\n",
310 | " return_data_list=[\"polymer_composition\"]\n",
311 | ")\n",
312 | "\n",
313 | "# This will throw a ValueError, which will print out up to 10 valid paths that you can use instead"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "```\n",
321 | "ValueError: Given path \"polymer_composition\" not specific enough. Use one or more of these paths in return_data_list argument:\n",
322 | "\n",
323 | "3 of 3 possible paths:\n",
324 | " assemblies.interfaces.rcsb_interface_info.polymer_composition\n",
325 | " assemblies.rcsb_assembly_info.polymer_composition\n",
326 | " rcsb_entry_info.polymer_composition\n",
327 | "```"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "To get a list of all possible paths for a given field name, you can use the `DataSchema().find_paths()` method:\n",
335 | "```python\n",
336 | "from rcsbapi.data import DataSchema\n",
337 | "schema = DataSchema()\n",
338 | "schema.find_paths(input_type, field_name_or_path_segment)\n",
339 | "```\n",
340 | "For example:"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "# Find all paths:\n",
350 | "from rcsbapi.data import DataSchema\n",
351 | "\n",
352 | "schema = DataSchema()\n",
353 | "schema.find_paths(input_type=\"entries\", return_data_name=\"polymer_composition\")"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "# By looking through the list, find the intended field path\n",
363 | "query = Query(\n",
364 | " input_type=\"entries\",\n",
365 | " input_ids=[\"4HHB\"],\n",
366 | " return_data_list=[\"rcsb_entry_info.polymer_composition\"]\n",
367 | ")\n",
368 | "return_data = query.exec()\n",
369 | "print(json.dumps(return_data, indent=2))"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "### Discovering field names\n",
377 | "If you're unsure which fields exist, you can call `find_field_names(search_substring)`.\n",
378 | "\n",
379 | "For example, to find all fields containing `\"comp\"`:"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "from rcsbapi.data import DataSchema\n",
389 | "\n",
390 | "schema = DataSchema()\n",
391 | "schema.find_field_names(\"comp\")"
392 | ]
393 | },
394 | {
395 | "cell_type": "markdown",
396 | "metadata": {},
397 | "source": [
398 | "Note that once you identify which field you want to use, you may need to also run the `find_paths()` method mentioned above on the field name to identify the set of possible paths for `return_data_list`. "
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "# Find all paths for the field `\"chem_comp\"`:\n",
408 | "schema.find_paths(input_type=\"entries\", return_data_name=\"chem_comp\")"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {},
414 | "source": [
415 | "For more in-depth documentation, go to [readthedocs](https://rcsbapi.readthedocs.io/en/latest/data_api/quickstart.html)."
416 | ]
417 | }
418 | ],
419 | "metadata": {
420 | "kernelspec": {
421 | "display_name": "Python 3",
422 | "language": "python",
423 | "name": "python3"
424 | },
425 | "language_info": {
426 | "codemirror_mode": {
427 | "name": "ipython",
428 | "version": 3
429 | },
430 | "file_extension": ".py",
431 | "mimetype": "text/x-python",
432 | "name": "python",
433 | "nbconvert_exporter": "python",
434 | "pygments_lexer": "ipython3",
435 | "version": "3.12.6"
436 | }
437 | },
438 | "nbformat": 4,
439 | "nbformat_minor": 4
440 | }
441 |
--------------------------------------------------------------------------------
/notebooks/multisearch.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | ""
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Enabling Computational Biology Research\n",
15 | "\n",
16 | "This tool can be an integral resource for computational biologists performing data analysis or iterative processes on big datasets from the RCSB PDB. Our tool supports data automation, which is essential for any researcher or computational biologist wanting to work with huge datasets. Furthermore, our tool can be incorporated within a larger research workflow to quickly and seamlessly retrieve RCSB PDB data in an automated way."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "Below is an example of how a computational biologist may use our tool for data automation to facilitate their research. The first query below finds protein structures with a similar protein sequence to the target protein. The retrieved data are then used as search parameters for a set of iterative search queries that find structurally similar proteins that are bound to small molecules. Then, the researcher can use their own workflow to further investigate how the protein structures and small molecules interact."
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "%pip install rcsb-api"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from rcsbapi.search import SeqSimilarityQuery, AttributeQuery, StructSimilarityQuery"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "# Search for similar sequences to a protein of interest\n",
51 | "q1 = SeqSimilarityQuery(\"DTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNEL\" + \n",
52 | " \"TEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCE\" + \n",
53 | " \"KQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYL\" + \n",
54 | " \"YEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETM\" + \n",
55 | " \"REKVLTSSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEV\" + \n",
56 | " \"TKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKE\" + \n",
57 | " \"CCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAK\" + \n",
58 | " \"DAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHAC\" +\n",
59 | " \"YSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKV\" + \n",
60 | " \"PQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLC\" + \n",
61 | " \"VLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLF\" + \n",
62 | " \"TFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVA\" +\n",
63 | " \"FVDKCCAADDKEACFAVEGPKLVVSTQTALA\")\n",
64 | "\n",
65 | "sequence_similarity_results = list(q1(return_type=\"polymer_entity\"))\n",
66 | "print(\"Sequences similar to query:\")\n",
67 | "print(sequence_similarity_results)\n",
68 | "\n",
69 | "for i in range(5):\n",
70 | " similar_protein = sequence_similarity_results[i]\n",
71 | "\n",
72 | " entry_id = similar_protein[:-2]\n",
73 | "\n",
74 | " # Search for structures with small molecule(s)\n",
75 | " small_molecule_query = AttributeQuery(\n",
76 | " attribute=\"rcsb_nonpolymer_entity_annotation.comp_id\",\n",
77 | " operator=\"exists\",\n",
78 | " value=None\n",
79 | " )\n",
80 | "\n",
81 | " # Search for structurally similar proteins\n",
82 | " struct_similarity_query = StructSimilarityQuery(\n",
83 | " structure_search_type=\"entry_id\",\n",
84 | " entry_id=entry_id,\n",
85 | " structure_input_type=\"assembly_id\",\n",
86 | "            assembly_id=\"1\", # assembly_id = \"1\" by default\n",
87 | " operator=\"strict_shape_match\",\n",
88 | " target_search_space=\"assembly\"\n",
89 | " )\n",
90 | "\n",
91 | " group_query = struct_similarity_query & small_molecule_query\n",
92 | "\n",
93 | " print(\"Protein structures similar to\", similar_protein, \"bound to a small molecule:\")\n",
94 | " print(list(group_query(\"assembly\")))"
95 | ]
96 | }
97 | ],
98 | "metadata": {
99 | "kernelspec": {
100 | "display_name": "Python 3",
101 | "language": "python",
102 | "name": "python3"
103 | },
104 | "language_info": {
105 | "codemirror_mode": {
106 | "name": "ipython",
107 | "version": 3
108 | },
109 | "file_extension": ".py",
110 | "mimetype": "text/x-python",
111 | "name": "python",
112 | "nbconvert_exporter": "python",
113 | "pygments_lexer": "ipython3",
114 | "version": "3.12.6"
115 | }
116 | },
117 | "nbformat": 4,
118 | "nbformat_minor": 2
119 | }
120 |
--------------------------------------------------------------------------------
/notebooks/search_data_workflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | ""
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## RCSB PDB Data API: Search and Data API Workflow Demo\n",
15 | "\n",
16 | "This quick-start notebook will walk through the basics of using the Search and Data API sub-packages together. For more in-depth documentation reference the [readthedocs](https://rcsbapi.readthedocs.io/en/latest/).\n",
17 | "\n",
18 | "\\\n",
19 | "Install the package:\n",
20 | "\n",
21 | "```pip install rcsb-api```"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "%pip install rcsb-api"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "In this demo, we are interested in finding potential drugs to treat COVID-19 and collecting the associated literature in order to conduct further research. To do this, we will:\n",
38 | " 1. Construct a query to fetch COVID-19 viruses with ligands bound (Search API module)\n",
39 | " 2. Find information about each ligand (PDB ID, associated publication titles, links to publications) (Data API module)\n",
40 | " 3. Parse our results and output in an easy-to-read format"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "### Python Search API: Find COVID-19 Structures with Ligand Bound\n",
48 | "\n",
49 | "We'll start by constructing a Search API query that specifies the following:\n",
50 | "- Source organism is \"COVID-19 virus\" \n",
51 | "- Nonpolymer entity that is the subject of investigation in the structure\n",
52 | "- Modified chemical component is present"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from rcsbapi.search import search_attributes as attrs\n",
62 | "\n",
63 | "# Create each subquery\n",
64 | "q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == \"COVID-19 virus\"\n",
65 | "q2 = attrs.rcsb_nonpolymer_entity_annotation.type == \"SUBJECT_OF_INVESTIGATION\"\n",
66 | "q3 = attrs.rcsb_polymer_entity_feature_summary.type == \"modified_monomer\"\n",
67 | "\n",
68 | "# Combine using bitwise operators (&, |, ~, etc)\n",
69 | "query = q1 & q2 & q3\n",
70 | "\n",
71 | "# Call the query as a function to execute it\n",
72 | "result_list = query()\n",
73 | "\n",
74 | "# Save and print the first ten results\n",
75 | "short_result_list = (list(result_list)[0:10])\n",
76 | "print(short_result_list)\n"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "### Python Data API: Find Information About Structures\n",
84 | "\n",
85 | "Once we have the PDB IDs, we can query them using the Data API for information related to the structure. \n",
86 | "\n",
87 | "In this case, we will find the following for the first 10 results:\n",
88 | "- ID\n",
89 | "- Chemical component IDs\n",
90 | "- Whether the chemical component is the subject of investigation\n",
91 | "- Title of associated publication\n",
92 | "- Digital Object Identifier (DOI) if applicable"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "from rcsbapi.data import DataQuery as Query\n",
102 | "\n",
103 | "query = Query(\n",
104 | " input_type=\"entries\",\n",
105 | " input_ids=short_result_list,\n",
106 | " return_data_list=[\n",
107 | " \"entries.rcsb_id\",\n",
108 | " \"rcsb_nonpolymer_entity_instance_container_identifiers.comp_id\",\n",
109 | " \"is_subject_of_investigation\",\n",
110 | " \"citation.title\",\n",
111 | " \"citation.pdbx_database_id_DOI\"\n",
112 | " ] \n",
113 | ")\n",
114 | "query.exec()\n",
115 | "query.get_response()"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "### Parsing the Result\n",
123 | "\n",
124 | "The result of the request is returned in JSON format. We can refer to the JSON output to understand the data structure and then parse it for the information that is useful to us.\n",
125 | "In this case, we will\n",
126 | "- Confirm the subject of investigation and find the ID if it exists (comp_id)\n",
127 | "- Find the publication title \n",
128 | "- Construct a link to the publication using the DOI\n",
129 | "- Put these data into a dictionary"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "from pprint import pprint # for easier-to-read output\n",
139 | "\n",
140 | "json = query.get_response()[\"data\"][\"entries\"]\n",
141 | "output_dict = {}\n",
142 | "\n",
143 | "# iterate through the result of each entry requested\n",
144 | "for entry_dict in json:\n",
145 | " rcsb_id = entry_dict[\"rcsb_id\"]\n",
146 | "\n",
147 | " # Check for non-polymer subject of investigation, then append to chem_id_list\n",
148 | " for entity_dict in entry_dict[\"nonpolymer_entities\"]:\n",
149 | " for instance_dict in entity_dict[\"nonpolymer_entity_instances\"]:\n",
150 | " is_subject = instance_dict[\"rcsb_nonpolymer_instance_validation_score\"][0][\"is_subject_of_investigation\"]\n",
151 | " if is_subject == \"Y\":\n",
152 | " comp_id = instance_dict[\"rcsb_nonpolymer_entity_instance_container_identifiers\"][\"comp_id\"]\n",
153 | "\n",
154 | " # Find publication title\n",
155 | " title = entry_dict[\"citation\"][0][\"title\"]\n",
156 | "\n",
157 | " # Construct link from DOI (only exists if paper has been published or is on preprint server)\n",
158 | " base_link = \"https://doi.org/\"\n",
159 | " doi_link = \"\"\n",
160 | " if entry_dict[\"citation\"][0][\"pdbx_database_id_DOI\"] is not None:\n",
161 | " doi_link += base_link + entry_dict[\"citation\"][0][\"pdbx_database_id_DOI\"]\n",
162 | "\n",
163 | " # Add to dictionary\n",
164 | " output_dict[rcsb_id] = {\"title\": title, \"link\": doi_link, \"subject_of_investigation\": comp_id, }\n",
165 | "\n",
166 | "pprint(output_dict)"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "### Try it for yourself\n",
174 | "Combining use of our Search and Data API sub-packages can make programmatic access to RCSB PDB easier than ever!"
175 | ]
176 | }
177 | ],
178 | "metadata": {
179 | "kernelspec": {
180 | "display_name": "Python 3",
181 | "language": "python",
182 | "name": "python3"
183 | },
184 | "language_info": {
185 | "codemirror_mode": {
186 | "name": "ipython",
187 | "version": 3
188 | },
189 | "file_extension": ".py",
190 | "mimetype": "text/x-python",
191 | "name": "python",
192 | "nbconvert_exporter": "python",
193 | "pygments_lexer": "ipython3",
194 | "version": "3.12.6"
195 | }
196 | },
197 | "nbformat": 4,
198 | "nbformat_minor": 2
199 | }
200 |
--------------------------------------------------------------------------------
/notebooks/search_quickstart.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "177dc250",
6 | "metadata": {},
7 | "source": [
8 | ""
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "upper-filing",
14 | "metadata": {},
15 | "source": [
16 | "# RCSB PDB Search API: Quickstart\n",
17 | "\n",
18 | "This quickstart notebook will walk through the basics of creating and executing queries using the `rcsbapi.search` package of the `rcsb-api` package. For more in-depth documentation, reference the [readthedocs page](https://rcsbapi.readthedocs.io/en/latest/search_api/quickstart.html).\n",
19 | "\n",
20 | "\\\n",
21 | "Before beginning, you must install the package:\n",
22 | "\n",
23 | "```pip install rcsb-api```"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "id": "aef3a8f5",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "%pip install rcsb-api"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 17,
39 | "id": "african-monthly",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "from rcsbapi.search import TextQuery, AttributeQuery\n",
44 | "from rcsbapi.search import search_attributes as attrs"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "51db8156",
50 | "metadata": {},
51 | "source": [
52 | "## Full-text search\n",
53 | "To perform a \"full-text\" search for structures associated with the term \"Hemoglobin\", you can create a `TextQuery`:"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 18,
59 | "id": "110a70a9",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Search for structures associated with the phrase \"Hemoglobin\"\n",
64 | "query = TextQuery(value=\"Hemoglobin\")\n",
65 | "\n",
66 | "# Execute the query by running it as a function\n",
67 | "results = query()\n",
68 | "\n",
69 | "# Results are returned as an iterator of result identifiers.\n",
70 | "for rid in results:\n",
71 | " print(rid)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "id": "a4c2d12b",
77 | "metadata": {},
78 | "source": [
79 | "## Attribute search\n",
80 | "To perform a search for specific structure or chemical attributes, you can create an `AttributeQuery`."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 19,
86 | "id": "79005229",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# Construct a query searching for structures from humans\n",
91 | "query = AttributeQuery(\n",
92 | " attribute=\"rcsb_entity_source_organism.scientific_name\",\n",
93 | " operator=\"exact_match\", # Other operators include \"contains_phrase\", \"exists\", and more\n",
94 | " value=\"Homo sapiens\"\n",
95 | ")\n",
96 | "\n",
97 | "# Execute query and construct a list from results\n",
98 | "results = list(query())\n",
99 | "print(results)"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "id": "8aec7e7e",
105 | "metadata": {},
106 | "source": [
107 | "Refer to the [Search Attributes](https://search.rcsb.org/structure-search-attributes.html) and [Chemical Attributes](https://search.rcsb.org/chemical-search-attributes.html) documentation for a full list of attributes and applicable operators.\n",
108 | "\n",
109 | "Alternatively, you can construct attribute queries with comparative operators using the `search_attributes` object (which also allows for names to be tab-completed):"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 20,
115 | "id": "1a01cb80",
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "# Search for structures from humans\n",
120 | "query = attrs.rcsb_entity_source_organism.scientific_name == \"Homo sapiens\"\n",
121 | "\n",
122 | "# Run query and construct a list from results\n",
123 | "results = list(query())\n",
124 | "print(results)"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "fe2daa02",
130 | "metadata": {},
131 | "source": [
132 | "## Grouping sub-queries\n",
133 | "\n",
134 | "You can combine multiple queries using Python bitwise operators. "
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 21,
140 | "id": "a23da8e7",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Query for human epidermal growth factor receptor (EGFR) structures (UniProt ID P00533)\n",
145 | "# with investigational or experimental drugs bound\n",
146 | "q1 = attrs.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession == \"P00533\"\n",
147 | "q2 = attrs.rcsb_entity_source_organism.scientific_name == \"Homo sapiens\"\n",
148 | "q3 = attrs.drugbank_info.drug_groups == \"investigational\"\n",
149 | "q4 = attrs.drugbank_info.drug_groups == \"experimental\"\n",
150 | "\n",
151 | "# Structures matching UniProt ID P00533 AND from humans\n",
152 | "# AND (investigational OR experimental drug group)\n",
153 | "query = q1 & q2 & (q3 | q4)\n",
154 | "\n",
155 | "# Execute query and print first 10 ids\n",
156 | "results = list(query())\n",
157 | "print(results[:10])"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "id": "9d3692c4",
163 | "metadata": {},
164 | "source": [
165 | "These examples are in \"operator\" syntax. You can also make queries in \"fluent\" syntax. Learn more about both syntaxes and implementation details in [Query Syntax and Execution](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#query-syntax-and-execution).\n",
166 | "\n",
167 | "### Supported Search Services\n",
168 | "The list of supported search service types are listed in the table below. For more details on their usage, see [Search Service Types](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#search-service-types).\n",
169 | "\n",
170 | "|Search service |QueryType |\n",
171 | "|----------------------------------|--------------------------|\n",
172 | "|Full-text |`TextQuery()` |\n",
173 | "|Attribute (structure or chemical) |`AttributeQuery()` |\n",
174 | "|Sequence similarity |`SeqSimilarityQuery()` |\n",
175 | "|Sequence motif |`SeqMotifQuery()` |\n",
176 | "|Structure similarity |`StructSimilarityQuery()` |\n",
177 | "|Structure motif |`StructMotifQuery()` |\n",
178 | "|Chemical similarity |`ChemSimilarityQuery()` |\n",
179 | "\n",
180 | "Learn more about available search services on the [RCSB PDB Search API docs](https://search.rcsb.org/#search-services)."
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "id": "e2b42fd8",
186 | "metadata": {},
187 | "source": [
188 | "For more in-depth documentation, go to [readthedocs](https://rcsbapi.readthedocs.io/en/latest/index.html)"
189 | ]
190 | }
191 | ],
192 | "metadata": {
193 | "kernelspec": {
194 | "display_name": "Python 3",
195 | "language": "python",
196 | "name": "python3"
197 | },
198 | "language_info": {
199 | "codemirror_mode": {
200 | "name": "ipython",
201 | "version": 3
202 | },
203 | "file_extension": ".py",
204 | "mimetype": "text/x-python",
205 | "name": "python",
206 | "nbconvert_exporter": "python",
207 | "pygments_lexer": "ipython3",
208 | "version": "3.12.6"
209 | },
210 | "toc": {
211 | "base_numbering": 1,
212 | "nav_menu": {},
213 | "number_sections": true,
214 | "sideBar": true,
215 | "skip_h1_title": false,
216 | "title_cell": "Table of Contents",
217 | "title_sidebar": "Contents",
218 | "toc_cell": false,
219 | "toc_position": {},
220 | "toc_section_display": true,
221 | "toc_window_display": false
222 | },
223 | "varInspector": {
224 | "cols": {
225 | "lenName": 16,
226 | "lenType": 16,
227 | "lenVar": 40
228 | },
229 | "kernels_config": {
230 | "python": {
231 | "delete_cmd_postfix": "",
232 | "delete_cmd_prefix": "del ",
233 | "library": "var_list.py",
234 | "varRefreshCmd": "print(var_dic_list())"
235 | },
236 | "r": {
237 | "delete_cmd_postfix": ") ",
238 | "delete_cmd_prefix": "rm(",
239 | "library": "var_list.r",
240 | "varRefreshCmd": "cat(var_dic_list()) "
241 | }
242 | },
243 | "types_to_exclude": [
244 | "module",
245 | "function",
246 | "builtin_function_or_method",
247 | "instance",
248 | "_Feature"
249 | ],
250 | "window_display": false
251 | }
252 | },
253 | "nbformat": 4,
254 | "nbformat_minor": 5
255 | }
256 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # A comma-separated list of package or module names from where C extensions may
4 | # be loaded. Extensions are loading into the active Python interpreter and may
5 | # run arbitrary code.
6 | extension-pkg-whitelist=MySQLdb
7 |
8 | # Add files or directories to the blacklist. They should be base names, not
9 | # paths.
10 | ignore=CVS
11 |
12 | # Add files or directories matching the regex patterns to the blacklist. The
13 | # regex matches against base names, not paths.
14 | ignore-patterns=
15 |
16 | # Python code to execute, usually for sys.path manipulation such as
17 | # pygtk.require().
18 | #init-hook=
19 |
20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
21 | # number of processors available to use.
22 | jobs=1
23 |
24 | # Control the amount of potential inferred values when inferring a single
25 | # object. This can help the performance when dealing with large functions or
26 | # complex, nested conditions.
27 | limit-inference-results=100
28 |
29 | # List of plugins (as comma separated values of python modules names) to load,
30 | # usually to register additional checkers.
31 | load-plugins=
32 |
33 | # Pickle collected data for later comparisons.
34 | persistent=yes
35 |
36 | # Specify a configuration file.
37 | #rcfile=
38 |
39 | # When enabled, pylint would attempt to guess common misconfiguration and emit
40 | # user-friendly hints instead of false-positive error messages.
41 | suggestion-mode=yes
42 |
43 | # Allow loading of arbitrary C extensions. Extensions are imported into the
44 | # active Python interpreter and may run arbitrary code.
45 | unsafe-load-any-extension=no
46 |
47 |
48 | [MESSAGES CONTROL]
49 |
50 | # Only show warnings with the listed confidence levels. Leave empty to show
51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
52 | confidence=
53 |
54 | # Disable the message, report, category or checker with the given id(s). You
55 | # can either give multiple identifiers separated by comma (,) or put this
56 | # option multiple times (only on the command line, not in the configuration
57 | # file where it should appear only once). You can also use "--disable=all" to
58 | # disable everything first and then reenable specific checks. For example, if
59 | # you want to run only the similarities checker, you can use "--disable=all
60 | # --enable=similarities". If you want to run only the classes checker, but have
61 | # no Warning level messages displayed, use "--disable=all --enable=classes
62 | # --disable=W".
63 | disable=missing-docstring,
64 | empty-docstring,
65 | bad-continuation,
66 | print-statement,
67 | parameter-unpacking,
68 | unpacking-in-except,
69 | old-raise-syntax,
70 | backtick,
71 | import-star-module-level,
72 | raw-checker-failed,
73 | bad-inline-option,
74 | locally-disabled,
75 | file-ignored,
76 | suppressed-message,
77 | useless-suppression,
78 | deprecated-pragma,
79 | use-symbolic-message-instead,
80 | broad-except,
81 | apply-builtin,
82 | basestring-builtin,
83 | buffer-builtin,
84 | cmp-builtin,
85 | coerce-builtin,
86 | execfile-builtin,
87 | file-builtin,
88 | long-builtin,
89 | raw_input-builtin,
90 | reduce-builtin,
91 | standarderror-builtin,
92 | unicode-builtin,
93 | xrange-builtin,
94 | coerce-method,
95 | delslice-method,
96 | getslice-method,
97 | setslice-method,
98 | no-absolute-import,
99 | old-division,
100 | dict-iter-method,
101 | dict-view-method,
102 | next-method-called,
103 | metaclass-assignment,
104 | indexing-exception,
105 | raising-string,
106 | reload-builtin,
107 | oct-method,
108 | hex-method,
109 | nonzero-method,
110 | cmp-method,
111 | input-builtin,
112 | round-builtin,
113 | intern-builtin,
114 | unichr-builtin,
115 | map-builtin-not-iterating,
116 | zip-builtin-not-iterating,
117 | range-builtin-not-iterating,
118 | filter-builtin-not-iterating,
119 | using-cmp-argument,
120 | div-method,
121 | idiv-method,
122 | rdiv-method,
123 | exception-message-attribute,
124 | invalid-str-codec,
125 | sys-max-int,
126 | bad-python3-import,
127 | deprecated-string-function,
128 | deprecated-str-translate-call,
129 | deprecated-itertools-function,
130 | deprecated-types-field,
131 | next-method-defined,
132 | dict-items-not-iterating,
133 | dict-keys-not-iterating,
134 | dict-values-not-iterating,
135 | deprecated-operator-function,
136 | deprecated-urllib-function,
137 | xreadlines-attribute,
138 | deprecated-sys-function,
139 | exception-escape,
140 | comprehension-escape,
141 | raise-missing-from,
142 | W0707,
143 | W0238,
144 | no-member,
145 | unused-argument,
146 | protected-access
147 |
148 | # Enable the message, report, category or checker with the given id(s). You can
149 | # either give multiple identifier separated by comma (,) or put this option
150 | # multiple time (only on the command line, not in the configuration file where
151 | # it should appear only once). See also the "--disable" option for examples.
152 | enable=c-extension-no-member
153 |
154 |
155 | [REPORTS]
156 |
157 | # Python expression which should return a note less than 10 (10 is the highest
158 | # note). You have access to the variables errors warning, statement which
159 | # respectively contain the number of errors / warnings messages and the total
160 | # number of statements analyzed. This is used by the global evaluation report
161 | # (RP0004).
162 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
163 |
164 | # Template used to display messages. This is a python new-style format string
165 | # used to format the message information. See doc for all details.
166 | #msg-template=
167 |
168 | # Set the output format. Available formats are text, parseable, colorized, json
169 | # and msvs (visual studio). You can also give a reporter class, e.g.
170 | # mypackage.mymodule.MyReporterClass.
171 | output-format=text
172 |
173 | # Tells whether to display a full report or only the messages.
174 | reports=no
175 |
176 | # Activate the evaluation score.
177 | score=yes
178 |
179 |
180 | [REFACTORING]
181 |
182 | # Maximum number of nested blocks for function / method body
183 | max-nested-blocks=5
184 |
185 | # Complete name of functions that never returns. When checking for
186 | # inconsistent-return-statements if a never returning function is called then
187 | # it will be considered as an explicit return statement and no message will be
188 | # printed.
189 | never-returning-functions=sys.exit
190 |
191 |
192 | [LOGGING]
193 |
194 | # Format style used to check logging format string. `old` means using %
195 | # formatting, while `new` is for `{}` formatting.
196 | logging-format-style=old
197 |
198 | # Logging modules to check that the string format arguments are in logging
199 | # function parameter format.
200 | logging-modules=logging
201 |
202 |
203 | [SPELLING]
204 |
205 | # Limits count of emitted suggestions for spelling mistakes.
206 | max-spelling-suggestions=4
207 |
208 | # Spelling dictionary name. Available dictionaries: none. To make it working
209 | # install python-enchant package.
210 | spelling-dict=
211 |
212 | # List of comma separated words that should not be checked.
213 | spelling-ignore-words=
214 |
215 | # A path to a file that contains private dictionary; one word per line.
216 | spelling-private-dict-file=
217 |
218 | # Tells whether to store unknown words to indicated private dictionary in
219 | # --spelling-private-dict-file option instead of raising a message.
220 | spelling-store-unknown-words=no
221 |
222 |
223 | [MISCELLANEOUS]
224 |
225 | # List of note tags to take in consideration, separated by a comma.
226 | notes=FIXME,
227 | XXX,
228 | TODO
229 |
230 |
231 | [TYPECHECK]
232 |
233 | # List of decorators that produce context managers, such as
234 | # contextlib.contextmanager. Add to this list to register other decorators that
235 | # produce valid context managers.
236 | contextmanager-decorators=contextlib.contextmanager
237 |
238 | # List of members which are set dynamically and missed by pylint inference
239 | # system, and so shouldn't trigger E1101 when accessed. Python regular
240 | # expressions are accepted.
241 | generated-members=
242 |
243 | # Tells whether missing members accessed in mixin class should be ignored. A
244 | # mixin class is detected if its name ends with "mixin" (case insensitive).
245 | ignore-mixin-members=yes
246 |
247 | # Tells whether to warn about missing members when the owner of the attribute
248 | # is inferred to be None.
249 | ignore-none=yes
250 |
251 | # This flag controls whether pylint should warn about no-member and similar
252 | # checks whenever an opaque object is returned when inferring. The inference
253 | # can return multiple potential results while evaluating a Python object, but
254 | # some branches might not be evaluated, which results in partial inference. In
255 | # that case, it might be useful to still emit no-member and other checks for
256 | # the rest of the inferred objects.
257 | ignore-on-opaque-inference=yes
258 |
259 | # List of class names for which member attributes should not be checked (useful
260 | # for classes with dynamically set attributes). This supports the use of
261 | # qualified names.
262 | ignored-classes=optparse.Values,thread._local,_thread._local
263 |
264 | # List of module names for which member attributes should not be checked
265 | # (useful for modules/projects where namespaces are manipulated during runtime
266 | # and thus existing member attributes cannot be deduced by static analysis. It
267 | # supports qualified module names, as well as Unix pattern matching.
268 | ignored-modules=
269 |
270 | # Show a hint with possible names when a member name was not found. The aspect
271 | # of finding the hint is based on edit distance.
272 | missing-member-hint=yes
273 |
274 | # The minimum edit distance a name should have in order to be considered a
275 | # similar match for a missing member name.
276 | missing-member-hint-distance=1
277 |
278 | # The total number of similar names that should be taken in consideration when
279 | # showing a hint for a missing member.
280 | missing-member-max-choices=1
281 |
282 |
283 | [VARIABLES]
284 |
285 | # List of additional names supposed to be defined in builtins. Remember that
286 | # you should avoid defining new builtins when possible.
287 | additional-builtins=
288 |
289 | # Tells whether unused global variables should be treated as a violation.
290 | allow-global-unused-variables=yes
291 |
292 | # List of strings which can identify a callback function by name. A callback
293 | # name must start or end with one of those strings.
294 | callbacks=cb_,
295 | _cb
296 |
297 | # A regular expression matching the name of dummy variables (i.e. expected to
298 | # not be used).
299 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
300 |
301 | # Argument names that match this expression will be ignored. Default to name
302 | # with leading underscore.
303 | ignored-argument-names=_.*|^ignored_|^unused_
304 |
305 | # Tells whether we should check for unused import in __init__ files.
306 | init-import=no
307 |
308 | # List of qualified module names which can have objects that can redefine
309 | # builtins.
310 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
311 |
312 |
313 | [FORMAT]
314 |
315 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
316 | expected-line-ending-format=
317 |
318 | # Regexp for a line that is allowed to be longer than the limit.
319 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
320 |
321 | # Number of spaces of indent required inside a hanging or continued line.
322 | indent-after-paren=4
323 |
324 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
325 | # tab).
326 | indent-string='    '
327 |
328 | # Maximum number of characters on a single line.
329 | max-line-length=240
330 |
331 | # Maximum number of lines in a module.
332 | max-module-lines=1000
333 |
334 | # Allow the body of a class to be on the same line as the declaration if body
335 | # contains single statement.
336 | single-line-class-stmt=no
337 |
338 | # Allow the body of an if to be on the same line as the test if there is no
339 | # else.
340 | single-line-if-stmt=no
341 |
342 |
343 | [SIMILARITIES]
344 |
345 | # Ignore comments when computing similarities.
346 | ignore-comments=yes
347 |
348 | # Ignore docstrings when computing similarities.
349 | ignore-docstrings=yes
350 |
351 | # Ignore imports when computing similarities.
352 | ignore-imports=no
353 |
354 | # Minimum lines number of a similarity.
355 | min-similarity-lines=4
356 |
357 |
358 | [BASIC]
359 |
360 | # Naming style matching correct argument names.
361 | argument-naming-style=snake_case
362 |
363 | # Regular expression matching correct argument names. Overrides argument-
364 | # naming-style.
365 | # argument-rgx=[a-z_][a-zA-Z0-9]{1,30}$
366 |
367 | # Naming style matching correct attribute names.
368 | attr-naming-style=snake_case
369 |
370 | # Regular expression matching correct attribute names. Overrides attr-naming-
371 | # style.
372 | # attr-rgx=_?_?[a-z][A-Za-z0-9]{1,40}$
373 |
374 | # Bad variable names which should always be refused, separated by a comma.
375 | bad-names=foo,
376 | bar,
377 | baz,
378 | toto,
379 | tutu,
380 | tata
381 |
382 | # Naming style matching correct class attribute names.
383 | class-attribute-naming-style=camelCase
384 |
385 | # Regular expression matching correct class attribute names. Overrides class-
386 | # attribute-naming-style.
387 | # class-attribute-rgx=_?_?[a-z][A-Za-z0-9]{1,40}$
388 |
389 | # Naming style matching correct class names.
390 | class-naming-style=PascalCase
391 |
392 | # Regular expression matching correct class names. Overrides class-naming-
393 | # style.
394 | #class-rgx=
395 |
396 | # Naming style matching correct constant names.
397 | const-naming-style=any
398 |
399 | # Regular expression matching correct constant names. Overrides const-naming-
400 | # style.
401 | #const-rgx=
402 |
403 | # Minimum line length for functions/classes that require docstrings, shorter
404 | # ones are exempt.
405 | docstring-min-length=-1
406 |
407 | # Naming style matching correct function names.
408 | function-naming-style=camelCase
409 |
410 | # Regular expression matching correct function names. Overrides function-
411 | # naming-style.
412 | #function-rgx=
413 |
414 | # Good variable names which should always be accepted, separated by a comma.
415 | good-names=_,
416 | i,
417 | j,
418 | k,
419 | v,
420 | ii,
421 | jj,
422 | kk,
423 | # t,
424 | # c,
425 | # d,
426 | e,
427 | # r,
428 | # s,
429 | # v,
430 | # p,
431 | # ts,
432 | # tS,
433 | ok,
434 | logger
435 |
436 | # Include a hint for the correct naming format with invalid-name.
437 | include-naming-hint=no
438 |
439 | # Naming style matching correct inline iteration names.
440 | inlinevar-naming-style=any
441 |
442 | # Regular expression matching correct inline iteration names. Overrides
443 | # inlinevar-naming-style.
444 | #inlinevar-rgx=
445 |
446 | # Naming style matching correct method names.
447 | method-naming-style=snake_case
448 |
449 | # Regular expression matching correct method names. Overrides method-naming-
450 | # style.
451 | # method-rgx=_?_?[a-z][A-Za-z0-9]{1,40}_?_?$
452 |
453 | # Naming style matching correct module names.
454 | module-naming-style=any
455 |
456 | # Regular expression matching correct module names. Overrides module-naming-
457 | # style.
458 | #module-rgx=
459 |
460 | # Colon-delimited sets of names that determine each other's naming style when
461 | # the name regexes allow several styles.
462 | name-group=
463 |
464 | # Regular expression which should only match function or class names that do
465 | # not require a docstring.
466 | no-docstring-rgx=^_
467 |
468 | # List of decorators that produce properties, such as abc.abstractproperty. Add
469 | # to this list to register other decorators that produce valid properties.
470 | # These decorators are taken in consideration only for invalid-name.
471 | property-classes=abc.abstractproperty
472 |
473 | # Naming style matching correct variable names.
474 | variable-naming-style=snake_case
475 |
476 | # Regular expression matching correct variable names. Overrides variable-
477 | # naming-style.
478 | # variable-rgx=[a-z_][a-zA-Z0-9]{1,40}$
479 |
480 |
481 | [STRING]
482 |
483 | # This flag controls whether the implicit-str-concat-in-sequence should
484 | # generate a warning on implicit string concatenation in sequences defined over
485 | # several lines.
486 | check-str-concat-over-line-jumps=no
487 |
488 |
489 | [IMPORTS]
490 |
491 | # Allow wildcard imports from modules that define __all__.
492 | allow-wildcard-with-all=no
493 |
494 | # Analyse import fallback blocks. This can be used to support both Python 2 and
495 | # 3 compatible code, which means that the block might have code that exists
496 | # only in one or another interpreter, leading to false positives when analysed.
497 | analyse-fallback-blocks=no
498 |
499 | # Deprecated modules which should not be used, separated by a comma.
500 | deprecated-modules=optparse,tkinter.tix
501 |
502 | # Create a graph of external dependencies in the given file (report RP0402 must
503 | # not be disabled).
504 | ext-import-graph=
505 |
506 | # Create a graph of every (i.e. internal and external) dependencies in the
507 | # given file (report RP0402 must not be disabled).
508 | import-graph=
509 |
510 | # Create a graph of internal dependencies in the given file (report RP0402 must
511 | # not be disabled).
512 | int-import-graph=
513 |
514 | # Force import order to recognize a module as part of the standard
515 | # compatibility libraries.
516 | known-standard-library=
517 |
518 | # Force import order to recognize a module as part of a third party library.
519 | known-third-party=enchant
520 |
521 |
522 | [CLASSES]
523 |
524 | # List of method names used to declare (i.e. assign) instance attributes.
525 | defining-attr-methods=__init__,
526 | __new__,
527 | setUp
528 |
529 | # List of member names, which should be excluded from the protected access
530 | # warning.
531 | exclude-protected=_asdict,
532 | _fields,
533 | _replace,
534 | _source,
535 | _make
536 |
537 | # List of valid names for the first argument in a class method.
538 | valid-classmethod-first-arg=cls
539 |
540 | # List of valid names for the first argument in a metaclass class method.
541 | valid-metaclass-classmethod-first-arg=cls
542 |
543 |
544 | [DESIGN]
545 |
546 | # Maximum number of arguments for function / method.
547 | max-args=5
548 |
549 | # Maximum number of attributes for a class (see R0902).
550 | max-attributes=7
551 |
552 | # Maximum number of boolean expressions in an if statement.
553 | max-bool-expr=5
554 |
555 | # Maximum number of branch for function / method body.
556 | max-branches=12
557 |
558 | # Maximum number of locals for function / method body.
559 | max-locals=15
560 |
561 | # Maximum number of parents for a class (see R0901).
562 | max-parents=7
563 |
564 | # Maximum number of public methods for a class (see R0904).
565 | max-public-methods=20
566 |
567 | # Maximum number of return / yield for function / method body.
568 | max-returns=6
569 |
570 | # Maximum number of statements in function / method body.
571 | max-statements=50
572 |
573 | # Minimum number of public methods for a class (see R0903).
574 | min-public-methods=2
575 |
576 |
577 | [EXCEPTIONS]
578 |
579 | # Exceptions that will emit a warning when being caught. Defaults to
580 | # "BaseException, Exception".
581 | overgeneral-exceptions=BaseException,
582 | Exception
--------------------------------------------------------------------------------
/rcsbapi/__init__.py:
--------------------------------------------------------------------------------
# Package metadata used by documentation tools and setup machinery.
__docformat__ = "restructuredtext en"
__author__ = "Dennis Piehl"
__email__ = "dennis.piehl@rcsb.org"
__license__ = "MIT"
__version__ = "1.1.3"

# Extend the package search path so "rcsbapi" can behave as a pkgutil-style
# namespace package (sub-packages may live in multiple directories).
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

import logging

# Configure the root logger once at import time; submodules create their own
# child loggers via logging.getLogger(__name__) and inherit this format/level.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
logger = logging.getLogger()
13 |
--------------------------------------------------------------------------------
/rcsbapi/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Configurable settings for rcsb-api
3 |
4 | These settings can be overridden at runtime.
5 |
6 | For example, you can turn off autocompletion warning messages by
7 | modifying the `SUPPRESS_AUTOCOMPLETE_WARNING` setting as follows:
8 |
9 | Example:
10 | from rcsbapi.config import config
11 |
12 | # Override the default warning suppression flag
13 | config.SUPPRESS_AUTOCOMPLETE_WARNING = True
14 | """
15 | import logging
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
class Config:
    """Runtime-adjustable settings for rcsb-api.

    Attributes are pre-declared with type annotations; assignment through
    ``__setattr__`` rejects unknown names and values of the wrong type, so a
    typo like ``config.API_TIMOUT = 30`` fails loudly instead of silently
    creating an unused attribute.
    """
    API_TIMEOUT: int = 60
    SEARCH_API_REQUESTS_PER_SECOND: int = 10
    SUPPRESS_AUTOCOMPLETE_WARNING: bool = False
    INPUT_ID_LIMIT: int = 5000

    def __setattr__(self, name, value):
        """Validate configuration assignments.

        Args:
            name: attribute being assigned; must already exist on the class.
            value: new value; must match the type declared in the class
                annotations (when one is declared).

        Raises:
            AttributeError: if ``name`` is not a pre-declared setting.
            TypeError: if ``value`` does not match the declared type.
        """
        # Unknown names indicate a typo/misspelling by the caller.
        if not hasattr(self, name):
            raise AttributeError(f"'{name}' is not a valid attribute of Config class")

        # Enforce the annotated type, when the attribute declares one.
        declared = self.__annotations__.get(name)
        if declared is not None and not isinstance(value, declared):
            raise TypeError(f"Expected type '{declared.__name__}' for attribute '{name}', but got '{type(value).__name__}'")
        super().__setattr__(name, value)


config = Config()
43 |
--------------------------------------------------------------------------------
/rcsbapi/const.py:
--------------------------------------------------------------------------------
1 | """
2 | Constants for rcsb-api (immutable and cannot be overridden)
3 |
4 | These constants define fixed values used throughout the rcsb-api package,
5 | including API endpoints, search services, and schema URLs. The values are
6 | immutable and protected from modification during runtime.
7 | """
8 |
9 | from __future__ import annotations
10 | from dataclasses import dataclass, field
11 | from types import MappingProxyType
12 | from typing import List
13 |
14 |
@dataclass(frozen=True)
class Const:
    """Immutable constants for rcsb-api: Search/Data API endpoints, schema
    locations, and id-handling tables.

    The dataclass is frozen so scalar fields cannot be reassigned, and every
    mapping is wrapped in ``MappingProxyType`` so its contents cannot be
    mutated at runtime either.
    """
    # Search API constants
    STRUCTURE_INDEX: int = 0
    CHEMICAL_INDEX: int = 0
    SEARCH_API_REQUEST_SCHEMA_URL: str = "https://search.rcsb.org/schema/search/request/json-schema-rcsb_search_query.json"
    SEARCH_OPENAPI_SCHEMA_URL: str = "https://search.rcsb.org/openapi.json"
    STRUCTURE_ATTRIBUTE_SEARCH_SERVICE: str = "text"
    CHEMICAL_ATTRIBUTE_SEARCH_SERVICE: str = "text_chem"
    FULL_TEXT_SEARCH_SERVICE: str = "full_text"
    SEQUENCE_SEARCH_SERVICE: str = "sequence"
    SEQMOTIF_SEARCH_SERVICE: str = "seqmotif"
    STRUCT_SIM_SEARCH_SERVICE: str = "structure"
    STRUCTMOTIF_SEARCH_SERVICE: str = "strucmotif"
    CHEM_SIM_SEARCH_SERVICE: str = "chemical"
    SEQUENCE_SEARCH_MIN_NUM_OF_RESIDUES: int = 25
    SEQMOTIF_SEARCH_MIN_CHARACTERS: int = 2
    STRUCT_MOTIF_MIN_RESIDUES: int = 2
    STRUCT_MOTIF_MAX_RESIDUES: int = 10
    RCSB_SEARCH_API_QUERY_URL: str = "https://search.rcsb.org/rcsbsearch/v2/query"
    UPLOAD_URL: str = "https://user-upload.rcsb.org/v1/putMultipart"
    RETURN_UP_URL: str = "https://user-upload.rcsb.org/v1/download/"

    SEARCH_API_SCHEMA_DIR: str = "search/resources"
    # Changed from "http://" to "https://" for consistency with every other
    # endpoint defined here (the plain-http form relied on a server redirect).
    SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL: str = "https://search.rcsb.org/rcsbsearch/v2/metadata/schema"
    SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME: str = "structure_schema.json"
    SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL: str = "https://search.rcsb.org/rcsbsearch/v2/metadata/chemical/schema"
    SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME: str = "chemical_schema.json"

    # Data API constants
    DATA_API_ENDPOINT: str = "https://data.rcsb.org/graphql"
    DATA_API_SCHEMA_DIR: str = "data/resources"
    DATA_API_SCHEMA_FILENAME: str = "data_api_schema.json"
    DATA_API_SCHEMA_BASE_URL: str = "https://data.rcsb.org/rest/v1/schema/"
    # Maps a Data API schema endpoint suffix to its bundled resource filename.
    DATA_API_SCHEMA_ENDPOINT_TO_FILE: MappingProxyType[str, str] = field(default_factory=lambda: MappingProxyType({
        "entry": "entry.json",
        "polymer_entity": "polymer_entity.json",
        "branched_entity": "branched_entity.json",
        "nonpolymer_entity": "nonpolymer_entity.json",
        "polymer_entity_instance": "polymer_entity_instance.json",
        "branched_entity_instance": "branched_entity_instance.json",
        "nonpolymer_entity_instance": "nonpolymer_entity_instance.json",
        "assembly": "assembly.json",
        "chem_comp": "chem_comp.json",
        "pubmed": "pubmed.json",
        "uniprot": "uniprot.json",
        "drugbank": "drugbank.json",
    }))

    # Maps a singular input_type to its plural GraphQL root field; an empty
    # string means the input_type has no plural form.
    SINGULAR_TO_PLURAL: MappingProxyType[str, str] = field(default_factory=lambda: MappingProxyType({
        "entry": "entries",
        "polymer_entity": "polymer_entities",
        "branched_entity": "branched_entities",
        "nonpolymer_entity": "nonpolymer_entities",
        "polymer_entity_instance": "polymer_entity_instances",
        "nonpolymer_entity_instance": "nonpolymer_entity_instances",
        "branched_entity_instance": "branched_entity_instances",
        "assembly": "assemblies",
        "interface": "interfaces",
        "uniprot": "",
        "pubmed": "",
        "chem_comp": "chem_comps",
        "entry_group": "entry_groups",
        "polymer_entity_group": "polymer_entity_groups",
        "group_provenance": ""
    }))
    #
    # Separator character used when joining an id component of the given kind
    # into a compound PDB-style identifier (e.g. "4HHB" + "_" + "1").
    ID_TO_SEPARATOR: MappingProxyType[str, str] = field(default_factory=lambda: MappingProxyType({
        "entity_id": "_",
        "asym_id": ".",
        "assembly_id": "-",
        "interface_id": "."
    }))

    # Regex strings for IDs
    DATA_API_INPUT_TYPE_TO_REGEX: MappingProxyType[str, List[str]] = field(default_factory=lambda: MappingProxyType({
        "entry": [r"^(MA|AF|ma|af)_[A-Z0-9]*$", r"^[A-Za-z0-9]{4}$"],
        "entity": [r"^(MA|AF|ma|af)_[A-Z0-9]*_[0-9]+$", r"^[A-Z0-9]{4}_[0-9]+$"],
        "instance": [r"^(MA|AF|ma|af)_[A-Z0-9]*\.[A-Za-z]+$", r"^[A-Z0-9]{4}\.[A-Za-z]+$"],
        "assembly": [r"^(MA|AF|ma|af)_[A-Z0-9]*-[0-9]+$", r"^[A-Z0-9]{4}-[0-9]+$"],
        "interface": [r"^(MA|AF|ma|af)_[A-Z0-9]*-[0-9]+\.[0-9]+$", r"^[A-Z0-9]{4}-[0-9]+\.[0-9]+$"],
        # Regex for uniprot: https://www.uniprot.org/help/accession_numbers
        "uniprot": [r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"]
    }))

    # REST holdings endpoints that enumerate every current id of an input_type.
    INPUT_TYPE_TO_ALL_STRUCTURES_ENDPOINT: MappingProxyType[str, List[str]] = field(default_factory=lambda: MappingProxyType({
        "entries": ["https://data.rcsb.org/rest/v1/holdings/current/entry_ids"],
        "chem_comps": ["https://data.rcsb.org/rest/v1/holdings/current/ccd_ids", "https://data.rcsb.org/rest/v1/holdings/current/prd_ids"]
    }))


const = Const()
107 |
--------------------------------------------------------------------------------
/rcsbapi/data/__init__.py:
--------------------------------------------------------------------------------
1 | """RCSB PDB Data API"""
2 | from .data_schema import DataSchema
3 |
4 | DATA_SCHEMA = DataSchema()
5 |
6 | # This is needed because __getattr__ will be called twice on import,
7 | # so ALL_STRUCTURES should be cached to avoid initializing twice
8 | _import_cache: dict = {}
9 |
10 |
def __getattr__(name: str):
    """Module-level attribute hook (PEP 562): lazily build the ALL_STRUCTURES
    object on first access and serve the cached instance afterwards.

    Args:
        name (str): attribute name
    """
    if name != "ALL_STRUCTURES":
        # Preserve the normal failure mode for unknown attributes
        raise AttributeError(f"Module {repr(__name__)} has no attribute {repr(name)}")

    if name not in _import_cache:
        # Imported here rather than at module top so the expensive
        # AllStructures initialization only happens when actually requested
        from .data_query import AllStructures
        _import_cache[name] = AllStructures()

    # Return cached instance
    return _import_cache[name]
28 |
29 |
30 | from .data_query import DataQuery # noqa:E402
31 |
32 | __all__ = ["DataQuery", "DataSchema"]
33 |
--------------------------------------------------------------------------------
/rcsbapi/data/data_query.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import urllib.parse
3 | import re
4 | import time
5 | from typing import Any, Union, List, Dict, Optional, Tuple
6 | import json
7 | import requests
8 | from tqdm import tqdm
9 | from rcsbapi.data import DATA_SCHEMA
10 | from ..config import config
11 | from ..const import const
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
class DataQuery:
    """
    Class for Data API queries.

    Builds a GraphQL query string at construction time (via DATA_SCHEMA) and
    POSTs it to the Data API endpoint when ``exec`` is called, batching large
    id lists into multiple requests.
    """
    def __init__(
        self,
        input_type: str,
        input_ids: Union[List[str], Dict[str, str], Dict[str, List[str]]],
        return_data_list: List[str],
        add_rcsb_id: bool = True,
        suppress_autocomplete_warning: bool = False
    ):
        """
        Query object for Data API requests.

        Args:
            input_type (str): query input type
                (e.g., "entry", "polymer_entity_instance", etc.)
            input_ids (list or dict): list (or singular dict) of ids for which to request information
                (e.g., ["4HHB", "2LGI"])
            return_data_list (list): list of data to return (field names)
                (e.g., ["rcsb_id", "exptl.method"])
            add_rcsb_id (bool, optional): whether to automatically add .rcsb_id to queries. Defaults to True.
            suppress_autocomplete_warning (bool, optional): suppress the autocompletion warning for this query. Defaults to False.
        """
        # The global config flag, when set, overrides the per-query argument
        suppress_autocomplete_warning = config.SUPPRESS_AUTOCOMPLETE_WARNING if config.SUPPRESS_AUTOCOMPLETE_WARNING else suppress_autocomplete_warning

        # Warn (but do not fail) when the number of requested ids exceeds the
        # configured limit, since very large queries are slow to complete.
        if not isinstance(input_ids, AllStructures):
            if isinstance(input_ids, list):
                if len(input_ids) > config.INPUT_ID_LIMIT:
                    logger.warning("More than %d input_ids. Query will be slower to complete.", config.INPUT_ID_LIMIT)
            if isinstance(input_ids, dict):
                for value in input_ids.values():
                    if len(value) > config.INPUT_ID_LIMIT:
                        logger.warning("More than %d input_ids. Query will be slower to complete.", config.INPUT_ID_LIMIT)

        self._input_type, self._input_ids = self._process_input_ids(input_type, input_ids)
        self._return_data_list = return_data_list
        # GraphQL query as a string
        self._query = DATA_SCHEMA.construct_query(
            input_type=self._input_type,
            input_ids=self._input_ids,
            return_data_list=return_data_list,
            add_rcsb_id=add_rcsb_id,
            suppress_autocomplete_warning=suppress_autocomplete_warning
        )
        # JSON response to query, assigned after executing
        self._response: Optional[Dict[str, Any]] = None

    def _process_input_ids(self, input_type: str, input_ids: Union[List[str], Dict[str, str], Dict[str, List[str]]]) -> Tuple[str, List[str]]:
        """Convert input_type to plural if possible.
        Set input_ids to be a list of ids.
        If using ALL_STRUCTURES, return the id list corresponding to the input type.

        Args:
            input_type (str): query input type
                (e.g., "entry", "polymer_entity_instance", etc.)
            input_ids (Union[List[str], Dict[str, str], Dict[str, List[str]]]): list/dict of ids to request information for

        Returns:
            Tuple[str, List[str]]: returns a tuple of converted input_type and list of input_ids
        """
        # If input_ids is ALL_STRUCTURES, return appropriate list of ids
        if isinstance(input_ids, AllStructures):
            return (input_type, input_ids.get_all_ids(input_type))

        # Convert input_type to plural if applicable
        converted = False
        if DATA_SCHEMA._root_dict[input_type][0]["kind"] != "LIST":
            plural_type = const.SINGULAR_TO_PLURAL[input_type]
            if plural_type:
                input_type = plural_type
                converted = True

        # Set _input_ids
        if isinstance(input_ids, dict):
            if converted:
                # Join the dict's id components into PDB id format, using the
                # separator associated with each component kind
                join_id = ""
                for key, value in input_ids.items():
                    assert isinstance(value, str)  # for mypy
                    if key in const.ID_TO_SEPARATOR:
                        join_id += const.ID_TO_SEPARATOR[key] + value
                    else:
                        join_id += value

                input_ids = [join_id]

            else:
                # If not converted, retrieve id list from dictionary
                input_ids = list(input_ids[DATA_SCHEMA._root_dict[input_type][0]["name"]])

        # Make all input_ids uppercase (renamed from `id` to avoid shadowing the builtin)
        input_ids = [input_id.upper() for input_id in input_ids]

        assert isinstance(input_ids, list)
        return (input_type, input_ids)

    def get_input_ids(self) -> List[str]:
        """get input_ids used to make query

        Returns:
            List[str]: input id list (dict inputs are normalized to a list at construction)
        """
        return self._input_ids

    def get_input_type(self) -> str:
        """get input_type used to make query

        Returns:
            str: input_type
                (e.g., "entry", "polymer_entity_instance", etc.)
        """
        return self._input_type

    def get_return_data_list(self) -> List[str]:
        """get return_data_list used to make query

        Returns:
            List[str]: return_data_list
                (e.g., ["rcsb_id", "exptl.method"])
        """
        return self._return_data_list

    def get_query(self) -> str:
        """get GraphQL query

        Returns:
            str: query in GraphQL syntax
        """
        return self._query

    def get_response(self) -> Union[None, Dict[str, Any]]:
        """get JSON response to executed query

        Returns:
            Dict[str, Any]: JSON object, or None if the query has not been executed yet
        """
        return self._response

    def get_editor_link(self) -> str:
        """get url to interactive GraphiQL editor

        Returns:
            str: GraphiQL url
        """
        editor_base_link = str(const.DATA_API_ENDPOINT) + "/index.html?query="
        return editor_base_link + urllib.parse.quote(self._query)

    def exec(self, batch_size: int = 5000, progress_bar: bool = False) -> Dict[str, Any]:
        """POST a GraphQL query and get response

        Args:
            batch_size (int, optional): maximum number of ids per request. Defaults to 5000.
            progress_bar (bool, optional): display a tqdm progress bar over batches. Defaults to False.

        Returns:
            Dict[str, Any]: JSON object (batched responses merged into one)

        Raises:
            ValueError: if the API response contains GraphQL errors
        """
        if len(self._input_ids) > batch_size:
            batched_ids: Union[List[List[str]], tqdm] = self._batch_ids(batch_size)
        else:
            batched_ids = [self._input_ids]
        response_json: Dict[str, Any] = {}

        if progress_bar is True:
            batched_ids = tqdm(batched_ids)

        for batch_num, id_batch in enumerate(batched_ids):
            if batch_num > 0:
                # Rate-limit only between consecutive batch requests, so
                # single-batch queries incur no extra latency
                time.sleep(0.2)
            # Substitute this batch's ids into the bracketed id list of the stored query
            query = re.sub(r"\[([^]]+)\]", f"{id_batch}".replace("'", '"'), self._query)
            part_response = requests.post(
                headers={"Content-Type": "application/graphql"},
                data=query,
                url=const.DATA_API_ENDPOINT,
                timeout=config.API_TIMEOUT
            ).json()
            self._parse_gql_error(part_response)
            if not response_json:
                response_json = part_response
            else:
                response_json = self._merge_response(response_json, part_response)

        if "data" in response_json.keys():
            query_response = response_json["data"][self._input_type]
            if query_response is None:
                logger.warning("Input produced no results. Check that input ids are valid")
            if isinstance(query_response, list):
                if len(query_response) == 0:
                    logger.warning("Input produced no results. Check that input ids are valid")
        self._response = response_json
        return response_json

    def _parse_gql_error(self, response_json: Dict[str, Any]) -> None:
        """Raise if the JSON response contains GraphQL errors.

        Args:
            response_json (Dict[str, Any]): JSON response from the Data API

        Raises:
            ValueError: numbered list of all GraphQL error messages
        """
        if "errors" in response_json.keys():
            combined_error_msg = "".join(
                f"{i + 1}. {error_dict['message']}\n" for i, error_dict in enumerate(response_json["errors"])
            )
            raise ValueError(f"{combined_error_msg}. Run .get_editor_link() to get a link to GraphiQL editor with query")

    def _batch_ids(self, batch_size: int) -> List[List[str]]:  # assumes that plural types have only one arg, which is true right now
        """split queries with large numbers of input_ids into smaller batches

        Args:
            batch_size (int): max size of batches

        Returns:
            List[List[str]]: nested list where each list is a batch of ids
        """
        # Slicing replaces the previous manual two-level while loop: a trailing
        # partial batch is included, and an empty id list yields no batches.
        return [self._input_ids[start:start + batch_size] for start in range(0, len(self._input_ids), batch_size)]

    def _merge_response(self, merge_into_response: Dict[str, Any], to_merge_response: Dict[str, Any]) -> Dict[str, Any]:
        """merge two JSON responses. Used after batching ids to merge responses from each batch.

        Args:
            merge_into_response (Dict[str, Any]): accumulated response (mutated in place)
            to_merge_response (Dict[str, Any]): response for the latest batch

        Returns:
            Dict: merged JSON response, formatted as if it was one request
        """
        combined_response = merge_into_response
        combined_response["data"][self._input_type] += to_merge_response["data"][self._input_type]
        return combined_response
251 |
252 |
class AllStructures:
    """Class for representing all structures of different `input_types`.

    On construction it downloads the complete current id list for every
    input_type listed in const.INPUT_TYPE_TO_ALL_STRUCTURES_ENDPOINT.
    """
    def __init__(self):
        """initialize AllStructures object by fetching all id lists
        """
        self.ALL_STRUCTURES = self.reload()

    def reload(self) -> dict[str, List[str]]:
        """Build dictionary of IDs based on endpoints defined in const

        Raises:
            requests.HTTPError: if any holdings endpoint returns an error status

        Returns:
            dict[str, List[str]]: ALL_STRUCTURES object
        """
        ALL_STRUCTURES = {}
        for input_type, endpoints in const.INPUT_TYPE_TO_ALL_STRUCTURES_ENDPOINT.items():
            all_ids: List[str] = []
            for endpoint in endpoints:
                # Use the shared configurable timeout (default 60 s) instead of
                # a hard-coded value, consistent with requests made elsewhere
                # in this module
                response = requests.get(endpoint, timeout=config.API_TIMEOUT)
                # Raise for 4xx/5xx up front instead of silently skipping
                # non-200 responses
                response.raise_for_status()
                all_ids.extend(response.json())
            ALL_STRUCTURES[input_type] = all_ids

        return ALL_STRUCTURES

    def get_all_ids(self, input_type: str) -> List[str]:
        """Get all ids of a certain `input_type`

        Args:
            input_type (str): `input_type` string

        Raises:
            ValueError: raise an error if the `input_type` isn't in ALL_STRUCTURES

        Returns:
            List[str]: list of IDS of specified `input_type`
        """
        if input_type in self.ALL_STRUCTURES:
            return self.ALL_STRUCTURES[input_type]
        raise ValueError(f"ALL_STRUCTURES is not yet available for input_type {input_type}")
296 |
--------------------------------------------------------------------------------
/rcsbapi/data/resources/pubmed.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "http://json-schema.org/draft-04/schema#",
3 | "title": "Core PubMed",
4 | "description": "JSON schema for core PubMed collection in RCSB Data Warehouse.",
5 | "required": [
6 | "rcsb_pubmed_container_identifiers"
7 | ],
8 | "type": "object",
9 | "properties": {
10 | "rcsb_id": {
11 | "type": "string",
12 | "description": "Unique integer value assigned to each PubMed record.",
13 | "rcsb_search_context": [
14 | "exact-match"
15 | ]
16 | },
17 | "rcsb_pubmed_container_identifiers": {
18 | "type": "object",
19 | "properties": {
20 | "pubmed_id": {
21 | "type": "integer",
22 | "description": "UID assigned to each PubMed record.",
23 | "rcsb_search_context": [
24 | "default-match"
25 | ],
26 | "examples": [
27 | 15937111
28 | ],
29 | "rcsb_description": [
30 | {
31 | "text": "PubMed ID",
32 | "context": "brief"
33 | }
34 | ],
35 | "rcsb_search_group": [
36 | {
37 | "group_name": "ID(s) and Keywords",
38 | "priority_order": 18
39 | }
40 | ]
41 | }
42 | },
43 | "additionalProperties": false
44 | },
45 | "rcsb_pubmed_central_id": {
46 | "type": "string",
47 | "description": "Unique integer value assigned to each PubMed Central record."
48 | },
49 | "rcsb_pubmed_doi": {
50 | "type": "string",
51 | "description": "Persistent identifier used to provide a link to an article location on the Internet."
52 | },
53 | "rcsb_pubmed_abstract_text": {
54 | "type": "string",
55 | "description": "A concise, accurate and factual mini-version of the paper contents.",
56 | "rcsb_search_context": [
57 | "full-text"
58 | ],
59 | "rcsb_description": [
60 | {
61 | "text": "PubMed Abstract",
62 | "context": "brief"
63 | }
64 | ],
65 | "rcsb_search_group": [
66 | {
67 | "group_name": "Publications Primary",
68 | "priority_order": 30
69 | }
70 | ]
71 | },
72 | "rcsb_pubmed_affiliation_info": {
73 | "type": "array",
74 | "minItems": 1,
75 | "uniqueItems": true,
76 | "items": {
77 | "type": "string"
78 | },
79 | "description": "The institution(s) that the author is affiliated with. Multiple affiliations per author are allowed."
80 | },
81 | "rcsb_pubmed_mesh_descriptors": {
82 | "type": "array",
83 | "minItems": 1,
84 | "uniqueItems": true,
85 | "items": {
86 | "type": "string"
87 | },
88 | "description": "NLM controlled vocabulary, Medical Subject Headings (MeSH), is used to characterize the content of the articles represented by MEDLINE citations."
89 | },
90 | "rcsb_pubmed_mesh_descriptors_lineage": {
91 | "type": "array",
92 | "minItems": 1,
93 | "uniqueItems": true,
94 | "items": {
95 | "type": "object",
96 | "properties": {
97 | "id": {
98 | "type": "string",
99 | "description": "Identifier for MeSH classification term.",
100 | "rcsb_search_context": [
101 | "exact-match"
102 | ],
103 | "examples": [
104 | "E01.370.225.500.388",
105 | "H01.181"
106 | ],
107 | "rcsb_description": [
108 | {
109 | "text": "MeSH Identifier",
110 | "context": "brief"
111 | }
112 | ],
113 | "rcsb_search_group": [
114 | {
115 | "group_name": "Publications Primary",
116 | "priority_order": 35
117 | }
118 | ]
119 | },
120 | "name": {
121 | "type": "string",
122 | "description": "MeSH classification term.",
123 | "rcsb_search_context": [
124 | "exact-match",
125 | "full-text"
126 | ],
127 | "examples": [
128 | "Chemistry",
129 | "Mammals",
130 | "Therapeutic Uses"
131 | ],
132 | "rcsb_description": [
133 | {
134 | "text": "MeSH Descriptor",
135 | "context": "brief"
136 | }
137 | ],
138 | "rcsb_search_group": [
139 | {
140 | "group_name": "Publications Primary",
141 | "priority_order": 36
142 | }
143 | ]
144 | },
145 | "depth": {
146 | "type": "integer",
147 | "description": "Hierarchy depth.",
148 | "rcsb_search_context": [
149 | "default-match"
150 | ]
151 | }
152 | },
153 | "additionalProperties": false
154 | },
155 | "rcsb_nested_indexing": true,
156 | "description": "Members of the MeSH classification lineage."
157 | }
158 | },
159 | "additionalProperties": false
160 | }
--------------------------------------------------------------------------------
/rcsbapi/dev_tools/update_schema.py:
--------------------------------------------------------------------------------
1 | """Update the distribution json files; for developer use only
2 |
3 | This script updates the search and data API schema files.
4 | After updating, it prints a message about which schemas were updated along with version numbers.
5 |
6 | Run this before releasing a new version of the rcsb-api package and
7 | copy/paste the printed message into the CHANGELOG if any schemas were updated.
8 |
9 | The endpoints for requesting online schemas and paths for writing the new schema files
10 | are in the .const file.
11 | """
12 |
13 | import json
14 | from pathlib import Path
15 | from typing import Dict, Literal, List
16 | import requests
17 |
18 | try:
19 | from rcsbapi.search.search_query import SEARCH_SCHEMA # instance of SearchSchema
20 | except Exception:
21 | # ignore errors that may occur parsing the schema
22 | pass
23 |
24 | from rcsbapi.data import DATA_SCHEMA
25 | from rcsbapi.const import const
26 | from rcsbapi.config import config
27 |
28 |
def make_version_dict(file_list: List[str], package: Literal["search", "data"]) -> Dict:
    """Map each bundled schema file name to the version recorded in its "$comment" field.

    Args:
        file_list: Schema file names located under ``rcsbapi/<package>/resources/``.
        package: Which subpackage the schemas belong to. The version prefix inside
            "$comment" differs: "schema version: " for search, "schema_version: " for data.

    Returns:
        Dict mapping file name -> lowercase version string ("" when "$comment" is absent).
    """
    resources_dir = Path(__file__).parent.parent.joinpath(package, "resources")
    prefix = "schema version: " if package == "search" else "schema_version: "
    versions: Dict[str, str] = {}
    for f_name in file_list:
        with open(resources_dir.joinpath(f_name), "r", encoding="utf-8") as file:
            schema = json.load(file)
        if "$comment" in schema:
            versions[f_name] = schema["$comment"].lower().replace(prefix, "")
        else:
            versions[f_name] = ""
    return versions
44 |
45 |
def update_schema(
    f_name: str,
    file_url: str,
    package: Literal["search", "data"],
) -> str:
    """Fetch the latest schema from `file_url` and overwrite the bundled resource file.

    Args:
        f_name: File name to write (e.g. "structure_schema.json").
        file_url: URL of the online schema.
        package: Subpackage owning the schema ("search" or "data").

    Returns:
        str: Version parsed from the schema's "$comment" field, or "" if absent.

    Raises:
        RuntimeError: If the schema could not be fetched.
    """
    # Define path: py-rcsb-api/rcsbapi/<package>/resources/<f_name>
    path = Path(__file__).parent.parent.joinpath(package, "resources", f_name)
    # Fetch BEFORE opening the file for writing: previously a failed fetch
    # (which returns None) truncated the existing schema file and wrote "null".
    new_schema = SEARCH_SCHEMA._fetch_schema(file_url)
    if new_schema is None:
        raise RuntimeError(f"Failed to fetch schema from {file_url}")
    with open(path, "wt", encoding="utf-8") as file:
        json.dump(new_schema, file, indent=4)
    # Version prefix differs between the two APIs ("Schema version:" vs "schema_version:")
    if "$comment" in new_schema:
        if package == "search":
            version = new_schema["$comment"].lower().replace("schema version: ", "")
        else:
            version = new_schema["$comment"].lower().replace("schema_version: ", "")
    else:
        version = ""
    return version
64 |
65 |
def make_changelog_msg(
    file_list: List[str],
    package: Literal["search", "data"],
    current_ver_dict: Dict[str, str],
    new_ver_dict: Dict[str, str],
) -> str:
    """Build a CHANGELOG bullet listing schema files whose version changed.

    Files whose version is unchanged, or whose current version is unknown (""),
    are skipped entirely.

    Returns:
        str: A multi-line message, or "" when no file changed.
    """
    entries = []
    for f_name in file_list:
        old_ver = current_ver_dict[f_name]
        new_ver = new_ver_dict[f_name]
        # Skip files that did not change or have no recorded current version
        if old_ver == new_ver or old_ver == "":
            continue
        entries.append(f" - {f_name.replace('.json', '')} schema {old_ver} -> {new_ver}\n")
    if not entries:
        return ""
    return f"- Update {package} schemas: \n" + "".join(entries)
81 |
82 |
if __name__ == "__main__":
    # Record the versions of the currently bundled schema files BEFORE overwriting them
    search_current_ver_dict = make_version_dict(
        file_list=[const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME, const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME],
        package="search"
    )
    data_current_ver_dict = make_version_dict(
        file_list=list(const.DATA_API_SCHEMA_ENDPOINT_TO_FILE.values()),
        package="data"
    )

    # Update Search API schemas
    search_url_to_file = {
        const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL: const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME,
        const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL: const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME,
    }
    # Use typing.Dict rather than dict[str, str]: annotated assignments evaluate
    # their annotation at runtime, and dict[str, str] raises TypeError on
    # Python 3.8, which this package supports (python_requires >= 3.8).
    search_version_dict: Dict[str, str] = {}
    for url, file_name in search_url_to_file.items():
        search_version = update_schema(
            f_name=file_name,
            file_url=url,
            package="search"
        )
        search_version_dict[file_name] = search_version

    # Update Data API schemas
    data_version_dict: Dict[str, str] = {}
    for endpoint, file_name in const.DATA_API_SCHEMA_ENDPOINT_TO_FILE.items():
        data_version = update_schema(
            f_name=file_name,
            file_url=const.DATA_API_SCHEMA_BASE_URL + endpoint,
            package="data"
        )
        data_version_dict[file_name] = data_version

    # Update full GraphQL Data API schema via an introspection query
    query = DATA_SCHEMA._get_introspection_query()
    schema_response = requests.post(headers={"Content-Type": "application/graphql"}, data=query, url=const.DATA_API_ENDPOINT, timeout=config.API_TIMEOUT)
    assert schema_response.status_code == 200
    data_schema_path = Path(__file__).parent.parent.joinpath(const.DATA_API_SCHEMA_DIR, const.DATA_API_SCHEMA_FILENAME)
    with open(data_schema_path, "wt", encoding="utf-8") as f:
        json.dump(schema_response.json(), f, indent=4)

    # Check if search schema version numbers are the same as each other.
    # If both current and new versions are uniform, one combined message suffices.
    version_list = list(search_version_dict.values())
    curr_ver_list = list(search_current_ver_dict.values())
    if (
        all(ver == version_list[0] for ver in version_list)
        and all(curr_ver == curr_ver_list[0] for curr_ver in curr_ver_list)
    ):
        if not all(curr_ver == version_list[0] for curr_ver in list(search_current_ver_dict.values())):
            print(f"- Update search schemas: {curr_ver_list[0]} -> {version_list[0]}")
        else:
            print("Search schemas are up-to-date")
    else:
        # Per-file versions differ: make search package CHANGELOG message
        search_file_list = list(search_version_dict.keys())
        update_msg = make_changelog_msg(
            file_list=search_file_list,
            package="search",
            current_ver_dict=search_current_ver_dict,
            new_ver_dict=search_version_dict
        )
        if update_msg:
            print(update_msg)
        else:
            # Bug fix: this branch previously printed "Data schema are up-to-date",
            # which mislabeled the search-schema status.
            print("Search schemas are up-to-date")

    # Make data package CHANGELOG message
    version_list = list(data_version_dict.values())
    data_file_list = list(data_version_dict.keys())
    update_msg = make_changelog_msg(
        file_list=data_file_list,
        package="data",
        current_ver_dict=data_current_ver_dict,
        new_ver_dict=data_version_dict
    )
    if update_msg:
        print(update_msg)
    else:
        print("Data schema are up-to-date")
164 |
--------------------------------------------------------------------------------
/rcsbapi/search/__init__.py:
--------------------------------------------------------------------------------
1 | """RCSB PDB Search API"""
2 |
3 | from typing import List
4 | from .search_query import SEARCH_SCHEMA # noqa: F401
5 | from .search_query import Attr, AttributeQuery, TextQuery
6 | from .search_query import SeqSimilarityQuery, SeqMotifQuery, ChemSimilarityQuery, StructSimilarityQuery, StructMotifResidue, StructMotifQuery
7 | from .search_query import Facet, FacetRange, TerminalFilter, GroupFilter, FilterFacet, Sort, GroupBy, RankingCriteriaType
8 | from .search_query import Group
9 |
10 | search_attributes = SEARCH_SCHEMA.search_attributes
11 | group = Group.group
12 |
13 |
14 | def __dir__() -> List[str]:
15 | return sorted(__all__)
16 |
17 |
18 | __all__ = [
19 | "search_attributes",
20 | "Attr",
21 | "TextQuery",
22 | "AttributeQuery",
23 | "SeqSimilarityQuery",
24 | "SeqMotifQuery",
25 | "ChemSimilarityQuery",
26 | "StructSimilarityQuery",
27 | "StructMotifResidue",
28 | "StructMotifQuery",
29 | "Facet",
30 | "FacetRange",
31 | "TerminalFilter",
32 | "GroupFilter",
33 | "FilterFacet",
34 | "Sort", # Rename to prevent overlap?
35 | "GroupBy",
36 | "RankingCriteriaType",
37 | ]
38 |
--------------------------------------------------------------------------------
/rcsbapi/search/search_schema.py:
--------------------------------------------------------------------------------
1 | """Parse the full RCSB PDB search schema
2 |
3 | Provides access to all valid attributes for search queries.
4 | """
5 | import os
6 | import json
7 | import logging
8 | from pathlib import Path
9 | import re
10 | import warnings
11 | from typing import List, Union
12 | import requests
13 | from ..const import const
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
class SearchSchemaGroup:
    """A non-leaf node in the RCSB PDB schema. Leaves are Attr values.

    Instances behave in two complementary ways:
      * as attribute namespaces (children are also attached via setattr by the
        schema builder), enabling tab-completion such as
        ``attrs.rcsb_struct_symmetry.stoichiometry``
      * as dict-like containers (see __getitem__/__contains__/keys/...),
        enabling O(1) lookup of a child node by attribute-name segment
    """

    def __init__(self, attr_type):
        # Leaf class used by __iter__ to recognize terminal attribute nodes
        self.Attr = attr_type  # Attr or AttrLeaf
        # Child nodes keyed by attribute-name segment; values are Attr leaves
        # or nested SearchSchemaGroup objects
        self._members = {}  # Dictionary to store members

    def search(self, pattern: Union[str, re.Pattern], flags=0):
        """Find all attributes in the schema matching a regular expression.

        Args:
            pattern: Regular expression (string or pre-compiled) matched
                against each leaf's full dotted attribute name.
            flags: Flags forwarded to re.compile (ignored if pattern is
                already compiled).

        Returns:
            A list of Attr objects whose attribute matches.
        """
        matcher = re.compile(pattern, flags=flags)
        # Iterating self yields every leaf Attr in this subtree (see __iter__)
        filter_match = filter(lambda a: matcher.search(a.attribute), self)
        return list(filter_match)

    def list(self):
        """Get a list of full names for all structure and chemical attributes"""
        all_list = []
        for attr in self:
            # vars() exposes the leaf's fields; "attribute" holds the full dotted name
            attr_dict = vars(attr)
            name = attr_dict["attribute"]
            all_list.append(name)
        return all_list

    def __iter__(self):
        """Iterate over all leaf nodes

        Example:
            >>> [a for a in attrs if "stoichiometry" in a.attribute]
            [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
        """

        # Depth-first recursion through nested groups; yields only leaf Attr values
        def leaves(self, attr_type):
            for k, v in self._members.items():
                if isinstance(v, attr_type):
                    yield v
                elif isinstance(v, SearchSchemaGroup):
                    yield from iter(v)
                # skips ["Attr"] key in __dict__
                elif v is attr_type:
                    continue
                else:
                    # Shouldn't happen
                    raise TypeError(f"Unrecognized member {k!r}: {v!r}")

        return leaves(self, self.Attr)

    def get_attribute_details(self, attribute: str):
        """Return attribute information given full or partial attribute name

        Args:
            attribute (str): Full attribute name
                (e.g., "rcsb_id", "rcsb_entity_source_organism.scientific_name")

        Returns:
            str: Return corresponding attribute description if there's a match

        NOTE(review): in practice this returns the matching node for a full
        name, a set of leaves for a partial name, or None (with a warning)
        when the path is not found — confirm the intended return type.
        """

        # Recurse through dict-like nodes; assumes leaves support an
        # "attribute" membership test -- TODO confirm for Attr instances
        def leaves(d):
            for v in d.values():
                if "attribute" in v:
                    yield v
                else:
                    yield from leaves(v)

        split_attr = attribute.split(".")
        ptr = self  # dictionary of attributes
        # Walk the dotted path one segment at a time
        for level in split_attr:
            if level not in ptr:
                warnings.warn(f"Attribute path segment '{level}' (for input '{attribute}') not found in schema.", UserWarning)
                return None
            ptr = ptr[level]
        if "attribute" in ptr.__dict__ and getattr(ptr, "attribute") == attribute:  # must be .__dict__ so both SearchSchemaGroup and Attr are compared as dictionaries
            return ptr
        else:
            # Partial path: collect every leaf beneath this node
            return {c for c in leaves(ptr)}

    def get_attribute_type(self, attribute: str) -> Union[str, None]:
        """Return attribute type given full attribute name

        Args:
            attribute (str): Full attribute name
                (e.g., "rcsb_id", "rcsb_entity_source_organism.scientific_name")

        Returns:
            Union[str, None]: Return search service if there's a match.
                structure search: "text"
                chemical search: "chem_text"
                both: ["text", "chem_text"] (raises error later)
        """
        split_attr = attribute.split(".")
        ptr = self  # dictionary of attributes
        # Walk the dotted path; warn and bail on the first unknown segment
        for level in split_attr:
            if level not in ptr:
                warnings.warn(f"Attribute path segment '{level}' (for input '{attribute}') not found in schema.", UserWarning)
                return None
            ptr = ptr[level]
        if "attribute" in ptr.__dict__ and getattr(ptr, "attribute") == attribute:  # must be .__dict__ so both SearchSchemaGroup and Attr are compared as dictionaries
            return getattr(ptr, "type")
        # Reached a group node rather than a leaf: path was not fully qualified
        warnings.warn(f"Incomplete attribute path '{attribute}' - must specify fully qualified path to leaf attribute node.", UserWarning)
        return None

    # Below methods are for making SearchSchemaGroup behave as a Dict (be able to access through keys, etc).
    # This is used for automatically determining search service based on attribute name.

    def __getitem__(self, key):
        """Allow dictionary-like access to members by key."""
        return self._members[key]

    def __setitem__(self, key, value):
        """Set a member in the schema like a dictionary."""
        self._members[key] = value

    def __delitem__(self, key):
        """Delete a member from the schema like a dictionary."""
        del self._members[key]

    def __contains__(self, key):
        """Check if a member exists in the schema."""
        return key in self._members

    def keys(self):
        # Mirror dict.keys() over the child mapping
        return self._members.keys()

    def values(self):
        # Mirror dict.values() over the child mapping
        return self._members.values()

    def items(self):
        # Mirror dict.items() over the child mapping
        return self._members.items()

    def __str__(self):
        # One "key: value" line per direct child (values use their own __str__)
        return "\n".join(f"{key}: {value}" for key, value in self._members.items())

    def __hash__(self):
        """Make the object hashable using the hash of its members.

        NOTE(review): __hash__ is defined without __eq__, so equality remains
        identity-based; also requires all member values to be hashable
        (nested groups recurse into this method) — confirm this is intended.
        """
        return hash(frozenset(self._members.items()))
156 |
157 |
class SearchSchema:
    """Loads the Search API structure and chemical attribute schemas and builds
    the combined ``search_attributes`` tree of SearchSchemaGroup/Attr nodes."""

    def __init__(
        self,
        attr_type,
        refetch=True,
        use_fallback=True,
        reload=True,
        struct_attr_schema_url=const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL,
        struct_attr_schema_file=os.path.join(const.SEARCH_API_SCHEMA_DIR, const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME),
        chem_attr_schema_url=const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL,
        chem_attr_schema_file=os.path.join(const.SEARCH_API_SCHEMA_DIR, const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME),
    ):
        """Initialize SearchSchema object with all known RCSB PDB attributes.

        This is provided to ease autocompletion as compared to creating Attr objects from
        strings. For example,
        ::

            search_attributes.rcsb_nonpolymer_instance_feature_summary.chem_id

        is equivalent to
        ::

            Attr('rcsb_nonpolymer_instance_feature_summary.chem_id')

        All attributes in `search_attributes` can be iterated over.

            >>> [a for a in search_attributes if "stoichiometry" in a.attribute]
            [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]

        Attributes matching a regular expression can also be filtered:

            >>> list(search_attributes.search('rcsb.*stoichiometry'))
            [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
        """
        self.Attr = attr_type
        if reload:
            self.struct_schema = self._reload_schema(struct_attr_schema_url, struct_attr_schema_file, refetch, use_fallback)
            self.chem_schema = self._reload_schema(chem_attr_schema_url, chem_attr_schema_file, refetch, use_fallback)
            self.search_attributes = self._make_schema_group()

    def _reload_schema(self, schema_url: str, schema_file: str, refetch=True, use_fallback=True):
        """Load a schema dict, preferring the online version; fall back to the bundled file."""
        sD = {}
        if refetch:
            sD = self._fetch_schema(schema_url)
        if not sD and use_fallback:
            sD = self._load_json_schema(schema_file)
        return sD

    def _make_schema_group(self) -> SearchSchemaGroup:
        """Merge the structure and chemical schemas into a single SearchSchemaGroup tree."""
        schemas = [(self.struct_schema, const.STRUCTURE_ATTRIBUTE_SEARCH_SERVICE, ""), (self.chem_schema, const.CHEMICAL_ATTRIBUTE_SEARCH_SERVICE, "")]
        schema = self._make_group("", schemas)
        assert isinstance(schema, SearchSchemaGroup)  # for type checking
        return schema

    def _fetch_schema(self, url: str, timeout: float = 60):
        """Request the current schema from the web.

        Args:
            url: Schema endpoint URL.
            timeout: Seconds to wait for the server. Previously the request was
                issued with ``timeout=None``, which could hang indefinitely.

        Returns:
            Parsed JSON dict on HTTP 200, otherwise None (which lets callers
            fall back to the bundled schema file).
        """
        logger.info("Requesting %s", url)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # Network failure: log and return None so the file fallback engages
            # instead of the exception propagating out of schema construction.
            logger.debug("Request to %s failed: %r", url, err)
            return None
        if response.status_code == 200:
            return response.json()
        logger.debug("HTTP response status code %r", response.status_code)
        return None

    def _load_json_schema(self, schema_file):
        """Load the bundled fallback schema file shipped with the package."""
        logger.info("Loading attribute schema from file")
        path = Path(__file__).parent.parent.joinpath(schema_file)
        with open(path, "r", encoding="utf-8") as file:
            latest = json.load(file)
        return latest

    def _make_group(self, fullname: str, nodeL: List):
        """Represent this node of the schema as a python object

        Params:
            - fullname: full dot-separated attribute name
            - nodeL: list of (schema node, search service type, description) tuples;
              two entries when the same name exists in both schemas

        Returns:
            An Attr (Leaf nodes) or SearchSchemaGroup (object nodes)
        """
        group = SearchSchemaGroup(self.Attr)
        for node, attrtype, desc in nodeL:
            # anyOf/oneOf/allOf are only handled for leaf nodes: all variants
            # must collapse to a single Attr or the schema is ambiguous.
            if "anyOf" in node:
                children = {self._make_group(fullname, [(n, attrtype, n.get("description", node.get("description", desc)))]) for n in node["anyOf"]}
                # Currently only deal with anyOf in leaf nodes
                assert len(children) == 1, f"type of {fullname} couldn't be determined"
                return next(iter(children))
            if "oneOf" in node:
                children = {self._make_group(fullname, [(n, attrtype, n.get("description", desc))]) for n in node["oneOf"]}
                # Currently only deal with oneOf in leaf nodes
                assert len(children) == 1, f"type of {fullname} couldn't be determined"
                return next(iter(children))
            if "allOf" in node:
                children = {self._make_group(fullname, [(n, attrtype, n.get("description", desc))]) for n in node["allOf"]}
                # Currently only deal with allOf in leaf nodes
                assert len(children) == 1, f"type of {fullname} couldn't be determined"
                return next(iter(children))
            if node["type"] in ("string", "number", "integer", "date"):
                # For nodes that occur in both schemas, list of both descriptions will be passed in through desc arg
                if isinstance(desc, list):
                    return self.Attr(fullname, attrtype, desc)
                # For non-redundant nodes
                return self.Attr(fullname, attrtype, node.get("description", desc))
            elif node["type"] == "array":
                # skip to items
                return self._make_group(fullname, [(node["items"], attrtype, node.get("description", desc))])
            elif node["type"] == "object":
                for childname, childnode in node["properties"].items():
                    fullchildname = f"{fullname}.{childname}" if fullname else childname
                    # setattr(group, childname, childgroup)
                    if childname in group:
                        assert not isinstance(group[childname], dict)  # redundant name must not have nested attributes

                        # Create attrtype and description lists with existing and current value.
                        # List type triggers error if user doesn't specify service for redundant attribute.
                        currentattr = getattr(group[childname], "type")
                        attrlist = [currentattr, attrtype]

                        currentdescript = getattr(group[childname], "description")
                        descriptlist = [currentdescript, childnode.get("description", desc)]

                        childgroup = self._make_group(fullchildname, [(childnode, attrlist, descriptlist)])
                    else:
                        childgroup = self._make_group(fullchildname, [(childnode, attrtype, childnode.get("description", desc))])
                    # adding to SearchSchemaGroup as a dict allows for determining search service by attribute name with O(1) lookup
                    group[childname] = childgroup

                    # adding to SearchSchemaGroup as an attribute allows for tab-completion for search_attributes/attrs
                    setattr(group, childname, childgroup)
            else:
                raise TypeError(f"Unrecognized node type {node['type']!r} of {fullname}")
        return group

    def _set_leaves(self, d: dict) -> dict:
        """Converts Attr objects to dictionary format (recursively, in place)."""
        for leaf in d:
            if isinstance(d[leaf], self.Attr):
                d[leaf] = d[leaf].__dict__
            else:
                d[leaf] = self._set_leaves(d[leaf])
        return d
300 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests >= 2.0.0
2 | rustworkx
3 | graphql-core
4 | tqdm
5 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | # use py2.py3 tag for pure-python dist:
3 | universal=1
4 |
5 | [metadata]
6 | description_file = README.md
7 |
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# File: setup.py
# Date: 20-May-2024
#
# Update:
#
import re

from setuptools import find_packages
from setuptools import setup

thisPackage = "rcsb-api"

# Parse __version__ out of the package's __init__.py without importing it
with open("rcsbapi/__init__.py", "r", encoding="utf-8") as fd:
    versionMatch = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE)
    # Guard the match so a missing __version__ reaches the explicit RuntimeError
    # below instead of raising AttributeError on .group(1)
    version = versionMatch.group(1) if versionMatch else None

# Load packages from requirements*.txt
with open("requirements.txt", "r", encoding="utf-8") as ifh:
    packagesRequired = [ln.strip() for ln in ifh.readlines()]

with open("README.md", "r", encoding="utf-8") as ifh:
    longDescription = ifh.read()

if not version:
    raise RuntimeError("Cannot find version information")

setup(
    name=thisPackage,
    version=version,
    description="Python package interface for RCSB.org API services",
    long_description_content_type="text/markdown",
    long_description=longDescription,
    python_requires=">=3.8,<4.0",
    author="Dennis Piehl",
    author_email="dennis.piehl@rcsb.org",
    url="https://github.com/rcsb/py-rcsb-api",
    #
    license="MIT",
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Development Status :: 4 - Beta",
        # 'Development Status :: 5 - Production/Stable',
        "Operating System :: OS Independent",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Natural Language :: English",
        "License :: OSI Approved :: MIT License",
        "Typing :: Typed",
    ],
    entry_points={"console_scripts": []},
    #
    install_requires=packagesRequired,
    packages=find_packages(exclude=["tests", "tests-*", "tests.*"]),
    package_data={
        # If any package contains *.md or *.rst ... files, include them:
        # NOTE(review): package_data patterns are relative to each package dir;
        # confirm "rcsbapi/*/resources/*" matches as intended (MANIFEST.in may
        # be what actually ships the resource files).
        "": ["*.md", "*.rst", "*.txt", "*.cfg", "rcsbapi/*/resources/*"]
    },
    #
    test_suite="tests",
    tests_require=["tox"],
    #
    # Not configured ...
    extras_require={
        "dev": ["check-manifest"],
        "test": ["coverage"],
        "tests": ["tox", "pylint", "black>=21.5b1", "flake8"],
        # should match docs/requirements.txt
        "docs": ["sphinx", "sphinx-rtd-theme", "myst-parser"],
    },
    # Added for
    command_options={"build_sphinx": {"project": ("setup.py", thisPackage), "version": ("setup.py", version), "release": ("setup.py", version)}},
    # This setting for namespace package support -
    zip_safe=False,
)
83 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test-data/4hhb-assembly1.cif.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/4hhb-assembly1.cif.gz
--------------------------------------------------------------------------------
/tests/test-data/4hhb.bcif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/4hhb.bcif
--------------------------------------------------------------------------------
/tests/test-data/4hhb.pdb1.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/4hhb.pdb1.gz
--------------------------------------------------------------------------------
/tests/test-data/7n0r.cif.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/7n0r.cif.gz
--------------------------------------------------------------------------------
/tests/test-data/7n0r.pdb.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/7n0r.pdb.gz
--------------------------------------------------------------------------------
/tests/test-data/invalid.txt:
--------------------------------------------------------------------------------
1 | This is an invalid file that should cause the program to throw a type error when this is passed through the file upload function.
--------------------------------------------------------------------------------
/tests/test_data_query.py:
--------------------------------------------------------------------------------
1 | ##
2 | # File: test_data_query.py
3 | # Author:
4 | # Date:
5 | # Version:
6 | #
7 | # Update:
8 | #
9 | #
10 | ##
11 | """
12 | Tests for all functions of the schema file. (Work in progress)
13 | """
14 |
15 | __docformat__ = "google en"
16 | __author__ = ""
17 | __email__ = ""
18 | __license__ = ""
19 |
20 | import logging
21 |
22 | # import importlib
23 | # import platform
24 | # import resource
25 | import time
26 | import unittest
27 | import requests
28 |
29 | from rcsbapi.search import search_attributes as attrs
30 | from rcsbapi.data import DataSchema, DataQuery
31 | from rcsbapi.config import config
32 | from rcsbapi.const import const
33 |
34 | logger = logging.getLogger(__name__)
35 | logger.setLevel(logging.INFO)
36 |
37 |
38 | class QueryTests(unittest.TestCase):
    def setUp(self):
        # Record wall-clock start time so tearDown can report per-test duration
        self.__startTime = time.time()
        logger.info("Starting %s at %s", self.id().split(".")[-1], time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
42 |
    def tearDown(self) -> None:
        # Log completion time and elapsed seconds (start recorded in setUp)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id().split(".")[-1], time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
46 |
47 | def testGetEditorLink(self):
48 | # query_str = '{ entries(entry_ids: ["4HHB", "1IYE"]) {\n exptl {\n method_details\n method\n details\n crystals_number\n }\n}}'
49 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": ["4HHB", "1IYE"]}, return_data_list=["exptl"])
50 | url = query_obj.get_editor_link()
51 | response_json = requests.get(url, timeout=10)
52 | self.assertEqual(response_json.status_code, 200)
53 |
54 | def testExec(self):
55 | with self.subTest("1. Batching into requests with fewer Ids"):
56 | input_ids = []
57 | for _ in range(165):
58 | input_ids.append("4HHB")
59 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": input_ids}, return_data_list=["exptl"])
60 | query_obj.exec()
61 | # assert that the batch and merge functions are called
62 | # assert len of results is same as num of input ids
63 |
64 | def testLowercaseIds(self):
65 | with self.subTest(msg="1. List of IDs"):
66 | try:
67 | query_obj = DataQuery(input_type="entries", input_ids=["4hhb"], return_data_list=["exptl.method"])
68 | query_obj.exec()
69 | except Exception as error:
70 | self.fail(f"Failed unexpectedly: {error}")
71 | with self.subTest(msg="2. Dictionary of IDs"):
72 | try:
73 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": ["4hhb", "1iye"]}, return_data_list=["exptl"])
74 | query_obj.exec()
75 | except Exception as error:
76 | self.fail(f"Failed unexpectedly: {error}")
77 | with self.subTest(msg="2. IDs with separators"):
78 | try:
79 | query_obj = DataQuery(input_type="interfaces", input_ids=["4hhb-1.1"], return_data_list=["rcsb_interface_info.interface_area"])
80 | query_obj.exec()
81 | except Exception as error:
82 | self.fail(f"Failed unexpectedly: {error}")
83 | with self.subTest(msg="3. Pubmed IDs"):
84 | try:
85 | query_obj = DataQuery(input_type="pubmed", input_ids=["6726807"], return_data_list=["rcsb_pubmed_doi"])
86 | query_obj.exec()
87 | except Exception as error:
88 | self.fail(f"Failed unexpectedly: {error}")
89 | with self.subTest(msg="3. UniProt IDs"):
90 | try:
91 | query_obj = DataQuery(input_type="uniprot", input_ids=["p68871"], return_data_list=["rcsb_id"])
92 | query_obj.exec()
93 | except Exception as error:
94 | self.fail(f"Failed unexpectedly: {error}")
95 |
    def testParseGQLError(self):
        # TODO: implement -- placeholder for testing GraphQL error parsing
        pass
98 |
99 | def testBatchIDs(self):
100 | input_ids = []
101 | for _ in range(165):
102 | input_ids.append("4HHB")
103 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": input_ids}, return_data_list=["exptl"])
104 | batch_size = 50
105 | batched_ids = query_obj._batch_ids(batch_size)
106 | total_ids = 0
107 | for batch in batched_ids:
108 | len_id_batch = len(batch)
109 | self.assertLessEqual(len_id_batch, batch_size)
110 | total_ids += len_id_batch
111 | self.assertEqual(len(query_obj.get_input_ids()), total_ids)
112 |
    def testMergeResponse(self):
        # TODO: implement -- should verify merged responses combine lengths
        # and contain all input IDs
        # assert that the lengths are combined and all ids are present?
        pass
116 |
117 | def testDocs(self):
118 | with self.subTest(msg="1. Initialize Schema"):
119 | schema = DataSchema()
120 |
121 | with self.subTest(msg="2. README 1"):
122 | try:
123 | query_obj = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl.method"])
124 | query_obj.exec()
125 | except Exception as error:
126 | self.fail(f"Failed unexpectedly: {error}")
127 |
128 | with self.subTest(msg="3. README 2"):
129 | try:
130 | query_obj = DataQuery(
131 | input_type="polymer_entities",
132 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"],
133 | return_data_list=[
134 | "polymer_entities.rcsb_id",
135 | "rcsb_entity_source_organism.ncbi_taxonomy_id",
136 | "rcsb_entity_source_organism.ncbi_scientific_name",
137 | "cluster_id",
138 | "identity",
139 | ],
140 | )
141 | query_obj.exec()
142 | except Exception as error:
143 | self.fail(f"Failed unexpectedly: {error}")
144 |
145 | with self.subTest(msg="4. Quickstart 1"):
146 | try:
147 | query_obj = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl.method"])
148 | query_obj.exec()
149 | except Exception as error:
150 | self.fail(f"Failed unexpectedly: {error}")
151 |
152 | with self.subTest(msg="5. Quickstart 2, autocompletion"):
153 | try:
154 | query_obj = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl"])
155 | query_obj.exec()
156 | except Exception as error:
157 | self.fail(f"Failed unexpectedly: {error}")
158 |
159 | with self.subTest(msg="4. Helpful methods, get_editor_link()"):
160 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl"])
161 | response = requests.get(query.get_editor_link(), timeout=5)
162 | self.assertEqual(response.status_code, 200)
163 |
164 | with self.subTest(msg="5. Helpful methods, find_paths()"):
165 | try:
166 | schema.find_paths(input_type="entries", return_data_name="id")
167 | except Exception as error:
168 | self.fail(f"Failed unexpectedly: {error}")
169 |
170 | with self.subTest(msg="6. Helpful methods, get_input_id_dict"):
171 | test_dict = schema.get_input_id_dict("polymer_entity_instance")
172 | polymer_instance_keys = ["entry_id", "asym_id"]
173 | for key in polymer_instance_keys:
174 | self.assertIn(key, test_dict.keys())
175 | for value in test_dict.values():
176 | self.assertIsNotNone(value)
177 |
178 | with self.subTest(msg="7. Troubleshooting, Not a unique field"):
179 | with self.assertRaises(ValueError):
180 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["id"])
181 | try:
182 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["entry.id"])
183 | except Exception as error:
184 | self.fail(f"Failed unexpectedly: {error}")
185 |
186 | def testAddExamples(self):
187 | with self.subTest(msg="1. Entries"):
188 | try:
189 | query = DataQuery(input_type="entries", input_ids=["1STP", "2JEF", "1CDG"], return_data_list=["entries.rcsb_id", "struct.title", "exptl.method"])
190 | query.exec()
191 | except Exception as error:
192 | self.fail(f"Failed unexpectedly: {error}")
193 |
194 | with self.subTest(msg="2. Primary Citation"):
195 | try:
196 | query = DataQuery(
197 | input_type="entries",
198 | input_ids=["1STP", "2JEF", "1CDG"],
199 | return_data_list=[
200 | "entries.rcsb_id",
201 | "rcsb_accession_info.initial_release_date",
202 | "audit_author.name",
203 | "rcsb_primary_citation.pdbx_database_id_PubMed",
204 | "rcsb_primary_citation.pdbx_database_id_DOI",
205 | ],
206 | )
207 | query.exec()
208 | except Exception as error:
209 | self.fail(f"Failed unexpectedly: {error}")
210 |
211 | with self.subTest(msg="3. Polymer Entities"):
212 | try:
213 | query = DataQuery(
214 | input_type="polymer_entities",
215 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"],
216 | return_data_list=[
217 | "polymer_entities.rcsb_id",
218 | "rcsb_entity_source_organism.ncbi_taxonomy_id",
219 | "rcsb_entity_source_organism.ncbi_scientific_name",
220 | "cluster_id",
221 | "identity",
222 | ],
223 | )
224 | query.exec()
225 | except Exception as error:
226 | self.fail(f"Failed unexpectedly: {error}")
227 |
228 | with self.subTest(msg="4. Polymer Instances"):
229 | try:
230 | query = DataQuery(
231 | input_type="polymer_entity_instances",
232 | input_ids=["4HHB.A", "12CA.A", "3PQR.A"],
233 | return_data_list=[
234 | "polymer_entity_instances.rcsb_id",
235 | "rcsb_polymer_instance_annotation.annotation_id",
236 | "rcsb_polymer_instance_annotation.name",
237 | "rcsb_polymer_instance_annotation.type",
238 | ],
239 | )
240 | query.exec()
241 | except Exception as error:
242 | self.fail(f"Failed unexpectedly: {error}")
243 |
244 | with self.subTest(msg="5. Carbohydrates"):
245 | try:
246 | query = DataQuery(
247 | input_type="branched_entities",
248 | input_ids=["5FMB_2", "6L63_3"],
249 | return_data_list=["pdbx_entity_branch.type", "pdbx_entity_branch_descriptor.type", "pdbx_entity_branch_descriptor.descriptor"],
250 | )
251 | query.exec()
252 | except Exception as error:
253 | self.fail(f"Failed unexpectedly: {error}")
254 |
255 | with self.subTest(msg="6. Sequence Positional Features"):
256 | try:
257 | query = DataQuery(
258 | input_type="polymer_entity_instances",
259 | input_ids={"instance_ids": ["1NDO.A"]},
260 | return_data_list=[
261 | "polymer_entity_instances.rcsb_id",
262 | "rcsb_polymer_instance_feature.type",
263 | "rcsb_polymer_instance_feature.feature_positions.beg_seq_id",
264 | "rcsb_polymer_instance_feature.feature_positions.end_seq_id",
265 | ],
266 | )
267 | query.exec()
268 | except Exception as error:
269 | self.fail(f"Failed unexpectedly: {error}")
270 |
271 | with self.subTest(msg="7. Reference Sequence Identifiers"):
272 | try:
273 | query = DataQuery(
274 | input_type="entries",
275 | input_ids=["7NHM", "5L2G"],
276 | return_data_list=[
277 | "entries.rcsb_id",
278 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
279 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
280 | ],
281 | )
282 | query.exec()
283 | except Exception as error:
284 | self.fail(f"Failed unexpectedly: {error}")
285 |
286 | with self.subTest(msg="8. Chemical Components"):
287 | try:
288 | query = DataQuery(
289 | input_type="chem_comps",
290 | input_ids=["NAG", "EBW"],
291 | return_data_list=[
292 | "chem_comps.rcsb_id",
293 | "chem_comp.type",
294 | "chem_comp.formula_weight",
295 | "chem_comp.name",
296 | "chem_comp.formula",
297 | "rcsb_chem_comp_info.initial_release_date",
298 | ],
299 | )
300 | query.exec()
301 | except Exception as error:
302 | self.fail(f"Failed unexpectedly: {error}")
303 |
304 | with self.subTest(msg="9. Computed Structure Models"):
305 | try:
306 | query = DataQuery(input_type="entries", input_ids=["AF_AFP68871F1"], return_data_list=["ma_qa_metric_global.type", "ma_qa_metric_global.value"])
307 | query.exec()
308 | except Exception as error:
309 | self.fail(f"Failed unexpectedly: {error}")
310 |
311 | def testQuickstartNotebook(self):
312 | with self.subTest(msg="1. Initialize Schema"):
313 | schema = DataSchema()
314 | with self.subTest(msg="2. GraphQL example query"):
315 | query = """
316 | {
317 | entry(entry_id: "4HHB") {
318 | rcsb_entry_info {
319 | nonpolymer_bound_components
320 | }
321 | }
322 | }
323 | """
324 | response_json = requests.post(headers={"Content-Type": "application/graphql"}, data=query, url=const.DATA_API_ENDPOINT, timeout=config.API_TIMEOUT).json()
325 | self.assertNotIn("errors", response_json.keys())
326 | with self.subTest(msg="4. Making Queries"):
327 | try:
328 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["nonpolymer_bound_components"])
329 | query.exec()
330 | except Exception as error:
331 | self.fail(f"Failed unexpectedly: {error}")
332 | with self.subTest(msg="5. input_ids, mult args"):
333 | try:
334 | query = DataQuery(input_type="polymer_entity_instances", input_ids=["4HHB.A"], return_data_list=["nonpolymer_bound_components"])
335 | query.exec()
336 | except Exception as error:
337 | self.fail(f"Failed unexpectedly: {error}")
338 | with self.subTest(msg="6. input_ids, list as entry input_ids"):
339 | try:
340 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["nonpolymer_bound_components"])
341 | query.exec()
342 | except Exception as error:
343 | self.fail(f"Failed unexpectedly: {error}")
344 | with self.subTest(msg="7. input_ids, list as polymer instance input_ids"):
345 | try:
346 | query = DataQuery(input_type="polymer_entity_instances", input_ids=["4HHB.A"], return_data_list=["nonpolymer_bound_components"])
347 | query.exec()
348 | except Exception as error:
349 | self.fail(f"Failed unexpectedly: {error}")
350 | with self.subTest(msg="8. return_data_list, Not a unique field error"):
351 | with self.assertRaises(ValueError):
352 | query = DataQuery(input_type="polymer_entity_instances", input_ids=["4HHB.A"], return_data_list=["polymer_composition"])
353 | query.exec()
354 | with self.subTest(msg="9. return_data_list, find_paths() methods"):
355 | try:
356 | schema = DataSchema()
357 | schema.find_paths("polymer_entity_instances", "polymer_composition")
358 | except Exception as error:
359 | self.fail(f"Failed unexpectedly: {error}")
360 | with self.subTest(msg="10. return_data_list, corrected query with non-redundant field"):
361 | try:
362 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["rcsb_entry_info.polymer_composition"])
363 | query.exec()
364 | except Exception as error:
365 | self.fail(f"Failed unexpectedly: {error}")
366 | with self.subTest(msg="11. find_field_names()"):
367 | try:
368 | schema.find_field_names("polymer_composition")
369 | except Exception as error:
370 | self.fail(f"Failed unexpectedly: {error}")
371 | try:
372 | schema.find_field_names("comp")
373 | except Exception as error:
374 | self.fail(f"Failed unexpectedly: {error}")
375 | with self.subTest(msg="12. More complex queries, multiple ids"):
376 | try:
377 | query = DataQuery(input_type="entries", input_ids=["4HHB", "12CA", "3PQR"], return_data_list=["nonpolymer_bound_components"])
378 | query.exec()
379 | except Exception as error:
380 | self.fail(f"Failed unexpectedly: {error}")
381 | with self.subTest(msg="13. More complex queries, multiple return data"):
382 | try:
383 | query = DataQuery(
384 | input_type="entries", input_ids=["4HHB"], return_data_list=["citation.title", "nonpolymer_bound_components", "rcsb_entry_info.polymer_composition"]
385 | )
386 | query.exec()
387 | except Exception as error:
388 | self.fail(f"Failed unexpectedly: {error}")
389 |
390 | def testSearchDataNotebook(self):
391 | with self.subTest(msg="1. Construct search API query and request"):
392 | # search API query and request
393 | try:
394 | q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == "COVID-19 virus"
395 | q2 = attrs.rcsb_nonpolymer_entity_annotation.type == "SUBJECT_OF_INVESTIGATION"
396 | q3 = attrs.rcsb_polymer_entity_feature_summary.type == "modified_monomer"
397 | query = q1 & q2 & q3
398 | result_list = query()
399 | except Exception as error:
400 | self.fail(f"Failed unexpectedly: {error}")
401 | self.assertGreaterEqual(len(list(result_list)), 10)
402 | with self.subTest(msg="2. Construct data API query and parse result"):
403 | try:
404 | data_query = DataQuery(
405 | input_type="entries",
406 | # input ids removed because "rcsb_nonpolymer_instance_validation_score" is None: "6W61", "7ARF", "7JPZ", "7JQ3"
407 | input_ids=["7AWU", "7C8B", "7JP0", "7JQ0", "7JQ1", "7JQ2"],
408 | return_data_list=[
409 | "entries.rcsb_id",
410 | "rcsb_nonpolymer_entity_instance_container_identifiers.comp_id",
411 | "is_subject_of_investigation",
412 | "citation.title",
413 | "citation.pdbx_database_id_DOI",
414 | ],
415 | )
416 | data_query.exec()
417 | except Exception as error:
418 | self.fail(f"Failed unexpectedly: {error}")
419 | try:
420 | json = data_query.get_response()["data"]["entries"]
421 | json[0]["rcsb_id"]
422 | json[0]["nonpolymer_entities"]
423 | json[0]["nonpolymer_entities"][0]["nonpolymer_entity_instances"]
424 | json[0]["nonpolymer_entities"][0]["nonpolymer_entity_instances"][0]["rcsb_nonpolymer_instance_validation_score"][0]["is_subject_of_investigation"]
425 | json[0]["nonpolymer_entities"][0]["nonpolymer_entity_instances"][0]["rcsb_nonpolymer_entity_instance_container_identifiers"]["comp_id"]
426 | json[0]["citation"][0]["title"]
427 | json[0]["citation"][0]["pdbx_database_id_DOI"]
428 | except Exception as error:
429 | self.fail(f"Failed unexpectedly: {error}")
430 |
431 | def testAllStructures(self):
432 | from rcsbapi.data import ALL_STRUCTURES
433 |
434 | with self.subTest("1. Test entries ALL_STRUCTURES"):
435 | try:
436 | data_query = DataQuery(
437 | input_type="entries",
438 | input_ids=ALL_STRUCTURES,
439 | return_data_list=["exptl.method"],
440 | )
441 | data_query.exec()
442 | except Exception as error:
443 | self.fail(f"Failed unexpectedly: {error}")
444 |
445 | with self.subTest("2. Test chem_comps ALL_STRUCTURES"):
446 | try:
447 | data_query = DataQuery(
448 | input_type="chem_comps",
449 | input_ids=ALL_STRUCTURES,
450 | return_data_list=["chem_comps.rcsb_id"],
451 | )
452 | data_query.exec()
453 | except Exception as error:
454 | self.fail(f"Failed unexpectedly: {error}")
455 |
456 |
def buildQuery():
    """Assemble the DataQuery test suite in a fixed execution order."""
    test_names = (
        "testGetEditorLink",
        "testExec",
        "testLowercaseIds",
        "testBatchIDs",
        "testDocs",
        "testAddExamples",
        "testQuickstartNotebook",
        "testSearchDataNotebook",
        "testAllStructures",
    )
    suite = unittest.TestSuite()
    for test_name in test_names:
        suite.addTest(QueryTests(test_name))
    return suite
469 |
470 |
if __name__ == "__main__":
    # Run the DataQuery suite with verbose output when executed directly.
    unittest.TextTestRunner(verbosity=2).run(buildQuery())
474 |
--------------------------------------------------------------------------------
/tests/test_search_schema.py:
--------------------------------------------------------------------------------
1 | ##
# File:    test_search_schema.py
3 | # Author: Spencer Bliven/Santiago Blaumann
4 | # Date: 6/7/23
5 | # Version: 1.0
6 | #
7 | # Update:
8 | #
9 | #
10 | ##
11 | """
12 | Tests for all functions of the schema file.
13 | """
14 |
15 | __docformat__ = "google en"
16 | __author__ = "Santiago Blaumann"
17 | __email__ = "santiago.blaumann@rcsb.org"
18 | __license__ = "BSD 3-Clause"
19 |
20 | import logging
21 | import platform
22 | import resource
23 | import time
24 | import unittest
25 | import os
26 |
27 | from rcsbapi.search import search_attributes as attrs
28 | from rcsbapi.search import SEARCH_SCHEMA
29 | from rcsbapi.const import const
30 |
31 | logger = logging.getLogger(__name__)
32 | logger.setLevel(logging.INFO)
33 |
34 |
class SchemaTests(unittest.TestCase):
    """Tests for search API schema loading, version alignment, and attribute metadata."""

    def setUp(self):
        # Record start time so tearDown can report the test duration.
        self.__startTime = time.time()
        logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        # ru_maxrss is reported in bytes on macOS and kilobytes on Linux,
        # so the same 10**6 divisor yields MB vs GB respectively.
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testSchema(self):
        """Sanity-check that attribute objects resolve to their dotted attribute names."""
        ok = attrs.rcsb_id.attribute == "rcsb_id"
        self.assertTrue(ok)
        ok2 = attrs.rcsb_struct_symmetry.symbol.attribute == "rcsb_struct_symmetry.symbol"
        self.assertTrue(ok2)
        logger.info("Schema test results: ok : (%r), ok2: (%r)", ok, ok2)

    def _checkSchemaVersion(self, schemaUrl, schemaFileName, label):
        """Fetch the web schema and load the packaged local schema, then assert both
        declare 3-part semantic versions whose major.minor values differ by at most 0.1.

        Args:
            schemaUrl: URL of the attribute schema served by the search API.
            schemaFileName: filename of the packaged local copy of that schema.
            label: short name ("Metadata" or "Chemical") used in log messages.
        """
        webSchema = SEARCH_SCHEMA._fetch_schema(schemaUrl)
        localSchema = SEARCH_SCHEMA._load_json_schema(os.path.join(const.SEARCH_API_SCHEMA_DIR, schemaFileName))
        # The version string is the last whitespace-delimited token of "$comment".
        webVer = webSchema.get("$comment").split()[-1]
        localVer = localSchema.get("$comment").split()[-1]
        ok = len(localVer.split(".")) == 3 and len(webVer.split(".")) == 3
        self.assertTrue(ok)
        logger.info("ok is %r", ok)
        webVerMajorMinor = float(".".join(webVer.split(".")[0:2]))
        localVerMajorMinor = float(".".join(localVer.split(".")[0:2]))
        # The packaged schema may lag the web schema, but by no more than 0.1 (major.minor).
        ok = localVerMajorMinor <= webVerMajorMinor and localVerMajorMinor >= webVerMajorMinor - 0.10
        logger.info("ok is %r", ok)
        self.assertTrue(ok)
        logger.info("%s schema tests results: local version (%r) and web version (%s)", label, localVer, webVer)

    def testSchemaVersion(self):
        """Check packaged structure and chemical attribute schemas are current
        relative to the schemas served by the search API."""
        # Check structure attribute schema version
        self._checkSchemaVersion(const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL, const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME, "Metadata")
        # Check chemical attribute schema version
        self._checkSchemaVersion(const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL, const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME, "Chemical")

    def testFetchSchema(self):
        """Verify schema fetching succeeds for valid URLs and returns None on HTTP errors."""
        # check fetching of structure attribute schema
        fetchSchema = SEARCH_SCHEMA._fetch_schema(const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL)
        ok = fetchSchema is not None
        logger.info("ok is %r", ok)
        self.assertTrue(ok)
        # check fetching of chemical attribute schema
        fetchSchema = SEARCH_SCHEMA._fetch_schema(const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL)
        ok = fetchSchema is not None
        logger.info("ok is %r", ok)
        self.assertTrue(ok)
        # A URL that returns 404 should yield None rather than raising.
        errorURL = "https://httpbin.org/status/404"
        fetchSchema = SEARCH_SCHEMA._fetch_schema(errorURL)
        ok = fetchSchema is None
        logger.info("ok is %r", ok)
        self.assertTrue(ok)

    def testRcsbAttrs(self):
        """Check attribute metadata: descriptions exist and lookup by name behaves."""
        with self.subTest(msg="1. Check type and descriptions exist for attributes"):
            for attr in attrs:
                attr_dict = vars(attr)
                desc = attr_dict["description"]
                self.assertIsNotNone(desc)

        with self.subTest(msg="2. Check searching for attribute details"):
            attr_details = attrs.get_attribute_details("drugbank_info.drug_groups")
            for obj_attr in ["attribute", "type", "description"]:
                self.assertIn(obj_attr, vars(attr_details).keys())

            # special case because rcsb_id is in both structure and chemical attributes
            attr_dict = vars(attrs.get_attribute_details("rcsb_id"))
            self.assertIsInstance(attr_dict["type"], list)
            self.assertIsInstance(attr_dict["description"], list)

            # Unknown attribute names return None rather than raising.
            attr_details = attrs.get_attribute_details("foo")
            self.assertIsNone(attr_details)
119 |
120 |
def buildSchema():
    """Assemble the schema test suite in a fixed execution order."""
    suite = unittest.TestSuite()
    for test_name in ("testSchema", "testSchemaVersion", "testFetchSchema", "testRcsbAttrs"):
        suite.addTest(SchemaTests(test_name))
    return suite
129 |
130 |
if __name__ == "__main__":
    # Run the schema suite with verbose output when executed directly.
    unittest.TextTestRunner(verbosity=2).run(buildSchema())
134 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # File: tox.ini (Templated version)
2 | #
3 | [local_settings]
4 | # project specific local settings
5 | test_pattern = "test*.py"
6 | #
# Source paths (unquoted space separated list of files/directories) for linting and format checks
8 | source_paths = rcsbapi setup.py
9 | #
10 | # Start directory path for test discovery
# Each path must reference a valid directory that is searchable by Python (i.e. contains __init__.py)
12 | # ** It is harmless to point to paths containing no tests.
13 | #
14 | test_path_1 = "tests"
15 | # These are placeholders valid source directories without tests files
16 | # test_path_2 = "rcsbapi/data"
17 | # test_path_3 = "rcsbapi"
18 | # test_path_4 = "rcsbapi"
19 | #
20 | # Comma separate list of directories for which test coverage will be evaluated
21 | coverage_source_paths = "rcsbapi,tests"
22 | # coverage_exclude_paths = "rcsbapi/__init__.py"
23 | coverage_cutoff = 65
24 | #
25 | ## --------------------------------------------------------------------------
26 | ## ---------- No project specific setting beyond this point --------------
27 | ##
28 | [tox]
29 | # The complete list of supported test environments to setup and invoke
30 | envlist = format_pep8-{py39}, lint_pylint-{py39}, format_black-{py39}, py{39}, test_coverage-{py39}
31 | #
32 | minversion = 3.4.0
33 | skip_missing_interpreters = true
34 | skipsdist = false
35 |
36 | [testenv]
37 | passenv = CONFIG_SUPPORT_TOKEN_ENV
38 | allowlist_externals = echo
39 | commands =
40 | echo "Starting default tests in testenv"
41 | basepython = py39: python3.9
42 |
43 | [testenv:py39]
44 | description = 'Run unit tests (unittest runner) using {envpython}'
45 | platform=
46 | macos: darwin
47 | linux: linux
48 | skip_install = false
49 | recreate = true
50 | alwayscopy=true
51 | package = editable-legacy
52 | deps =
53 | -r requirements.txt
54 | commands =
55 | echo "Starting {envname}"
56 | {envpython} -V
57 | {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_1} --pattern "{[local_settings]test_pattern}"
58 | # {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_2} --pattern "{[local_settings]test_pattern}"
59 | # {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_3} --pattern "{[local_settings]test_pattern}"
60 | # {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_4} --pattern "{[local_settings]test_pattern}"
61 | echo "Completed {envname}"
62 |
63 | #
64 | [testenv:format_pep8-py39]
65 | description = 'Run selected PEP8 compliance checks (flake8)'
66 | platform=
67 | macos: darwin
68 | linux: linux
69 | deps =
70 | flake8
71 | # This plugin is no longer compatible with latest pydocstyles -
72 | # flake8-docstrings>=0.2.7
73 | flake8-import-order>=0.9
74 | -r requirements.txt
75 | commands =
# Exceptions: D for docstrings, I for imports order and formatting, E203 is slice spacing - W503 multiline spacing incompatible with black
77 | flake8 --max-line-length=185 --ignore=D,I,E203,W503 {[local_settings]source_paths}
78 |
79 | #
80 | [testenv:lint_pylint-py39]
81 | description = 'Run linting compliance checks (pylint)'
82 | platform=
83 | macos: darwin
84 | linux: linux
85 | deps =
86 | pylint
87 | -r requirements.txt
88 | commands =
89 | echo "Starting {envname}"
90 | pylint --disable=R,C --reports=n --rcfile={toxinidir}/pylintrc {[local_settings]source_paths}
91 | echo "Completed {envname}"
92 |
93 | #
94 | [testenv:format_black-py39]
95 | description = 'Run format compliance checks (black)'
96 | platform=
97 | macos: darwin
98 | linux: linux
99 | deps =
100 | black>=21.5b1
101 | -r requirements.txt
102 | # isort>=4.3.20
103 | commands =
104 | echo "Starting {envname}"
105 | black --check --line-length 185 {[local_settings]source_paths}
106 | # isort -rc rcsb/utils --check-only
107 | echo "Completed {envname}"
108 |
109 | #
110 | [testenv:test_coverage-py39]
111 | description = 'Run test coverage analysis'
112 | platform=
113 | macos: darwin
114 | linux: linux
115 | recreate = true
116 | alwayscopy=true
117 | package = editable-legacy
118 | deps =
119 | coverage
120 | -r requirements.txt
121 |
122 | commands =
123 | echo "Starting {envname}"
124 | coverage erase
125 | coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_1} --pattern "{[local_settings]test_pattern}"
126 | # coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_2} --pattern "{[local_settings]test_pattern}"
127 | # coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_3} --pattern "{[local_settings]test_pattern}"
128 | # coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_4} --pattern "{[local_settings]test_pattern}"
129 | echo " ------- Consolidating {envname} data ----------"
130 | coverage combine
131 | echo " ------- Building {envname} reports ----------"
132 | coverage report --fail-under={[local_settings]coverage_cutoff}
133 | - coverage xml
134 | echo "Completed {envname}"
--------------------------------------------------------------------------------