├── .gitignore ├── .readthedocs.yml ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── NOTICE.md ├── README.md ├── azure-pipelines.yml ├── azure-template-publish-job.yml ├── azure-template-tox-job.yml ├── codemeta.json ├── docs ├── Makefile ├── conf.py ├── data_api │ ├── additional_examples.md │ ├── api.rst │ ├── implementation_details.md │ ├── query_construction.md │ └── quickstart.md ├── index.rst ├── make.bat ├── requirements.txt └── search_api │ ├── additional_examples.md │ ├── api.rst │ ├── attributes.md │ ├── query_construction.md │ └── quickstart.md ├── notebooks ├── covid.ipynb ├── data_quickstart.ipynb ├── multisearch.ipynb ├── search_data_workflow.ipynb ├── search_examples.ipynb └── search_quickstart.ipynb ├── pylintrc ├── rcsbapi ├── __init__.py ├── config.py ├── const.py ├── data │ ├── __init__.py │ ├── data_query.py │ ├── data_schema.py │ └── resources │ │ ├── assembly.json │ │ ├── branched_entity.json │ │ ├── branched_entity_instance.json │ │ ├── chem_comp.json │ │ ├── data_api_schema.json │ │ ├── drugbank.json │ │ ├── entry.json │ │ ├── nonpolymer_entity.json │ │ ├── nonpolymer_entity_instance.json │ │ ├── polymer_entity.json │ │ ├── polymer_entity_instance.json │ │ ├── pubmed.json │ │ └── uniprot.json ├── dev_tools │ └── update_schema.py └── search │ ├── __init__.py │ ├── resources │ ├── chemical_schema.json │ └── structure_schema.json │ ├── search_query.py │ └── search_schema.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test-data │ ├── 2mnr.cif │ ├── 4hhb-assembly1.cif.gz │ ├── 4hhb.bcif │ ├── 4hhb.cif │ ├── 4hhb.pdb │ ├── 4hhb.pdb1 │ ├── 4hhb.pdb1.gz │ ├── 7n0r.cif.gz │ ├── 7n0r.pdb.gz │ └── invalid.txt ├── test_data_query.py ├── test_data_schema.py ├── test_search_query.py └── test_search_schema.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 
7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | # VS Code settings 163 | .vscode/ -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: "ubuntu-22.04" 10 | tools: 11 | python: "3.9" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/conf.py 16 | 17 | # Optionally build your docs in additional formats such as PDF 18 | formats: 19 | - pdf 20 | 21 | python: 22 | install: 23 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v1.1.3 (2025-05-05) 4 | 5 | - Fix: Update regex pattern for instances in `const.py` to support suffixes longer than one character (e.g., "1S5L.AA") 6 | 7 | ## v1.1.2 (2025-03-20) 8 | 9 | - Update how `dataclass` attributes are created in `const.py` 10 | 11 | ## v1.1.1 (2025-03-13) 12 | 13 | - Add missing dependency for building documentation 14 | - Add docstrings 15 | 16 | ## v1.1.0 (2025-03-12) 17 | 18 | - Add `ALL_STRUCTURES` object, allowing Data API queries for all PDB structures and chemical components 19 | - Add `progress_bar` and `batch_size` parameters to Data API package's `.exec` 20 | - Add `group` function to Search API package to enforce nested grouping 21 | - Update README with new citation information 22 | - Update search schemas: 1.48.0 -> 1.49.0 23 | - Update data schemas: 24 | - entry schema 9.0.3 -> 9.0.4 25 | - polymer_entity_instance schema 10.0.2 -> 10.0.3 26 | - nonpolymer_entity_instance schema 10.0.0 -> 10.0.1 27 | 28 | ## v1.0.1 (2025-01-17) 29 | 30 | - Add import to `const.py` for 
compatibility with Python 3.8 31 | - Update search schemas: 1.47.7 -> 1.48.0 32 | 33 | ## v1.0.0 (2024-11-06) 34 | 35 | - Release version 1.0.0 of package 36 | - Update search schemas: 1.47.6 -> 1.47.7 37 | - Update data schemas: 38 | - entry schema 9.0.2 -> 9.0.3 39 | - chem_comp schema 7.1.3 -> 7.1.4 40 | - Update documentation 41 | 42 | ## v0.5.0 (2024-10-28) 43 | 44 | - Separate out package-wide settings into immutable constants (`const.py`) and configurable parameters (`config.py`) 45 | - Renamed `rcsb_attributes` -> `search_attributes` 46 | - Automatically capitalize input_ids 47 | - Added `dev_tools` directory and updated `update_schema.py` 48 | - Search API `chemical_schema` and `structure_schema` at v1.47.6 49 | - Update documentation 50 | 51 | ## v0.4.0 (2024-10-15) 52 | 53 | - Merge [rcsbsearchapi package](https://github.com/rcsb/py-rcsbsearchapi/tree/2ba4d82ed1ff23c4ba5d07d4dec63f6f4030207d) into package as separate `rcsbapi.search` module 54 | - Renamed several classes and methods in this process: 55 | - `SequenceQuery` -> `SeqSimilarityQuery` 56 | - `StructureMotifResidue` -> `StructMotifResidue` 57 | - `Range` -> `FacetRange` 58 | - `rcsb_query_editor_url` -> `get_editor_link` 59 | - `rcsb_query_builder_url` -> `get_query_builder_link` 60 | - Renamed several files and classes to prevent overlap with future developments: 61 | - `data/query.py` -> `data/data_query.py` 62 | - `data/schema.py` -> `data/data_schema.py` 63 | - `Query()` Data API class -> `DataQuery()` 64 | - `Schema()` Data API class -> `DataSchema()` 65 | - `search/search.py` -> `search/search_query.py` 66 | - `search/schema.py` -> `search/search_schema.py` 67 | - Automatically change singular "input_type" to plural when possible 68 | - Add warning message if fully qualified field path not provided 69 | - Update documentation 70 | 71 | ## v0.3.0 (2024-08-23) 72 | 73 | - Falls back to local schema file when fetch fails 74 | - Supports dot separated field names for requesting data 75 | - 
`get_unique_fields` deleted and replaced with `find_paths` 76 | - `find_field_names` changed to return only field names, no descriptions 77 | - Executing queries called with `.exec()` 78 | - Updates to documentation 79 | - See [PR #31](https://github.com/rcsb/py-rcsb-api/pull/31) for full details 80 | - Updated data_api_schema.json and added all schema files on https://data.rcsb.org/#data-schema 81 | 82 | ## v0.2.0 (2024-07-25) 83 | 84 | - Updates to Query methods 85 | - Added GraphQL query validation 86 | - Updates to documentation 87 | 88 | ## v0.1.0 (2024-07-22) 89 | 90 | - First release! 91 | - Provides Pythonic interface for interacting with RCSB.org Data API 92 | - Automated Data API schema parsing via Schema.py 93 | - Enables query building and execution via Query.py 94 | - Documentation and example notebooks 95 | - See [PR #23](https://github.com/rcsb/py-rcsb-api/pull/23) for full details -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 RCSB PDB 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # 2 | # File: py-rcsb-api/MANIFEST.in 3 | # 4 | include HISTORY.txt 5 | include requirements.txt 6 | include README.md 7 | # 8 | -------------------------------------------------------------------------------- /NOTICE.md: -------------------------------------------------------------------------------- 1 | # Third-Party Copyright Notices 2 | `rcsb-api` uses third-party libraries or other resources that may 3 | be distributed under licenses different than the `rcsb-api` software. 4 | 5 | In the event that we accidentally failed to list a required notice, 6 | please bring it to our attention through the creation of a [GitHub issue](https://github.com/rcsb/py-rcsb-api/issues). 7 | 8 | The attached notices are provided for information only. 9 | 10 | ## [rcsbsearchapi](https://github.com/rcsb/py-rcsbsearchapi) 11 | 12 | BSD 3-Clause License 13 | -------------------- 14 | 15 | Copyright 2024 rcsbsearchapi Contributors 16 | 17 | Redistribution and use in source and binary forms, with or without 18 | modification, are permitted provided that the following conditions are met: 19 | 20 | 1. Redistributions of source code must retain the above copyright notice, 21 | this list of conditions and the following disclaimer. 22 | 23 | 2. Redistributions in binary form must reproduce the above copyright notice, 24 | this list of conditions and the following disclaimer in the documentation 25 | and/or other materials provided with the distribution. 26 | 27 | 3. 
Neither the name of the copyright holder nor the names of its contributors 28 | may be used to endorse or promote products derived from this software 29 | without specific prior written permission. 30 | 31 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 32 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 34 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 35 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 37 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 38 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 39 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 40 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPi Release](https://img.shields.io/pypi/v/rcsb-api.svg)](https://pypi.org/project/rcsb-api/) 2 | [![Build Status](https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_apis/build/status/rcsb.py-rcsb-api?branchName=master)](https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_build/latest?definitionId=40&branchName=master) 3 | [![Documentation Status](https://readthedocs.org/projects/rcsbapi/badge/?version=latest)](https://rcsbapi.readthedocs.io/en/latest/?badge=latest) 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14052470.svg)](https://doi.org/10.5281/zenodo.14052470) 5 | [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10424/badge)](https://www.bestpractices.dev/projects/10424) 6 | [![FAIR checklist badge](https://fairsoftwarechecklist.net/badge.svg)](https://fairsoftwarechecklist.net/v0.2?f=31&a=30112&i=32111&r=133) 7 | [![fair-software.eu](https://img.shields.io/badge/fair--software.eu-%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F%20%20%E2%97%8F-green)](https://fair-software.eu) 8 | 9 | # rcsb-api 10 | Python interface for RCSB PDB API services at RCSB.org. 11 | 12 | This package requires Python 3.8 or later. 13 | 14 | 15 | ## Installation 16 | Get it from PyPI: 17 | 18 | pip install rcsb-api 19 | 20 | Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api/) 21 | 22 | 23 | ## Getting Started 24 | Full documentation available at [readthedocs](https://rcsbapi.readthedocs.io/en/latest/). 25 | 26 | The [RCSB PDB Search API](https://search.rcsb.org) supports RESTful requests according to a defined [schema](https://search.rcsb.org/redoc/index.html). This package provides an `rcsbapi.search` module that simplifies generating complex search queries. 
27 | 28 | The [RCSB PDB Data API](https://data.rcsb.org) supports requests using [GraphQL](https://graphql.org/), a language for API queries. This package provides an `rcsbapi.data` module that simplifies generating queries in GraphQL syntax. 29 | 30 | ### Search API 31 | The `rcsbapi.search` module supports all available [Advanced Search](https://www.rcsb.org/search/advanced) services, as listed below. For more details on their usage, see [Search Service Types](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#search-service-types). 32 | 33 | |Search service |QueryType | 34 | |----------------------------------|--------------------------| 35 | |Full-text |`TextQuery()` | 36 | |Attribute (structure or chemical) |`AttributeQuery()` | 37 | |Sequence similarity |`SeqSimilarityQuery()` | 38 | |Sequence motif |`SeqMotifQuery()` | 39 | |Structure similarity |`StructSimilarityQuery()` | 40 | |Structure motif |`StructMotifQuery()` | 41 | |Chemical similarity |`ChemSimilarityQuery()` | 42 | 43 | #### Search API Examples 44 | To perform a search for all structures from humans associated with the term "Hemoglobin", you can combine a "full-text" query (`TextQuery`) with an "attribute" query (`AttributeQuery`): 45 | 46 | ```python 47 | from rcsbapi.search import AttributeQuery, TextQuery 48 | from rcsbapi.search import search_attributes as attrs 49 | 50 | # Construct a "full-text" sub-query for structures associated with the term "Hemoglobin" 51 | q1 = TextQuery(value="Hemoglobin") 52 | 53 | # Construct an "attribute" sub-query to search for structures from humans 54 | q2 = AttributeQuery( 55 | attribute="rcsb_entity_source_organism.scientific_name", 56 | operator="exact_match", # Other operators include "contains_phrase", "exists", and more 57 | value="Homo sapiens" 58 | ) 59 | # OR, do so by using Python bitwise operators: 60 | q2 = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens" 61 | 62 | # Combine the sub-queries (can sub-group 
using parentheses and standard operators, "&", "|", etc.) 63 | query = q1 & q2 64 | 65 | # Fetch the results by iterating over the query execution 66 | for rId in query(): 67 | print(rId) 68 | 69 | # OR, capture them into a variable 70 | results = list(query()) 71 | ``` 72 | 73 | These examples are in `operator` syntax. You can also make queries in `fluent` syntax. Learn more about both syntaxes and implementation details in [Query Syntax and Execution](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#query-syntax-and-execution). 74 | 75 | 76 | ### Data API 77 | The `rcsbapi.data` module allows you to easily construct GraphQL queries to the RCSB.org Data API. 78 | 79 | This is done by specifying the following input: 80 | - "input_type": the data hierarchy level you are starting from (e.g., "entry", "polymer_entity", etc.) (See full list [here](https://rcsbapi.readthedocs.io/en/latest/data_api/query_construction.html#input-type)). 81 | - "input_ids": the list of IDs for which to fetch data (corresponding to the specified "input_type") 82 | - "return_data_list": the list of data items ("fields") to retrieve. (Available fields can be explored [here](https://data.rcsb.org/data-attributes.html) or via the [GraphiQL editor's Documentation Explorer panel](https://data.rcsb.org/graphql/index.html).) 83 | 84 | #### Data API Examples 85 | This is a [simple query](https://data.rcsb.org/graphql/index.html?query=%7B%0A%20%20entry(entry_id%3A%20%224HHB%22)%20%7B%0A%20%20%20%20exptl%20%7B%0A%20%20%20%20%20%20method%0A%20%20%20%20%7D%0A%20%20%7D%0A%7D) requesting the experimental method of a structure with PDB ID 4HHB (Hemoglobin). 86 | 87 | The query must be executed using the `.exec()` method, which will return the JSON response as well as store the response as an attribute of the `DataQuery` object. From the object, you can access the Data API response, get an interactive editor link, or access the arguments used to create the query. 
88 | The package is able to automatically build queries based on the "input_type" and path segment passed into "return_data_list". If using this package in code intended for long-term use, it's recommended to use fully qualified paths. When autocompletion is being used, a WARNING message will be printed out as a reminder. 89 | 90 | ```python 91 | from rcsbapi.data import DataQuery as Query 92 | query = Query( 93 | input_type="entries", 94 | input_ids=["4HHB"], 95 | return_data_list=["exptl.method"] 96 | ) 97 | print(query.exec()) 98 | ``` 99 | Data is returned in JSON format 100 | ```json 101 | { 102 | "data": { 103 | "entries": [ 104 | { 105 | "rcsb_id": "4HHB", 106 | "exptl": [ 107 | { 108 | "method": "X-RAY DIFFRACTION" 109 | } 110 | ] 111 | } 112 | ] 113 | } 114 | } 115 | ``` 116 | 117 | Here is a [more complex query](https://data.rcsb.org/graphql/index.html?query=%7B%0A%20%20polymer_entities(entity_ids%3A%5B%222CPK_1%22%2C%223WHM_1%22%2C%222D5Z_1%22%5D)%20%7B%0A%20%20%20%20rcsb_id%0A%20%20%20%20rcsb_entity_source_organism%20%7B%0A%20%20%20%20%20%20ncbi_taxonomy_id%0A%20%20%20%20%20%20ncbi_scientific_name%0A%20%20%20%20%7D%0A%20%20%20%20rcsb_cluster_membership%20%7B%0A%20%20%20%20%20%20cluster_id%0A%20%20%20%20%20%20identity%0A%20%20%20%20%7D%0A%20%20%7D%0A%7D). Note that periods can be used to further specify requested data in return_data_list. Also note multiple return data items and ids can be requested in one query. 
118 | ```python 119 | from rcsbapi.data import DataQuery as Query 120 | query = Query( 121 | input_type="polymer_entities", 122 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"], 123 | return_data_list=[ 124 | "polymer_entities.rcsb_id", 125 | "rcsb_entity_source_organism.ncbi_taxonomy_id", 126 | "rcsb_entity_source_organism.ncbi_scientific_name", 127 | "cluster_id", 128 | "identity" 129 | ] 130 | ) 131 | print(query.exec()) 132 | ``` 133 | 134 | ## Jupyter Notebooks 135 | Several Jupyter notebooks with example use cases and workflows for all package modules are provided under [notebooks](notebooks/). 136 | 137 | For example, one notebook using both Search and Data API packages for a COVID-19 related example is available in [notebooks/search_data_workflow.ipynb](notebooks/search_data_workflow.ipynb) or online through Google Colab Open In Colab. 138 | 139 | 140 | ## Citing 141 | Please cite the ``rcsb-api`` package with the following reference: 142 | 143 | > Dennis W. Piehl, Brinda Vallat, Ivana Truong, Habiba Morsy, Rusham Bhatt, 144 | > Santiago Blaumann, Pratyoy Biswas, Yana Rose, Sebastian Bittrich, Jose M. Duarte, 145 | > Joan Segura, Chunxiao Bi, Douglas Myers-Turnbull, Brian P. Hudson, Christine Zardecki, 146 | > Stephen K. Burley. rcsb-api: Python Toolkit for Streamlining Access to RCSB Protein 147 | > Data Bank APIs, Journal of Molecular Biology, 2025. 148 | > DOI: [10.1016/j.jmb.2025.168970](https://doi.org/10.1016/j.jmb.2025.168970) 149 | 150 | You should also cite the RCSB.org API services this package utilizes: 151 | 152 | > Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi 153 | > Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley, 154 | > John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards 155 | > Integrated Searching and Efficient Access to Macromolecular Structure Data 156 | > from the PDB Archive, Journal of Molecular Biology, 2020. 
157 | > DOI: [10.1016/j.jmb.2020.11.003](https://doi.org/10.1016/j.jmb.2020.11.003) 158 | 159 | 160 | ## Documentation and Support 161 | Please refer to the [readthedocs page](https://rcsbapi.readthedocs.io/en/latest/index.html) to learn more about package usage and other available features as well as to see more examples. 162 | 163 | If you experience any issues installing or using the package, please submit an issue on [GitHub](https://github.com/rcsb/py-rcsb-api/issues) and we will try to respond in a timely manner. 164 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # File: azure-pipelines.yml 2 | # Date: 30-May-2024 3 | # 4 | name: $(BuildDefinitionName)_$(Date:yyyyMMdd)$(Rev:.rr) 5 | 6 | trigger: 7 | - master 8 | 9 | pr: 10 | - master 11 | - staging 12 | 13 | schedules: 14 | - cron: "0 12 * * 0" 15 | displayName: Weekly Sunday build 16 | branches: 17 | include: 18 | - master 19 | always: true 20 | 21 | jobs: 22 | - template: azure-template-tox-job.yml 23 | parameters: {tox: 'format_pep8', python: '3.9', os: 'linux'} 24 | - template: azure-template-tox-job.yml 25 | parameters: {tox: 'lint_pylint', python: '3.9', os: 'linux'} 26 | - template: azure-template-tox-job.yml 27 | parameters: {tox: 'test_coverage', python: '3.9', os: 'linux'} 28 | # 29 | - template: azure-template-tox-job.yml 30 | parameters: {tox: 'py39', python: '3.9', os: 'linux'} 31 | - template: azure-template-tox-job.yml 32 | parameters: {tox: 'py39', python: '3.9', os: 'macos'} 33 | # 34 | - template: azure-template-publish-job.yml 35 | parameters: {tox: 'py39', python: '3.9', os: 'macos'} 36 | - template: azure-template-publish-job.yml 37 | parameters: {tox: 'py39', python: '3.9', os: 'linux'} 38 | # 39 | -------------------------------------------------------------------------------- /azure-template-publish-job.yml: 
-------------------------------------------------------------------------------- 1 | # File: azure-template-publish-job.yml 2 | # Date: 8-Jun-2023 3 | # 4 | ## 5 | parameters: 6 | tox: "" 7 | python: "" 8 | os: "linux" 9 | fixtures: "" 10 | 11 | jobs: 12 | - job: ${{ format('publish_{0}_{1}', parameters.tox, parameters.os) }} 13 | pool: 14 | ${{ if eq(parameters.os, 'macos') }}: 15 | vmImage: 'macOS-15' 16 | ${{ if eq(parameters.os, 'linux') }}: 17 | vmImage: 'ubuntu-latest' 18 | dependsOn: 19 | - ${{ format('build_test_{0}_{1}', parameters.tox, parameters.os) }} 20 | condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'), eq(variables['Build.SourceBranch'], 'refs/heads/master')) 21 | # 22 | steps: 23 | - task: UsePythonVersion@0 24 | inputs: 25 | versionSpec: ${{ parameters.python }} 26 | addToPath: true 27 | displayName: setup python 28 | # 29 | #- checkout: self 30 | # submodules: true 31 | # 32 | - download: current 33 | artifact: ${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }} 34 | 35 | - download: current 36 | artifact: ${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }} 37 | # 38 | - script: ls -lR $(Pipeline.Workspace)/${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }} 39 | displayName: "Listing of downloaded artifacts" 40 | # 41 | - script: python -m pip install --upgrade pip twine setuptools wheel 42 | displayName: 'Install packaging tools' 43 | # 44 | - task: DownloadSecureFile@1 45 | name: pypicred 46 | displayName: 'Download PyPI credentials' 47 | inputs: 48 | secureFile: 'PYPIRC-AZURE' 49 | 50 | - ${{ if startsWith(parameters.os, 'linux') }}: 51 | - script: twine upload --verbose --skip-existing -r pypi --config-file $(pypicred.secureFilePath) $(Pipeline.Workspace)/${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }}/* 52 | displayName: "Linux upload sdist and source wheel to PyPi ..." 
53 | continueOnError: true 54 | # 55 | - ${{ if startsWith(parameters.os, 'macos') }}: 56 | - script: twine upload --verbose --skip-existing -r pypi --config-file $(pypicred.secureFilePath) $(Pipeline.Workspace)/${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }}/* 57 | displayName: "Mac upload sdist and binary wheel to PyPi ..." 58 | continueOnError: true -------------------------------------------------------------------------------- /azure-template-tox-job.yml: -------------------------------------------------------------------------------- 1 | # File: azure-template-tox-job.yml 2 | # Date: 30-May-2024 3 | # 4 | ## 5 | parameters: 6 | tox: "" 7 | python: "" 8 | os: "linux" 9 | fixtures: "" 10 | 11 | jobs: 12 | - job: ${{ format('build_test_{0}_{1}', parameters.tox, parameters.os) }} 13 | timeoutInMinutes: 0 14 | pool: 15 | ${{ if eq(parameters.os, 'macos') }}: 16 | vmImage: 'macOS-15' 17 | ${{ if eq(parameters.os, 'linux') }}: 18 | vmImage: 'ubuntu-latest' 19 | 20 | variables: 21 | - group: py-shared-variables 22 | 23 | steps: 24 | # 25 | # ensure the required Python versions are available 26 | - task: UsePythonVersion@0 27 | inputs: 28 | versionSpec: ${{ parameters.python }} 29 | addToPath: true 30 | displayName: setup python 31 | # 32 | - checkout: self 33 | submodules: true 34 | # 35 | - ${{ if startsWith(parameters.os, 'macos') }}: 36 | - bash: | 37 | set -e 38 | ls -la /Applications/Xcode* 39 | sudo xcode-select --switch /Applications/Xcode_16.app/Contents/Developer 40 | which g++ 41 | c++ --version 42 | displayName: "setup Xcode" 43 | # ---------------------------------------------- 44 | - ${{ if startsWith(parameters.os, 'linux') }}: 45 | - script: which apt 46 | displayName: 'Installing OS dependencies' 47 | - script: apt-cache policy | grep http | awk '{print $2 $3}' | sort -u 48 | displayName: 'Checking for repos' 49 | # 50 | - script: "python -c \"import sys; print(sys.version); print(sys.executable)\"" 51 | displayName: show python information 
52 | # 53 | - script: python -m pip install --upgrade pip tox 54 | displayName: 'Install tools' 55 | # 56 | - script: pip install -r requirements.txt 57 | displayName: 'Install dependencies' 58 | # 59 | - ${{ if startsWith(parameters.tox, 'py') }}: 60 | - script: | 61 | export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) 62 | ${{ format('python -m tox -e {0}', parameters.tox) }} 63 | displayName: 'Running tox task' 64 | - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.9')) }}: 65 | - script: | 66 | export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) 67 | ${{ format('python -m tox -e {0}-py39', parameters.tox) }} 68 | displayName: 'Running tox task' 69 | - ${{ if and(not(startsWith(parameters.tox, 'py')), startsWith(parameters.python, '3.8')) }}: 70 | - script: | 71 | export CONFIG_SUPPORT_TOKEN_ENV=$(VAR_CONFIG_SUPPORT_TOKEN_ENV) 72 | ${{ format('python -m tox -e {0}-py38', parameters.tox) }} 73 | displayName: 'Running tox task' 74 | # 75 | # Build artifacts if this is a test target (i.e. 
labeled as py##) 76 | # 77 | - ${{ if startsWith(parameters.tox, 'py') }}: 78 | - script: pip install --upgrade pip twine setuptools wheel 79 | displayName: "Acquire build tools" 80 | - script: python setup.py sdist --dist-dir "$(System.DefaultWorkingDirectory)/dist" 81 | displayName: "Build source dist" 82 | - script: python setup.py bdist_wheel --dist-dir "$(System.DefaultWorkingDirectory)/dist" 83 | displayName: "Build wheel" 84 | # 85 | - script: python setup.py sdist --dist-dir "$(System.DefaultWorkingDirectory)/udist" 86 | displayName: "Build source dist" 87 | # 88 | # Check the install artifacts 89 | - script: ls -lR "$(System.DefaultWorkingDirectory)/dist" "$(System.DefaultWorkingDirectory)/udist" 90 | displayName: "Listing of installed software" 91 | # 92 | - publish: $(System.DefaultWorkingDirectory)/dist 93 | artifact: ${{ format('sw_{0}_{1}', parameters.tox, parameters.os) }} 94 | # 95 | - publish: $(System.DefaultWorkingDirectory)/udist 96 | artifact: ${{ format('sw_u_{0}_{1}', parameters.tox, parameters.os) }} 97 | # -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://w3id.org/codemeta/3.0", 3 | "type": "SoftwareSourceCode", 4 | "applicationCategory": "Structural Biology, Bioinformatics", 5 | "codeRepository": "https://github.com/rcsb/py-rcsb-api", 6 | "dateCreated": "2024-04-15", 7 | "dateModified": "2025-03-20", 8 | "datePublished": "2024-07-22", 9 | "description": "Python interface for RCSB PDB API services at RCSB.org.", 10 | "downloadUrl": "https://github.com/rcsb/py-rcsb-api/archive/refs/heads/master.zip", 11 | "funder": { 12 | "type": "Organization", 13 | "name": "US National Science Foundation (DBI-2321666), US Department of Energy (DE-SC0019749), National Cancer Institute, National Institute of Allergy and Infectious Diseases, and National Institute of General Medical Sciences of the 
National Institutes of Health (R01GM157729)" 14 | }, 15 | "keywords": [ 16 | "structural biology", 17 | "bioinformatics", 18 | "protein structure", 19 | "application programming interface", 20 | "APIs" 21 | ], 22 | "license": "https://spdx.org/licenses/MIT", 23 | "name": "rcsb-api", 24 | "operatingSystem": [ 25 | "Linux", 26 | "Windows", 27 | "MacOS" 28 | ], 29 | "programmingLanguage": "Python 3", 30 | "relatedLink": [ 31 | "https://pypi.org/project/rcsb-api/", 32 | "https://rcsbapi.readthedocs.io/en/latest/index.html" 33 | ], 34 | "softwareRequirements": "https://github.com/rcsb/py-rcsb-api/blob/master/requirements.txt", 35 | "version": "1.1.2", 36 | "codemeta:contIntegration": { 37 | "id": "https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_build/latest?definitionId=40&branchName=master" 38 | }, 39 | "continuousIntegration": "https://dev.azure.com/rcsb/RCSB%20PDB%20Python%20Projects/_build/latest?definitionId=40&branchName=master", 40 | "developmentStatus": "active", 41 | "funding": "DBI-2321666, DE-SC0019749, R01GM157729", 42 | "issueTracker": "https://github.com/rcsb/py-rcsb-api/issues", 43 | "referencePublication": "https://doi.org/10.1016/j.jmb.2025.168970" 44 | } -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | livehtml: 18 | sphinx-autobuild -b html -z "$(SOURCEDIR)/../rcsbapi" "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("..")) 17 | import rcsbapi # noqa: E402 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "rcsb-api" 22 | copyright = "2024, RCSB PDB" 23 | author = "RCSB PDB" 24 | 25 | # The version info for the project you're documenting, acts as replacement for 26 | # |version| and |release|, also used in various other places throughout the 27 | # built documents. 28 | # 29 | # The short X.Y version. 
30 | version = rcsbapi.__version__.split("-")[0] 31 | # The full version, including alpha/beta/rc tags 32 | release = rcsbapi.__version__ 33 | 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | "sphinx.ext.autodoc", 42 | "sphinx.ext.coverage", 43 | "sphinx.ext.napoleon", 44 | "myst_parser", 45 | ] 46 | # source_suffix = [".rst", ".md"] # Redundant with newer sphinx versions 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # List of patterns, relative to source directory, that match files and 52 | # directories to ignore when looking for source files. 53 | # This pattern also affects html_static_path and html_extra_path. 54 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 55 | 56 | # Napoleon settings 57 | # napoleon_google_docstring = True 58 | napoleon_numpy_docstring = False 59 | # napoleon_include_init_with_doc = False 60 | # napoleon_include_private_with_doc = False 61 | # napoleon_include_special_with_doc = True 62 | # napoleon_use_admonition_for_examples = False 63 | # napoleon_use_admonition_for_notes = False 64 | # napoleon_use_admonition_for_references = False 65 | # napoleon_use_ivar = False 66 | # napoleon_use_param = True 67 | # napoleon_use_rtype = True 68 | 69 | 70 | # -- Options for HTML output ------------------------------------------------- 71 | 72 | # The theme to use for HTML and HTML Help pages. See the documentation for 73 | # a list of builtin themes. 74 | html_theme = "sphinx_rtd_theme" 75 | 76 | 77 | # Add any paths that contain custom static files (such as style sheets) here, 78 | # relative to this directory. 
They are copied after the builtin static files, 79 | # so a file named "default.css" will overwrite the builtin "default.css". 80 | html_static_path = [] 81 | -------------------------------------------------------------------------------- /docs/data_api/additional_examples.md: -------------------------------------------------------------------------------- 1 | # Additional Examples 2 | Most examples come from [RCSB PDB Data API documentation](https://data.rcsb.org/#examples) 3 | 4 | ### Entries 5 | Fetch information about structure title and experimental method for PDB entries: 6 | ```python 7 | from rcsbapi.data import DataQuery as Query 8 | 9 | query = Query( 10 | input_type="entries", 11 | input_ids=["1STP", "2JEF", "1CDG"], 12 | return_data_list=["entries.rcsb_id", "struct.title", "exptl.method"] 13 | ) 14 | result_dict = query.exec() 15 | print(result_dict) 16 | ``` 17 | Performs the following GraphQL query: 18 | ``` 19 | { 20 | entries(entry_ids: ["1STP", "2JEF", "1CDG"]) { 21 | rcsb_id 22 | struct { 23 | title 24 | } 25 | exptl { 26 | method 27 | } 28 | } 29 | } 30 | ``` 31 | To find more about the return_data_list dot notation, see [ValueError: Not a unique field](query_construction.md#valueerror-not-a-unique-field) 32 | 33 | ### Primary Citation 34 | Fetch primary citation information (structure authors, PubMed ID, DOI) and release date for PDB entries: 35 | 36 | ```python 37 | from rcsbapi.data import DataQuery as Query 38 | 39 | query = Query( 40 | input_type="entries", 41 | input_ids=["1STP", "2JEF", "1CDG"], 42 | return_data_list=[ 43 | "entries.rcsb_id", 44 | "rcsb_accession_info.initial_release_date", 45 | "audit_author.name", 46 | "rcsb_primary_citation.pdbx_database_id_PubMed", 47 | "rcsb_primary_citation.pdbx_database_id_DOI" 48 | ] 49 | ) 50 | result_dict = query.exec() 51 | print(result_dict) 52 | ``` 53 | Performs the following GraphQL query: 54 | ``` 55 | { 56 | entries(entry_ids: ["1STP", "2JEF", "1CDG"]) { 57 | rcsb_id 58 | 
rcsb_accession_info { 59 | initial_release_date 60 | } 61 | audit_author { 62 | name 63 | } 64 | rcsb_primary_citation { 65 | pdbx_database_id_PubMed 66 | pdbx_database_id_DOI 67 | } 68 | } 69 | } 70 | ``` 71 | 72 | ### Polymer Entities 73 | Fetch taxonomy information and information about membership in the sequence clusters for polymer entities: 74 | 75 | ```python 76 | from rcsbapi.data import DataQuery as Query 77 | 78 | query = Query( 79 | input_type="polymer_entities", 80 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"], 81 | return_data_list=[ 82 | "polymer_entities.rcsb_id", 83 | "rcsb_entity_source_organism.ncbi_taxonomy_id", 84 | "rcsb_entity_source_organism.ncbi_scientific_name", 85 | "cluster_id", 86 | "identity" 87 | ] 88 | ) 89 | result_dict = query.exec() 90 | print(result_dict) 91 | ``` 92 | Performs the following GraphQL query: 93 | ``` 94 | { 95 | polymer_entities(entity_ids: ["2CPK_1", "3WHM_1", "2D5Z_1"]) { 96 | rcsb_id 97 | rcsb_entity_source_organism { 98 | ncbi_taxonomy_id 99 | ncbi_scientific_name 100 | } 101 | rcsb_cluster_membership { 102 | cluster_id 103 | identity 104 | } 105 | } 106 | } 107 | ``` 108 | 109 | ### Polymer Instances 110 | Fetch information about the domain assignments for polymer entity instances: 111 | 112 | ```python 113 | from rcsbapi.data import DataQuery as Query 114 | 115 | query = Query( 116 | input_type="polymer_entity_instances", 117 | input_ids=["4HHB.A", "12CA.A", "3PQR.A"], 118 | return_data_list=[ 119 | "polymer_entity_instances.rcsb_id", 120 | "rcsb_polymer_instance_annotation.annotation_id", 121 | "rcsb_polymer_instance_annotation.name", 122 | "rcsb_polymer_instance_annotation.type" 123 | ] 124 | ) 125 | result_dict = query.exec() 126 | print(result_dict) 127 | ``` 128 | Performs the following GraphQL query: 129 | ``` 130 | { 131 | polymer_entity_instances(instance_ids: ["4HHB.A", "12CA.A", "3PQR.A"]) { 132 | rcsb_id 133 | rcsb_polymer_instance_annotation { 134 | annotation_id 135 | name 136 | type 137 | } 138 | } 
139 | } 140 | ``` 141 | 142 | ### Carbohydrates 143 | Query branched entities (sugars or oligosaccharides) for commonly used linear descriptors: 144 | 145 | ```python 146 | from rcsbapi.data import DataQuery as Query 147 | 148 | query = Query( 149 | input_type="branched_entities", 150 | input_ids=["5FMB_2", "6L63_3"], 151 | return_data_list=[ 152 | "rcsb_id", 153 | "pdbx_entity_branch.type", 154 | "pdbx_entity_branch_descriptor.type", 155 | "pdbx_entity_branch_descriptor.descriptor" 156 | ] 157 | ) 158 | result_dict = query.exec() 159 | print(result_dict) 160 | ``` 161 | Performs the following GraphQL query: 162 | ``` 163 | { 164 | branched_entities(entity_ids: ["5FMB_2", "6L63_3"]) { 165 | rcsb_id 166 | pdbx_entity_branch { 167 | type 168 | } 169 | pdbx_entity_branch_descriptor { 170 | type 171 | descriptor 172 | } 173 | } 174 | } 175 | ``` 176 | 177 | ### Sequence Positional Features 178 | 179 | Sequence positional features describe regions or sites of interest in the PDB sequences, such as binding sites, active sites, linear motifs, local secondary structure, structural and functional domains, etc. Positional annotations include depositor-provided information available in the PDB archive as well as annotations integrated from external resources (e.g. UniProtKB). 180 | 181 | This example queries `polymer_entity_instances` positional features. The query returns features of different types: for example, CATH and SCOP classifications assignments integrated from UniProtKB data, or the secondary structure annotations from the PDB archive data calculated by the data-processing program called MAXIT (Macromolecular Exchange and Input Tool) that is based on an earlier ProMotif implementation. 
182 | 183 | ```python 184 | from rcsbapi.data import DataQuery as Query 185 | 186 | query = Query( 187 | input_type="polymer_entity_instances", 188 | input_ids=["1NDO.A"], 189 | return_data_list=[ 190 | "polymer_entity_instances.rcsb_id", 191 | "rcsb_polymer_instance_feature.type", 192 | "rcsb_polymer_instance_feature.feature_positions.beg_seq_id", 193 | "rcsb_polymer_instance_feature.feature_positions.end_seq_id" 194 | ] 195 | ) 196 | result_dict = query.exec() 197 | print(result_dict) 198 | ``` 199 | Performs the following GraphQL query: 200 | ``` 201 | { 202 | polymer_entity_instances(instance_ids: ["1NDO.A"]) { 203 | rcsb_id 204 | rcsb_polymer_instance_feature { 205 | type 206 | feature_positions { 207 | beg_seq_id 208 | end_seq_id 209 | } 210 | } 211 | } 212 | } 213 | ``` 214 | 215 | ### Reference Sequence Identifiers 216 | This example shows how to access identifiers related to entries (cross-references) and found in data collections other than PDB. Each cross-reference is described by the database name and the database accession. A single entry can have cross-references to several databases, e.g. UniProt and GenBank in 7NHM, or no cross-references, e.g. 
5L2G: 217 | ```python 218 | from rcsbapi.data import DataQuery as Query 219 | 220 | query = Query( 221 | input_type="entries", 222 | input_ids=["7NHM", "5L2G"], 223 | return_data_list=[ 224 | "polymer_entities.rcsb_id", 225 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession", 226 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name" 227 | ] 228 | ) 229 | result_dict = query.exec() 230 | print(result_dict) 231 | ``` 232 | Performs the following GraphQL query: 233 | ``` 234 | { 235 | entries(entry_ids: ["7NHM", "5L2G"]){ 236 | polymer_entities { 237 | rcsb_id 238 | rcsb_polymer_entity_container_identifiers { 239 | reference_sequence_identifiers { 240 | database_accession 241 | database_name 242 | } 243 | } 244 | } 245 | } 246 | } 247 | ``` 248 | 249 | ### Chemical Components 250 | Query for specific items in the chemical component dictionary based on a given list of CCD ids: 251 | 252 | ```python 253 | from rcsbapi.data import DataQuery as Query 254 | 255 | query = Query( 256 | input_type="chem_comps", 257 | input_ids=["NAG", "EBW"], 258 | return_data_list=[ 259 | "chem_comps.rcsb_id", 260 | "chem_comp.type", 261 | "chem_comp.formula_weight", 262 | "chem_comp.name", 263 | "chem_comp.formula", 264 | "rcsb_chem_comp_info.initial_release_date" 265 | ] 266 | ) 267 | result_dict = query.exec() 268 | print(result_dict) 269 | ``` 270 | Performs the following GraphQL query: 271 | ``` 272 | { 273 | chem_comps(comp_ids: ["NAG", "EBW"]) { 274 | rcsb_id 275 | chem_comp { 276 | type 277 | formula_weight 278 | name 279 | formula 280 | } 281 | rcsb_chem_comp_info { 282 | initial_release_date 283 | } 284 | } 285 | } 286 | ``` 287 | 288 | ### Computed Structure Models 289 | This example shows how to get a list of global Model Quality Assessment metrics for AlphaFold structure of Hemoglobin subunit beta: 290 | 291 | ```python 292 | from rcsbapi.data import DataQuery as 
Query 293 | 294 | query = Query( 295 | input_type="entries", 296 | input_ids=["AF_AFP68871F1"], 297 | return_data_list=["rcsb_id", "ma_qa_metric_global.type", "ma_qa_metric_global.value"] 298 | ) 299 | result_dict = query.exec() 300 | print(result_dict) 301 | ``` 302 | Performs the following GraphQL query: 303 | ``` 304 | { 305 | entries(entry_ids: ["AF_AFP68871F1"]) { 306 | rcsb_id 307 | rcsb_ma_qa_metric_global { 308 | ma_qa_metric_global { 309 | type 310 | value 311 | } 312 | } 313 | } 314 | } 315 | ``` 316 | 317 | ### PubMed 318 | This example gets the abstract text of the paper with the specified PubMed ID. 319 | 320 | ```python 321 | from rcsbapi.data import DataQuery as Query 322 | 323 | query = Query( 324 | input_type="pubmed", 325 | return_data_list=["rcsb_pubmed_abstract_text"], 326 | input_ids=["6726807"] 327 | ) 328 | 329 | result_dict = query.exec() 330 | print(result_dict) 331 | ``` 332 | 333 | Performs the following GraphQL query: 334 | ``` 335 | { 336 | pubmed(pubmed_id: 6726807) { 337 | rcsb_pubmed_abstract_text 338 | } 339 | } 340 | ``` 341 | 342 | ### UniProt 343 | This example gets a description of the function of a protein based on the UniProt ID. 344 | 345 | ```python 346 | from rcsbapi.data import DataQuery as Query 347 | 348 | query = Query( 349 | input_type="uniprot", 350 | return_data_list=["function.details"], 351 | input_ids=["P68871"] 352 | ) 353 | 354 | result_dict = query.exec() 355 | print(result_dict) 356 | ``` 357 | 358 | Performs the following GraphQL query: 359 | ``` 360 | { 361 | uniprot(uniprot_id: "P68871") { 362 | rcsb_uniprot_protein { 363 | function { 364 | details 365 | } 366 | } 367 | } 368 | } 369 | ``` 370 | -------------------------------------------------------------------------------- /docs/data_api/api.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ***************** 3 | 4 | .. 
automodule:: rcsbapi.data 5 | :members: 6 | :special-members: __init__ 7 | -------------------------------------------------------------------------------- /docs/data_api/implementation_details.md: -------------------------------------------------------------------------------- 1 | # Implementation Details 2 | ### Parsing Schema 3 | Upon initialization of the package, the GraphQL schema is fetched from the GraphQL Data API endpoint. After fetching the schema, the Python package parses the schema and creates a graph object to represent it within the package. This graph representation of how fields and types connect is key to how queries are automatically constructed using a path finding algorithm. The graph is constructed as a directed graph in [rustworkx](https://www.rustworkx.org/), so `rustworkx` must be able to be installed on your machine to use this. If you experience installation or usage issues, please create an issue on [GitHub](https://github.com/rcsb/py-rcsb-api/issues) and we will consider implementing alternative support. 4 | 5 | ### Constructing queries 6 | Queries are constructed by finding every [simple path](https://en.wikipedia.org/wiki/Simple_path#:~:text=Simple%20path%20(graph%20theory)%2C,does%20not%20have%20repeating%20vertices) from the `input_type` to each final requested field in `return_data_list`. The simple paths are searched for path(s) matching the given path in `return_data_list`. The given path must be sufficiently specific to allow for only one possible path. If there are multiple possible paths, a [ValueError](query_construction.md#valueerror-not-a-unique-field) is raised. 7 | 8 | ### Error Handling 9 | In GraphQL, all requests return HTTP status code 200 and instead, errors appear in the returned JSON. The package will parse these errors, throwing a `ValueError` and displaying the corresponding error message or messages. 
To access the full query and return JSON in an interactive editor, you can use the `get_editor_link()` method on the DataQuery object. (see [Helpful Methods](query_construction.md#get_editor_link)) -------------------------------------------------------------------------------- /docs/data_api/query_construction.md: -------------------------------------------------------------------------------- 1 | # Query Construction 2 | 3 | ## Query Objects 4 | Constructing a query object requires three inputs. The JSON response to a query is stored in the `response` attribute of a Query object and can be accessed using the `get_response()` method. 5 | ```python 6 | from rcsbapi.data import DataQuery as Query 7 | 8 | # Constructing the Query object 9 | query = Query( 10 | input_type="entries", 11 | input_ids=["4HHB"], 12 | return_data_list=["exptl.method"] 13 | ) 14 | 15 | # Executing the query 16 | query.exec() 17 | 18 | # Accessing the response 19 | # Can also print using print(query.exec()) 20 | print(query.get_response()) 21 | ``` 22 | 23 | ### input_type 24 | Specifies the data hierarchy level from which you are starting your query (e.g., `"entry"`, `"polymer_entity"`, etc.). 25 | 26 | Also called "root fields", these represent designated points from which you can begin querying. This includes `"entries"`, `"polymer_entities"`, `"polymer_entity_instances"`, etc. Singular input_types are automatically converted to their plural form when possible to allow for more flexibility in `input_ids`. For the full list of `input_type`s see below: 27 | 28 |
29 | Full list of input_types 30 | 31 | - `"entry"` 32 | - `"entries"` 33 | - `"polymer_entity"` 34 | - `"polymer_entities"` 35 | - `"branched_entity"` 36 | - `"branched_entities"` 37 | - `"nonpolymer_entity"` 38 | - `"nonpolymer_entities"` 39 | - `"polymer_entity_instance"` 40 | - `"polymer_entity_instances"` 41 | - `"nonpolymer_entity_instance"` 42 | - `"nonpolymer_entity_instances"` 43 | - `"branched_entity_instance"` 44 | - `"branched_entity_instances"` 45 | - `"assembly"` 46 | - `"assemblies"` 47 | - `"interface"` 48 | - `"interfaces"` 49 | - `"uniprot"` 50 | - `"pubmed"` 51 | - `"chem_comp"` 52 | - `"chem_comps"` 53 | - `"entry_group"` 54 | - `"entry_groups"` 55 | - `"polymer_entity_group"` 56 | - `"polymer_entity_groups"` 57 | - `"group_provenance"` 58 | 59 |
 60 | 61 | ### input_ids 62 | Specifies which entries, entities, etc. you would like to request data for. 63 | 64 | This can be a dictionary or a list. Dictionaries must be passed with specific keys corresponding to the input_type. You can find the key names by using the `get_input_id_dict(input_type)` method (see [Helpful Methods](query_construction.md#get-input-id-dict)) or by looking in the [GraphiQL editor](https://data.rcsb.org/graphql/index.html) Docs menu. Lists must be passed in PDB identifier format. 65 | 66 |
67 | 68 | |Type|PDB ID Format|Example| 69 | |---|---|---| 70 | |entries|entry id|4HHB| 71 | |polymer, branched, or non-polymer entities|[entry_id]_[entity_id]|4HHB_1| 72 | |polymer, branched, or non-polymer entity instances|[entry_id].[asym_id]|4HHB.A| 73 | |biological assemblies|[entry_id]-[assembly_id]|4HHB-1| 74 | |interface|[entry_id]-[assembly_id].[interface_id]|4HHB-1.1| 75 | 76 |
77 | 78 | Dictionaries and Lists will be treated equivalently for the `input_ids` argument. For example, these `input_ids` arguments are equivalent. 79 | 80 | ```python 81 | # input_type is polymer_entity_instance 82 | input_ids=["4HHB.A"] 83 | input_ids={"entry_id": "4HHB", "asym_id": "A"} 84 | ``` 85 | ```python 86 | # input_type is polymer_entity_instances (plural) 87 | input_ids=["4HHB.A", "4HHB.B"] 88 | input_ids={"instance_ids": ["4HHB.A", "4HHB.B"]} 89 | ``` 90 | 91 | While it is generally more efficient and easier to interpret results if you use a refined list of IDs, if you would like to request a set of data for all IDs within an `input_type`, you can use the `ALL_STRUCTURES` variable. This will set `input_ids` to all IDs for the given `input_type` if supported. 92 | 93 | ```python 94 | from rcsbapi.data import DataQuery as Query 95 | from rcsbapi.data import ALL_STRUCTURES 96 | 97 | # Using `ALL_STRUCTURES` with `input_type` "entries" 98 | # will use all experimentally-determined entry IDs 99 | query = Query( 100 | input_type="entries", 101 | input_ids=ALL_STRUCTURES, 102 | return_data_list=["exptl.method"] 103 | ) 104 | 105 | # Executing the query with a progress bar 106 | query.exec(progress_bar=True) 107 | 108 | print(query.get_response()) 109 | ``` 110 | 111 | ### return_data_list 112 | These are the data that you are requesting (or "fields"). 113 | 114 | In GraphQL syntax, the final requested data must be a "scalar" type (string, integer, boolean). However, if you request non-scalar data, the package will auto-populate the query to include all fields under the specified data until scalars are reached. Once you receive the query response and understand what specific data you would like to request, you can refine your query by requesting more specific fields. 115 | 116 | The "rcsb_id" field will automatically be added to all queries allowing for easier parsing of the returned JSON. 
You can turn this off by setting the optional `add_rcsb_id` argument to False. 117 | 118 | ```python 119 | from rcsbapi.data import DataQuery as Query 120 | 121 | query = Query( 122 | input_type="entries", 123 | input_ids=["4HHB"], 124 | return_data_list=["exptl"] 125 | ) 126 | result_dict = query.exec() 127 | print(result_dict) 128 | ``` 129 | ```json 130 | { 131 | "data": { 132 | "entries": [ 133 | { 134 | "rcsb_id": "4HHB", 135 | "exptl": [ 136 | { 137 | "method_details": null, 138 | "method": "X-RAY DIFFRACTION", 139 | "crystals_number": null, 140 | "details": null 141 | } 142 | ] 143 | } 144 | ] 145 | } 146 | } 147 | ``` 148 | This query can be made more concise by specifying a field, like "method". In this case, the field name "method" is redundant because it appears under other types and must be further specified using dot notation. For more details see [ValueError: Not a unique field](query_construction.md#valueerror-not-a-unique-field) 149 | ```python 150 | from rcsbapi.data import DataQuery as Query 151 | 152 | query = Query( 153 | input_type="entries", 154 | input_ids=["4HHB"], 155 | return_data_list=["exptl.method"] 156 | ) 157 | result_dict = query.exec() 158 | print(result_dict) 159 | ``` 160 | ```json 161 | { 162 | "data": { 163 | "entries": [ 164 | { 165 | "rcsb_id": "4HHB", 166 | "exptl": [ 167 | { 168 | "method": "X-RAY DIFFRACTION" 169 | } 170 | ] 171 | } 172 | ] 173 | } 174 | } 175 | ``` 176 | 177 | ### Executing Large Queries 178 | When executing large queries, the package will batch the `input_ids` before requesting and merge the responses into one JSON object. The default batch size is 5,000, but this value can be adjusted in the `exec` method. To see a progress bar that tracks which batches have been completed, you can set `progress_bar` to `True`. 
179 | 180 | ```python 181 | from rcsbapi.data import DataQuery as Query 182 | from rcsbapi.data import ALL_STRUCTURES 183 | 184 | query = Query( 185 | input_type="entries", 186 | input_ids=ALL_STRUCTURES, 187 | return_data_list=["exptl.method"] 188 | ) 189 | 190 | # Executing query with larger batch size 191 | # and progress bar 192 | query.exec( 193 | batch_size=7000, 194 | progress_bar=True 195 | ) 196 | 197 | print(query.get_response()) 198 | ``` 199 | 200 | ## Helpful Methods 201 | There are several methods included to make working with query objects easier. These methods can help you refine your queries to request exactly and only what you want, as well as further understand the GraphQL syntax. 202 | 203 | ### get_editor_link() 204 | This method returns the link to a [GraphiQL](https://data.rcsb.org/graphql/index.html) window with the query. From the window, you can use the user interface to explore other fields and refine your query. Method of the `DataQuery` class. 205 | 206 | ```python 207 | from rcsbapi.data import DataQuery as Query 208 | 209 | query = Query( 210 | input_type="entries", 211 | input_ids=["4HHB"], 212 | return_data_list=["exptl"] 213 | ) 214 | editor_link = query.get_editor_link() 215 | print(editor_link) 216 | ``` 217 | 218 | ### find_paths() 219 | Given a redundant field, this method finds all paths from an `input_type` to nodes named as `return_data_name`. Method of the `DataSchema` class. 220 | 221 | ```python 222 | from rcsbapi.data import DataSchema 223 | 224 | schema = DataSchema() 225 | schema.find_paths(input_type="entries", return_data_name="id") 226 | ``` 227 | 228 | To return a dictionary with descriptions for each path, set `descriptions` to true. 229 | ```python 230 | schema.find_paths(input_type="entries", return_data_name="id", descriptions=True) 231 | ``` 232 | 233 | ### find_field_names() 234 | Given a string, this method will return all fields containing that string. 
 235 | 236 | ```python 237 | from rcsbapi.data import DataSchema 238 | 239 | schema = DataSchema() 240 | schema.find_field_names("exptl") 241 | ``` 242 | 243 | ### get_input_id_dict() 244 | Given an `input_type`, returns a dictionary with the corresponding input keys and descriptions of each key. Method of the `DataSchema` class. 245 | 246 | ```python 247 | from rcsbapi.data import DataSchema 248 | 249 | schema = DataSchema() 250 | schema.get_input_id_dict("polymer_entity_instance") 251 | ``` 252 | 253 | ## Troubleshooting 254 | ### ValueError: Not a unique field 255 | Some fields are redundant within our GraphQL Data API schema. For example, "id" appears over 50 times. To allow for specific querying, redundant fields must be specified using dot notation (e.g., `entry.id`). If you request a redundant field without this syntax, a `ValueError` will be returned stating that the field exists, but is not unique. You can then use `find_paths(input_type, return_data_name)` to find a path that would specify the desired field. 256 | 257 | ```python 258 | from rcsbapi.data import DataQuery as Query 259 | 260 | # querying a redundant field 261 | query = Query( 262 | input_type="entries", 263 | input_ids=["4HHB"], 264 | return_data_list=["id"] 265 | ) 266 | result_dict = query.exec() 267 | print(result_dict) 268 | ``` 269 | ``` 270 | ValueError: "id" exists, but is not a unique field, must specify further. 271 | 10 of 118 possible paths: 272 | entries.assemblies.branched_entity_instances.branched_entity.chem_comp_monomers.chem_comp.id 273 | entries.assemblies.branched_entity_instances.branched_entity.chem_comp_monomers.rcsb_bird_citation.id 274 | ...
275 | 276 | For all paths run: 277 | from rcsbapi.data import DataSchema 278 | schema = DataSchema() 279 | schema.find_paths("entries", "id") 280 | ``` 281 | ```python 282 | from rcsbapi.data import DataSchema 283 | 284 | # run find_paths(input_type, return_data_name) 285 | schema = DataSchema() 286 | print(schema.find_paths(input_type="entries", return_data_name="id")) 287 | ``` 288 | 289 | ```python 290 | # select desired field from the returned list 291 | ['citation.id', 292 | 'diffrn.id' 293 | 'entry.id' 294 | ... 295 | 'polymer_entities.prd.chem_comp.id', 296 | 'polymer_entities.prd.rcsb_bird_citation.id', 297 | 'polymer_entities.prd.rcsb_chem_comp_annotation.annotation_lineage.id'] 298 | ``` 299 | ```python 300 | from rcsbapi.data import DataQuery as Query 301 | 302 | # valid query 303 | query = Query( 304 | input_type="entries", 305 | input_ids=["4HHB"], 306 | return_data_list=["entry.id"] 307 | ) 308 | result_dict = query.exec() 309 | print(result_dict) 310 | ``` -------------------------------------------------------------------------------- /docs/data_api/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quickstart 2 | 3 | ## Installation 4 | Get it from PyPI: 5 | 6 | pip install rcsb-api 7 | 8 | Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api) 9 | 10 | ## Import 11 | To import this package, use: 12 | ```python 13 | from rcsbapi.data import DataSchema, DataQuery 14 | ``` 15 | 16 | ## Getting Started 17 | The [RCSB PDB Data API](https://data.rcsb.org) supports requests using [GraphQL](https://graphql.org/), a language for API queries. This package simplifies generating queries in GraphQL syntax. 18 | 19 | To generate a query in this package, you would create a `DataQuery` object. The query must be executed using the `.exec()` method, which will return the JSON response as well as store the response as an attribute of the `DataQuery` object. 
From the object, you can access the Data API response, get an interactive editor link, or access the arguments used to create the query. 20 | 21 | The package is able to automatically build queries based on the "input_type" and path segment passed into "return_data_list". If using this package in code intended for long-term use, it's recommended to use the fully qualified path (a complete path from input type to the final data field). When autocompletion is being used, a WARNING message will be printed out as a reminder. 22 | 23 | To suppress the warning, either use the fully qualified path (as in the below example) or set the `suppress_autocomplete_warning` argument to True. To suppress the warning for all queries, change the SUPPRESS_AUTOCOMPLETE_WARNING flag in config.py. 24 | 25 | 26 | ```python 27 | from rcsbapi.data import DataQuery as Query 28 | 29 | query = Query( 30 | input_type="entries", 31 | input_ids=["4HHB"], 32 | return_data_list=["exptl.method"] 33 | ) 34 | 35 | result_dict = query.exec() 36 | print(result_dict) 37 | # print(query.get_response()) would be equivalent 38 | ``` 39 | Data is returned in JSON format 40 | ```json 41 | { 42 | "data": { 43 | "entries": [ 44 | { 45 | "rcsb_id": "4HHB", 46 | "exptl": [ 47 | { 48 | "method": "X-RAY DIFFRACTION" 49 | } 50 | ] 51 | } 52 | ] 53 | } 54 | } 55 | ``` 56 | 57 | ### GraphQL 58 | This is the equivalent query in GraphQL syntax. 59 | ``` 60 | { 61 | entries(entry_ids: ["4HHB"]) { # returns type "CoreEntry" 62 | exptl { # returns type "Exptl" 63 | method # returns a scalar (string) 64 | } 65 | } 66 | } 67 | ``` 68 | GraphQL is built on "types" and their associated "fields". All types and their fields are defined in a "schema". An example of a type in our schema is "CoreEntry" and a field under CoreEntry is "exptl" (experimental). Upon initialization, the Data API package fetches the schema from the RCSB PDB website (See [Implementation Details](implementation_details.md) for more). 
69 | 70 | In GraphQL, you must begin your query at specific fields. These are fields like `entries`, `polymer_entities`, and `polymer_entity_instances` (see full list [here](query_construction.md#input-type)). Each field can return a scalar (e.g. string, integer) or a type. Every query must ultimately request scalar value(s), which can be seen in the example query below. As shown in the example, fields are explicitly included in queries while types are implicit. Types are named in CamelCase (CoreEntry) while fields are in snake case (exptl or audit_author). 71 | 72 | ### Autocompletion of Queries 73 | One way this package simplifies making requests is by adding fields that return scalars into the generated query if you request a field that returns a type. 74 | ```python 75 | from rcsbapi.data import DataQuery as Query 76 | 77 | query = Query( 78 | input_type="entries", 79 | input_ids=["4HHB"], 80 | # Requesting "exptl" will return a query requesting exptl.method, exptl.details, etc 81 | return_data_list=["exptl"] 82 | ) 83 | result_dict = query.exec() 84 | print(result_dict) 85 | ``` 86 | This creates a valid query even though "exptl" doesn't return a scalar. However, the resulting query will be more verbose, requesting all scalar fields under "exptl" (see [return_data_list](query_construction.md#return-data-list)). 87 | 88 | ## Jupyter Notebooks 89 | A notebook briefly summarizing the [readthedocs](https://rcsbapi.readthedocs.io/en/latest/index.html) is available in [notebooks/data_quickstart.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/data_quickstart.ipynb) or online through Google Colab Open In Colab 90 | 91 | Another notebook using both Search and Data API packages for a COVID-19 related example is available in [notebooks/search_data_workflow.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/search_data_workflow.ipynb) or online through Google Colab Open In Colab. 
-------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | rcsb-api - Query RCSB PDB data from Python 2 | =============================================== 3 | 4 | The ``rcsb-api`` package provides a Python interface to 5 | `RCSB PDB API services `_. 6 | Use it to search and fetch macromolecular structure data from RCSB PDB `at RCSB.org `_. 7 | 8 | Availability 9 | ------------ 10 | 11 | Get it from PyPI: 12 | 13 | .. code-block:: bash 14 | 15 | pip install rcsb-api 16 | 17 | Or, download from `GitHub `_ 18 | 19 | 20 | Contents 21 | -------- 22 | 23 | .. toctree:: 24 | :caption: Search API 25 | :maxdepth: 2 26 | 27 | search_api/quickstart.md 28 | search_api/query_construction.md 29 | search_api/attributes.md 30 | search_api/additional_examples.md 31 | search_api/api.rst 32 | 33 | .. toctree:: 34 | :caption: Data API 35 | :maxdepth: 2 36 | 37 | data_api/quickstart.md 38 | data_api/query_construction.md 39 | data_api/implementation_details.md 40 | data_api/additional_examples.md 41 | data_api/api.rst 42 | 43 | 44 | License 45 | ------- 46 | 47 | Code is licensed under the MIT license. See the 48 | `LICENSE `_ for details. 49 | 50 | 51 | Citing 52 | ------ 53 | 54 | Please cite the ``rcsb-api`` package with the following reference: 55 | 56 | Dennis W. Piehl, Brinda Vallat, Ivana Truong, Habiba Morsy, Rusham Bhatt, Santiago Blaumann, Pratyoy Biswas, Yana Rose, Sebastian Bittrich, Jose M. Duarte, Joan Segura, Chunxiao Bi, Douglas Myers-Turnbull, Brian P. Hudson, Christine Zardecki, Stephen K. Burley. rcsb-api: Python Toolkit for Streamlining Access to RCSB Protein Data Bank APIs, Journal of Molecular Biology, 2025. DOI: https://doi.org/10.1016/j.jmb.2025.168970 57 | 58 | You should also cite the RCSB.org API services this package utilizes: 59 | 60 | Yana Rose, Jose M. 
Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley, John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards Integrated Searching and Efficient Access to Macromolecular Structure Data from the PDB Archive, Journal of Molecular Biology, 2020. DOI: https://doi.org/10.1016/j.jmb.2020.11.003 61 | 62 | 63 | Support 64 | ------ 65 | 66 | If you experience any issues installing or using the package, please submit an issue on 67 | `GitHub `_ and we will try to respond in a timely manner. 68 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Pin dependencies for the docs 2 | # Should be kept up-to-date with setup.py 3 | sphinx==5.3.0 4 | sphinx-rtd-theme==0.5.2 5 | typing-extensions==3.7.4.3 6 | myst-parser==1.0.0 7 | jinja2<3.1 8 | requests >= 2.0.0 9 | rustworkx 10 | graphql-core 11 | tqdm 12 | -------------------------------------------------------------------------------- /docs/search_api/api.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ***************** 3 | 4 | .. automodule:: rcsbapi.search 5 | :members: 6 | :special-members: __init__ 7 | 8 | .. autoclass:: rcsbapi.search.search_schema.SearchSchemaGroup 9 | :members: search, get_attribute_details, get_attribute_type 10 | 11 | .. autoclass:: rcsbapi.search.search_query.Session 12 | :members: get_editor_link, get_query_builder_link 13 | -------------------------------------------------------------------------------- /docs/search_api/attributes.md: -------------------------------------------------------------------------------- 1 | # Exploring Schema Attributes 2 | 3 | Attributes are pieces of information associated with a PDB structure that can be searched for or compared to a value using an [`AttributeQuery`](quickstart.md#getting-started). There are [structure attributes](https://search.rcsb.org/structure-search-attributes.html) and [chemical attributes](https://search.rcsb.org/chemical-search-attributes.html), which are both stored in `search_attributes`. 
This can be imported as shown below: 4 | 5 | ```python 6 | # import search_attributes as attrs for a shorter name 7 | from rcsbapi.search import search_attributes as attrs 8 | ``` 9 | 10 | There are several helpful methods to search for attribute names or explore other information related to attributes. 11 | 12 | ### search() 13 | Given a string, this method will return an iterable of `Attr` objects with names that contain the given string. You can also use [regular expression (regex)](https://en.wikipedia.org/wiki/Regular_expression) strings. 14 | 15 | ```python 16 | matching_attrs = attrs.search("author") 17 | 18 | for attr in matching_attrs: 19 | print(attr) 20 | ``` 21 | 22 | ### get_attribute_details() 23 | Given a full or partial attribute name, return a set of an `Attr` or associated `Attr`s with attribute names, search service types, and descriptions. 24 | 25 | ```python 26 | from rcsbapi.search import search_attributes as attrs 27 | 28 | # Use a full name to get details for a specific attribute 29 | print(attrs.get_attribute_details("rcsb_entity_source_organism.scientific_name")) 30 | 31 | # Use a partial name to get the details of all attributes associated with that name 32 | # Below code gets details for ".common_name", ".source_type", etc in addition to ".scientific_name" 33 | print(attrs.get_attribute_details("rcsb_entity_source_organism")) 34 | ``` 35 | 36 | ### get_attribute_type() 37 | Given a full attribute name, return the search service type (`text` for structure attributes and `text_chem` for chemical attributes). 
38 | 39 | ```python 40 | from rcsbapi.search import search_attributes as attrs 41 | 42 | print(attrs.get_attribute_type("rcsb_entity_source_organism.scientific_name")) 43 | ``` -------------------------------------------------------------------------------- /docs/search_api/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quickstart 2 | 3 | ## Installation 4 | 5 | Get it from PyPI: 6 | 7 | pip install rcsb-api 8 | 9 | Or, download from [GitHub](https://github.com/rcsb/py-rcsb-api) 10 | 11 | ## Getting Started 12 | ### Basic Query Construction 13 | 14 | #### Full-text search 15 | To perform a "full-text" search for structures associated with the term "Hemoglobin", you can create a `TextQuery`: 16 | 17 | ```python 18 | from rcsbapi.search import TextQuery 19 | 20 | # Search for structures associated with the phrase "Hemoglobin" 21 | query = TextQuery(value="Hemoglobin") 22 | 23 | # Execute the query by running it as a function 24 | results = query() 25 | 26 | # Results are returned as an iterator of result identifiers. 27 | for rid in results: 28 | print(rid) 29 | ``` 30 | 31 | #### Attribute search 32 | To perform a search for specific structure or chemical attributes, you can create an `AttributeQuery`. 33 | 34 | ```python 35 | from rcsbapi.search import AttributeQuery 36 | 37 | # Construct a query searching for structures from humans 38 | query = AttributeQuery( 39 | attribute="rcsb_entity_source_organism.scientific_name", 40 | operator="exact_match", # Other operators include "contains_phrase", "exists", and more 41 | value="Homo sapiens" 42 | ) 43 | 44 | # Execute query and construct a list from results 45 | results = list(query()) 46 | print(results) 47 | ``` 48 | 49 | Refer to the [Search Attributes](https://search.rcsb.org/structure-search-attributes.html) and [Chemical Attributes](https://search.rcsb.org/chemical-search-attributes.html) documentation for a full list of attributes and applicable operators. 
50 | 51 | Alternatively, you can construct attribute queries with comparative operators using the `search_attributes` object (which also allows for names to be tab-completed): 52 | 53 | ```python 54 | from rcsbapi.search import search_attributes as attrs 55 | 56 | # Search for structures from humans 57 | query = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens" 58 | 59 | # Run query and construct a list from results 60 | results = list(query()) 61 | print(results) 62 | ``` 63 | 64 | #### Grouping sub-queries 65 | 66 | You can combine multiple queries using Python bitwise operators. 67 | 68 | ```python 69 | from rcsbapi.search import search_attributes as attrs 70 | 71 | # Query for human epidermal growth factor receptor (EGFR) structures (UniProt ID P00533) 72 | # with investigational or experimental drugs bound 73 | q1 = attrs.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession == "P00533" 74 | q2 = attrs.rcsb_entity_source_organism.scientific_name == "Homo sapiens" 75 | q3 = attrs.drugbank_info.drug_groups == "investigational" 76 | q4 = attrs.drugbank_info.drug_groups == "experimental" 77 | 78 | # Structures matching UniProt ID P00533 AND from humans 79 | # AND (investigational OR experimental drug group) 80 | query = q1 & q2 & (q3 | q4) 81 | 82 | # Execute query and print first 10 ids 83 | results = list(query()) 84 | print(results[:10]) 85 | ``` 86 | 87 | These examples are in "operator" syntax. You can also make queries in "fluent" syntax. Learn more about both syntaxes and implementation details in [Query Syntax and Execution](query_construction.md#query-syntax-and-execution). 88 | 89 | ### Supported Search Services 90 | The list of supported search service types are listed in the table below. For more details on their usage, see [Search Service Types](query_construction.md#search-service-types). 
91 | 92 | |Search service |QueryType | 93 | |----------------------------------|--------------------------| 94 | |Full-text |`TextQuery()` | 95 | |Attribute (structure or chemical) |`AttributeQuery()` | 96 | |Sequence similarity |`SeqSimilarityQuery()` | 97 | |Sequence motif |`SeqMotifQuery()` | 98 | |Structure similarity |`StructSimilarityQuery()` | 99 | |Structure motif |`StructMotifQuery()` | 100 | |Chemical similarity |`ChemSimilarityQuery()` | 101 | 102 | Learn more about available search services on the [RCSB PDB Search API docs](https://search.rcsb.org/#search-services). 103 | 104 | ## Jupyter Notebooks 105 | A runnable jupyter notebook is available in [notebooks/search_quickstart.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/search_quickstart.ipynb), or can be run online using Google Colab: 106 | Open In Colab 107 | 108 | An additional Covid-19 related example is in [notebooks/covid.ipynb](https://github.com/rcsb/py-rcsb-api/blob/master/notebooks/covid.ipynb): 109 | Open In Colab 110 | -------------------------------------------------------------------------------- /notebooks/covid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "46b8b87a", 6 | "metadata": {}, 7 | "source": [ 8 | "\"Open" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "deb1fbf6", 14 | "metadata": {}, 15 | "source": [ 16 | "# RCSB PDB Search API: Covid-19 Use-Case\n", 17 | "\n", 18 | "\n", 19 | "Start by installing the package:\n", 20 | "\n", 21 | " pip install rcsb-api\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "0e3979a2", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "%pip install rcsb-api" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "married-burden", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from rcsbapi.search import 
search_attributes as attrs, TextQuery" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "266c28ab", 47 | "metadata": {}, 48 | "source": [ 49 | "## Demo\n", 50 | "\n", 51 | "We are interested in how the antiviral drug boceprevir interacts with the Covid-19 virus, so we'll construct a query with the following specifications:\n", 52 | "- Source Organism is \"COVID-19 virus\"\n", 53 | "- Associated with the word \"protease\"\n", 54 | "- Bound to ligand \"Boceprevir\"\n", 55 | "\n", 56 | "[RCSB Query](http://www.rcsb.org/search?request=%7B%22query%22%3A%7B%22type%22%3A%22group%22%2C%22logical_operator%22%3A%22and%22%2C%22nodes%22%3A%5B%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22rcsb_entity_source_organism.taxonomy_lineage.name%22%2C%22operator%22%3A%22exact_match%22%2C%22value%22%3A%22COVID-19%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A0%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22value%22%3A%22protease%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A1%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22chem_comp.name%22%2C%22operator%22%3A%22contains_words%22%2C%22value%22%3A%22Boceprevir%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A2%7D%5D%7D%2C%22return_type%22%3A%22entry%22%2C%22request_info%22%3A%7B%22query_id%22%3A%2270e677a6376b4c5eba8b4f2b73866c92%22%2C%22src%22%3A%22ui%22%7D%7D)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "collectible-thread", 62 | "metadata": {}, 63 | "source": [ 64 | "## Operator syntax\n", 65 | "- Uses python comparison operators to compare attributes to a value (`==`, `<`, `<=`, etc)\n", 66 | "- Combine using set operators (`&`, `|`, `~`, etc)\n", 67 | "- Execute queries as functions" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "confidential-behavior", 74 | "metadata": {}, 75 | "outputs": 
[], 76 | "source": [ 77 | "q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == \"COVID-19 virus\"\n", 78 | "q2 = TextQuery(\"protease\")\n", 79 | "q3 = attrs.chem_comp.name.contains_words(\"Boceprevir\")\n", 80 | "query = q1 & q2 & q3\n", 81 | "\n", 82 | "list(query())" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "uniform-allen", 88 | "metadata": {}, 89 | "source": [ 90 | "## Fluent syntax\n", 91 | "\n", 92 | "A second syntax is available with a [fluent interface](https://en.wikipedia.org/wiki/Fluent_interface), similar to popular data science packages like tidyverse and Apache Spark. Function calls are chained together.\n", 93 | "\n", 94 | "Here's an example around a second antiviral, remdesivir. The drug interferes with RNA polymerase, replacing an adenine and causing early chain termination. When integrated into RNA, the nucleotide formed from remdesivir has residue code F86." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "irish-navigator", 101 | "metadata": { 102 | "scrolled": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "query = attrs.struct.title.contains_phrase(\"RNA polymerase\")\\\n", 107 | " .or_(attrs.struct.title).contains_words(\"RdRp\")\\\n", 108 | " .and_(attrs.rcsb_entity_source_organism.taxonomy_lineage.name).exact_match(\"COVID-19 virus\")\\\n", 109 | " .and_(attrs.rcsb_chem_comp_container_identifiers.comp_id).exact_match(\"F86\")\\\n", 110 | " \n", 111 | "list(query.exec())\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "distant-graduate", 117 | "metadata": {}, 118 | "source": [ 119 | "## Try it!\n", 120 | "\n", 121 | "[rcsbapi.readthedocs.io](https://rcsbapi.readthedocs.io/en/latest/)" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | 
"version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.12.6" 142 | }, 143 | "toc": { 144 | "base_numbering": 1, 145 | "nav_menu": {}, 146 | "number_sections": true, 147 | "sideBar": true, 148 | "skip_h1_title": false, 149 | "title_cell": "Table of Contents", 150 | "title_sidebar": "Contents", 151 | "toc_cell": false, 152 | "toc_position": {}, 153 | "toc_section_display": true, 154 | "toc_window_display": false 155 | }, 156 | "varInspector": { 157 | "cols": { 158 | "lenName": 16, 159 | "lenType": 16, 160 | "lenVar": 40 161 | }, 162 | "kernels_config": { 163 | "python": { 164 | "delete_cmd_postfix": "", 165 | "delete_cmd_prefix": "del ", 166 | "library": "var_list.py", 167 | "varRefreshCmd": "print(var_dic_list())" 168 | }, 169 | "r": { 170 | "delete_cmd_postfix": ") ", 171 | "delete_cmd_prefix": "rm(", 172 | "library": "var_list.r", 173 | "varRefreshCmd": "cat(var_dic_list()) " 174 | } 175 | }, 176 | "types_to_exclude": [ 177 | "module", 178 | "function", 179 | "builtin_function_or_method", 180 | "instance", 181 | "_Feature" 182 | ], 183 | "window_display": false 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 5 188 | } 189 | -------------------------------------------------------------------------------- /notebooks/data_quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# RCSB PDB Data API: Quickstart\n", 15 | "\n", 16 | "This Quickstart notebook will walk through the basics of creating and executing queries using the `rcsbapi.data` module of the `rcsb-api` package. 
For more in-depth documentation, reference the [readthedocs page](https://rcsbapi.readthedocs.io/en/latest/data_api/quickstart.html).\n", 17 | "\n", 18 | "\\\n", 19 | "Before beginning, you must install the package:\n", 20 | "\n", 21 | "```pip install rcsb-api```" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "scrolled": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "%pip install rcsb-api" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 25, 38 | "metadata": { 39 | "scrolled": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "from rcsbapi.data import DataQuery as Query\n", 44 | "import json # for easy-to-read output" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Creating and executing queries" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "To create a `Query` object, you need to provide three arguments:\n", 59 | "- `input_type`: input_types are points where you can begin your query. Some examples are `entries`, `polymer_entities`, and `polymer_entity_instances`. For a full list of input_types see the [readthedocs](https://rcsbapi.readthedocs.io/en/latest/data_api/query_construction.html#input-type).\n", 60 | "- `input_ids`: input_ids are accepted as a list or dictionary of PDB-formatted IDs.\n", 61 | "- `return_data_list`: list of data items to return. These must be unique path segments (using dots to separate each name). 
Further explained [below](#Providing-specific-and-unique-field-names/paths).\n", 62 | "\n", 63 | "(More details on input arguments can be found in [readthedocs: Query Construction](https://rcsbapi.readthedocs.io/en/latest/data_api/query_construction.html).)\n", 64 | "\n", 65 | "For example, to create a `Query` object requesting all non-polymer components of a structure (ions, cofactors, etc):" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "query = Query(\n", 75 | "    input_type=\"entries\",\n", 76 | "    input_ids=[\"4HHB\"],\n", 77 | "    return_data_list=[\"nonpolymer_bound_components\"]  # must be unique field or unique path segment\n", 78 | ")\n", 79 | "\n", 80 | "# Note: When the package autocompletes a path, it prints a WARNING message\n", 81 | "# To suppress this warning, either use the fully qualified path (\"rcsb_entry_info.nonpolymer_bound_components\"),\n", 82 | "# or set the `suppress_autocomplete_warning` argument to True.\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "After creating a `Query` object, you can run it with `.exec()` or view the GraphQL query with `.get_editor_link()`:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Execute the query and print the results\n", 99 | "return_data = query.exec()\n", 100 | "print(json.dumps(return_data, indent=2))  # prints return_data with easy-to-read formatting\n", 101 | "\n", 102 | "## Expected Output:\n", 103 | "# {\n", 104 | "#   \"data\": {\n", 105 | "#     \"entries\": [\n", 106 | "#       {\n", 107 | "#         \"rcsb_id\": \"4HHB\",\n", 108 | "#         \"rcsb_entry_info\": {\n", 109 | "#           \"nonpolymer_bound_components\": [\n", 110 | "#             \"HEM\"\n", 111 | "#           ]\n", 112 | "#         }\n", 113 | "#       }\n", 114 | "#     ]\n", 115 | "#   }\n", 116 | "# }" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 |
"execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# Print the GraphQL editor URL\n", 126 | "query.get_editor_link()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### Querying multiple IDs\n", 134 | "You can search multiple entries by starting from `input_type` \"entries\" and passing in a list of `input_ids`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "query = Query(\n", 144 | " input_type=\"entries\",\n", 145 | " input_ids=[\"4HHB\", \"12CA\", \"3PQR\"],\n", 146 | " return_data_list=[\"nonpolymer_bound_components\"]\n", 147 | ")\n", 148 | "return_data = query.exec()\n", 149 | "print(json.dumps(return_data, indent=2))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Querying multiple data items\n", 157 | "You can also request multiple data items by adding to the `return_data_list`." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Query multiple fields in return_data_list\n", 167 | "query = Query(\n", 168 | " input_type=\"entries\",\n", 169 | " input_ids=[\"4HHB\", \"12CA\", \"3PQR\"],\n", 170 | " return_data_list=[\n", 171 | " \"nonpolymer_bound_components\",\n", 172 | " \"citation.title\",\n", 173 | " \"rcsb_entry_info.polymer_composition\"\n", 174 | " ]\n", 175 | ")\n", 176 | "return_data = query.exec()\n", 177 | "print(json.dumps(return_data, indent=2))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Autocompletion of nested fields\n", 185 | "If there are fields nested under a requested data item in `return_data_list`, the package will add all sub-fields to the query. 
This allows you to make more general requests to get all information under that field (e.g., `\"exptl\"`). If you would like a more precise query, you can request specific fields (e.g., `\"exptl.method\"`)." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# Requesting \"exptl\" gets all fields underneath that field\n", 195 | "query = Query(\n", 196 | " input_type=\"entries\",\n", 197 | " input_ids=[\"4HHB\"],\n", 198 | " return_data_list=[\"exptl\"] # requests exptl.crystals_number, exptl.method, etc.\n", 199 | ")\n", 200 | "return_data = query.exec()\n", 201 | "print(json.dumps(return_data, indent=2))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# To view the generated GraphQL query:\n", 211 | "query.get_editor_link()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Querying different `input_types`\n", 219 | "You can also start queries from various `input_types` (e.g., `polymer_entities`, `polymer_entity_instances`, `uniprot`). 
(For more examples, see [readthedocs: Additional Examples](https://rcsbapi.readthedocs.io/en/latest/data_api/additional_examples.html))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Search from input_type \"polymer_entities\"\n", 229 | "query = Query(\n", 230 | " input_type=\"polymer_entities\",\n", 231 | " input_ids=[\"2CPK_1\", \"3WHM_1\", \"2D5Z_1\"],\n", 232 | " return_data_list=[\n", 233 | " \"polymer_entities.rcsb_id\",\n", 234 | " \"rcsb_entity_source_organism.ncbi_taxonomy_id\",\n", 235 | " \"rcsb_entity_source_organism.ncbi_scientific_name\",\n", 236 | " \"cluster_id\",\n", 237 | " \"identity\"\n", 238 | " ]\n", 239 | ")\n", 240 | "return_data = query.exec()\n", 241 | "print(json.dumps(return_data, indent=2))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# Search from input_type \"polymer_entity_instances\"\n", 251 | "query = Query(\n", 252 | " input_type=\"polymer_entity_instances\",\n", 253 | " input_ids=[\"4HHB.A\", \"12CA.A\", \"3PQR.A\"],\n", 254 | " return_data_list=[\n", 255 | " \"polymer_entity_instances.rcsb_id\",\n", 256 | " \"rcsb_polymer_instance_annotation.annotation_id\",\n", 257 | " \"rcsb_polymer_instance_annotation.name\",\n", 258 | " \"rcsb_polymer_instance_annotation.type\"\n", 259 | " ]\n", 260 | ")\n", 261 | "return_data = query.exec()\n", 262 | "print(json.dumps(return_data, indent=2))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# Search from input_type \"uniprot\"\n", 272 | "query = Query(\n", 273 | " input_type=\"uniprot\",\n", 274 | " input_ids=[\"P68871\"],\n", 275 | " return_data_list=[\n", 276 | " \"rcsb_uniprot_annotation\"\n", 277 | " ]\n", 278 | ")\n", 279 | "return_data = query.exec()\n", 280 | 
"print(json.dumps(return_data, indent=2))" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "## Determining fields for `return_data_list`" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Providing specific and unique field names/paths\n", 295 | "There are some fields that must be further specified using multiple fields separated by dots. This is because some fields are redundant within our GraphQL Data API schema. For example, “id” appears over 50 times.\n", 296 | "\n", 297 | "For example, the field, `\"polymer_composition\"`, is redundant between several nodes: " 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "# The field \"polymer_composition\" isn't specific enough\n", 307 | "query = Query(\n", 308 | " input_type=\"entries\",\n", 309 | " input_ids=[\"4HHB\"],\n", 310 | " return_data_list=[\"polymer_composition\"]\n", 311 | ")\n", 312 | "\n", 313 | "# This will throw a ValueError, which will print out up to 10 valid paths that you can use instead" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "```\n", 321 | "ValueError: Given path \"polymer_composition\" not specific enough. 
Use one or more of these paths in return_data_list argument:\n", 322 | "\n", 323 | "3 of 3 possible paths:\n", 324 | " assemblies.interfaces.rcsb_interface_info.polymer_composition\n", 325 | " assemblies.rcsb_assembly_info.polymer_composition\n", 326 | " rcsb_entry_info.polymer_composition\n", 327 | "```" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "To get a list of all possible paths for a given field name, you can use the `DataSchema().find_paths()` method:\n", 335 | "```python\n", 336 | "from rcsbapi.data import DataSchema\n", 337 | "schema = DataSchema()\n", 338 | "schema.find_paths(input_type, field_name_or_path_segment)\n", 339 | "```\n", 340 | "For example:" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# Find all paths:\n", 350 | "from rcsbapi.data import DataSchema\n", 351 | "\n", 352 | "schema = DataSchema()\n", 353 | "schema.find_paths(input_type=\"entries\", return_data_name=\"polymer_composition\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "# By looking through the list, find the intended field path\n", 363 | "query = Query(\n", 364 | " input_type=\"entries\",\n", 365 | " input_ids=[\"4HHB\"],\n", 366 | " return_data_list=[\"rcsb_entry_info.polymer_composition\"]\n", 367 | ")\n", 368 | "return_data = query.exec()\n", 369 | "print(json.dumps(return_data, indent=2))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "### Discovering field names\n", 377 | "If you're unsure which fields exist, you can call `find_field_names(search_substring)`.\n", 378 | "\n", 379 | "For example, to find all fields containing `\"comp\"`:" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | 
"outputs": [], 387 | "source": [ 388 | "from rcsbapi.data import DataSchema\n", 389 | "\n", 390 | "schema = DataSchema()\n", 391 | "schema.find_field_names(\"comp\")" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "Note that once you identify which field you want to use, you may need to also run the `find_paths()` method mentioned above on the field name to identify the set of possible paths for `return_data_list`. " 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# Find all paths for the field `\"chem_comps\"`:\n", 408 | "schema.find_paths(input_type=\"entries\", return_data_name=\"chem_comp\")" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "For more in-depth documentation, go to [readthedocs](https://rcsbapi.readthedocs.io/en/latest/data_api/quickstart.html)." 416 | ] 417 | } 418 | ], 419 | "metadata": { 420 | "kernelspec": { 421 | "display_name": "Python 3", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.12.6" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 4 440 | } 441 | -------------------------------------------------------------------------------- /notebooks/multisearch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Enabling Computational Biology Research\n", 15 | "\n", 16 | "This tool can be an 
integral resource for computational biologists performing data analysis or iterative processes on big datasets from the RCSB PDB. Our tool supports data automation which is essential for any researcher or computational biologists wanting to work with huge datasets. Furthermore, our tool can be incorporated within a larger research workflow to quickly and seamlessly retrieve RCSB PDB data in an automated way." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Below is an example of how a computational biologist may use our tool for data automation to facilitate their research. The first query below finds protein structures with a similar protein sequence to the target protein. The retrieved data are then used as search parameters for a set of iterative search queries that find structurally similar proteins that are bound to small molecules. Then, the researcher can use their own workflow to further investigate how the protein structures and small molecules interact." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "%pip install rcsb-api" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from rcsbapi.search import SeqSimilarityQuery, AttributeQuery, StructSimilarityQuery" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Search for similar sequences to a protein of interest\n", 51 | "q1 = SeqSimilarityQuery(\"DTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNEL\" + \n", 52 | " \"TEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCE\" + \n", 53 | " \"KQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYL\" + \n", 54 | " \"YEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETM\" + \n", 55 | " \"REKVLTSSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEV\" + \n", 56 | " \"TKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKE\" + \n", 57 | " \"CCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAK\" + \n", 58 | " \"DAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHAC\" +\n", 59 | " \"YSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKV\" + \n", 60 | " \"PQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLC\" + \n", 61 | " \"VLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLF\" + \n", 62 | " \"TFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVA\" +\n", 63 | " \"FVDKCCAADDKEACFAVEGPKLVVSTQTALA\")\n", 64 | "\n", 65 | "sequence_similarity_results = list(q1(return_type=\"polymer_entity\"))\n", 66 | "print(\"Sequences similar to query:\")\n", 67 | "print(sequence_similarity_results)\n", 68 | "\n", 69 | "for i in range(5):\n", 70 | " similar_protein = sequence_similarity_results[i]\n", 71 | "\n", 72 | " entry_id = similar_protein[:-2]\n", 73 | "\n", 74 | " # Search for structures with small molecule(s)\n", 75 | " small_molecule_query = AttributeQuery(\n", 76 | " attribute=\"rcsb_nonpolymer_entity_annotation.comp_id\",\n", 77 | " operator=\"exists\",\n", 
78 | " value=None\n", 79 | " )\n", 80 | "\n", 81 | " # Search for structurally similar proteins\n", 82 | " struct_similarity_query = StructSimilarityQuery(\n", 83 | " structure_search_type=\"entry_id\",\n", 84 | " entry_id=entry_id,\n", 85 | " structure_input_type=\"assembly_id\",\n", 86 | " assembly_id=\"1\", # assemblyid = 1 by default\n", 87 | " operator=\"strict_shape_match\",\n", 88 | " target_search_space=\"assembly\"\n", 89 | " )\n", 90 | "\n", 91 | " group_query = struct_similarity_query & small_molecule_query\n", 92 | "\n", 93 | " print(\"Protein structures similar to\", similar_protein, \"bound to a small molecule:\")\n", 94 | " print(list(group_query(\"assembly\")))" 95 | ] 96 | } 97 | ], 98 | "metadata": { 99 | "kernelspec": { 100 | "display_name": "Python 3", 101 | "language": "python", 102 | "name": "python3" 103 | }, 104 | "language_info": { 105 | "codemirror_mode": { 106 | "name": "ipython", 107 | "version": 3 108 | }, 109 | "file_extension": ".py", 110 | "mimetype": "text/x-python", 111 | "name": "python", 112 | "nbconvert_exporter": "python", 113 | "pygments_lexer": "ipython3", 114 | "version": "3.12.6" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 2 119 | } 120 | -------------------------------------------------------------------------------- /notebooks/search_data_workflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## RCSB PDB Data API: Search and Data API Workflow Demo\n", 15 | "\n", 16 | "This quick-start notebook will walk through the basics of using the Search and Data API sub-packages together. 
For more in-depth documentation reference the [readthedocs](https://rcsbapi.readthedocs.io/en/latest/).\n", 17 | "\n", 18 | "\\\n", 19 | "install the package: \n", 20 | "\n", 21 | "```pip install rcsb-api```" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "%pip install rcsb-api" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In this demo, we are interested in finding potential drugs to treat COVID-19 and collecting the associated literature in order to conduct further research. To do this, we will:\n", 38 | " 1. Construct a query to fetch COVID-19 viruses with ligands bound (Search API module)\n", 39 | " 2. Find information about each ligand (PDB ID, associated publication titles, links to publications) (Data API module)\n", 40 | " 3. Parse our results and output in an easy-to-read format" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Python Search API: Find COVID-19 Structures with Ligand Bound\n", 48 | "\n", 49 | "We'll start by constructing a Search API query that specifies the following:\n", 50 | "- Source organism is \"COVID-19 virus\" \n", 51 | "- Nonpolymer_entity that is the subject of investigation in the structure\n", 52 | "- Modified chemical component is present" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from rcsbapi.search import search_attributes as attrs\n", 62 | "\n", 63 | "# Create each subquery\n", 64 | "q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == \"COVID-19 virus\"\n", 65 | "q2 = attrs.rcsb_nonpolymer_entity_annotation.type == \"SUBJECT_OF_INVESTIGATION\"\n", 66 | "q3 = attrs.rcsb_polymer_entity_feature_summary.type == \"modified_monomer\"\n", 67 | "\n", 68 | "# Combine using bitwise operators (&, |, ~, etc)\n", 69 | "query = q1 & q2 
& q3\n", 70 | "\n", 71 | "# Call the query as a function to execute it\n", 72 | "result_list = query()\n", 73 | "\n", 74 | "# Save and print the first ten results\n", 75 | "short_result_list = (list(result_list)[0:10])\n", 76 | "print(short_result_list)\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Python Data API: Find Information About Structures\n", 84 | "\n", 85 | "Once we have the PDB IDs, we can query them using the Data API for information related to the structure. \n", 86 | "\n", 87 | "In this case, we will find the following for the first 10 results:\n", 88 | "- ID\n", 89 | "- Chemical component IDs\n", 90 | "- Whether the chemical component is the subject of investigation\n", 91 | "- Title of associated publication\n", 92 | "- Digital Object Identifier (DOI) if applicable" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from rcsbapi.data import DataQuery as Query\n", 102 | "\n", 103 | "query = Query(\n", 104 | " input_type=\"entries\",\n", 105 | " input_ids=short_result_list,\n", 106 | " return_data_list=[\n", 107 | " \"entries.rcsb_id\",\n", 108 | " \"rcsb_nonpolymer_entity_instance_container_identifiers.comp_id\",\n", 109 | " \"is_subject_of_investigation\",\n", 110 | " \"citation.title\",\n", 111 | " \"citation.pdbx_database_id_DOI\"\n", 112 | " ] \n", 113 | ")\n", 114 | "query.exec()\n", 115 | "query.get_response()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Parsing the Result\n", 123 | "\n", 124 | "The result of the request is returned in JSON format. 
We can refer to the JSON output to understand the data structure and then parse it for the information that is useful to us.\n", 125 | "In this case, we will\n", 126 | "- Confirm the subject of investigation and find the ID if it exists (comp_id)\n", 127 | "- Find the publication title \n", 128 | "- Construct a link to the publication using the DOI\n", 129 | "- Put these data into a dictionary" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "from pprint import pprint # for easier-to-read output\n", 139 | "\n", 140 | "json = query.get_response()[\"data\"][\"entries\"]\n", 141 | "output_dict = {}\n", 142 | "\n", 143 | "# iterate through the result of each entry requested\n", 144 | "for entry_dict in json:\n", 145 | " rcsb_id = entry_dict[\"rcsb_id\"]\n", 146 | "\n", 147 | " # Check for non-polymer subject of investigation, then append to chem_id_list\n", 148 | " for entity_dict in entry_dict[\"nonpolymer_entities\"]:\n", 149 | " for instance_dict in entity_dict[\"nonpolymer_entity_instances\"]:\n", 150 | " is_subject = instance_dict[\"rcsb_nonpolymer_instance_validation_score\"][0][\"is_subject_of_investigation\"]\n", 151 | " if is_subject == \"Y\":\n", 152 | " comp_id = instance_dict[\"rcsb_nonpolymer_entity_instance_container_identifiers\"][\"comp_id\"]\n", 153 | "\n", 154 | " # Find publication title\n", 155 | " title = entry_dict[\"citation\"][0][\"title\"]\n", 156 | "\n", 157 | " # Construct link from DOI (only exists if paper has been published or is on preprint server)\n", 158 | " base_link = \"https://doi.org/\"\n", 159 | " doi_link = \"\"\n", 160 | " if entry_dict[\"citation\"][0][\"pdbx_database_id_DOI\"] is not None:\n", 161 | " doi_link += base_link + entry_dict[\"citation\"][0][\"pdbx_database_id_DOI\"]\n", 162 | "\n", 163 | " # Add to dictionary\n", 164 | " output_dict[rcsb_id] = {\"title\": title, \"link\": doi_link, \"subject_of_investigation\": 
comp_id, }\n", 165 | "\n", 166 | "pprint(output_dict)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Try it for yourself\n", 174 | "Combining use of our Search and Data API sub-packages can make programmatic access to RCSB PDB easier than ever!" 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.12.6" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /notebooks/search_quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "177dc250", 6 | "metadata": {}, 7 | "source": [ 8 | "\"Open" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "upper-filing", 14 | "metadata": {}, 15 | "source": [ 16 | "# RCSB PDB Search API: Quickstart\n", 17 | "\n", 18 | "This quickstart notebook will walk through the basics of creating and executing queries using the `rcsbapi.search` package of the `rcsb-api` package. 
For more in-depth documentation, reference the [readthedocs page](https://rcsbapi.readthedocs.io/en/latest/search_api/quickstart.html).\n", 19 | "\n", 20 | "\\\n", 21 | "Before beginning, you must install the package:\n", 22 | "\n", 23 | "```pip install rcsb-api```" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "aef3a8f5", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "%pip install rcsb-api" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 17, 39 | "id": "african-monthly", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from rcsbapi.search import TextQuery, AttributeQuery\n", 44 | "from rcsbapi.search import search_attributes as attrs" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "51db8156", 50 | "metadata": {}, 51 | "source": [ 52 | "## Full-text search\n", 53 | "To perform a \"full-text\" search for structures associated with the term \"Hemoglobin\", you can create a `TextQuery`:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 18, 59 | "id": "110a70a9", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Search for structures associated with the phrase \"Hemoglobin\"\n", 64 | "query = TextQuery(value=\"Hemoglobin\")\n", 65 | "\n", 66 | "# Execute the query by running it as a function\n", 67 | "results = query()\n", 68 | "\n", 69 | "# Results are returned as an iterator of result identifiers.\n", 70 | "for rid in results:\n", 71 | " print(rid)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "a4c2d12b", 77 | "metadata": {}, 78 | "source": [ 79 | "## Attribute search\n", 80 | "To perform a search for specific structure or chemical attributes, you can create an `AttributeQuery`." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 19, 86 | "id": "79005229", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Construct a query searching for structures from humans\n", 91 | "query = AttributeQuery(\n", 92 | " attribute=\"rcsb_entity_source_organism.scientific_name\",\n", 93 | " operator=\"exact_match\", # Other operators include \"contains_phrase\", \"exists\", and more\n", 94 | " value=\"Homo sapiens\"\n", 95 | ")\n", 96 | "\n", 97 | "# Execute query and construct a list from results\n", 98 | "results = list(query())\n", 99 | "print(results)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "8aec7e7e", 105 | "metadata": {}, 106 | "source": [ 107 | "Refer to the [Search Attributes](https://search.rcsb.org/structure-search-attributes.html) and [Chemical Attributes](https://search.rcsb.org/chemical-search-attributes.html) documentation for a full list of attributes and applicable operators.\n", 108 | "\n", 109 | "Alternatively, you can construct attribute queries with comparative operators using the `search_attributes` object (which also allows for names to be tab-completed):" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 20, 115 | "id": "1a01cb80", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Search for structures from humans\n", 120 | "query = attrs.rcsb_entity_source_organism.scientific_name == \"Homo sapiens\"\n", 121 | "\n", 122 | "# Run query and construct a list from results\n", 123 | "results = list(query())\n", 124 | "print(results)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "fe2daa02", 130 | "metadata": {}, 131 | "source": [ 132 | "## Grouping sub-queries\n", 133 | "\n", 134 | "You can combine multiple queries using Python bitwise operators. 
" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 21, 140 | "id": "a23da8e7", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Query for human epidermal growth factor receptor (EGFR) structures (UniProt ID P00533)\n", 145 | "# with investigational or experimental drugs bound\n", 146 | "q1 = attrs.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession == \"P00533\"\n", 147 | "q2 = attrs.rcsb_entity_source_organism.scientific_name == \"Homo sapiens\"\n", 148 | "q3 = attrs.drugbank_info.drug_groups == \"investigational\"\n", 149 | "q4 = attrs.drugbank_info.drug_groups == \"experimental\"\n", 150 | "\n", 151 | "# Structures matching UniProt ID P00533 AND from humans\n", 152 | "# AND (investigational OR experimental drug group)\n", 153 | "query = q1 & q2 & (q3 | q4)\n", 154 | "\n", 155 | "# Execute query and print first 10 ids\n", 156 | "results = list(query())\n", 157 | "print(results[:10])" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "9d3692c4", 163 | "metadata": {}, 164 | "source": [ 165 | "These examples are in \"operator\" syntax. You can also make queries in \"fluent\" syntax. Learn more about both syntaxes and implementation details in [Query Syntax and Execution](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#query-syntax-and-execution).\n", 166 | "\n", 167 | "### Supported Search Services\n", 168 | "The list of supported search service types are listed in the table below. 
For more details on their usage, see [Search Service Types](https://rcsbapi.readthedocs.io/en/latest/search_api/query_construction.html#search-service-types).\n", 169 | "\n", 170 | "|Search service |QueryType |\n", 171 | "|----------------------------------|--------------------------|\n", 172 | "|Full-text |`TextQuery()` |\n", 173 | "|Attribute (structure or chemical) |`AttributeQuery()` |\n", 174 | "|Sequence similarity |`SeqSimilarityQuery()` |\n", 175 | "|Sequence motif |`SeqMotifQuery()` |\n", 176 | "|Structure similarity |`StructSimilarityQuery()` |\n", 177 | "|Structure motif |`StructMotifQuery()` |\n", 178 | "|Chemical similarity |`ChemSimilarityQuery()` |\n", 179 | "\n", 180 | "Learn more about available search services on the [RCSB PDB Search API docs](https://search.rcsb.org/#search-services)." 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "e2b42fd8", 186 | "metadata": {}, 187 | "source": [ 188 | "For more in-depth documentation, go to [readthedocs](https://rcsbapi.readthedocs.io/en/latest/index.html)" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.12.6" 209 | }, 210 | "toc": { 211 | "base_numbering": 1, 212 | "nav_menu": {}, 213 | "number_sections": true, 214 | "sideBar": true, 215 | "skip_h1_title": false, 216 | "title_cell": "Table of Contents", 217 | "title_sidebar": "Contents", 218 | "toc_cell": false, 219 | "toc_position": {}, 220 | "toc_section_display": true, 221 | "toc_window_display": false 222 | }, 223 | "varInspector": { 224 | "cols": { 225 | "lenName": 16, 226 | "lenType": 16, 227 | "lenVar": 40 228 | }, 229 | 
"kernels_config": { 230 | "python": { 231 | "delete_cmd_postfix": "", 232 | "delete_cmd_prefix": "del ", 233 | "library": "var_list.py", 234 | "varRefreshCmd": "print(var_dic_list())" 235 | }, 236 | "r": { 237 | "delete_cmd_postfix": ") ", 238 | "delete_cmd_prefix": "rm(", 239 | "library": "var_list.r", 240 | "varRefreshCmd": "cat(var_dic_list()) " 241 | } 242 | }, 243 | "types_to_exclude": [ 244 | "module", 245 | "function", 246 | "builtin_function_or_method", 247 | "instance", 248 | "_Feature" 249 | ], 250 | "window_display": false 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 5 255 | } 256 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist=MySQLdb 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python modules names) to load, 30 | # usually to register additional checkers. 
31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=missing-docstring, 64 | empty-docstring, 65 | bad-continuation, 66 | print-statement, 67 | parameter-unpacking, 68 | unpacking-in-except, 69 | old-raise-syntax, 70 | backtick, 71 | import-star-module-level, 72 | raw-checker-failed, 73 | bad-inline-option, 74 | locally-disabled, 75 | file-ignored, 76 | suppressed-message, 77 | useless-suppression, 78 | deprecated-pragma, 79 | use-symbolic-message-instead, 80 | broad-except, 81 | apply-builtin, 82 | basestring-builtin, 83 | buffer-builtin, 84 | cmp-builtin, 85 | coerce-builtin, 86 | execfile-builtin, 87 | file-builtin, 88 | long-builtin, 89 | raw_input-builtin, 90 | reduce-builtin, 91 | standarderror-builtin, 92 | unicode-builtin, 93 | xrange-builtin, 94 | coerce-method, 95 | delslice-method, 96 | getslice-method, 97 | setslice-method, 98 | no-absolute-import, 99 | old-division, 100 | dict-iter-method, 101 | dict-view-method, 102 | next-method-called, 103 | metaclass-assignment, 104 | indexing-exception, 105 | raising-string, 106 | reload-builtin, 107 | oct-method, 108 | hex-method, 109 | nonzero-method, 110 | cmp-method, 111 | input-builtin, 112 | round-builtin, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | div-method, 121 | idiv-method, 122 | rdiv-method, 123 | exception-message-attribute, 124 | invalid-str-codec, 125 | sys-max-int, 126 | bad-python3-import, 127 | deprecated-string-function, 128 | deprecated-str-translate-call, 129 | deprecated-itertools-function, 130 | deprecated-types-field, 131 | next-method-defined, 132 | dict-items-not-iterating, 133 | dict-keys-not-iterating, 134 | dict-values-not-iterating, 135 | deprecated-operator-function, 136 | deprecated-urllib-function, 137 | xreadlines-attribute, 138 | deprecated-sys-function, 139 | exception-escape, 140 | comprehension-escape, 141 | raise-missing-from, 142 | W0707, 143 | 
W0238, 144 | no-member, 145 | unused-argument, 146 | protected-access 147 | 148 | # Enable the message, report, category or checker with the given id(s). You can 149 | # either give multiple identifier separated by comma (,) or put this option 150 | # multiple time (only on the command line, not in the configuration file where 151 | # it should appear only once). See also the "--disable" option for examples. 152 | enable=c-extension-no-member 153 | 154 | 155 | [REPORTS] 156 | 157 | # Python expression which should return a note less than 10 (10 is the highest 158 | # note). You have access to the variables errors warning, statement which 159 | # respectively contain the number of errors / warnings messages and the total 160 | # number of statements analyzed. This is used by the global evaluation report 161 | # (RP0004). 162 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 163 | 164 | # Template used to display messages. This is a python new-style format string 165 | # used to format the message information. See doc for all details. 166 | #msg-template= 167 | 168 | # Set the output format. Available formats are text, parseable, colorized, json 169 | # and msvs (visual studio). You can also give a reporter class, e.g. 170 | # mypackage.mymodule.MyReporterClass. 171 | output-format=text 172 | 173 | # Tells whether to display a full report or only the messages. 174 | reports=no 175 | 176 | # Activate the evaluation score. 177 | score=yes 178 | 179 | 180 | [REFACTORING] 181 | 182 | # Maximum number of nested blocks for function / method body 183 | max-nested-blocks=5 184 | 185 | # Complete name of functions that never returns. When checking for 186 | # inconsistent-return-statements if a never returning function is called then 187 | # it will be considered as an explicit return statement and no message will be 188 | # printed. 
189 | never-returning-functions=sys.exit 190 | 191 | 192 | [LOGGING] 193 | 194 | # Format style used to check logging format string. `old` means using % 195 | # formatting, while `new` is for `{}` formatting. 196 | logging-format-style=old 197 | 198 | # Logging modules to check that the string format arguments are in logging 199 | # function parameter format. 200 | logging-modules=logging 201 | 202 | 203 | [SPELLING] 204 | 205 | # Limits count of emitted suggestions for spelling mistakes. 206 | max-spelling-suggestions=4 207 | 208 | # Spelling dictionary name. Available dictionaries: none. To make it working 209 | # install python-enchant package.. 210 | spelling-dict= 211 | 212 | # List of comma separated words that should not be checked. 213 | spelling-ignore-words= 214 | 215 | # A path to a file that contains private dictionary; one word per line. 216 | spelling-private-dict-file= 217 | 218 | # Tells whether to store unknown words to indicated private dictionary in 219 | # --spelling-private-dict-file option instead of raising a message. 220 | spelling-store-unknown-words=no 221 | 222 | 223 | [MISCELLANEOUS] 224 | 225 | # List of note tags to take in consideration, separated by a comma. 226 | notes=FIXME, 227 | XXX, 228 | TODO 229 | 230 | 231 | [TYPECHECK] 232 | 233 | # List of decorators that produce context managers, such as 234 | # contextlib.contextmanager. Add to this list to register other decorators that 235 | # produce valid context managers. 236 | contextmanager-decorators=contextlib.contextmanager 237 | 238 | # List of members which are set dynamically and missed by pylint inference 239 | # system, and so shouldn't trigger E1101 when accessed. Python regular 240 | # expressions are accepted. 241 | generated-members= 242 | 243 | # Tells whether missing members accessed in mixin class should be ignored. A 244 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
245 | ignore-mixin-members=yes 246 | 247 | # Tells whether to warn about missing members when the owner of the attribute 248 | # is inferred to be None. 249 | ignore-none=yes 250 | 251 | # This flag controls whether pylint should warn about no-member and similar 252 | # checks whenever an opaque object is returned when inferring. The inference 253 | # can return multiple potential results while evaluating a Python object, but 254 | # some branches might not be evaluated, which results in partial inference. In 255 | # that case, it might be useful to still emit no-member and other checks for 256 | # the rest of the inferred objects. 257 | ignore-on-opaque-inference=yes 258 | 259 | # List of class names for which member attributes should not be checked (useful 260 | # for classes with dynamically set attributes). This supports the use of 261 | # qualified names. 262 | ignored-classes=optparse.Values,thread._local,_thread._local 263 | 264 | # List of module names for which member attributes should not be checked 265 | # (useful for modules/projects where namespaces are manipulated during runtime 266 | # and thus existing member attributes cannot be deduced by static analysis. It 267 | # supports qualified module names, as well as Unix pattern matching. 268 | ignored-modules= 269 | 270 | # Show a hint with possible names when a member name was not found. The aspect 271 | # of finding the hint is based on edit distance. 272 | missing-member-hint=yes 273 | 274 | # The minimum edit distance a name should have in order to be considered a 275 | # similar match for a missing member name. 276 | missing-member-hint-distance=1 277 | 278 | # The total number of similar names that should be taken in consideration when 279 | # showing a hint for a missing member. 280 | missing-member-max-choices=1 281 | 282 | 283 | [VARIABLES] 284 | 285 | # List of additional names supposed to be defined in builtins. Remember that 286 | # you should avoid defining new builtins when possible. 
287 | additional-builtins= 288 | 289 | # Tells whether unused global variables should be treated as a violation. 290 | allow-global-unused-variables=yes 291 | 292 | # List of strings which can identify a callback function by name. A callback 293 | # name must start or end with one of those strings. 294 | callbacks=cb_, 295 | _cb 296 | 297 | # A regular expression matching the name of dummy variables (i.e. expected to 298 | # not be used). 299 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 300 | 301 | # Argument names that match this expression will be ignored. Default to name 302 | # with leading underscore. 303 | ignored-argument-names=_.*|^ignored_|^unused_ 304 | 305 | # Tells whether we should check for unused import in __init__ files. 306 | init-import=no 307 | 308 | # List of qualified module names which can have objects that can redefine 309 | # builtins. 310 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 311 | 312 | 313 | [FORMAT] 314 | 315 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 316 | expected-line-ending-format= 317 | 318 | # Regexp for a line that is allowed to be longer than the limit. 319 | ignore-long-lines=^\s*(# )??$ 320 | 321 | # Number of spaces of indent required inside a hanging or continued line. 322 | indent-after-paren=4 323 | 324 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 325 | # tab). 326 | indent-string=' ' 327 | 328 | # Maximum number of characters on a single line. 329 | max-line-length=240 330 | 331 | # Maximum number of lines in a module. 332 | max-module-lines=1000 333 | 334 | # Allow the body of a class to be on the same line as the declaration if body 335 | # contains single statement. 336 | single-line-class-stmt=no 337 | 338 | # Allow the body of an if to be on the same line as the test if there is no 339 | # else. 
340 | single-line-if-stmt=no 341 | 342 | 343 | [SIMILARITIES] 344 | 345 | # Ignore comments when computing similarities. 346 | ignore-comments=yes 347 | 348 | # Ignore docstrings when computing similarities. 349 | ignore-docstrings=yes 350 | 351 | # Ignore imports when computing similarities. 352 | ignore-imports=no 353 | 354 | # Minimum lines number of a similarity. 355 | min-similarity-lines=4 356 | 357 | 358 | [BASIC] 359 | 360 | # Naming style matching correct argument names. 361 | argument-naming-style=snake_case 362 | 363 | # Regular expression matching correct argument names. Overrides argument- 364 | # naming-style. 365 | # argument-rgx=[a-z_][a-zA-Z0-9]{1,30}$ 366 | 367 | # Naming style matching correct attribute names. 368 | attr-naming-style=snake_case 369 | 370 | # Regular expression matching correct attribute names. Overrides attr-naming- 371 | # style. 372 | # attr-rgx=_?_?[a-z][A-Za-z0-9]{1,40}$ 373 | 374 | # Bad variable names which should always be refused, separated by a comma. 375 | bad-names=foo, 376 | bar, 377 | baz, 378 | toto, 379 | tutu, 380 | tata 381 | 382 | # Naming style matching correct class attribute names. 383 | class-attribute-naming-style=camelCase 384 | 385 | # Regular expression matching correct class attribute names. Overrides class- 386 | # attribute-naming-style. 387 | # class-attribute-rgx=_?_?[a-z][A-Za-z0-9]{1,40}$ 388 | 389 | # Naming style matching correct class names. 390 | class-naming-style=PascalCase 391 | 392 | # Regular expression matching correct class names. Overrides class-naming- 393 | # style. 394 | #class-rgx= 395 | 396 | # Naming style matching correct constant names. 397 | const-naming-style=any 398 | 399 | # Regular expression matching correct constant names. Overrides const-naming- 400 | # style. 401 | #const-rgx= 402 | 403 | # Minimum line length for functions/classes that require docstrings, shorter 404 | # ones are exempt. 
405 | docstring-min-length=-1 406 | 407 | # Naming style matching correct function names. 408 | function-naming-style=camelCase 409 | 410 | # Regular expression matching correct function names. Overrides function- 411 | # naming-style. 412 | #function-rgx= 413 | 414 | # Good variable names which should always be accepted, separated by a comma. 415 | good-names=_, 416 | i, 417 | j, 418 | k, 419 | v, 420 | ii, 421 | jj, 422 | kk, 423 | # t, 424 | # c, 425 | # d, 426 | e, 427 | # r, 428 | # s, 429 | # v, 430 | # p, 431 | # ts, 432 | # tS, 433 | ok, 434 | logger 435 | 436 | # Include a hint for the correct naming format with invalid-name. 437 | include-naming-hint=no 438 | 439 | # Naming style matching correct inline iteration names. 440 | inlinevar-naming-style=any 441 | 442 | # Regular expression matching correct inline iteration names. Overrides 443 | # inlinevar-naming-style. 444 | #inlinevar-rgx= 445 | 446 | # Naming style matching correct method names. 447 | method-naming-style=snake_case 448 | 449 | # Regular expression matching correct method names. Overrides method-naming- 450 | # style. 451 | # method-rgx=_?_?[a-z][A-Za-z0-9]{1,40}_?_?$ 452 | 453 | # Naming style matching correct module names. 454 | module-naming-style=any 455 | 456 | # Regular expression matching correct module names. Overrides module-naming- 457 | # style. 458 | #module-rgx= 459 | 460 | # Colon-delimited sets of names that determine each other's naming style when 461 | # the name regexes allow several styles. 462 | name-group= 463 | 464 | # Regular expression which should only match function or class names that do 465 | # not require a docstring. 466 | no-docstring-rgx=^_ 467 | 468 | # List of decorators that produce properties, such as abc.abstractproperty. Add 469 | # to this list to register other decorators that produce valid properties. 470 | # These decorators are taken in consideration only for invalid-name. 
471 | property-classes=abc.abstractproperty 472 | 473 | # Naming style matching correct variable names. 474 | variable-naming-style=snake_case 475 | 476 | # Regular expression matching correct variable names. Overrides variable- 477 | # naming-style. 478 | # variable-rgx=[a-z_][a-zA-Z0-9]{1,40}$ 479 | 480 | 481 | [STRING] 482 | 483 | # This flag controls whether the implicit-str-concat-in-sequence should 484 | # generate a warning on implicit string concatenation in sequences defined over 485 | # several lines. 486 | check-str-concat-over-line-jumps=no 487 | 488 | 489 | [IMPORTS] 490 | 491 | # Allow wildcard imports from modules that define __all__. 492 | allow-wildcard-with-all=no 493 | 494 | # Analyse import fallback blocks. This can be used to support both Python 2 and 495 | # 3 compatible code, which means that the block might have code that exists 496 | # only in one or another interpreter, leading to false positives when analysed. 497 | analyse-fallback-blocks=no 498 | 499 | # Deprecated modules which should not be used, separated by a comma. 500 | deprecated-modules=optparse,tkinter.tix 501 | 502 | # Create a graph of external dependencies in the given file (report RP0402 must 503 | # not be disabled). 504 | ext-import-graph= 505 | 506 | # Create a graph of every (i.e. internal and external) dependencies in the 507 | # given file (report RP0402 must not be disabled). 508 | import-graph= 509 | 510 | # Create a graph of internal dependencies in the given file (report RP0402 must 511 | # not be disabled). 512 | int-import-graph= 513 | 514 | # Force import order to recognize a module as part of the standard 515 | # compatibility libraries. 516 | known-standard-library= 517 | 518 | # Force import order to recognize a module as part of a third party library. 519 | known-third-party=enchant 520 | 521 | 522 | [CLASSES] 523 | 524 | # List of method names used to declare (i.e. assign) instance attributes. 
525 | defining-attr-methods=__init__, 526 | __new__, 527 | setUp 528 | 529 | # List of member names, which should be excluded from the protected access 530 | # warning. 531 | exclude-protected=_asdict, 532 | _fields, 533 | _replace, 534 | _source, 535 | _make 536 | 537 | # List of valid names for the first argument in a class method. 538 | valid-classmethod-first-arg=cls 539 | 540 | # List of valid names for the first argument in a metaclass class method. 541 | valid-metaclass-classmethod-first-arg=cls 542 | 543 | 544 | [DESIGN] 545 | 546 | # Maximum number of arguments for function / method. 547 | max-args=5 548 | 549 | # Maximum number of attributes for a class (see R0902). 550 | max-attributes=7 551 | 552 | # Maximum number of boolean expressions in an if statement. 553 | max-bool-expr=5 554 | 555 | # Maximum number of branch for function / method body. 556 | max-branches=12 557 | 558 | # Maximum number of locals for function / method body. 559 | max-locals=15 560 | 561 | # Maximum number of parents for a class (see R0901). 562 | max-parents=7 563 | 564 | # Maximum number of public methods for a class (see R0904). 565 | max-public-methods=20 566 | 567 | # Maximum number of return / yield for function / method body. 568 | max-returns=6 569 | 570 | # Maximum number of statements in function / method body. 571 | max-statements=50 572 | 573 | # Minimum number of public methods for a class (see R0903). 574 | min-public-methods=2 575 | 576 | 577 | [EXCEPTIONS] 578 | 579 | # Exceptions that will emit a warning when being caught. Defaults to 580 | # "BaseException, Exception". 
class Config:
    # Tunable settings; class-level defaults apply unless overridden at runtime.
    API_TIMEOUT: int = 60
    SEARCH_API_REQUESTS_PER_SECOND: int = 10
    SUPPRESS_AUTOCOMPLETE_WARNING: bool = False
    INPUT_ID_LIMIT: int = 5000

    def __setattr__(self, name, value):
        """Validate assignments to configuration parameters.

        Rejects names that are not already defined on the class (catching
        typos or misspellings before they silently create unused attributes)
        and values whose type does not match the attribute's annotation.

        Raises:
            AttributeError: if `name` is not an existing Config attribute
            TypeError: if `value` does not match the annotated type
        """
        # Reject unknown attribute names up front
        if not hasattr(self, name):
            raise AttributeError(f"'{name}' is not a valid attribute of Config class")

        # Enforce the annotated type, when one exists for this attribute
        expected_type = self.__annotations__.get(name, None)
        type_ok = expected_type is None or isinstance(value, expected_type)
        if not type_ok:
            raise TypeError(f"Expected type '{expected_type.__name__}' for attribute '{name}', but got '{type(value).__name__}'")

        super().__setattr__(name, value)


config = Config()
32 | STRUCT_MOTIF_MIN_RESIDUES: int = 2 33 | STRUCT_MOTIF_MAX_RESIDUES: int = 10 34 | RCSB_SEARCH_API_QUERY_URL: str = "https://search.rcsb.org/rcsbsearch/v2/query" 35 | UPLOAD_URL: str = "https://user-upload.rcsb.org/v1/putMultipart" 36 | RETURN_UP_URL: str = "https://user-upload.rcsb.org/v1/download/" 37 | 38 | SEARCH_API_SCHEMA_DIR: str = "search/resources" 39 | SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL: str = "http://search.rcsb.org/rcsbsearch/v2/metadata/schema" 40 | SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME: str = "structure_schema.json" 41 | SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL: str = "https://search.rcsb.org/rcsbsearch/v2/metadata/chemical/schema" 42 | SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME: str = "chemical_schema.json" 43 | 44 | # Data API constants 45 | DATA_API_ENDPOINT: str = "https://data.rcsb.org/graphql" 46 | DATA_API_SCHEMA_DIR: str = "data/resources" 47 | DATA_API_SCHEMA_FILENAME: str = "data_api_schema.json" 48 | DATA_API_SCHEMA_BASE_URL: str = "https://data.rcsb.org/rest/v1/schema/" 49 | DATA_API_SCHEMA_ENDPOINT_TO_FILE: MappingProxyType[str, str] = field(default_factory=lambda: MappingProxyType({ 50 | "entry": "entry.json", 51 | "polymer_entity": "polymer_entity.json", 52 | "branched_entity": "branched_entity.json", 53 | "nonpolymer_entity": "nonpolymer_entity.json", 54 | "polymer_entity_instance": "polymer_entity_instance.json", 55 | "branched_entity_instance": "branched_entity_instance.json", 56 | "nonpolymer_entity_instance": "nonpolymer_entity_instance.json", 57 | "assembly": "assembly.json", 58 | "chem_comp": "chem_comp.json", 59 | "pubmed": "pubmed.json", 60 | "uniprot": "uniprot.json", 61 | "drugbank": "drugbank.json", 62 | })) 63 | 64 | SINGULAR_TO_PLURAL: MappingProxyType[str, str] = field(default_factory=lambda: MappingProxyType({ 65 | "entry": "entries", 66 | "polymer_entity": "polymer_entities", 67 | "branched_entity": "branched_entities", 68 | "nonpolymer_entity": "nonpolymer_entities", 69 | 
"polymer_entity_instance": "polymer_entity_instances", 70 | "nonpolymer_entity_instance": "nonpolymer_entity_instances", 71 | "branched_entity_instance": "branched_entity_instances", 72 | "assembly": "assemblies", 73 | "interface": "interfaces", 74 | "uniprot": "", 75 | "pubmed": "", 76 | "chem_comp": "chem_comps", 77 | "entry_group": "entry_groups", 78 | "polymer_entity_group": "polymer_entity_groups", 79 | "group_provenance": "" 80 | })) 81 | # 82 | ID_TO_SEPARATOR: MappingProxyType[str, str] = field(default_factory=lambda: MappingProxyType({ 83 | "entity_id": "_", 84 | "asym_id": ".", 85 | "assembly_id": "-", 86 | "interface_id": "." 87 | })) 88 | 89 | # Regex strings for IDs 90 | DATA_API_INPUT_TYPE_TO_REGEX: MappingProxyType[str, List[str]] = field(default_factory=lambda: MappingProxyType({ 91 | "entry": [r"^(MA|AF|ma|af)_[A-Z0-9]*$", r"^[A-Za-z0-9]{4}$"], 92 | "entity": [r"^(MA|AF|ma|af)_[A-Z0-9]*_[0-9]+$", r"^[A-Z0-9]{4}_[0-9]+$"], 93 | "instance": [r"^(MA|AF|ma|af)_[A-Z0-9]*\.[A-Za-z]+$", r"^[A-Z0-9]{4}\.[A-Za-z]+$"], 94 | "assembly": [r"^(MA|AF|ma|af)_[A-Z0-9]*-[0-9]+$", r"^[A-Z0-9]{4}-[0-9]+$"], 95 | "interface": [r"^(MA|AF|ma|af)_[A-Z0-9]*-[0-9]+\.[0-9]+$", r"^[A-Z0-9]{4}-[0-9]+\.[0-9]+$"], 96 | # Regex for uniprot: https://www.uniprot.org/help/accession_numbers 97 | "uniprot": [r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"] 98 | })) 99 | 100 | INPUT_TYPE_TO_ALL_STRUCTURES_ENDPOINT: MappingProxyType[str, List[str]] = field(default_factory=lambda: MappingProxyType({ 101 | "entries": ["https://data.rcsb.org/rest/v1/holdings/current/entry_ids"], 102 | "chem_comps": ["https://data.rcsb.org/rest/v1/holdings/current/ccd_ids", "https://data.rcsb.org/rest/v1/holdings/current/prd_ids"] 103 | })) 104 | 105 | 106 | const = Const() 107 | -------------------------------------------------------------------------------- /rcsbapi/data/__init__.py: -------------------------------------------------------------------------------- 1 | """RCSB PDB 
def __getattr__(name: str):
    """Lazily construct the module-level ALL_STRUCTURES object.

    The object is built on first access and cached in ``_import_cache``; per
    the module comment above the cache, ``__getattr__`` is called twice on
    import, so caching avoids initializing twice.

    Args:
        name (str): attribute name being looked up

    Raises:
        AttributeError: for any attribute other than ALL_STRUCTURES
    """
    # Preserve normal attribute-error behavior for unknown names
    if name != "ALL_STRUCTURES":
        raise AttributeError(f"Module {repr(__name__)} has no attribute {repr(name)}")

    if name not in _import_cache:
        # Deferred import: data_query itself imports from this package,
        # so importing it at module load time would be circular
        from .data_query import AllStructures
        _import_cache[name] = AllStructures()
    return _import_cache[name]
30 | 31 | Args: 32 | input_type (str): query input type 33 | (e.g., "entry", "polymer_entity_instance", etc.) 34 | input_ids (list or dict): list (or singular dict) of ids for which to request information 35 | (e.g., ["4HHB", "2LGI"]) 36 | return_data_list (list): list of data to return (field names) 37 | (e.g., ["rcsb_id", "exptl.method"]) 38 | add_rcsb_id (bool, optional): whether to automatically add .rcsb_id to queries. Defaults to True. 39 | """ 40 | suppress_autocomplete_warning = config.SUPPRESS_AUTOCOMPLETE_WARNING if config.SUPPRESS_AUTOCOMPLETE_WARNING else suppress_autocomplete_warning 41 | 42 | if not isinstance(input_ids, AllStructures): 43 | if isinstance(input_ids, list): 44 | if len(input_ids) > config.INPUT_ID_LIMIT: 45 | logger.warning("More than %d input_ids. Query will be slower to complete.", config.INPUT_ID_LIMIT) 46 | if isinstance(input_ids, dict): 47 | for value in input_ids.values(): 48 | if len(value) > config.INPUT_ID_LIMIT: 49 | logger.warning("More than %d input_ids. Query will be slower to complete.", config.INPUT_ID_LIMIT) 50 | 51 | self._input_type, self._input_ids = self._process_input_ids(input_type, input_ids) 52 | self._return_data_list = return_data_list 53 | self._query = DATA_SCHEMA.construct_query( 54 | input_type=self._input_type, 55 | input_ids=self._input_ids, 56 | return_data_list=return_data_list, 57 | add_rcsb_id=add_rcsb_id, 58 | suppress_autocomplete_warning=suppress_autocomplete_warning 59 | ) 60 | """GraphQL query as a string""" 61 | self._response: Optional[Dict[str, Any]] = None 62 | """JSON response to query, will be assigned after executing""" 63 | 64 | def _process_input_ids(self, input_type: str, input_ids: Union[List[str], Dict[str, str], Dict[str, List[str]]]) -> Tuple[str, List[str]]: 65 | """Convert input_type to plural if possible. 66 | Set input_ids to be a list of ids. 67 | If using ALL_STRUCTURES, return the id list corresponding to the input type. 
68 | 69 | Args: 70 | input_type (str): query input type 71 | (e.g., "entry", "polymer_entity_instance", etc.) 72 | input_ids (Union[List[str], Dict[str, str], Dict[str, List[str]]]): list/dict of ids to request information for 73 | 74 | Returns: 75 | Tuple[str, List[str]]: returns a tuple of converted input_type and list of input_ids 76 | """ 77 | # If input_ids is ALL_STRUCTURES, return appropriate list of ids 78 | if isinstance(input_ids, AllStructures): 79 | new_input_ids = input_ids.get_all_ids(input_type) 80 | return (input_type, new_input_ids) 81 | 82 | # Convert _input_type to plural if applicable 83 | converted = False 84 | if DATA_SCHEMA._root_dict[input_type][0]["kind"] != "LIST": 85 | plural_type = const.SINGULAR_TO_PLURAL[input_type] 86 | if plural_type: 87 | input_type = plural_type 88 | converted = True 89 | 90 | # Set _input_ids 91 | if isinstance(input_ids, dict): 92 | if converted: 93 | # If converted and input_ids is a dict, join into PDB id format 94 | if isinstance(input_ids, dict): 95 | join_id = "" 96 | for k, v in input_ids.items(): 97 | assert isinstance(v, str) # for mypy 98 | if k in const.ID_TO_SEPARATOR: 99 | join_id += const.ID_TO_SEPARATOR[k] + v 100 | else: 101 | join_id += v 102 | 103 | input_ids = [join_id] 104 | 105 | else: 106 | # If not converted, retrieve id list from dictionary 107 | input_ids = list(input_ids[DATA_SCHEMA._root_dict[input_type][0]["name"]]) 108 | 109 | # Make all input_ids uppercase 110 | input_ids = [id.upper() for id in input_ids] 111 | 112 | assert isinstance(input_ids, list) 113 | return (input_type, input_ids) 114 | 115 | def get_input_ids(self) -> List[str]: 116 | """get input_ids used to make query 117 | 118 | Returns: 119 | Union[List[str], Dict[str, List[str]], Dict[str, str]]: input id list or dictionary 120 | """ 121 | return self._input_ids 122 | 123 | def get_input_type(self) -> str: 124 | """get input_type used to make query 125 | 126 | Returns: 127 | str: input_type 128 | (e.g., "entry", 
"polymer_entity_instance", etc.) 129 | """ 130 | return self._input_type 131 | 132 | def get_return_data_list(self) -> List[str]: 133 | """get return_data_list used to make query 134 | 135 | Returns: 136 | List[str]: return_data_list 137 | (e.g., ["rcsb_id", "exptl.method"]) 138 | """ 139 | return self._return_data_list 140 | 141 | def get_query(self) -> str: 142 | """get GraphQL query 143 | 144 | Returns: 145 | str: query in GraphQL syntax 146 | """ 147 | return self._query 148 | 149 | def get_response(self) -> Union[None, Dict[str, Any]]: 150 | """get JSON response to executed query 151 | 152 | Returns: 153 | Dict[str, Any]: JSON object 154 | """ 155 | return self._response 156 | 157 | def get_editor_link(self) -> str: 158 | """get url to interactive GraphiQL editor 159 | 160 | Returns: 161 | str: GraphiQL url 162 | """ 163 | editor_base_link = str(const.DATA_API_ENDPOINT) + "/index.html?query=" 164 | return editor_base_link + urllib.parse.quote(self._query) 165 | 166 | def exec(self, batch_size: int = 5000, progress_bar: bool = False) -> Dict[str, Any]: 167 | """POST a GraphQL query and get response 168 | 169 | Returns: 170 | Dict[str, Any]: JSON object 171 | """ 172 | if len(self._input_ids) > batch_size: 173 | batched_ids: Union[List[List[str]], tqdm] = self._batch_ids(batch_size) 174 | else: 175 | batched_ids = [self._input_ids] 176 | response_json: Dict[str, Any] = {} 177 | 178 | if progress_bar is True: 179 | batched_ids = tqdm(batched_ids) 180 | 181 | for id_batch in batched_ids: 182 | query = re.sub(r"\[([^]]+)\]", f"{id_batch}".replace("'", '"'), self._query) 183 | part_response = requests.post( 184 | headers={"Content-Type": "application/graphql"}, 185 | data=query, 186 | url=const.DATA_API_ENDPOINT, 187 | timeout=config.API_TIMEOUT 188 | ).json() 189 | self._parse_gql_error(part_response) 190 | time.sleep(0.2) 191 | if not response_json: 192 | response_json = part_response 193 | else: 194 | response_json = self._merge_response(response_json, 
part_response) 195 | 196 | if "data" in response_json.keys(): 197 | query_response = response_json["data"][self._input_type] 198 | if query_response is None: 199 | logger.warning("Input produced no results. Check that input ids are valid") 200 | if isinstance(query_response, list): 201 | if len(query_response) == 0: 202 | logger.warning("Input produced no results. Check that input ids are valid") 203 | self._response = response_json 204 | return response_json 205 | 206 | def _parse_gql_error(self, response_json: Dict[str, Any]): 207 | if "errors" in response_json.keys(): 208 | error_msg_list: list[str] = [] 209 | for error_dict in response_json["errors"]: 210 | error_msg_list.append(error_dict["message"]) 211 | combined_error_msg: str = "" 212 | for i, error_msg in enumerate(error_msg_list): 213 | combined_error_msg += f"{i+1}. {error_msg}\n" 214 | raise ValueError(f"{combined_error_msg}. Run .get_editor_link() to get a link to GraphiQL editor with query") 215 | 216 | def _batch_ids(self, batch_size: int) -> List[List[str]]: # assumes that plural types have only one arg, which is true right now 217 | """split queries with large numbers of input_ids into smaller batches 218 | 219 | Args: 220 | batch_size (int): max size of batches 221 | 222 | Returns: 223 | List[List[str]]: nested list where each list is a batch of ids 224 | """ 225 | batched_ids: List[List[str]] = [] 226 | i = 0 227 | while i < len(self._input_ids): 228 | count = 0 229 | batch_list: List[str] = [] 230 | while count < batch_size and i < len(self._input_ids): 231 | batch_list.append(self._input_ids[i]) 232 | count += 1 233 | i += 1 234 | if len(batch_list) > 0: 235 | batched_ids.append(batch_list) 236 | return batched_ids 237 | 238 | def _merge_response(self, merge_into_response: Dict[str, Any], to_merge_response: Dict[str, Any]): 239 | """merge two JSON responses. Used after batching ids to merge responses from each batch. 
class AllStructures:
    """Class for representing all structures of different `input_types`"""

    def __init__(self):
        """Initialize AllStructures by downloading the current id lists."""
        self.ALL_STRUCTURES = self.reload()

    def reload(self) -> dict[str, List[str]]:
        """Build dictionary of IDs based on endpoints defined in const.

        Returns:
            dict[str, List[str]]: ALL_STRUCTURES object mapping input_type to its full id list

        Raises:
            requests.HTTPError: if any endpoint responds with a 4xx/5xx status
        """
        ALL_STRUCTURES = {}
        for input_type, endpoints in const.INPUT_TYPE_TO_ALL_STRUCTURES_ENDPOINT.items():
            all_ids: List[str] = []
            for endpoint in endpoints:
                # Use the configurable timeout instead of a hard-coded 60s,
                # consistent with DataQuery.exec
                response = requests.get(endpoint, timeout=config.API_TIMEOUT)
                # Raise on error statuses unconditionally; previously a non-200
                # success code fell into the else branch and was skipped silently
                response.raise_for_status()
                all_ids.extend(response.json())
            ALL_STRUCTURES[input_type] = all_ids

        return ALL_STRUCTURES

    def get_all_ids(self, input_type: str) -> List[str]:
        """Get all ids of a certain `input_type`

        Args:
            input_type (str): `input_type` string

        Raises:
            ValueError: raise an error if the `input_type` isn't in ALL_STRUCTURES

        Returns:
            List[str]: list of IDS of specified `input_type`
        """
        if input_type not in self.ALL_STRUCTURES:
            raise ValueError(f"ALL_STRUCTURES is not yet available for input_type {input_type}")
        return self.ALL_STRUCTURES[input_type]
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "title": "Core PubMed", 4 | "description": "JSON schema for core PubMed collection in RCSB Data Warehouse.", 5 | "required": [ 6 | "rcsb_pubmed_container_identifiers" 7 | ], 8 | "type": "object", 9 | "properties": { 10 | "rcsb_id": { 11 | "type": "string", 12 | "description": "Unique integer value assigned to each PubMed record.", 13 | "rcsb_search_context": [ 14 | "exact-match" 15 | ] 16 | }, 17 | "rcsb_pubmed_container_identifiers": { 18 | "type": "object", 19 | "properties": { 20 | "pubmed_id": { 21 | "type": "integer", 22 | "description": "UID assigned to each PubMed record.", 23 | "rcsb_search_context": [ 24 | "default-match" 25 | ], 26 | "examples": [ 27 | 15937111 28 | ], 29 | "rcsb_description": [ 30 | { 31 | "text": "PubMed ID", 32 | "context": "brief" 33 | } 34 | ], 35 | "rcsb_search_group": [ 36 | { 37 | "group_name": "ID(s) and Keywords", 38 | "priority_order": 18 39 | } 40 | ] 41 | } 42 | }, 43 | "additionalProperties": false 44 | }, 45 | "rcsb_pubmed_central_id": { 46 | "type": "string", 47 | "description": "Unique integer value assigned to each PubMed Central record." 48 | }, 49 | "rcsb_pubmed_doi": { 50 | "type": "string", 51 | "description": "Persistent identifier used to provide a link to an article location on the Internet." 
52 | }, 53 | "rcsb_pubmed_abstract_text": { 54 | "type": "string", 55 | "description": "A concise, accurate and factual mini-version of the paper contents.", 56 | "rcsb_search_context": [ 57 | "full-text" 58 | ], 59 | "rcsb_description": [ 60 | { 61 | "text": "PubMed Abstract", 62 | "context": "brief" 63 | } 64 | ], 65 | "rcsb_search_group": [ 66 | { 67 | "group_name": "Publications Primary", 68 | "priority_order": 30 69 | } 70 | ] 71 | }, 72 | "rcsb_pubmed_affiliation_info": { 73 | "type": "array", 74 | "minItems": 1, 75 | "uniqueItems": true, 76 | "items": { 77 | "type": "string" 78 | }, 79 | "description": "The institution(s) that the author is affiliated with. Multiple affiliations per author are allowed." 80 | }, 81 | "rcsb_pubmed_mesh_descriptors": { 82 | "type": "array", 83 | "minItems": 1, 84 | "uniqueItems": true, 85 | "items": { 86 | "type": "string" 87 | }, 88 | "description": "NLM controlled vocabulary, Medical Subject Headings (MeSH), is used to characterize the content of the articles represented by MEDLINE citations." 
89 | }, 90 | "rcsb_pubmed_mesh_descriptors_lineage": { 91 | "type": "array", 92 | "minItems": 1, 93 | "uniqueItems": true, 94 | "items": { 95 | "type": "object", 96 | "properties": { 97 | "id": { 98 | "type": "string", 99 | "description": "Identifier for MeSH classification term.", 100 | "rcsb_search_context": [ 101 | "exact-match" 102 | ], 103 | "examples": [ 104 | "E01.370.225.500.388", 105 | "H01.181" 106 | ], 107 | "rcsb_description": [ 108 | { 109 | "text": "MeSH Identifier", 110 | "context": "brief" 111 | } 112 | ], 113 | "rcsb_search_group": [ 114 | { 115 | "group_name": "Publications Primary", 116 | "priority_order": 35 117 | } 118 | ] 119 | }, 120 | "name": { 121 | "type": "string", 122 | "description": "MeSH classification term.", 123 | "rcsb_search_context": [ 124 | "exact-match", 125 | "full-text" 126 | ], 127 | "examples": [ 128 | "Chemistry", 129 | "Mammals", 130 | "Therapeutic Uses" 131 | ], 132 | "rcsb_description": [ 133 | { 134 | "text": "MeSH Descriptor", 135 | "context": "brief" 136 | } 137 | ], 138 | "rcsb_search_group": [ 139 | { 140 | "group_name": "Publications Primary", 141 | "priority_order": 36 142 | } 143 | ] 144 | }, 145 | "depth": { 146 | "type": "integer", 147 | "description": "Hierarchy depth.", 148 | "rcsb_search_context": [ 149 | "default-match" 150 | ] 151 | } 152 | }, 153 | "additionalProperties": false 154 | }, 155 | "rcsb_nested_indexing": true, 156 | "description": "Members of the MeSH classification lineage." 157 | } 158 | }, 159 | "additionalProperties": false 160 | } -------------------------------------------------------------------------------- /rcsbapi/dev_tools/update_schema.py: -------------------------------------------------------------------------------- 1 | """Update the distribution json files; for developer use only 2 | 3 | This script updates the search and data API schema files. 4 | After updating, it prints a message about which schemas were updated along with version numbers. 
5 | 6 | Run this before releasing a new version of the rcsb-api package and 7 | copy/paste the printed message into the CHANGELOG if any schemas were updated. 8 | 9 | The endpoints for requesting online schemas and paths for writing the new schema files 10 | are in the .const file. 11 | """ 12 | 13 | import json 14 | from pathlib import Path 15 | from typing import Dict, Literal, List 16 | import requests 17 | 18 | try: 19 | from rcsbapi.search.search_query import SEARCH_SCHEMA # instance of SearchSchema 20 | except Exception: 21 | # ignore errors that may occur parsing the schema 22 | pass 23 | 24 | from rcsbapi.data import DATA_SCHEMA 25 | from rcsbapi.const import const 26 | from rcsbapi.config import config 27 | 28 | 29 | def make_version_dict(file_list: List[str], package: Literal["search", "data"]) -> Dict: 30 | current_version_dict = {} 31 | for f_name in file_list: 32 | path = Path(__file__).parent.parent.joinpath(package, "resources", f_name) 33 | with open(path, "r", encoding="utf-8") as file: 34 | schema = json.load(file) 35 | if "$comment" in schema: 36 | if package == "search": 37 | version = schema["$comment"].lower().replace("schema version: ", "") 38 | else: 39 | version = schema["$comment"].lower().replace("schema_version: ", "") 40 | current_version_dict[f_name] = version 41 | else: 42 | current_version_dict[f_name] = "" 43 | return current_version_dict 44 | 45 | 46 | def update_schema( 47 | f_name: str, 48 | file_url: str, 49 | package: Literal["search", "data"], 50 | ) -> str: 51 | # Define path: py-rcsb-api/rcsbapi//resources/ 52 | path = Path(__file__).parent.parent.joinpath(package, "resources", f_name) 53 | with open(path, "wt", encoding="utf-8") as file: 54 | new_schema = SEARCH_SCHEMA._fetch_schema(file_url) 55 | json.dump(new_schema, file, indent=4) 56 | if "$comment" in new_schema: 57 | if package == "search": 58 | version = new_schema["$comment"].lower().replace("schema version: ", "") 59 | else: 60 | version = 
new_schema["$comment"].lower().replace("schema_version: ", "") 61 | else: 62 | version = "" 63 | return version 64 | 65 | 66 | def make_changelog_msg( 67 | file_list: List[str], 68 | package: Literal["search", "data"], 69 | current_ver_dict: Dict[str, str], 70 | new_ver_dict: Dict[str, str], 71 | ) -> str: 72 | msg = "" 73 | for f_name in file_list: 74 | if (current_ver_dict[f_name] == new_ver_dict[f_name]) or (current_ver_dict[f_name] == ""): 75 | continue 76 | 77 | if not msg: 78 | msg = f"- Update {package} schemas: \n" 79 | msg += f" - {f_name.replace('.json', '')} schema {current_ver_dict[f_name]} -> {new_ver_dict[f_name]}\n" 80 | return msg 81 | 82 | 83 | if __name__ == "__main__": 84 | # Find current schema versions 85 | search_current_ver_dict = make_version_dict( 86 | file_list=[const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME, const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME], 87 | package="search" 88 | ) 89 | data_current_ver_dict = make_version_dict( 90 | file_list=list(const.DATA_API_SCHEMA_ENDPOINT_TO_FILE.values()), 91 | package="data" 92 | ) 93 | 94 | # Update Search API schemas 95 | search_url_to_file = { 96 | const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL: const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME, 97 | const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL: const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME, 98 | } 99 | search_version_dict: dict[str, str] = {} 100 | for url, file_name in search_url_to_file.items(): 101 | search_version = update_schema( 102 | f_name=file_name, 103 | file_url=url, 104 | package="search" 105 | ) 106 | search_version_dict[file_name] = search_version 107 | 108 | # Update Data API schemas 109 | data_version_dict: dict[str, str] = {} 110 | for endpoint, file_name in const.DATA_API_SCHEMA_ENDPOINT_TO_FILE.items(): 111 | data_version = update_schema( 112 | f_name=file_name, 113 | file_url=const.DATA_API_SCHEMA_BASE_URL + endpoint, 114 | package="data" 115 | ) 116 | data_version_dict[file_name] = data_version 
117 | 118 | # Update full GraphQL Data API schema 119 | query = DATA_SCHEMA._get_introspection_query() 120 | schema_response = requests.post(headers={"Content-Type": "application/graphql"}, data=query, url=const.DATA_API_ENDPOINT, timeout=config.API_TIMEOUT) 121 | assert schema_response.status_code == 200 122 | data_schema_path = Path(__file__).parent.parent.joinpath(const.DATA_API_SCHEMA_DIR, const.DATA_API_SCHEMA_FILENAME) 123 | with open(data_schema_path, "wt", encoding="utf-8") as f: 124 | json.dump(schema_response.json(), f, indent=4) 125 | 126 | # Check if search schema version numbers are the same as each other 127 | version_list = list(search_version_dict.values()) 128 | curr_ver_list = list(search_current_ver_dict.values()) 129 | if ( 130 | all(ver == version_list[0] for ver in version_list) 131 | and all(curr_ver == curr_ver_list[0] for curr_ver in curr_ver_list) 132 | ): 133 | if not all(curr_ver == version_list[0] for curr_ver in list(search_current_ver_dict.values())): 134 | print(f"- Update search schemas: {curr_ver_list[0]} -> {version_list[0]}") 135 | else: 136 | print("Search schemas are up-to-date") 137 | else: 138 | # Make search package CHANGELOG message 139 | search_file_list = list(search_version_dict.keys()) 140 | update_msg = make_changelog_msg( 141 | file_list=search_file_list, 142 | package="search", 143 | current_ver_dict=search_current_ver_dict, 144 | new_ver_dict=search_version_dict 145 | ) 146 | if update_msg: 147 | print(update_msg) 148 | else: 149 | print("Data schema are up-to-date") 150 | 151 | # Make data package CHANGELOG message 152 | version_list = list(data_version_dict.values()) 153 | data_file_list = list(data_version_dict.keys()) 154 | update_msg = make_changelog_msg( 155 | file_list=data_file_list, 156 | package="data", 157 | current_ver_dict=data_current_ver_dict, 158 | new_ver_dict=data_version_dict 159 | ) 160 | if update_msg: 161 | print(update_msg) 162 | else: 163 | print("Data schema are up-to-date") 164 | 
-------------------------------------------------------------------------------- /rcsbapi/search/__init__.py: -------------------------------------------------------------------------------- 1 | """RCSB PDB Search API""" 2 | 3 | from typing import List 4 | from .search_query import SEARCH_SCHEMA # noqa: F401 5 | from .search_query import Attr, AttributeQuery, TextQuery 6 | from .search_query import SeqSimilarityQuery, SeqMotifQuery, ChemSimilarityQuery, StructSimilarityQuery, StructMotifResidue, StructMotifQuery 7 | from .search_query import Facet, FacetRange, TerminalFilter, GroupFilter, FilterFacet, Sort, GroupBy, RankingCriteriaType 8 | from .search_query import Group 9 | 10 | search_attributes = SEARCH_SCHEMA.search_attributes 11 | group = Group.group 12 | 13 | 14 | def __dir__() -> List[str]: 15 | return sorted(__all__) 16 | 17 | 18 | __all__ = [ 19 | "search_attributes", 20 | "Attr", 21 | "TextQuery", 22 | "AttributeQuery", 23 | "SeqSimilarityQuery", 24 | "SeqMotifQuery", 25 | "ChemSimilarityQuery", 26 | "StructSimilarityQuery", 27 | "StructMotifResidue", 28 | "StructMotifQuery", 29 | "Facet", 30 | "FacetRange", 31 | "TerminalFilter", 32 | "GroupFilter", 33 | "FilterFacet", 34 | "Sort", # Rename to prevent overlap? 35 | "GroupBy", 36 | "RankingCriteriaType", 37 | ] 38 | -------------------------------------------------------------------------------- /rcsbapi/search/search_schema.py: -------------------------------------------------------------------------------- 1 | """Parse the full RCSB PDB search schema 2 | 3 | Provides access to all valid attributes for search queries. 4 | """ 5 | import os 6 | import json 7 | import logging 8 | from pathlib import Path 9 | import re 10 | import warnings 11 | from typing import List, Union 12 | import requests 13 | from ..const import const 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class SearchSchemaGroup: 19 | """A non-leaf node in the RCSB PDB schema. 
    Leaves are Attr values."""

    def __init__(self, attr_type):
        # attr_type is the leaf class used to recognize leaf nodes during iteration
        self.Attr = attr_type  # Attr or AttrLeaf
        self._members = {}  # Dictionary to store members

    def search(self, pattern: Union[str, re.Pattern], flags=0):
        """Find all attributes in the schema matching a regular expression.

        Args:
            pattern: regular expression (string or precompiled) matched against
                each leaf's full dotted attribute name
            flags: flags passed to re.compile

        Returns:
            A list of Attr objects whose attribute matches.
        """
        matcher = re.compile(pattern, flags=flags)
        # Iterating over self yields every leaf node (see __iter__)
        filter_match = filter(lambda a: matcher.search(a.attribute), self)
        return list(filter_match)

    def list(self):
        """Get a list of full names for all structure and chemical attributes"""
        all_list = []
        for attr in self:
            # Each leaf carries its full dotted name in the "attribute" field
            attr_dict = vars(attr)
            name = attr_dict["attribute"]
            all_list.append(name)
        return all_list

    def __iter__(self):
        """Iterate over all leaf nodes

        Example:
            >>> [a for a in attrs if "stoichiometry" in a.attribute]
            [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
        """

        def leaves(self, attr_type):
            # Depth-first walk of _members: yield leaves, recurse into sub-groups
            for k, v in self._members.items():
                if isinstance(v, attr_type):
                    yield v
                elif isinstance(v, SearchSchemaGroup):
                    yield from iter(v)
                # skips ["Attr"] key in __dict__
                elif v is attr_type:
                    continue
                else:
                    # Shouldn't happen
                    raise TypeError(f"Unrecognized member {k!r}: {v!r}")

        return leaves(self, self.Attr)

    def get_attribute_details(self, attribute: str):
        """Return attribute information given full or partial attribute name

        Args:
            attribute (str): Full or partial attribute name
                (e.g., "rcsb_id", "rcsb_entity_source_organism.scientific_name")

        Returns:
            The matching Attr object for an exact leaf match, a set of leaf
            objects when the path names an inner (group) node, or None (with a
            UserWarning) when a path segment is not found.
        """

        def leaves(d):
            # Recursively yield every leaf below d.
            # NOTE(review): the membership test ("attribute" in v) is applied to
            # sub-groups *and* Attr leaves — confirm Attr supports `in`
            # (i.e. defines __contains__); only SearchSchemaGroup visibly does.
            for v in d.values():
                if "attribute" in v:
                    yield v
                else:
                    yield from leaves(v)

        split_attr = attribute.split(".")
        ptr = self  # dictionary of attributes
        # Descend one dotted segment at a time using dict-style lookup
        for level in split_attr:
            if level not in ptr:
                warnings.warn(f"Attribute path segment '{level}' (for input '{attribute}') not found in schema.", UserWarning)
                return None
            ptr = ptr[level]
        if "attribute" in ptr.__dict__ and getattr(ptr, "attribute") == attribute:  # must be .__dict__ so both SearchSchemaGroup and Attr are compared as dictionaries
            return ptr
        else:
            # Inner node: return every leaf attribute beneath it
            return {c for c in leaves(ptr)}

    def get_attribute_type(self, attribute: str) -> Union[str, None]:
        """Return attribute type given full attribute name

        Args:
            attribute (str): Full attribute name
                (e.g., "rcsb_id", "rcsb_entity_source_organism.scientific_name")

        Returns:
            Union[str, None]: Return search service if there's a match.
                structure search: "text"
                chemical search: "chem_text"
                both: ["text", "chem_text"] (raises error later)
                None (with a UserWarning) when the path is missing or incomplete.
        """
        split_attr = attribute.split(".")
        ptr = self  # dictionary of attributes
        for level in split_attr:
            if level not in ptr:
                warnings.warn(f"Attribute path segment '{level}' (for input '{attribute}') not found in schema.", UserWarning)
                return None
            ptr = ptr[level]
        if "attribute" in ptr.__dict__ and getattr(ptr, "attribute") == attribute:  # must be .__dict__ so both SearchSchemaGroup and Attr are compared as dictionaries
            return getattr(ptr, "type")
        warnings.warn(f"Incomplete attribute path '{attribute}' - must specify fully qualified path to leaf attribute node.", UserWarning)
        return None

    # Below methods are for making SearchSchemaGroup behave as a Dict (be able to access through keys, etc).
    # This is used for automatically determining search service based on attribute name.
124 | 125 | def __getitem__(self, key): 126 | """Allow dictionary-like access to members by key.""" 127 | return self._members[key] 128 | 129 | def __setitem__(self, key, value): 130 | """Set a member in the schema like a dictionary.""" 131 | self._members[key] = value 132 | 133 | def __delitem__(self, key): 134 | """Delete a member from the schema like a dictionary.""" 135 | del self._members[key] 136 | 137 | def __contains__(self, key): 138 | """Check if a member exists in the schema.""" 139 | return key in self._members 140 | 141 | def keys(self): 142 | return self._members.keys() 143 | 144 | def values(self): 145 | return self._members.values() 146 | 147 | def items(self): 148 | return self._members.items() 149 | 150 | def __str__(self): 151 | return "\n".join(f"{key}: {value}" for key, value in self._members.items()) 152 | 153 | def __hash__(self): 154 | """Make the object hashable using the hash of its members.""" 155 | return hash(frozenset(self._members.items())) 156 | 157 | 158 | class SearchSchema: 159 | def __init__( 160 | self, 161 | attr_type, 162 | refetch=True, 163 | use_fallback=True, 164 | reload=True, 165 | struct_attr_schema_url=const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL, 166 | struct_attr_schema_file=os.path.join(const.SEARCH_API_SCHEMA_DIR, const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME), 167 | chem_attr_schema_url=const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL, 168 | chem_attr_schema_file=os.path.join(const.SEARCH_API_SCHEMA_DIR, const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME), 169 | ): 170 | """Initialize SearchSchema object with all known RCSB PDB attributes. 171 | 172 | This is provided to ease autocompletion as compared to creating Attr objects from 173 | strings. 
For example, 174 | :: 175 | 176 | search_attributes.rcsb_nonpolymer_instance_feature_summary.chem_id 177 | 178 | is equivalent to 179 | :: 180 | 181 | Attr('rcsb_nonpolymer_instance_feature_summary.chem_id') 182 | 183 | All attributes in `search_attributes` can be iterated over. 184 | 185 | >>> [a for a in search_attributes if "stoichiometry" in a.attribute] 186 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')] 187 | 188 | Attributes matching a regular expression can also be filtered: 189 | 190 | >>> list(search_attributes.search('rcsb.*stoichiometry')) 191 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]a 192 | """ 193 | self.Attr = attr_type 194 | if reload: 195 | self.struct_schema = self._reload_schema(struct_attr_schema_url, struct_attr_schema_file, refetch, use_fallback) 196 | self.chem_schema = self._reload_schema(chem_attr_schema_url, chem_attr_schema_file, refetch, use_fallback) 197 | self.search_attributes = self._make_schema_group() 198 | 199 | def _reload_schema(self, schema_url: str, schema_file: str, refetch=True, use_fallback=True): 200 | sD = {} 201 | if refetch: 202 | sD = self._fetch_schema(schema_url) 203 | if not sD and use_fallback: 204 | sD = self._load_json_schema(schema_file) 205 | return sD 206 | 207 | def _make_schema_group(self) -> SearchSchemaGroup: 208 | schemas = [(self.struct_schema, const.STRUCTURE_ATTRIBUTE_SEARCH_SERVICE, ""), (self.chem_schema, const.CHEMICAL_ATTRIBUTE_SEARCH_SERVICE, "")] 209 | schema = self._make_group("", schemas) 210 | assert isinstance(schema, SearchSchemaGroup) # for type checking 211 | return schema 212 | 213 | def _fetch_schema(self, url: str): 214 | "Request the current schema from the web" 215 | logger.info("Requesting %s", url) 216 | response = requests.get(url, timeout=None) 217 | if response.status_code == 200: 218 | return response.json() 219 | else: 220 | logger.debug("HTTP response status code %r", response.status_code) 221 | return None 222 | 223 | def _load_json_schema(self, 
schema_file): 224 | logger.info("Loading attribute schema from file") 225 | path = Path(__file__).parent.parent.joinpath(schema_file) 226 | with open(path, "r", encoding="utf-8") as file: 227 | latest = json.load(file) 228 | return latest 229 | 230 | def _make_group(self, fullname: str, nodeL: List): 231 | """Represent this node of the schema as a python object 232 | 233 | Params: 234 | - name: full dot-separated attribute name 235 | 236 | Returns: 237 | An Attr (Leaf nodes) or SearchSchemaGroup (object nodes) 238 | """ 239 | group = SearchSchemaGroup(self.Attr) 240 | for node, attrtype, desc in nodeL: 241 | if "anyOf" in node: 242 | children = {self._make_group(fullname, [(n, attrtype, n.get("description", node.get("description", desc)))]) for n in node["anyOf"]} 243 | # Currently only deal with anyOf in leaf nodes 244 | assert len(children) == 1, f"type of {fullname} couldn't be determined" 245 | return next(iter(children)) 246 | if "oneOf" in node: 247 | children = {self._make_group(fullname, [(n, attrtype, n.get("description", desc))]) for n in node["oneOf"]} 248 | # Currently only deal with oneOf in leaf nodes 249 | assert len(children) == 1, f"type of {fullname} couldn't be determined" 250 | return next(iter(children)) 251 | if "allOf" in node: 252 | children = {self._make_group(fullname, [(n, attrtype, n.get("description", desc))]) for n in node["allOf"]} 253 | # Currently only deal with allOf in leaf nodes 254 | assert len(children) == 1, f"type of {fullname} couldn't be determined" 255 | return next(iter(children)) 256 | if node["type"] in ("string", "number", "integer", "date"): 257 | # For nodes that occur in both schemas, list of both descriptions will be passed in through desc arg 258 | if isinstance(desc, list): 259 | return self.Attr(fullname, attrtype, desc) 260 | # For non-redundant nodes 261 | return self.Attr(fullname, attrtype, node.get("description", desc)) 262 | elif node["type"] == "array": 263 | # skip to items 264 | return 
self._make_group(fullname, [(node["items"], attrtype, node.get("description", desc))]) 265 | elif node["type"] == "object": 266 | for childname, childnode in node["properties"].items(): 267 | fullchildname = f"{fullname}.{childname}" if fullname else childname 268 | # setattr(group, childname, childgroup) 269 | if childname in group: 270 | assert not isinstance(group[childname], dict) # redundant name must not have nested attributes 271 | 272 | # Create attrtype and description lists with existing and current value. 273 | # List type triggers error if user doesn't specify service for redundant attribute. 274 | currentattr = getattr(group[childname], "type") 275 | attrlist = [currentattr, attrtype] 276 | 277 | currentdescript = getattr(group[childname], "description") 278 | descriptlist = [currentdescript, childnode.get("description", desc)] 279 | 280 | childgroup = self._make_group(fullchildname, [(childnode, attrlist, descriptlist)]) 281 | else: 282 | childgroup = self._make_group(fullchildname, [(childnode, attrtype, childnode.get("description", desc))]) 283 | # adding to SearchSchemaGroup as a dict allows for determining search service by attribute name with O(1) lookup 284 | group[childname] = childgroup 285 | 286 | # adding to SearchSchemaGroup as an attribute allows for tab-completion for search_attributes/attrs 287 | setattr(group, childname, childgroup) 288 | else: 289 | raise TypeError(f"Unrecognized node type {node['type']!r} of {fullname}") 290 | return group 291 | 292 | def _set_leaves(self, d: dict) -> dict: 293 | """Converts Attr objects to dictionary format.""" 294 | for leaf in d: 295 | if isinstance(d[leaf], self.Attr): 296 | d[leaf] = d[leaf].__dict__ 297 | else: 298 | d[leaf] = self._set_leaves(d[leaf]) 299 | return d 300 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests >= 2.0.0 2 | rustworkx 3 | 
graphql-core 4 | tqdm 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # use py2.py3 tag for pure-python dist: 3 | universal=1 4 | 5 | [metadata] 6 | description_file = README.md 7 | 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # File: setup.py 2 | # Date: 20-May-2024 3 | # 4 | # Update: 5 | # 6 | import re 7 | 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | 11 | packages = [] 12 | thisPackage = "rcsb-api" 13 | 14 | with open("rcsbapi/__init__.py", "r", encoding="utf-8") as fd: 15 | version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) 16 | 17 | # Load packages from requirements*.txt 18 | with open("requirements.txt", "r", encoding="utf-8") as ifh: 19 | packagesRequired = [ln.strip() for ln in ifh.readlines()] 20 | 21 | with open("README.md", "r", encoding="utf-8") as ifh: 22 | longDescription = ifh.read() 23 | 24 | if not version: 25 | raise RuntimeError("Cannot find version information") 26 | 27 | setup( 28 | name=thisPackage, 29 | version=version, 30 | description="Python package interface for RCSB.org API services", 31 | long_description_content_type="text/markdown", 32 | long_description=longDescription, 33 | python_requires=">=3.8,<4.0", 34 | author="Dennis Piehl", 35 | author_email="dennis.piehl@rcsb.org", 36 | url="https://github.com/rcsb/py-rcsb-api", 37 | # 38 | license="MIT", 39 | classifiers=[ 40 | "Programming Language :: Python", 41 | "Programming Language :: Python :: 3", 42 | "Programming Language :: Python :: 3 :: Only", 43 | "Programming Language :: Python :: 3.8", 44 | "Programming Language :: Python :: 3.9", 45 | "Programming Language :: Python :: 3.10", 46 | "Programming Language :: Python 
:: 3.11", 47 | "Programming Language :: Python :: 3.12", 48 | "Programming Language :: Python :: 3.13", 49 | "Development Status :: 4 - Beta", 50 | # 'Development Status :: 5 - Production/Stable', 51 | "Operating System :: OS Independent", 52 | "Intended Audience :: Science/Research", 53 | "Topic :: Scientific/Engineering :: Bio-Informatics", 54 | "Natural Language :: English", 55 | "License :: OSI Approved :: MIT License", 56 | "Typing :: Typed", 57 | ], 58 | entry_points={"console_scripts": []}, 59 | # 60 | install_requires=packagesRequired, 61 | packages=find_packages(exclude=["tests", "tests-*", "tests.*"]), 62 | package_data={ 63 | # If any package contains *.md or *.rst ... files, include them: 64 | "": ["*.md", "*.rst", "*.txt", "*.cfg", "rcsbapi/*/resources/*"] 65 | }, 66 | # 67 | test_suite="tests", 68 | tests_require=["tox"], 69 | # 70 | # Not configured ... 71 | extras_require={ 72 | "dev": ["check-manifest"], 73 | "test": ["coverage"], 74 | "tests": ["tox", "pylint", "black>=21.5b1", "flake8"], 75 | # should match docs/requirements.txt 76 | "docs": ["sphinx", "sphinx-rtd-theme", "myst-parser"], 77 | }, 78 | # Added for 79 | command_options={"build_sphinx": {"project": ("setup.py", thisPackage), "version": ("setup.py", version), "release": ("setup.py", version)}}, 80 | # This setting for namespace package support - 81 | zip_safe=False, 82 | ) 83 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/__init__.py -------------------------------------------------------------------------------- /tests/test-data/4hhb-assembly1.cif.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/4hhb-assembly1.cif.gz -------------------------------------------------------------------------------- /tests/test-data/4hhb.bcif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/4hhb.bcif -------------------------------------------------------------------------------- /tests/test-data/4hhb.pdb1.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/4hhb.pdb1.gz -------------------------------------------------------------------------------- /tests/test-data/7n0r.cif.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/7n0r.cif.gz -------------------------------------------------------------------------------- /tests/test-data/7n0r.pdb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcsb/py-rcsb-api/20478cf70eb6f84a30bb1c26fe2ccd8a97dbb276/tests/test-data/7n0r.pdb.gz -------------------------------------------------------------------------------- /tests/test-data/invalid.txt: -------------------------------------------------------------------------------- 1 | This is an invalid file that should cause the program to throw a type error when this is passed through the file upload function. 
-------------------------------------------------------------------------------- /tests/test_data_query.py: -------------------------------------------------------------------------------- 1 | ## 2 | # File: testquery.py 3 | # Author: 4 | # Date: 5 | # Version: 6 | # 7 | # Update: 8 | # 9 | # 10 | ## 11 | """ 12 | Tests for all functions of the schema file. (Work in progress) 13 | """ 14 | 15 | __docformat__ = "google en" 16 | __author__ = "" 17 | __email__ = "" 18 | __license__ = "" 19 | 20 | import logging 21 | 22 | # import importlib 23 | # import platform 24 | # import resource 25 | import time 26 | import unittest 27 | import requests 28 | 29 | from rcsbapi.search import search_attributes as attrs 30 | from rcsbapi.data import DataSchema, DataQuery 31 | from rcsbapi.config import config 32 | from rcsbapi.const import const 33 | 34 | logger = logging.getLogger(__name__) 35 | logger.setLevel(logging.INFO) 36 | 37 | 38 | class QueryTests(unittest.TestCase): 39 | def setUp(self): 40 | self.__startTime = time.time() 41 | logger.info("Starting %s at %s", self.id().split(".")[-1], time.strftime("%Y %m %d %H:%M:%S", time.localtime())) 42 | 43 | def tearDown(self) -> None: 44 | endTime = time.time() 45 | logger.info("Completed %s at %s (%.4f seconds)", self.id().split(".")[-1], time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) 46 | 47 | def testGetEditorLink(self): 48 | # query_str = '{ entries(entry_ids: ["4HHB", "1IYE"]) {\n exptl {\n method_details\n method\n details\n crystals_number\n }\n}}' 49 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": ["4HHB", "1IYE"]}, return_data_list=["exptl"]) 50 | url = query_obj.get_editor_link() 51 | response_json = requests.get(url, timeout=10) 52 | self.assertEqual(response_json.status_code, 200) 53 | 54 | def testExec(self): 55 | with self.subTest("1. 
Batching into requests with fewer Ids"): 56 | input_ids = [] 57 | for _ in range(165): 58 | input_ids.append("4HHB") 59 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": input_ids}, return_data_list=["exptl"]) 60 | query_obj.exec() 61 | # assert that the batch and merge functions are called 62 | # assert len of results is same as num of input ids 63 | 64 | def testLowercaseIds(self): 65 | with self.subTest(msg="1. List of IDs"): 66 | try: 67 | query_obj = DataQuery(input_type="entries", input_ids=["4hhb"], return_data_list=["exptl.method"]) 68 | query_obj.exec() 69 | except Exception as error: 70 | self.fail(f"Failed unexpectedly: {error}") 71 | with self.subTest(msg="2. Dictionary of IDs"): 72 | try: 73 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": ["4hhb", "1iye"]}, return_data_list=["exptl"]) 74 | query_obj.exec() 75 | except Exception as error: 76 | self.fail(f"Failed unexpectedly: {error}") 77 | with self.subTest(msg="2. IDs with separators"): 78 | try: 79 | query_obj = DataQuery(input_type="interfaces", input_ids=["4hhb-1.1"], return_data_list=["rcsb_interface_info.interface_area"]) 80 | query_obj.exec() 81 | except Exception as error: 82 | self.fail(f"Failed unexpectedly: {error}") 83 | with self.subTest(msg="3. Pubmed IDs"): 84 | try: 85 | query_obj = DataQuery(input_type="pubmed", input_ids=["6726807"], return_data_list=["rcsb_pubmed_doi"]) 86 | query_obj.exec() 87 | except Exception as error: 88 | self.fail(f"Failed unexpectedly: {error}") 89 | with self.subTest(msg="3. 
UniProt IDs"): 90 | try: 91 | query_obj = DataQuery(input_type="uniprot", input_ids=["p68871"], return_data_list=["rcsb_id"]) 92 | query_obj.exec() 93 | except Exception as error: 94 | self.fail(f"Failed unexpectedly: {error}") 95 | 96 | def testParseGQLError(self): 97 | pass 98 | 99 | def testBatchIDs(self): 100 | input_ids = [] 101 | for _ in range(165): 102 | input_ids.append("4HHB") 103 | query_obj = DataQuery(input_type="entries", input_ids={"entry_ids": input_ids}, return_data_list=["exptl"]) 104 | batch_size = 50 105 | batched_ids = query_obj._batch_ids(batch_size) 106 | total_ids = 0 107 | for batch in batched_ids: 108 | len_id_batch = len(batch) 109 | self.assertLessEqual(len_id_batch, batch_size) 110 | total_ids += len_id_batch 111 | self.assertEqual(len(query_obj.get_input_ids()), total_ids) 112 | 113 | def testMergeResponse(self): 114 | # assert that the lengths are combined and all ids are present? 115 | pass 116 | 117 | def testDocs(self): 118 | with self.subTest(msg="1. Initialize Schema"): 119 | schema = DataSchema() 120 | 121 | with self.subTest(msg="2. README 1"): 122 | try: 123 | query_obj = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl.method"]) 124 | query_obj.exec() 125 | except Exception as error: 126 | self.fail(f"Failed unexpectedly: {error}") 127 | 128 | with self.subTest(msg="3. README 2"): 129 | try: 130 | query_obj = DataQuery( 131 | input_type="polymer_entities", 132 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"], 133 | return_data_list=[ 134 | "polymer_entities.rcsb_id", 135 | "rcsb_entity_source_organism.ncbi_taxonomy_id", 136 | "rcsb_entity_source_organism.ncbi_scientific_name", 137 | "cluster_id", 138 | "identity", 139 | ], 140 | ) 141 | query_obj.exec() 142 | except Exception as error: 143 | self.fail(f"Failed unexpectedly: {error}") 144 | 145 | with self.subTest(msg="4. 
Quickstart 1"): 146 | try: 147 | query_obj = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl.method"]) 148 | query_obj.exec() 149 | except Exception as error: 150 | self.fail(f"Failed unexpectedly: {error}") 151 | 152 | with self.subTest(msg="5. Quickstart 2, autocompletion"): 153 | try: 154 | query_obj = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl"]) 155 | query_obj.exec() 156 | except Exception as error: 157 | self.fail(f"Failed unexpectedly: {error}") 158 | 159 | with self.subTest(msg="4. Helpful methods, get_editor_link()"): 160 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["exptl"]) 161 | response = requests.get(query.get_editor_link(), timeout=5) 162 | self.assertEqual(response.status_code, 200) 163 | 164 | with self.subTest(msg="5. Helpful methods, find_paths()"): 165 | try: 166 | schema.find_paths(input_type="entries", return_data_name="id") 167 | except Exception as error: 168 | self.fail(f"Failed unexpectedly: {error}") 169 | 170 | with self.subTest(msg="6. Helpful methods, get_input_id_dict"): 171 | test_dict = schema.get_input_id_dict("polymer_entity_instance") 172 | polymer_instance_keys = ["entry_id", "asym_id"] 173 | for key in polymer_instance_keys: 174 | self.assertIn(key, test_dict.keys()) 175 | for value in test_dict.values(): 176 | self.assertIsNotNone(value) 177 | 178 | with self.subTest(msg="7. Troubleshooting, Not a unique field"): 179 | with self.assertRaises(ValueError): 180 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["id"]) 181 | try: 182 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["entry.id"]) 183 | except Exception as error: 184 | self.fail(f"Failed unexpectedly: {error}") 185 | 186 | def testAddExamples(self): 187 | with self.subTest(msg="1. 
Entries"): 188 | try: 189 | query = DataQuery(input_type="entries", input_ids=["1STP", "2JEF", "1CDG"], return_data_list=["entries.rcsb_id", "struct.title", "exptl.method"]) 190 | query.exec() 191 | except Exception as error: 192 | self.fail(f"Failed unexpectedly: {error}") 193 | 194 | with self.subTest(msg="2. Primary Citation"): 195 | try: 196 | query = DataQuery( 197 | input_type="entries", 198 | input_ids=["1STP", "2JEF", "1CDG"], 199 | return_data_list=[ 200 | "entries.rcsb_id", 201 | "rcsb_accession_info.initial_release_date", 202 | "audit_author.name", 203 | "rcsb_primary_citation.pdbx_database_id_PubMed", 204 | "rcsb_primary_citation.pdbx_database_id_DOI", 205 | ], 206 | ) 207 | query.exec() 208 | except Exception as error: 209 | self.fail(f"Failed unexpectedly: {error}") 210 | 211 | with self.subTest(msg="3. Polymer Entities"): 212 | try: 213 | query = DataQuery( 214 | input_type="polymer_entities", 215 | input_ids=["2CPK_1", "3WHM_1", "2D5Z_1"], 216 | return_data_list=[ 217 | "polymer_entities.rcsb_id", 218 | "rcsb_entity_source_organism.ncbi_taxonomy_id", 219 | "rcsb_entity_source_organism.ncbi_scientific_name", 220 | "cluster_id", 221 | "identity", 222 | ], 223 | ) 224 | query.exec() 225 | except Exception as error: 226 | self.fail(f"Failed unexpectedly: {error}") 227 | 228 | with self.subTest(msg="4. Polymer Instances"): 229 | try: 230 | query = DataQuery( 231 | input_type="polymer_entity_instances", 232 | input_ids=["4HHB.A", "12CA.A", "3PQR.A"], 233 | return_data_list=[ 234 | "polymer_entity_instances.rcsb_id", 235 | "rcsb_polymer_instance_annotation.annotation_id", 236 | "rcsb_polymer_instance_annotation.name", 237 | "rcsb_polymer_instance_annotation.type", 238 | ], 239 | ) 240 | query.exec() 241 | except Exception as error: 242 | self.fail(f"Failed unexpectedly: {error}") 243 | 244 | with self.subTest(msg="5. 
Carbohydrates"): 245 | try: 246 | query = DataQuery( 247 | input_type="branched_entities", 248 | input_ids=["5FMB_2", "6L63_3"], 249 | return_data_list=["pdbx_entity_branch.type", "pdbx_entity_branch_descriptor.type", "pdbx_entity_branch_descriptor.descriptor"], 250 | ) 251 | query.exec() 252 | except Exception as error: 253 | self.fail(f"Failed unexpectedly: {error}") 254 | 255 | with self.subTest(msg="6. Sequence Positional Features"): 256 | try: 257 | query = DataQuery( 258 | input_type="polymer_entity_instances", 259 | input_ids={"instance_ids": ["1NDO.A"]}, 260 | return_data_list=[ 261 | "polymer_entity_instances.rcsb_id", 262 | "rcsb_polymer_instance_feature.type", 263 | "rcsb_polymer_instance_feature.feature_positions.beg_seq_id", 264 | "rcsb_polymer_instance_feature.feature_positions.end_seq_id", 265 | ], 266 | ) 267 | query.exec() 268 | except Exception as error: 269 | self.fail(f"Failed unexpectedly: {error}") 270 | 271 | with self.subTest(msg="7. Reference Sequence Identifiers"): 272 | try: 273 | query = DataQuery( 274 | input_type="entries", 275 | input_ids=["7NHM", "5L2G"], 276 | return_data_list=[ 277 | "entries.rcsb_id", 278 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession", 279 | "polymer_entities.rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name", 280 | ], 281 | ) 282 | query.exec() 283 | except Exception as error: 284 | self.fail(f"Failed unexpectedly: {error}") 285 | 286 | with self.subTest(msg="8. 
Chemical Components"): 287 | try: 288 | query = DataQuery( 289 | input_type="chem_comps", 290 | input_ids=["NAG", "EBW"], 291 | return_data_list=[ 292 | "chem_comps.rcsb_id", 293 | "chem_comp.type", 294 | "chem_comp.formula_weight", 295 | "chem_comp.name", 296 | "chem_comp.formula", 297 | "rcsb_chem_comp_info.initial_release_date", 298 | ], 299 | ) 300 | query.exec() 301 | except Exception as error: 302 | self.fail(f"Failed unexpectedly: {error}") 303 | 304 | with self.subTest(msg="9. Computed Structure Models"): 305 | try: 306 | query = DataQuery(input_type="entries", input_ids=["AF_AFP68871F1"], return_data_list=["ma_qa_metric_global.type", "ma_qa_metric_global.value"]) 307 | query.exec() 308 | except Exception as error: 309 | self.fail(f"Failed unexpectedly: {error}") 310 | 311 | def testQuickstartNotebook(self): 312 | with self.subTest(msg="1. Initialize Schema"): 313 | schema = DataSchema() 314 | with self.subTest(msg="2. GraphQL example query"): 315 | query = """ 316 | { 317 | entry(entry_id: "4HHB") { 318 | rcsb_entry_info { 319 | nonpolymer_bound_components 320 | } 321 | } 322 | } 323 | """ 324 | response_json = requests.post(headers={"Content-Type": "application/graphql"}, data=query, url=const.DATA_API_ENDPOINT, timeout=config.API_TIMEOUT).json() 325 | self.assertNotIn("errors", response_json.keys()) 326 | with self.subTest(msg="4. Making Queries"): 327 | try: 328 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["nonpolymer_bound_components"]) 329 | query.exec() 330 | except Exception as error: 331 | self.fail(f"Failed unexpectedly: {error}") 332 | with self.subTest(msg="5. input_ids, mult args"): 333 | try: 334 | query = DataQuery(input_type="polymer_entity_instances", input_ids=["4HHB.A"], return_data_list=["nonpolymer_bound_components"]) 335 | query.exec() 336 | except Exception as error: 337 | self.fail(f"Failed unexpectedly: {error}") 338 | with self.subTest(msg="6. 
input_ids, list as entry input_ids"): 339 | try: 340 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["nonpolymer_bound_components"]) 341 | query.exec() 342 | except Exception as error: 343 | self.fail(f"Failed unexpectedly: {error}") 344 | with self.subTest(msg="7. input_ids, list as polymer instance input_ids"): 345 | try: 346 | query = DataQuery(input_type="polymer_entity_instances", input_ids=["4HHB.A"], return_data_list=["nonpolymer_bound_components"]) 347 | query.exec() 348 | except Exception as error: 349 | self.fail(f"Failed unexpectedly: {error}") 350 | with self.subTest(msg="8. return_data_list, Not a unique field error"): 351 | with self.assertRaises(ValueError): 352 | query = DataQuery(input_type="polymer_entity_instances", input_ids=["4HHB.A"], return_data_list=["polymer_composition"]) 353 | query.exec() 354 | with self.subTest(msg="9. return_data_list, find_paths() methods"): 355 | try: 356 | schema = DataSchema() 357 | schema.find_paths("polymer_entity_instances", "polymer_composition") 358 | except Exception as error: 359 | self.fail(f"Failed unexpectedly: {error}") 360 | with self.subTest(msg="10. return_data_list, corrected query with non-redundant field"): 361 | try: 362 | query = DataQuery(input_type="entries", input_ids=["4HHB"], return_data_list=["rcsb_entry_info.polymer_composition"]) 363 | query.exec() 364 | except Exception as error: 365 | self.fail(f"Failed unexpectedly: {error}") 366 | with self.subTest(msg="11. find_field_names()"): 367 | try: 368 | schema.find_field_names("polymer_composition") 369 | except Exception as error: 370 | self.fail(f"Failed unexpectedly: {error}") 371 | try: 372 | schema.find_field_names("comp") 373 | except Exception as error: 374 | self.fail(f"Failed unexpectedly: {error}") 375 | with self.subTest(msg="12. 
More complex queries, multiple ids"): 376 | try: 377 | query = DataQuery(input_type="entries", input_ids=["4HHB", "12CA", "3PQR"], return_data_list=["nonpolymer_bound_components"]) 378 | query.exec() 379 | except Exception as error: 380 | self.fail(f"Failed unexpectedly: {error}") 381 | with self.subTest(msg="13. More complex queries, multiple return data"): 382 | try: 383 | query = DataQuery( 384 | input_type="entries", input_ids=["4HHB"], return_data_list=["citation.title", "nonpolymer_bound_components", "rcsb_entry_info.polymer_composition"] 385 | ) 386 | query.exec() 387 | except Exception as error: 388 | self.fail(f"Failed unexpectedly: {error}") 389 | 390 | def testSearchDataNotebook(self): 391 | with self.subTest(msg="1. Construct search API query and request"): 392 | # search API query and request 393 | try: 394 | q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == "COVID-19 virus" 395 | q2 = attrs.rcsb_nonpolymer_entity_annotation.type == "SUBJECT_OF_INVESTIGATION" 396 | q3 = attrs.rcsb_polymer_entity_feature_summary.type == "modified_monomer" 397 | query = q1 & q2 & q3 398 | result_list = query() 399 | except Exception as error: 400 | self.fail(f"Failed unexpectedly: {error}") 401 | self.assertGreaterEqual(len(list(result_list)), 10) 402 | with self.subTest(msg="2. 
Construct data API query and parse result"): 403 | try: 404 | data_query = DataQuery( 405 | input_type="entries", 406 | # input ids removed because "rcsb_nonpolymer_instance_validation_score" is None: "6W61", "7ARF", "7JPZ", "7JQ3" 407 | input_ids=["7AWU", "7C8B", "7JP0", "7JQ0", "7JQ1", "7JQ2"], 408 | return_data_list=[ 409 | "entries.rcsb_id", 410 | "rcsb_nonpolymer_entity_instance_container_identifiers.comp_id", 411 | "is_subject_of_investigation", 412 | "citation.title", 413 | "citation.pdbx_database_id_DOI", 414 | ], 415 | ) 416 | data_query.exec() 417 | except Exception as error: 418 | self.fail(f"Failed unexpectedly: {error}") 419 | try: 420 | json = data_query.get_response()["data"]["entries"] 421 | json[0]["rcsb_id"] 422 | json[0]["nonpolymer_entities"] 423 | json[0]["nonpolymer_entities"][0]["nonpolymer_entity_instances"] 424 | json[0]["nonpolymer_entities"][0]["nonpolymer_entity_instances"][0]["rcsb_nonpolymer_instance_validation_score"][0]["is_subject_of_investigation"] 425 | json[0]["nonpolymer_entities"][0]["nonpolymer_entity_instances"][0]["rcsb_nonpolymer_entity_instance_container_identifiers"]["comp_id"] 426 | json[0]["citation"][0]["title"] 427 | json[0]["citation"][0]["pdbx_database_id_DOI"] 428 | except Exception as error: 429 | self.fail(f"Failed unexpectedly: {error}") 430 | 431 | def testAllStructures(self): 432 | from rcsbapi.data import ALL_STRUCTURES 433 | 434 | with self.subTest("1. Test entries ALL_STRUCTURES"): 435 | try: 436 | data_query = DataQuery( 437 | input_type="entries", 438 | input_ids=ALL_STRUCTURES, 439 | return_data_list=["exptl.method"], 440 | ) 441 | data_query.exec() 442 | except Exception as error: 443 | self.fail(f"Failed unexpectedly: {error}") 444 | 445 | with self.subTest("2. 
Test chem_comps ALL_STRUCTURES"): 446 | try: 447 | data_query = DataQuery( 448 | input_type="chem_comps", 449 | input_ids=ALL_STRUCTURES, 450 | return_data_list=["chem_comps.rcsb_id"], 451 | ) 452 | data_query.exec() 453 | except Exception as error: 454 | self.fail(f"Failed unexpectedly: {error}") 455 | 456 | 457 | def buildQuery(): 458 | suiteSelect = unittest.TestSuite() 459 | suiteSelect.addTest(QueryTests("testGetEditorLink")) 460 | suiteSelect.addTest(QueryTests("testExec")) 461 | suiteSelect.addTest(QueryTests("testLowercaseIds")) 462 | suiteSelect.addTest(QueryTests("testBatchIDs")) 463 | suiteSelect.addTest(QueryTests("testDocs")) 464 | suiteSelect.addTest(QueryTests("testAddExamples")) 465 | suiteSelect.addTest(QueryTests("testQuickstartNotebook")) 466 | suiteSelect.addTest(QueryTests("testSearchDataNotebook")) 467 | suiteSelect.addTest(QueryTests("testAllStructures")) 468 | return suiteSelect 469 | 470 | 471 | if __name__ == "__main__": 472 | mySuite = buildQuery() 473 | unittest.TextTestRunner(verbosity=2).run(mySuite) 474 | -------------------------------------------------------------------------------- /tests/test_search_schema.py: -------------------------------------------------------------------------------- 1 | ## 2 | # File: testschema.py 3 | # Author: Spencer Bliven/Santiago Blaumann 4 | # Date: 6/7/23 5 | # Version: 1.0 6 | # 7 | # Update: 8 | # 9 | # 10 | ## 11 | """ 12 | Tests for all functions of the schema file. 
13 | """ 14 | 15 | __docformat__ = "google en" 16 | __author__ = "Santiago Blaumann" 17 | __email__ = "santiago.blaumann@rcsb.org" 18 | __license__ = "BSD 3-Clause" 19 | 20 | import logging 21 | import platform 22 | import resource 23 | import time 24 | import unittest 25 | import os 26 | 27 | from rcsbapi.search import search_attributes as attrs 28 | from rcsbapi.search import SEARCH_SCHEMA 29 | from rcsbapi.const import const 30 | 31 | logger = logging.getLogger(__name__) 32 | logger.setLevel(logging.INFO) 33 | 34 | 35 | class SchemaTests(unittest.TestCase): 36 | def setUp(self): 37 | self.__startTime = time.time() 38 | logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) 39 | 40 | def tearDown(self): 41 | unitS = "MB" if platform.system() == "Darwin" else "GB" 42 | rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 43 | logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS) 44 | endTime = time.time() 45 | logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) 46 | 47 | def testSchema(self): 48 | ok = attrs.rcsb_id.attribute == "rcsb_id" 49 | self.assertTrue(ok) 50 | ok2 = attrs.rcsb_struct_symmetry.symbol.attribute == "rcsb_struct_symmetry.symbol" 51 | self.assertTrue(ok2) 52 | logger.info("Schema test results: ok : (%r), ok2: (%r)", ok, ok2) 53 | 54 | def testSchemaVersion(self): 55 | # Check structure attribute schema version 56 | webSchema = SEARCH_SCHEMA._fetch_schema(const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL) 57 | localSchema = SEARCH_SCHEMA._load_json_schema(os.path.join(const.SEARCH_API_SCHEMA_DIR, const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_FILENAME)) 58 | webVer = webSchema.get("$comment").split()[-1] 59 | localVer = localSchema.get("$comment").split()[-1] 60 | ok = len(localVer.split(".")) == 3 and len(webVer.split(".")) == 3 61 | self.assertTrue(ok) 62 | logger.info("ok is %r", 
ok) 63 | webVerMajorMinor = float(".".join(webVer.split(".")[0:2])) 64 | localVerMajorMinor = float(".".join(localVer.split(".")[0:2])) 65 | ok = localVerMajorMinor <= webVerMajorMinor and localVerMajorMinor >= webVerMajorMinor - 0.10 66 | logger.info("ok is %r", ok) 67 | self.assertTrue(ok) 68 | logger.info("Metadata schema tests results: local version (%r) and web version (%s)", localVer, webVer) 69 | # Check chemical attribute schema version 70 | webSchema = SEARCH_SCHEMA._fetch_schema(const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL) 71 | localSchema = SEARCH_SCHEMA._load_json_schema(os.path.join(const.SEARCH_API_SCHEMA_DIR, const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_FILENAME)) 72 | webVer = webSchema.get("$comment").split()[-1] 73 | localVer = localSchema.get("$comment").split()[-1] 74 | ok = len(localVer.split(".")) == 3 and len(webVer.split(".")) == 3 75 | self.assertTrue(ok) 76 | logger.info("ok is %r", ok) 77 | webVerMajorMinor = float(".".join(webVer.split(".")[0:2])) 78 | localVerMajorMinor = float(".".join(localVer.split(".")[0:2])) 79 | ok = localVerMajorMinor <= webVerMajorMinor and localVerMajorMinor >= webVerMajorMinor - 0.10 80 | logger.info("ok is %r", ok) 81 | self.assertTrue(ok) 82 | logger.info("Chemical schema tests results: local version (%r) and web version (%s)", localVer, webVer) 83 | 84 | def testFetchSchema(self): 85 | # check fetching of structure attribute schema 86 | fetchSchema = SEARCH_SCHEMA._fetch_schema(const.SEARCH_API_STRUCTURE_ATTRIBUTE_SCHEMA_URL) 87 | ok = fetchSchema is not None 88 | logger.info("ok is %r", ok) 89 | self.assertTrue(ok) 90 | fetchSchema = SEARCH_SCHEMA._fetch_schema(const.SEARCH_API_CHEMICAL_ATTRIBUTE_SCHEMA_URL) 91 | ok = fetchSchema is not None 92 | logger.info("ok is %r", ok) 93 | self.assertTrue(ok) 94 | errorURL = "https://httpbin.org/status/404" 95 | fetchSchema = SEARCH_SCHEMA._fetch_schema(errorURL) 96 | ok = fetchSchema is None 97 | logger.info("ok is %r", ok) 98 | self.assertTrue(ok) 99 | 100 | def 
testRcsbAttrs(self): 101 | with self.subTest(msg="1. Check type and descriptions exist for attributes"): 102 | for attr in attrs: 103 | attr_dict = vars(attr) 104 | desc = attr_dict["description"] 105 | self.assertIsNotNone(desc) 106 | 107 | with self.subTest(msg="2. Check searching for attribute details"): 108 | attr_details = attrs.get_attribute_details("drugbank_info.drug_groups") 109 | for obj_attr in ["attribute", "type", "description"]: 110 | self.assertIn(obj_attr, vars(attr_details).keys()) 111 | 112 | # special case because rcsb_id is in both structure and chemical attributes 113 | attr_dict = vars(attrs.get_attribute_details("rcsb_id")) 114 | self.assertIsInstance(attr_dict["type"], list) 115 | self.assertIsInstance(attr_dict["description"], list) 116 | 117 | attr_details = attrs.get_attribute_details("foo") 118 | self.assertIsNone(attr_details) 119 | 120 | 121 | def buildSchema(): 122 | suiteSelect = unittest.TestSuite() 123 | suiteSelect.addTest(SchemaTests("testSchema")) 124 | suiteSelect.addTest(SchemaTests("testSchemaVersion")) 125 | suiteSelect.addTest(SchemaTests("testFetchSchema")) 126 | suiteSelect.addTest(SchemaTests("testRcsbAttrs")) 127 | 128 | return suiteSelect 129 | 130 | 131 | if __name__ == "__main__": 132 | mySuite = buildSchema() 133 | unittest.TextTestRunner(verbosity=2).run(mySuite) 134 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # File: tox.ini (Templated version) 2 | # 3 | [local_settings] 4 | # project specific local settings 5 | test_pattern = "test*.py" 6 | # 7 | # Source pathls (unquoted space separated list of files/directories) for linting and format checks 8 | source_paths = rcsbapi setup.py 9 | # 10 | # Start directory path for test discovery 11 | # Each path must reference valid directory that is searchable by Python (i.e. 
contains __init__.py)
# ** It is harmless to point to paths containing no tests.
#
test_path_1 = "tests"
# These are placeholders valid source directories without tests files
# test_path_2 = "rcsbapi/data"
# test_path_3 = "rcsbapi"
# test_path_4 = "rcsbapi"
#
# Comma-separated list of directories for which test coverage will be evaluated
coverage_source_paths = "rcsbapi,tests"
# Uncommented: the test_coverage commands substitute {[local_settings]coverage_exclude_paths},
# which fails if the key is undefined.
coverage_exclude_paths = "rcsbapi/__init__.py"
coverage_cutoff = 65
#
## --------------------------------------------------------------------------
## ----------   No project specific setting beyond this point --------------
##
[tox]
# The complete list of supported test environments to setup and invoke
envlist = format_pep8-{py39}, lint_pylint-{py39}, format_black-{py39}, py{39}, test_coverage-{py39}
#
minversion = 3.4.0
skip_missing_interpreters = true
skipsdist = false

[testenv]
passenv = CONFIG_SUPPORT_TOKEN_ENV
allowlist_externals = echo
commands =
    echo "Starting default tests in testenv"
basepython = py39: python3.9

[testenv:py39]
description = 'Run unit tests (unittest runner) using {envpython}'
platform =
    macos: darwin
    linux: linux
skip_install = false
recreate = true
alwayscopy = true
package = editable-legacy
deps =
    -r requirements.txt
commands =
    echo "Starting {envname}"
    {envpython} -V
    {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_1} --pattern "{[local_settings]test_pattern}"
    # {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_2} --pattern "{[local_settings]test_pattern}"
    # {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_3} --pattern "{[local_settings]test_pattern}"
    # {envpython} -m unittest discover -v --start-directory {[local_settings]test_path_4} --pattern "{[local_settings]test_pattern}"
    echo "Completed {envname}"

#
[testenv:format_pep8-py39]
description = 'Run selected PEP8 compliance checks (flake8)'
platform =
    macos: darwin
    linux: linux
deps =
    flake8
    # This plugin is no longer compatible with latest pydocstyles -
    # flake8-docstrings>=0.2.7
    flake8-import-order>=0.9
    -r requirements.txt
commands =
    # Exceptions: D for docstrings, I for imports order and formatting, E203 is whitespace before slice colon - W503 multiline spacing incompatible with black
    flake8 --max-line-length=185 --ignore=D,I,E203,W503 {[local_settings]source_paths}

#
[testenv:lint_pylint-py39]
description = 'Run linting compliance checks (pylint)'
platform =
    macos: darwin
    linux: linux
deps =
    pylint
    -r requirements.txt
commands =
    echo "Starting {envname}"
    pylint --disable=R,C --reports=n --rcfile={toxinidir}/pylintrc {[local_settings]source_paths}
    echo "Completed {envname}"

#
[testenv:format_black-py39]
description = 'Run format compliance checks (black)'
platform =
    macos: darwin
    linux: linux
deps =
    black>=21.5b1
    -r requirements.txt
    # isort>=4.3.20
commands =
    echo "Starting {envname}"
    black --check --line-length 185 {[local_settings]source_paths}
    # isort -rc rcsb/utils --check-only
    echo "Completed {envname}"

#
[testenv:test_coverage-py39]
description = 'Run test coverage analysis'
platform =
    macos: darwin
    linux: linux
recreate = true
alwayscopy = true
package = editable-legacy
deps =
    coverage
    -r requirements.txt

commands =
    echo "Starting {envname}"
    coverage erase
    coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_1} --pattern "{[local_settings]test_pattern}"
    # coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_2} --pattern "{[local_settings]test_pattern}"
    # coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_3} --pattern "{[local_settings]test_pattern}"
    # coverage run --parallel-mode --omit="{[local_settings]coverage_exclude_paths}" --source="{[local_settings]coverage_source_paths}" -m unittest discover -v --start-directory {[local_settings]test_path_4} --pattern "{[local_settings]test_pattern}"
    echo " ------- Consolidating {envname} data ----------"
    coverage combine
    echo " ------- Building {envname} reports ----------"
    coverage report --fail-under={[local_settings]coverage_cutoff}
    - coverage xml
    echo "Completed {envname}"
--------------------------------------------------------------------------------