├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.md ├── make.bat ├── queries.md ├── quickstart.md └── requirements.txt ├── environment.yml ├── notebooks ├── covid.ipynb └── quickstart.ipynb ├── pytest.ini ├── rcsbsearch ├── __init__.py ├── resources │ └── metadata_schema.json ├── schema.py ├── search.py └── update_schema.py ├── requirements.txt ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_schema.py └── test_search.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: 14 | - pdf 15 | 16 | python: 17 | version: 3.7 18 | install: 19 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.7" 4 | - "3.8" 5 | - "3.9" 6 | jobs: 7 | include: 8 | - name: lint 9 | script: tox -e lint 10 | install: 11 | - pip install tox-travis 12 | script: 13 | - tox 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v0.2.3 (2021-04-28) 4 | 5 | - Fix mug with missing schema files when installed via pip 6 | - Add jupyter notebooks 7 | - Try rcsbsearch live with binder 8 | 9 | ## v0.2.2 (2021-04-06) 10 | 11 | - Remove `in` operator syntax (incompatible with python spec) 12 | - Fix import error due to schema change 13 | - Ship schema with the package for stability and performance 14 | 15 | ## v0.2.1 (2020-06-18) 16 | 17 | - Test release process 18 | 19 | ## v0.2.0 (2020-06-18) 20 | 21 | - Add fluent syntax (originally called builder syntax) 22 | - Add PartialQuery helper 23 | - Improve docs & automated testing 24 | 25 | ## v0.1.0 (2020-06-03) 26 | 27 | - Ship it! 28 | - Support for text searches -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Testing 4 | 5 | Tests are run using tox and/or pytest. 6 | 7 | tox -e py37 8 | 9 | or directly: 10 | 11 | pytest 12 | 13 | 14 | ## Code Style 15 | 16 | Code conforms to the `black` and PEP8 style guides. Before checking in code, please run the linters: 17 | 18 | black . 19 | flake8 20 | mypy rcsbsearch 21 | 22 | These are tested by the 'lint' tox environment: 23 | 24 | tox -e lint 25 | 26 | 27 | ## Building docs 28 | 29 | Docs are written in the [MyST](https://myst-parser.readthedocs.io) superset of 30 | markdown. [Google style 31 | docstrings](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) are 32 | preferred for API documentation. 33 | 34 | Building with tox: 35 | 36 | tox -e docs 37 | 38 | Building manually: 39 | 40 | cd docs 41 | make html 42 | 43 | For live updates, you can also install the `sphinx-autobuild` tool (pip or conda) 44 | 45 | pip install sphinx-autobuild 46 | cd docs 47 | make livehtml 48 | 49 | Which runs: 50 | 51 | sphinx-autobuild -z rcsbsearch docs docs/_build/html 52 | 53 | ## Making a release 54 | 55 | ### Setup 56 | 57 | - Set up GPG key (for signing the tag) 58 | - `pip install twine` 59 | - Generate API token at TestPyPI and PyPI and add to .pypirc: 60 | 61 | [distutils] 62 | index-servers= 63 | pypi 64 | testpypi 65 | [pypi] 66 | username = __token__ 67 | password = pypi-... 68 | [testpypi] 69 | repository: https://test.pypi.org/legacy/ 70 | username = __token__ 71 | password = pypi-... 72 | 73 | - `chmod 600 ~/.pypirc` 74 | 75 | 76 | ### Release 77 | 78 | 1. Test 79 | 80 | tox 81 | 82 | 2. Build 83 | 84 | python setup.py sdist bdist_wheel 85 | 86 | 3. Tag 87 | 88 | git tag -s -a v0.1.0 89 | 90 | 4. Run checks 91 | 92 | twine check dist/* 93 | git verify-tag v0.1.0 94 | 95 | 4. Push to testing 96 | 97 | twine upload --repository testpypi -s --identity 780796DF dist/* 98 | 99 | 5. Push! 100 | 101 | git push 102 | git push --tags 103 | twine upload -s --identity 780796DF dist/* 104 | 105 | 6. Bump version number 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | -------------------- 3 | 4 | Copyright (c) 2020, Spencer Bliven 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | 3. Neither the name of the copyright holder nor the names of its contributors 18 | may be used to endorse or promote products derived from this software 19 | without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPi Release](https://img.shields.io/pypi/v/rcsbsearch.svg)](https://pypi.org/project/rcsbsearch/) 2 | [![Build Status](https://travis-ci.org/sbliven/rcsbsearch.svg?branch=master)](https://travis-ci.org/sbliven/rcsbsearch) 3 | [![Documentation Status](https://readthedocs.org/projects/rcsbsearch/badge/?version=latest)](https://rcsbsearch.readthedocs.io/en/latest/?badge=latest) 4 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 5 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fcovid.ipynb) 6 | 7 | # rcsbsearch 8 | 9 | Python interface for the RCSB search API. 10 | 11 | Currently the 'text search' part of the API has been implemented. See 'Supported 12 | features' below. 13 | 14 | This package requires python 3.7 or later. 15 | 16 | ## Example 17 | 18 | Here is a quick example of how the package is used. Two syntaxes are available for 19 | constructing queries: an "operator" API using python's comparators, and a "fluent" 20 | syntax where terms are chained together. Which to use is a matter of preference. 21 | 22 | A runnable jupyter notebook with this example is available in [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb), or can be run online using binder: 23 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fquickstart.ipynb) 24 | 25 | An additional example including a Covid-19 related example is in [notebooks/covid.ipynb](notebooks/covid.ipynb): 26 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fcovid.ipynb) 27 | 28 | ### Operator example 29 | 30 | Here is an example from the [RCSB Search 31 | API](http://search.rcsb.org/#search-example-1) page, using the operator syntax. This 32 | query finds symmetric dimers having a twofold rotation with the DNA-binding domain of 33 | a heat-shock transcription factor. 34 | 35 | from rcsbsearch import TextQuery 36 | from rcsbsearch import rcsb_attributes as attrs 37 | 38 | # Create terminals for each query 39 | q1 = TextQuery('"heat-shock transcription factor"') 40 | q2 = attrs.rcsb_struct_symmetry.symbol == "C2" 41 | q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry" 42 | q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1 43 | 44 | # combined using bitwise operators (&, |, ~, etc) 45 | query = q1 & q2 & q3 & q4 # AND of all queries 46 | 47 | # Call the query to execute it 48 | for assemblyid in query("assembly"): 49 | print(assemblyid) 50 | 51 | For a full list of attributes, please refer to the [RCSB 52 | schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema). 53 | 54 | ### Fluent Example 55 | 56 | Here is the same example using the 57 | [fluent](https://en.wikipedia.org/wiki/Fluent_interface) syntax. 58 | 59 | from rcsbsearch import TextQuery 60 | 61 | # Start with a Attr or TextQuery, then add terms 62 | results = TextQuery('"heat-shock transcription factor"') \ 63 | .and_("rcsb_struct_symmetry.symbol").exact_match("C2") \ 64 | .and_("rcsb_struct_symmetry.kind").exact_match("Global Symmetry") \ 65 | .and_("rcsb_entry_info.polymer_entity_count_DNA").greater_or_equal(1) \ 66 | .exec("assembly") 67 | 68 | # Exec produces an iterator of IDs 69 | for assemblyid in results: 70 | print(assemblyid) 71 | 72 | 73 | ## Supported Features 74 | 75 | The following table lists the status of current and planned features. 76 | 77 | - [x] Attribute Comparison operations 78 | - [x] Query set operations 79 | - [x] Attribute `contains`, `in_` (fluent only) 80 | - [ ] Sequence search 81 | - [ ] Sequence motif search 82 | - [ ] Structural search 83 | - [ ] Structural motif search 84 | - [ ] Chemical search 85 | - [ ] Rich results using the Data API 86 | 87 | Contributions are welcome for unchecked items! 88 | 89 | ## Installation 90 | 91 | Get it from pypi: 92 | 93 | pip install rcsbsearch 94 | 95 | Or, download from [github](https://github.com/sbliven/rcsbsearch) 96 | 97 | ## Documentation 98 | 99 | Detailed documentation is at [rcsbsearch.readthedocs.io](https://rcsbsearch.readthedocs.io/en/latest/) 100 | 101 | ## License 102 | 103 | Code is licensed under the BSD 3-clause license. See [LICENSE](LICENSE) for details. 104 | 105 | ## Citing rcsbsearch 106 | 107 | Please cite the rcsbsearch package by URL: 108 | 109 | > https://rcsbsearch.readthedocs.io 110 | 111 | You should also cite the RCSB service this package utilizes: 112 | 113 | > Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi 114 | > Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley, 115 | > John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards 116 | > Integrated Searching and Efficient Access to Macromolecular Structure Data 117 | > from the PDB Archive, Journal of Molecular Biology, 2020. 118 | > DOI: [10.1016/j.jmb.2020.11.003](https://doi.org/10.1016/j.jmb.2020.11.003) 119 | 120 | ## Developers 121 | 122 | For information about building and developing `rcsbsearch`, see 123 | [CONTRIBUTING.md](CONTRIBUTING.md) 124 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | livehtml: 18 | sphinx-autobuild -b html -z "$(SOURCEDIR)/../rcsbsearch" "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ***************** 3 | 4 | .. automodule:: rcsbsearch 5 | :members: 6 | :private-members: 7 | :special-members: -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("..")) 17 | import rcsbsearch # noqa: E402 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "rcsbsearch" 22 | copyright = "2020, Spencer Bliven" 23 | author = "Spencer Bliven" 24 | 25 | # The version info for the project you're documenting, acts as replacement for 26 | # |version| and |release|, also used in various other places throughout the 27 | # built documents. 28 | # 29 | # The short X.Y version. 30 | version = rcsbsearch.__version__.split("-")[0] 31 | # The full version, including alpha/beta/rc tags 32 | release = rcsbsearch.__version__ 33 | 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | "sphinx.ext.autodoc", 42 | "sphinx.ext.coverage", 43 | "sphinx.ext.napoleon", 44 | "myst_parser", 45 | ] 46 | # source_suffix = [".rst", ".md"] # Redundant with newer sphinx versions 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # List of patterns, relative to source directory, that match files and 52 | # directories to ignore when looking for source files. 53 | # This pattern also affects html_static_path and html_extra_path. 54 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 55 | 56 | # Napoleon settings 57 | # napoleon_google_docstring = True 58 | napoleon_numpy_docstring = False 59 | # napoleon_include_init_with_doc = False 60 | # napoleon_include_private_with_doc = False 61 | # napoleon_include_special_with_doc = True 62 | # napoleon_use_admonition_for_examples = False 63 | # napoleon_use_admonition_for_notes = False 64 | # napoleon_use_admonition_for_references = False 65 | # napoleon_use_ivar = False 66 | # napoleon_use_param = True 67 | # napoleon_use_rtype = True 68 | 69 | 70 | # -- Options for HTML output ------------------------------------------------- 71 | 72 | # The theme to use for HTML and HTML Help pages. See the documentation for 73 | # a list of builtin themes. 74 | html_theme = "sphinx_rtd_theme" 75 | 76 | 77 | # Add any paths that contain custom static files (such as style sheets) here, 78 | # relative to this directory. They are copied after the builtin static files, 79 | # so a file named "default.css" will overwrite the builtin "default.css". 80 | html_static_path = ["_static"] 81 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # rcsbsearch - Query protein structures from python 2 | 3 | The `rcsbsearch` package provides a python interface to the [RCSB Search API](http://search.rcsb.org/). Use it to fetch lists of PDB IDs corresponding to advanced query searches. 4 | 5 | ```{toctree} 6 | --- 7 | caption: Contents 8 | maxdepth: 2 9 | --- 10 | quickstart.md 11 | queries.md 12 | api.rst 13 | ``` 14 | 15 | ## Availability 16 | 17 | Get it from pypi: 18 | 19 | pip install rcsbsearch 20 | 21 | Or, download from [github](https://github.com/sbliven/rcsbsearch) 22 | 23 | ## License 24 | 25 | Code is licensed under the BSD 3-clause license. See the 26 | [LICENSE](https://github.com/sbliven/rcsbsearch/blob/master/LICENSE) for details. 27 | 28 | ## Citing 29 | 30 | Please cite the rcsbsearch package by URL: 31 | 32 | > https://rcsbsearch.readthedocs.io 33 | 34 | You should also cite the RCSB service this package utilizes: 35 | 36 | > Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi 37 | > Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley, 38 | > John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards 39 | > Integrated Searching and Efficient Access to Macromolecular Structure Data 40 | > from the PDB Archive, Journal of Molecular Biology, 2020. 41 | > DOI: [10.1016/j.jmb.2020.11.003](https://doi.org/10.1016/j.jmb.2020.11.003) 42 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/queries.md: -------------------------------------------------------------------------------- 1 | # Queries 2 | 3 | Two syntaxes are available for constructing queries: an "operator" API using python's 4 | comparators, and a "fluent" API where terms are chained together. Which to use is a 5 | matter of preference, and both construct the same query object. 6 | 7 | ## Operator syntax 8 | 9 | Searches are built up from a series of `Terminal` nodes, which compare structural 10 | attributes to some search value. In the operator syntax, python's comparator 11 | operators are used to construct the comparison. The operators are overloaded to 12 | return `Terminal` objects for the comparisons. 13 | 14 | from rcsbsearch import TextQuery 15 | from rcsbsearch import rcsb_attributes as attrs 16 | 17 | # Create terminals for each query 18 | q1 = TextQuery('"heat-shock transcription factor"') 19 | q2 = attrs.rcsb_struct_symmetry.symbol == "C2" 20 | q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry" 21 | q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1 22 | 23 | Attributes are available from the rcsb_attributes object and can be tab-completed. 24 | They can additionally be constructed from strings using the `Attr(attribute)` 25 | constructor. For a full list of attributes, please refer to the [RCSB 26 | schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema). 27 | 28 | `Terminal`s are combined into `Group`s using python's bitwise operators. This is 29 | analogous to how bitwise operators act on python `set` objects. The operators are 30 | lazy and won't perform the search until the query is executed. 31 | 32 | query = q1 & q2 & q3 & q4 # AND of all queries 33 | 34 | AND (`&`), OR (`|`), and terminal negation (`~`) are implemented directly by the API, 35 | but the python package also implements set difference (`-`), symmetric difference (`^`), 36 | and general negation by transforming the query. 37 | 38 | Queries are executed by calling them as functions. They return an iterator of result 39 | identifiers. 40 | 41 | results = set(query()) 42 | 43 | By default, the query will return "entry" results (PDB IDs). It is also possible to 44 | query other types of results (see [return-types](http://search.rcsb.org/#return-type) 45 | for options): 46 | 47 | assemblies = set(query("assembly")) 48 | 49 | 50 | ## Fluent syntax 51 | 52 | The operator syntax is great for simple queries, but requires parentheses or 53 | temporary variables for complex nested queries. In these cases the fluent syntax may 54 | be clearer. Queries are built up by appending operations sequentially. 55 | 56 | from rcsbsearch import TextQuery 57 | 58 | # Start with a Attr or TextQuery, then add terms 59 | results = TextQuery('"heat-shock transcription factor"') \ 60 | .and_("rcsb_struct_symmetry.symbol").exact_match("C2") \ 61 | .and_("rcsb_struct_symmetry.kind").exact_match("Global Symmetry") \ 62 | .and_("rcsb_entry_info.polymer_entity_count_DNA").greater_or_equal(1) \ 63 | .exec("assembly") 64 | 65 | ## Sessions 66 | 67 | The result of executing a query (either by calling it or using `exec()`) is a 68 | `Session` object. It implements `__iter__`, so it is usually treated just as an 69 | iterator of IDs. 70 | 71 | Paging is handled transparently by the session, with additional API requests made 72 | lazily as needed. The page size can be controlled with the `rows` parameter. 73 | 74 | first = next(iter(query(rows=1))) 75 | 76 | ### Progress Bar 77 | 78 | The `Session.iquery()` method provides a progress bar indicating the number of API 79 | requests being made. It requires the `tqdm` package be installed to track the 80 | progress of the query interactively. 81 | 82 | results = query().iquery() 83 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quickstart 2 | 3 | ## Installation 4 | 5 | Get it from pypi: 6 | 7 | pip install rcsbsearch 8 | 9 | Or, download from [github](https://github.com/sbliven/rcsbsearch) 10 | 11 | ## Syntax 12 | 13 | Here is a quick example of how the package is used. Two syntaxes are available for 14 | constructing queries: an "operator" API using python's comparators, and a "fluent" 15 | syntax where terms are chained together. Which to use is a matter of preference. 16 | 17 | A runnable jupyter notebook with this example is available in [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb), or can be run online using binder: 18 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fquickstart.ipynb) 19 | 20 | An additional example including a Covid-19 related example is in [notebooks/covid.ipynb](notebooks/covid.ipynb): 21 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fcovid.ipynb) 22 | 23 | ### Operator example 24 | 25 | Here is an example from the [RCSB Search 26 | API](http://search.rcsb.org/#search-example-1) page, using the operator syntax. This 27 | query finds symmetric dimers having a twofold rotation with the DNA-binding domain of 28 | a heat-shock transcription factor. 29 | 30 | from rcsbsearch import TextQuery 31 | from rcsbsearch import rcsb_attributes as attrs 32 | 33 | # Create terminals for each query 34 | q1 = TextQuery('"heat-shock transcription factor"') 35 | q2 = attrs.rcsb_struct_symmetry.symbol == "C2" 36 | q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry" 37 | q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1 38 | 39 | # combined using bitwise operators (&, |, ~, etc) 40 | query = q1 & q2 & q3 & q4 # AND of all queries 41 | 42 | # Call the query to execute it 43 | for assemblyid in query("assembly"): 44 | print(assemblyid) 45 | 46 | For a full list of attributes, please refer to the [RCSB 47 | schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema). 48 | 49 | ### Fluent Example 50 | 51 | Here is the same example using the fluent syntax 52 | 53 | from rcsbsearch import Attr, TextQuery 54 | 55 | # Start with a Attr or TextQuery, then add terms 56 | results = TextQuery('"heat-shock transcription factor"') \ 57 | .and_("rcsb_struct_symmetry.symbol").exact_match("C2") \ 58 | .and_("rcsb_struct_symmetry.kind").exact_match("Global Symmetry") \ 59 | .and_("rcsb_entry_info.polymer_entity_count_DNA").greater_or_equal(1) \ 60 | .exec("assembly") 61 | 62 | # Exec produces an iterator of IDs 63 | for assemblyid in results: 64 | print(assemblyid) 65 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Pin dependencies for the docs 2 | # Should be kept up-to-date with setup.py 3 | sphinx==3.5.3 4 | sphinx-rtd-theme==0.5.2 5 | typing-extensions==3.7.4.3 6 | myst-parser==0.13.5 7 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # Create conda environment 2 | # This file is used for mybinder.org, so it includes all optional dependencies 3 | name: rcsbsearch 4 | channels: 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | # Python 3.7 or newer 9 | - python >= 3.7 10 | 11 | # Required dependencies 12 | - requests 13 | - jsonschema 14 | # python 3.7 only 15 | - typing_extensions 16 | 17 | # dev requirements 18 | - tqdm 19 | - tox 20 | - pytest 21 | - black 22 | - flake8 23 | - mypy 24 | - sphinx 25 | - myst-parser 26 | 27 | # notebook packages 28 | - jupyter 29 | - nglview 30 | 31 | - pip 32 | - pip: 33 | - sphinx-rtd-theme 34 | # Install rcsbsearch from local directory 35 | - . 36 | -------------------------------------------------------------------------------- /notebooks/covid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "metallic-memphis", 6 | "metadata": {}, 7 | "source": [ 8 | "# rcsbsearch\n", 9 | "\n", 10 | "Access the RCSB advanced search from python: [rcsbsearch.readthedocs.io](https://rcsbsearch.readthedocs.io)\n", 11 | "\n", 12 | " pip install rcsbsearch\n", 13 | " \n", 14 | "## Demo\n", 15 | "\n", 16 | "We are interested in how the antiviral drug boceprevir interacts with Covid-19. \n", 17 | "- Source Organism is \"COVID-19\"\n", 18 | "- Structure title contains \"protease\"\n", 19 | "- Bound to ligand \"Boceprevir\"\n", 20 | "\n", 21 | "[RCSB Query](http://www.rcsb.org/search?request=%7B%22query%22%3A%7B%22type%22%3A%22group%22%2C%22logical_operator%22%3A%22and%22%2C%22nodes%22%3A%5B%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22rcsb_entity_source_organism.taxonomy_lineage.name%22%2C%22operator%22%3A%22exact_match%22%2C%22value%22%3A%22COVID-19%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A0%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22value%22%3A%22protease%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A1%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22chem_comp.name%22%2C%22operator%22%3A%22contains_words%22%2C%22value%22%3A%22Boceprevir%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A2%7D%5D%7D%2C%22return_type%22%3A%22entry%22%2C%22request_info%22%3A%7B%22query_id%22%3A%2270e677a6376b4c5eba8b4f2b73866c92%22%2C%22src%22%3A%22ui%22%7D%7D)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "married-burden", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "application/vnd.jupyter.widget-view+json": { 33 | "model_id": "0402505eff634df58b1f636f5f277d19", 34 | "version_major": 2, 35 | "version_minor": 0 36 | }, 37 | "text/plain": [] 38 | }, 39 | "metadata": {}, 40 | "output_type": "display_data" 41 | } 42 | ], 43 | "source": [ 44 | "from rcsbsearch import rcsb_attributes as attrs, TextQuery\n", 45 | "import nglview" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "collectible-thread", 51 | "metadata": {}, 52 | "source": [ 53 | "## Operator syntax\n", 54 | "- Uses python comparison operators for basic attributes (`==`, `<`, `<=`, etc)\n", 55 | "- Combine using set operators (`&`, `|`, `~`, etc)\n", 56 | "- Execute queries as functions" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "id": "confidential-behavior", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "['6WNP', '7K40']" 69 | ] 70 | }, 71 | "execution_count": 2, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == \"COVID-19\"\n", 78 | "q2 = TextQuery(\"protease\")\n", 79 | "q3 = attrs.chem_comp.name.contains_words(\"Boceprevir\")\n", 80 | "q4 = attrs.rcsb_entry_info.resolution_combined > 1.5\n", 81 | "query = q1 & q2 & q3 & ~q4\n", 82 | "\n", 83 | "list(query())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "id": "unauthorized-judge", 90 | "metadata": { 91 | "scrolled": true 92 | }, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "application/vnd.jupyter.widget-view+json": { 97 | "model_id": "b3b3d3efdc9b414f8a2a4d7a6de1474a", 98 | "version_major": 2, 99 | "version_minor": 0 100 | }, 101 | "text/plain": [ 102 | "NGLWidget()" 103 | ] 104 | }, 105 | "metadata": {}, 106 | "output_type": "display_data" 107 | } 108 | ], 109 | "source": [ 110 | "nglview.show_pdbid('7brp')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "uniform-allen", 116 | "metadata": {}, 117 | "source": [ 118 | "## Fluent syntax\n", 119 | "\n", 120 | "A second syntax is available with a [fluent interface](https://en.wikipedia.org/wiki/Fluent_interface), similar to popular data science packages like tidyverse and Apache Spark. Function calls are chained together.\n", 121 | "\n", 122 | "Here's an example around a second antiviral, remdesivir. The drug interferes with RNA polymerase, replacing an adenine and causing early chain termination. When integrated into RNA, the nucleotide formed from remdesivir has residue code F86." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "id": "irish-navigator", 129 | "metadata": { 130 | "scrolled": true 131 | }, 132 | "outputs": [ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "100%|██████████| 1/1 [00:00` etc) for rcsb attributes and set operators for combining queries." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "enabling-america", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "1FYL-2\n", 47 | "1FYL-1\n", 48 | "1FYM-1\n", 49 | "1FYK-1\n", 50 | "3HTS-1\n", 51 | "5D8K-1\n", 52 | "5D8L-2\n", 53 | "5D8L-1\n", 54 | "5D5W-1\n", 55 | "5D5X-1\n", 56 | "5HDN-2\n", 57 | "5HDN-1\n", 58 | "5D5V-1\n", 59 | "5D5U-1\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "# Create terminals for each query\n", 65 | "q1 = TextQuery('\"heat-shock transcription factor\"')\n", 66 | "q2 = attrs.rcsb_struct_symmetry.symbol == \"C2\"\n", 67 | "q3 = attrs.rcsb_struct_symmetry.kind == \"Global Symmetry\"\n", 68 | "q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1\n", 69 | "\n", 70 | "# combined using bitwise operators (&, |, ~, etc)\n", 71 | "query = q1 & q2 & q3 & q4 # AND of all queries\n", 72 | "\n", 73 | "# Call the query to execute it\n", 74 | "for assemblyid in query(\"assembly\"):\n", 75 | " print(assemblyid)\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "accomplished-passion", 81 | "metadata": {}, 82 | "source": [ 83 | "Attribute names can be found in the [RCSB schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema). They can also be found via tab completion, or by iterating:" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "id": "supported-observer", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "['citation.rcsb_authors',\n", 96 | " 'pdbx_nmr_software.authors',\n", 97 | " 'rcsb_primary_citation.rcsb_authors',\n", 98 | " 'rcsb_bird_citation.rcsb_authors']" 99 | ] 100 | }, 101 | "execution_count": 3, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "[a.attribute for a in attrs if \"authors\" in a.attribute]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "other-grant", 113 | "metadata": {}, 114 | "source": [ 115 | "## Fluent syntax\n", 116 | "\n", 117 | "Here is the same example using the fluent syntax:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "id": "polish-indonesia", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "1FYL-2\n", 131 | "1FYL-1\n", 132 | "1FYM-1\n", 133 | "1FYK-1\n", 134 | "3HTS-1\n", 135 | "5D8K-1\n", 136 | "5D8L-2\n", 137 | "5D8L-1\n", 138 | "5D5W-1\n", 139 | "5D5X-1\n", 140 | "5HDN-2\n", 141 | "5HDN-1\n", 142 | "5D5V-1\n", 143 | "5D5U-1\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# Start with a Attr or TextQuery, then add terms\n", 149 | "results = TextQuery('\"heat-shock transcription factor\"') \\\n", 150 | " .and_(\"rcsb_struct_symmetry.symbol\").exact_match(\"C2\") \\\n", 151 | " .and_(\"rcsb_struct_symmetry.kind\").exact_match(\"Global Symmetry\") \\\n", 152 | " .and_(\"rcsb_entry_info.polymer_entity_count_DNA\").greater_or_equal(1) \\\n", 153 | " .exec(\"assembly\")\n", 154 | "\n", 155 | "# Exec produces an iterator of IDs\n", 156 | "for assemblyid in results:\n", 157 | " print(assemblyid)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "adopted-gnome", 163 | "metadata": {}, 164 | "source": [ 165 | "For a more practical example, see the [Covid-19 notebook](covid.ipynb)" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python [conda env:rcsbsearch]", 172 | "language": "python", 173 | "name": "conda-env-rcsbsearch-py" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.9.1" 186 | }, 187 | "toc": { 188 | "base_numbering": 1, 189 | "nav_menu": {}, 190 | "number_sections": true, 191 | "sideBar": true, 192 | "skip_h1_title": false, 193 | "title_cell": "Table of Contents", 194 | "title_sidebar": "Contents", 195 | "toc_cell": false, 196 | "toc_position": {}, 197 | "toc_section_display": true, 198 | "toc_window_display": false 199 | }, 200 | "varInspector": { 201 | "cols": { 202 | "lenName": 16, 203 | "lenType": 16, 204 | "lenVar": 40 205 | }, 206 | "kernels_config": { 207 | "python": { 208 | "delete_cmd_postfix": "", 209 | "delete_cmd_prefix": "del ", 210 | "library": "var_list.py", 211 | "varRefreshCmd": "print(var_dic_list())" 212 | }, 213 | "r": { 214 | "delete_cmd_postfix": ") ", 215 | "delete_cmd_prefix": "rm(", 216 | "library": "var_list.r", 217 | "varRefreshCmd": "cat(var_dic_list()) " 218 | } 219 | }, 220 | "types_to_exclude": [ 221 | "module", 222 | "function", 223 | "builtin_function_or_method", 224 | "instance", 225 | "_Feature" 226 | ], 227 | "window_display": false 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 5 232 | } 233 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --strict-markers 3 | markers = 4 | internet: Tests that require internet access 5 | progressbar: Tests depending on the 'progressbar' extras -------------------------------------------------------------------------------- /rcsbsearch/__init__.py: -------------------------------------------------------------------------------- 1 | """RCSB Search API""" 2 | from typing import TYPE_CHECKING, Any, List 3 | 4 | from .search import Terminal # noqa: F401 5 | from .search import Attr, Group, Query, Session, TextQuery, Value 6 | 7 | __version__ = "0.3.0-dev0" 8 | 9 | 10 | # loading rcsb_attributes can cause errors, so load it lazily 11 | if TYPE_CHECKING: 12 | from .schema import SchemaGroup 13 | 14 | 15 | # Set docstring at top level too. Keep synchronized with schema.rcsb_attributes 16 | rcsb_attributes: "SchemaGroup" 17 | """Object with all known RCSB attributes. 18 | 19 | This is provided to ease autocompletion as compared to creating Attr objects from 20 | strings. For example, 21 | :: 22 | 23 | rcsb_attributes.rcsb_nonpolymer_instance_feature_summary.chem_id 24 | 25 | is equivalent to 26 | :: 27 | 28 | Attr('rcsb_nonpolymer_instance_feature_summary.chem_id') 29 | 30 | All attributes in `rcsb_attributes` can be iterated over. 31 | 32 | >>> [a for a in rcsb_attributes if "stoichiometry" in a.attribute] 33 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')] 34 | 35 | Attributes matching a regular expression can also be filtered: 36 | 37 | >>> list(rcsb_attributes.search('rcsb.*stoichiometry')) 38 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]a 39 | 40 | """ 41 | 42 | 43 | def __getattr__(name: str) -> Any: 44 | # delay instantiating rcsb_attributes until it is needed 45 | if name == "rcsb_attributes": 46 | if "rcsb_attributes" not in globals(): 47 | from .schema import rcsb_attributes as attrs 48 | 49 | globals()["rcsb_attributes"] = attrs 50 | return globals()["rcsb_attributes"] 51 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 52 | 53 | 54 | def __dir__() -> List[str]: 55 | return sorted(__all__) 56 | 57 | 58 | __all__ = [ 59 | "Query", 60 | "Group", 61 | "Terminal", 62 | "TextQuery", 63 | "Session", 64 | "Attr", 65 | "Value", 66 | "rcsb_attributes", 67 | ] 68 | -------------------------------------------------------------------------------- /rcsbsearch/schema.py: -------------------------------------------------------------------------------- 1 | """Parse the full RCSB search schema 2 | 3 | Provides access to all valid attributes for search queries. 4 | """ 5 | 6 | import json 7 | import logging 8 | import os 9 | import pkgutil 10 | import re 11 | from typing import Any, Iterator, List, Union 12 | 13 | import requests 14 | 15 | from .search import Attr 16 | 17 | METADATA_SCHEMA_URL = "http://search.rcsb.org/rcsbsearch/v1/metadata/schema" 18 | SEARCH_SCHEMA_URL = "http://search.rcsb.org/json-schema-rcsb_search_query.json" 19 | 20 | ENV_RCSBSEARCH_DOWNLOAD_SCHEMA = "RCSBSEARCH_DOWNLOAD_SCHEMA" 21 | 22 | 23 | def _get_json_schema(download=None): 24 | """Get the JSON schema 25 | 26 | The RCSBSEARCH_DOWNLOAD_SCHEMA environmental variable controls whether 27 | to download the schema from the web each time vs using the version shipped 28 | with rcsbsearch 29 | """ 30 | if download is True or ( 31 | download is None 32 | and ( 33 | os.environ.get(ENV_RCSBSEARCH_DOWNLOAD_SCHEMA, "no").lower() 34 | in ("1", "yes", "y") 35 | ) 36 | ): 37 | return _download_json_schema() 38 | return _load_json_schema() 39 | 40 | 41 | def _download_json_schema(): 42 | "Get the current JSON schema from the web" 43 | url = METADATA_SCHEMA_URL 44 | 45 | logging.info(f"Dowloading {url}") 46 | response = requests.get(url) 47 | response.raise_for_status() 48 | return response.json() 49 | 50 | 51 | def _load_json_schema(): 52 | logging.info("Loading schema from file") 53 | latest = pkgutil.get_data(__package__, "resources/metadata_schema.json") 54 | return json.loads(latest) 55 | 56 | 57 | class SchemaGroup: 58 | """A non-leaf node in the RCSB schema. Leaves are Attr values.""" 59 | 60 | def search(self, pattern: Union[str, re.Pattern], flags=0) -> Iterator[Attr]: 61 | """Find all attributes in the schema matching a regular expression. 62 | 63 | Returns: 64 | An iterator supplying Attr objects whose attribute matches. 65 | """ 66 | matcher = re.compile(pattern, flags=flags) 67 | return filter(lambda a: matcher.search(a.attribute), self) 68 | 69 | def __iter__(self) -> Iterator[Attr]: 70 | """Iterate over all leaf nodes 71 | 72 | Example: 73 | 74 | >>> [a for a in attrs if "stoichiometry" in a.attribute] 75 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')] 76 | 77 | """ 78 | 79 | def leaves(self): 80 | for k, v in self.__dict__.items(): 81 | if isinstance(v, Attr): 82 | yield v 83 | elif isinstance(v, SchemaGroup): 84 | yield from iter(v) 85 | else: 86 | # Shouldn't happen 87 | raise TypeError(f"Unrecognized member {k!r}: {v!r}") 88 | 89 | return leaves(self) 90 | 91 | def __str__(self): 92 | return "\n".join((str(c) for c in self.__dict__.values())) 93 | 94 | 95 | def _make_group(fullname: str, node) -> Union[SchemaGroup, Attr]: 96 | """Represent this node of the schema as a python object 97 | 98 | Params: 99 | - name: full dot-separated attribute name 100 | 101 | Returns: 102 | An Attr (Leaf nodes) or SchemaGroup (object nodes) 103 | """ 104 | if "anyOf" in node: 105 | children = {_make_group(fullname, n) for n in node["anyOf"]} 106 | # Currently only deal with anyOf in leaf nodes 107 | assert len(children) == 1, f"type of {fullname} couldn't be determined" 108 | return next(iter(children)) 109 | if "oneOf" in node: 110 | children = {_make_group(fullname, n) for n in node["oneOf"]} 111 | # Currently only deal with oneOf in leaf nodes 112 | assert len(children) == 1, f"type of {fullname} couldn't be determined" 113 | return next(iter(children)) 114 | if "allOf" in node: 115 | children = {_make_group(fullname, n) for n in node["allOf"]} 116 | # Currently only deal with allOf in leaf nodes 117 | assert len(children) == 1, f"type of {fullname} couldn't be determined" 118 | return next(iter(children)) 119 | if node["type"] in ("string", "number", "integer", "date"): 120 | return Attr(fullname) 121 | elif node["type"] == "array": 122 | # skip to items 123 | return _make_group(fullname, node["items"]) 124 | elif node["type"] == "object": 125 | group = SchemaGroup() # parent, name) 126 | for childname, childnode in node["properties"].items(): 127 | fullchildname = f"{fullname}.{childname}" if fullname else childname 128 | childgroup = _make_group(fullchildname, childnode) 129 | setattr(group, childname, childgroup) 130 | return group 131 | else: 132 | raise TypeError(f"Unrecognized node type {node['type']!r} of {fullname}") 133 | 134 | 135 | def _make_schema() -> SchemaGroup: 136 | json = _get_json_schema() 137 | schema = _make_group("", json) 138 | assert isinstance(schema, SchemaGroup) # for type checking 139 | return schema 140 | 141 | 142 | rcsb_attributes: SchemaGroup 143 | """Object with all known RCSB attributes. 144 | 145 | This is provided to ease autocompletion as compared to creating Attr objects from 146 | strings. For example, 147 | :: 148 | 149 | rcsb_attributes.rcsb_nonpolymer_instance_feature_summary.chem_id 150 | 151 | is equivalent to 152 | :: 153 | 154 | Attr('rcsb_nonpolymer_instance_feature_summary.chem_id') 155 | 156 | All attributes in `rcsb_attributes` can be iterated over. 157 | 158 | >>> [a for a in rcsb_attributes if "stoichiometry" in a.attribute] 159 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')] 160 | 161 | Attributes matching a regular expression can also be filtered: 162 | 163 | >>> list(rcsb_attributes.search('rcsb.*stoichiometry')) 164 | [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]a 165 | 166 | """ 167 | 168 | 169 | def __getattr__(name: str) -> Any: 170 | # delay instantiating rcsb_attributes until it is needed 171 | if name == "rcsb_attributes": 172 | if "rcsb_attributes" not in globals(): 173 | globals()["rcsb_attributes"] = _make_schema() 174 | return globals()["rcsb_attributes"] 175 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 176 | 177 | 178 | def __dir__() -> List[str]: 179 | return sorted(__all__) 180 | 181 | 182 | __all__ = [ # noqa: F822 183 | "METADATA_SCHEMA_URL", 184 | "SEARCH_SCHEMA_URL", 185 | "ENV_RCSBSEARCH_DOWNLOAD_SCHEMA", 186 | "rcsb_attributes", 187 | "SchemaGroup", 188 | ] 189 | -------------------------------------------------------------------------------- /rcsbsearch/search.py: -------------------------------------------------------------------------------- 1 | """Interact with the [RCSB Search API](https://search.rcsb.org/#search-api). 2 | """ 3 | 4 | import functools 5 | import json 6 | import logging 7 | import math 8 | import sys 9 | import urllib.parse 10 | import uuid 11 | from abc import ABC, abstractmethod 12 | from dataclasses import dataclass 13 | from datetime import date 14 | from typing import ( 15 | Callable, 16 | Dict, 17 | Generic, 18 | Iterable, 19 | Iterator, 20 | List, 21 | Optional, 22 | Tuple, 23 | TypeVar, 24 | Union, 25 | overload, 26 | ) 27 | 28 | import requests 29 | 30 | if sys.version_info > (3, 8): 31 | from typing import Literal 32 | else: 33 | from typing_extensions import Literal 34 | # tqdm is optional 35 | 36 | # Allowed return types for searches. http://search.rcsb.org/#return-type 37 | ReturnType = Literal[ 38 | "entry", "assembly", "polymer_entity", "non_polymer_entity", "polymer_instance" 39 | ] 40 | TAndOr = Literal["and", "or"] 41 | # All valid types for Terminal values 42 | TValue = Union[ 43 | str, 44 | int, 45 | float, 46 | date, 47 | List[str], 48 | List[int], 49 | List[float], 50 | List[date], 51 | Tuple[str, ...], 52 | Tuple[int, ...], 53 | Tuple[float, ...], 54 | Tuple[date, ...], 55 | ] 56 | # Types valid for numeric operators 57 | TNumberLike = Union[int, float, date, "Value[int]", "Value[float]", "Value[date]"] 58 | 59 | 60 | class Query(ABC): 61 | """Base class for all types of queries. 62 | 63 | Queries can be combined using set operators: 64 | 65 | - `q1 & q2`: Intersection (AND) 66 | - `q1 | q2`: Union (OR) 67 | - `~q1`: Negation (NOT) 68 | - `q1 - q2`: Difference (implemented as `q1 & ~q2`) 69 | - `q1 ^ q2`: Symmetric difference (XOR, implemented as `(q1 & ~q2) | (~q1 & q2)`) 70 | 71 | Note that only AND, OR, and negation of terminals are directly supported by 72 | the API, so other operations may be slower. 73 | 74 | Queries can be executed by calling them as functions (`list(query())`) or using 75 | the exec function. 76 | 77 | Queries are immutable, and all modifying functions return new instances. 78 | """ 79 | 80 | @abstractmethod 81 | def to_dict(self) -> Dict: 82 | """Get dictionary representing this query""" 83 | ... 84 | 85 | def to_json(self) -> str: 86 | """Get JSON string of this query""" 87 | return json.dumps(self.to_dict(), separators=(",", ":")) 88 | 89 | @abstractmethod 90 | def _assign_ids(self, node_id=0) -> Tuple["Query", int]: 91 | """Assign node_ids sequentially for all terminal nodes 92 | 93 | This is a helper for the :py:meth:`Query.assign_ids` method 94 | 95 | Args: 96 | node_id: Id to assign to the first leaf of this query 97 | 98 | Returns: 99 | query: The modified query, with node_ids assigned 100 | node_id: The next available node_id 101 | 102 | """ 103 | ... 104 | 105 | def assign_ids(self) -> "Query": 106 | """Assign node_ids sequentially for all terminal nodes 107 | 108 | Returns: 109 | the modified query, with node_ids assigned sequentially from 0 110 | """ 111 | return self._assign_ids(0)[0] 112 | 113 | @abstractmethod 114 | def __invert__(self) -> "Query": 115 | """Negation: `~a`""" 116 | ... 117 | 118 | def __and__(self, other: "Query") -> "Query": 119 | """Intersection: `a & b`""" 120 | assert isinstance(other, Query) 121 | return Group("and", [self, other]) 122 | 123 | def __or__(self, other: "Query") -> "Query": 124 | """Union: `a | b`""" 125 | assert isinstance(other, Query) 126 | return Group("or", [self, other]) 127 | 128 | def __sub__(self, other: "Query") -> "Query": 129 | """Difference: `a - b`""" 130 | return self & ~other 131 | 132 | def __xor__(self, other: "Query") -> "Query": 133 | """Symmetric difference: `a ^ b`""" 134 | return (self & ~other) | (~self & other) 135 | 136 | def exec(self, return_type: ReturnType = "entry", rows: int = 100) -> "Session": 137 | """Evaluate this query and return an iterator of all result IDs""" 138 | return Session(self, return_type, rows) 139 | 140 | def __call__(self, return_type: ReturnType = "entry", rows: int = 100) -> "Session": 141 | """Evaluate this query and return an iterator of all result IDs""" 142 | return self.exec(return_type, rows) 143 | 144 | @overload 145 | def and_(self, other: "Query") -> "Query": 146 | ... 147 | 148 | @overload 149 | def and_(self, other: Union[str, "Attr"]) -> "PartialQuery": 150 | ... 151 | 152 | def and_( 153 | self, other: Union[str, "Query", "Attr"] 154 | ) -> Union["Query", "PartialQuery"]: 155 | """Extend this query with an additional attribute via an AND""" 156 | if isinstance(other, Query): 157 | return self & other 158 | elif isinstance(other, Attr): 159 | return PartialQuery(self, "and", other) 160 | elif isinstance(other, str): 161 | return PartialQuery(self, "and", Attr(other)) 162 | else: 163 | raise TypeError(f"Expected Query or Attr, got {type(other)}") 164 | 165 | @overload 166 | def or_(self, other: "Query") -> "Query": 167 | ... 168 | 169 | @overload 170 | def or_(self, other: Union[str, "Attr"]) -> "PartialQuery": 171 | ... 172 | 173 | def or_(self, other: Union[str, "Query", "Attr"]) -> Union["Query", "PartialQuery"]: 174 | """Extend this query with an additional attribute via an OR""" 175 | if isinstance(other, Query): 176 | return self & other 177 | elif isinstance(other, Attr): 178 | return PartialQuery(self, "or", other) 179 | elif isinstance(other, str): 180 | return PartialQuery(self, "or", Attr(other)) 181 | else: 182 | raise TypeError(f"Expected Query or Attr, got {type(other)}") 183 | 184 | 185 | @dataclass(frozen=True) 186 | class Terminal(Query): 187 | """A terminal query node. 188 | 189 | Terminals are simple predicates comparing some *attribute* of a structure to a 190 | value. 191 | 192 | Examples: 193 | >>> Terminal("exptl.method", "exact_match", "X-RAY DIFFRACTION") 194 | >>> Terminal("rcsb_id", "in", ["5T89", "1TIM"]) 195 | >>> Terminal(value="tubulin") 196 | 197 | A full list of attributes is available in the 198 | `schema `_. 199 | Operators are documented `here `_. 200 | 201 | The :py:class:`Attr` class provides a more pythonic way of constructing Terminals. 202 | """ 203 | 204 | attribute: Optional[str] = None 205 | operator: Optional[str] = None 206 | value: Optional[TValue] = None 207 | service: str = "text" 208 | negation: bool = False 209 | node_id: int = 0 210 | 211 | def to_dict(self): 212 | params = dict() 213 | if self.attribute is not None: 214 | params["attribute"] = self.attribute 215 | if self.operator is not None: 216 | params["operator"] = self.operator 217 | if self.value is not None: 218 | params["value"] = self.value 219 | if self.negation is not None: 220 | params["negation"] = self.negation 221 | 222 | return dict( 223 | type="terminal", 224 | service=self.service, 225 | parameters=params, 226 | node_id=self.node_id, 227 | ) 228 | 229 | def __invert__(self): 230 | return Terminal( 231 | self.attribute, 232 | self.operator, 233 | self.value, 234 | self.service, 235 | not self.negation, 236 | self.node_id, 237 | ) 238 | 239 | def _assign_ids(self, node_id=0) -> Tuple[Query, int]: 240 | if self.node_id == node_id: 241 | return (self, node_id + 1) 242 | else: 243 | return ( 244 | Terminal( 245 | self.attribute, 246 | self.operator, 247 | self.value, 248 | self.service, 249 | self.negation, 250 | node_id, 251 | ), 252 | node_id + 1, 253 | ) 254 | 255 | def __str__(self): 256 | """Return a simplified string representation 257 | 258 | Examples: 259 | >>> Terminal("attr", "op", "val") 260 | >>> ~Terminal(value="val") 261 | 262 | """ 263 | negation = "~" if self.negation else "" 264 | if self.attribute is None and self.operator is None: 265 | # value-only 266 | return f"{negation}Terminal(value={self.value!r})" 267 | else: 268 | return ( 269 | f"{negation}Terminal({self.attribute!r}, {self.operator!r}, " 270 | f"{self.value!r})" 271 | ) 272 | 273 | 274 | class TextQuery(Terminal): 275 | """Special case of a Terminal for free-text queries""" 276 | 277 | def __init__(self, value: str, negation: bool = False): 278 | """Search for the string value anywhere in the text 279 | 280 | Args: 281 | value: free-text query 282 | negation: find structures without the pattern 283 | """ 284 | super().__init__(value=value, negation=negation) 285 | 286 | 287 | @dataclass(frozen=True) 288 | class Group(Query): 289 | """AND and OR combinations of queries""" 290 | 291 | operator: TAndOr 292 | nodes: Iterable[Query] = () 293 | 294 | def to_dict(self): 295 | return dict( 296 | type="group", 297 | logical_operator=self.operator, 298 | nodes=[node.to_dict() for node in self.nodes], 299 | ) 300 | 301 | def __invert__(self): 302 | if self.operator == "and": 303 | return Group("or", [~node for node in self.nodes]) 304 | 305 | def __and__(self, other: Query) -> Query: 306 | # Combine nodes if possible 307 | if self.operator == "and": 308 | if isinstance(other, Group): 309 | if other.operator == "and": 310 | return Group("and", (*self.nodes, *other.nodes)) 311 | elif isinstance(other, Query): 312 | return Group("and", (*self.nodes, other)) 313 | else: 314 | return NotImplemented 315 | 316 | return super().__and__(other) 317 | 318 | def __or__(self, other: Query) -> Query: 319 | # Combine nodes if possible 320 | if self.operator == "or": 321 | if isinstance(other, Group): 322 | if other.operator == "or": 323 | return Group("or", (*self.nodes, *other.nodes)) 324 | elif isinstance(other, Terminal): 325 | return Group("or", (*self.nodes, other)) 326 | else: 327 | return NotImplemented 328 | 329 | return super().__or__(other) 330 | 331 | def _assign_ids(self, node_id=0) -> Tuple[Query, int]: 332 | nodes = [] 333 | changed = False 334 | for node in self.nodes: 335 | assigned = node._assign_ids(node_id) 336 | nodes.append(assigned[0]) 337 | node_id = assigned[1] 338 | # Track whether any nodes were modified 339 | changed = changed or assigned[0] is node 340 | if changed: 341 | return (Group(self.operator, nodes), node_id) 342 | else: 343 | return (self, node_id) 344 | 345 | def __str__(self): 346 | "" # hide in documentation 347 | if self.operator == "and": 348 | return f"({' & '.join((str(n) for n in self.nodes))})" 349 | elif self.operator == "or": 350 | return f"({' | '.join((str(n) for n in self.nodes))})" 351 | else: 352 | raise ValueError("Illegal Operator") 353 | 354 | 355 | @dataclass(frozen=True) 356 | class Attr: 357 | """A search attribute, e.g. "rcsb_entry_container_identifiers.entry_id" 358 | 359 | Terminals can be constructed from Attr objects using either a functional syntax, 360 | which mirrors the API operators, or with python operators. 361 | 362 | +--------------------+---------------------+ 363 | | Fluent Function | Operator | 364 | +====================+=====================+ 365 | | exact_match | attr == str | 366 | +--------------------+---------------------+ 367 | | contains_words | | 368 | +--------------------+---------------------+ 369 | | contains_phrase | | 370 | +--------------------+---------------------+ 371 | | greater | attr > date,number | 372 | +--------------------+---------------------+ 373 | | less | attr < date,number | 374 | +--------------------+---------------------+ 375 | | greater_or_equal | attr >= date,number | 376 | +--------------------+---------------------+ 377 | | less_or_equal | attr <= date,number | 378 | +--------------------+---------------------+ 379 | | equals | attr == date,number | 380 | +--------------------+---------------------+ 381 | | range | attr[start:end] | 382 | +--------------------+---------------------+ 383 | | range_closed | | 384 | +--------------------+---------------------+ 385 | | exists | bool(attr) | 386 | +--------------------+---------------------+ 387 | | in\\_ | | 388 | +--------------------+---------------------+ 389 | 390 | Rather than their normal bool return values, operators return Terminals. 391 | 392 | Pre-instantiated attributes are available from the 393 | :py:data:`rcsbsearch.rcsb_attributes` object. These are generally easier to use 394 | than constructing Attr objects by hand. A complete list of valid attributes is 395 | available in the `schema `_. 396 | 397 | """ 398 | 399 | attribute: str 400 | 401 | def exact_match(self, value: Union[str, "Value[str]"]) -> Terminal: 402 | """Exact match with the value""" 403 | if isinstance(value, Value): 404 | value = value.value 405 | return Terminal(self.attribute, "exact_match", value) 406 | 407 | def contains_words( 408 | self, value: Union[str, "Value[str]", List[str], "Value[List[str]]"] 409 | ) -> Terminal: 410 | """Match any word within the string. 411 | 412 | Words are split at whitespace. All results which match any word are returned, 413 | with results matching more words sorted first. 414 | """ 415 | if isinstance(value, Value): 416 | value = value.value 417 | if isinstance(value, list): 418 | value = " ".join(value) 419 | return Terminal(self.attribute, "contains_words", value) 420 | 421 | def contains_phrase(self, value: Union[str, "Value[str]"]) -> Terminal: 422 | """Match an exact phrase""" 423 | if isinstance(value, Value): 424 | value = value.value 425 | return Terminal(self.attribute, "contains_phrase", value) 426 | 427 | def greater(self, value: TNumberLike) -> Terminal: 428 | """Attribute > `value`""" 429 | if isinstance(value, Value): 430 | value = value.value 431 | return Terminal(self.attribute, "greater", value) 432 | 433 | def less(self, value: TNumberLike) -> Terminal: 434 | """Attribute < `value`""" 435 | if isinstance(value, Value): 436 | value = value.value 437 | return Terminal(self.attribute, "less", value) 438 | 439 | def greater_or_equal(self, value: TNumberLike) -> Terminal: 440 | """Attribute >= `value`""" 441 | if isinstance(value, Value): 442 | value = value.value 443 | return Terminal(self.attribute, "greater_or_equal", value) 444 | 445 | def less_or_equal(self, value: TNumberLike) -> Terminal: 446 | """Attribute <= `value`""" 447 | if isinstance(value, Value): 448 | value = value.value 449 | return Terminal(self.attribute, "less_or_equal", value) 450 | 451 | def equals(self, value: TNumberLike) -> Terminal: 452 | """Attribute == `value`""" 453 | if isinstance(value, Value): 454 | value = value.value 455 | return Terminal(self.attribute, "equals", value) 456 | 457 | def range(self, value: Union[List[int], Tuple[int, int]]) -> Terminal: 458 | """Attribute is within the specified half-open range 459 | 460 | Args: 461 | value: lower and upper bounds `[a, b)` 462 | """ 463 | if isinstance(value, Value): 464 | value = value.value 465 | return Terminal(self.attribute, "range", value) 466 | 467 | def range_closed( 468 | self, 469 | value: Union[ 470 | List[int], Tuple[int, int], "Value[List[int]]", "Value[Tuple[int, int]]" 471 | ], 472 | ) -> Terminal: 473 | """Attribute is within the specified closed range 474 | 475 | Args: 476 | value: lower and upper bounds `[a, b]` 477 | """ 478 | if isinstance(value, Value): 479 | value = value.value 480 | return Terminal(self.attribute, "range_closed", value) 481 | 482 | def exists(self) -> Terminal: 483 | """Attribute is defined for the structure""" 484 | return Terminal(self.attribute, "exists") 485 | 486 | def in_( 487 | self, 488 | value: Union[ 489 | List[str], 490 | List[int], 491 | List[float], 492 | List[date], 493 | Tuple[str, ...], 494 | Tuple[int, ...], 495 | Tuple[float, ...], 496 | Tuple[date, ...], 497 | "Value[List[str]]", 498 | "Value[List[int]]", 499 | "Value[List[float]]", 500 | "Value[List[date]]", 501 | "Value[Tuple[str, ...]]", 502 | "Value[Tuple[int, ...]]", 503 | "Value[Tuple[float, ...]]", 504 | "Value[Tuple[date, ...]]", 505 | ], 506 | ) -> Terminal: 507 | """Attribute is contained in the list of values""" 508 | if isinstance(value, Value): 509 | value = value.value 510 | return Terminal(self.attribute, "in", value) 511 | 512 | # Need ignore[override] because typeshed restricts __eq__ return value 513 | # https://github.com/python/mypy/issues/2783 514 | @overload # type: ignore[override] 515 | def __eq__(self, value: "Attr") -> bool: 516 | ... 517 | 518 | @overload # type: ignore[override] 519 | def __eq__( 520 | self, 521 | value: Union[ 522 | str, 523 | int, 524 | float, 525 | date, 526 | "Value[str]", 527 | "Value[int]", 528 | "Value[float]", 529 | "Value[date]", 530 | ], 531 | ) -> Terminal: 532 | ... 533 | 534 | def __eq__( 535 | self, 536 | value: Union[ 537 | "Attr", 538 | str, 539 | int, 540 | float, 541 | date, 542 | "Value[str]", 543 | "Value[int]", 544 | "Value[float]", 545 | "Value[date]", 546 | ], 547 | ) -> Union[Terminal, bool]: # type: ignore[override] 548 | if isinstance(value, Attr): 549 | return self.attribute == value.attribute 550 | if isinstance(value, Value): 551 | value = value.value 552 | if isinstance(value, str): 553 | return self.exact_match(value) 554 | elif ( 555 | isinstance(value, date) 556 | or isinstance(value, float) 557 | or isinstance(value, int) 558 | ): 559 | return self.equals(value) 560 | else: 561 | return NotImplemented 562 | 563 | @overload # type: ignore[override] 564 | def __ne__(self, value: "Attr") -> bool: 565 | ... 566 | 567 | @overload # type: ignore[override] 568 | def __ne__( 569 | self, 570 | value: Union[ 571 | str, 572 | int, 573 | float, 574 | date, 575 | "Value[str]", 576 | "Value[int]", 577 | "Value[float]", 578 | "Value[date]", 579 | ], 580 | ) -> Terminal: 581 | ... 582 | 583 | def __ne__( 584 | self, 585 | value: Union[ 586 | "Attr", 587 | str, 588 | int, 589 | float, 590 | date, 591 | "Value[str]", 592 | "Value[int]", 593 | "Value[float]", 594 | "Value[date]", 595 | ], 596 | ) -> Union[Terminal, bool]: # type: ignore[override] 597 | if isinstance(value, Attr): 598 | return self.attribute != value.attribute 599 | if isinstance(value, Value): 600 | value = value.value 601 | return ~(self == value) 602 | 603 | def __lt__(self, value: TNumberLike) -> Terminal: 604 | if isinstance(value, Value): 605 | value = value.value 606 | return self.less(value) 607 | 608 | def __le__(self, value: TNumberLike) -> Terminal: 609 | if isinstance(value, Value): 610 | value = value.value 611 | return self.less_or_equal(value) 612 | 613 | def __gt__(self, value: TNumberLike) -> Terminal: 614 | if isinstance(value, Value): 615 | value = value.value 616 | return self.greater(value) 617 | 618 | def __ge__(self, value: TNumberLike) -> Terminal: 619 | if isinstance(value, Value): 620 | value = value.value 621 | return self.greater_or_equal(value) 622 | 623 | def __bool__(self) -> Terminal: 624 | return self.exists() 625 | 626 | def __contains__( 627 | self, value: Union[str, List[str], "Value[str]", "Value[List[str]]"] 628 | ) -> Terminal: 629 | """Maps to contains_words or contains_phrase depending on the value passed. 630 | 631 | * `"value" in attr` maps to `attr.contains_phrase("value")` for simple values. 632 | * `["value"] in attr` maps to `attr.contains_words(["value"])` for lists and 633 | tuples. 634 | """ 635 | if isinstance(value, Value): 636 | value = value.value 637 | if isinstance(value, list): 638 | if len(value) == 0 or isinstance(value[0], str): 639 | return self.contains_words(value) 640 | else: 641 | return NotImplemented 642 | else: 643 | return self.contains_phrase(value) 644 | 645 | 646 | # Type for functions returning Terminal 647 | FTerminal = TypeVar("FTerminal", bound=Callable[..., Terminal]) 648 | # Type for functions returning Query 649 | FQuery = TypeVar("FQuery", bound=Callable[..., Query]) 650 | 651 | 652 | def _attr_delegate(attr_func: FTerminal) -> Callable[[FQuery], FQuery]: 653 | """Decorator for PartialQuery methods. Delegates a function to self.attr. 654 | 655 | This reduces boilerplate, especially for classes with lots of dunder methods 656 | (preventing the use of `__getattr__`). 657 | 658 | Argument: 659 | - attr_func: A method in the Attr class producing a Terminal 660 | 661 | Returns: A function producing a Query according to the PartialQuery's operator 662 | """ 663 | 664 | def decorator(partialquery_func: FQuery): 665 | @functools.wraps(partialquery_func) 666 | def wrap(self: "PartialQuery", *args, **kwargs) -> Query: 667 | term: Terminal = attr_func(self.attr, *args, **kwargs) 668 | if self.operator == "and": 669 | return self.query & term 670 | elif self.operator == "or": 671 | return self.query | term 672 | else: 673 | raise ValueError(f"Unknown operator: {self.operator}") 674 | 675 | return wrap 676 | 677 | return decorator 678 | 679 | 680 | class PartialQuery: 681 | """A PartialQuery extends a growing query with an Attr. It is constructed 682 | using the fluent syntax with the `and_` and `or_` methods. It is not usually 683 | necessary to create instances of this class directly. 684 | 685 | PartialQuery instances behave like Attr instances in most situations. 686 | """ 687 | 688 | attr: Attr 689 | query: Query 690 | operator: TAndOr 691 | 692 | def __init__(self, query: Query, operator: TAndOr, attr: Attr): 693 | self.query = query 694 | self.operator = operator 695 | self.attr = attr 696 | 697 | @_attr_delegate(Attr.exact_match) 698 | def exact_match(self, value: Union[str, "Value[str]"]) -> Query: 699 | ... 700 | 701 | @_attr_delegate(Attr.contains_words) 702 | def contains_words( 703 | self, value: Union[str, "Value[str]", List[str], "Value[List[str]]"] 704 | ) -> Query: 705 | ... 706 | 707 | @_attr_delegate(Attr.contains_phrase) 708 | def contains_phrase(self, value: Union[str, "Value[str]"]) -> Query: 709 | ... 710 | 711 | @_attr_delegate(Attr.greater) 712 | def greater(self, value: TNumberLike) -> Query: 713 | ... 714 | 715 | @_attr_delegate(Attr.less) 716 | def less(self, value: TNumberLike) -> Query: 717 | ... 718 | 719 | @_attr_delegate(Attr.greater_or_equal) 720 | def greater_or_equal(self, value: TNumberLike) -> Query: 721 | ... 722 | 723 | @_attr_delegate(Attr.less_or_equal) 724 | def less_or_equal(self, value: TNumberLike) -> Query: 725 | ... 726 | 727 | @_attr_delegate(Attr.equals) 728 | def equals(self, value: TNumberLike) -> Query: 729 | ... 730 | 731 | @_attr_delegate(Attr.range) 732 | def range(self, value: Union[List[int], Tuple[int, int]]) -> Query: 733 | ... 734 | 735 | @_attr_delegate(Attr.range_closed) 736 | def range_closed( 737 | self, 738 | value: Union[ 739 | List[int], Tuple[int, int], "Value[List[int]]", "Value[Tuple[int, int]]" 740 | ], 741 | ) -> Query: 742 | ... 743 | 744 | @_attr_delegate(Attr.exists) 745 | def exists(self) -> Query: 746 | ... 747 | 748 | @_attr_delegate(Attr.in_) 749 | def in_( 750 | self, 751 | value: Union[ 752 | str, 753 | int, 754 | float, 755 | date, 756 | "Value[str]", 757 | "Value[int]", 758 | "Value[float]", 759 | "Value[date]", 760 | ], 761 | ) -> Query: 762 | ... 763 | 764 | @overload # type: ignore[override] 765 | def __eq__(self, value: "PartialQuery") -> bool: 766 | ... 767 | 768 | @overload # type: ignore[override] 769 | def __eq__( 770 | self, 771 | value: Union[ 772 | str, 773 | int, 774 | float, 775 | date, 776 | "Value[str]", 777 | "Value[int]", 778 | "Value[float]", 779 | "Value[date]", 780 | ], 781 | ) -> Query: 782 | ... 783 | 784 | def __eq__( 785 | self, 786 | value: Union[ 787 | "PartialQuery", 788 | str, 789 | int, 790 | float, 791 | date, 792 | "Value[str]", 793 | "Value[int]", 794 | "Value[float]", 795 | "Value[date]", 796 | ], 797 | ) -> Union[Query, bool]: # type: ignore[override] 798 | if isinstance(value, PartialQuery): 799 | return ( 800 | self.attr == value.attr 801 | and self.query == value.query 802 | and self.operator == value.operator 803 | ) 804 | 805 | if self.operator == "and": 806 | return self.query & (self.attr == value) 807 | elif self.operator == "or": 808 | return self.query | (self.attr == value) 809 | else: 810 | raise ValueError(f"Unknown operator: {self.operator}") 811 | 812 | @overload # type: ignore[override] 813 | def __ne__(self, value: "PartialQuery") -> bool: 814 | ... 815 | 816 | @overload # type: ignore[override] 817 | def __ne__( 818 | self, 819 | value: Union[ 820 | str, 821 | int, 822 | float, 823 | date, 824 | "Value[str]", 825 | "Value[int]", 826 | "Value[float]", 827 | "Value[date]", 828 | ], 829 | ) -> Query: 830 | ... 831 | 832 | def __ne__( 833 | self, 834 | value: Union[ 835 | "PartialQuery", 836 | str, 837 | int, 838 | float, 839 | date, 840 | "Value[str]", 841 | "Value[int]", 842 | "Value[float]", 843 | "Value[date]", 844 | ], 845 | ) -> Union[Query, bool]: # type: ignore[override] 846 | if isinstance(value, PartialQuery): 847 | return self.attr != value.attr 848 | return ~(self == value) 849 | 850 | @_attr_delegate(Attr.__lt__) 851 | def __lt__(self, value: TNumberLike) -> Query: 852 | ... 853 | 854 | @_attr_delegate(Attr.__le__) 855 | def __le__(self, value: TNumberLike) -> Query: 856 | ... 857 | 858 | @_attr_delegate(Attr.__gt__) 859 | def __gt__(self, value: TNumberLike) -> Query: 860 | ... 861 | 862 | @_attr_delegate(Attr.__ge__) 863 | def __ge__(self, value: TNumberLike) -> Query: 864 | ... 865 | 866 | @_attr_delegate(Attr.__bool__) 867 | def __bool__(self) -> Query: 868 | ... 869 | 870 | @_attr_delegate(Attr.__contains__) 871 | def __contains__( 872 | self, value: Union[str, List[str], "Value[str]", "Value[List[str]]"] 873 | ) -> Query: 874 | ... 875 | 876 | 877 | T = TypeVar("T", bound="TValue") 878 | 879 | 880 | @dataclass(frozen=True) 881 | class Value(Generic[T]): 882 | """Represents a value in a query. 883 | 884 | In most cases values are unnecessary and can be replaced directly by the python 885 | value. 886 | 887 | Values can also be used if the Attr object appears on the right: 888 | 889 | Value("4HHB") == Attr("rcsb_entry_container_identifiers.entry_id") 890 | """ 891 | 892 | value: T 893 | 894 | @overload # type: ignore[override] 895 | def __eq__(self, attr: "Value") -> bool: 896 | ... 897 | 898 | @overload # type: ignore[override] 899 | def __eq__(self, attr: Attr) -> Terminal: 900 | ... 901 | 902 | def __eq__(self, attr: Union["Value", Attr]) -> Union[bool, Terminal]: 903 | # type: ignore[override] 904 | if isinstance(attr, Value): 905 | return self.value == attr.value 906 | if not isinstance(attr, Attr): 907 | return NotImplemented 908 | return attr == self 909 | 910 | @overload # type: ignore[override] 911 | def __ne__(self, attr: "Value") -> bool: 912 | ... 913 | 914 | @overload # type: ignore[override] 915 | def __ne__(self, attr: Attr) -> Terminal: 916 | ... 917 | 918 | def __ne__(self, attr: Union["Value", Attr]) -> Union[bool, Terminal]: 919 | # type: ignore[override] 920 | if isinstance(attr, Value): 921 | return self.value != attr.value 922 | if not isinstance(attr, Attr): 923 | return NotImplemented 924 | return attr != self.value 925 | 926 | def __lt__(self, attr: Attr) -> Terminal: 927 | if not isinstance(attr, Attr): 928 | return NotImplemented 929 | if not ( 930 | isinstance(self.value, int) 931 | or isinstance(self.value, float) 932 | or isinstance(self.value, date) 933 | ): 934 | return NotImplemented 935 | return attr.greater(self.value) 936 | 937 | def __le__(self, attr: Attr) -> Terminal: 938 | if not isinstance(attr, Attr): 939 | return NotImplemented 940 | if not ( 941 | isinstance(self.value, int) 942 | or isinstance(self.value, float) 943 | or isinstance(self.value, date) 944 | ): 945 | return NotImplemented 946 | return attr.greater_or_equal(self.value) 947 | 948 | def __gt__(self, attr: Attr) -> Terminal: 949 | if not isinstance(attr, Attr): 950 | return NotImplemented 951 | if not ( 952 | isinstance(self.value, int) 953 | or isinstance(self.value, float) 954 | or isinstance(self.value, date) 955 | ): 956 | return NotImplemented 957 | return attr.less(self.value) 958 | 959 | def __ge__(self, attr: Attr) -> Terminal: 960 | if not isinstance(attr, Attr): 961 | return NotImplemented 962 | if not ( 963 | isinstance(self.value, int) 964 | or isinstance(self.value, float) 965 | or isinstance(self.value, date) 966 | ): 967 | return NotImplemented 968 | return attr.less_or_equal(self.value) 969 | 970 | 971 | class Session(Iterable[str]): 972 | """A single query session. 973 | 974 | Handles paging the query and parsing results 975 | """ 976 | 977 | url = "http://search.rcsb.org/rcsbsearch/v1/query" 978 | query_id: str 979 | query: Query 980 | return_type: ReturnType 981 | start: int 982 | rows: int 983 | 984 | def __init__( 985 | self, query: Query, return_type: ReturnType = "entry", rows: int = 100 986 | ): 987 | self.query_id = Session.make_uuid() 988 | self.query = query.assign_ids() 989 | self.return_type = return_type 990 | self.start = 0 991 | self.rows = rows 992 | 993 | @staticmethod 994 | def make_uuid() -> str: 995 | "Create a new UUID to identify a query" 996 | return uuid.uuid4().hex 997 | 998 | @staticmethod 999 | def _extract_identifiers(query_json: Optional[Dict]) -> List[str]: 1000 | """Extract identifiers from a JSON response""" 1001 | if query_json is None: 1002 | return [] 1003 | 1004 | # total_count = int(query_json["total_count"]) 1005 | identifiers = [result["identifier"] for result in query_json["result_set"]] 1006 | # assert len(identifiers) == total_count, f"{len(identifiers)} != {total_count}" 1007 | return identifiers 1008 | 1009 | def _make_params(self, start=0): 1010 | "Generate GET parameters as a dict" 1011 | return dict( 1012 | query=self.query.to_dict(), 1013 | return_type=self.return_type, 1014 | request_info=dict(query_id=self.query_id, src="ui"), # TODO src deprecated? 1015 | request_options=dict(pager=dict(start=start, rows=self.rows)), 1016 | ) 1017 | 1018 | def _single_query(self, start=0) -> Optional[Dict]: 1019 | "Fires a single query" 1020 | params = self._make_params(start) 1021 | logging.debug( 1022 | f"Querying {self.url} for results {start}-{start + self.rows - 1}" 1023 | ) 1024 | response = requests.get( 1025 | self.url, {"json": json.dumps(params, separators=(",", ":"))} 1026 | ) 1027 | response.raise_for_status() 1028 | if response.status_code == requests.codes.OK: 1029 | return response.json() 1030 | elif response.status_code == requests.codes.NO_CONTENT: 1031 | return None 1032 | else: 1033 | raise Exception(f"Unexpected status: {response.status_code}") 1034 | 1035 | def __iter__(self) -> Iterator[str]: 1036 | "Generator for all results as a list of identifiers" 1037 | start = 0 1038 | response = self._single_query(start=start) 1039 | if response is None: 1040 | return # be explicit for mypy 1041 | identifiers = self._extract_identifiers(response) 1042 | start += self.rows 1043 | logging.debug(f"Got {len(identifiers)} ids") 1044 | 1045 | if len(identifiers) == 0: 1046 | return 1047 | yield from identifiers 1048 | 1049 | total = response["total_count"] 1050 | 1051 | while start < total: 1052 | assert len(identifiers) == self.rows 1053 | response = self._single_query(start=start) 1054 | identifiers = self._extract_identifiers(response) 1055 | logging.debug(f"Got {len(identifiers)} ids") 1056 | start += self.rows 1057 | yield from identifiers 1058 | 1059 | def iquery(self, limit: Optional[int] = None) -> List[str]: 1060 | """Evaluate the query and display an interactive progress bar. 1061 | 1062 | Requires tqdm. 1063 | """ 1064 | from tqdm import trange # type: ignore 1065 | 1066 | response = self._single_query(start=0) 1067 | if response is None: 1068 | return [] 1069 | total = response["total_count"] 1070 | identifiers = self._extract_identifiers(response) 1071 | if limit is not None and len(identifiers) >= limit: 1072 | return identifiers[:limit] 1073 | 1074 | pages = math.ceil((total if limit is None else min(total, limit)) / self.rows) 1075 | 1076 | for page in trange(1, pages, initial=1, total=pages): 1077 | response = self._single_query(page * self.rows) 1078 | ids = self._extract_identifiers(response) 1079 | identifiers.extend(ids) 1080 | 1081 | return identifiers[:limit] 1082 | 1083 | def rcsb_query_editor_url(self) -> str: 1084 | """URL to edit this query in the RCSB query editor""" 1085 | data = json.dumps(self._make_params(), separators=(",", ":")) 1086 | return ( 1087 | f"http://search.rcsb.org/query-editor.html?json={urllib.parse.quote(data)}" 1088 | ) 1089 | 1090 | def rcsb_query_builder_url(self) -> str: 1091 | """URL to view this query on the RCSB website query builder""" 1092 | data = json.dumps(self._make_params(), separators=(",", ":")) 1093 | return f"http://www.rcsb.org/search?request={urllib.parse.quote(data)}" 1094 | -------------------------------------------------------------------------------- /rcsbsearch/update_schema.py: -------------------------------------------------------------------------------- 1 | """Update the distribution json files; for developer use only""" 2 | import json 3 | from pathlib import Path 4 | 5 | try: 6 | from .schema import _download_json_schema 7 | except Exception: 8 | # ignore errors that may occur parsing the schema 9 | pass 10 | 11 | if __name__ == "__main__": 12 | path = Path(__file__).parent.joinpath("resources", "metadata_schema.json") 13 | print(path) 14 | with open(path, "wt") as file: 15 | latest = _download_json_schema() 16 | json.dump(latest, file) 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Minimial dependencies 2 | --index-url https://pypi.python.org/simple/ 3 | 4 | -e . -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | # Full dependencies 2 | --index-url https://pypi.python.org/simple/ 3 | 4 | -e .[progressbar,docs,tests] -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # consistent with black 3 | max-line-length = 88 4 | extend-ignore = E203, W503 5 | 6 | [mypy] 7 | #exclude = /build/ 8 | files = rcsbsearch 9 | 10 | [isort] 11 | # consistent with black 12 | multi_line_output = VERTICAL_HANGING_INDENT 13 | include_trailing_comma = True 14 | force_grid_wrap = 0 15 | use_parentheses = True 16 | ensure_newline_before_comments = True 17 | line_length = 88 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools # type: ignore 2 | import sys 3 | 4 | # Load the version number from __init__.py 5 | __version__ = "Undefined" 6 | for line in open("rcsbsearch/__init__.py"): 7 | if line.startswith("__version__"): 8 | exec(line.strip()) 9 | 10 | # Version-specific requirements 11 | install_requires = ["requests", "jsonschema"] 12 | if sys.version_info < (3, 8): 13 | install_requires.append("typing_extensions") # 3.7 only 14 | 15 | # pin black version to get around https://github.com/psf/black/issues/2168 16 | tests_requires = ["tox", "pytest", "black==20.8b1", "flake8", "mypy"] 17 | 18 | # README 19 | with open("README.md", "r") as fh: 20 | long_description = fh.read() 21 | 22 | 23 | setuptools.setup( 24 | name="rcsbsearch", 25 | url="https://github.com/sbliven/rcsbsearch", 26 | description="Access the RCSB Search API", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | author="Spencer Bliven", 30 | author_email="spencer.bliven@gmail.com", 31 | version=__version__, 32 | tests_require=tests_requires, 33 | install_requires=install_requires, 34 | extras_require={ 35 | "progressbar": ["tqdm"], 36 | "tests": tests_requires, 37 | # should match docs/requirements.txt 38 | "docs": ["sphinx", "sphinx-rtd-theme", "myst-parser"], 39 | }, 40 | packages=setuptools.find_packages(exclude=["tests"]), 41 | package_data={"": ["resources/*"]}, 42 | scripts=[], 43 | classifiers=[ 44 | "Programming Language :: Python :: 3", 45 | "Programming Language :: Python :: 3 :: Only", 46 | "Programming Language :: Python :: 3.7", 47 | "Programming Language :: Python :: 3.8", 48 | "Development Status :: 4 - Beta", 49 | # "Development Status :: 5 - Production/Stable", 50 | "Operating System :: OS Independent", 51 | "Intended Audience :: Science/Research", 52 | "License :: OSI Approved :: BSD License", 53 | "Topic :: Scientific/Engineering :: Bio-Informatics", 54 | "Typing :: Typed", 55 | ], 56 | # Uses dataclasses, f-strings, typing 57 | python_requires=">=3.7", 58 | ) 59 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbliven/rcsbsearch/c7f8cb7e9f26ed5c78af1688af972fd345de8978/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_schema.py: -------------------------------------------------------------------------------- 1 | from rcsbsearch import rcsb_attributes as attrs 2 | 3 | 4 | def test_schema(): 5 | assert attrs.rcsb_id.attribute == "rcsb_id" 6 | 7 | assert attrs.rcsb_struct_symmetry.symbol.attribute == "rcsb_struct_symmetry.symbol" 8 | -------------------------------------------------------------------------------- /tests/test_search.py: -------------------------------------------------------------------------------- 1 | from itertools import islice 2 | 3 | import pytest # type: ignore 4 | import requests 5 | 6 | from rcsbsearch import Attr, Group, Session, Terminal, TextQuery, Value 7 | from rcsbsearch import rcsb_attributes as attrs 8 | from rcsbsearch.search import PartialQuery 9 | 10 | # q1 = rcsb.Terminal("rcsb_struct_symmetry.type", "exact_match", "Icosahedral") 11 | # q2 = rcsb.Terminal("rcsb_struct_symmetry.kind", "exact_match", "Global Symmetry") 12 | 13 | 14 | def test_construction(): 15 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "2GS2"]) 16 | q2 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "5T89"]) 17 | 18 | both = q1 & q2 19 | assert isinstance(both, Group) 20 | assert both.operator == "and" 21 | assert both.nodes[0] == q1 22 | assert both.nodes[1] == q2 23 | 24 | either = q1 | q2 25 | assert isinstance(either, Group) 26 | assert either.operator == "or" 27 | assert either.nodes[0] == q1 28 | assert either.nodes[1] == q2 29 | 30 | 31 | @pytest.mark.internet 32 | def test_single(): 33 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "2GS2"]) 34 | session = Session(Group("and", [q1])) 35 | result = session._single_query() 36 | assert result is not None 37 | 38 | 39 | @pytest.mark.internet 40 | @pytest.mark.progressbar 41 | def test_iquery(): 42 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "2GS2"]) 43 | session = Session(q1) 44 | result = session.iquery() 45 | assert len(result) == 2 46 | 47 | 48 | @pytest.mark.internet 49 | def test_iter(): 50 | ids = ["4HHB", "2GS2"] 51 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids) 52 | result = set(q1()) 53 | assert len(result) == 2 54 | assert result == set(ids) 55 | 56 | 57 | @pytest.mark.internet 58 | def test_inv(): 59 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "exact_match", "5T89") 60 | q = ~q1 61 | # Lots of results 62 | first = next(iter(q())) 63 | assert first is not None 64 | assert first != "5T89" 65 | 66 | 67 | @pytest.mark.internet 68 | def test_xor(): 69 | ids1 = ["5T89", "2GS2"] 70 | ids2 = ["4HHB", "2GS2"] 71 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids1) 72 | q2 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids2) 73 | q = q1 ^ q2 74 | print(f"XOR Query: {q}") 75 | result = set(q()) 76 | assert len(result) == 2 77 | assert result == {ids1[0], ids2[0]} 78 | 79 | 80 | @pytest.mark.internet 81 | def test_pagination(): 82 | ids = ["4HHB", "2GS2", "5T89", "1TIM"] 83 | q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids) 84 | 85 | # 2+2 results 86 | session = Session(q1, rows=2) 87 | result = set(session) 88 | assert len(result) == 4 89 | assert result == set(ids) 90 | 91 | # 3+1 results 92 | session = Session(q1, rows=3) 93 | result = set(session) 94 | assert len(result) == 4 95 | assert result == set(ids) 96 | 97 | # 1ABC will never be a valid ID 98 | q2 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["1ABC"]) 99 | session = Session(q2) 100 | result = set(session) 101 | assert len(result) == 0 102 | 103 | 104 | @pytest.mark.internet 105 | def test_errors(): 106 | # Malformed 107 | q1 = Terminal("invalid_identifier", "exact_match", "ERROR") 108 | session = Session(q1) 109 | try: 110 | set(session) 111 | assert False, "Should raise error" 112 | except requests.HTTPError: 113 | pass 114 | 115 | 116 | @pytest.mark.internet 117 | def test_example1(): 118 | """'Biological Assembly Search' from http://search.rcsb.org/#examples 119 | 120 | (Also used in the README) 121 | """ 122 | # Create terminals for each query 123 | q1 = TextQuery('"heat-shock transcription factor"') 124 | q2 = attrs.rcsb_struct_symmetry.symbol == "C2" 125 | q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry" 126 | q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1 127 | 128 | # combined using bitwise operators (&, |, ~, etc) 129 | query = q1 & q2 & q3 & q4 # AND of all queries 130 | 131 | results = set(query("assembly")) 132 | assert len(results) > 0 # 14 results 2020-06 133 | assert "1FYL-1" in results 134 | 135 | # Fluent syntax 136 | query2 = ( 137 | TextQuery('"heat-shock transcription factor"') 138 | .and_("rcsb_struct_symmetry.symbol") 139 | .exact_match("C2") 140 | .and_("rcsb_struct_symmetry.kind") 141 | .exact_match("Global Symmetry") 142 | .and_("rcsb_entry_info.polymer_entity_count_DNA") 143 | .greater_or_equal(1) 144 | ) 145 | 146 | assert query2 == query 147 | 148 | results = set(query2.exec("assembly")) 149 | assert len(results) > 0 # 14 results 2020-06 150 | assert "1FYL-1" in results 151 | 152 | 153 | @pytest.mark.internet 154 | def test_example2(): 155 | "'X-Ray Structures Search' from http://search.rcsb.org/#examples" 156 | q = ( 157 | Terminal(value='"thymidine kinase"') 158 | & Terminal( 159 | "rcsb_entity_source_organism.taxonomy_lineage.name", 160 | "exact_match", 161 | "Viruses", 162 | ) 163 | & Terminal( 164 | "exptl.method", 165 | "exact_match", 166 | "X-RAY DIFFRACTION", 167 | ) 168 | & Terminal( 169 | "rcsb_entry_info.resolution_combined", 170 | "less_or_equal", 171 | 2.5, 172 | ) 173 | & Terminal("rcsb_entry_info.nonpolymer_entity_count", "greater", 0) 174 | ) 175 | 176 | results = set(q("entry")) 177 | assert len(results) > 0 # 224 results 2020-06 178 | assert "1KI6" in results 179 | 180 | 181 | def test_attr(): 182 | attr = Attr("attr") 183 | 184 | term = attr == "value" 185 | assert isinstance(term, Terminal) 186 | assert term.operator == "exact_match" 187 | 188 | term = "value" == attr 189 | assert isinstance(term, Terminal) 190 | assert term.operator == "exact_match" 191 | 192 | term = Value("value") == attr 193 | assert isinstance(term, Terminal) 194 | assert term.operator == "exact_match" 195 | 196 | 197 | @pytest.mark.internet 198 | def test_freetext(): 199 | query = TextQuery("tubulin") 200 | results = set(query()) 201 | assert len(results) > 0 202 | 203 | 204 | def test_partialquery(): 205 | query = Attr("a").equals("aval").and_("b") 206 | 207 | assert isinstance(query, PartialQuery) 208 | 209 | query = query.exact_match("bval") 210 | 211 | assert isinstance(query, Group) 212 | assert query.operator == "and" 213 | assert len(query.nodes) == 2 214 | assert query.nodes[0].attribute == "a" 215 | assert query.nodes[0].operator == "equals" 216 | assert query.nodes[0].value == "aval" 217 | assert query.nodes[1].attribute == "b" 218 | assert query.nodes[1].operator == "exact_match" 219 | assert query.nodes[1].value == "bval" 220 | 221 | query = query.and_(Attr("c") < 5) 222 | assert len(query.nodes) == 3 223 | assert query.nodes[2].attribute == "c" 224 | assert query.nodes[2].operator == "less" 225 | assert query.nodes[2].value == 5 226 | 227 | query = query.or_("d") 228 | 229 | assert isinstance(query, PartialQuery) 230 | assert query.attr == Attr("d") 231 | assert query.operator == "or" 232 | 233 | query = query == "dval" 234 | assert isinstance(query, Group) 235 | assert query.operator == "or" 236 | assert len(query.nodes) == 2 237 | assert isinstance(query.nodes[0], Group) 238 | assert query.nodes[1].attribute == "d" 239 | assert query.nodes[1].operator == "exact_match" 240 | assert query.nodes[1].value == "dval" 241 | 242 | 243 | def test_operators(): 244 | q1 = attrs.rcsb_id.in_(["4HHB", "2GS2"]) 245 | results = list(q1()) 246 | assert len(results) == 2 247 | 248 | q1 = attrs.citation.rcsb_authors.contains_words("kisko bliven") 249 | results = list(q1()) 250 | assert results[0] == "5T89" # first hit has both authors 251 | assert "3V6B" in results # only a single author 252 | 253 | q1 = attrs.citation.rcsb_authors.contains_phrase("kisko bliven") 254 | results = list(q1()) 255 | assert len(results) == 0 256 | 257 | q1 = attrs.struct.title.contains_phrase( 258 | "VEGF-A in complex with VEGFR-1 domains D1-6" 259 | ) 260 | results = list(q1()) 261 | assert "5T89" in results 262 | 263 | q1 = attrs.rcsb_struct_symmetry.type.exact_match("Asymmetric") 264 | results = list(islice(q1(), 5)) 265 | assert len(results) == 5 266 | 267 | q1 = attrs.rcsb_struct_symmetry.type.exact_match("symmetric") 268 | results = list(islice(q1(), 5)) 269 | assert len(results) == 0 270 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37, py38, py39, lint, docs 3 | 4 | [testenv:lint] 5 | # pin black version to get around https://github.com/psf/black/issues/2168 6 | deps = 7 | black==20.8b1 8 | flake8 9 | mypy 10 | commands = 11 | black --check . 12 | flake8 13 | mypy rcsbsearch tests 14 | 15 | [testenv:docs] 16 | deps = -rdocs/requirements.txt 17 | changedir = docs 18 | whitelist_externals = make 19 | commands = 20 | make clean 21 | make html 22 | 23 | [testenv] 24 | deps = 25 | pytest 26 | tqdm 27 | commands = 28 | pytest {posargs} --------------------------------------------------------------------------------