├── .gitignore
├── .readthedocs.yml
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── docs
    ├── Makefile
    ├── api.rst
    ├── conf.py
    ├── index.md
    ├── make.bat
    ├── queries.md
    ├── quickstart.md
    └── requirements.txt
├── environment.yml
├── notebooks
    ├── covid.ipynb
    └── quickstart.ipynb
├── pytest.ini
├── rcsbsearch
    ├── __init__.py
    ├── resources
    │   └── metadata_schema.json
    ├── schema.py
    ├── search.py
    └── update_schema.py
├── requirements.txt
├── requirements_dev.txt
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── test_schema.py
    └── test_search.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |     configuration: docs/conf.py
11 | 
12 | # Optionally build your docs in additional formats such as PDF
13 | formats:
14 |     - pdf
15 | 
16 | python:
17 |     version: 3.7
18 |     install:
19 |         - requirements: docs/requirements.txt


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.7"
 4 |   - "3.8"
 5 |   - "3.9"
 6 | jobs:
 7 |   include:
 8 |   - name: lint
 9 |     script: tox -e lint
10 | install:
11 |   - pip install tox-travis
12 | script:
13 |   - tox
14 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## v0.2.3 (2021-04-28)
 4 | 
 5 | - Fix bug with missing schema files when installed via pip
 6 | - Add jupyter notebooks
 7 | - Try rcsbsearch live with binder
 8 | 
 9 | ## v0.2.2 (2021-04-06)
10 | 
11 | - Remove `in` operator syntax (incompatible with python spec)
12 | - Fix import error due to schema change
13 | - Ship schema with the package for stability and performance
14 | 
15 | ## v0.2.1 (2020-06-18)
16 | 
17 | - Test release process
18 | 
19 | ## v0.2.0 (2020-06-18)
20 | 
21 | - Add fluent syntax (originally called builder syntax)
22 |   - Add PartialQuery helper
23 | - Improve docs & automated testing
24 | 
25 | ## v0.1.0 (2020-06-03)
26 | 
27 | - Ship it!
28 | - Support for text searches


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | ## Testing
  4 | 
  5 | Tests are run using tox and/or pytest.
  6 | 
  7 |     tox -e py37
  8 | 
  9 | or directly:
 10 | 
 11 |     pytest
 12 | 
 13 | 
 14 | ## Code Style
 15 | 
 16 | Code conforms to the `black` and PEP8 style guides. Before checking in code, please run the linters:
 17 | 
 18 |     black .
 19 |     flake8
 20 |     mypy rcsbsearch
 21 | 
 22 | These are tested by the 'lint' tox environment:
 23 | 
 24 |     tox -e lint
 25 | 
 26 | 
 27 | ## Building docs
 28 | 
 29 | Docs are written in the [MyST](https://myst-parser.readthedocs.io) superset of
 30 | markdown. [Google style
 31 | docstrings](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) are
 32 | preferred for API documentation.
 33 | 
 34 | Building with tox:
 35 | 
 36 |     tox -e docs
 37 | 
 38 | Building manually:
 39 | 
 40 |     cd docs
 41 |     make html
 42 | 
 43 | For live updates, you can also install the `sphinx-autobuild` tool (via pip or conda):
 44 | 
 45 |     pip install sphinx-autobuild
 46 |     cd docs
 47 |     make livehtml
 48 | 
 49 | Which runs:
 50 | 
 51 |     sphinx-autobuild -z rcsbsearch docs docs/_build/html
 52 | 
 53 | ## Making a release
 54 | 
 55 | ### Setup
 56 | 
 57 | - Set up GPG key (for signing the tag)
 58 | - `pip install twine`
 59 | - Generate API token at TestPyPI and PyPI and add to .pypirc:
 60 | 
 61 |     [distutils]
 62 |         index-servers=
 63 |             pypi
 64 |             testpypi
 65 |     [pypi]
 66 |         username = __token__
 67 |         password = pypi-...
 68 |     [testpypi]
 69 |         repository: https://test.pypi.org/legacy/
 70 |         username = __token__
 71 |         password = pypi-...
 72 | 
 73 | - `chmod 600 ~/.pypirc`
 74 | 
 75 | 
 76 | ### Release
 77 | 
 78 | 1. Test
 79 | 
 80 |         tox
 81 | 
 82 | 2. Build
 83 | 
 84 |         python setup.py sdist bdist_wheel
 85 | 
 86 | 3. Tag
 87 | 
 88 |         git tag -s -a v0.1.0
 89 | 
 90 | 4. Run checks
 91 | 
 92 |         twine check dist/*
 93 |         git verify-tag v0.1.0
 94 | 
 95 | 5. Push to testing
 96 | 
 97 |         twine upload --repository testpypi -s --identity 780796DF dist/*
 98 | 
 99 | 6. Push!
100 | 
101 |         git push
102 |         git push --tags
103 |         twine upload -s --identity 780796DF dist/*
104 | 
105 | 7. Bump version number
106 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | --------------------
 3 | 
 4 | Copyright (c) 2020, Spencer Bliven
 5 | All rights reserved.
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are met:
 9 | 
10 | 1. Redistributions of source code must retain the above copyright notice,
11 |    this list of conditions and the following disclaimer.
12 | 
13 | 2. Redistributions in binary form must reproduce the above copyright notice,
14 |    this list of conditions and the following disclaimer in the documentation
15 |    and/or other materials provided with the distribution.
16 | 
17 | 3. Neither the name of the copyright holder nor the names of its contributors
18 |    may be used to endorse or promote products derived from this software
19 |    without specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![PyPi Release](https://img.shields.io/pypi/v/rcsbsearch.svg)](https://pypi.org/project/rcsbsearch/)
  2 | [![Build Status](https://travis-ci.org/sbliven/rcsbsearch.svg?branch=master)](https://travis-ci.org/sbliven/rcsbsearch)
  3 | [![Documentation Status](https://readthedocs.org/projects/rcsbsearch/badge/?version=latest)](https://rcsbsearch.readthedocs.io/en/latest/?badge=latest)
  4 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
  5 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fcovid.ipynb)
  6 | 
  7 | # rcsbsearch
  8 | 
  9 | Python interface for the RCSB search API.
 10 | 
 11 | Currently the 'text search' part of the API has been implemented. See 'Supported
 12 | features' below.
 13 | 
 14 | This package requires python 3.7 or later.
 15 | 
 16 | ## Example
 17 | 
 18 | Here is a quick example of how the package is used. Two syntaxes are available for
 19 | constructing queries: an "operator" API using python's comparators, and a "fluent"
 20 | syntax where terms are chained together. Which to use is a matter of preference.
 21 | 
 22 | A runnable jupyter notebook with this example is available in [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb), or can be run online using binder:
 23 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fquickstart.ipynb)
 24 | 
 25 | An additional Covid-19 related example is in [notebooks/covid.ipynb](notebooks/covid.ipynb):
 26 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fcovid.ipynb)
 27 | 
 28 | ### Operator example
 29 | 
 30 | Here is an example from the [RCSB Search
 31 | API](http://search.rcsb.org/#search-example-1) page, using the operator syntax. This
 32 | query finds symmetric dimers having a twofold rotation with the DNA-binding domain of
 33 | a heat-shock transcription factor.
 34 | 
 35 |     from rcsbsearch import TextQuery
 36 |     from rcsbsearch import rcsb_attributes as attrs
 37 | 
 38 |     # Create terminals for each query
 39 |     q1 = TextQuery('"heat-shock transcription factor"')
 40 |     q2 = attrs.rcsb_struct_symmetry.symbol == "C2"
 41 |     q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry"
 42 |     q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1
 43 | 
 44 |     # combined using bitwise operators (&, |, ~, etc)
 45 |     query = q1 & q2 & q3 & q4  # AND of all queries
 46 | 
 47 |     # Call the query to execute it
 48 |     for assemblyid in query("assembly"):
 49 |         print(assemblyid)
 50 | 
 51 | For a full list of attributes, please refer to the [RCSB
 52 | schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema).
 53 | 
 54 | ### Fluent Example
 55 | 
 56 | Here is the same example using the
 57 | [fluent](https://en.wikipedia.org/wiki/Fluent_interface) syntax.
 58 | 
 59 |     from rcsbsearch import TextQuery
 60 | 
 61 |     # Start with an Attr or TextQuery, then add terms
 62 |     results = TextQuery('"heat-shock transcription factor"') \
 63 |         .and_("rcsb_struct_symmetry.symbol").exact_match("C2") \
 64 |         .and_("rcsb_struct_symmetry.kind").exact_match("Global Symmetry") \
 65 |         .and_("rcsb_entry_info.polymer_entity_count_DNA").greater_or_equal(1) \
 66 |         .exec("assembly")
 67 | 
 68 |     # Exec produces an iterator of IDs
 69 |     for assemblyid in results:
 70 |         print(assemblyid)
 71 | 
 72 | 
 73 | ## Supported Features
 74 | 
 75 | The following checklist shows the status of current and planned features.
 76 | 
 77 | - [x] Attribute Comparison operations
 78 | - [x] Query set operations
 79 | - [x] Attribute `contains`, `in_` (fluent only)
 80 | - [ ] Sequence search
 81 | - [ ] Sequence motif search
 82 | - [ ] Structural search
 83 | - [ ] Structural motif search
 84 | - [ ] Chemical search
 85 | - [ ] Rich results using the Data API
 86 | 
 87 | Contributions are welcome for unchecked items!
 88 | 
 89 | ## Installation
 90 | 
 91 | Get it from pypi:
 92 | 
 93 |     pip install rcsbsearch
 94 | 
 95 | Or, download from [github](https://github.com/sbliven/rcsbsearch)
 96 | 
 97 | ## Documentation
 98 | 
 99 | Detailed documentation is at [rcsbsearch.readthedocs.io](https://rcsbsearch.readthedocs.io/en/latest/)
100 | 
101 | ## License
102 | 
103 | Code is licensed under the BSD 3-clause license. See [LICENSE](LICENSE) for details.
104 | 
105 | ## Citing rcsbsearch
106 | 
107 | Please cite the rcsbsearch package by URL:
108 | 
109 | > https://rcsbsearch.readthedocs.io
110 | 
111 | You should also cite the RCSB service this package utilizes:
112 | 
113 | > Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi
114 | > Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley,
115 | > John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards
116 | > Integrated Searching and Efficient Access to Macromolecular Structure Data
117 | > from the PDB Archive, Journal of Molecular Biology, 2020.
118 | > DOI: [10.1016/j.jmb.2020.11.003](https://doi.org/10.1016/j.jmb.2020.11.003)
119 | 
120 | ## Developers
121 | 
122 | For information about building and developing `rcsbsearch`, see
123 | [CONTRIBUTING.md](CONTRIBUTING.md)
124 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | livehtml:
18 | 	sphinx-autobuild -b html -z "$(SOURCEDIR)/../rcsbsearch" "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)
19 | 
20 | # Catch-all target: route all unknown targets to Sphinx using the new
21 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
22 | %: Makefile
23 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
24 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | API Documentation
2 | *****************
3 | 
4 | .. automodule:: rcsbsearch
5 |     :members:
6 |     :private-members:
7 |     :special-members:


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | 
16 | sys.path.insert(0, os.path.abspath(".."))
17 | import rcsbsearch  # noqa: E402
18 | 
19 | # -- Project information -----------------------------------------------------
20 | 
21 | project = "rcsbsearch"
22 | copyright = "2020, Spencer Bliven"
23 | author = "Spencer Bliven"
24 | 
25 | # The version info for the project you're documenting, acts as replacement for
26 | # |version| and |release|, also used in various other places throughout the
27 | # built documents.
28 | #
29 | # The short X.Y version.
30 | version = rcsbsearch.__version__.split("-")[0]
31 | # The full version, including alpha/beta/rc tags
32 | release = rcsbsearch.__version__
33 | 
34 | 
35 | # -- General configuration ---------------------------------------------------
36 | 
37 | # Add any Sphinx extension module names here, as strings. They can be
38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
39 | # ones.
40 | extensions = [
41 |     "sphinx.ext.autodoc",
42 |     "sphinx.ext.coverage",
43 |     "sphinx.ext.napoleon",
44 |     "myst_parser",
45 | ]
46 | # source_suffix = [".rst", ".md"]  # Redundant with newer sphinx versions
47 | 
48 | # Add any paths that contain templates here, relative to this directory.
49 | templates_path = ["_templates"]
50 | 
51 | # List of patterns, relative to source directory, that match files and
52 | # directories to ignore when looking for source files.
53 | # This pattern also affects html_static_path and html_extra_path.
54 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
55 | 
56 | # Napoleon settings
57 | # napoleon_google_docstring = True
58 | napoleon_numpy_docstring = False
59 | # napoleon_include_init_with_doc = False
60 | # napoleon_include_private_with_doc = False
61 | # napoleon_include_special_with_doc = True
62 | # napoleon_use_admonition_for_examples = False
63 | # napoleon_use_admonition_for_notes = False
64 | # napoleon_use_admonition_for_references = False
65 | # napoleon_use_ivar = False
66 | # napoleon_use_param = True
67 | # napoleon_use_rtype = True
68 | 
69 | 
70 | # -- Options for HTML output -------------------------------------------------
71 | 
72 | # The theme to use for HTML and HTML Help pages.  See the documentation for
73 | # a list of builtin themes.
74 | html_theme = "sphinx_rtd_theme"
75 | 
76 | 
77 | # Add any paths that contain custom static files (such as style sheets) here,
78 | # relative to this directory. They are copied after the builtin static files,
79 | # so a file named "default.css" will overwrite the builtin "default.css".
80 | html_static_path = ["_static"]
81 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # rcsbsearch - Query protein structures from python
 2 | 
 3 | The `rcsbsearch` package provides a python interface to the [RCSB Search API](http://search.rcsb.org/). Use it to fetch lists of PDB IDs corresponding to advanced query searches.
 4 | 
 5 | ```{toctree}
 6 | ---
 7 | caption: Contents
 8 | maxdepth: 2
 9 | ---
10 | quickstart.md
11 | queries.md
12 | api.rst
13 | ```
14 | 
15 | ## Availability
16 | 
17 | Get it from pypi:
18 | 
19 |     pip install rcsbsearch
20 | 
21 | Or, download from [github](https://github.com/sbliven/rcsbsearch)
22 | 
23 | ## License
24 | 
25 | Code is licensed under the BSD 3-clause license. See the
26 | [LICENSE](https://github.com/sbliven/rcsbsearch/blob/master/LICENSE) for details.
27 | 
28 | ## Citing
29 | 
30 | Please cite the rcsbsearch package by URL:
31 | 
32 | > https://rcsbsearch.readthedocs.io
33 | 
34 | You should also cite the RCSB service this package utilizes:
35 | 
36 | > Yana Rose, Jose M. Duarte, Robert Lowe, Joan Segura, Chunxiao Bi, Charmi
37 | > Bhikadiya, Li Chen, Alexander S. Rose, Sebastian Bittrich, Stephen K. Burley,
38 | > John D. Westbrook. RCSB Protein Data Bank: Architectural Advances Towards
39 | > Integrated Searching and Efficient Access to Macromolecular Structure Data
40 | > from the PDB Archive, Journal of Molecular Biology, 2020.
41 | > DOI: [10.1016/j.jmb.2020.11.003](https://doi.org/10.1016/j.jmb.2020.11.003)
42 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/queries.md:
--------------------------------------------------------------------------------
 1 | # Queries
 2 | 
 3 | Two syntaxes are available for constructing queries: an "operator" API using python's
 4 | comparators, and a "fluent" API where terms are chained together. Which to use is a
 5 | matter of preference, and both construct the same query object.
 6 | 
 7 | ## Operator syntax
 8 | 
 9 | Searches are built up from a series of `Terminal` nodes, which compare structural
10 | attributes to some search value. In the operator syntax, python's comparator
11 | operators are used to construct the comparison. The operators are overloaded to
12 | return `Terminal` objects for the comparisons.
13 | 
14 |     from rcsbsearch import TextQuery
15 |     from rcsbsearch import rcsb_attributes as attrs
16 | 
17 |     # Create terminals for each query
18 |     q1 = TextQuery('"heat-shock transcription factor"')
19 |     q2 = attrs.rcsb_struct_symmetry.symbol == "C2"
20 |     q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry"
21 |     q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1
22 | 
23 | Attributes are available from the rcsb_attributes object and can be tab-completed.
24 | They can additionally be constructed from strings using the `Attr(attribute)`
25 | constructor. For a full list of attributes, please refer to the [RCSB
26 | schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema).
27 | 
28 | `Terminal`s are combined into `Group`s using python's bitwise operators. This is
29 | analogous to how bitwise operators act on python `set` objects. The operators are
30 | lazy and won't perform the search until the query is executed.
31 | 
32 |     query = q1 & q2 & q3 & q4  # AND of all queries
33 | 
34 | AND (`&`), OR (`|`), and terminal negation (`~`) are implemented directly by the API,
35 | but the python package also implements set difference (`-`), symmetric difference (`^`),
36 | and general negation by transforming the query.
37 | 
38 | Queries are executed by calling them as functions. They return an iterator of result
39 | identifiers.
40 | 
41 |     results = set(query())
42 | 
43 | By default, the query will return "entry" results (PDB IDs). It is also possible to
44 | query other types of results (see [return-types](http://search.rcsb.org/#return-type)
45 | for options):
46 | 
47 |     assemblies = set(query("assembly"))
48 | 
49 | 
50 | ## Fluent syntax
51 | 
52 | The operator syntax is great for simple queries, but requires parentheses or
53 | temporary variables for complex nested queries. In these cases the fluent syntax may
54 | be clearer. Queries are built up by appending operations sequentially.
55 | 
56 |     from rcsbsearch import TextQuery
57 | 
 58 |     # Start with an Attr or TextQuery, then add terms
59 |     results = TextQuery('"heat-shock transcription factor"') \
60 |         .and_("rcsb_struct_symmetry.symbol").exact_match("C2") \
61 |         .and_("rcsb_struct_symmetry.kind").exact_match("Global Symmetry") \
62 |         .and_("rcsb_entry_info.polymer_entity_count_DNA").greater_or_equal(1) \
63 |         .exec("assembly")
64 | 
65 | ## Sessions
66 | 
67 | The result of executing a query (either by calling it or using `exec()`) is a
68 | `Session` object. It implements `__iter__`, so it is usually treated just as an
69 | iterator of IDs.
70 | 
71 | Paging is handled transparently by the session, with additional API requests made
72 | lazily as needed. The page size can be controlled with the `rows` parameter.
73 | 
74 |     first = next(iter(query(rows=1)))
75 | 
76 | ### Progress Bar
77 | 
78 | The `Session.iquery()` method provides a progress bar indicating the number of API
79 | requests being made. It requires the `tqdm` package be installed to track the
80 | progress of the query interactively.
81 | 
82 |     results = query().iquery()
83 | 


--------------------------------------------------------------------------------
/docs/quickstart.md:
--------------------------------------------------------------------------------
 1 | # Quickstart
 2 | 
 3 | ## Installation
 4 | 
 5 | Get it from pypi:
 6 | 
 7 |     pip install rcsbsearch
 8 | 
 9 | Or, download from [github](https://github.com/sbliven/rcsbsearch)
10 | 
11 | ## Syntax
12 | 
13 | Here is a quick example of how the package is used. Two syntaxes are available for
14 | constructing queries: an "operator" API using python's comparators, and a "fluent"
15 | syntax where terms are chained together. Which to use is a matter of preference.
16 | 
17 | A runnable jupyter notebook with this example is available in [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb), or can be run online using binder:
18 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fquickstart.ipynb)
19 | 
 20 | An additional Covid-19 related example is in [notebooks/covid.ipynb](notebooks/covid.ipynb):
21 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sbliven/rcsbsearch/master?filepath=notebooks%2Fcovid.ipynb)
22 | 
23 | ### Operator example
24 | 
25 | Here is an example from the [RCSB Search
26 | API](http://search.rcsb.org/#search-example-1) page, using the operator syntax. This
27 | query finds symmetric dimers having a twofold rotation with the DNA-binding domain of
28 | a heat-shock transcription factor.
29 | 
30 |     from rcsbsearch import TextQuery
31 |     from rcsbsearch import rcsb_attributes as attrs
32 | 
33 |     # Create terminals for each query
34 |     q1 = TextQuery('"heat-shock transcription factor"')
35 |     q2 = attrs.rcsb_struct_symmetry.symbol == "C2"
36 |     q3 = attrs.rcsb_struct_symmetry.kind == "Global Symmetry"
37 |     q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1
38 | 
39 |     # combined using bitwise operators (&, |, ~, etc)
40 |     query = q1 & q2 & q3 & q4  # AND of all queries
41 | 
42 |     # Call the query to execute it
43 |     for assemblyid in query("assembly"):
44 |         print(assemblyid)
45 | 
46 | For a full list of attributes, please refer to the [RCSB
47 | schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema).
48 | 
49 | ### Fluent Example
50 | 
 51 | Here is the same example using the fluent syntax:
52 | 
53 |     from rcsbsearch import Attr, TextQuery
54 | 
 55 |     # Start with an Attr or TextQuery, then add terms
56 |     results = TextQuery('"heat-shock transcription factor"') \
57 |         .and_("rcsb_struct_symmetry.symbol").exact_match("C2") \
58 |         .and_("rcsb_struct_symmetry.kind").exact_match("Global Symmetry") \
59 |         .and_("rcsb_entry_info.polymer_entity_count_DNA").greater_or_equal(1) \
60 |         .exec("assembly")
61 | 
62 |     # Exec produces an iterator of IDs
63 |     for assemblyid in results:
64 |         print(assemblyid)
65 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Pin dependencies for the docs
2 | # Should be kept up-to-date with setup.py
3 | sphinx==3.5.3
4 | sphinx-rtd-theme==0.5.2
5 | typing-extensions==3.7.4.3
6 | myst-parser==0.13.5
7 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | # Create conda environment
 2 | # This file is used for mybinder.org, so it includes all optional dependencies
 3 | name: rcsbsearch
 4 | channels:
 5 |   - conda-forge
 6 |   - defaults
 7 | dependencies:
 8 |   # Python 3.7 or newer
 9 |   - python >= 3.7
10 | 
11 |   # Required dependencies
12 |   - requests
13 |   - jsonschema
14 |   # python 3.7 only
15 |   - typing_extensions
16 | 
17 |   # dev requirements
18 |   - tqdm
19 |   - tox
20 |   - pytest
21 |   - black
22 |   - flake8
23 |   - mypy
24 |   - sphinx
25 |   - myst-parser
26 | 
27 |   # notebook packages
28 |   - jupyter
29 |   - nglview
30 | 
31 |   - pip
32 |   - pip:
33 |     - sphinx-rtd-theme
34 |     # Install rcsbsearch from local directory
35 |     - .
36 | 


--------------------------------------------------------------------------------
/notebooks/covid.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "metallic-memphis",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# rcsbsearch\n",
  9 |     "\n",
 10 |     "Access the RCSB advanced search from python: [rcsbsearch.readthedocs.io](https://rcsbsearch.readthedocs.io)\n",
 11 |     "\n",
 12 |     "    pip install rcsbsearch\n",
 13 |     "    \n",
 14 |     "## Demo\n",
 15 |     "\n",
 16 |     "We are interested in how the antiviral drug boceprevir interacts with Covid-19. \n",
 17 |     "- Source Organism is \"COVID-19\"\n",
 18 |     "- Structure title contains \"protease\"\n",
 19 |     "- Bound to ligand \"Boceprevir\"\n",
 20 |     "\n",
 21 |     "[RCSB Query](http://www.rcsb.org/search?request=%7B%22query%22%3A%7B%22type%22%3A%22group%22%2C%22logical_operator%22%3A%22and%22%2C%22nodes%22%3A%5B%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22rcsb_entity_source_organism.taxonomy_lineage.name%22%2C%22operator%22%3A%22exact_match%22%2C%22value%22%3A%22COVID-19%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A0%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22value%22%3A%22protease%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A1%7D%2C%7B%22type%22%3A%22terminal%22%2C%22service%22%3A%22text%22%2C%22parameters%22%3A%7B%22attribute%22%3A%22chem_comp.name%22%2C%22operator%22%3A%22contains_words%22%2C%22value%22%3A%22Boceprevir%22%2C%22negation%22%3Afalse%7D%2C%22node_id%22%3A2%7D%5D%7D%2C%22return_type%22%3A%22entry%22%2C%22request_info%22%3A%7B%22query_id%22%3A%2270e677a6376b4c5eba8b4f2b73866c92%22%2C%22src%22%3A%22ui%22%7D%7D)"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "id": "married-burden",
 28 |    "metadata": {},
 29 |    "outputs": [
 30 |     {
 31 |      "data": {
 32 |       "application/vnd.jupyter.widget-view+json": {
 33 |        "model_id": "0402505eff634df58b1f636f5f277d19",
 34 |        "version_major": 2,
 35 |        "version_minor": 0
 36 |       },
 37 |       "text/plain": []
 38 |      },
 39 |      "metadata": {},
 40 |      "output_type": "display_data"
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "from rcsbsearch import rcsb_attributes as attrs, TextQuery\n",
 45 |     "import nglview"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "id": "collectible-thread",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "## Operator syntax\n",
 54 |     "- Uses python comparison operators for basic attributes (`==`, `<`, `<=`, etc)\n",
 55 |     "- Combine using set operators (`&`, `|`, `~`, etc)\n",
 56 |     "- Execute queries as functions"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 2,
 62 |    "id": "confidential-behavior",
 63 |    "metadata": {},
 64 |    "outputs": [
 65 |     {
 66 |      "data": {
 67 |       "text/plain": [
 68 |        "['6WNP', '7K40']"
 69 |       ]
 70 |      },
 71 |      "execution_count": 2,
 72 |      "metadata": {},
 73 |      "output_type": "execute_result"
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "q1 = attrs.rcsb_entity_source_organism.taxonomy_lineage.name == \"COVID-19\"\n",
 78 |     "q2 = TextQuery(\"protease\")\n",
 79 |     "q3 = attrs.chem_comp.name.contains_words(\"Boceprevir\")\n",
 80 |     "q4 = attrs.rcsb_entry_info.resolution_combined > 1.5\n",
 81 |     "query = q1 & q2 & q3 & ~q4\n",
 82 |     "\n",
 83 |     "list(query())"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 3,
 89 |    "id": "unauthorized-judge",
 90 |    "metadata": {
 91 |     "scrolled": true
 92 |    },
 93 |    "outputs": [
 94 |     {
 95 |      "data": {
 96 |       "application/vnd.jupyter.widget-view+json": {
 97 |        "model_id": "b3b3d3efdc9b414f8a2a4d7a6de1474a",
 98 |        "version_major": 2,
 99 |        "version_minor": 0
100 |       },
101 |       "text/plain": [
102 |        "NGLWidget()"
103 |       ]
104 |      },
105 |      "metadata": {},
106 |      "output_type": "display_data"
107 |     }
108 |    ],
109 |    "source": [
110 |     "nglview.show_pdbid('7brp')"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "id": "uniform-allen",
116 |    "metadata": {},
117 |    "source": [
118 |     "## Fluent syntax\n",
119 |     "\n",
120 |     "A second syntax is available with a [fluent interface](https://en.wikipedia.org/wiki/Fluent_interface), similar to popular data science packages like tidyverse and Apache Spark. Function calls are chained together.\n",
121 |     "\n",
122 |     "Here's an example around a second antiviral, remdesivir. The drug interferes with RNA polymerase, replacing an adenine and causing early chain termination. When integrated into RNA, the nucleotide formed from remdesivir has residue code F86."
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 4,
128 |    "id": "irish-navigator",
129 |    "metadata": {
130 |     "scrolled": true
131 |    },
132 |    "outputs": [
133 |     {
134 |      "name": "stderr",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "100%|██████████| 1/1 [00:00<?, ?it/s]\n"
138 |      ]
139 |     },
140 |     {
141 |      "data": {
142 |       "text/plain": [
143 |        "['7L1F', '7C2K', '7B3C', '7B3B']"
144 |       ]
145 |      },
146 |      "execution_count": 4,
147 |      "metadata": {},
148 |      "output_type": "execute_result"
149 |     }
150 |    ],
151 |    "source": [
152 |     "attrs.struct.title.contains_phrase(\"RNA polymerase\")\\\n",
153 |     "    .or_(attrs.struct.title).contains_words(\"RdRp\")\\\n",
154 |     "    .and_(attrs.rcsb_chem_comp_container_identifiers.comp_id).exact_match(\"F86\")\\\n",
155 |     "    .and_(attrs.rcsb_entity_source_organism.taxonomy_lineage.name).exact_match(\"COVID-19\")\\\n",
156 |     "    .exec()\\\n",
157 |     "    .iquery()"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 5,
163 |    "id": "genuine-partner",
164 |    "metadata": {
165 |     "scrolled": true
166 |    },
167 |    "outputs": [
168 |     {
169 |      "data": {
170 |       "application/vnd.jupyter.widget-view+json": {
171 |        "model_id": "5bb2c508ad044d1b8e3ef6f0a3e6fece",
172 |        "version_major": 2,
173 |        "version_minor": 0
174 |       },
175 |       "text/plain": [
176 |        "NGLWidget()"
177 |       ]
178 |      },
179 |      "metadata": {},
180 |      "output_type": "display_data"
181 |     }
182 |    ],
183 |    "source": [
184 |     "view = nglview.show_pdbid('7B3C', default_representation=False)\n",
185 |     "#view.get_state()['_camera_orientation']\n",
186 |     "o = [6, 3, 23, 0, 23, 1, -6, 0, -2, 24, -2, 0, -84, -92, -109, 1]\n",
187 |     "view.control.orient(o)\n",
188 |     "view.add_surface(sele=\"protein\", opacity=.8, color=\"electrostatic\")\n",
189 |     "view.add_cartoon(sele=\"rna\", color=\"cyan\")\n",
190 |     "view.add_licorice(sele=\"rna\", color=\"cyan\")\n",
191 |     "view.add_spacefill(sele=\"F86\")\n",
192 |     "view"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "id": "distant-graduate",
198 |    "metadata": {},
199 |    "source": [
200 |     "## Try it!\n",
201 |     "\n",
202 |     "[rcsbsearch.readthedocs.io](https://rcsbsearch.readthedocs.io)"
203 |    ]
204 |   }
205 |  ],
206 |  "metadata": {
207 |   "kernelspec": {
208 |    "display_name": "Python [conda env:rcsbsearch-demo]",
209 |    "language": "python",
210 |    "name": "conda-env-rcsbsearch-demo-py"
211 |   },
212 |   "language_info": {
213 |    "codemirror_mode": {
214 |     "name": "ipython",
215 |     "version": 3
216 |    },
217 |    "file_extension": ".py",
218 |    "mimetype": "text/x-python",
219 |    "name": "python",
220 |    "nbconvert_exporter": "python",
221 |    "pygments_lexer": "ipython3",
222 |    "version": "3.7.10"
223 |   },
224 |   "toc": {
225 |    "base_numbering": 1,
226 |    "nav_menu": {},
227 |    "number_sections": true,
228 |    "sideBar": true,
229 |    "skip_h1_title": false,
230 |    "title_cell": "Table of Contents",
231 |    "title_sidebar": "Contents",
232 |    "toc_cell": false,
233 |    "toc_position": {},
234 |    "toc_section_display": true,
235 |    "toc_window_display": false
236 |   },
237 |   "varInspector": {
238 |    "cols": {
239 |     "lenName": 16,
240 |     "lenType": 16,
241 |     "lenVar": 40
242 |    },
243 |    "kernels_config": {
244 |     "python": {
245 |      "delete_cmd_postfix": "",
246 |      "delete_cmd_prefix": "del ",
247 |      "library": "var_list.py",
248 |      "varRefreshCmd": "print(var_dic_list())"
249 |     },
250 |     "r": {
251 |      "delete_cmd_postfix": ") ",
252 |      "delete_cmd_prefix": "rm(",
253 |      "library": "var_list.r",
254 |      "varRefreshCmd": "cat(var_dic_list()) "
255 |     }
256 |    },
257 |    "types_to_exclude": [
258 |     "module",
259 |     "function",
260 |     "builtin_function_or_method",
261 |     "instance",
262 |     "_Feature"
263 |    ],
264 |    "window_display": false
265 |   }
266 |  },
267 |  "nbformat": 4,
268 |  "nbformat_minor": 5
269 | }
270 | 


--------------------------------------------------------------------------------
/notebooks/quickstart.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "upper-filing",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# rcsbsearch quickstart\n",
  9 |     "\n",
 10 |     "This notebook contains examples from the rcsbsearch [quickstart](https://rcsbsearch.readthedocs.io/en/latest/quickstart.html)"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "id": "african-monthly",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "from rcsbsearch import TextQuery\n",
 21 |     "from rcsbsearch import rcsb_attributes as attrs"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "id": "sublime-karen",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "## Operator syntax\n",
 30 |     "\n",
 31 |     "Here is an example from the RCSB Search API page, using the operator syntax. This query finds symmetric dimers having a twofold rotation with the DNA-binding domain of a heat-shock transcription factor.\n",
 32 |     "\n",
 33 |     "Note the use of standard comparison operators (`==`, `>` etc) for rcsb attributes and set operators for combining queries."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 2,
 39 |    "id": "enabling-america",
 40 |    "metadata": {},
 41 |    "outputs": [
 42 |     {
 43 |      "name": "stdout",
 44 |      "output_type": "stream",
 45 |      "text": [
 46 |       "1FYL-2\n",
 47 |       "1FYL-1\n",
 48 |       "1FYM-1\n",
 49 |       "1FYK-1\n",
 50 |       "3HTS-1\n",
 51 |       "5D8K-1\n",
 52 |       "5D8L-2\n",
 53 |       "5D8L-1\n",
 54 |       "5D5W-1\n",
 55 |       "5D5X-1\n",
 56 |       "5HDN-2\n",
 57 |       "5HDN-1\n",
 58 |       "5D5V-1\n",
 59 |       "5D5U-1\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "# Create terminals for each query\n",
 65 |     "q1 = TextQuery('\"heat-shock transcription factor\"')\n",
 66 |     "q2 = attrs.rcsb_struct_symmetry.symbol == \"C2\"\n",
 67 |     "q3 = attrs.rcsb_struct_symmetry.kind == \"Global Symmetry\"\n",
 68 |     "q4 = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1\n",
 69 |     "\n",
 70 |     "# combined using bitwise operators (&, |, ~, etc)\n",
 71 |     "query = q1 & q2 & q3 & q4  # AND of all queries\n",
 72 |     "\n",
 73 |     "# Call the query to execute it\n",
 74 |     "for assemblyid in query(\"assembly\"):\n",
 75 |     "    print(assemblyid)\n"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "id": "accomplished-passion",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "Attribute names can be found in the [RCSB schema](http://search.rcsb.org/rcsbsearch/v1/metadata/schema). They can also be found via tab completion, or by iterating:"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 3,
 89 |    "id": "supported-observer",
 90 |    "metadata": {},
 91 |    "outputs": [
 92 |     {
 93 |      "data": {
 94 |       "text/plain": [
 95 |        "['citation.rcsb_authors',\n",
 96 |        " 'pdbx_nmr_software.authors',\n",
 97 |        " 'rcsb_primary_citation.rcsb_authors',\n",
 98 |        " 'rcsb_bird_citation.rcsb_authors']"
 99 |       ]
100 |      },
101 |      "execution_count": 3,
102 |      "metadata": {},
103 |      "output_type": "execute_result"
104 |     }
105 |    ],
106 |    "source": [
107 |     "[a.attribute for a in attrs if \"authors\" in a.attribute]"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "id": "other-grant",
113 |    "metadata": {},
114 |    "source": [
115 |     "## Fluent syntax\n",
116 |     "\n",
117 |     "Here is the same example using the fluent syntax:"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 4,
123 |    "id": "polish-indonesia",
124 |    "metadata": {},
125 |    "outputs": [
126 |     {
127 |      "name": "stdout",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "1FYL-2\n",
131 |       "1FYL-1\n",
132 |       "1FYM-1\n",
133 |       "1FYK-1\n",
134 |       "3HTS-1\n",
135 |       "5D8K-1\n",
136 |       "5D8L-2\n",
137 |       "5D8L-1\n",
138 |       "5D5W-1\n",
139 |       "5D5X-1\n",
140 |       "5HDN-2\n",
141 |       "5HDN-1\n",
142 |       "5D5V-1\n",
143 |       "5D5U-1\n"
144 |      ]
145 |     }
146 |    ],
147 |    "source": [
148 |     "# Start with an Attr or TextQuery, then add terms\n",
149 |     "results = TextQuery('\"heat-shock transcription factor\"') \\\n",
150 |     "    .and_(\"rcsb_struct_symmetry.symbol\").exact_match(\"C2\") \\\n",
151 |     "    .and_(\"rcsb_struct_symmetry.kind\").exact_match(\"Global Symmetry\") \\\n",
152 |     "    .and_(\"rcsb_entry_info.polymer_entity_count_DNA\").greater_or_equal(1) \\\n",
153 |     "    .exec(\"assembly\")\n",
154 |     "\n",
155 |     "# Exec produces an iterator of IDs\n",
156 |     "for assemblyid in results:\n",
157 |     "    print(assemblyid)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "id": "adopted-gnome",
163 |    "metadata": {},
164 |    "source": [
165 |     "For a more practical example, see the [Covid-19 notebook](covid.ipynb)"
166 |    ]
167 |   }
168 |  ],
169 |  "metadata": {
170 |   "kernelspec": {
171 |    "display_name": "Python [conda env:rcsbsearch]",
172 |    "language": "python",
173 |    "name": "conda-env-rcsbsearch-py"
174 |   },
175 |   "language_info": {
176 |    "codemirror_mode": {
177 |     "name": "ipython",
178 |     "version": 3
179 |    },
180 |    "file_extension": ".py",
181 |    "mimetype": "text/x-python",
182 |    "name": "python",
183 |    "nbconvert_exporter": "python",
184 |    "pygments_lexer": "ipython3",
185 |    "version": "3.9.1"
186 |   },
187 |   "toc": {
188 |    "base_numbering": 1,
189 |    "nav_menu": {},
190 |    "number_sections": true,
191 |    "sideBar": true,
192 |    "skip_h1_title": false,
193 |    "title_cell": "Table of Contents",
194 |    "title_sidebar": "Contents",
195 |    "toc_cell": false,
196 |    "toc_position": {},
197 |    "toc_section_display": true,
198 |    "toc_window_display": false
199 |   },
200 |   "varInspector": {
201 |    "cols": {
202 |     "lenName": 16,
203 |     "lenType": 16,
204 |     "lenVar": 40
205 |    },
206 |    "kernels_config": {
207 |     "python": {
208 |      "delete_cmd_postfix": "",
209 |      "delete_cmd_prefix": "del ",
210 |      "library": "var_list.py",
211 |      "varRefreshCmd": "print(var_dic_list())"
212 |     },
213 |     "r": {
214 |      "delete_cmd_postfix": ") ",
215 |      "delete_cmd_prefix": "rm(",
216 |      "library": "var_list.r",
217 |      "varRefreshCmd": "cat(var_dic_list()) "
218 |     }
219 |    },
220 |    "types_to_exclude": [
221 |     "module",
222 |     "function",
223 |     "builtin_function_or_method",
224 |     "instance",
225 |     "_Feature"
226 |    ],
227 |    "window_display": false
228 |   }
229 |  },
230 |  "nbformat": 4,
231 |  "nbformat_minor": 5
232 | }
233 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --strict-markers
3 | markers =
4 |     internet: Tests that require internet access
5 |     progressbar: Tests depending on the 'progressbar' extras


--------------------------------------------------------------------------------
/rcsbsearch/__init__.py:
--------------------------------------------------------------------------------
 1 | """RCSB Search API"""
 2 | from typing import TYPE_CHECKING, Any, List
 3 | 
 4 | from .search import Terminal  # noqa: F401
 5 | from .search import Attr, Group, Query, Session, TextQuery, Value
 6 | 
 7 | __version__ = "0.3.0-dev0"
 8 | 
 9 | 
10 | # loading rcsb_attributes can cause errors, so load it lazily
11 | if TYPE_CHECKING:
12 |     from .schema import SchemaGroup
13 | 
14 | 
15 | # Set docstring at top level too. Keep synchronized with schema.rcsb_attributes
16 | rcsb_attributes: "SchemaGroup"
17 | """Object with all known RCSB attributes.
18 | 
19 | This is provided to ease autocompletion as compared to creating Attr objects from
20 | strings. For example,
21 | ::
22 | 
23 |     rcsb_attributes.rcsb_nonpolymer_instance_feature_summary.chem_id
24 | 
25 | is equivalent to
26 | ::
27 | 
28 |     Attr('rcsb_nonpolymer_instance_feature_summary.chem_id')
29 | 
30 | All attributes in `rcsb_attributes` can be iterated over.
31 | 
32 |     >>> [a for a in rcsb_attributes if "stoichiometry" in a.attribute]
33 |     [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
34 | 
35 | Attributes matching a regular expression can also be filtered:
36 | 
37 |     >>> list(rcsb_attributes.search('rcsb.*stoichiometry'))
38 |     [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
39 | 
40 | """
41 | 
42 | 
43 | def __getattr__(name: str) -> Any:
44 |     # delay instantiating rcsb_attributes until it is needed
45 |     if name == "rcsb_attributes":
46 |         if "rcsb_attributes" not in globals():
47 |             from .schema import rcsb_attributes as attrs
48 | 
49 |             globals()["rcsb_attributes"] = attrs
50 |         return globals()["rcsb_attributes"]
51 |     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
52 | 
53 | 
def __dir__() -> List[str]:
    """Return the public names of this module (``__all__``) in sorted order."""
    public_names = list(__all__)
    public_names.sort()
    return public_names
56 | 
57 | 
58 | __all__ = [
59 |     "Query",
60 |     "Group",
61 |     "Terminal",
62 |     "TextQuery",
63 |     "Session",
64 |     "Attr",
65 |     "Value",
66 |     "rcsb_attributes",
67 | ]
68 | 


--------------------------------------------------------------------------------
/rcsbsearch/schema.py:
--------------------------------------------------------------------------------
  1 | """Parse the full RCSB search schema
  2 | 
  3 | Provides access to all valid attributes for search queries.
  4 | """
  5 | 
  6 | import json
  7 | import logging
  8 | import os
  9 | import pkgutil
 10 | import re
 11 | from typing import Any, Iterator, List, Union
 12 | 
 13 | import requests
 14 | 
 15 | from .search import Attr
 16 | 
# URL of the RCSB metadata schema; this is what _download_json_schema() fetches
METADATA_SCHEMA_URL = "http://search.rcsb.org/rcsbsearch/v1/metadata/schema"
# URL of the JSON schema for search queries. Exported through __all__ but not
# otherwise used inside this module.
SEARCH_SCHEMA_URL = "http://search.rcsb.org/json-schema-rcsb_search_query.json"

# Name of the environment variable consulted by _get_json_schema(): the values
# "1", "yes" or "y" (case-insensitive) request a download of the schema.
ENV_RCSBSEARCH_DOWNLOAD_SCHEMA = "RCSBSEARCH_DOWNLOAD_SCHEMA"
 21 | 
 22 | 
 23 | def _get_json_schema(download=None):
 24 |     """Get the JSON schema
 25 | 
 26 |     The RCSBSEARCH_DOWNLOAD_SCHEMA environmental variable controls whether
 27 |     to download the schema from the web each time vs using the version shipped
 28 |     with rcsbsearch
 29 |     """
 30 |     if download is True or (
 31 |         download is None
 32 |         and (
 33 |             os.environ.get(ENV_RCSBSEARCH_DOWNLOAD_SCHEMA, "no").lower()
 34 |             in ("1", "yes", "y")
 35 |         )
 36 |     ):
 37 |         return _download_json_schema()
 38 |     return _load_json_schema()
 39 | 
 40 | 
 41 | def _download_json_schema():
 42 |     "Get the current JSON schema from the web"
 43 |     url = METADATA_SCHEMA_URL
 44 | 
 45 |     logging.info(f"Dowloading {url}")
 46 |     response = requests.get(url)
 47 |     response.raise_for_status()
 48 |     return response.json()
 49 | 
 50 | 
 51 | def _load_json_schema():
 52 |     logging.info("Loading schema from file")
 53 |     latest = pkgutil.get_data(__package__, "resources/metadata_schema.json")
 54 |     return json.loads(latest)
 55 | 
 56 | 
 57 | class SchemaGroup:
 58 |     """A non-leaf node in the RCSB schema. Leaves are Attr values."""
 59 | 
 60 |     def search(self, pattern: Union[str, re.Pattern], flags=0) -> Iterator[Attr]:
 61 |         """Find all attributes in the schema matching a regular expression.
 62 | 
 63 |         Returns:
 64 |             An iterator supplying Attr objects whose attribute matches.
 65 |         """
 66 |         matcher = re.compile(pattern, flags=flags)
 67 |         return filter(lambda a: matcher.search(a.attribute), self)
 68 | 
 69 |     def __iter__(self) -> Iterator[Attr]:
 70 |         """Iterate over all leaf nodes
 71 | 
 72 |         Example:
 73 | 
 74 |             >>> [a for a in attrs if "stoichiometry" in a.attribute]
 75 |             [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
 76 | 
 77 |         """
 78 | 
 79 |         def leaves(self):
 80 |             for k, v in self.__dict__.items():
 81 |                 if isinstance(v, Attr):
 82 |                     yield v
 83 |                 elif isinstance(v, SchemaGroup):
 84 |                     yield from iter(v)
 85 |                 else:
 86 |                     # Shouldn't happen
 87 |                     raise TypeError(f"Unrecognized member {k!r}: {v!r}")
 88 | 
 89 |         return leaves(self)
 90 | 
 91 |     def __str__(self):
 92 |         return "\n".join((str(c) for c in self.__dict__.values()))
 93 | 
 94 | 
 95 | def _make_group(fullname: str, node) -> Union[SchemaGroup, Attr]:
 96 |     """Represent this node of the schema as a python object
 97 | 
 98 |     Params:
 99 |     - name: full dot-separated attribute name
100 | 
101 |     Returns:
102 |     An Attr (Leaf nodes) or SchemaGroup (object nodes)
103 |     """
104 |     if "anyOf" in node:
105 |         children = {_make_group(fullname, n) for n in node["anyOf"]}
106 |         # Currently only deal with anyOf in leaf nodes
107 |         assert len(children) == 1, f"type of {fullname} couldn't be determined"
108 |         return next(iter(children))
109 |     if "oneOf" in node:
110 |         children = {_make_group(fullname, n) for n in node["oneOf"]}
111 |         # Currently only deal with oneOf in leaf nodes
112 |         assert len(children) == 1, f"type of {fullname} couldn't be determined"
113 |         return next(iter(children))
114 |     if "allOf" in node:
115 |         children = {_make_group(fullname, n) for n in node["allOf"]}
116 |         # Currently only deal with allOf in leaf nodes
117 |         assert len(children) == 1, f"type of {fullname} couldn't be determined"
118 |         return next(iter(children))
119 |     if node["type"] in ("string", "number", "integer", "date"):
120 |         return Attr(fullname)
121 |     elif node["type"] == "array":
122 |         # skip to items
123 |         return _make_group(fullname, node["items"])
124 |     elif node["type"] == "object":
125 |         group = SchemaGroup()  # parent, name)
126 |         for childname, childnode in node["properties"].items():
127 |             fullchildname = f"{fullname}.{childname}" if fullname else childname
128 |             childgroup = _make_group(fullchildname, childnode)
129 |             setattr(group, childname, childgroup)
130 |         return group
131 |     else:
132 |         raise TypeError(f"Unrecognized node type {node['type']!r} of {fullname}")
133 | 
134 | 
def _make_schema() -> SchemaGroup:
    """Build the attribute tree from the root node of the JSON schema."""
    # The local is deliberately not called `json`, to avoid shadowing the module
    root = _make_group("", _get_json_schema())
    assert isinstance(root, SchemaGroup)  # for type checking
    return root
140 | 
141 | 
142 | rcsb_attributes: SchemaGroup
143 | """Object with all known RCSB attributes.
144 | 
145 | This is provided to ease autocompletion as compared to creating Attr objects from
146 | strings. For example,
147 | ::
148 | 
149 |     rcsb_attributes.rcsb_nonpolymer_instance_feature_summary.chem_id
150 | 
151 | is equivalent to
152 | ::
153 | 
154 |     Attr('rcsb_nonpolymer_instance_feature_summary.chem_id')
155 | 
156 | All attributes in `rcsb_attributes` can be iterated over.
157 | 
158 |     >>> [a for a in rcsb_attributes if "stoichiometry" in a.attribute]
159 |     [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
160 | 
161 | Attributes matching a regular expression can also be filtered:
162 | 
163 |     >>> list(rcsb_attributes.search('rcsb.*stoichiometry'))
164 |     [Attr(attribute='rcsb_struct_symmetry.stoichiometry')]
165 | 
166 | """
167 | 
168 | 
169 | def __getattr__(name: str) -> Any:
170 |     # delay instantiating rcsb_attributes until it is needed
171 |     if name == "rcsb_attributes":
172 |         if "rcsb_attributes" not in globals():
173 |             globals()["rcsb_attributes"] = _make_schema()
174 |         return globals()["rcsb_attributes"]
175 |     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
176 | 
177 | 
def __dir__() -> List[str]:
    """Return the public names of this module (``__all__``) in sorted order."""
    public_names = list(__all__)
    public_names.sort()
    return public_names
180 | 
181 | 
182 | __all__ = [  # noqa: F822
183 |     "METADATA_SCHEMA_URL",
184 |     "SEARCH_SCHEMA_URL",
185 |     "ENV_RCSBSEARCH_DOWNLOAD_SCHEMA",
186 |     "rcsb_attributes",
187 |     "SchemaGroup",
188 | ]
189 | 


--------------------------------------------------------------------------------
/rcsbsearch/search.py:
--------------------------------------------------------------------------------
   1 | """Interact with the [RCSB Search API](https://search.rcsb.org/#search-api).
   2 | """
   3 | 
   4 | import functools
   5 | import json
   6 | import logging
   7 | import math
   8 | import sys
   9 | import urllib.parse
  10 | import uuid
  11 | from abc import ABC, abstractmethod
  12 | from dataclasses import dataclass
  13 | from datetime import date
  14 | from typing import (
  15 |     Callable,
  16 |     Dict,
  17 |     Generic,
  18 |     Iterable,
  19 |     Iterator,
  20 |     List,
  21 |     Optional,
  22 |     Tuple,
  23 |     TypeVar,
  24 |     Union,
  25 |     overload,
  26 | )
  27 | 
  28 | import requests
  29 | 
  30 | if sys.version_info > (3, 8):
  31 |     from typing import Literal
  32 | else:
  33 |     from typing_extensions import Literal
  34 | # tqdm is optional
  35 | 
# Allowed return types for searches. http://search.rcsb.org/#return-type
ReturnType = Literal[
    "entry", "assembly", "polymer_entity", "non_polymer_entity", "polymer_instance"
]
# The two logical operator names; presumably the operator argument taken by
# Group and PartialQuery -- confirm against their definitions
TAndOr = Literal["and", "or"]
# All valid types for Terminal values
# (a scalar, or a homogeneous list/tuple, of str, int, float or date)
TValue = Union[
    str,
    int,
    float,
    date,
    List[str],
    List[int],
    List[float],
    List[date],
    Tuple[str, ...],
    Tuple[int, ...],
    Tuple[float, ...],
    Tuple[date, ...],
]
# Types valid for numeric operators
# The "Value[...]" entries are written as strings (forward references)
TNumberLike = Union[int, float, date, "Value[int]", "Value[float]", "Value[date]"]
  58 | 
  59 | 
  60 | class Query(ABC):
  61 |     """Base class for all types of queries.
  62 | 
  63 |     Queries can be combined using set operators:
  64 | 
  65 |     - `q1 & q2`: Intersection (AND)
  66 |     - `q1 | q2`: Union (OR)
  67 |     - `~q1`: Negation (NOT)
  68 |     - `q1 - q2`: Difference (implemented as `q1 & ~q2`)
  69 |     - `q1 ^ q2`: Symmetric difference (XOR, implemented as `(q1 & ~q2) | (~q1 & q2)`)
  70 | 
  71 |     Note that only AND, OR, and negation of terminals are directly supported by
  72 |     the API, so other operations may be slower.
  73 | 
  74 |     Queries can be executed by calling them as functions (`list(query())`) or using
  75 |     the exec function.
  76 | 
  77 |     Queries are immutable, and all modifying functions return new instances.
  78 |     """
  79 | 
  80 |     @abstractmethod
  81 |     def to_dict(self) -> Dict:
  82 |         """Get dictionary representing this query"""
  83 |         ...
  84 | 
  85 |     def to_json(self) -> str:
  86 |         """Get JSON string of this query"""
  87 |         return json.dumps(self.to_dict(), separators=(",", ":"))
  88 | 
  89 |     @abstractmethod
  90 |     def _assign_ids(self, node_id=0) -> Tuple["Query", int]:
  91 |         """Assign node_ids sequentially for all terminal nodes
  92 | 
  93 |         This is a helper for the :py:meth:`Query.assign_ids` method
  94 | 
  95 |         Args:
  96 |             node_id: Id to assign to the first leaf of this query
  97 | 
  98 |         Returns:
  99 |             query: The modified query, with node_ids assigned
 100 |             node_id: The next available node_id
 101 | 
 102 |         """
 103 |         ...
 104 | 
 105 |     def assign_ids(self) -> "Query":
 106 |         """Assign node_ids sequentially for all terminal nodes
 107 | 
 108 |         Returns:
 109 |             the modified query, with node_ids assigned sequentially from 0
 110 |         """
 111 |         return self._assign_ids(0)[0]
 112 | 
 113 |     @abstractmethod
 114 |     def __invert__(self) -> "Query":
 115 |         """Negation: `~a`"""
 116 |         ...
 117 | 
 118 |     def __and__(self, other: "Query") -> "Query":
 119 |         """Intersection: `a & b`"""
 120 |         assert isinstance(other, Query)
 121 |         return Group("and", [self, other])
 122 | 
 123 |     def __or__(self, other: "Query") -> "Query":
 124 |         """Union: `a | b`"""
 125 |         assert isinstance(other, Query)
 126 |         return Group("or", [self, other])
 127 | 
 128 |     def __sub__(self, other: "Query") -> "Query":
 129 |         """Difference: `a - b`"""
 130 |         return self & ~other
 131 | 
 132 |     def __xor__(self, other: "Query") -> "Query":
 133 |         """Symmetric difference: `a ^ b`"""
 134 |         return (self & ~other) | (~self & other)
 135 | 
 136 |     def exec(self, return_type: ReturnType = "entry", rows: int = 100) -> "Session":
 137 |         """Evaluate this query and return an iterator of all result IDs"""
 138 |         return Session(self, return_type, rows)
 139 | 
 140 |     def __call__(self, return_type: ReturnType = "entry", rows: int = 100) -> "Session":
 141 |         """Evaluate this query and return an iterator of all result IDs"""
 142 |         return self.exec(return_type, rows)
 143 | 
 144 |     @overload
 145 |     def and_(self, other: "Query") -> "Query":
 146 |         ...
 147 | 
 148 |     @overload
 149 |     def and_(self, other: Union[str, "Attr"]) -> "PartialQuery":
 150 |         ...
 151 | 
 152 |     def and_(
 153 |         self, other: Union[str, "Query", "Attr"]
 154 |     ) -> Union["Query", "PartialQuery"]:
 155 |         """Extend this query with an additional attribute via an AND"""
 156 |         if isinstance(other, Query):
 157 |             return self & other
 158 |         elif isinstance(other, Attr):
 159 |             return PartialQuery(self, "and", other)
 160 |         elif isinstance(other, str):
 161 |             return PartialQuery(self, "and", Attr(other))
 162 |         else:
 163 |             raise TypeError(f"Expected Query or Attr, got {type(other)}")
 164 | 
 165 |     @overload
 166 |     def or_(self, other: "Query") -> "Query":
 167 |         ...
 168 | 
 169 |     @overload
 170 |     def or_(self, other: Union[str, "Attr"]) -> "PartialQuery":
 171 |         ...
 172 | 
 173 |     def or_(self, other: Union[str, "Query", "Attr"]) -> Union["Query", "PartialQuery"]:
 174 |         """Extend this query with an additional attribute via an OR"""
 175 |         if isinstance(other, Query):
 176 |             return self & other
 177 |         elif isinstance(other, Attr):
 178 |             return PartialQuery(self, "or", other)
 179 |         elif isinstance(other, str):
 180 |             return PartialQuery(self, "or", Attr(other))
 181 |         else:
 182 |             raise TypeError(f"Expected Query or Attr, got {type(other)}")
 183 | 
 184 | 
 185 | @dataclass(frozen=True)
 186 | class Terminal(Query):
 187 |     """A terminal query node.
 188 | 
 189 |     Terminals are simple predicates comparing some *attribute* of a structure to a
 190 |     value.
 191 | 
 192 |     Examples:
 193 |         >>> Terminal("exptl.method", "exact_match", "X-RAY DIFFRACTION")
 194 |         >>> Terminal("rcsb_id", "in", ["5T89", "1TIM"])
 195 |         >>> Terminal(value="tubulin")
 196 | 
 197 |     A full list of attributes is available in the
 198 |     `schema <http://search.rcsb.org/rcsbsearch/v1/metadata/schema>`_.
 199 |     Operators are documented `here <http://search.rcsb.org/#field-queries>`_.
 200 | 
 201 |     The :py:class:`Attr` class provides a more pythonic way of constructing Terminals.
 202 |     """
 203 | 
 204 |     attribute: Optional[str] = None
 205 |     operator: Optional[str] = None
 206 |     value: Optional[TValue] = None
 207 |     service: str = "text"
 208 |     negation: bool = False
 209 |     node_id: int = 0
 210 | 
 211 |     def to_dict(self):
 212 |         params = dict()
 213 |         if self.attribute is not None:
 214 |             params["attribute"] = self.attribute
 215 |         if self.operator is not None:
 216 |             params["operator"] = self.operator
 217 |         if self.value is not None:
 218 |             params["value"] = self.value
 219 |         if self.negation is not None:
 220 |             params["negation"] = self.negation
 221 | 
 222 |         return dict(
 223 |             type="terminal",
 224 |             service=self.service,
 225 |             parameters=params,
 226 |             node_id=self.node_id,
 227 |         )
 228 | 
 229 |     def __invert__(self):
 230 |         return Terminal(
 231 |             self.attribute,
 232 |             self.operator,
 233 |             self.value,
 234 |             self.service,
 235 |             not self.negation,
 236 |             self.node_id,
 237 |         )
 238 | 
 239 |     def _assign_ids(self, node_id=0) -> Tuple[Query, int]:
 240 |         if self.node_id == node_id:
 241 |             return (self, node_id + 1)
 242 |         else:
 243 |             return (
 244 |                 Terminal(
 245 |                     self.attribute,
 246 |                     self.operator,
 247 |                     self.value,
 248 |                     self.service,
 249 |                     self.negation,
 250 |                     node_id,
 251 |                 ),
 252 |                 node_id + 1,
 253 |             )
 254 | 
 255 |     def __str__(self):
 256 |         """Return a simplified string representation
 257 | 
 258 |         Examples:
 259 |             >>> Terminal("attr", "op", "val")
 260 |             >>> ~Terminal(value="val")
 261 | 
 262 |         """
 263 |         negation = "~" if self.negation else ""
 264 |         if self.attribute is None and self.operator is None:
 265 |             # value-only
 266 |             return f"{negation}Terminal(value={self.value!r})"
 267 |         else:
 268 |             return (
 269 |                 f"{negation}Terminal({self.attribute!r}, {self.operator!r}, "
 270 |                 f"{self.value!r})"
 271 |             )
 272 | 
 273 | 
 274 | class TextQuery(Terminal):
 275 |     """Special case of a Terminal for free-text queries"""
 276 | 
 277 |     def __init__(self, value: str, negation: bool = False):
 278 |         """Search for the string value anywhere in the text
 279 | 
 280 |         Args:
 281 |             value: free-text query
 282 |             negation: find structures without the pattern
 283 |         """
 284 |         super().__init__(value=value, negation=negation)
 285 | 
 286 | 
 287 | @dataclass(frozen=True)
 288 | class Group(Query):
 289 |     """AND and OR combinations of queries"""
 290 | 
 291 |     operator: TAndOr
 292 |     nodes: Iterable[Query] = ()
 293 | 
 294 |     def to_dict(self):
 295 |         return dict(
 296 |             type="group",
 297 |             logical_operator=self.operator,
 298 |             nodes=[node.to_dict() for node in self.nodes],
 299 |         )
 300 | 
 301 |     def __invert__(self):
 302 |         if self.operator == "and":
 303 |             return Group("or", [~node for node in self.nodes])
 304 | 
 305 |     def __and__(self, other: Query) -> Query:
 306 |         # Combine nodes if possible
 307 |         if self.operator == "and":
 308 |             if isinstance(other, Group):
 309 |                 if other.operator == "and":
 310 |                     return Group("and", (*self.nodes, *other.nodes))
 311 |             elif isinstance(other, Query):
 312 |                 return Group("and", (*self.nodes, other))
 313 |             else:
 314 |                 return NotImplemented
 315 | 
 316 |         return super().__and__(other)
 317 | 
 318 |     def __or__(self, other: Query) -> Query:
 319 |         # Combine nodes if possible
 320 |         if self.operator == "or":
 321 |             if isinstance(other, Group):
 322 |                 if other.operator == "or":
 323 |                     return Group("or", (*self.nodes, *other.nodes))
 324 |             elif isinstance(other, Terminal):
 325 |                 return Group("or", (*self.nodes, other))
 326 |             else:
 327 |                 return NotImplemented
 328 | 
 329 |         return super().__or__(other)
 330 | 
 331 |     def _assign_ids(self, node_id=0) -> Tuple[Query, int]:
 332 |         nodes = []
 333 |         changed = False
 334 |         for node in self.nodes:
 335 |             assigned = node._assign_ids(node_id)
 336 |             nodes.append(assigned[0])
 337 |             node_id = assigned[1]
 338 |             # Track whether any nodes were modified
 339 |             changed = changed or assigned[0] is node
 340 |         if changed:
 341 |             return (Group(self.operator, nodes), node_id)
 342 |         else:
 343 |             return (self, node_id)
 344 | 
 345 |     def __str__(self):
 346 |         ""  # hide in documentation
 347 |         if self.operator == "and":
 348 |             return f"({' & '.join((str(n) for n in self.nodes))})"
 349 |         elif self.operator == "or":
 350 |             return f"({' | '.join((str(n) for n in self.nodes))})"
 351 |         else:
 352 |             raise ValueError("Illegal Operator")
 353 | 
 354 | 
 355 | @dataclass(frozen=True)
 356 | class Attr:
 357 |     """A search attribute, e.g. "rcsb_entry_container_identifiers.entry_id"
 358 | 
 359 |     Terminals can be constructed from Attr objects using either a functional syntax,
 360 |     which mirrors the API operators, or with python operators.
 361 | 
 362 |     +--------------------+---------------------+
 363 |     | Fluent Function    | Operator            |
 364 |     +====================+=====================+
 365 |     | exact_match        | attr == str         |
 366 |     +--------------------+---------------------+
 367 |     | contains_words     |                     |
 368 |     +--------------------+---------------------+
 369 |     | contains_phrase    |                     |
 370 |     +--------------------+---------------------+
 371 |     | greater            | attr > date,number  |
 372 |     +--------------------+---------------------+
 373 |     | less               | attr < date,number  |
 374 |     +--------------------+---------------------+
 375 |     | greater_or_equal   | attr >= date,number |
 376 |     +--------------------+---------------------+
 377 |     | less_or_equal      | attr <= date,number |
 378 |     +--------------------+---------------------+
 379 |     | equals             | attr == date,number |
 380 |     +--------------------+---------------------+
 381 |     | range              | attr[start:end]     |
 382 |     +--------------------+---------------------+
 383 |     | range_closed       |                     |
 384 |     +--------------------+---------------------+
 385 |     | exists             | bool(attr)          |
 386 |     +--------------------+---------------------+
 387 |     | in\\_              |                     |
 388 |     +--------------------+---------------------+
 389 | 
 390 |     Rather than their normal bool return values, operators return Terminals.
 391 | 
 392 |     Pre-instantiated attributes are available from the
 393 |     :py:data:`rcsbsearch.rcsb_attributes` object. These are generally easier to use
 394 |     than constructing Attr objects by hand. A complete list of valid attributes is
 395 |     available in the `schema <http://search.rcsb.org/rcsbsearch/v1/metadata/schema>`_.
 396 | 
 397 |     """
 398 | 
 399 |     attribute: str
 400 | 
 401 |     def exact_match(self, value: Union[str, "Value[str]"]) -> Terminal:
 402 |         """Exact match with the value"""
 403 |         if isinstance(value, Value):
 404 |             value = value.value
 405 |         return Terminal(self.attribute, "exact_match", value)
 406 | 
 407 |     def contains_words(
 408 |         self, value: Union[str, "Value[str]", List[str], "Value[List[str]]"]
 409 |     ) -> Terminal:
 410 |         """Match any word within the string.
 411 | 
 412 |         Words are split at whitespace. All results which match any word are returned,
 413 |         with results matching more words sorted first.
 414 |         """
 415 |         if isinstance(value, Value):
 416 |             value = value.value
 417 |         if isinstance(value, list):
 418 |             value = " ".join(value)
 419 |         return Terminal(self.attribute, "contains_words", value)
 420 | 
 421 |     def contains_phrase(self, value: Union[str, "Value[str]"]) -> Terminal:
 422 |         """Match an exact phrase"""
 423 |         if isinstance(value, Value):
 424 |             value = value.value
 425 |         return Terminal(self.attribute, "contains_phrase", value)
 426 | 
 427 |     def greater(self, value: TNumberLike) -> Terminal:
 428 |         """Attribute > `value`"""
 429 |         if isinstance(value, Value):
 430 |             value = value.value
 431 |         return Terminal(self.attribute, "greater", value)
 432 | 
 433 |     def less(self, value: TNumberLike) -> Terminal:
 434 |         """Attribute < `value`"""
 435 |         if isinstance(value, Value):
 436 |             value = value.value
 437 |         return Terminal(self.attribute, "less", value)
 438 | 
 439 |     def greater_or_equal(self, value: TNumberLike) -> Terminal:
 440 |         """Attribute >= `value`"""
 441 |         if isinstance(value, Value):
 442 |             value = value.value
 443 |         return Terminal(self.attribute, "greater_or_equal", value)
 444 | 
 445 |     def less_or_equal(self, value: TNumberLike) -> Terminal:
 446 |         """Attribute <= `value`"""
 447 |         if isinstance(value, Value):
 448 |             value = value.value
 449 |         return Terminal(self.attribute, "less_or_equal", value)
 450 | 
 451 |     def equals(self, value: TNumberLike) -> Terminal:
 452 |         """Attribute == `value`"""
 453 |         if isinstance(value, Value):
 454 |             value = value.value
 455 |         return Terminal(self.attribute, "equals", value)
 456 | 
 457 |     def range(self, value: Union[List[int], Tuple[int, int]]) -> Terminal:
 458 |         """Attribute is within the specified half-open range
 459 | 
 460 |         Args:
 461 |             value: lower and upper bounds `[a, b)`
 462 |         """
 463 |         if isinstance(value, Value):
 464 |             value = value.value
 465 |         return Terminal(self.attribute, "range", value)
 466 | 
 467 |     def range_closed(
 468 |         self,
 469 |         value: Union[
 470 |             List[int], Tuple[int, int], "Value[List[int]]", "Value[Tuple[int, int]]"
 471 |         ],
 472 |     ) -> Terminal:
 473 |         """Attribute is within the specified closed range
 474 | 
 475 |         Args:
 476 |             value: lower and upper bounds `[a, b]`
 477 |         """
 478 |         if isinstance(value, Value):
 479 |             value = value.value
 480 |         return Terminal(self.attribute, "range_closed", value)
 481 | 
 482 |     def exists(self) -> Terminal:
 483 |         """Attribute is defined for the structure"""
 484 |         return Terminal(self.attribute, "exists")
 485 | 
 486 |     def in_(
 487 |         self,
 488 |         value: Union[
 489 |             List[str],
 490 |             List[int],
 491 |             List[float],
 492 |             List[date],
 493 |             Tuple[str, ...],
 494 |             Tuple[int, ...],
 495 |             Tuple[float, ...],
 496 |             Tuple[date, ...],
 497 |             "Value[List[str]]",
 498 |             "Value[List[int]]",
 499 |             "Value[List[float]]",
 500 |             "Value[List[date]]",
 501 |             "Value[Tuple[str, ...]]",
 502 |             "Value[Tuple[int, ...]]",
 503 |             "Value[Tuple[float, ...]]",
 504 |             "Value[Tuple[date, ...]]",
 505 |         ],
 506 |     ) -> Terminal:
 507 |         """Attribute is contained in the list of values"""
 508 |         if isinstance(value, Value):
 509 |             value = value.value
 510 |         return Terminal(self.attribute, "in", value)
 511 | 
 512 |     # Need ignore[override] because typeshed restricts __eq__ return value
 513 |     # https://github.com/python/mypy/issues/2783
 514 |     @overload  # type: ignore[override]
 515 |     def __eq__(self, value: "Attr") -> bool:
 516 |         ...
 517 | 
 518 |     @overload  # type: ignore[override]
 519 |     def __eq__(
 520 |         self,
 521 |         value: Union[
 522 |             str,
 523 |             int,
 524 |             float,
 525 |             date,
 526 |             "Value[str]",
 527 |             "Value[int]",
 528 |             "Value[float]",
 529 |             "Value[date]",
 530 |         ],
 531 |     ) -> Terminal:
 532 |         ...
 533 | 
 534 |     def __eq__(
 535 |         self,
 536 |         value: Union[
 537 |             "Attr",
 538 |             str,
 539 |             int,
 540 |             float,
 541 |             date,
 542 |             "Value[str]",
 543 |             "Value[int]",
 544 |             "Value[float]",
 545 |             "Value[date]",
 546 |         ],
 547 |     ) -> Union[Terminal, bool]:  # type: ignore[override]
 548 |         if isinstance(value, Attr):
 549 |             return self.attribute == value.attribute
 550 |         if isinstance(value, Value):
 551 |             value = value.value
 552 |         if isinstance(value, str):
 553 |             return self.exact_match(value)
 554 |         elif (
 555 |             isinstance(value, date)
 556 |             or isinstance(value, float)
 557 |             or isinstance(value, int)
 558 |         ):
 559 |             return self.equals(value)
 560 |         else:
 561 |             return NotImplemented
 562 | 
 563 |     @overload  # type: ignore[override]
 564 |     def __ne__(self, value: "Attr") -> bool:
 565 |         ...
 566 | 
 567 |     @overload  # type: ignore[override]
 568 |     def __ne__(
 569 |         self,
 570 |         value: Union[
 571 |             str,
 572 |             int,
 573 |             float,
 574 |             date,
 575 |             "Value[str]",
 576 |             "Value[int]",
 577 |             "Value[float]",
 578 |             "Value[date]",
 579 |         ],
 580 |     ) -> Terminal:
 581 |         ...
 582 | 
 583 |     def __ne__(
 584 |         self,
 585 |         value: Union[
 586 |             "Attr",
 587 |             str,
 588 |             int,
 589 |             float,
 590 |             date,
 591 |             "Value[str]",
 592 |             "Value[int]",
 593 |             "Value[float]",
 594 |             "Value[date]",
 595 |         ],
 596 |     ) -> Union[Terminal, bool]:  # type: ignore[override]
 597 |         if isinstance(value, Attr):
 598 |             return self.attribute != value.attribute
 599 |         if isinstance(value, Value):
 600 |             value = value.value
 601 |         return ~(self == value)
 602 | 
 603 |     def __lt__(self, value: TNumberLike) -> Terminal:
 604 |         if isinstance(value, Value):
 605 |             value = value.value
 606 |         return self.less(value)
 607 | 
 608 |     def __le__(self, value: TNumberLike) -> Terminal:
 609 |         if isinstance(value, Value):
 610 |             value = value.value
 611 |         return self.less_or_equal(value)
 612 | 
 613 |     def __gt__(self, value: TNumberLike) -> Terminal:
 614 |         if isinstance(value, Value):
 615 |             value = value.value
 616 |         return self.greater(value)
 617 | 
 618 |     def __ge__(self, value: TNumberLike) -> Terminal:
 619 |         if isinstance(value, Value):
 620 |             value = value.value
 621 |         return self.greater_or_equal(value)
 622 | 
 623 |     def __bool__(self) -> Terminal:
 624 |         return self.exists()
 625 | 
 626 |     def __contains__(
 627 |         self, value: Union[str, List[str], "Value[str]", "Value[List[str]]"]
 628 |     ) -> Terminal:
 629 |         """Maps to contains_words or contains_phrase depending on the value passed.
 630 | 
 631 |         * `"value" in attr` maps to `attr.contains_phrase("value")` for simple values.
 632 |         * `["value"] in attr` maps to `attr.contains_words(["value"])` for lists and
 633 |           tuples.
 634 |         """
 635 |         if isinstance(value, Value):
 636 |             value = value.value
 637 |         if isinstance(value, list):
 638 |             if len(value) == 0 or isinstance(value[0], str):
 639 |                 return self.contains_words(value)
 640 |             else:
 641 |                 return NotImplemented
 642 |         else:
 643 |             return self.contains_phrase(value)
 644 | 
 645 | 
# Type variable for callables returning a Terminal, i.e. the Attr methods that
# _attr_delegate forwards to
FTerminal = TypeVar("FTerminal", bound=Callable[..., Terminal])
# Type variable for callables returning a Query, i.e. the PartialQuery methods
# that _attr_delegate decorates
FQuery = TypeVar("FQuery", bound=Callable[..., Query])
 650 | 
 651 | 
 652 | def _attr_delegate(attr_func: FTerminal) -> Callable[[FQuery], FQuery]:
 653 |     """Decorator for PartialQuery methods. Delegates a function to self.attr.
 654 | 
 655 |     This reduces boilerplate, especially for classes with lots of dunder methods
 656 |     (preventing the use of `__getattr__`).
 657 | 
 658 |     Argument:
 659 |     - attr_func: A method in the Attr class producing a Terminal
 660 | 
 661 |     Returns: A function producing a Query according to the PartialQuery's operator
 662 |     """
 663 | 
 664 |     def decorator(partialquery_func: FQuery):
 665 |         @functools.wraps(partialquery_func)
 666 |         def wrap(self: "PartialQuery", *args, **kwargs) -> Query:
 667 |             term: Terminal = attr_func(self.attr, *args, **kwargs)
 668 |             if self.operator == "and":
 669 |                 return self.query & term
 670 |             elif self.operator == "or":
 671 |                 return self.query | term
 672 |             else:
 673 |                 raise ValueError(f"Unknown operator: {self.operator}")
 674 | 
 675 |         return wrap
 676 | 
 677 |     return decorator
 678 | 
 679 | 
 680 | class PartialQuery:
 681 |     """A PartialQuery extends a growing query with an Attr. It is constructed
 682 |     using the fluent syntax with the `and_` and `or_` methods. It is not usually
 683 |     necessary to create instances of this class directly.
 684 | 
 685 |     PartialQuery instances behave like Attr instances in most situations.
 686 |     """
 687 | 
 688 |     attr: Attr
 689 |     query: Query
 690 |     operator: TAndOr
 691 | 
 692 |     def __init__(self, query: Query, operator: TAndOr, attr: Attr):
 693 |         self.query = query
 694 |         self.operator = operator
 695 |         self.attr = attr
 696 | 
 697 |     @_attr_delegate(Attr.exact_match)
 698 |     def exact_match(self, value: Union[str, "Value[str]"]) -> Query:
 699 |         ...
 700 | 
 701 |     @_attr_delegate(Attr.contains_words)
 702 |     def contains_words(
 703 |         self, value: Union[str, "Value[str]", List[str], "Value[List[str]]"]
 704 |     ) -> Query:
 705 |         ...
 706 | 
 707 |     @_attr_delegate(Attr.contains_phrase)
 708 |     def contains_phrase(self, value: Union[str, "Value[str]"]) -> Query:
 709 |         ...
 710 | 
 711 |     @_attr_delegate(Attr.greater)
 712 |     def greater(self, value: TNumberLike) -> Query:
 713 |         ...
 714 | 
 715 |     @_attr_delegate(Attr.less)
 716 |     def less(self, value: TNumberLike) -> Query:
 717 |         ...
 718 | 
 719 |     @_attr_delegate(Attr.greater_or_equal)
 720 |     def greater_or_equal(self, value: TNumberLike) -> Query:
 721 |         ...
 722 | 
 723 |     @_attr_delegate(Attr.less_or_equal)
 724 |     def less_or_equal(self, value: TNumberLike) -> Query:
 725 |         ...
 726 | 
 727 |     @_attr_delegate(Attr.equals)
 728 |     def equals(self, value: TNumberLike) -> Query:
 729 |         ...
 730 | 
 731 |     @_attr_delegate(Attr.range)
 732 |     def range(self, value: Union[List[int], Tuple[int, int]]) -> Query:
 733 |         ...
 734 | 
 735 |     @_attr_delegate(Attr.range_closed)
 736 |     def range_closed(
 737 |         self,
 738 |         value: Union[
 739 |             List[int], Tuple[int, int], "Value[List[int]]", "Value[Tuple[int, int]]"
 740 |         ],
 741 |     ) -> Query:
 742 |         ...
 743 | 
 744 |     @_attr_delegate(Attr.exists)
 745 |     def exists(self) -> Query:
 746 |         ...
 747 | 
 748 |     @_attr_delegate(Attr.in_)
 749 |     def in_(
 750 |         self,
 751 |         value: Union[
 752 |             str,
 753 |             int,
 754 |             float,
 755 |             date,
 756 |             "Value[str]",
 757 |             "Value[int]",
 758 |             "Value[float]",
 759 |             "Value[date]",
 760 |         ],
 761 |     ) -> Query:
 762 |         ...
 763 | 
 764 |     @overload  # type: ignore[override]
 765 |     def __eq__(self, value: "PartialQuery") -> bool:
 766 |         ...
 767 | 
 768 |     @overload  # type: ignore[override]
 769 |     def __eq__(
 770 |         self,
 771 |         value: Union[
 772 |             str,
 773 |             int,
 774 |             float,
 775 |             date,
 776 |             "Value[str]",
 777 |             "Value[int]",
 778 |             "Value[float]",
 779 |             "Value[date]",
 780 |         ],
 781 |     ) -> Query:
 782 |         ...
 783 | 
 784 |     def __eq__(
 785 |         self,
 786 |         value: Union[
 787 |             "PartialQuery",
 788 |             str,
 789 |             int,
 790 |             float,
 791 |             date,
 792 |             "Value[str]",
 793 |             "Value[int]",
 794 |             "Value[float]",
 795 |             "Value[date]",
 796 |         ],
 797 |     ) -> Union[Query, bool]:  # type: ignore[override]
 798 |         if isinstance(value, PartialQuery):
 799 |             return (
 800 |                 self.attr == value.attr
 801 |                 and self.query == value.query
 802 |                 and self.operator == value.operator
 803 |             )
 804 | 
 805 |         if self.operator == "and":
 806 |             return self.query & (self.attr == value)
 807 |         elif self.operator == "or":
 808 |             return self.query | (self.attr == value)
 809 |         else:
 810 |             raise ValueError(f"Unknown operator: {self.operator}")
 811 | 
 812 |     @overload  # type: ignore[override]
 813 |     def __ne__(self, value: "PartialQuery") -> bool:
 814 |         ...
 815 | 
 816 |     @overload  # type: ignore[override]
 817 |     def __ne__(
 818 |         self,
 819 |         value: Union[
 820 |             str,
 821 |             int,
 822 |             float,
 823 |             date,
 824 |             "Value[str]",
 825 |             "Value[int]",
 826 |             "Value[float]",
 827 |             "Value[date]",
 828 |         ],
 829 |     ) -> Query:
 830 |         ...
 831 | 
 832 |     def __ne__(
 833 |         self,
 834 |         value: Union[
 835 |             "PartialQuery",
 836 |             str,
 837 |             int,
 838 |             float,
 839 |             date,
 840 |             "Value[str]",
 841 |             "Value[int]",
 842 |             "Value[float]",
 843 |             "Value[date]",
 844 |         ],
 845 |     ) -> Union[Query, bool]:  # type: ignore[override]
 846 |         if isinstance(value, PartialQuery):
 847 |             return self.attr != value.attr
 848 |         return ~(self == value)
 849 | 
 850 |     @_attr_delegate(Attr.__lt__)
 851 |     def __lt__(self, value: TNumberLike) -> Query:
 852 |         ...
 853 | 
 854 |     @_attr_delegate(Attr.__le__)
 855 |     def __le__(self, value: TNumberLike) -> Query:
 856 |         ...
 857 | 
 858 |     @_attr_delegate(Attr.__gt__)
 859 |     def __gt__(self, value: TNumberLike) -> Query:
 860 |         ...
 861 | 
 862 |     @_attr_delegate(Attr.__ge__)
 863 |     def __ge__(self, value: TNumberLike) -> Query:
 864 |         ...
 865 | 
 866 |     @_attr_delegate(Attr.__bool__)
 867 |     def __bool__(self) -> Query:
 868 |         ...
 869 | 
 870 |     @_attr_delegate(Attr.__contains__)
 871 |     def __contains__(
 872 |         self, value: Union[str, List[str], "Value[str]", "Value[List[str]]"]
 873 |     ) -> Query:
 874 |         ...
 875 | 
 876 | 
# Type of the python value wrapped by Value; bounded by TValue
T = TypeVar("T", bound="TValue")
 878 | 
 879 | 
 880 | @dataclass(frozen=True)
 881 | class Value(Generic[T]):
 882 |     """Represents a value in a query.
 883 | 
 884 |     In most cases values are unnecessary and can be replaced directly by the python
 885 |     value.
 886 | 
 887 |     Values can also be used if the Attr object appears on the right:
 888 | 
 889 |         Value("4HHB") == Attr("rcsb_entry_container_identifiers.entry_id")
 890 |     """
 891 | 
 892 |     value: T
 893 | 
 894 |     @overload  # type: ignore[override]
 895 |     def __eq__(self, attr: "Value") -> bool:
 896 |         ...
 897 | 
 898 |     @overload  # type: ignore[override]
 899 |     def __eq__(self, attr: Attr) -> Terminal:
 900 |         ...
 901 | 
 902 |     def __eq__(self, attr: Union["Value", Attr]) -> Union[bool, Terminal]:
 903 |         # type: ignore[override]
 904 |         if isinstance(attr, Value):
 905 |             return self.value == attr.value
 906 |         if not isinstance(attr, Attr):
 907 |             return NotImplemented
 908 |         return attr == self
 909 | 
 910 |     @overload  # type: ignore[override]
 911 |     def __ne__(self, attr: "Value") -> bool:
 912 |         ...
 913 | 
 914 |     @overload  # type: ignore[override]
 915 |     def __ne__(self, attr: Attr) -> Terminal:
 916 |         ...
 917 | 
 918 |     def __ne__(self, attr: Union["Value", Attr]) -> Union[bool, Terminal]:
 919 |         # type: ignore[override]
 920 |         if isinstance(attr, Value):
 921 |             return self.value != attr.value
 922 |         if not isinstance(attr, Attr):
 923 |             return NotImplemented
 924 |         return attr != self.value
 925 | 
 926 |     def __lt__(self, attr: Attr) -> Terminal:
 927 |         if not isinstance(attr, Attr):
 928 |             return NotImplemented
 929 |         if not (
 930 |             isinstance(self.value, int)
 931 |             or isinstance(self.value, float)
 932 |             or isinstance(self.value, date)
 933 |         ):
 934 |             return NotImplemented
 935 |         return attr.greater(self.value)
 936 | 
 937 |     def __le__(self, attr: Attr) -> Terminal:
 938 |         if not isinstance(attr, Attr):
 939 |             return NotImplemented
 940 |         if not (
 941 |             isinstance(self.value, int)
 942 |             or isinstance(self.value, float)
 943 |             or isinstance(self.value, date)
 944 |         ):
 945 |             return NotImplemented
 946 |         return attr.greater_or_equal(self.value)
 947 | 
 948 |     def __gt__(self, attr: Attr) -> Terminal:
 949 |         if not isinstance(attr, Attr):
 950 |             return NotImplemented
 951 |         if not (
 952 |             isinstance(self.value, int)
 953 |             or isinstance(self.value, float)
 954 |             or isinstance(self.value, date)
 955 |         ):
 956 |             return NotImplemented
 957 |         return attr.less(self.value)
 958 | 
 959 |     def __ge__(self, attr: Attr) -> Terminal:
 960 |         if not isinstance(attr, Attr):
 961 |             return NotImplemented
 962 |         if not (
 963 |             isinstance(self.value, int)
 964 |             or isinstance(self.value, float)
 965 |             or isinstance(self.value, date)
 966 |         ):
 967 |             return NotImplemented
 968 |         return attr.less_or_equal(self.value)
 969 | 
 970 | 
 971 | class Session(Iterable[str]):
 972 |     """A single query session.
 973 | 
 974 |     Handles paging the query and parsing results
 975 |     """
 976 | 
 977 |     url = "http://search.rcsb.org/rcsbsearch/v1/query"
 978 |     query_id: str
 979 |     query: Query
 980 |     return_type: ReturnType
 981 |     start: int
 982 |     rows: int
 983 | 
 984 |     def __init__(
 985 |         self, query: Query, return_type: ReturnType = "entry", rows: int = 100
 986 |     ):
 987 |         self.query_id = Session.make_uuid()
 988 |         self.query = query.assign_ids()
 989 |         self.return_type = return_type
 990 |         self.start = 0
 991 |         self.rows = rows
 992 | 
 993 |     @staticmethod
 994 |     def make_uuid() -> str:
 995 |         "Create a new UUID to identify a query"
 996 |         return uuid.uuid4().hex
 997 | 
 998 |     @staticmethod
 999 |     def _extract_identifiers(query_json: Optional[Dict]) -> List[str]:
1000 |         """Extract identifiers from a JSON response"""
1001 |         if query_json is None:
1002 |             return []
1003 | 
1004 |         # total_count = int(query_json["total_count"])
1005 |         identifiers = [result["identifier"] for result in query_json["result_set"]]
1006 |         # assert len(identifiers) == total_count, f"{len(identifiers)} != {total_count}"
1007 |         return identifiers
1008 | 
1009 |     def _make_params(self, start=0):
1010 |         "Generate GET parameters as a dict"
1011 |         return dict(
1012 |             query=self.query.to_dict(),
1013 |             return_type=self.return_type,
1014 |             request_info=dict(query_id=self.query_id, src="ui"),  # TODO src deprecated?
1015 |             request_options=dict(pager=dict(start=start, rows=self.rows)),
1016 |         )
1017 | 
1018 |     def _single_query(self, start=0) -> Optional[Dict]:
1019 |         "Fires a single query"
1020 |         params = self._make_params(start)
1021 |         logging.debug(
1022 |             f"Querying {self.url} for results {start}-{start + self.rows - 1}"
1023 |         )
1024 |         response = requests.get(
1025 |             self.url, {"json": json.dumps(params, separators=(",", ":"))}
1026 |         )
1027 |         response.raise_for_status()
1028 |         if response.status_code == requests.codes.OK:
1029 |             return response.json()
1030 |         elif response.status_code == requests.codes.NO_CONTENT:
1031 |             return None
1032 |         else:
1033 |             raise Exception(f"Unexpected status: {response.status_code}")
1034 | 
1035 |     def __iter__(self) -> Iterator[str]:
1036 |         "Generator for all results as a list of identifiers"
1037 |         start = 0
1038 |         response = self._single_query(start=start)
1039 |         if response is None:
1040 |             return  # be explicit for mypy
1041 |         identifiers = self._extract_identifiers(response)
1042 |         start += self.rows
1043 |         logging.debug(f"Got {len(identifiers)} ids")
1044 | 
1045 |         if len(identifiers) == 0:
1046 |             return
1047 |         yield from identifiers
1048 | 
1049 |         total = response["total_count"]
1050 | 
1051 |         while start < total:
1052 |             assert len(identifiers) == self.rows
1053 |             response = self._single_query(start=start)
1054 |             identifiers = self._extract_identifiers(response)
1055 |             logging.debug(f"Got {len(identifiers)} ids")
1056 |             start += self.rows
1057 |             yield from identifiers
1058 | 
1059 |     def iquery(self, limit: Optional[int] = None) -> List[str]:
1060 |         """Evaluate the query and display an interactive progress bar.
1061 | 
1062 |         Requires tqdm.
1063 |         """
1064 |         from tqdm import trange  # type: ignore
1065 | 
1066 |         response = self._single_query(start=0)
1067 |         if response is None:
1068 |             return []
1069 |         total = response["total_count"]
1070 |         identifiers = self._extract_identifiers(response)
1071 |         if limit is not None and len(identifiers) >= limit:
1072 |             return identifiers[:limit]
1073 | 
1074 |         pages = math.ceil((total if limit is None else min(total, limit)) / self.rows)
1075 | 
1076 |         for page in trange(1, pages, initial=1, total=pages):
1077 |             response = self._single_query(page * self.rows)
1078 |             ids = self._extract_identifiers(response)
1079 |             identifiers.extend(ids)
1080 | 
1081 |         return identifiers[:limit]
1082 | 
1083 |     def rcsb_query_editor_url(self) -> str:
1084 |         """URL to edit this query in the RCSB query editor"""
1085 |         data = json.dumps(self._make_params(), separators=(",", ":"))
1086 |         return (
1087 |             f"http://search.rcsb.org/query-editor.html?json={urllib.parse.quote(data)}"
1088 |         )
1089 | 
1090 |     def rcsb_query_builder_url(self) -> str:
1091 |         """URL to view this query on the RCSB website query builder"""
1092 |         data = json.dumps(self._make_params(), separators=(",", ":"))
1093 |         return f"http://www.rcsb.org/search?request={urllib.parse.quote(data)}"
1094 | 


--------------------------------------------------------------------------------
/rcsbsearch/update_schema.py:
--------------------------------------------------------------------------------
 1 | """Update the distribution json files; for developer use only"""
 2 | import json
 3 | from pathlib import Path
 4 | 
 5 | try:
 6 |     from .schema import _download_json_schema
 7 | except Exception:
 8 |     # ignore errors that may occur parsing the schema
 9 |     pass
10 | 
if __name__ == "__main__":
    # Regenerate the bundled schema file that sits next to this module.
    path = Path(__file__).parent.joinpath("resources", "metadata_schema.json")
    print(path)
    # Download before opening the target: mode "wt" truncates the file as soon
    # as it is opened, so a failed download (or a failed import of
    # _download_json_schema above) would otherwise leave an empty schema behind.
    latest = _download_json_schema()
    with open(path, "wt") as file:
        json.dump(latest, file)
17 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Minimal dependencies
2 | --index-url https://pypi.python.org/simple/
3 | 
4 | -e .


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | # Full dependencies
2 | --index-url https://pypi.python.org/simple/
3 | 
4 | -e .[progressbar,docs,tests]


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | # consistent with black
 3 | max-line-length = 88
 4 | extend-ignore = E203, W503
 5 | 
 6 | [mypy]
 7 | #exclude = /build/
 8 | files = rcsbsearch
 9 | 
10 | [isort]
11 | # consistent with black
12 | multi_line_output = VERTICAL_HANGING_INDENT
13 | include_trailing_comma = True
14 | force_grid_wrap = 0
15 | use_parentheses = True
16 | ensure_newline_before_comments = True
17 | line_length = 88


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools  # type: ignore
 2 | import sys
 3 | 
 4 | # Load the version number from __init__.py
 5 | __version__ = "Undefined"
 6 | for line in open("rcsbsearch/__init__.py"):
 7 |     if line.startswith("__version__"):
 8 |         exec(line.strip())
 9 | 
10 | # Version-specific requirements
11 | install_requires = ["requests", "jsonschema"]
12 | if sys.version_info < (3, 8):
13 |     install_requires.append("typing_extensions")  # 3.7 only
14 | 
15 | # pin black version to get around https://github.com/psf/black/issues/2168
16 | tests_requires = ["tox", "pytest", "black==20.8b1", "flake8", "mypy"]
17 | 
18 | # README
19 | with open("README.md", "r") as fh:
20 |     long_description = fh.read()
21 | 
22 | 
23 | setuptools.setup(
24 |     name="rcsbsearch",
25 |     url="https://github.com/sbliven/rcsbsearch",
26 |     description="Access the RCSB Search API",
27 |     long_description=long_description,
28 |     long_description_content_type="text/markdown",
29 |     author="Spencer Bliven",
30 |     author_email="spencer.bliven@gmail.com",
31 |     version=__version__,
32 |     tests_require=tests_requires,
33 |     install_requires=install_requires,
34 |     extras_require={
35 |         "progressbar": ["tqdm"],
36 |         "tests": tests_requires,
37 |         # should match docs/requirements.txt
38 |         "docs": ["sphinx", "sphinx-rtd-theme", "myst-parser"],
39 |     },
40 |     packages=setuptools.find_packages(exclude=["tests"]),
41 |     package_data={"": ["resources/*"]},
42 |     scripts=[],
43 |     classifiers=[
44 |         "Programming Language :: Python :: 3",
45 |         "Programming Language :: Python :: 3 :: Only",
46 |         "Programming Language :: Python :: 3.7",
47 |         "Programming Language :: Python :: 3.8",
48 |         "Development Status :: 4 - Beta",
49 |         # "Development Status :: 5 - Production/Stable",
50 |         "Operating System :: OS Independent",
51 |         "Intended Audience :: Science/Research",
52 |         "License :: OSI Approved :: BSD License",
53 |         "Topic :: Scientific/Engineering :: Bio-Informatics",
54 |         "Typing :: Typed",
55 |     ],
56 |     # Uses dataclasses, f-strings, typing
57 |     python_requires=">=3.7",
58 | )
59 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbliven/rcsbsearch/c7f8cb7e9f26ed5c78af1688af972fd345de8978/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_schema.py:
--------------------------------------------------------------------------------
1 | from rcsbsearch import rcsb_attributes as attrs
2 | 
3 | 
def test_schema():
    "Schema attributes report their full dotted name, at top level and nested"
    top_level = attrs.rcsb_id
    assert top_level.attribute == "rcsb_id"

    nested = attrs.rcsb_struct_symmetry.symbol
    assert nested.attribute == "rcsb_struct_symmetry.symbol"
8 | 


--------------------------------------------------------------------------------
/tests/test_search.py:
--------------------------------------------------------------------------------
  1 | from itertools import islice
  2 | 
  3 | import pytest  # type: ignore
  4 | import requests
  5 | 
  6 | from rcsbsearch import Attr, Group, Session, Terminal, TextQuery, Value
  7 | from rcsbsearch import rcsb_attributes as attrs
  8 | from rcsbsearch.search import PartialQuery
  9 | 
 10 | # q1 = rcsb.Terminal("rcsb_struct_symmetry.type", "exact_match", "Icosahedral")
 11 | # q2 = rcsb.Terminal("rcsb_struct_symmetry.kind", "exact_match", "Global Symmetry")
 12 | 
 13 | 
 14 | def test_construction():
 15 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "2GS2"])
 16 |     q2 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "5T89"])
 17 | 
 18 |     both = q1 & q2
 19 |     assert isinstance(both, Group)
 20 |     assert both.operator == "and"
 21 |     assert both.nodes[0] == q1
 22 |     assert both.nodes[1] == q2
 23 | 
 24 |     either = q1 | q2
 25 |     assert isinstance(either, Group)
 26 |     assert either.operator == "or"
 27 |     assert either.nodes[0] == q1
 28 |     assert either.nodes[1] == q2
 29 | 
 30 | 
 31 | @pytest.mark.internet
 32 | def test_single():
 33 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "2GS2"])
 34 |     session = Session(Group("and", [q1]))
 35 |     result = session._single_query()
 36 |     assert result is not None
 37 | 
 38 | 
 39 | @pytest.mark.internet
 40 | @pytest.mark.progressbar
 41 | def test_iquery():
 42 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["4HHB", "2GS2"])
 43 |     session = Session(q1)
 44 |     result = session.iquery()
 45 |     assert len(result) == 2
 46 | 
 47 | 
 48 | @pytest.mark.internet
 49 | def test_iter():
 50 |     ids = ["4HHB", "2GS2"]
 51 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids)
 52 |     result = set(q1())
 53 |     assert len(result) == 2
 54 |     assert result == set(ids)
 55 | 
 56 | 
 57 | @pytest.mark.internet
 58 | def test_inv():
 59 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "exact_match", "5T89")
 60 |     q = ~q1
 61 |     # Lots of results
 62 |     first = next(iter(q()))
 63 |     assert first is not None
 64 |     assert first != "5T89"
 65 | 
 66 | 
 67 | @pytest.mark.internet
 68 | def test_xor():
 69 |     ids1 = ["5T89", "2GS2"]
 70 |     ids2 = ["4HHB", "2GS2"]
 71 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids1)
 72 |     q2 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids2)
 73 |     q = q1 ^ q2
 74 |     print(f"XOR Query: {q}")
 75 |     result = set(q())
 76 |     assert len(result) == 2
 77 |     assert result == {ids1[0], ids2[0]}
 78 | 
 79 | 
 80 | @pytest.mark.internet
 81 | def test_pagination():
 82 |     ids = ["4HHB", "2GS2", "5T89", "1TIM"]
 83 |     q1 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ids)
 84 | 
 85 |     # 2+2 results
 86 |     session = Session(q1, rows=2)
 87 |     result = set(session)
 88 |     assert len(result) == 4
 89 |     assert result == set(ids)
 90 | 
 91 |     # 3+1 results
 92 |     session = Session(q1, rows=3)
 93 |     result = set(session)
 94 |     assert len(result) == 4
 95 |     assert result == set(ids)
 96 | 
 97 |     # 1ABC will never be a valid ID
 98 |     q2 = Terminal("rcsb_entry_container_identifiers.entry_id", "in", ["1ABC"])
 99 |     session = Session(q2)
100 |     result = set(session)
101 |     assert len(result) == 0
102 | 
103 | 
@pytest.mark.internet
def test_errors():
    "Evaluating a malformed query raises requests.HTTPError"
    # Malformed
    q1 = Terminal("invalid_identifier", "exact_match", "ERROR")
    session = Session(q1)
    # pytest.raises fails the test by itself when no HTTPError is raised
    with pytest.raises(requests.HTTPError):
        set(session)
114 | 
115 | 
@pytest.mark.internet
def test_example1():
    """Run 'Biological Assembly Search' from http://search.rcsb.org/#examples

    The query is built twice, once with operators and once with the fluent
    syntax, and both forms must compare equal. (Also used in the README)
    """
    # One terminal per condition
    text = TextQuery('"heat-shock transcription factor"')
    symbol = attrs.rcsb_struct_symmetry.symbol == "C2"
    kind = attrs.rcsb_struct_symmetry.kind == "Global Symmetry"
    dna_count = attrs.rcsb_entry_info.polymer_entity_count_DNA >= 1

    # AND of all conditions, using the bitwise operator
    query = text & symbol & kind & dna_count

    hits = set(query("assembly"))
    assert len(hits) > 0  # 14 results 2020-06
    assert "1FYL-1" in hits

    # Same query in fluent syntax
    fluent = (
        TextQuery('"heat-shock transcription factor"')
        .and_("rcsb_struct_symmetry.symbol")
        .exact_match("C2")
        .and_("rcsb_struct_symmetry.kind")
        .exact_match("Global Symmetry")
        .and_("rcsb_entry_info.polymer_entity_count_DNA")
        .greater_or_equal(1)
    )

    assert fluent == query

    fluent_hits = set(fluent.exec("assembly"))
    assert len(fluent_hits) > 0  # 14 results 2020-06
    assert "1FYL-1" in fluent_hits
151 | 
152 | 
@pytest.mark.internet
def test_example2():
    "Run 'X-Ray Structures Search' from http://search.rcsb.org/#examples"
    text = Terminal(value='"thymidine kinase"')
    lineage = Terminal(
        "rcsb_entity_source_organism.taxonomy_lineage.name",
        "exact_match",
        "Viruses",
    )
    method = Terminal("exptl.method", "exact_match", "X-RAY DIFFRACTION")
    resolution = Terminal(
        "rcsb_entry_info.resolution_combined",
        "less_or_equal",
        2.5,
    )
    ligands = Terminal("rcsb_entry_info.nonpolymer_entity_count", "greater", 0)

    # AND of all five conditions, combined left to right
    combined = text & lineage & method & resolution & ligands

    hits = set(combined("entry"))
    assert len(hits) > 0  # 224 results 2020-06
    assert "1KI6" in hits
179 | 
180 | 
def test_attr():
    "Comparing an Attr with a value, from either side, gives an exact_match Terminal"
    attr = Attr("attr")

    # Attr on the left, a plain value on the left, a wrapped Value on the left
    comparisons = (
        lambda: attr == "value",
        lambda: "value" == attr,
        lambda: Value("value") == attr,
    )
    for compare in comparisons:
        term = compare()
        assert isinstance(term, Terminal)
        assert term.operator == "exact_match"
195 | 
196 | 
@pytest.mark.internet
def test_freetext():
    "A free-text search for tubulin returns at least one result"
    hits = set(TextQuery("tubulin")())
    assert len(hits) > 0
202 | 
203 | 
def test_partialquery():
    "Fluent and_/or_ calls give a PartialQuery until an operator completes them"
    partial_and = Attr("a").equals("aval").and_("b")

    assert isinstance(partial_and, PartialQuery)

    # Completing the partial query gives an AND group of both terminals
    pair = partial_and.exact_match("bval")

    assert isinstance(pair, Group)
    assert pair.operator == "and"
    assert len(pair.nodes) == 2
    first = pair.nodes[0]
    assert first.attribute == "a"
    assert first.operator == "equals"
    assert first.value == "aval"
    second = pair.nodes[1]
    assert second.attribute == "b"
    assert second.operator == "exact_match"
    assert second.value == "bval"

    # and_ with a finished query adds a third node to the same group
    triple = pair.and_(Attr("c") < 5)
    assert len(triple.nodes) == 3
    third = triple.nodes[2]
    assert third.attribute == "c"
    assert third.operator == "less"
    assert third.value == 5

    partial_or = triple.or_("d")

    assert isinstance(partial_or, PartialQuery)
    assert partial_or.attr == Attr("d")
    assert partial_or.operator == "or"

    # == completes the partial query; the earlier AND group becomes one node
    either = partial_or == "dval"
    assert isinstance(either, Group)
    assert either.operator == "or"
    assert len(either.nodes) == 2
    assert isinstance(either.nodes[0], Group)
    last = either.nodes[1]
    assert last.attribute == "d"
    assert last.operator == "exact_match"
    assert last.value == "dval"
241 | 
242 | 
@pytest.mark.internet
def test_operators():
    """Exercise several attribute operators against the live search service.

    Every query here is executed, so the test carries the ``internet`` marker
    like the other tests that run queries.
    """
    q1 = attrs.rcsb_id.in_(["4HHB", "2GS2"])
    results = list(q1())
    assert len(results) == 2

    q1 = attrs.citation.rcsb_authors.contains_words("kisko bliven")
    results = list(q1())
    assert results[0] == "5T89"  # first hit has both authors
    assert "3V6B" in results  # only a single author

    q1 = attrs.citation.rcsb_authors.contains_phrase("kisko bliven")
    results = list(q1())
    assert len(results) == 0

    q1 = attrs.struct.title.contains_phrase(
        "VEGF-A in complex with VEGFR-1 domains D1-6"
    )
    results = list(q1())
    assert "5T89" in results

    # Only the first five results are fetched from this query
    q1 = attrs.rcsb_struct_symmetry.type.exact_match("Asymmetric")
    results = list(islice(q1(), 5))
    assert len(results) == 5

    q1 = attrs.rcsb_struct_symmetry.type.exact_match("symmetric")
    results = list(islice(q1(), 5))
    assert len(results) == 0
270 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py37, py38, py39, lint, docs
 3 | 
 4 | [testenv:lint]
 5 | # pin black version to get around https://github.com/psf/black/issues/2168
 6 | deps =
 7 |     black==20.8b1
 8 |     flake8
 9 |     mypy
10 | commands =
11 |     black --check .
12 |     flake8
13 |     mypy rcsbsearch tests
14 | 
15 | [testenv:docs]
16 | deps = -rdocs/requirements.txt
17 | changedir = docs
18 | whitelist_externals = make
19 | commands =
20 |     make clean
21 |     make html
22 | 
23 | [testenv]
24 | deps =
25 |     pytest
26 |     tqdm
27 | commands =
28 |     pytest {posargs}


--------------------------------------------------------------------------------