├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── codeql │ ├── cpp.yaml │ └── python.yaml └── workflows │ ├── codeql-analysis.yml │ ├── ghpages.yml │ ├── linting.yml │ ├── pytest.yml │ └── release.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Pipfile ├── Pipfile.lock ├── README.md ├── README.rst ├── docs ├── Makefile ├── make.bat └── source │ ├── _config.yml │ ├── _static │ ├── rdflib-hdt-250.png │ ├── rdflib-hdt.png │ └── rdflib-hdt.svg │ ├── api.rst │ ├── conf.py │ ├── hdtdocument.rst │ ├── hdtstore.rst │ ├── index.rst │ └── installation.rst ├── include ├── docstrings.hpp ├── hdt_document.hpp ├── join_iterator.hpp ├── join_iterator_bytes.hpp ├── pyhdt_types.hpp ├── triple_iterator.hpp ├── triple_iterator_bytes.hpp └── tripleid_iterator.hpp ├── install.sh ├── pyproject.toml ├── rdflib_hdt ├── __init__.py ├── hdt_document.py ├── hdt_store.py ├── iterators.py ├── mapping.py ├── sparql_op.py └── types.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── src ├── hdt.cpp ├── hdt_document.cpp ├── join_iterator.cpp ├── join_iterator_bytes.cpp ├── triple_iterator.cpp ├── triple_iterator_bytes.cpp └── tripleid_iterator.cpp └── tests ├── __init__.py ├── hdt_document_test.py ├── hdt_iterators_test.py ├── hdt_store_test.py ├── join_iterator_test.py ├── test.hdt └── wrappers_test.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 
22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/codeql/cpp.yaml: -------------------------------------------------------------------------------- 1 | name: "rdflib-hdt CodeQL C++ config" 2 | 3 | queries: 4 | - uses: security-and-quality 5 | 6 | paths: 7 | - src 8 | - include 9 | -------------------------------------------------------------------------------- /.github/codeql/python.yaml: -------------------------------------------------------------------------------- 1 | name: "rdflib-hdt CodeQL Python config" 2 | 3 | queries: 4 | - uses: security-and-quality 5 | 6 | paths: 7 | - rdflib_hdt 8 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: 🔒 CodeQL 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | schedule: 9 | - cron: '00 14 1 * *' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'cpp', 'python' ] 24 | python-version: [3.7] 25 | hdt-version: ['v1.3.3'] 26 | 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v2 30 | 31 | # Initializes the CodeQL tools for scanning. 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v1 34 | with: 35 | languages: ${{ matrix.language }} 36 | # we use a specific config file per language, because they need to scan different paths 37 | # for example, in C++, we do not want to scan hdt-cpp sources 38 | config-file: ./.github/codeql/${{ matrix.language }}.yaml 39 | # If you wish to specify custom queries, you can do so here or in a config file. 40 | # By default, queries listed here will override any specified in a config file. 
41 | # Prefix the list here with "+" to use these queries and those in the config file. 42 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 43 | - name: Set up Python ${{ matrix.python-version }} 🐍 44 | uses: actions/setup-python@v2 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Setup pipenv 48 | uses: dschep/install-pipenv-action@v1 49 | - name: Setup HDT ${{ matrix.hdt-version }} 50 | uses: Callidon/setup-hdt-action@v1.2 51 | with: 52 | token: ${{ secrets.GITHUB_TOKEN }} 53 | hdt-tag: ${{ matrix.hdt-version }} 54 | source-path: ./ 55 | - name: Install dependencies 56 | run: pipenv install --dev 57 | - name: Compile & install package 58 | run: pipenv run python setup.py install 59 | - name: Perform CodeQL Analysis (${{ matrix.language }}) 60 | uses: github/codeql-action/analyze@v1 61 | -------------------------------------------------------------------------------- /.github/workflows/ghpages.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Deploy documentation 2 | on: 3 | release: 4 | types: [created] 5 | jobs: 6 | doc: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: [3.7] 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python ${{ matrix.python-version }} 🐍 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Setup pipenv 18 | uses: dschep/install-pipenv-action@v1 19 | - name: Setup HDT v1.3.3 20 | uses: Callidon/setup-hdt-action@v1.2 21 | with: 22 | token: ${{ secrets.GITHUB_TOKEN }} 23 | hdt-tag: v1.3.3 24 | source-path: ./ 25 | - name: Install dependencies 26 | run: pipenv install --dev 27 | - name: Compile & install package 28 | run: pipenv run python setup.py install 29 | - name: Build documentation 30 | run: | 31 | cd docs && pipenv run make html 32 | - name: Deploy documentation to gh-pages 33 | uses: peaceiris/actions-gh-pages@v3 34 | with: 35 | github_token: ${{ 
secrets.GITHUB_TOKEN }} 36 | publish_dir: ./docs/build/html 37 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: 🔍 Code quality and security 2 | on: 3 | push: 4 | branches: [ master ] 5 | pull_request: 6 | branches: [ master ] 7 | jobs: 8 | flake8: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: [3.7] 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 🐍 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install flake8 20 | run: pip install flake8 21 | - name: Lint with flake8 22 | run: | 23 | # stop the build if there are Python syntax errors or undefined names 24 | flake8 rdflib_hdt/*.py --count --select=E9,F63,F7,F82 --show-source --statistics 25 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 26 | flake8 rdflib_hdt/*.py --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 27 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: ⏳ Tests 2 | on: 3 | push: 4 | branches: [ master ] 5 | pull_request: 6 | branches: [ master ] 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: [3.7] 13 | hdt-version: ['v1.3.3'] 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python-version }} 🐍 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Setup pipenv 21 | uses: dschep/install-pipenv-action@v1 22 | - name: Cache pipenv dependencies 23 | uses: actions/cache@v2 24 | id: cache-pipenv 25 | with: 26 | path: ~/.local/share/virtualenvs 27 | key: ${{ runner.os 
}}-${{ matrix.python-version }}-pipenv-${{ hashFiles('**/Pipfile.lock') }} 28 | - name: Setup HDT ${{ matrix.hdt-version }} 29 | uses: Callidon/setup-hdt-action@v1.2 30 | with: 31 | token: ${{ secrets.GITHUB_TOKEN }} 32 | hdt-tag: ${{ matrix.hdt-version }} 33 | source-path: ./ 34 | - name: Install dependencies 35 | if: steps.cache-pipenv.outputs.cache-hit != 'true' 36 | run: pipenv install --dev 37 | - name: Compile & install package 38 | run: pipenv run python setup.py install 39 | - name: Test with pytest 40 | run: pipenv run pytest 41 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI 2 | on: push 3 | jobs: 4 | build: 5 | name: Build distribution 📦 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - uses: actions/checkout@v4 10 | with: 11 | persist-credentials: false 12 | - name: Set up Python 13 | uses: actions/setup-python@v5 14 | with: 15 | python-version: "3.7.17" 16 | - name: Install pypa/build 17 | run: >- 18 | python3 -m 19 | pip install 20 | build 21 | --user 22 | - name: Build a binary wheel and a source tarball 23 | run: python3 -m build 24 | - name: Store the distribution packages 25 | uses: actions/upload-artifact@v4 26 | with: 27 | name: python-package-distributions 28 | path: dist/ 29 | 30 | publish-to-pypi: 31 | name: >- 32 | Publish Python 🐍 distribution 📦 to PyPI 33 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 34 | needs: 35 | - build 36 | runs-on: ubuntu-latest 37 | environment: 38 | name: pypi 39 | url: https://pypi.org/p/rdflib-hdt 40 | permissions: 41 | id-token: write # IMPORTANT: mandatory for trusted publishing 42 | 43 | steps: 44 | - name: Download all the dists 45 | uses: actions/download-artifact@v4 46 | with: 47 | name: python-package-distributions 48 | path: dist/ 49 | - name: Publish distribution 📦 to PyPI 50 | 
uses: pypa/gh-action-pypi-publish@release/v1 51 | 52 | github-release: 53 | name: >- 54 | Sign the Python 🐍 distribution 📦 with Sigstore 55 | and upload them to GitHub Release 56 | needs: 57 | - publish-to-pypi 58 | runs-on: ubuntu-latest 59 | 60 | permissions: 61 | contents: write # IMPORTANT: mandatory for making GitHub Releases 62 | id-token: write # IMPORTANT: mandatory for sigstore 63 | 64 | steps: 65 | - name: Download all the dists 66 | uses: actions/download-artifact@v4 67 | with: 68 | name: python-package-distributions 69 | path: dist/ 70 | - name: Sign the dists with Sigstore 71 | uses: sigstore/gh-action-sigstore-python@v3.0.0 72 | with: 73 | inputs: >- 74 | ./dist/*.tar.gz 75 | ./dist/*.whl 76 | - name: Create GitHub Release 77 | env: 78 | GITHUB_TOKEN: ${{ github.token }} 79 | run: >- 80 | gh release create 81 | "$GITHUB_REF_NAME" 82 | --repo "$GITHUB_REPOSITORY" 83 | --notes "" 84 | - name: Upload artifact signatures to GitHub Release 85 | env: 86 | GITHUB_TOKEN: ${{ github.token }} 87 | # Upload to GitHub Release using the `gh` CLI. 88 | # `dist/` contains the built packages, and the 89 | # sigstore-produced signatures and certificates. 
90 | run: >- 91 | gh release upload 92 | "$GITHUB_REF_NAME" dist/** 93 | --repo "$GITHUB_REPOSITORY" 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # C/C++ 2 | # Prerequisites 3 | *.d 4 | 5 | # VSCode 6 | .vscode 7 | 8 | # Compiled Object files 9 | *.slo 10 | *.lo 11 | *.o 12 | *.obj 13 | .pytest_cache/ 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Compiled Dynamic libraries 20 | *.so 21 | *.dylib 22 | *.dll 23 | 24 | # Fortran module files 25 | *.mod 26 | *.smod 27 | 28 | # Compiled Static libraries 29 | *.lai 30 | *.la 31 | *.a 32 | *.lib 33 | 34 | # Executables 35 | *.exe 36 | *.out 37 | *.app 38 | 39 | # Python 40 | # Byte-compiled / optimized / DLL files 41 | __pycache__/ 42 | *.py[cod] 43 | *$py.class 44 | 45 | # C extensions 46 | *.so 47 | 48 | # Distribution / packaging 49 | .Python 50 | build/ 51 | develop-eggs/ 52 | dist/ 53 | downloads/ 54 | eggs/ 55 | .eggs/ 56 | lib/ 57 | lib64/ 58 | parts/ 59 | sdist/ 60 | var/ 61 | wheels/ 62 | *.egg-info/ 63 | .installed.cfg 64 | *.egg 65 | MANIFEST 66 | 67 | # PyInstaller 68 | # Usually these files are written by a python script from a template 69 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
70 | *.manifest 71 | *.spec 72 | 73 | # Installer logs 74 | pip-log.txt 75 | pip-delete-this-directory.txt 76 | 77 | # Unit test / coverage reports 78 | htmlcov/ 79 | .tox/ 80 | .coverage 81 | .coverage.* 82 | .cache 83 | nosetests.xml 84 | coverage.xml 85 | *.cover 86 | .hypothesis/ 87 | 88 | # Translations 89 | *.mo 90 | *.pot 91 | 92 | # Django stuff: 93 | *.log 94 | .static_storage/ 95 | .media/ 96 | local_settings.py 97 | 98 | # Flask stuff: 99 | instance/ 100 | .webassets-cache 101 | 102 | # Scrapy stuff: 103 | .scrapy 104 | 105 | # Sphinx documentation 106 | docs/_build/ 107 | 108 | # PyBuilder 109 | target/ 110 | 111 | # Jupyter Notebook 112 | .ipynb_checkpoints 113 | 114 | # pyenv 115 | .python-version 116 | 117 | # celery beat schedule file 118 | celerybeat-schedule 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | 145 | # HDT 146 | *.hdt.index.v* 147 | hdt-cpp-* 148 | hdt-cpp.zip 149 | v1.3.*.zip 150 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-2019 Thomas Minier 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice 
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | graft include/ 3 | graft hdt-cpp-1.3.3/libhdt/src/bitsequence 4 | graft hdt-cpp-1.3.3/libhdt/src/dictionary 5 | graft hdt-cpp-1.3.3/libhdt/src/hdt 6 | graft hdt-cpp-1.3.3/libhdt/src/header 7 | graft hdt-cpp-1.3.3/libhdt/src/huffman 8 | graft hdt-cpp-1.3.3/libhdt/src/libdcs 9 | graft hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex 10 | graft hdt-cpp-1.3.3/libhdt/src/rdf 11 | graft hdt-cpp-1.3.3/libhdt/src/sequence 12 | graft hdt-cpp-1.3.3/libhdt/src/triples 13 | graft hdt-cpp-1.3.3/libhdt/src/util 14 | graft hdt-cpp-1.3.3/libhdt/third 15 | graft hdt-cpp-1.3.3/libhdt/include/ 16 | graft hdt-cpp-1.3.3/libhdt/src/dictionary/ 17 | graft hdt-cpp-1.3.3/libhdt/src/sparql 18 | graft hdt-cpp-1.3.3/libcds/include/ 19 | graft hdt-cpp-1.3.3/libcds/src/static/bitsequence 20 | graft hdt-cpp-1.3.3/libcds/src/static/coders 21 | graft hdt-cpp-1.3.3/libcds/src/static/mapper 22 | graft hdt-cpp-1.3.3/libcds/src/static/permutation 23 | graft hdt-cpp-1.3.3/libcds/src/static/sequence 24 | graft hdt-cpp-1.3.3/libcds/src/utils 25 | -------------------------------------------------------------------------------- /Pipfile: 
-------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [packages] 7 | pybind11 = "==2.2.4" 8 | rdflib = "==5.0.0" 9 | 10 | [dev-packages] 11 | pytest = "==5.4.1" 12 | flake8 = "*" 13 | sphinx = "*" 14 | sphinx-rtd-theme = "*" 15 | 16 | [requires] 17 | python_version = "3.7" 18 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "b00b04968478c4faa74499d7dbeaafdd85c4e26905043d85f77051e5d088b1a1" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.7" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "isodate": { 20 | "hashes": [ 21 | "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96", 22 | "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9" 23 | ], 24 | "version": "==0.6.1" 25 | }, 26 | "pybind11": { 27 | "hashes": [ 28 | "sha256:642abbbd2948ed5af28e69adfae1535347c7aa9eb0cdab130e20e1f198f8e1cf", 29 | "sha256:bd68159013d20c79bf79893b174a6ee7f74af740bf60ae731565f5d8d4094403" 30 | ], 31 | "index": "pypi", 32 | "version": "==2.2.4" 33 | }, 34 | "pyparsing": { 35 | "hashes": [ 36 | "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", 37 | "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" 38 | ], 39 | "markers": "python_full_version >= '3.6.8'", 40 | "version": "==3.0.9" 41 | }, 42 | "rdflib": { 43 | "hashes": [ 44 | "sha256:78149dd49d385efec3b3adfbd61c87afaf1281c30d3fcaf1b323b34f603fb155", 45 | "sha256:88208ea971a87886d60ae2b1a4b2cdc263527af0454c422118d43fe64b357877" 46 | ], 47 | "index": "pypi", 48 | "version": "==5.0.0" 49 | }, 50 | "six": { 51 | "hashes": 
[ 52 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", 53 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" 54 | ], 55 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 56 | "version": "==1.16.0" 57 | } 58 | }, 59 | "develop": { 60 | "alabaster": { 61 | "hashes": [ 62 | "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", 63 | "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" 64 | ], 65 | "version": "==0.7.12" 66 | }, 67 | "attrs": { 68 | "hashes": [ 69 | "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", 70 | "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" 71 | ], 72 | "markers": "python_version >= '3.5'", 73 | "version": "==22.1.0" 74 | }, 75 | "babel": { 76 | "hashes": [ 77 | "sha256:1ad3eca1c885218f6dce2ab67291178944f810a10a9b5f3cb8382a5a232b64fe", 78 | "sha256:5ef4b3226b0180dedded4229651c8b0e1a3a6a2837d45a073272f313e4cf97f6" 79 | ], 80 | "markers": "python_version >= '3.6'", 81 | "version": "==2.11.0" 82 | }, 83 | "certifi": { 84 | "hashes": [ 85 | "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3", 86 | "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18" 87 | ], 88 | "index": "pypi", 89 | "version": "==2022.12.7" 90 | }, 91 | "charset-normalizer": { 92 | "hashes": [ 93 | "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", 94 | "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" 95 | ], 96 | "markers": "python_version >= '3.6'", 97 | "version": "==2.1.1" 98 | }, 99 | "docutils": { 100 | "hashes": [ 101 | "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6", 102 | "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc" 103 | ], 104 | "markers": "python_version >= '3.7'", 105 | "version": "==0.19" 106 | }, 107 | "entrypoints": { 108 | "hashes": [ 109 | 
"sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", 110 | "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" 111 | ], 112 | "markers": "python_version >= '2.7'", 113 | "version": "==0.3" 114 | }, 115 | "flake8": { 116 | "hashes": [ 117 | "sha256:45681a117ecc81e870cbf1262835ae4af5e7a8b08e40b944a8a6e6b895914cfb", 118 | "sha256:49356e766643ad15072a789a20915d3c91dc89fd313ccd71802303fd67e4deca" 119 | ], 120 | "index": "pypi", 121 | "version": "==3.7.9" 122 | }, 123 | "idna": { 124 | "hashes": [ 125 | "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", 126 | "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" 127 | ], 128 | "markers": "python_version >= '3.5'", 129 | "version": "==3.4" 130 | }, 131 | "imagesize": { 132 | "hashes": [ 133 | "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", 134 | "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a" 135 | ], 136 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 137 | "version": "==1.4.1" 138 | }, 139 | "importlib-metadata": { 140 | "hashes": [ 141 | "sha256:d5059f9f1e8e41f80e9c56c2ee58811450c31984dfa625329ffd7c0dad88a73b", 142 | "sha256:d84d17e21670ec07990e1044a99efe8d615d860fd176fc29ef5c306068fda313" 143 | ], 144 | "markers": "python_version < '3.8'", 145 | "version": "==5.1.0" 146 | }, 147 | "jinja2": { 148 | "hashes": [ 149 | "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", 150 | "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" 151 | ], 152 | "markers": "python_version >= '3.7'", 153 | "version": "==3.1.2" 154 | }, 155 | "markupsafe": { 156 | "hashes": [ 157 | "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003", 158 | "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88", 159 | "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5", 160 | 
"sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7", 161 | "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a", 162 | "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603", 163 | "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1", 164 | "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135", 165 | "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247", 166 | "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6", 167 | "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601", 168 | "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77", 169 | "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02", 170 | "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e", 171 | "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63", 172 | "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f", 173 | "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980", 174 | "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b", 175 | "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812", 176 | "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff", 177 | "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96", 178 | "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1", 179 | "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925", 180 | "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a", 181 | "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6", 182 | "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e", 183 | "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f", 184 | 
"sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4", 185 | "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f", 186 | "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3", 187 | "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c", 188 | "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a", 189 | "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417", 190 | "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a", 191 | "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a", 192 | "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37", 193 | "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452", 194 | "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933", 195 | "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", 196 | "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" 197 | ], 198 | "markers": "python_version >= '3.7'", 199 | "version": "==2.1.1" 200 | }, 201 | "mccabe": { 202 | "hashes": [ 203 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 204 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" 205 | ], 206 | "version": "==0.6.1" 207 | }, 208 | "more-itertools": { 209 | "hashes": [ 210 | "sha256:250e83d7e81d0c87ca6bd942e6aeab8cc9daa6096d12c5308f3f92fa5e5c1f41", 211 | "sha256:5a6257e40878ef0520b1803990e3e22303a41b5714006c32a3fd8304b26ea1ab" 212 | ], 213 | "markers": "python_version >= '3.7'", 214 | "version": "==9.0.0" 215 | }, 216 | "packaging": { 217 | "hashes": [ 218 | "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3", 219 | "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3" 220 | ], 221 | "markers": "python_version >= '3.7'", 222 | "version": "==22.0" 223 | }, 224 | "pluggy": { 225 | "hashes": [ 226 
| "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", 227 | "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" 228 | ], 229 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 230 | "version": "==0.13.1" 231 | }, 232 | "py": { 233 | "hashes": [ 234 | "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", 235 | "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" 236 | ], 237 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", 238 | "version": "==1.11.0" 239 | }, 240 | "pycodestyle": { 241 | "hashes": [ 242 | "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", 243 | "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" 244 | ], 245 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 246 | "version": "==2.5.0" 247 | }, 248 | "pyflakes": { 249 | "hashes": [ 250 | "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", 251 | "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" 252 | ], 253 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 254 | "version": "==2.1.1" 255 | }, 256 | "pygments": { 257 | "hashes": [ 258 | "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1", 259 | "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42" 260 | ], 261 | "markers": "python_version >= '3.6'", 262 | "version": "==2.13.0" 263 | }, 264 | "pytest": { 265 | "hashes": [ 266 | "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172", 267 | "sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970" 268 | ], 269 | "index": "pypi", 270 | "version": "==5.4.1" 271 | }, 272 | "pytz": { 273 | "hashes": [ 274 | "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427", 275 | 
"sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2" 276 | ], 277 | "version": "==2022.6" 278 | }, 279 | "requests": { 280 | "hashes": [ 281 | "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", 282 | "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" 283 | ], 284 | "markers": "python_version >= '3.7' and python_version < '4'", 285 | "version": "==2.28.1" 286 | }, 287 | "setuptools": { 288 | "hashes": [ 289 | "sha256:57f6f22bde4e042978bcd50176fdb381d7c21a9efa4041202288d3737a0c6a54", 290 | "sha256:a7620757bf984b58deaf32fc8a4577a9bbc0850cf92c20e1ce41c38c19e5fb75" 291 | ], 292 | "markers": "python_version >= '3.7'", 293 | "version": "==65.6.3" 294 | }, 295 | "snowballstemmer": { 296 | "hashes": [ 297 | "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", 298 | "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" 299 | ], 300 | "version": "==2.2.0" 301 | }, 302 | "sphinx": { 303 | "hashes": [ 304 | "sha256:62edfd92d955b868d6c124c0942eba966d54b5f3dcb4ded39e65f74abac3f572", 305 | "sha256:f5505d74cf9592f3b997380f9bdb2d2d0320ed74dd69691e3ee0644b956b8d83" 306 | ], 307 | "index": "pypi", 308 | "version": "==3.0.3" 309 | }, 310 | "sphinx-rtd-theme": { 311 | "hashes": [ 312 | "sha256:00cf895504a7895ee433807c62094cf1e95f065843bf3acd17037c3e9a2becd4", 313 | "sha256:728607e34d60456d736cc7991fd236afb828b21b82f956c5ea75f94c8414040a" 314 | ], 315 | "index": "pypi", 316 | "version": "==0.4.3" 317 | }, 318 | "sphinxcontrib-applehelp": { 319 | "hashes": [ 320 | "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", 321 | "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" 322 | ], 323 | "markers": "python_version >= '3.5'", 324 | "version": "==1.0.2" 325 | }, 326 | "sphinxcontrib-devhelp": { 327 | "hashes": [ 328 | "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", 329 | 
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" 330 | ], 331 | "markers": "python_version >= '3.5'", 332 | "version": "==1.0.2" 333 | }, 334 | "sphinxcontrib-htmlhelp": { 335 | "hashes": [ 336 | "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", 337 | "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" 338 | ], 339 | "markers": "python_version >= '3.6'", 340 | "version": "==2.0.0" 341 | }, 342 | "sphinxcontrib-jsmath": { 343 | "hashes": [ 344 | "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", 345 | "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" 346 | ], 347 | "markers": "python_version >= '3.5'", 348 | "version": "==1.0.1" 349 | }, 350 | "sphinxcontrib-qthelp": { 351 | "hashes": [ 352 | "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", 353 | "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" 354 | ], 355 | "markers": "python_version >= '3.5'", 356 | "version": "==1.0.3" 357 | }, 358 | "sphinxcontrib-serializinghtml": { 359 | "hashes": [ 360 | "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", 361 | "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" 362 | ], 363 | "markers": "python_version >= '3.5'", 364 | "version": "==1.1.5" 365 | }, 366 | "typing-extensions": { 367 | "hashes": [ 368 | "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", 369 | "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" 370 | ], 371 | "markers": "python_version < '3.8'", 372 | "version": "==4.4.0" 373 | }, 374 | "urllib3": { 375 | "hashes": [ 376 | "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc", 377 | "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8" 378 | ], 379 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", 380 | "version": 
"==1.26.13" 381 | }, 382 | "wcwidth": { 383 | "hashes": [ 384 | "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784", 385 | "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83" 386 | ], 387 | "version": "==0.2.5" 388 | }, 389 | "zipp": { 390 | "hashes": [ 391 | "sha256:83a28fcb75844b5c0cdaf5aa4003c2d728c77e05f5aeabe8e95e56727005fbaa", 392 | "sha256:a7a22e05929290a67401440b39690ae6563279bced5f314609d9d03798f56766" 393 | ], 394 | "markers": "python_version >= '3.7'", 395 | "version": "==3.11.0" 396 | } 397 | } 398 | } 399 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](docs/source/_static/rdflib-hdt-250.png) 2 | 3 | # rdflib-hdt 4 | 5 | ![Python tests](https://github.com/RDFLib/rdflib-hdt/workflows/Python%20tests/badge.svg) [![PyPI version](https://badge.fury.io/py/rdflib-hdt.svg)](https://badge.fury.io/py/rdflib-hdt) 6 | 7 | A Store back-end for [rdflib](https://github.com/RDFLib) to allow for reading and querying HDT documents. 8 | 9 | [Online Documentation](https://rdflib.dev/rdflib-hdt/) 10 | 11 | # Requirements 12 | 13 | * Python *version 3.6.4 or higher* 14 | * [pip](https://pip.pypa.io/en/stable/) 15 | * **gcc/clang** with **c++11 support** 16 | * **Python Development headers** 17 | > You should have the `Python.h` header available on your system. 18 | > For example, for Python 3.6, install the `python3.6-dev` package on Debian/Ubuntu systems. 
19 | 20 | # Installation 21 | 22 | Installation using [pipenv](https://github.com/pypa/pipenv) or a [virtualenv](https://virtualenv.pypa.io/en/stable/) is **strongly advised!** 23 | 24 | ## PyPi installation (recommended) 25 | 26 | ```bash 27 | # you can install using pip 28 | pip install rdflib-hdt 29 | 30 | # or you can use pipenv 31 | pipenv install rdflib-hdt 32 | ``` 33 | 34 | ## Manual installation 35 | 36 | **Requirement:** [pipenv](https://github.com/pypa/pipenv) 37 | 38 | ``` 39 | git clone https://github.com/Callidon/pyHDT 40 | cd pyHDT/ 41 | ./install.sh 42 | ``` 43 | 44 | # Getting started 45 | 46 | You can use the `rdflib-hdt` library in two modes: as an rdflib Graph or as a raw HDT document. 47 | 48 | ## Graph usage (recommended) 49 | 50 | ```python 51 | from rdflib import Graph 52 | from rdflib_hdt import HDTStore 53 | from rdflib.namespace import FOAF 54 | 55 | # Load an HDT file. Missing indexes are generated automatically 56 | # You can provide the index file by putting it in the same directory as the HDT file. 57 | store = HDTStore("test.hdt") 58 | 59 | # Display some metadata about the HDT document itself 60 | print(f"Number of RDF triples: {len(store)}") 61 | print(f"Number of subjects: {store.nb_subjects}") 62 | print(f"Number of predicates: {store.nb_predicates}") 63 | print(f"Number of objects: {store.nb_objects}") 64 | print(f"Number of shared subject-object: {store.nb_shared}") 65 | 66 | # Create an RDFlib Graph with the HDT document as a backend 67 | graph = Graph(store=store) 68 | 69 | # Fetch all triples that match { ?s foaf:name ?o } 70 | # Use None to indicate variables 71 | for s, p, o in graph.triples((None, FOAF("name"), None)): 72 | print(s, p, o) 73 | ``` 74 | 75 | Using the RDFlib API, you can also [execute SPARQL queries](https://rdflib.readthedocs.io/en/stable/intro_to_sparql.html) over an HDT document.
76 | If you do so, we recommend that you first call the `optimize_sparql` function, which optimizes 77 | the RDFlib SPARQL query engine in the context of HDT documents. 78 | 79 | ```python 80 | from rdflib import Graph 81 | from rdflib_hdt import HDTStore, optimize_sparql 82 | 83 | # Calling this function optimizes the RDFlib SPARQL engine for HDT documents 84 | optimize_sparql() 85 | 86 | graph = Graph(store=HDTStore("test.hdt")) 87 | 88 | # You can execute SPARQL queries using the regular RDFlib API 89 | qres = graph.query(""" 90 | PREFIX foaf: <http://xmlns.com/foaf/0.1/> 91 | SELECT ?name ?friend WHERE { 92 | ?a foaf:knows ?b. 93 | ?a foaf:name ?name. 94 | ?b foaf:name ?friend. 95 | }""") 96 | 97 | for row in qres: 98 | print(f"{row.name} knows {row.friend}") 99 | ``` 100 | 101 | ## HDT Document usage 102 | 103 | ```python 104 | from rdflib_hdt import HDTDocument 105 | from rdflib.namespace import FOAF 106 | 107 | # Load an HDT file. Missing indexes are generated automatically. 108 | # You can provide the index file by putting it in the same directory as the HDT file. 109 | document = HDTDocument("test.hdt") 110 | 111 | # Display some metadata about the HDT document itself 112 | print(f"Number of RDF triples: {document.total_triples}") 113 | print(f"Number of subjects: {document.nb_subjects}") 114 | print(f"Number of predicates: {document.nb_predicates}") 115 | print(f"Number of objects: {document.nb_objects}") 116 | print(f"Number of shared subject-object: {document.nb_shared}") 117 | 118 | # Fetch all triples that match { ?s foaf:name ?o } 119 | # Use None to indicate variables 120 | triples, cardinality = document.search((None, FOAF("name"), None)) 121 | 122 | print(f"Cardinality of (?s foaf:name ?o): {cardinality}") 123 | for s, p, o in triples: 124 | print(s, p, o) 125 | 126 | # The search also supports limit and offset 127 | triples, cardinality = document.search((None, FOAF("name"), None), limit=10, offset=100) 128 | # etc ...
129 | ``` 130 | 131 | An HDT document also provides support for evaluating joins over a set of triple patterns. 132 | 133 | ```python 134 | from rdflib_hdt import HDTDocument 135 | from rdflib import Variable 136 | from rdflib.namespace import FOAF, RDF 137 | 138 | document = HDTDocument("test.hdt") 139 | 140 | # find the names of two entities that know each other 141 | tp_a = (Variable("a"), FOAF("knows"), Variable("b")) 142 | tp_b = (Variable("a"), FOAF("name"), Variable("name")) 143 | tp_c = (Variable("b"), FOAF("name"), Variable("friend")) 144 | query = set([tp_a, tp_b, tp_c]) 145 | 146 | iterator = document.search_join(query) 147 | print(f"Estimated join cardinality: {len(iterator)}") 148 | 149 | # Join results are produced as ResultRow, like in the RDFlib SPARQL API 150 | for row in iterator: 151 | print(f"{row.name} knows {row.friend}") 152 | ``` 153 | 154 | # Handling non UTF-8 strings in python 155 | 156 | If the HDT document has been encoded with a non UTF-8 encoding, the previous code won't work correctly and will result in a `UnicodeDecodeError`. 157 | More details on how strings are converted from C++ to Python can be found [here](https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html). 158 | 159 | To handle this, we doubled the API of the HDT document by adding: 160 | - `search_triples_bytes(...)` returns an iterator of triples as `(py::bytes, py::bytes, py::bytes)` 161 | - `search_join_bytes(...)` returns an iterator of sets of solution mappings as `py::set(py::bytes, py::bytes)` 162 | - `convert_tripleid_bytes(...)` returns a triple as: `(py::bytes, py::bytes, py::bytes)` 163 | - `convert_id_bytes(...)` returns a `py::bytes` 164 | 165 | **Parameters and documentation are the same as the standard version** 166 | 167 | ```python 168 | from rdflib_hdt import HDTDocument 169 | 170 | document = HDTDocument("test.hdt") 171 | it = document.search_triples_bytes("", "", "") 172 | 173 | for s, p, o in it: 174 | print(s, p, o) # print b'...', b'...', b'...'
175 | # now decode it, or handle any error 176 | try: 177 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8') 178 | except UnicodeDecodeError as err: 179 | # try another other codecs, ignore error, etc 180 | pass 181 | ``` 182 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |rdflib-htd logo| 2 | 3 | |Build Status| |PyPI version| 4 | 5 | A Store back-end for `rdflib `_ to allow for reading and querying HDT documents. 6 | 7 | `Online Documentation `_ 8 | 9 | Requirements 10 | ============ 11 | 12 | 13 | * Python *version 3.6.4 or higher* 14 | * `pip `_ 15 | * **gcc/clang** with **c++11 support** 16 | * **Python Development headers** 17 | .. 18 | 19 | You should have the ``Python.h`` header available on your system.\ 20 | For example, for Python 3.6, install the ``python3.6-dev`` package on Debian/Ubuntu systems. 21 | 22 | 23 | Installation 24 | ============ 25 | 26 | Installation using `pipenv `_ or a `virtualenv `_ is **strongly advised!** 27 | 28 | PyPi installation (recommended) 29 | ------------------------------- 30 | 31 | .. code-block:: bash 32 | 33 | # you can install using pip 34 | pip install rdflib-hdt 35 | 36 | # or you can use pipenv 37 | pipenv install rdflib-hdt 38 | 39 | Manual installation 40 | ------------------- 41 | 42 | **Requirement:** `pipenv `_ 43 | 44 | .. code-block:: bash 45 | 46 | git clone https://github.com/Callidon/pyHDT 47 | cd pyHDT/ 48 | ./install.sh 49 | 50 | Getting started 51 | =============== 52 | 53 | You can use the ``rdflib-hdt`` library in two modes: as an rdflib Graph or as a raw HDT document. 54 | 55 | Graph usage (recommended) 56 | ------------------------- 57 | 58 | .. code-block:: python 59 | 60 | from rdflib import Graph 61 | from rdflib_hdt import HDTStore 62 | from rdflib.namespace import FOAF 63 | 64 | # Load an HDT file. 
Missing indexes are generated automatically 65 | # You can provide the index file by putting them in the same directory than the HDT file. 66 | store = HDTStore("test.hdt") 67 | 68 | # Display some metadata about the HDT document itself 69 | print(f"Number of RDF triples: {len(store)}") 70 | print(f"Number of subjects: {store.nb_subjects}") 71 | print(f"Number of predicates: {store.nb_predicates}") 72 | print(f"Number of objects: {store.nb_objects}") 73 | print(f"Number of shared subject-object: {store.nb_shared}") 74 | 75 | 76 | Using the RDFlib API, you can also `execute SPARQL queries `_ over an HDT document. 77 | If you do so, we recommend that you first call the ``optimize_sparql`` function, which optimize 78 | the RDFlib SPARQL query engine in the context of HDT documents. 79 | 80 | .. code-block:: python 81 | 82 | from rdflib import Graph 83 | from rdflib_hdt import HDTStore, optimize_sparql 84 | 85 | # Calling this function optimizes the RDFlib SPARQL engine for HDT documents 86 | optimize_sparql() 87 | 88 | graph = Graph(store=HDTStore("test.hdt")) 89 | 90 | # You can execute SPARQL queries using the regular RDFlib API 91 | qres = graph.query(""" 92 | PREFIX foaf: 93 | SELECT ?name ?friend WHERE { 94 | ?a foaf:knows ?b. 95 | ?a foaf:name ?name. 96 | ?b foaf:name ?friend. 97 | }""") 98 | 99 | for row in qres: 100 | print(f"{row.name} knows {row.friend}") 101 | 102 | HDT Document usage 103 | ------------------ 104 | 105 | .. code-block:: python 106 | 107 | from rdflib_hdt import HDTDocument 108 | 109 | # Load an HDT file. Missing indexes are generated automatically. 110 | # You can provide the index file by putting them in the same directory than the HDT file. 
111 | document = HDTDocument("test.hdt") 112 | 113 | # Display some metadata about the HDT document itself 114 | print(f"Number of RDF triples: {document.total_triples}") 115 | print(f"Number of subjects: {document.nb_subjects}") 116 | print(f"Number of predicates: {document.nb_predicates}") 117 | print(f"Number of objects: {document.nb_objects}") 118 | print(f"Number of shared subject-object: {document.nb_shared}") 119 | 120 | # Fetch all triples that matches { ?s foaf:name ?o } 121 | # Use None to indicates variables 122 | triples, cardinality = document.search_triples((None, FOAF("name"), None)) 123 | 124 | print(f"Cardinality of (?s foaf:name ?o): {cardinality}") 125 | for s, p, o in triples: 126 | print(triple) 127 | 128 | # The search also support limit and offset 129 | triples, cardinality = document.search_triples((None, FOAF("name"), None), limit=10, offset=100) 130 | # etc ... 131 | 132 | An HDT document also provides support for evaluating joins over a set of triples patterns. 133 | 134 | .. 
code-block:: python 135 | 136 | from rdflib_hdt import HDTDocument 137 | from rdflib import Variable 138 | from rdflib.namespace import FOAF, RDF 139 | 140 | document = HDTDocument("test.hdt") 141 | 142 | # find the names of two entities that know each other 143 | tp_a = (Variable("a"), FOAF("knows"), Variable("b")) 144 | tp_b = (Variable("a"), FOAF("name"), Variable("name")) 145 | tp_c = (Variable("b"), FOAF("name"), Variable("friend")) 146 | query = set([tp_a, tp_b, tp_c]) 147 | 148 | iterator = document.search_join(query) 149 | print(f"Estimated join cardinality: {len(iterator)}") 150 | 151 | # Join results are produced as ResultRow, like in the RDFlib SPARQL API 152 | for row in iterator: 153 | print(f"{row.name} knows {row.friend}") 154 | 155 | Handling non UTF-8 strings in python 156 | ==================================== 157 | 158 | If the HDT document has been encoded with a non UTF-8 encoding the previous code won't work correctly and will result in a ``UnicodeDecodeError``. 159 | More details on how to convert string to str from C++ to Python `here `_ 160 | 161 | To handle this, we doubled the API of the HDT document by adding: 162 | 163 | 164 | * ``search_triples_bytes(...)`` return an iterator of triples as ``(py::bytes, py::bytes, py::bytes)`` 165 | * ``search_join_bytes(...)`` return an iterator of sets of solutions mapping as ``py::set(py::bytes, py::bytes)`` 166 | * ``convert_tripleid_bytes(...)`` return a triple as: ``(py::bytes, py::bytes, py::bytes)`` 167 | * ``convert_id_bytes(...)`` return a ``py::bytes`` 168 | 169 | **Parameters and documentation are the same as the standard version** 170 | 171 | .. code-block:: python 172 | 173 | from rdflib_hdt import HDTDocument 174 | 175 | document = HDTDocument("test.hdt") 176 | it = document.search_triple_bytes("", "", "") 177 | 178 | for s, p, o in it: 179 | print(s, p, o) # print b'...', b'...', b'...' 
180 | # now decode it, or handle any error 181 | try: 182 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8') 183 | except UnicodeDecodeError as err: 184 | # try another other codecs, ignore error, etc 185 | pass 186 | 187 | .. |Build Status| image:: https://github.com/RDFLib/rdflib-hdt/workflows/Python%20tests/badge.svg 188 | :target: https://github.com/RDFLib/rdflib-hdt/actions?query=workflow%3A%22Python+tests%22 189 | .. |PyPI version| image:: https://badge.fury.io/py/rdflib-hdt.svg 190 | :target: https://badge.fury.io/py/rdflib-hdt 191 | .. |rdflib-htd logo| image:: https://raw.githubusercontent.com/RDFLib/rdflib-hdt/master/docs/source/_static/rdflib-hdt-250.png 192 | :target: https://rdflib.dev/rdflib-hdt/ 193 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pyHDT 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | cp source/_config.yml build/html/_config.yml 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=pyHDT 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/_config.yml: -------------------------------------------------------------------------------- 1 | baseurl: / 2 | include: [ "_static", "_static/*" ] 3 | -------------------------------------------------------------------------------- /docs/source/_static/rdflib-hdt-250.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/docs/source/_static/rdflib-hdt-250.png -------------------------------------------------------------------------------- /docs/source/_static/rdflib-hdt.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/docs/source/_static/rdflib-hdt.png -------------------------------------------------------------------------------- /docs/source/_static/rdflib-hdt.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 15 | 16 | 17 | 21 | 25 | 33 | 34 | 37 | 38 | 39 | 40 | 43 | 44 | 45 | 47 | 50 | 51 | 52 | 54 | 57 | 60 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. currentmodule:: rdflib_hdt 5 | 6 | Global functions 7 | ---------------- 8 | 9 | .. autofunction:: optimize_sparql 10 | 11 | HDTStore 12 | ----------- 13 | 14 | .. autoclass:: HDTStore 15 | :show-inheritance: 16 | :members: 17 | 18 | HDTDocument 19 | ----------- 20 | 21 | .. autoclass:: HDTDocument 22 | :members: 23 | 24 | .. autoattribute:: nb_subjects 25 | 26 | .. autoattribute:: nb_predicates 27 | 28 | .. autoattribute:: nb_objects 29 | 30 | .. autoattribute:: nb_shared 31 | 32 | 33 | HDTIterator 34 | ----------- 35 | 36 | .. autoclass:: HDTIterator 37 | :members: 38 | 39 | 40 | HDTJoinIterator 41 | --------------- 42 | 43 | .. autoclass:: HDTJoinIterator 44 | :members: 45 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # pyHDT documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Jan 22 10:41:42 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 
9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc'] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'rdflib-hdt' 50 | copyright = '2018-2020, Thomas Minier' 51 | author = 'Thomas Minier' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '1.0.0' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '1.0.0' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 
64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = [] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = { 'show_related': True} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | # If given, this must be the name of an image file (path relative to the configuration directory) that is the favicon of the docs. 100 | # Modern browsers use this as the icon for tabs, windows and bookmarks. 101 | html_favicon = '_static/rdflib-hdt-250.png' 102 | 103 | # Custom sidebar templates, must be a dictionary that maps document names 104 | # to template names. 
105 | # 106 | # This is required for the alabaster theme 107 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 108 | html_sidebars = { 109 | '**': [ 110 | 'globaltoc.html', 111 | 'relations.html', # needs 'show_related': True theme option to display 112 | 'sourcelink.html', 113 | 'searchbox.html', 114 | ] 115 | } 116 | 117 | 118 | # -- Options for HTMLHelp output ------------------------------------------ 119 | 120 | # Output file base name for HTML help builder. 121 | htmlhelp_basename = 'rdflibHDTdoc' 122 | 123 | 124 | # -- Options for LaTeX output --------------------------------------------- 125 | 126 | latex_elements = { 127 | # The paper size ('letterpaper' or 'a4paper'). 128 | # 129 | # 'papersize': 'letterpaper', 130 | 131 | # The font size ('10pt', '11pt' or '12pt'). 132 | # 133 | # 'pointsize': '10pt', 134 | 135 | # Additional stuff for the LaTeX preamble. 136 | # 137 | # 'preamble': '', 138 | 139 | # Latex figure (float) alignment 140 | # 141 | # 'figure_align': 'htbp', 142 | } 143 | 144 | # Grouping the document tree into LaTeX files. List of tuples 145 | # (source start file, target name, title, 146 | # author, documentclass [howto, manual, or own class]). 147 | latex_documents = [ 148 | (master_doc, 'rdflibHDT.tex', 'pyHDT Documentation', 149 | 'Thomas Minier', 'manual'), 150 | ] 151 | 152 | 153 | # -- Options for manual page output --------------------------------------- 154 | 155 | # One entry per manual page. List of tuples 156 | # (source start file, name, description, authors, manual section). 157 | man_pages = [ 158 | (master_doc, 'rdflibHDT', 'pyHDT Documentation', 159 | [author], 1) 160 | ] 161 | 162 | 163 | # -- Options for Texinfo output ------------------------------------------- 164 | 165 | # Grouping the document tree into Texinfo files. 
List of tuples 166 | # (source start file, target name, title, author, 167 | # dir menu entry, description, category) 168 | texinfo_documents = [ 169 | (master_doc, 'rdflibHDT', 'rdflib-dht Documentation', 170 | author, 'rdflibHDT', 'One line description of project.', 171 | 'Miscellaneous'), 172 | ] 173 | 174 | autodoc_member_order = 'groupwise' 175 | -------------------------------------------------------------------------------- /docs/source/hdtdocument.rst: -------------------------------------------------------------------------------- 1 | Low-level Usage 2 | =============== 3 | 4 | Loading HDT files 5 | ^^^^^^^^^^^^^^^^^ 6 | 7 | The main class for directly manipulating HDT document using rdflib_hdt is :py:class:`rdflib_hdt.HDTDocument`. 8 | Upon creation, it searches for an index file in the same directory than the HDT file you wish to load. 9 | For example, if you load a file */home/awesome-user/test.hdt*, :py:class:`rdflib_hdt.HDTDocument` will look for the index file 10 | */home/awesome-user/test.hdt.index.v1-1*. 11 | 12 | .. warning:: By default, an HDTDocument discards RDF Terms with invalid UTF-8 encoding. You can change this behavior with the `safe_mode` parameter of the constructor. 13 | 14 | .. note:: Missing indexes are generated automatically, but be careful, as it requires to load all HDT triples in memory! 15 | 16 | 17 | .. code-block:: python 18 | 19 | from rdflib_hdt import HDTDocument 20 | 21 | # Load an HDT file. 
22 | # Missing indexes are generated automatically, add False as the second argument to disable them 23 | document = HDTDocument("test.hdt") 24 | 25 | # Display some metadata about the HDT document itself 26 | print(f"Number of RDF triples: {document.total_triples}") 27 | print(f"Number of subjects: {document.nb_subjects}") 28 | print(f"Number of predicates: {document.nb_predicates}") 29 | print(f"Number of objects: {document.nb_objects}") 30 | print(f"Number of shared subject-object: {document.nb_shared}") 31 | 32 | 33 | Searching for triples 34 | ^^^^^^^^^^^^^^^^^^^^^^ 35 | 36 | You can search for all RDF triples in the HDT file matching a triple pattern using :py:meth:`rdflib_hdt.HDTDocument.search`. 37 | It returns a 2-element tuple: an :py:class:`rdflib_hdt.HDTIterator` over the matching RDF triples and the estimated triple pattern *cardinality*. 38 | 39 | .. note:: The :py:meth:`rdflib_hdt.HDTDocument.search` method also accepts ``limit`` and ``offset`` parameters, to perform range queries over the matching RDF triples. 40 | 41 | .. code-block:: python 42 | 43 | from rdflib.namespace import FOAF 44 | from rdflib_hdt import HDTDocument 45 | document = HDTDocument("test.hdt") 46 | 47 | # Fetch all triples that match { ?s foaf:name ?o } 48 | # Use None to indicate variables 49 | triples, cardinality = document.search((None, FOAF("name"), None)) 50 | 51 | print(f"Cardinality of (?s foaf:name ?o): {cardinality}") 52 | for s, p, o in triples: 53 | print(s, p, o) 54 | 55 | # The search also supports limit and offset 56 | triples, cardinality = document.search((None, FOAF("name"), None), limit=10, offset=100) 57 | # etc ... 58 | 59 | Searching for triple IDs 60 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 61 | 62 | A typical HDT document encodes a triple's subject, predicate, and object as unique integers, named **TripleID**. 63 | For example, the triple ``("ex:Toto", "ex:type", "ex:Person")`` can be encoded as ``(1, 2, 3)``.
64 | An :py:class:`rdflib_hdt.HDTDocument` allows for searching RDF triples and retrieving them in this format, using the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method, which takes the same parameters as the :py:meth:`rdflib_hdt.HDTDocument.search` method. 65 | 66 | .. note:: You can transform RDF terms or RDF triples to/from TripleIDs using the :meth:`rdflib_hdt.HDTDocument.from_tripleid`, :py:meth:`rdflib_hdt.HDTDocument.to_tripleid`, :meth:`rdflib_hdt.HDTDocument.term_to_id`, and :meth:`rdflib_hdt.HDTDocument.id_to_term` methods. 67 | 68 | .. code-block:: python 69 | 70 | from rdflib_hdt import HDTDocument 71 | document = HDTDocument("test.hdt") 72 | 73 | (triples, cardinality) = document.search_ids((None, None, None)) 74 | 75 | for s, p, o in triples: 76 | print(s, p, o) # will print 3-element tuples of integers 77 | 78 | # convert a triple ID to a string format 79 | print(f"TripleID {(s, p, o)} = RDF Triple {document.from_tripleid(s, p, o)}") 80 | 81 | # print only the subject 82 | print(f"ID {s} = Term {document.id_to_term(s, 0)}") 83 | 84 | Join evaluation 85 | ^^^^^^^^^^^^^^^ 86 | 87 | An HDT document also provides support for evaluating joins over a set of triples patterns. 88 | 89 | .. 
code-block:: python 90 | 91 | from rdflib_hdt import HDTDocument 92 | from rdflib import Variable 93 | from rdflib.namespace import FOAF, RDF 94 | 95 | document = HDTDocument("test.hdt") 96 | 97 | # find the names of two entities that know each other 98 | tp_a = (Variable("a"), FOAF("knows"), Variable("b")) 99 | tp_b = (Variable("a"), FOAF("name"), Variable("name")) 100 | tp_c = (Variable("b"), FOAF("name"), Variable("friend")) 101 | query = set([tp_a, tp_b, tp_c]) 102 | 103 | iterator = document.search_join(query) 104 | print(f"Estimated join cardinality: {len(iterator)}") 105 | 106 | # Join results are produced as ResultRow, like in the RDFlib SPARQL API 107 | for row in iterator: 108 | print(f"{row.name} knows {row.friend}") 109 | 110 | Ordering 111 | ^^^^^^^^^^^ 112 | 113 | When searching for triples (either in a string or TripleID format), results are ordered by (subject, predicate, object). 114 | However, this order is **not** an order on string values, but an order on **triple ids**. 115 | For example, ``("ex:2", "ex:type", "ex:Person") < ("ex:1", "ex:type", "ex:Person")``, 116 | because their triple ids counterparts are ``(1, 2, 3)`` and ``(2, 2, 3)``. 117 | 118 | For more details about this topic, please refer to the `HDT journal article `_. 119 | 120 | Handling non UTF-8 strings in Python 121 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 122 | 123 | If the HDT document is encoded without valid UTF-8 encoding and it does not run in *safe mode*, querying the document may raise an 124 | ``UnicodeDecodeError``. More details on how to convert string 125 | from C++ to Python `here`_ 126 | 127 | To allow fine control over string conversion in this case, we doubled the API of the HDT document by adding the following methods (Parameters and documentation are the same as their standard counterparts). 
128 | 129 | - :py:meth:`rdflib_hdt.HDTDocument.search_triples_bytes` returns an iterator of triples as ``(py::bytes, py::bytes, py::bytes)`` 130 | - :py:meth:`rdflib_hdt.HDTDocument.search_join_bytes` returns an iterator of sets of solutions mapping as ``py::set(py::bytes, py::bytes)`` 131 | - :py:meth:`rdflib_hdt.HDTDocument.convert_tripleid_bytes` returns a triple as: ``(py::bytes, py::bytes, py::bytes)`` 132 | - :py:meth:`rdflib_hdt.HDTDocument.convert_id_bytes` returns a ``py::bytes`` 133 | 134 | .. code:: python 135 | 136 | from rdflib_hdt import HDTDocument 137 | 138 | # Load an HDT file. 139 | # Missing indexes are generated automatically, add False as the second argument to disable them 140 | document = HDTDocument("test.hdt") 141 | it = document.search_triple_bytes("", "", "") 142 | 143 | for s, p, o in it: 144 | print(s, p, o) # print b'...', b'...', b'...' 145 | # now decode it, or handle any error 146 | try: 147 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8') 148 | except UnicodeDecodeError as err: 149 | # try another other codecs 150 | pass 151 | 152 | .. _here: https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html 153 | -------------------------------------------------------------------------------- /docs/source/hdtstore.rst: -------------------------------------------------------------------------------- 1 | Querying HDT documents 2 | ====================== 3 | 4 | Getting started 5 | --------------- 6 | 7 | The primary way of using ``rdflib-hdt`` is the :py:class:`rdflib_hdt.HDTStore` class. 8 | Upon creation, it searches for an index file in the same directory than the HDT file you wish to load. 9 | For example, if you load a file */home/awesome-user/test.hdt*, :py:class:`rdflib_hdt.HDTDocument` will look for the index file 10 | */home/awesome-user/test.hdt.index.v1-1*. 11 | 12 | .. warning:: By default, an HDTStore discards RDF Terms with invalid UTF-8 encoding. 
You can change this behavior with the `safe_mode` parameter of the constructor. 13 | 14 | .. note:: Missing indexes are generated automatically, but be careful, as this requires loading all HDT triples in memory! 15 | 16 | .. code-block:: python 17 | 18 | from rdflib import Graph 19 | from rdflib_hdt import HDTStore 20 | from rdflib.namespace import FOAF 21 | 22 | # Load an HDT file. Missing indexes are generated automatically 23 | # You can provide the index file by putting it in the same directory as the HDT file. 24 | store = HDTStore("test.hdt") 25 | 26 | # Display some metadata about the HDT document itself 27 | print(f"Number of RDF triples: {len(store)}") 28 | print(f"Number of subjects: {store.nb_subjects}") 29 | print(f"Number of predicates: {store.nb_predicates}") 30 | print(f"Number of objects: {store.nb_objects}") 31 | print(f"Number of shared subject-object: {store.nb_shared}") 32 | 33 | Executing SPARQL queries 34 | ------------------------ 35 | 36 | Using the RDFlib API, you can also `execute SPARQL queries `_ over an HDT document. 37 | If you do so, we recommend that you first call the :py:func:`rdflib_hdt.optimize_sparql` function, which optimizes 38 | the RDFlib SPARQL query engine in the context of HDT documents. 39 | 40 | .. code-block:: python 41 | 42 | from rdflib import Graph 43 | from rdflib_hdt import HDTStore, optimize_sparql 44 | 45 | # Calling this function optimizes the RDFlib SPARQL engine for HDT documents 46 | optimize_sparql() 47 | 48 | graph = Graph(store=HDTStore("test.hdt")) 49 | 50 | # You can execute SPARQL queries using the regular RDFlib API 51 | qres = graph.query(""" 52 | PREFIX foaf: <http://xmlns.com/foaf/0.1/> 53 | SELECT ?name ?friend WHERE { 54 | ?a foaf:knows ?b. 55 | ?a foaf:name ?name. 56 | ?b foaf:name ?friend. 57 | }""") 58 | 59 | for row in qres: 60 | print(f"{row.name} knows {row.friend}") 61 | 62 | .. note:: 63 | Calling the :py:func:`rdflib_hdt.optimize_sparql` function triggers a global modification of the RDFlib SPARQL engine.
64 | However, executing SPARQL queries using other RDFlib stores will continue to work as before, 65 | so you can safely call this function at the beginning of your code. 66 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | |rdflib-hdt logo| 2 | 3 | Read and query HDT documents with rdflib 4 | ====================================================== 5 | 6 | |Build Status| |PyPI version| 7 | 8 | A Store back-end for `rdflib `_ to allow for reading and querying HDT documents. 9 | 10 | Getting started 11 | ================== 12 | 13 | .. toctree:: 14 | :maxdepth: 3 15 | 16 | installation 17 | hdtstore 18 | hdtdocument 19 | api 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | .. |Build Status| image:: https://github.com/RDFLib/rdflib-hdt/workflows/Python%20tests/badge.svg 29 | :target: https://github.com/RDFLib/rdflib-hdt/actions?query=workflow%3A%22Python+tests%22 30 | .. |PyPI version| image:: https://badge.fury.io/py/rdflib-hdt.svg 31 | :target: https://badge.fury.io/py/rdflib-hdt 32 | .. |rdflib-hdt logo| image:: https://raw.githubusercontent.com/RDFLib/rdflib-hdt/master/docs/source/_static/rdflib-hdt-250.png 33 | :target: https://rdflib.dev/rdflib-hdt/ 34 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============= 3 | 4 | Requirements 5 | ^^^^^^^^^^^^ 6 | 7 | * Python *version 3.6.4 or higher* 8 | * `pip `_ 9 | * **gcc/clang** with **c++11 support** 10 | * **Python Development headers** 11 | 12 | .. note:: 13 | You must have the `Python.h` header available on your system. 14 | For example, for Python 3.6, install the `python3.6-dev` package on Debian/Ubuntu systems.
15 | 16 | Installation 17 | ^^^^^^^^^^^^^ 18 | 19 | Installation using `pipenv `_ or a `virtualenv `_ is **strongly advised!** 20 | 21 | PyPI installation (recommended) 22 | ------------------------------- 23 | 24 | .. code-block:: bash 25 | 26 | # you can install using pip 27 | pip install rdflib-hdt 28 | 29 | # or you can use pipenv 30 | pipenv install rdflib-hdt 31 | 32 | Manual installation 33 | ------------------- 34 | 35 | **Requirement:** `pipenv `_ 36 | 37 | .. code-block:: bash 38 | 39 | git clone https://github.com/RDFLib/rdflib-hdt 40 | cd rdflib-hdt/ 41 | ./install.sh 42 | -------------------------------------------------------------------------------- /include/docstrings.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * docstrings.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef PYHDT_DOCSTRINGS_HPP 7 | #define PYHDT_DOCSTRINGS_HPP 8 | 9 | #include 10 | 11 | const char *MODULE_DOC = R"( 12 | The hdt module enables to load and query HDT files with ease. 13 | )"; 14 | 15 | /** 16 | * Enums docstrings 17 | */ 18 | 19 | const char *IDENTIFIER_POSITION_DOC = R"( 20 | An enum used to indicate the position (subject, predicate or object) of an Object identifier. 21 | 22 | Possible values: 23 | - ``IdentifierPosition.Subject``: the subject position 24 | - ``IdentifierPosition.Predicate``: the predicate position 25 | - ``IdentifierPosition.Object``: the object position 26 | 27 | .. code-block:: python 28 | 29 | from hdt import IdentifierPosition 30 | print(IdentifierPosition.Subject) 31 | print(IdentifierPosition.Predicate) 32 | print(IdentifierPosition.Object) 33 | 34 | )"; 35 | 36 | /** 37 | * HDT Document docstrings 38 | */ 39 | 40 | const char *HDT_DOCUMENT_CLASS_DOC = R"( 41 | An HDTDocument enables to load and query a HDT file. 42 | 43 | Constructor: 44 | - file ``str``: Path to the HDT file to load.
45 | - indexed ``boolean``: True if additional indexes must be loaded, False otherwise. 46 | )"; 47 | 48 | const char *HDT_DOCUMENT_GETFILEPATH_DOC = R"( 49 | Return the path to the HDT file currently loaded 50 | )"; 51 | 52 | const char *HDT_DOCUMENT_GETNBTRIPLES_DOC = R"( 53 | Return the total number of triples in the HDT document 54 | )"; 55 | 56 | const char *HDT_DOCUMENT_GETNBSUBJECTS_DOC = R"( 57 | Return the number of subjects in the HDT document 58 | )"; 59 | 60 | const char *HDT_DOCUMENT_GETNBPREDICATES_DOC = R"( 61 | Return the number of predicates in the HDT document 62 | )"; 63 | 64 | const char *HDT_DOCUMENT_GETNBOBJECTS_DOC = R"( 65 | Return the number of objects in the HDT document 66 | )"; 67 | 68 | const char *HDT_DOCUMENT_GETNBSHARED_DOC = R"( 69 | Return the number of shared subject-object in the HDT document 70 | )"; 71 | 72 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_DOC = R"( 73 | Search for RDF triples matching the triple pattern { ``subject`` ``predicate`` ``object`` }, 74 | with an optional ``limit`` and ``offset``. 75 | Use empty strings (``""``) to indicate wildcards. 76 | 77 | Args: 78 | - subject ``str``: The subject of the triple pattern to search for. 79 | - predicate ``str``: The predicate of the triple pattern to search for. 80 | - obj ``str``: The object of the triple pattern to search for. 81 | - limit ``int`` ``optional``: Maximum number of triples to search for. 82 | - offset ``int`` ``optional``: Number of matching triples to skip before returning results. 83 | 84 | Return: 85 | A 2-elements ``tuple`` (:class:`hdt.TripleIterator`, estimated pattern cardinality), where 86 | the TripleIterator iterates over matching RDF triples. 87 | 88 | A RDF triple itself is a 3-elements ``tuple`` (subject, predicate, object). 89 | 90 | ..
code-block:: python 91 | 92 | from hdt import HDTDocument 93 | document = HDTDocument("test.hdt") 94 | 95 | # Fetch all triples that matches { ?s ?p ?o } 96 | (triples, cardinality) = document.search_triples("", "", "") 97 | 98 | print("cardinality of { ?s ?p ?o }: %i" % cardinality) 99 | for triple in triples: 100 | print(triple) 101 | 102 | )"; 103 | 104 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC = R"( 105 | Same as :meth:`hdt.HDTDocument.search_triples`, but RDF triples are represented as unique ids (from the HDT Dictionnary). 106 | Use the integer `0` to indicate wildcards. 107 | 108 | Mapping between ids and RDF terms is done using :meth:`hdt.HDTDocument.convert_id`, :meth:`hdt.HDTDocument.convert_term` and :meth:`hdt.HDTDocument.convert_tripleid`. 109 | 110 | Args: 111 | - subject ``int``: The Object identifier of the triple pattern's subject. 112 | - predicate ``int``: The Object identifier of the triple pattern's predicate. 113 | - obj ``int``: The Object identifier of the triple pattern's object. 114 | - limit ``int`` ``optional``: Maximum number of triples to search for. 115 | - offset ``int`` ``optional``: Number of matching triples to skip before returning results. 116 | 117 | Return: 118 | A 2-elements ``tuple`` (:class:`hdt.TripleIDIterator`, estimated pattern cardinality), where 119 | the TripleIDIterator iterates over matching RDF triples IDs. 120 | 121 | A RDF triple ID itself is a 3-elements ``tuple`` (subjectID, predicateID, objectID). 122 | 123 | .. 
code-block:: python 124 | 125 | from hdt import HDTDocument 126 | document = HDTDocument("test.hdt") 127 | 128 | pred = document.convert_term("http://xmlns.com/foaf/0.1/") 129 | # Fetch all RDF triples that matches { ?s foaf:name ?o } 130 | (triples, cardinality) = document.search_triples_ids(0, pred, 0) 131 | 132 | print("cardinality of { ?s foaf:name ?o }: %i" % cardinality) 133 | for triple in triples: 134 | print(triple) 135 | 136 | )"; 137 | 138 | const char *HDT_DOCUMENT_SEARCH_JOIN_DOC = R"( 139 | Evaluate a join between a set of triple patterns using an iterator. 140 | A triple pattern itself is a 3-elements ``tuple`` (subject, predicate, object), where SPARQL variables, i.e., join predicates, are prefixed by a ``?``. 141 | 142 | Args: 143 | - patterns ``set``: set of triple patterns. 144 | 145 | Return: 146 | A :class:`hdt.JoinIterator`, which can be consumed as a Python iterator to evaluates the join. 147 | 148 | .. code-block:: python 149 | 150 | from hdt import HDTDocument 151 | document = HDTDocument("test.hdt") 152 | 153 | # find all actors with their names in the HDT document 154 | tp_a = ("?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://example.org#Actor") 155 | tp_b = ("?s", "http://xmlns.com/foaf/0.1/name", "?name") 156 | iterator = document.search_join(set([tp_a, tp_b])) 157 | 158 | print("estimated join cardinality : %i" % len(iterator)) 159 | for mappings in iterator: 160 | print(mappings) 161 | 162 | )"; 163 | 164 | const char *HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC = R"( 165 | Transform a RDF triple from a TripleID representation to a string representation. 166 | 167 | Args: 168 | - subject ``int``: unique ID of the subject. 169 | - predicate ``int``: unique ID of the predicate. 170 | - obj ``int``: unique ID of the object. 171 | 172 | Return: 173 | A triple in string representation, i.e., a 3-elements ``tuple`` (subject, predicate, object) 174 | 175 | .. 
code-block:: python 176 | 177 | from hdt import HDTDocument 178 | document = HDTDocument("test.hdt") 179 | 180 | # Fetch all triples that matches { ?s foaf:name ?o } 181 | pred = document.convert_term("http://xmlns.com/foaf/0.1/") 182 | (triples, cardinality) = document.search_triples_ids(0, pred, 0) 183 | 184 | for s, p, o in triples: 185 | print(s, p, o) # will print Object identifiers, i.e., integers 186 | # convert a triple ID to a string format 187 | print(document.convert_tripleid(s, p, o)) 188 | 189 | )"; 190 | 191 | const char *HDT_DOCUMENT_CONVERT_ID_DOC = R"( 192 | Transform an Object Identifier to a RDF term. 193 | Such identifier are used in TripleID. 194 | 195 | Args: 196 | - id ``int``: Object identifier. 197 | - position :class:`hdt.IdentifierPosition`: Identifier position. 198 | 199 | Return: 200 | The RDF term associated with the Object Identifier, i.e., either an URI or a RDF literal. 201 | 202 | .. code-block:: python 203 | 204 | from hdt import HDTDocument, IdentifierPosition 205 | document = HDTDocument("test.hdt") 206 | print(document.convert_id(10, IdentifierPosition.Subject)) 207 | 208 | )"; 209 | 210 | const char *HDT_DOCUMENT_CONVERT_TERM_DOC = R"( 211 | Transform an RDF Term to the associated Object Identifier. 212 | Such identifier are used in TripleID. 213 | 214 | Args: 215 | - term ``str``: RDF Term. 216 | - position :class:`hdt.IdentifierPosition`: Identifier position. 217 | 218 | Return: 219 | The Object Identifier associated with the RDF Term 220 | 221 | .. code-block:: python 222 | 223 | from hdt import HDTDocument, IdentifierPosition 224 | document = HDTDocument("test.hdt") 225 | print(document.convert_term("http://example.org#Alice", IdentifierPosition.Subject)) 226 | 227 | )"; 228 | 229 | /** 230 | * TripleIterator & TripleIDIterator docstrings 231 | */ 232 | 233 | const char *TRIPLE_ITERATOR_CLASS_DOC = R"( 234 | A TripleIterator iterates over triples in a HDT file matching a triple pattern, with an optional limit & offset. 
235 | 236 | Such iterator is returned by :meth:`hdt.HDTDocument.search_triples`. 237 | )"; 238 | 239 | const char *TRIPLE_ID_ITERATOR_CLASS_DOC = R"( 240 | A TripleIDIterator iterates over triples' IDs in a HDT file matching a triple pattern, with an optional limit & offset. 241 | 242 | Such iterator is returned by :meth:`hdt.HDTDocument.search_triples_ids` 243 | 244 | Conversion from a tuple of triple ids into a RDF triple is done using :meth:`hdt.HDTDocument.convert_tripleid`. 245 | )"; 246 | 247 | const char *TRIPLE_ITERATOR_NEXT_DOC = R"( 248 | Return the next matching triple read by the iterator, or raise ``StopIteration`` if there are no more items to yield. 249 | )"; 250 | 251 | const char *TRIPLE_ITERATOR_PEEK_DOC = R"( 252 | Return the next matching triple read by the iterator without advancing it, or raise ``StopIteration`` if there are no more items to yield. 253 | )"; 254 | 255 | const char *TRIPLE_ITERATOR_HASNEXT_DOC = R"( 256 | Return true if the iterator still has items to yield, false otherwise. 257 | )"; 258 | 259 | const char *TRIPLE_ITERATOR_GETSUBJECT_DOC = R"( 260 | Return the subject of the triple pattern currently evaluated. 261 | )"; 262 | 263 | const char *TRIPLE_ITERATOR_GETPREDICATE_DOC = R"( 264 | Return the predicate of the triple pattern currently evaluated. 265 | )"; 266 | 267 | const char *TRIPLE_ITERATOR_GETOBJECT_DOC = R"( 268 | Return the object of the triple pattern currently evaluated. 269 | )"; 270 | 271 | const char *TRIPLE_ITERATOR_GETLIMIT_DOC = R"( 272 | Return the limit of the iterator, i.e., the maximum number of items the iterator will yield. 273 | A limit of 0 indicates that the iterator limit is the cardinality of the triple pattern currently evaluated. 274 | )"; 275 | 276 | const char *TRIPLE_ITERATOR_GETOFFSET_DOC = R"( 277 | Return the offset of the iterator, i.e., the number of items the iterator will first skip before yielding. 278 | An offset of 0 indicates that the iterator will not skip any items.
279 | )"; 280 | 281 | const char *TRIPLE_ITERATOR_NBREADS_DOC = R"( 282 | Return the number of items read by the iterator until now. 283 | Does not include any offset, thus the real position of the iterator in the collection of triples can be computed as offset + nb_reads 284 | )"; 285 | 286 | const char *TRIPLE_ITERATOR_SIZE_DOC = R"( 287 | Get a hint on the cardinality of the triple pattern currently evaluated. 288 | The iterator's limit and offset are not taken into account. 289 | 290 | Return: 291 | A 2-element ``tuple`` (integer, boolean), where the left member is the estimated cardinality, 292 | and the right member is True if the estimation is accurate, False otherwise 293 | )"; 294 | 295 | const char *TRIPLE_ITERATOR_ACC_ESTIMATION_DOC = R"( 296 | Return True if the iterator can accurately estimate the cardinality of the triple pattern, False otherwise. 297 | )"; 298 | 299 | const char *JOIN_ITERATOR_CLASS_DOC = R"( 300 | A JoinIterator iterates over the set of solution mappings for a join between several triple patterns. It implements the Python iterator protocol and yields sets of solution mappings. 301 | 302 | Such iterator is returned by :meth:`hdt.HDTDocument.search_join` 303 | )"; 304 | 305 | const char *JOIN_ITERATOR_NEXT_DOC = R"( 306 | Return the next set of solution mappings read by the iterator, or raise ``StopIteration`` if there are no more items to yield. 307 | )"; 308 | 309 | const char *JOIN_ITERATOR_HAS_NEXT_DOC = R"( 310 | Return true if the iterator still has items to yield, false otherwise. 311 | )"; 312 | 313 | const char *JOIN_ITERATOR_SIZE_DOC = R"( 314 | Return the estimated join cardinality. 315 | )"; 316 | 317 | const char *JOIN_ITERATOR_RESET_DOC = R"( 318 | Reset the join, i.e., move the iterator back to its initial state.
319 | )"; 320 | 321 | #endif /* PYHDT_DOCSTRINGS_HPP */ 322 | -------------------------------------------------------------------------------- /include/hdt_document.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt_document.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef PYHDT_DOCUMENT_HPP 7 | #define PYHDT_DOCUMENT_HPP 8 | 9 | #include 10 | #include "HDT.hpp" 11 | #include "QueryProcessor.hpp" 12 | #include "pyhdt_types.hpp" 13 | #include "triple_iterator.hpp" 14 | #include "triple_iterator_bytes.hpp" 15 | #include "tripleid_iterator.hpp" 16 | #include "join_iterator.hpp" 17 | #include "join_iterator_bytes.hpp" 18 | #include 19 | #include 20 | #include 21 | namespace py = pybind11; 22 | 23 | // The result of a search for a triple pattern in a HDT document: 24 | // a tuple (matching RDF triples, nb of matching RDF triples) 25 | typedef std::tuple search_results; 26 | 27 | // The result of a search for a triple pattern in a HDT document: 28 | // a tuple (matching RDF triples, nb of matching RDF triples) 29 | typedef std::tuple search_results_bytes; 30 | 31 | // Same as seach_results, but for an iterator over triple ids 32 | typedef std::tuple search_results_ids; 33 | 34 | /*! 35 | * HDTDocument is the main entry to manage an hdt document 36 | * \author Thomas Minier 37 | */ 38 | class HDTDocument { 39 | private: 40 | std::string hdt_file; 41 | hdt::HDT *hdt; 42 | hdt::QueryProcessor *processor; 43 | HDTDocument(std::string file, bool map, bool indexed); 44 | 45 | public: 46 | /*! 47 | * Destructor 48 | */ 49 | ~HDTDocument(); 50 | 51 | /*! 52 | * Get the path to the HDT file currently loaded 53 | * @return The path to the HDT file currently loaded 54 | */ 55 | std::string getFilePath(); 56 | 57 | /*! 58 | * Implementation for Python function "__repr__" 59 | * @return A string representation of the object 60 | */ 61 | std::string python_repr(); 62 | 63 | /*! 
64 | * Get the total number of triples in the HDT document 65 | * @return The total number of triples in the HDT document 66 | */ 67 | unsigned int getNbTriples(); 68 | 69 | /*! 70 | * Get the number of distinct subjects in the HDT document 71 | * @return The number of distinct subjects in the HDT document 72 | */ 73 | unsigned int getNbSubjects(); 74 | 75 | /*! 76 | * Get the number of distinct predicates in the HDT document 77 | * @return The number of distinct predicates in the HDT document 78 | */ 79 | unsigned int getNbPredicates(); 80 | 81 | /*! 82 | * Get the number of distinct objects in the HDT document 83 | * @return The number of distinct objects in the HDT document 84 | */ 85 | unsigned int getNbObjects(); 86 | 87 | /*! 88 | * Get the number of shared subjects-objects in the HDT document 89 | * @return The number of shared subjects-objects in the HDT document 90 | */ 91 | unsigned int getNbShared(); 92 | 93 | /*! 94 | * Static factory method used to create a new HDT Document 95 | * @param file - Path to the HDT file 96 | * @param map - True maps the HDT file (faster), False loads everything in memory 97 | * @param indexed - True if the HDT must be loaded with indexes, False otherwise 98 | */ 99 | static HDTDocument create(std::string file, bool map, bool indexed) { 100 | return HDTDocument(file, map, indexed); 101 | } 102 | 103 | /*! 
104 | * Convert a TripleID to a string RDF triple 105 | * @param subject - Triple's subject 106 | * @param predicate - Triple's predicate 107 | * @param object - Triple's object 108 | * @return The associated RDF triple 109 | */ 110 | triple convertTripleID(unsigned int subject, unsigned int predicate, 111 | unsigned int object); 112 | 113 | /** 114 | * Convert an Object Identifier into the equivalent RDF term 115 | * @param id - Object Identifier 116 | * @param pos - Identifier position (subject, predicate or object) 117 | * @return The RDF term equivalent to the Object Identifier 118 | */ 119 | string convertID(unsigned int id, IdentifierPosition pos); 120 | 121 | /** 122 | * Convert an RDF term into the associated Object Identifier. 123 | * @param term - RDF Term in string format 124 | * @param pos - Identifier position (subject, predicate or object) 125 | * @return The Object Identifier associated with the RDF term 126 | */ 127 | unsigned int convertTerm(std::string term, IdentifierPosition pos); 128 | 129 | /*! 130 | * Search all matching triples for a triple pattern, with an optional limit and offset. 131 | * Returns a tuple 132 | * @param subject - Triple pattern's subject 133 | * @param predicate - Triple pattern's predicate 134 | * @param object - Triple pattern's object 135 | * @param limit - (Optional) Maximum number of matching triples to read 136 | * @param offset - (Optional) Number of matching triples to skip 137 | * @return A tuple (TripleIterator*, cardinality) 138 | */ 139 | search_results search(std::string subject, std::string predicate, 140 | std::string object, unsigned int limit = 0, 141 | unsigned int offset = 0); 142 | 143 | /*! 144 | * Same as HDTDocument#search, but search for TripleIDs instead.
145 | * Returns a tuple 146 | * @param subject - Triple pattern's subject identifier 147 | * @param predicate - Triple pattern's predicate identifier 148 | * @param object - Triple pattern's object identifier 149 | * @param limit - (Optional) Maximum number of matching triples to read 150 | * @param offset - (Optional) Number of matching triples to skip 151 | * @return A tuple (TripleIDIterator*, cardinality) 152 | */ 153 | search_results_ids searchIDs(unsigned int subject, unsigned int predicate, 154 | unsigned int object, unsigned int limit = 0, 155 | unsigned int offset = 0); 156 | 157 | /** 158 | * Evaluate a join between a set of triple patterns using a JoinIterator. 159 | * @param patterns - Set of triple patterns 160 | * @return A JoinIterator* used to evaluated the join. 161 | */ 162 | JoinIterator * searchJoin(std::vector patterns); 163 | 164 | // ============== BYTES REPRESENTATION ============== 165 | // Author: Arnaud GRALL - MIT License 2017-2019 166 | /*! 167 | * Search all matching triples for a triple pattern, whith an optional limit and offset. Returns bytes instead of string 168 | * Returns a tuple 169 | * @param subject - Triple pattern's subject 170 | * @param predicate - Triple pattern's predicate 171 | * @param object - Triple pattern's object 172 | * @param limit - (Optional) Maximum number of matching triples to read 173 | * @param offset - (Optional) Number of matching triples to skip 174 | * @return A tuple (TripleIterator*, cardinality) 175 | */ 176 | search_results_bytes searchBytes(std::string subject, std::string predicate, 177 | std::string object, unsigned int limit = 0, 178 | unsigned int offset = 0); 179 | /** 180 | * Evaluate a join between a set of triple patterns using a JoinIterator. 181 | * @param patterns - Set of triple patterns 182 | * @return A JoinIterator* used to evaluated the join. 183 | */ 184 | JoinIteratorBytes * searchJoinBytes(std::vector patterns); 185 | /*! 
186 | * Convert a TripleID to a RDF triple as bytes 187 | * @param subject - Triple's subject 188 | * @param predicate - Triple's predicate 189 | * @param object - Triple's object 190 | * @return The associated RDF triple 191 | */ 192 | triple_bytes convertTripleIDBytes(unsigned int subject, unsigned int predicate, 193 | unsigned int object); 194 | 195 | /** 196 | * Convert an Object Identifier into the equivalent an RDF term as bytes 197 | * @param id - Object Identifier 198 | * @param pos - Identifier position (subject, predicate or object) 199 | * @return The an RDF term equivalent to the Object Identifier 200 | */ 201 | py::bytes convertIDBytes(unsigned int id, IdentifierPosition pos); 202 | }; 203 | 204 | #endif /* PYHDT_DOCUMENT_HPP */ 205 | -------------------------------------------------------------------------------- /include/join_iterator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef JOIN_ITERATOR_HPP 7 | #define JOIN_ITERATOR_HPP 8 | 9 | #include "pyhdt_types.hpp" 10 | #include "QueryProcessor.hpp" 11 | #include 12 | 13 | /*! 14 | * JoinIterator iterates over solution bindings of a join 15 | * @author Thomas Minier 16 | */ 17 | class JoinIterator { 18 | private: 19 | hdt::VarBindingString *iterator; 20 | bool hasNextSolution = true; 21 | 22 | public: 23 | /*! 24 | * Constructor 25 | * @param iterator [description] 26 | */ 27 | JoinIterator(hdt::VarBindingString *_it); 28 | 29 | /*! 30 | * Destructor 31 | */ 32 | ~JoinIterator(); 33 | 34 | /*! 35 | * Implementation for Python function "__repr__" 36 | * @return [description] 37 | */ 38 | std::string python_repr(); 39 | 40 | /*! 
41 | * Implementation for Python function "__iter__" 42 | * @return [description] 43 | */ 44 | JoinIterator *python_iter(); 45 | 46 | /** 47 | * Get the estimated join cardinality 48 | * @return [description] 49 | */ 50 | size_t estimatedCardinality(); 51 | 52 | /** 53 | * Reset the iterator into its initial state and restart join processing. 54 | */ 55 | void reset(); 56 | 57 | /*! 58 | * Return true if the iterator still has items available, False otherwise. 59 | * @return [description] 60 | */ 61 | bool hasNext(); 62 | 63 | /** 64 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator 65 | * has ended. Used to implement Python Itertor protocol. 66 | * @return [description] 67 | */ 68 | solution_bindings next(); 69 | 70 | }; 71 | 72 | #endif /* JOIN_ITERATOR_HPP */ 73 | -------------------------------------------------------------------------------- /include/join_iterator_bytes.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator.hpp 3 | * Author: Arnaud Grall - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef JOIN_ITERATOR_BYTES_HPP 7 | #define JOIN_ITERATOR_BYTES_HPP 8 | 9 | #include "pyhdt_types.hpp" 10 | #include "QueryProcessor.hpp" 11 | #include 12 | 13 | /*! 14 | * JoinIterator iterates over solution bindings of a join 15 | * @author Arnaud Grall 16 | */ 17 | class JoinIteratorBytes { 18 | private: 19 | hdt::VarBindingString *iterator; 20 | bool hasNextSolution = true; 21 | 22 | public: 23 | /*! 24 | * Constructor 25 | * @param iterator [description] 26 | */ 27 | JoinIteratorBytes(hdt::VarBindingString *_it); 28 | 29 | /*! 30 | * Destructor 31 | */ 32 | ~JoinIteratorBytes(); 33 | 34 | /*! 35 | * Implementation for Python function "__repr__" 36 | * @return [description] 37 | */ 38 | std::string python_repr(); 39 | 40 | /*! 
41 | * Implementation for Python function "__iter__" 42 | * @return [description] 43 | */ 44 | JoinIteratorBytes *python_iter(); 45 | 46 | /** 47 | * Get the estimated join cardinality 48 | * @return [description] 49 | */ 50 | size_t estimatedCardinality(); 51 | 52 | /** 53 | * Reset the iterator into its initial state and restart join processing. 54 | */ 55 | void reset(); 56 | 57 | /*! 58 | * Return true if the iterator still has items available, False otherwise. 59 | * @return [description] 60 | */ 61 | bool hasNext(); 62 | 63 | /** 64 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator 65 | * has ended. Used to implement Python Itertor protocol. 66 | * @return [description] 67 | */ 68 | py::set next(); 69 | 70 | }; 71 | 72 | #endif /* JOIN_ITERATOR_BYTES_HPP */ 73 | -------------------------------------------------------------------------------- /include/pyhdt_types.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt_types.hpp 3 | * Author: Thomas MINIER, Arnaud Grall - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef PYHDT_TYPES_HPP 7 | #define PYHDT_TYPES_HPP 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | namespace py = pybind11; 15 | 16 | /** 17 | * Indictates the position of an Object Identifier 18 | */ 19 | enum IdentifierPosition { 20 | Subject = 1, 21 | Predicate = 2, 22 | Object = 3 23 | }; 24 | 25 | // A RDF Triple. RDF terms are represented as simple strings by HDT. 
26 | typedef std::tuple triple; 27 | 28 | // A RDF triple composed of IDs from HDT dictionnary 29 | typedef std::tuple triple_id; 30 | 31 | // A list of RDF triples 32 | typedef std::list triple_list; 33 | 34 | // A list of RDF triples IDs 35 | typedef std::list triple_ids_list; 36 | 37 | // A hint over the cardinality of a triple pattern 38 | // The right element of the tuple is True if the hint is accurate, False otherwise 39 | typedef std::tuple size_hint; 40 | 41 | typedef std::tuple single_binding; 42 | 43 | typedef std::set *solution_bindings; 44 | 45 | // ============== BYTES REPRESENTATION ============== 46 | // A RDF Triple. RDF terms are represented as simple bytes by HDT. 47 | typedef std::tuple triple_bytes; 48 | // A Set of solutions bindings for the join iterator 49 | typedef py::set solution_bindings_bytes; 50 | 51 | #endif /* PYHDT_TYPES_HPP */ 52 | -------------------------------------------------------------------------------- /include/triple_iterator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * triple_iterator.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef TRIPLE_ITERATOR_HPP 7 | #define TRIPLE_ITERATOR_HPP 8 | 9 | #include "tripleid_iterator.hpp" 10 | #include "pyhdt_types.hpp" 11 | #include "Dictionary.hpp" 12 | #include 13 | 14 | /*! 15 | * TripleIterator iterates over RDF triples of an HDT document which match a 16 | * triple pattern + limit + offset \author Thomas Minier 17 | */ 18 | class TripleIterator { 19 | private: 20 | TripleIDIterator *iterator; 21 | hdt::Dictionary *dictionary; 22 | 23 | public: 24 | /*! 25 | * Constructor 26 | * @param iterator [description] 27 | */ 28 | TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict); 29 | 30 | /*! 31 | * Destructor 32 | */ 33 | ~TripleIterator(); 34 | 35 | /*! 36 | * Implementation for Python function "__repr__" 37 | * @return [description] 38 | */ 39 | std::string python_repr(); 40 | 41 | /*! 
42 | * Get the subject of the triple pattern currently evaluated. 43 | * An empty string represents a variable 44 | * @return [description] 45 | */ 46 | std::string getSubject(); 47 | 48 | /*! 49 | * Get the predicate of the triple pattern currently evaluated. 50 | * An empty string represents a variable 51 | * @return [description] 52 | */ 53 | std::string getPredicate(); 54 | 55 | /*! 56 | * Get the object of the triple pattern currently evaluated. 57 | * An empty string represents a variable 58 | * @return [description] 59 | */ 60 | std::string getObject(); 61 | 62 | /*! 63 | * Get the limit of the current iterator 64 | * @return [description] 65 | */ 66 | unsigned int getLimit(); 67 | 68 | /*! 69 | * Get the offset of the current iterator 70 | * @return [description] 71 | */ 72 | unsigned int getOffset(); 73 | 74 | /*! 75 | * Get the number of results read by the iterator 76 | * @return [description] 77 | */ 78 | unsigned int getNbResultsRead(); 79 | 80 | /*! 81 | * Implementation for Python function "__iter__" 82 | * @return [description] 83 | */ 84 | TripleIterator *python_iter(); 85 | 86 | /*! 87 | * Get the estimated cardinality of the pattern currently evaluated. 88 | * Offset & limit are not taken into account. 89 | * @return [description] 90 | */ 91 | size_hint sizeHint(); 92 | 93 | /*! 94 | * Return true if the iterator still has items available, False otherwise. 95 | * @return [description] 96 | */ 97 | bool hasNext(); 98 | 99 | /** 100 | * Get the next item in the iterator, or raise py::StopIteration if the 101 | * iterator has ended 102 | * @return [description] 103 | */ 104 | triple next(); 105 | 106 | /** 107 | * Get the next item in the iterator, or raise py::StopIteration if the 108 | * iterator has ended, but without advancing the iterator. 
/*!
 * TripleIteratorBytes iterates over RDF triples of an HDT document which match
 * a triple pattern + limit + offset. Results are produced as `triple_bytes`
 * values (see pyhdt_types.hpp), i.e. the RDF terms are handed over as raw
 * bytes instead of decoded strings. \author Thomas Minier
 */
class TripleIteratorBytes {
private:
  // Underlying iterator over triple IDs.
  TripleIDIterator *iterator;
  // HDT dictionary; presumably used to turn IDs back into RDF terms
  // (confirm in triple_iterator_bytes.cpp).
  hdt::Dictionary *dictionary;

public:
  /*!
   * Constructor
   * @param _it   Iterator over the IDs of the matching triples
   * @param _dict HDT dictionary associated with the document
   */
  TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict);

  /*!
   * Destructor
   */
  ~TripleIteratorBytes();

  /*!
   * Implementation for Python function "__repr__"
   * @return A printable representation of the iterator
   */
  std::string python_repr();

  /*!
   * Get the subject of the triple pattern currently evaluated.
   * An empty string represents a variable
   * @return The subject of the pattern, or an empty string for a variable
   */
  std::string getSubject();

  /*!
   * Get the predicate of the triple pattern currently evaluated.
   * An empty string represents a variable
   * @return The predicate of the pattern, or an empty string for a variable
   */
  std::string getPredicate();

  /*!
   * Get the object of the triple pattern currently evaluated.
   * An empty string represents a variable
   * @return The object of the pattern, or an empty string for a variable
   */
  std::string getObject();

  /*!
   * Get the limit of the current iterator
   * @return The limit the iterator was created with
   */
  unsigned int getLimit();

  /*!
   * Get the offset of the current iterator
   * @return The offset the iterator was created with
   */
  unsigned int getOffset();

  /*!
   * Get the number of results read by the iterator
   * @return The number of results read so far
   */
  unsigned int getNbResultsRead();

  /*!
   * Implementation for Python function "__iter__"
   * @return A pointer to an iterator of the same type (presumably `this`;
   *         confirm in the implementation)
   */
  TripleIteratorBytes *python_iter();

  /*!
   * Get the estimated cardinality of the pattern currently evaluated.
   * Offset & limit are not taken into account.
   * @return A `size_hint` tuple; its right element is true when the hint is
   *         accurate, false otherwise
   */
  size_hint sizeHint();

  /*!
   * Return true if the iterator still has items available, False otherwise.
   * @return true while at least one more triple can be read
   */
  bool hasNext();

  /**
   * Get the next item in the iterator, or raise py::StopIteration if the
   * iterator has ended
   * @return The next matching triple, with its terms as bytes
   */
  triple_bytes next();

  /**
   * Get the next item in the iterator, or raise py::StopIteration if the
   * iterator has ended, but without advancing the iterator.
   * @return The triple that the next call to next() would return
   */
  triple_bytes peek();
};
/*!
 * TripleIDIterator iterates over IDs of RDF triples of an HDT document which
 * match a triple pattern + limit + offset \author Thomas Minier
 */
class TripleIDIterator {
private:
  // The triple pattern being evaluated, as given to the constructor.
  std::string subject;
  std::string predicate;
  std::string object;
  // Pagination parameters, as given to the constructor.
  unsigned int limit;
  unsigned int offset;
  // Native HDT iterator that produces the matching triple IDs.
  hdt::IteratorTripleID *iterator;
  // Look-ahead slot; presumably filled by peek() and consumed by next()
  // (confirm in tripleid_iterator.cpp).
  triple_id _bufferedTriple;
  bool hasBufferedTriple = false;
  // Counter reported by getNbResultsRead().
  unsigned int resultsRead = 0;

public:
  /*!
   * Constructor
   * @param _it     Native HDT iterator over the matching triple IDs
   * @param _subj   Subject of the triple pattern
   * @param _pred   Predicate of the triple pattern
   * @param _obj    Object of the triple pattern
   * @param _limit  Limit of the iterator
   * @param _offset Offset of the iterator
   */
  TripleIDIterator(hdt::IteratorTripleID *_it, std::string _subj,
                   std::string _pred, std::string _obj, unsigned int _limit,
                   unsigned int _offset);

  /*!
   * Destructor
   */
  ~TripleIDIterator();

  /*!
   * Implementation for Python function "__repr__"
   * @return A printable representation of the iterator
   */
  std::string python_repr();

  /*!
   * Get the subject of the triple pattern currently evaluated.
   * @return The subject of the triple pattern
   */
  std::string getSubject();

  /*!
   * Get the predicate of the triple pattern currently evaluated.
   * @return The predicate of the triple pattern
   */
  std::string getPredicate();

  /*!
   * Get the object of the triple pattern currently evaluated.
   * @return The object of the triple pattern
   */
  std::string getObject();

  /*!
   * Get the limit of the current iterator
   * @return The limit the iterator was created with
   */
  unsigned int getLimit();

  /*!
   * Get the offset of the current iterator
   * @return The offset the iterator was created with
   */
  unsigned int getOffset();

  /*!
   * Get the number of results read by the iterator
   * @return The number of results read so far
   */
  unsigned int getNbResultsRead();

  /*!
   * Implementation for Python function "__iter__"
   * @return A pointer to an iterator of the same type (presumably `this`;
   *         confirm in the implementation)
   */
  TripleIDIterator *python_iter();

  /*!
   * Get the estimated cardinality of the pattern currently evaluated.
   * Offset & limit are not taken into account.
   * @return A `size_hint` tuple; its right element is true when the hint is
   *         accurate, false otherwise
   */
  size_hint sizeHint();

  /*!
   * Return true if the iterator still has items available, False otherwise.
   * @return true while at least one more triple ID can be read
   */
  bool hasNext();

  /**
   * Get the next item in the iterator, or raise py::StopIteration if the
   * iterator has ended
   * @return The next matching triple, as a tuple of IDs
   */
  triple_id next();

  /**
   * Get the next item in the iterator, or raise py::StopIteration if the
   * iterator has ended, but without advancing the iterator.
   * @return The triple ID that the next call to next() would return
   */
  triple_id peek();
};
14 | pipenv run python setup.py install 15 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "rdflib_hdt" 3 | authors = [{ name = "Thomas Minier", email = "tminier01@gmail.com" }] 4 | description = "A Store back-end for rdflib to allow for reading and querying HDT documents" 5 | keywords = ["rdflib", "hdt", "rdf", "semantic web", "search"] 6 | readme = "README.rst" 7 | license = { text = "MIT License" } 8 | dynamic = ["version"] 9 | dependencies = ["rdflib>=4.2", "pybind11>=2.2.4"] 10 | 11 | [project.urls] 12 | homepage = "https://rdflib.dev/rdflib-hdt" 13 | repository = "https://github.com/RDFLib/rdflib-hdt.git" 14 | 15 | [build-system] 16 | requires = ["pybind11", "setuptools >= 40.8.0", "wheel"] 17 | build-backend = "setuptools.build_meta" 18 | -------------------------------------------------------------------------------- /rdflib_hdt/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "HDTDocument", 3 | "HDTStore", 4 | "HDTIterator", 5 | "HDTJoinIterator", 6 | "optimize_sparql" 7 | ] 8 | 9 | from rdflib_hdt.hdt_document import HDTDocument 10 | from rdflib_hdt.iterators import HDTIterator, HDTJoinIterator 11 | from rdflib_hdt.hdt_store import HDTStore 12 | from rdflib_hdt.sparql_op import optimize_sparql 13 | -------------------------------------------------------------------------------- /rdflib_hdt/hdt_document.py: -------------------------------------------------------------------------------- 1 | """ 2 | rdflib_hdt.hdt_document 3 | ======================= 4 | """ 5 | from typing import Optional, Tuple, Union 6 | 7 | import hdt 8 | from rdflib_hdt.iterators import HDTIterator, HDTJoinIterator 9 | from rdflib_hdt.mapping import rdflib_to_hdt, term_to_rdflib 10 | from rdflib_hdt.types import BGP, SearchQuery, Term 11 | 12 | 13 | class 
class TermKindError(NameError):
    """An error raised when an invalid Term position identifier is used"""
    pass


class HDTDocument(hdt.HDTDocument):
    """An HDT document, in read-only mode.

    This class is a wrapper over the original hdt.HDTDocument class,
    which aligns it with the RDFlib data model.

    .. warning:: By default, an HDTDocument discards RDF Terms with invalid UTF-8 encoding. You can change this behavior with the `safe_mode` parameter of the constructor.

    Args:
      - path: Absolute path to the HDT file to load.
      - mapped: True if the document must be mapped on disk, False to load it in memory.
      - indexed: True if the document must be indexed. Indexed must be located in the same directory as the HDT file. Missing indexes are automatically generated at startup.
      - safe_mode: True if Unicode errors should be ignored, False otherwise.
    """
    def __init__(self, path: str, mapped: bool = True, indexed: bool = True, safe_mode=True):
        super(HDTDocument, self).__init__(path, mapped, indexed)
        self._safe_mode = safe_mode

    @staticmethod
    def _identifier_position(kind: int):
        """Map a term kind (`0`, `1` or `2`) to the matching ``hdt.IdentifierPosition``.

        Raise:
          TermKindError: if `kind` is not 0 (subject), 1 (predicate) or 2 (object).
        """
        if kind == 0:
            return hdt.IdentifierPosition.Subject
        if kind == 1:
            return hdt.IdentifierPosition.Predicate
        if kind == 2:
            return hdt.IdentifierPosition.Object
        raise TermKindError(f"The position {kind} is not a valid Term kind (0 for subjects, 1 for predicates and 2 for objects)")

    def _optional_term_to_id(self, term: Optional[Term], position) -> int:
        """Convert an RDFlib term to its ID at `position`; `None` (a wildcard) maps to `0`."""
        if term is None:
            return 0
        return super().convert_term(rdflib_to_hdt(term), position)

    def is_safe(self) -> bool:
        """Return True if the HDT document ignores Unicode errors, False otherwise."""
        return self._safe_mode

    def from_tripleid(self, triple_id: Tuple[int, int, int]) -> Tuple[Term, Term, Term]:
        """Transform an RDF triple from a TripleID representation to an RDFlib representation.

        Argument:
          - triple_id: 3-tuple of IDs (s, p, o)

        Return:
          A triple in RDFlib representation, i.e., a 3-tuple of RDFlib terms.
        """
        s, p, o = super().convert_tripleid(triple_id[0], triple_id[1], triple_id[2])
        return (term_to_rdflib(s), term_to_rdflib(p), term_to_rdflib(o))

    def to_tripleid(self, triple: SearchQuery) -> Tuple[int, int, int]:
        """Transform a triple (pattern) from an RDFlib representation to a TripleID.

        It can be used to transform an RDFlib query before feeding it
        into the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method.

        Argument:
          - triple: 3-tuple of RDF Terms. Use `None` to indicate wildcards.

        Return:
          A triple in TripleID representation, i.e., a 3-tuple of integers
        """
        subj = self._optional_term_to_id(triple[0], hdt.IdentifierPosition.Subject)
        pred = self._optional_term_to_id(triple[1], hdt.IdentifierPosition.Predicate)
        obj = self._optional_term_to_id(triple[2], hdt.IdentifierPosition.Object)
        return (subj, pred, obj)

    def term_to_id(self, term: Optional[Term], kind: int) -> int:
        """Transform a RDF term from an RDFlib representation to an unique ID, as used in a TripleID.

        It can be used in interaction with the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method.

        Argument:
          - term: The RDF term to transform. `None` stands for a wildcard and yields `0`.
          - kind: The term position: `0` for subjects, `1` for predicates and `2` for objects.

        Return:
          An ID representation of the RDF Term.

        Raise:
          TermKindError: if `kind` is not a valid term position.
        """
        # Validate the position first, so that an invalid kind is reported
        # even when the term is a wildcard.
        position = self._identifier_position(kind)
        # The wildcard ID (0) must be returned directly: the native
        # convert_term only accepts string terms.
        return self._optional_term_to_id(term, position)

    def id_to_term(self, term_id: int, kind: int) -> Term:
        """Transform a RDF term from an unique ID, as used in a TripleID, to an RDFlib representation.

        It can be used in interaction with the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method.

        Argument:
          - term_id: The Term ID to transform.
          - kind: The term position: `0` for subjects, `1` for predicates and `2` for objects.

        Return:
          An RDFlib representation of the RDF Term.

        Raise:
          TermKindError: if `kind` is not a valid term position.
        """
        position = self._identifier_position(kind)
        return term_to_rdflib(super().convert_id(term_id, position))

    def search(self, query: SearchQuery, limit=0, offset=0) -> Tuple[HDTIterator, int]:
        """Search for RDF triples matching the query triple pattern, with an optional limit and offset. Use `None` for wildcards/variables.

        Args:
          - query: The triple pattern (s, p, o) to search. Use `None` to indicate wildcards/variables.
          - limit: (optional) Maximum number of triples to search.
          - offset: (optional) Number of matching triples to skip before returning results.

        Return:
          A 2-elements tuple (iterator, estimated pattern cardinality), where
          the iterator is a generator of matching RDF triples. An RDF triple itself is a 3-elements tuple (subject, predicate, object) of RDF terms (in rdflib format).
        """
        # The native search uses an empty string to denote a variable.
        subj, pred, obj = (rdflib_to_hdt(term) if term is not None else "" for term in query[:3])
        triples, cardinality = super().search_triples(subj, pred, obj, limit=limit, offset=offset)
        iterator = HDTIterator(triples, safe_mode=self._safe_mode)
        return iterator, cardinality

    def search_ids(self, query: Tuple[Optional[int], Optional[int], Optional[int]], limit=0, offset=0) -> Tuple[hdt.TripleIDIterator, int]:
        """Same as :meth:`rdflib_hdt.HDTDocument.search`, but RDF triples are represented as unique ids (from the HDT Dictionary). Use `None` or `0` to indicate wildcards/variables.

        Mapping between ids and RDF terms is done using the :meth:`rdflib_hdt.HDTDocument.from_tripleid`, :py:meth:`rdflib_hdt.HDTDocument.to_tripleid`, :meth:`rdflib_hdt.HDTDocument.term_to_id`, and :meth:`rdflib_hdt.HDTDocument.id_to_term` methods.

        Args:
          - query: A tuple of triple patterns IDs (s, p, o) to search. Use `None` or `0` to indicate wildcards/variables.
          - limit: (optional) Maximum number of triples to search.
          - offset: (optional) Number of matching triples to skip before returning results.

        Return:
          A 2-elements tuple (iterator, estimated pattern cardinality), where
          the iterator is a generator of matching RDF triples. An RDF triple itself is a 3-elements tuple (subject, predicate, object) of IDs (positive integers from the HDT Dictionary).
        """
        subj, pred, obj = (term_id if term_id is not None else 0 for term_id in query[:3])
        return super().search_triples_ids(subj, pred, obj, limit=limit, offset=offset)

    def search_join(self, patterns: BGP) -> HDTJoinIterator:
        """Evaluate a join between a set of triple patterns using an iterator.
        A triple pattern itself is a 3-elements ``tuple`` (subject, predicate, object) of RDFlib terms with at least one SPARQL variable.

        Argument: A set of triple patterns.

        Return:
          A :py:class:`rdflib_hdt.HDTJoinIterator` which produces :py:class:`rdflib.query.ResultRow`, per the Python iteration protocol.
        """
        bgp = [(rdflib_to_hdt(s), rdflib_to_hdt(p), rdflib_to_hdt(o)) for s, p, o in patterns]
        join_iterator = super().search_join(bgp)
        return HDTJoinIterator(join_iterator, safe_mode=self._safe_mode)
25 | """ 26 | def __init__(self, path: str, mapped: bool = True, indexed: bool = True, safe_mode=True, configuration=None, identifier=None): 27 | super(HDTStore, self).__init__(configuration=configuration, identifier=identifier) 28 | self._hdt_document = HDTDocument(path, mapped=mapped, indexed=indexed, safe_mode=safe_mode) 29 | 30 | @property 31 | def hdt_document(self) -> HDTDocument: 32 | """The HDT document used to read and query the HDT file.""" 33 | return self._hdt_document 34 | 35 | def is_safe(self) -> bool: 36 | """Return True if the HDT store ignores Unicode errors, False otherwise.""" 37 | return self._hdt_document.is_safe() 38 | 39 | def __len__(self, context) -> int: 40 | """The number of RDF triples in the HDT store.""" 41 | return self._hdt_document.total_triples 42 | 43 | @property 44 | def nb_subjects(self) -> int: 45 | """The number of subjects in the HDT store.""" 46 | return self._hdt_document.nb_subjects 47 | 48 | @property 49 | def nb_predicates(self) -> int: 50 | """The number of predicates in the HDT store.""" 51 | return self._hdt_document.nb_predicates 52 | 53 | @property 54 | def nb_objects(self) -> int: 55 | """The number of objects in the HDT store.""" 56 | return self._hdt_document.nb_objects 57 | 58 | @property 59 | def nb_shared(self) -> int: 60 | """The number of shared subject-object in the HDT store.""" 61 | return self._hdt_document.nb_shared 62 | 63 | def triples(self, pattern, context) -> Iterable[Triple]: 64 | """Search for a triple pattern in a HDT store. 65 | 66 | Args: 67 | - pattern: The triple pattern (s, p, o) to search. 68 | - context: The query execution context. 69 | 70 | Returns: An iterator that produces RDF triples matching the input triple pattern. 
class HDTIterator:
    """An iterator that converts HDT matching triples to the RDFlib data model.

    Args:
      - input: Input iterator that produces RDF triples with RDF terms in string format.
      - safe_mode: True if Unicode errors should be ignored, False otherwise.
    """
    def __init__(self, input: TripleIterator, safe_mode=True):
        super(HDTIterator, self).__init__()
        self._input = input
        self._safe_mode = safe_mode

    def __len__(self):
        """The estimated number of matching RDF triples."""
        # NOTE(review): this delegates to the native iterator's __len__;
        # confirm that it returns an integer and not a size-hint tuple.
        return len(self._input)

    def __iter__(self):
        return self

    def __next__(self) -> Triple:
        """Produce a new RDF triple, per the Python 3 iterator protocol."""
        return self.next()

    def next(self) -> Triple:
        """Produce a new RDF triple (also the Python 2.x spelling of ``__next__``).

        In safe mode, triples that cannot be decoded as Unicode are skipped
        and the next valid triple is returned instead.

        Raise:
          StopIteration: when the input iterator is exhausted.
          UnicodeDecodeError: on an undecodable triple, when safe mode is off.
        """
        # Loop rather than recurse: a long run of undecodable triples must not
        # exhaust the interpreter's recursion limit.
        while True:
            try:
                triple = next(self._input)
                if triple is None:
                    raise StopIteration()
                s, p, o = triple
                return (term_to_rdflib(s), term_to_rdflib(p), term_to_rdflib(o))
            except UnicodeDecodeError:
                # crash if safe mode is off
                if not self._safe_mode:
                    raise
                # otherwise, try to read a valid RDF triple from the input


class HDTJoinIterator:
    """An iterator that converts HDT join results to the RDFlib data model.

    Args:
      - input: Input iterator that yields join results
      - safe_mode: True if Unicode errors should be ignored, False otherwise.
    """
    def __init__(self, input: JoinIterator, safe_mode=True):
        super(HDTJoinIterator, self).__init__()
        self._input = input
        self._safe_mode = safe_mode

    def __len__(self):
        """The estimated number of join results."""
        return len(self._input)

    def __iter__(self):
        return self

    def __next__(self) -> ResultRow:
        """Produce a new row of results, per the Python 3 iterator protocol."""
        return self.next()

    def next(self) -> ResultRow:
        """Produce a new row of results (also the Python 2.x spelling of ``__next__``).

        In safe mode, rows that cannot be decoded as Unicode are skipped
        and the next valid row is returned instead.

        Raise:
          StopIteration: when the input iterator is exhausted.
          UnicodeDecodeError: on an undecodable row, when safe mode is off.
        """
        # Loop rather than recurse, for the same reason as HDTIterator.next.
        while True:
            try:
                row = dict()
                variables = list()
                # convert all solution mappings to the RDFlib data model
                for key, value in next(self._input):
                    rdf_key = term_to_rdflib(key)
                    rdf_value = term_to_rdflib(value)
                    variables.append(rdf_key)
                    row[rdf_key] = rdf_value
                return ResultRow(row, variables)
            except UnicodeDecodeError:
                # crash if safe mode is off
                if not self._safe_mode:
                    raise
                # otherwise, discard the partial row and read the next one
def term_to_rdflib(term: str) -> Term:
    """Convert an HDT term into its RDFlib representation.

    Terms starting with ``?`` become SPARQL variables, terms starting with a
    double quote are parsed as N3 literals, and anything else is treated as
    an IRI.
    """
    # NOTE(review): blank-node labels ("_:b0") fall through to URIRef here;
    # confirm whether they should be mapped to rdflib BNodes instead.
    if term.startswith('?'):
        return Variable(term[1:])
    if term.startswith("\""):
        return from_n3(term)
    return URIRef(term)


def rdflib_to_hdt(term: Term) -> str:
    """Convert an RDFlib term into an HDT term.

    The N3 form of the term is used as-is, except that the enclosing angle
    brackets of an IRI are removed.
    """
    value = term.n3()
    if value.startswith('<') and value.endswith('>'):
        return value[1:-1]
    return value
21 | """ 22 | # copy the default RDFlib function for evaluating Basic Graph Patterns 23 | rdflib_evalBGP = sparql_evaluate.evalBGP 24 | 25 | def __evalBGP__(ctx: QueryContext, bgp: BGP): 26 | # A SPARQL query executed over a non HDTStore is evaluated as usual 27 | if not isinstance(ctx.graph.store, HDTStore): 28 | return rdflib_evalBGP(ctx, bgp) 29 | if not bgp: 30 | yield ctx.solution() 31 | return 32 | 33 | # delegate the join evaluation to HDT 34 | store: HDTStore = ctx.graph.store 35 | for row in store.hdt_document.search_join(set(bgp)): 36 | # convert the ResultRow into a FrozenBindings object 37 | bindings = dict() 38 | for key in row.labels: 39 | bindings[Variable(key)] = row[key] 40 | yield FrozenBindings(ctx, bindings) 41 | return 42 | # overrides RDFlib evalBGP function 43 | sparql_evaluate.evalBGP = __evalBGP__ 44 | -------------------------------------------------------------------------------- /rdflib_hdt/types.py: -------------------------------------------------------------------------------- 1 | """ 2 | rdflib_hdt.types 3 | ======================= 4 | All commons types found in the rdflib_hdt package 5 | """ 6 | from typing import Optional, Set, Tuple, Union 7 | from rdflib import Literal, URIRef, Variable 8 | 9 | Term = Union[URIRef, Literal] 10 | Triple = Tuple[Term, Term, Term] 11 | TriplePattern = Union[URIRef, Literal, Variable] 12 | SearchQuery = Tuple[Optional[Term], Optional[Term], Optional[Term]] 13 | BGP = Set[TriplePattern] 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11==2.2.4 2 | rdflib==4.2.2 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description_file = README.rst 3 | 4 | [flake8] 5 | ignore = E501 6 | 
# setup.py
# Author: Thomas MINIER - MIT License 2017-2019
from contextlib import suppress
from os import listdir, remove
from shutil import move, rmtree, unpack_archive
import urllib.request

import pybind11
from setuptools import Extension, find_packages, setup

__rdflib_hdt_version__ = "3.2"

# Release of hdt-cpp that gets compiled into the extension.
HDT_VERSION = "1.3.3"
HDT_ARCHIVE = f"v{HDT_VERSION}.zip"
HDT_URL = f"https://github.com/rdfhdt/hdt-cpp/archive/{HDT_ARCHIVE}"
HDT_DIR = f"hdt-cpp-{HDT_VERSION}"

# Sub-directories of hdt-cpp whose .cpp files are part of the extension.
HDT_SOURCE_DIRS = [
    "libcds/src/static/bitsequence",
    "libcds/src/static/coders",
    "libcds/src/static/mapper",
    "libcds/src/static/sequence",
    "libcds/src/static/permutation",
    "libcds/src/utils",
    "libhdt/src/bitsequence",
    "libhdt/src/dictionary",
    "libhdt/src/hdt",
    "libhdt/src/header",
    "libhdt/src/huffman",
    "libhdt/src/libdcs",
    "libhdt/src/libdcs/fmindex",
    "libhdt/src/rdf",
    "libhdt/src/sequence",
    "libhdt/src/triples",
    "libhdt/src/util",
    "libhdt/src/sparql",
]


def download_hdt_and_unzip():
    """Download the hdt-cpp sources and unpack them next to this script."""
    print("Downloading HDT...")
    # A tree left behind by an interrupted build would make shutil.move nest
    # the fresh sources *inside* it, so drop it first.
    rmtree(HDT_DIR, ignore_errors=True)
    urllib.request.urlretrieve(HDT_URL, HDT_ARCHIVE)
    unpack_archive(HDT_ARCHIVE, "tmp")
    move(f"tmp/{HDT_DIR}", HDT_DIR)
    rmtree("tmp")


def clean_hdt():
    """Remove the downloaded archive and the unpacked hdt-cpp sources."""
    print("Cleaning up...")
    rmtree(HDT_DIR, ignore_errors=True)
    with suppress(FileNotFoundError):
        remove(HDT_ARCHIVE)


def list_files(path: str, extension=".cpp", exclude="S.cpp"):
    """List paths to all files that ends with a given extension"""
    # sorted() keeps the compilation order independent of the file system
    return [f"{path}/{f}" for f in sorted(listdir(path)) if f.endswith(extension) and (not f.endswith(exclude))]


download_hdt_and_unzip()
try:
    # pyHDT source files
    sources = [
        "src/hdt.cpp",
        "src/hdt_document.cpp",
        "src/triple_iterator.cpp",
        "src/triple_iterator_bytes.cpp",
        "src/tripleid_iterator.cpp",
        "src/join_iterator.cpp",
        "src/join_iterator_bytes.cpp"
    ]

    # HDT source files
    for source_dir in HDT_SOURCE_DIRS:
        sources += list_files(f"{HDT_DIR}/{source_dir}")

    # pybind11 + pyHDT + libcds + HDT-lib headers
    include_dirs = [
        pybind11.get_include(),
        pybind11.get_include(True),
        "include/",
        f"{HDT_DIR}/libhdt/include/",
        f"{HDT_DIR}/libhdt/src/dictionary/",
        f"{HDT_DIR}/libhdt/src/sparql/",
        f"{HDT_DIR}/libcds/include/",
        f"{HDT_DIR}/libcds/src/static/bitsequence",
        f"{HDT_DIR}/libcds/src/static/coders",
        f"{HDT_DIR}/libcds/src/static/mapper",
        f"{HDT_DIR}/libcds/src/static/permutation",
        f"{HDT_DIR}/libcds/src/static/sequence",
        f"{HDT_DIR}/libcds/src/utils"
    ]

    # Need to build in c++11 minimum
    # TODO add a check to use c++14 or c++17 if available
    extra_compile_args = ["-std=c++11"]

    # build HDT extension
    hdt_extension = Extension("hdt",
                              sources=sources,
                              include_dirs=include_dirs,
                              extra_compile_args=extra_compile_args,
                              language='c++')

    setup(
        version=__rdflib_hdt_version__,
        packages=find_packages(exclude=["tests"]),
        ext_modules=[hdt_extension]
    )
finally:
    # Always remove the downloaded sources, even when the build fails.
    clean_hdt()
#include 7 | #include 8 | 9 | #include "docstrings.hpp" 10 | #include "hdt_document.hpp" 11 | #include "triple_iterator.hpp" 12 | #include "triple_iterator_bytes.hpp" 13 | #include "tripleid_iterator.hpp" 14 | #include "join_iterator.hpp" 15 | #include "join_iterator_bytes.hpp" 16 | 17 | namespace py = pybind11; 18 | // Definition of the Python extension module "hdt": one enum, the iterator classes and HDTDocument 19 | PYBIND11_MODULE(hdt, m) { 20 | m.doc() = MODULE_DOC; 21 | // Position of an identifier in a triple; export_values() also exports the entries into the module scope 22 | py::enum_(m, "IdentifierPosition", IDENTIFIER_POSITION_DOC) 23 | .value("Subject", IdentifierPosition::Subject) 24 | .value("Predicate", IdentifierPosition::Predicate) 25 | .value("Object", IdentifierPosition::Object) 26 | .export_values(); 27 | // TripleIterator: "next" and "__next__" share one C++ method. NOTE(review): "__len__" is bound to sizeHint(), whose return type size_hint is declared elsewhere - confirm it converts to a Python int 28 | py::class_(m, "TripleIterator", TRIPLE_ITERATOR_CLASS_DOC) 29 | .def("next", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 30 | .def("__next__", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 31 | .def("peek", &TripleIterator::peek, TRIPLE_ITERATOR_PEEK_DOC) 32 | .def("has_next", &TripleIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC) 33 | .def("size_hint", &TripleIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 34 | .def("__len__", &TripleIterator::sizeHint, 35 | TRIPLE_ITERATOR_SIZE_DOC) 36 | .def("__iter__", &TripleIterator::python_iter) 37 | .def_property_readonly("subject", &TripleIterator::getSubject, 38 | TRIPLE_ITERATOR_GETSUBJECT_DOC) 39 | .def_property_readonly("predicate", &TripleIterator::getPredicate, 40 | TRIPLE_ITERATOR_GETPREDICATE_DOC) 41 | .def_property_readonly("object", &TripleIterator::getObject, 42 | TRIPLE_ITERATOR_GETOBJECT_DOC) 43 | .def_property_readonly("limit", &TripleIterator::getLimit, 44 | TRIPLE_ITERATOR_GETLIMIT_DOC) 45 | .def_property_readonly("offset", &TripleIterator::getOffset, 46 | TRIPLE_ITERATOR_GETOFFSET_DOC) 47 | .def_property_readonly("nb_reads", &TripleIterator::getNbResultsRead, 48 | TRIPLE_ITERATOR_NBREADS_DOC) 49 | .def("__repr__", &TripleIterator::python_repr); 50 | // TripleIteratorBytes: same Python interface and docstrings as TripleIterator 51 | py::class_(m, "TripleIteratorBytes", TRIPLE_ITERATOR_CLASS_DOC) 52 | .def("next", &TripleIteratorBytes::next, 
TRIPLE_ITERATOR_NEXT_DOC) 53 | .def("__next__", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC) 54 | .def("peek", &TripleIteratorBytes::peek, TRIPLE_ITERATOR_PEEK_DOC) 55 | .def("has_next", &TripleIteratorBytes::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC) 56 | .def("size_hint", &TripleIteratorBytes::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 57 | .def("__len__", &TripleIteratorBytes::sizeHint, 58 | TRIPLE_ITERATOR_SIZE_DOC) 59 | .def("__iter__", &TripleIteratorBytes::python_iter) 60 | .def_property_readonly("subject", &TripleIteratorBytes::getSubject, 61 | TRIPLE_ITERATOR_GETSUBJECT_DOC) 62 | .def_property_readonly("predicate", &TripleIteratorBytes::getPredicate, 63 | TRIPLE_ITERATOR_GETPREDICATE_DOC) 64 | .def_property_readonly("object", &TripleIteratorBytes::getObject, 65 | TRIPLE_ITERATOR_GETOBJECT_DOC) 66 | .def_property_readonly("limit", &TripleIteratorBytes::getLimit, 67 | TRIPLE_ITERATOR_GETLIMIT_DOC) 68 | .def_property_readonly("offset", &TripleIteratorBytes::getOffset, 69 | TRIPLE_ITERATOR_GETOFFSET_DOC) 70 | .def_property_readonly("nb_reads", &TripleIteratorBytes::getNbResultsRead, 71 | TRIPLE_ITERATOR_NBREADS_DOC) 72 | .def("__repr__", &TripleIteratorBytes::python_repr); 73 | // TripleIDIterator: has its own class docstring but reuses the TripleIterator docstrings for its methods and properties 74 | py::class_(m, "TripleIDIterator", TRIPLE_ID_ITERATOR_CLASS_DOC) 75 | .def("next", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 76 | .def("__next__", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 77 | .def("peek", &TripleIDIterator::peek, TRIPLE_ITERATOR_PEEK_DOC) 78 | .def("has_next", &TripleIDIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC) 79 | .def("size_hint", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 80 | .def("__len__", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 81 | .def("__iter__", &TripleIDIterator::python_iter) 82 | .def_property_readonly("subject", &TripleIDIterator::getSubject, 83 | TRIPLE_ITERATOR_GETSUBJECT_DOC) 84 | .def_property_readonly("predicate", &TripleIDIterator::getPredicate, 85 | TRIPLE_ITERATOR_GETPREDICATE_DOC) 86 | 
.def_property_readonly("object", &TripleIDIterator::getObject, 87 | TRIPLE_ITERATOR_GETOBJECT_DOC) 88 | .def_property_readonly("limit", &TripleIDIterator::getLimit, 89 | TRIPLE_ITERATOR_GETLIMIT_DOC) 90 | .def_property_readonly("offset", &TripleIDIterator::getOffset, 91 | TRIPLE_ITERATOR_GETOFFSET_DOC) 92 | .def_property_readonly("nb_reads", &TripleIDIterator::getNbResultsRead, 93 | TRIPLE_ITERATOR_NBREADS_DOC) 94 | .def("__repr__", &TripleIDIterator::python_repr); 95 | // JoinIterator: "cardinality" and "__len__" both expose estimatedCardinality() 96 | py::class_(m, "JoinIterator", JOIN_ITERATOR_CLASS_DOC) 97 | .def("next", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC) 98 | .def("has_next", &JoinIterator::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC) 99 | .def("cardinality", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 100 | .def("reset", &JoinIterator::reset, JOIN_ITERATOR_RESET_DOC) 101 | .def("__len__", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 102 | .def("__next__", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC) 103 | .def("__iter__", &JoinIterator::python_iter) 104 | .def("__repr__", &JoinIterator::python_repr); 105 | // JoinIteratorBytes: same Python interface and docstrings as JoinIterator 106 | py::class_(m, "JoinIteratorBytes", JOIN_ITERATOR_CLASS_DOC) 107 | .def("next", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC) 108 | .def("has_next", &JoinIteratorBytes::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC) 109 | .def("cardinality", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 110 | .def("reset", &JoinIteratorBytes::reset, JOIN_ITERATOR_RESET_DOC) 111 | .def("__len__", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 112 | .def("__next__", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC) 113 | .def("__iter__", &JoinIteratorBytes::python_iter) 114 | .def("__repr__", &JoinIteratorBytes::python_repr); 115 | // HDTDocument: built through HDTDocument::create with keyword arguments file, map (default true) and indexed (default true) 116 | py::class_(m, "HDTDocument", HDT_DOCUMENT_CLASS_DOC) 117 | .def(py::init(&HDTDocument::create), py::arg("file"), 118 | py::arg("map") = true, 119 | py::arg("indexed") = true) 120 | .def_property_readonly("file_path", &HDTDocument::getFilePath, 121 | 
HDT_DOCUMENT_GETFILEPATH_DOC) 122 | .def_property_readonly("total_triples", &HDTDocument::getNbTriples, 123 | HDT_DOCUMENT_GETNBTRIPLES_DOC) 124 | .def_property_readonly("nb_subjects", &HDTDocument::getNbSubjects, 125 | HDT_DOCUMENT_GETNBSUBJECTS_DOC) 126 | .def_property_readonly("nb_predicates", &HDTDocument::getNbPredicates, 127 | HDT_DOCUMENT_GETNBPREDICATES_DOC) 128 | .def_property_readonly("nb_objects", &HDTDocument::getNbObjects, 129 | HDT_DOCUMENT_GETNBOBJECTS_DOC) 130 | .def_property_readonly("nb_shared", &HDTDocument::getNbShared, 131 | HDT_DOCUMENT_GETNBSHARED_DOC) 132 | .def("search_triples", &HDTDocument::search, 133 | HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"), 134 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0, 135 | py::arg("offset") = 0) 136 | .def("search_join", &HDTDocument::searchJoin, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns")) 137 | .def("search_triples_ids", &HDTDocument::searchIDs, 138 | HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC, py::arg("subject"), 139 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0, 140 | py::arg("offset") = 0) 141 | .def("convert_tripleid", &HDTDocument::convertTripleID, 142 | HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC, 143 | py::arg("subject"), py::arg("predicate"), py::arg("object")) 144 | .def("convert_id", &HDTDocument::convertID, HDT_DOCUMENT_CONVERT_ID_DOC, 145 | py::arg("id"), py::arg("position")) 146 | .def("convert_term", &HDTDocument::convertTerm, HDT_DOCUMENT_CONVERT_TERM_DOC, 147 | py::arg("term"), py::arg("position")) 148 | // ========= BYTES REPRESENTATION (the "*_bytes" variants reuse the docstrings of their string counterparts) ========= 149 | .def("search_triples_bytes", &HDTDocument::searchBytes, 150 | HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"), 151 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0, 152 | py::arg("offset") = 0) 153 | .def("search_join_bytes", &HDTDocument::searchJoinBytes, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns")) 154 | .def("convert_tripleid_bytes", &HDTDocument::convertTripleIDBytes, 155 | 
HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC, 156 | py::arg("subject"), py::arg("predicate"), py::arg("object")) 157 | .def("convert_id_bytes", &HDTDocument::convertIDBytes, HDT_DOCUMENT_CONVERT_ID_DOC, 158 | py::arg("id"), py::arg("position")) 159 | .def("__len__", &HDTDocument::getNbTriples, HDT_DOCUMENT_GETNBTRIPLES_DOC) 160 | .def("__repr__", &HDTDocument::python_repr); 161 | 162 | } 163 | -------------------------------------------------------------------------------- /src/hdt_document.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt_document.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "hdt_document.hpp" 7 | #include "triple_iterator.hpp" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | namespace py = pybind11; 15 | using namespace hdt; 16 | 17 | /*! 18 | * Skip `offset` items from an iterator, optimized for HDT iterators. Nothing is skipped when `offset` is 0. 19 | * @param it - Iterator which should skip items (must provide skip(), hasNext() and next()) 20 | * @param offset - How many items to skip 21 | * @param cardinality - (Estimated) number of results 22 | */ 23 | template 24 | inline void applyOffset(T *it, unsigned int offset, unsigned int cardinality) { 25 | if (offset > 0 && offset >= cardinality) { 26 | // hdt does not allow skipping beyond the estimated number of results, so skip `cardinality - 1` items at once 27 | // and step over the remaining ones manually. NOTE(review): `cardinality - 1` wraps around (unsigned) when cardinality is 0 - confirm this case cannot occur 28 | unsigned int remainingSteps = offset - cardinality + 1; 29 | it->skip(cardinality - 1); 30 | while (it->hasNext() && remainingSteps > 0) { 31 | it->next(); 32 | remainingSteps--; 33 | } 34 | } else if (offset > 0) { 35 | it->skip(offset); 36 | } 37 | } 38 | 39 | /*! 
40 | * Returns true if a file can be opened for reading (std::ifstream reports good()), false otherwise 41 | * @param name - Path to the file to test 42 | * @return true if the file is readable, false otherwise 43 | */ 44 | inline bool file_exists(const std::string &name) { 45 | std::ifstream f(name.c_str()); 46 | bool result = f.good(); 47 | f.close(); 48 | return result; 49 | } 50 | 51 | /*! 52 | * Constructor: checks that the file is readable, opens it through HDTManager and creates a QueryProcessor on it 53 | * @param file - Path to HDT file to load 54 | * @param map - True maps the HDT file (faster), False loads everything in memory 55 | * @param indexed - True if the HDT must be loaded with indexes, False otherwise (std::runtime_error is thrown when `file` is not readable) 56 | */ 57 | HDTDocument::HDTDocument(std::string file, bool map, bool indexed) { 58 | hdt_file = file; 59 | if (!file_exists(file)) { 60 | throw std::runtime_error("Cannot open HDT file '" + file + "': Not Found!"); 61 | } 62 | // select the HDTManager loader matching the (map, indexed) flags 63 | if(!map && indexed) { 64 | hdt = HDTManager::loadIndexedHDT(file.c_str()); 65 | } else if(!map && !indexed) { 66 | hdt = HDTManager::loadHDT(file.c_str()); 67 | } else if(map && indexed){ 68 | hdt = HDTManager::mapIndexedHDT(file.c_str()); 69 | } else { 70 | hdt = HDTManager::mapHDT(file.c_str()); 71 | } 72 | processor = new QueryProcessor(hdt); 73 | } 74 | 75 | /*! 76 | * Destructor. Empty: neither `hdt` nor `processor` is deleted here. NOTE(review): confirm they are released elsewhere, otherwise they leak 77 | */ 78 | HDTDocument::~HDTDocument() {} 79 | 80 | /*! 81 | * Get the path to the HDT file currently loaded 82 | * @return The path to the HDT file currently loaded 83 | */ 84 | std::string HDTDocument::getFilePath() { return hdt_file; } 85 | 86 | /*! 87 | * Implementation for Python function "__repr__" 88 | * @return A string representation of the object 89 | */ 90 | std::string HDTDocument::python_repr() { 91 | return ""; 93 | } 94 | 95 | /*! 96 | * Search all matching triples for a triple pattern, with an optional limit and offset. 
97 | * Returns a tuple. An empty term is not looked up and keeps the ID 0 in the search pattern. 98 | * @param subject - Triple pattern's subject 99 | * @param predicate - Triple pattern's predicate 100 | * @param object - Triple pattern's object 101 | * @param limit - (Optional) Maximum number of matching triples to read 102 | * @param offset - (Optional) Number of matching triples to skip 103 | * @return A tuple (TripleIterator*, cardinality); cardinality stays 0 when a non-empty term is missing from the dictionary 104 | */ 105 | search_results HDTDocument::search(std::string subject, 106 | std::string predicate, 107 | std::string object, 108 | unsigned int limit, 109 | unsigned int offset) { 110 | unsigned int idSubject = 0; 111 | unsigned int idPredicate = 0; 112 | unsigned int idObject = 0; 113 | // translate each non-empty term into its dictionary ID 114 | if (!subject.empty()) { 115 | idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT); 116 | } 117 | 118 | if (!predicate.empty()) { 119 | idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE); 120 | } 121 | 122 | if (!object.empty()) { 123 | idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT); 124 | } 125 | 126 | TripleIDIterator *it; 127 | size_t cardinality = 0; 128 | 129 | // if a non-variable term was not found in the dictionary, then the search yields nothing 130 | if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) { 131 | it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset); 132 | } else { 133 | // build a TripleIDIterator to fetch results 134 | TripleID tp(idSubject, idPredicate, idObject); 135 | IteratorTripleID *source = hdt->getTriples()->search(tp); 136 | cardinality = source->estimatedNumResults(); 137 | applyOffset(source, offset, cardinality); 138 | it = new TripleIDIterator(source, subject, predicate, object, limit, offset); 139 | } 140 | // wrap the TripleIDIterator in order to convert OID triples back to RDF triples 141 | TripleIterator *resultIterator = new TripleIterator(it, hdt->getDictionary()); 142 | return 
std::make_tuple(resultIterator, cardinality); 143 | } 144 | 145 | /*! 146 | * Same as HDTDocument#search, but the pattern is given as identifiers and the results are TripleIDs. 147 | * Returns a tuple. An identifier equal to 0 is not looked up in the dictionary. 148 | * @param subject - Triple pattern's subject identifier 149 | * @param predicate - Triple pattern's predicate identifier 150 | * @param object - Triple pattern's object identifier 151 | * @param limit - (Optional) Maximum number of matching triples to read 152 | * @param offset - (Optional) Number of matching triples to skip 153 | * @return A tuple (TripleIDIterator*, cardinality) 154 | */ 155 | search_results_ids HDTDocument::searchIDs(unsigned int subject, 156 | unsigned int predicate, 157 | unsigned int object, 158 | unsigned int limit, 159 | unsigned int offset) { 160 | TripleID tp(subject, predicate, object); 161 | // get RDF terms associated with each ID for metadata ("?s", "?p" and "?o" are kept for IDs equal to 0) 162 | std::string strSubject = std::string("?s"); 163 | std::string strPredicate = std::string("?p"); 164 | std::string strObject = std::string("?o"); 165 | 166 | if (subject != 0) { 167 | strSubject = hdt->getDictionary()->idToString(subject, hdt::SUBJECT); 168 | } 169 | if (predicate != 0) { 170 | strPredicate = hdt->getDictionary()->idToString(predicate, hdt::PREDICATE); 171 | } 172 | if (object != 0) { 173 | strObject = hdt->getDictionary()->idToString(object, hdt::OBJECT); 174 | } 175 | 176 | IteratorTripleID *it; 177 | size_t cardinality = 0; 178 | 179 | // if a non-variable term was not found in the dictionary, then the search yields nothing 180 | if ((strSubject.empty() && subject != 0) || (strPredicate.empty() && predicate != 0) || (strObject.empty() && object != 0)) { 181 | it = new IteratorTripleID(); 182 | } else { 183 | // build iterator 184 | it = hdt->getTriples()->search(tp); 185 | cardinality = it->estimatedNumResults(); 186 | // apply offset 187 | applyOffset(it, offset, cardinality); 188 | } 189 | TripleIDIterator *resultIterator = new TripleIDIterator(it, strSubject, strPredicate, strObject, limit, offset); 
190 | return std::make_tuple(resultIterator, cardinality); 191 | } 192 | 193 | /*! 194 | * Get the total number of triples in the HDT document 195 | * @return The total number of triples in the HDT document (getNumberOfElements() of the triples component) 196 | */ 197 | unsigned int HDTDocument::getNbTriples() { 198 | return hdt->getTriples()->getNumberOfElements(); 199 | } 200 | 201 | /*! 202 | * Get the number of distinct subjects in the HDT document 203 | * @return The number of distinct subjects in the HDT document (getNsubjects() of the dictionary) 204 | */ 205 | unsigned int HDTDocument::getNbSubjects() { 206 | return hdt->getDictionary()->getNsubjects(); 207 | } 208 | 209 | /*! 210 | * Get the number of distinct predicates in the HDT document 211 | * @return The number of distinct predicates in the HDT document (getNpredicates() of the dictionary) 212 | */ 213 | unsigned int HDTDocument::getNbPredicates() { 214 | return hdt->getDictionary()->getNpredicates(); 215 | } 216 | 217 | /*! 218 | * Get the number of distinct objects in the HDT document 219 | * @return The number of distinct objects in the HDT document (getNobjects() of the dictionary) 220 | */ 221 | unsigned int HDTDocument::getNbObjects() { 222 | return hdt->getDictionary()->getNobjects(); 223 | } 224 | 225 | /*! 226 | * Get the number of shared subjects-objects in the HDT document 227 | * @return The number of shared subjects-objects in the HDT document (getNshared() of the dictionary) 228 | */ 229 | unsigned int HDTDocument::getNbShared() { 230 | return hdt->getDictionary()->getNshared(); 231 | } 232 | 233 | /*! 
234 | * Convert a TripleID to a string RDF triple 235 | * @param subject - Triple's subject identifier 236 | * @param predicate - Triple's predicate identifier 237 | * @param object - Triple's object identifier 238 | * @return The associated RDF triple, as a (subject, predicate, object) tuple built from the dictionary 239 | */ 240 | triple HDTDocument::convertTripleID(unsigned int subject, unsigned int predicate, 241 | unsigned int object) { 242 | return std::make_tuple( 243 | hdt->getDictionary()->idToString(subject, hdt::SUBJECT), 244 | hdt->getDictionary()->idToString(predicate, hdt::PREDICATE), 245 | hdt->getDictionary()->idToString(object, hdt::OBJECT)); 246 | } 247 | 248 | /** 249 | * Convert an Object Identifier into the equivalent URI/Literal value 250 | * @param id - Object Identifier 251 | * @param pos - Identifier position (subject, predicate or object) 252 | * @return The URI/Literal equivalent to the Object Identifier (std::runtime_error is thrown for any other position value) 253 | */ 254 | string HDTDocument::convertID(unsigned int id, IdentifierPosition pos) { 255 | switch (pos) { 256 | case IdentifierPosition::Subject: 257 | return hdt->getDictionary()->idToString(id, hdt::SUBJECT); 258 | case IdentifierPosition::Predicate: 259 | return hdt->getDictionary()->idToString(id, hdt::PREDICATE); 260 | case IdentifierPosition::Object: 261 | return hdt->getDictionary()->idToString(id, hdt::OBJECT); 262 | default: 263 | throw std::runtime_error("Invalid Object Identifier exception"); 264 | } 265 | } 266 | 267 | /** 268 | * Convert an RDF term into its associated Object Identifier. 
269 | * @param term - RDF Term in string format 270 | * @param pos - Identifier position (subject, predicate or object) 271 | * @return The Object Identifier associated with the RDF term (std::runtime_error is thrown for any other position value) 272 | */ 273 | unsigned int HDTDocument::convertTerm(std::string term, IdentifierPosition pos) { 274 | switch (pos) { 275 | case IdentifierPosition::Subject: 276 | return hdt->getDictionary()->stringToId(term, hdt::SUBJECT); 277 | case IdentifierPosition::Predicate: 278 | return hdt->getDictionary()->stringToId(term, hdt::PREDICATE); 279 | case IdentifierPosition::Object: 280 | return hdt->getDictionary()->stringToId(term, hdt::OBJECT); 281 | default: 282 | throw std::runtime_error("Invalid Object Identifier exception"); 283 | } 284 | } 285 | 286 | /** 287 | * Evaluate a join between a set of triple patterns using a JoinIterator. 288 | * @param patterns - Set of triple patterns; terms starting with '?' are collected as join variables 289 | * @return A JoinIterator* used to evaluate the join. 290 | */ 291 | JoinIterator * HDTDocument::searchJoin(std::vector patterns) { 292 | set vars {}; 293 | vector joinPatterns {}; 294 | std::string subj, pred, obj; 295 | 296 | for (auto it = patterns.begin(); it != patterns.end(); it++) { 297 | // unpack pattern 298 | std::tie(subj, pred, obj) = *it; 299 | // add variables (terms starting with '?'); note that at(0) throws std::out_of_range on an empty term 300 | if (subj.at(0) == '?') { 301 | vars.insert(subj); 302 | } 303 | if (pred.at(0) == '?') { 304 | vars.insert(pred); 305 | } 306 | if (obj.at(0) == '?') { 307 | vars.insert(obj); 308 | } 309 | // build join pattern 310 | TripleString pattern(subj, pred, obj); 311 | joinPatterns.push_back(pattern); 312 | } 313 | 314 | VarBindingString *iterator = processor->searchJoin(joinPatterns, vars); 315 | return new JoinIterator(iterator); 316 | } 317 | 318 | // ============= BYTES REPRESENTATION ============ 319 | /*! 320 | * Search all matching triples for a triple pattern, with an optional limit and offset. 
Triples are returned as bytes triples (b'...', b'...', b'...') 321 | * Returns a tuple. An empty term is not looked up and keeps the ID 0 in the search pattern. 322 | * @param subject - Triple pattern's subject 323 | * @param predicate - Triple pattern's predicate 324 | * @param object - Triple pattern's object 325 | * @param limit - (Optional) Maximum number of matching triples to read 326 | * @param offset - (Optional) Number of matching triples to skip 327 | * @return A tuple (TripleIteratorBytes*, cardinality); cardinality stays 0 when a non-empty term is missing from the dictionary 328 | */ 329 | search_results_bytes HDTDocument::searchBytes(std::string subject, 330 | std::string predicate, 331 | std::string object, 332 | unsigned int limit, 333 | unsigned int offset) { 334 | unsigned int idSubject = 0; 335 | unsigned int idPredicate = 0; 336 | unsigned int idObject = 0; 337 | // translate each non-empty term into its dictionary ID 338 | if (!subject.empty()) { 339 | idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT); 340 | } 341 | 342 | if (!predicate.empty()) { 343 | idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE); 344 | } 345 | 346 | if (!object.empty()) { 347 | idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT); 348 | } 349 | 350 | TripleIDIterator *it; 351 | size_t cardinality = 0; 352 | 353 | // if a non-variable term was not found in the dictionary, then the search yields nothing 354 | if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) { 355 | it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset); 356 | } else { 357 | // build a TripleIDIterator to fetch results 358 | TripleID tp(idSubject, idPredicate, idObject); 359 | IteratorTripleID *source = hdt->getTriples()->search(tp); 360 | cardinality = source->estimatedNumResults(); 361 | applyOffset(source, offset, cardinality); 362 | it = new TripleIDIterator(source, subject, predicate, object, limit, offset); 363 | } 364 | // wrap the TripleIDIterator in order to convert OID triples back to RDF triples 365 | TripleIteratorBytes *resultIterator = new 
TripleIteratorBytes(it, hdt->getDictionary()); 366 | return std::make_tuple(resultIterator, cardinality); 367 | } 368 | 369 | /** 370 | * Evaluate a join between a set of triple patterns using a JoinIteratorBytes. 371 | * @param patterns - Set of triple patterns; terms starting with '?' are collected as join variables 372 | * @return A JoinIteratorBytes* used to evaluate the join. 373 | */ 374 | JoinIteratorBytes * HDTDocument::searchJoinBytes(std::vector patterns) { 375 | set vars {}; 376 | vector joinPatterns {}; 377 | std::string subj, pred, obj; 378 | 379 | for (auto it = patterns.begin(); it != patterns.end(); it++) { 380 | // unpack pattern 381 | std::tie(subj, pred, obj) = *it; 382 | // add variables (terms starting with '?'); note that at(0) throws std::out_of_range on an empty term 383 | if (subj.at(0) == '?') { 384 | vars.insert(subj); 385 | } 386 | if (pred.at(0) == '?') { 387 | vars.insert(pred); 388 | } 389 | if (obj.at(0) == '?') { 390 | vars.insert(obj); 391 | } 392 | // build join pattern 393 | TripleString pattern(subj, pred, obj); 394 | joinPatterns.push_back(pattern); 395 | } 396 | 397 | VarBindingString *iterator = processor->searchJoin(joinPatterns, vars); 398 | return new JoinIteratorBytes(iterator); 399 | } 400 | 401 | /** 402 | * Convert an Object Identifier into the equivalent URI/Literal value, as bytes (delegates to convertID) 403 | * @param id - Object Identifier 404 | * @param pos - Identifier position (subject, predicate or object) 405 | * @return The URI/Literal equivalent to the Object Identifier, wrapped in py::bytes 406 | */ 407 | py::bytes HDTDocument::convertIDBytes(unsigned int id, IdentifierPosition pos) { 408 | return py::bytes(HDTDocument::convertID(id, pos)); 409 | } 410 | 411 | /*! 
412 | * Convert a TripleID to an RDF triple whose terms are bytes 413 | * @param subject - Triple's subject identifier 414 | * @param predicate - Triple's predicate identifier 415 | * @param object - Triple's object identifier 416 | * @return The associated RDF triple, each term wrapped in py::bytes 417 | */ 418 | triple_bytes HDTDocument::convertTripleIDBytes(unsigned int subject, unsigned int predicate, 419 | unsigned int object) { 420 | return std::make_tuple( 421 | py::bytes(hdt->getDictionary()->idToString(subject, hdt::SUBJECT)), 422 | py::bytes(hdt->getDictionary()->idToString(predicate, hdt::PREDICATE)), 423 | py::bytes(hdt->getDictionary()->idToString(object, hdt::OBJECT))); 424 | } 425 | -------------------------------------------------------------------------------- /src/join_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "join_iterator.hpp" 7 | #include 8 | #include 9 | 10 | /*! 11 | * Constructor 12 | * @param _it - hdt::VarBindingString to wrap; it is deleted by the destructor 13 | */ 14 | JoinIterator::JoinIterator(hdt::VarBindingString *_it) : iterator(_it) {} 15 | 16 | /*! 17 | * Destructor: deletes the wrapped iterator 18 | */ 19 | JoinIterator::~JoinIterator() { 20 | delete iterator; 21 | } 22 | 23 | /*! 24 | * Implementation for Python function "__repr__" 25 | * @return A fixed string representation of the iterator 26 | */ 27 | std::string JoinIterator::python_repr() { 28 | return "JoinIterator"; 29 | } 30 | 31 | 32 | /*! 33 | * Implementation for Python function "__iter__" 34 | * @return The iterator itself (`this`) 35 | */ 36 | JoinIterator *JoinIterator::python_iter() { return this; } 37 | 38 | /** 39 | * Get the estimated join cardinality 40 | * @return The value of estimatedNumResults() on the wrapped iterator 41 | */ 42 | size_t JoinIterator::estimatedCardinality() { 43 | return iterator->estimatedNumResults(); 44 | } 45 | 46 | /** 47 | * Reset the iterator into its initial state and restart join processing. 48 | */ 49 | void JoinIterator::reset() { 50 | iterator->goToStart(); 51 | } 52 | 53 | /*! 
54 | * Return true if the iterator still has items available, False otherwise. 55 | * @return The result of the last findNext() call made by next(). NOTE(review): the value before the first next() depends on how hasNextSolution is initialised, which is not visible here 56 | */ 57 | bool JoinIterator::hasNext() { 58 | return hasNextSolution; 59 | } 60 | 61 | /** 62 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator 63 | * has ended. Used to implement Python Iterator protocol. 64 | * @return A newly allocated set of (variable name, value) tuples, one per variable of the wrapped iterator 65 | */ 66 | solution_bindings JoinIterator::next() { 67 | hasNextSolution = iterator->findNext(); 68 | // stop iteration if the iterator has ended 69 | if (!hasNextSolution) { 70 | throw pybind11::stop_iteration(); 71 | } 72 | solution_bindings solutions = new std::set(); 73 | // build solution bindings 74 | for(unsigned int i = 0; i < iterator->getNumVars(); i++) { 75 | solutions->insert(std::make_tuple(iterator->getVarName(i), iterator->getVar(i))); 76 | } 77 | return solutions; 78 | } 79 | -------------------------------------------------------------------------------- /src/join_iterator_bytes.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator_bytes.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "join_iterator_bytes.hpp" 7 | #include 8 | #include 9 | 10 | /*! 11 | * Constructor 12 | * @param _it - hdt::VarBindingString to wrap; it is deleted by the destructor 13 | */ 14 | JoinIteratorBytes::JoinIteratorBytes(hdt::VarBindingString *_it) : iterator(_it) {} 15 | 16 | /*! 17 | * Destructor: deletes the wrapped iterator 18 | */ 19 | JoinIteratorBytes::~JoinIteratorBytes() { 20 | delete iterator; 21 | } 22 | 23 | /*! 24 | * Implementation for Python function "__repr__" 25 | * @return A fixed string representation of the iterator 26 | */ 27 | std::string JoinIteratorBytes::python_repr() { 28 | return "JoinIteratorBytes"; 29 | } 30 | 31 | 32 | /*! 
33 | * Implementation for Python function "__iter__" 34 | * @return The iterator itself (`this`) 35 | */ 36 | JoinIteratorBytes *JoinIteratorBytes::python_iter() { return this; } 37 | 38 | /** 39 | * Get the estimated join cardinality 40 | * @return The value of estimatedNumResults() on the wrapped iterator 41 | */ 42 | size_t JoinIteratorBytes::estimatedCardinality() { 43 | return iterator->estimatedNumResults(); 44 | } 45 | 46 | /** 47 | * Reset the iterator into its initial state and restart join processing. 48 | */ 49 | void JoinIteratorBytes::reset() { 50 | iterator->goToStart(); 51 | } 52 | 53 | /*! 54 | * Return true if the iterator still has items available, False otherwise. 55 | * @return The result of the last findNext() call made by next(). NOTE(review): the value before the first next() depends on how hasNextSolution is initialised, which is not visible here 56 | */ 57 | bool JoinIteratorBytes::hasNext() { 58 | return hasNextSolution; 59 | } 60 | 61 | /** 62 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator 63 | * has ended. Used to implement Python Iterator protocol. 64 | * @return A set of (variable name, value) tuples, both wrapped in py::bytes 65 | */ 66 | py::set JoinIteratorBytes::next() { 67 | hasNextSolution = iterator->findNext(); 68 | // stop iteration if the iterator has ended 69 | if (!hasNextSolution) { 70 | throw pybind11::stop_iteration(); 71 | } 72 | solution_bindings_bytes solutions_bytes; 73 | // build solution bindings 74 | for(unsigned int i = 0; i < iterator->getNumVars(); i++) { 75 | std::string varname = iterator->getVarName(i); 76 | std::string value = iterator->getVar(i); 77 | solutions_bytes.add(std::make_tuple(py::bytes(varname), py::bytes(value))); 78 | } 79 | return solutions_bytes; 80 | } 81 | -------------------------------------------------------------------------------- /src/triple_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * triple_iterator.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "triple_iterator.hpp" 7 | #include 8 | #include 9 | #include 10 | 11 | /*! 
12 | * Constructor: wraps a TripleIDIterator (deleted by the destructor) and stores the dictionary pointer 13 | * @param _it - TripleIDIterator to wrap; _dict - hdt::Dictionary stored alongside it 14 | */ 15 | TripleIterator::TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict) 16 | : iterator(_it), dictionary(_dict) {}; 17 | 18 | /*! 19 | * Destructor: deletes the wrapped TripleIDIterator (the dictionary is not deleted) 20 | */ 21 | TripleIterator::~TripleIterator() { delete iterator; }; 22 | 23 | /*! 24 | * Implementation for Python function "__repr__" 25 | * @return A string representation whose form depends on whether a limit and/or an offset is set 26 | */ 27 | std::string TripleIterator::python_repr() { 28 | if (getLimit() != 0 && getOffset() > 0) { 29 | return ""; 32 | } else if (getLimit() != 0) { 33 | return ""; 35 | } else if (getOffset() > 0) { 36 | return ""; 38 | } 39 | return ""; 40 | } 41 | 42 | /*! 43 | * Get the subject of the triple pattern currently evaluated. 44 | * An empty string represents a variable 45 | * @return The subject reported by the wrapped TripleIDIterator 46 | */ 47 | std::string TripleIterator::getSubject() { return iterator->getSubject(); } 48 | 49 | /*! 50 | * Get the predicate of the triple pattern currently evaluated. 51 | * An empty string represents a variable 52 | * @return The predicate reported by the wrapped TripleIDIterator 53 | */ 54 | std::string TripleIterator::getPredicate() { return iterator->getPredicate(); } 55 | 56 | /*! 57 | * Get the object of the triple pattern currently evaluated. 58 | * An empty string represents a variable 59 | * @return The object reported by the wrapped TripleIDIterator 60 | */ 61 | std::string TripleIterator::getObject() { return iterator->getObject(); } 62 | 63 | /*! 64 | * Get the limit of the current iterator 65 | * @return The limit reported by the wrapped TripleIDIterator 66 | */ 67 | unsigned int TripleIterator::getLimit() { return iterator->getLimit(); } 68 | 69 | /*! 70 | * Get the offset of the current iterator 71 | * @return The offset reported by the wrapped TripleIDIterator 72 | */ 73 | unsigned int TripleIterator::getOffset() { return iterator->getOffset(); } 74 | 75 | /*! 76 | * Get the number of results read by the iterator 77 | * @return The number of results read, as reported by the wrapped TripleIDIterator 78 | */ 79 | unsigned int TripleIterator::getNbResultsRead() { return iterator->getNbResultsRead(); } 80 | 81 | /*! 
82 | * Implementation for Python function "__iter__" 83 | * @return The iterator itself (`this`) 84 | */ 85 | TripleIterator *TripleIterator::python_iter() { return this; } 86 | 87 | /*! 88 | * Get a hint over the cardinality of the triple pattern evaluated. 89 | * Offset & limit are not taken into account. 90 | * @return The size hint reported by the wrapped TripleIDIterator 91 | */ 92 | size_hint TripleIterator::sizeHint() { 93 | return iterator->sizeHint(); 94 | } 95 | 96 | /*! 97 | * Return true if the iterator still has items available, False otherwise. 98 | * @return The value of hasNext() on the wrapped TripleIDIterator 99 | */ 100 | bool TripleIterator::hasNext() { 101 | return iterator->hasNext(); 102 | } 103 | 104 | /** 105 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 106 | * has ended. Used to implement Python Iterator protocol. 107 | * @return The next triple, as a (subject, predicate, object) tuple converted from IDs through the dictionary 108 | */ 109 | triple TripleIterator::next() { 110 | triple_id t = iterator->next(); 111 | return std::make_tuple( 112 | dictionary->idToString(std::get<0>(t), hdt::SUBJECT), 113 | dictionary->idToString(std::get<1>(t), hdt::PREDICATE), 114 | dictionary->idToString(std::get<2>(t), hdt::OBJECT)); 115 | } 116 | 117 | /** 118 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 119 | * has ended, but without advancing the iterator. 
* @return The (subject, predicate, object) strings of the upcoming triple.
*/
triple TripleIterator::peek() {
  triple_id t = iterator->peek();
  // Translate each ID with the dictionary role matching its position.
  return std::make_tuple(
      dictionary->idToString(std::get<0>(t), hdt::SUBJECT),
      dictionary->idToString(std::get<1>(t), hdt::PREDICATE),
      dictionary->idToString(std::get<2>(t), hdt::OBJECT));
}

--------------------------------------------------------------------------------
/src/triple_iterator_bytes.cpp:
--------------------------------------------------------------------------------
/**
 * triple_iterator_bytes.cpp
 * Author: Arnaud GRALL - MIT License 2017-2019
 */

#include "triple_iterator_bytes.hpp"
// NOTE(review): the targets of the three includes below are missing in this
// copy of the file (presumably lost during extraction); restore from upstream.
#include
#include
#include
namespace py = pybind11;

/*!
 * Constructor.
 * @param _it   Triple-ID iterator to wrap. The destructor deletes it, so this
 *              object takes ownership of the pointer.
 * @param _dict Dictionary stored for later ID-to-string conversion. It is
 *              never deleted by this class.
 */
TripleIteratorBytes::TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict)
    : iterator(_it), dictionary(_dict) {};

/*!
 * Destructor. Deletes the wrapped TripleIDIterator; the dictionary pointer is
 * left untouched.
 */
TripleIteratorBytes::~TripleIteratorBytes() { delete iterator; };

/*!
 * Implementation for Python function "__repr__".
 * Selects one of four strings depending on whether a limit (non-zero) and/or
 * an offset (greater than zero) is set.
 * NOTE(review): every branch returns an empty literal in this copy of the
 * file; the original text looks like it was lost during extraction. Restore
 * it from upstream before relying on this function.
 * @return The textual representation of the iterator.
 */
std::string TripleIteratorBytes::python_repr() {
  if (getLimit() != 0 && getOffset() > 0) {
    return "";
  } else if (getLimit() != 0) {
    return "";
  } else if (getOffset() > 0) {
    return "";
  }
  return "";
}

/*!
 * Get the subject of the triple pattern currently evaluated.
 * Delegates to the wrapped TripleIDIterator.
 * @return The subject of the pattern.
 */
std::string TripleIteratorBytes::getSubject() { return iterator->getSubject(); }

/*!
 * Get the predicate of the triple pattern currently evaluated.
 * Delegates to the wrapped TripleIDIterator.
 * @return The predicate of the pattern.
 */
std::string TripleIteratorBytes::getPredicate() { return iterator->getPredicate(); }

/*!
58 | * Get the object of the triple pattern currently evaluated. 59 | * An empty string represents a variable 60 | * @return [description] 61 | */ 62 | std::string TripleIteratorBytes::getObject() { return iterator->getObject(); } 63 | 64 | /*! 65 | * Get the limit of the current iterator 66 | * @return [description] 67 | */ 68 | unsigned int TripleIteratorBytes::getLimit() { return iterator->getLimit(); } 69 | 70 | /*! 71 | * Get the offset of the current iterator 72 | * @return [description] 73 | */ 74 | unsigned int TripleIteratorBytes::getOffset() { return iterator->getOffset(); } 75 | 76 | /*! 77 | * Get the number of results read by the iterator 78 | * @return [description] 79 | */ 80 | unsigned int TripleIteratorBytes::getNbResultsRead() { return iterator->getNbResultsRead(); } 81 | 82 | /*! 83 | * Implementation for Python function "__iter__" 84 | * @return [description] 85 | */ 86 | TripleIteratorBytes *TripleIteratorBytes::python_iter() { return this; } 87 | 88 | /*! 89 | * Get a hint over the cardinality of the triple pattern evaluated. 90 | * Offset & limit are not taken into account. 91 | * @return [description] 92 | */ 93 | size_hint TripleIteratorBytes::sizeHint() { 94 | return iterator->sizeHint(); 95 | } 96 | 97 | /*! 98 | * Return true if the iterator still has items available, False otherwise. 99 | * @return [description] 100 | */ 101 | bool TripleIteratorBytes::hasNext() { 102 | return iterator->hasNext(); 103 | } 104 | 105 | /** 106 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 107 | * has ended. Used to implement Python Itertor protocol. 
108 | * @return [description] 109 | */ 110 | triple_bytes TripleIteratorBytes::next() { 111 | triple_id t = iterator->next(); 112 | return std::make_tuple( 113 | py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)), 114 | py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)), 115 | py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT))); 116 | } 117 | 118 | /** 119 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 120 | * has ended, but without advancing the iterator. 121 | * @return [description] 122 | */ 123 | triple_bytes TripleIteratorBytes::peek() { 124 | triple_id t = iterator->peek(); 125 | return std::make_tuple( 126 | py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)), 127 | py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)), 128 | py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT))); 129 | } 130 | -------------------------------------------------------------------------------- /src/tripleid_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * tripleid_iterator.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "tripleid_iterator.hpp" 7 | #include 8 | #include 9 | #include 10 | 11 | /*! 12 | * Constructor 13 | * @param iterator [description] 14 | */ 15 | TripleIDIterator::TripleIDIterator(hdt::IteratorTripleID *_it, 16 | std::string _subj, std::string _pred, 17 | std::string _obj, unsigned int _limit, 18 | unsigned int _offset) 19 | : subject((_subj.compare("") == 0) ? "?s" : _subj), 20 | predicate((_pred.compare("") == 0) ? "?p" : _pred), 21 | object((_obj.compare("") == 0) ? "?o" : _obj), limit(_limit), 22 | offset(_offset), iterator(_it){}; 23 | 24 | /*! 25 | * Destructor 26 | */ 27 | TripleIDIterator::~TripleIDIterator() { delete iterator; }; 28 | 29 | /*! 
* Implementation for Python function "__repr__".
* Selects one of four strings depending on whether a limit (non-zero) and/or
* an offset (greater than zero) is set.
* NOTE(review): every branch returns an empty literal in this copy of the
* file; the original text looks like it was lost during extraction. Restore
* it from upstream before relying on this function.
* @return The textual representation of the iterator.
*/
std::string TripleIDIterator::python_repr() {
  if (limit != 0 && offset > 0) {
    return "";
  } else if (limit != 0) {
    return "";
  } else if (offset > 0) {
    return "";
  }
  return "";
}

/*!
 * Get the subject of the triple pattern currently evaluated.
 * The constructor stores "?s" when the subject was given as an empty string,
 * so a variable is reported as "?s" rather than as an empty string.
 * @return The stored subject.
 */
std::string TripleIDIterator::getSubject() { return subject; }

/*!
 * Get the predicate of the triple pattern currently evaluated.
 * The constructor stores "?p" when the predicate was given as an empty
 * string, so a variable is reported as "?p" rather than as an empty string.
 * @return The stored predicate.
 */
std::string TripleIDIterator::getPredicate() { return predicate; }

/*!
 * Get the object of the triple pattern currently evaluated.
 * The constructor stores "?o" when the object was given as an empty string,
 * so a variable is reported as "?o" rather than as an empty string.
 * @return The stored object.
 */
std::string TripleIDIterator::getObject() { return object; }

/*!
 * Get the limit of the current iterator.
 * @return The stored limit; hasNext() and next() treat 0 as "no limit".
 */
unsigned int TripleIDIterator::getLimit() { return limit; }

/*!
 * Get the offset of the current iterator.
 * @return The stored offset. This class only records the value; the skipping
 *         itself is presumably done before construction - verify in caller.
 */
unsigned int TripleIDIterator::getOffset() { return offset; }

/*!
 * Get the number of results read by the iterator.
 * @return The value of resultsRead, which next() increments for every triple
 *         it returns.
 */
unsigned int TripleIDIterator::getNbResultsRead() { return resultsRead; }

/*!
 * Implementation for Python function "__iter__".
 * @return This object: the iterator is its own Python iterator.
 */
TripleIDIterator *TripleIDIterator::python_iter() { return this; }

/*!
 * Get a hint over the cardinality of the triple pattern evaluated.
 * Offset & limit are not taken into account.
96 | * @return [description] 97 | */ 98 | size_hint TripleIDIterator::sizeHint() { 99 | return std::make_tuple(iterator->estimatedNumResults(), iterator->numResultEstimation() == hdt::EXACT); 100 | } 101 | 102 | /*! 103 | * Return true if the iterator still has items available, False otherwise. 104 | * @return [description] 105 | */ 106 | bool TripleIDIterator::hasNext() { 107 | bool noLimit = limit == 0; 108 | return iterator->hasNext() && (noLimit || limit > resultsRead); 109 | } 110 | 111 | /** 112 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 113 | * has ended. Used to implement Python Itertor protocol. 114 | * @return [description] 115 | */ 116 | triple_id TripleIDIterator::next() { 117 | // return any previously peeked value 118 | if (hasBufferedTriple) { 119 | hasBufferedTriple = false; 120 | resultsRead++; 121 | return _bufferedTriple; 122 | } 123 | bool noLimit = limit == 0; 124 | if (iterator->hasNext() && (noLimit || limit > resultsRead)) { 125 | resultsRead++; 126 | hdt::TripleID *ts = iterator->next(); 127 | return std::make_tuple(ts->getSubject(), ts->getPredicate(), 128 | ts->getObject()); 129 | } 130 | throw pybind11::stop_iteration(); 131 | } 132 | 133 | /** 134 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 135 | * has ended, but without advancing the iterator. 
136 | * @return [description] 137 | */ 138 | triple_id TripleIDIterator::peek() { 139 | if (hasBufferedTriple) { 140 | return _bufferedTriple; 141 | } 142 | _bufferedTriple = next(); 143 | hasBufferedTriple = true; 144 | resultsRead--; 145 | return _bufferedTriple; 146 | } 147 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/tests/__init__.py -------------------------------------------------------------------------------- /tests/hdt_document_test.py: -------------------------------------------------------------------------------- 1 | # hdt_document_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | import pytest 4 | from hdt import HDTDocument, IdentifierPosition 5 | 6 | path = "tests/test.hdt" 7 | document = HDTDocument(path, True, False) 8 | nbTotalTriples = 132 9 | 10 | 11 | def test_missing_file(): 12 | with pytest.raises(RuntimeError): 13 | HDTDocument("/home/dtrump/wall.hdt") 14 | 15 | 16 | def test_file_path(): 17 | assert document.file_path == path, f"THe HDT Document filepath should be {path}" 18 | 19 | 20 | def test_total_triples(): 21 | assert document.total_triples == nbTotalTriples, f"The HDT Document should contains {nbTotalTriples} RDF triples" 22 | assert len(document) == nbTotalTriples, f"The HDT Document __len__ magic function should returns {nbTotalTriples} RDF triples" 23 | 24 | 25 | def test_nb_subjects(): 26 | assert document.nb_subjects == 4, f"The HDT Document should contains 4 subjects" 27 | 28 | 29 | def tests_nb_predicates(): 30 | assert document.nb_predicates == 3, f"The HDT Document should contains 3 predicates" 31 | 32 | 33 | def tests_nb_objects(): 34 | assert document.nb_objects == 112, f"The HDT Document should contains 112 objects" 35 | 36 | 37 | def tests_nb_shared(): 38 | assert 
document.nb_shared == 0, f"The HDT Document should contains 0 shared subject-objects" 39 | 40 | 41 | def test_ids_to_string(): 42 | (triples, triplesCard) = document.search_triples("", "", "") 43 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 44 | assert triplesCard == idsCard 45 | assert triplesCard == nbTotalTriples 46 | for subj, pred, obj in triples: 47 | sid, pid, oid = next(ids) 48 | s, p, o = document.convert_tripleid(sid, pid, oid) 49 | assert subj == s 50 | assert pred == p 51 | assert obj == o 52 | 53 | 54 | def test_ids_to_string_bytes(): 55 | (triples, triplesCard) = document.search_triples_bytes("", "", "") 56 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 57 | assert triplesCard == idsCard 58 | assert triplesCard == nbTotalTriples 59 | for subj, pred, obj in triples: 60 | sid, pid, oid = next(ids) 61 | s, p, o = document.convert_tripleid_bytes(sid, pid, oid) 62 | assert subj.decode('utf-8') == s.decode('utf-8') 63 | assert pred.decode('utf-8') == p.decode('utf-8') 64 | assert obj.decode('utf-8') == o.decode('utf-8') 65 | 66 | 67 | def test_convert_id(): 68 | (triples, triplesCard) = document.search_triples("", "", "") 69 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 70 | assert triplesCard == idsCard 71 | assert triplesCard == nbTotalTriples 72 | for subj, pred, obj in triples: 73 | sid, pid, oid = next(ids) 74 | s, p, o = ( 75 | document.convert_id(sid, IdentifierPosition.Subject), 76 | document.convert_id(pid, IdentifierPosition.Predicate), 77 | document.convert_id(oid, IdentifierPosition.Object) 78 | ) 79 | assert subj == s 80 | assert pred == p 81 | assert obj == o 82 | 83 | 84 | def test_convert_id_bytes(): 85 | (triples, triplesCard) = document.search_triples_bytes("", "", "") 86 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 87 | assert triplesCard == idsCard 88 | assert triplesCard == nbTotalTriples 89 | for subj, pred, obj in triples: 90 | sid, pid, oid = next(ids) 91 | s, p, o = ( 92 | 
document.convert_id_bytes(sid, IdentifierPosition.Subject), 93 | document.convert_id_bytes(pid, IdentifierPosition.Predicate), 94 | document.convert_id_bytes(oid, IdentifierPosition.Object) 95 | ) 96 | assert subj == s 97 | assert pred == p 98 | assert obj == o 99 | -------------------------------------------------------------------------------- /tests/hdt_iterators_test.py: -------------------------------------------------------------------------------- 1 | # hdt_iterators_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | import pytest 4 | from hdt import HDTDocument 5 | 6 | path = "tests/test.hdt" 7 | document = HDTDocument(path) 8 | nbTotalTriples = 132 9 | 10 | 11 | def test_read_document_base(): 12 | (triples, cardinality) = document.search_triples("", "", "") 13 | assert triples.subject == "?s" 14 | assert triples.predicate == "?p" 15 | assert triples.object == "?o" 16 | assert cardinality == nbTotalTriples 17 | for subj, pred, obj in triples: 18 | assert subj is not None 19 | assert pred is not None 20 | assert obj is not None 21 | assert triples.nb_reads == cardinality 22 | 23 | def test_read_document_base_bytes(): 24 | (triples, cardinality) = document.search_triples_bytes("", "", "") 25 | assert triples.subject == "?s" 26 | assert triples.predicate == "?p" 27 | assert triples.object == "?o" 28 | assert cardinality == nbTotalTriples 29 | for subj, pred, obj in triples: 30 | assert isinstance(subj, bytes) 31 | assert isinstance(pred, bytes) 32 | assert isinstance(obj, bytes) 33 | try: 34 | s, p, o = subj.decode('utf-8'), pred.decode('utf-8'), obj.decode('utf-8') 35 | except Exception as err: 36 | # with the test.hdt file we shouldnt have any problem 37 | raise err 38 | assert subj is not None 39 | assert pred is not None 40 | assert obj is not None 41 | assert triples.nb_reads == cardinality 42 | 43 | 44 | empty_triples = [ 45 | ("http://example.org#toto", "", ""), 46 | ("", "http://example.org#toto", ""), 47 | ("", "http://example.org#toto", 
"") 48 | ] 49 | 50 | empty_triples_ids = [ 51 | (155, 0, 0), 52 | (0, 155, 0), 53 | (0, 0, 155) 54 | ] 55 | 56 | 57 | @pytest.mark.parametrize("triple", empty_triples) 58 | def test_search_triples_empty(triple): 59 | s, p, o = triple 60 | (iterator, cardinality) = document.search_triples(s, p, o) 61 | assert cardinality == 0 62 | assert not iterator.has_next() 63 | 64 | 65 | @pytest.mark.parametrize("triple", empty_triples_ids) 66 | def test_search_ids_empty(triple): 67 | s, p, o = triple 68 | (iterator, cardinality) = document.search_triples_ids(s, p, o) 69 | assert cardinality == 0 70 | assert not iterator.has_next() 71 | 72 | 73 | def test_read_document_limit(): 74 | nbItems = 0 75 | (triples, cardinality) = document.search_triples("", "", "", limit=10) 76 | assert triples.limit == 10 77 | assert cardinality == nbTotalTriples 78 | for subj, pred, obj in triples: 79 | nbItems += 1 80 | assert subj is not None 81 | assert pred is not None 82 | assert obj is not None 83 | assert nbItems == 10 84 | assert triples.nb_reads == 10 85 | 86 | 87 | def test_read_document_bytes_peek(): 88 | nbItems = 0 89 | (triples, cardinality) = document.search_triples_bytes("", "", "", limit=10) 90 | assert triples.limit == 10 91 | assert cardinality == nbTotalTriples 92 | peek = triples.peek() 93 | for subj, pred, obj in triples: 94 | nbItems += 1 95 | assert isinstance(subj, bytes) 96 | assert isinstance(pred, bytes) 97 | assert isinstance(obj, bytes) 98 | assert subj == peek[0] 99 | assert pred == peek[1] 100 | assert obj == peek[2] 101 | assert subj is not None 102 | assert pred is not None 103 | assert obj is not None 104 | try: 105 | peek = triples.peek() 106 | except StopIteration: 107 | pass 108 | assert nbItems == 10 109 | assert triples.nb_reads == 10 110 | 111 | 112 | def test_read_document_offset(): 113 | nbItems = 0 114 | (triples, cardinality) = document.search_triples("", "", "", offset=10) 115 | assert triples.offset == 10 116 | assert cardinality == nbTotalTriples 117 
| for subj, pred, obj in triples: 118 | nbItems += 1 119 | assert subj is not None 120 | assert pred is not None 121 | assert obj is not None 122 | assert nbItems == cardinality - 10 123 | assert triples.nb_reads == cardinality - 10 124 | 125 | 126 | def test_read_document_ids(): 127 | (triples, cardinality) = document.search_triples_ids(0, 0, 0) 128 | assert triples.subject == "?s" 129 | assert triples.predicate == "?p" 130 | assert triples.object, "?o" 131 | assert cardinality, nbTotalTriples 132 | for subj, pred, obj in triples: 133 | assert subj is not None 134 | assert pred is not None 135 | assert obj is not None 136 | assert triples.nb_reads == cardinality 137 | 138 | 139 | def test_string_iterator_peek(): 140 | expected = ('http://example.org/s1', 'http://example.org/p1', 'http://example.org/o001') 141 | (triples, cardinality) = document.search_triples("", "", "") 142 | v = triples.peek() 143 | assert v == expected 144 | assert triples.nb_reads == 0 145 | v = next(triples) 146 | assert v == expected 147 | assert triples.nb_reads == 1 148 | 149 | 150 | def test_ids_iterator_peek(): 151 | expected = (1, 1, 13) 152 | (triples, cardinality) = document.search_triples_ids(0, 0, 0) 153 | v = triples.peek() 154 | assert v == expected 155 | assert triples.nb_reads == 0 156 | v = next(triples) 157 | assert v == expected 158 | assert triples.nb_reads == 1 159 | 160 | 161 | def test_string_iterator_big_offset(): 162 | nbItems = 0 163 | (triples, cardinality) = document.search_triples("", "", "", offset=nbTotalTriples + 1) 164 | for s, p, o in triples: 165 | nbItems += 1 166 | assert nbItems == 0 167 | 168 | 169 | def test_ids_iterator_big_offset(): 170 | nbItems = 0 171 | (triples, cardinality) = document.search_triples_ids(0, 0, 0, offset=nbTotalTriples + 1) 172 | for s, p, o in triples: 173 | nbItems += 1 174 | assert nbItems == 0 175 | -------------------------------------------------------------------------------- /tests/hdt_store_test.py: 
-------------------------------------------------------------------------------- 1 | # hdt_store_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2020 3 | import pytest 4 | from rdflib_hdt import HDTStore, optimize_sparql 5 | from rdflib import Graph, URIRef, Literal 6 | 7 | path = "tests/test.hdt" 8 | store = HDTStore(path) 9 | 10 | fixtures = [ 11 | # pattern ?s ?p ?o 12 | ((None, None, None), 128), 13 | # pattern s p o 14 | ((URIRef('http://example.org/s1'), URIRef('http://example.org/p1'), URIRef('http://example.org/o002')), 1), 15 | ((URIRef('http://example.org/s5'), URIRef('http://example.org/p1'), URIRef('http://example.org/o002')), 0), 16 | # pattern s ?p ?o 17 | ((URIRef('http://example.org/s1'), None, None), 100), 18 | ((URIRef('http://example.org/s2'), None, None), 10), 19 | ((URIRef('http://example.org/s3'), None, None), 10), 20 | ((URIRef('http://example.org/s4'), None, None), 8), 21 | ((URIRef('http://example.org/s5'), None, None), 0), 22 | # pattern ?s p ?o 23 | ((None, URIRef('http://example.org/p1'), None), 110), 24 | ((None, URIRef('http://example.org/p2'), None), 10), 25 | ((None, URIRef('http://example.org/p3'), None), 8), 26 | ((None, URIRef('http://example.org/p99'), None), 0), 27 | # pattern ?s ?p o 28 | ((None, None, URIRef('http://example.org/o002')), 3), 29 | ((None, None, URIRef('http://example.org/o004')), 3), 30 | ((None, None, Literal('a')), 1), 31 | ((None, None, Literal('a', lang='en')), 1), 32 | ((None, None, Literal('', lang='en')), 1), 33 | ((None, None, Literal('', datatype=URIRef('http://example.org/literal'))), 1), 34 | ((None, None, URIRef('http://example.org/o999')), 0), 35 | # pattern s ?p o 36 | ((URIRef('http://example.org/s1'), None, URIRef('http://example.org/o002')), 1), 37 | ((URIRef('http://example.org/s2'), None, URIRef('http://example.org/o004')), 1), 38 | ((URIRef('http://example.org/s3'), None, URIRef('http://example.org/o004')), 1), 39 | ((URIRef('http://example.org/s99'), None, 
URIRef('http://example.org/o004')), 0), 40 | # pattern s p ?o 41 | ((URIRef('http://example.org/s1'), URIRef('http://example.org/p1'), None), 100), 42 | ((URIRef('http://example.org/s2'), URIRef('http://example.org/p1'), None), 10), 43 | ((URIRef('http://example.org/s3'), URIRef('http://example.org/p2'), None), 10), 44 | ((URIRef('http://example.org/s3'), URIRef('http://example.org/p999'), None), 0), 45 | # pattern ?s p o 46 | ((None, URIRef('http://example.org/p1'), URIRef('http://example.org/o002')), 2), 47 | ((None, URIRef('http://example.org/p2'), URIRef('http://example.org/o004')), 1), 48 | ((None, URIRef('http://example.org/p2'), URIRef('http://example.org/o999')), 0) 49 | ] 50 | 51 | 52 | @pytest.mark.parametrize("query,expected_length", fixtures) 53 | def test_rdflib_graph_search(query, expected_length): 54 | query_s, query_p, query_o = query 55 | graph = Graph(store=store) 56 | nb_triples = 0 57 | for s, p, o in graph.triples(query): 58 | nb_triples += 1 59 | assert isinstance(s, URIRef), f"The subject of an RDF triple must be an URI" 60 | assert isinstance(p, URIRef), f"The predicate of an RDF triple must be an URI" 61 | assert isinstance(o, URIRef) or isinstance(o, Literal), f"The object of an RDF triple must be an URI or a Literal" 62 | if query_s is not None: 63 | assert s == query_s, f"The expected RDF triple subject's is {query_s}" 64 | if query_p is not None: 65 | assert p == query_p, f"The expected RDF triple predicate's is {query_p}" 66 | if query_o is not None: 67 | assert o == query_o, f"The expected RDF triple subject's is {query_o}" 68 | assert nb_triples == expected_length, f"The expected number of matches for {query} is {expected_length}" 69 | 70 | 71 | def test_rdflib_sparql_query(): 72 | optimize_sparql() 73 | graph = Graph(store=store) 74 | sparql_query = """ 75 | PREFIX ex: 76 | SELECT * WHERE { 77 | ?s ex:p1 ?o 78 | } 79 | """ 80 | qres = graph.query(sparql_query) 81 | 82 | nb_bindings = 0 83 | for row in qres: 84 | nb_bindings += 1 85 
| 86 | assert nb_bindings == 110, f"The query should yield 110 set of solution mappings" 87 | -------------------------------------------------------------------------------- /tests/join_iterator_test.py: -------------------------------------------------------------------------------- 1 | # hdt_iterators_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | from hdt import HDTDocument 4 | 5 | path = "tests/test.hdt" 6 | document = HDTDocument(path) 7 | 8 | 9 | def test_basic_join(): 10 | join_iter = document.search_join([ 11 | ("?s", "http://example.org/p1", "http://example.org/o001"), 12 | ("?s", "http://example.org/p1", "http://example.org/o001") 13 | ]) 14 | cpt = 0 15 | for b in join_iter: 16 | cpt += 1 17 | assert len(b) == 1 18 | assert ('?s', 'http://example.org/s1') in b or ('?s', 'http://example.org/s2') in b 19 | assert cpt == 2 20 | 21 | 22 | def test_basic_join_bytes(): 23 | join_iter = document.search_join_bytes([ 24 | ("?s", "http://example.org/p1", "http://example.org/o001"), 25 | ("?s", "http://example.org/p1", "http://example.org/o001") 26 | ]) 27 | cpt = 0 28 | for b in join_iter: 29 | cpt += 1 30 | assert len(b) == 1 31 | assert (b'?s', b'http://example.org/s1') in b or (b'?s', b'http://example.org/s2') in b 32 | assert cpt == 2 33 | -------------------------------------------------------------------------------- /tests/test.hdt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/tests/test.hdt -------------------------------------------------------------------------------- /tests/wrappers_test.py: -------------------------------------------------------------------------------- 1 | # wrappers_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2020 3 | from rdflib_hdt import HDTDocument 4 | from rdflib import URIRef, Variable 5 | 6 | path = "tests/test.hdt" 7 | document = HDTDocument(path) 8 | 9 | 10 | def 
test_search_join_rdflib(): 11 | expected_nb = 2 12 | join_iter = document.search_join([ 13 | (Variable("s"), URIRef("http://example.org/p1"), URIRef("http://example.org/o001")), 14 | (Variable("s"), URIRef("http://example.org/p1"), URIRef("http://example.org/o001")) 15 | ]) 16 | assert len(join_iter) == expected_nb 17 | cpt = 0 18 | for row in join_iter: 19 | cpt += 1 20 | assert row.s == URIRef('http://example.org/s1') or row.s == URIRef('http://example.org/s2') 21 | assert cpt == expected_nb 22 | --------------------------------------------------------------------------------