├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── codeql
│ ├── cpp.yaml
│ └── python.yaml
└── workflows
│ ├── codeql-analysis.yml
│ ├── ghpages.yml
│ ├── linting.yml
│ ├── pytest.yml
│ └── release.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Pipfile
├── Pipfile.lock
├── README.md
├── README.rst
├── docs
├── Makefile
├── make.bat
└── source
│ ├── _config.yml
│ ├── _static
│ ├── rdflib-hdt-250.png
│ ├── rdflib-hdt.png
│ └── rdflib-hdt.svg
│ ├── api.rst
│ ├── conf.py
│ ├── hdtdocument.rst
│ ├── hdtstore.rst
│ ├── index.rst
│ └── installation.rst
├── include
├── docstrings.hpp
├── hdt_document.hpp
├── join_iterator.hpp
├── join_iterator_bytes.hpp
├── pyhdt_types.hpp
├── triple_iterator.hpp
├── triple_iterator_bytes.hpp
└── tripleid_iterator.hpp
├── install.sh
├── pyproject.toml
├── rdflib_hdt
├── __init__.py
├── hdt_document.py
├── hdt_store.py
├── iterators.py
├── mapping.py
├── sparql_op.py
└── types.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── src
├── hdt.cpp
├── hdt_document.cpp
├── join_iterator.cpp
├── join_iterator_bytes.cpp
├── triple_iterator.cpp
├── triple_iterator_bytes.cpp
└── tripleid_iterator.cpp
└── tests
├── __init__.py
├── hdt_document_test.py
├── hdt_iterators_test.py
├── hdt_store_test.py
├── join_iterator_test.py
├── test.hdt
└── wrappers_test.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/codeql/cpp.yaml:
--------------------------------------------------------------------------------
1 | name: "rdflib-hdt CodeQL C++ config"
2 |
3 | queries:
4 | - uses: security-and-quality
5 |
6 | paths:
7 | - src
8 | - include
9 |
--------------------------------------------------------------------------------
/.github/codeql/python.yaml:
--------------------------------------------------------------------------------
1 | name: "rdflib-hdt CodeQL Python config"
2 |
3 | queries:
4 | - uses: security-and-quality
5 |
6 | paths:
7 | - rdflib_hdt
8 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: 🔒 CodeQL
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 | schedule:
9 | - cron: '00 14 1 * *'
10 |
11 | jobs:
12 | analyze:
13 | name: Analyze
14 | runs-on: ubuntu-latest
15 | permissions:
16 | actions: read
17 | contents: read
18 | security-events: write
19 |
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | language: [ 'cpp', 'python' ]
24 | python-version: [3.7]
25 | hdt-version: ['v1.3.3']
26 |
27 | steps:
28 | - name: Checkout repository
29 | uses: actions/checkout@v2
30 |
31 | # Initializes the CodeQL tools for scanning.
32 | - name: Initialize CodeQL
33 | uses: github/codeql-action/init@v1
34 | with:
35 | languages: ${{ matrix.language }}
36 | # we use a specific config file per language, because they need to scan different paths
37 | # for exemple, in C++, we do not want to scan hdt-cpp sources
38 | config-file: ./.github/codeql/${{ matrix.language }}.yaml
39 | # If you wish to specify custom queries, you can do so here or in a config file.
40 | # By default, queries listed here will override any specified in a config file.
41 | # Prefix the list here with "+" to use these queries and those in the config file.
42 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
43 | - name: Set up Python ${{ matrix.python-version }} 🐍
44 | uses: actions/setup-python@v2
45 | with:
46 | python-version: ${{ matrix.python-version }}
47 | - name: Setup pipenv
48 | uses: dschep/install-pipenv-action@v1
49 | - name: Setup HDT ${{ matrix.hdt-version }}
50 | uses: Callidon/setup-hdt-action@v1.2
51 | with:
52 | token: ${{ secrets.GITHUB_TOKEN }}
53 | hdt-tag: ${{ matrix.hdt-version }}
54 | source-path: ./
55 | - name: Install dependencies
56 | run: pipenv install --dev
57 | - name: Compile & install package
58 | run: pipenv run python setup.py install
59 | - name: Perform CodeQL Analysis (${{ matrix.language }})
60 | uses: github/codeql-action/analyze@v1
61 |
--------------------------------------------------------------------------------
/.github/workflows/ghpages.yml:
--------------------------------------------------------------------------------
1 | name: 🚀 Deploy documentation
2 | on:
3 | release:
4 | types: [created]
5 | jobs:
6 | doc:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | matrix:
10 | python-version: [3.7]
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Python ${{ matrix.python-version }} 🐍
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: ${{ matrix.python-version }}
17 | - name: Setup pipenv
18 | uses: dschep/install-pipenv-action@v1
19 | - name: Setup HDT v1.3.3
20 | uses: Callidon/setup-hdt-action@v1.2
21 | with:
22 | token: ${{ secrets.GITHUB_TOKEN }}
23 | hdt-tag: v1.3.3
24 | source-path: ./
25 | - name: Install dependencies
26 | run: pipenv install --dev
27 | - name: Compile & install package
28 | run: pipenv run python setup.py install
29 | - name: Build documentation
30 | run: |
31 | cd docs && pipenv run make html
32 | - name: Deploy documentation to gh-pages
33 | uses: peaceiris/actions-gh-pages@v3
34 | with:
35 | github_token: ${{ secrets.GITHUB_TOKEN }}
36 | publish_dir: ./docs/build/html
37 |
--------------------------------------------------------------------------------
/.github/workflows/linting.yml:
--------------------------------------------------------------------------------
1 | name: 🔍 Code quality and security
2 | on:
3 | push:
4 | branches: [ master ]
5 | pull_request:
6 | branches: [ master ]
7 | jobs:
8 | flake8:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version: [3.7]
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python ${{ matrix.python-version }} 🐍
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | - name: Install flake8
20 | run: pip install flake8
21 | - name: Lint with flake8
22 | run: |
23 | # stop the build if there are Python syntax errors or undefined names
24 | flake8 rdflib_hdt/*.py --count --select=E9,F63,F7,F82 --show-source --statistics
25 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
26 | flake8 rdflib_hdt/*.py --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
27 |
--------------------------------------------------------------------------------
/.github/workflows/pytest.yml:
--------------------------------------------------------------------------------
1 | name: ⏳ Tests
2 | on:
3 | push:
4 | branches: [ master ]
5 | pull_request:
6 | branches: [ master ]
7 | jobs:
8 | test:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version: [3.7]
13 | hdt-version: ['v1.3.3']
14 | steps:
15 | - uses: actions/checkout@v2
16 | - name: Set up Python ${{ matrix.python-version }} 🐍
17 | uses: actions/setup-python@v2
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Setup pipenv
21 | uses: dschep/install-pipenv-action@v1
22 | - name: Cache pipenv dependencies
23 | uses: actions/cache@v2
24 | id: cache-pipenv
25 | with:
26 | path: ~/.local/share/virtualenvs
27 | key: ${{ runner.os }}-${{ matrix.python-version }}-pipenv-${{ hashFiles('**/Pipfile.lock') }}
28 | - name: Setup HDT ${{ matrix.hdt-version }}
29 | uses: Callidon/setup-hdt-action@v1.2
30 | with:
31 | token: ${{ secrets.GITHUB_TOKEN }}
32 | hdt-tag: ${{ matrix.hdt-version }}
33 | source-path: ./
34 | - name: Install dependencies
35 | if: steps.cache-pipenv.outputs.cache-hit != 'true'
36 | run: pipenv install --dev
37 | - name: Compile & install package
38 | run: pipenv run python setup.py install
39 | - name: Test with pytest
40 | run: pipenv run pytest
41 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python 🐍 distribution 📦 to PyPI
2 | on: push
3 | jobs:
4 | build:
5 | name: Build distribution 📦
6 | runs-on: ubuntu-latest
7 |
8 | steps:
9 | - uses: actions/checkout@v4
10 | with:
11 | persist-credentials: false
12 | - name: Set up Python
13 | uses: actions/setup-python@v5
14 | with:
15 | python-version: "3.7.17"
16 | - name: Install pypa/build
17 | run: >-
18 | python3 -m
19 | pip install
20 | build
21 | --user
22 | - name: Build a binary wheel and a source tarball
23 | run: python3 -m build
24 | - name: Store the distribution packages
25 | uses: actions/upload-artifact@v4
26 | with:
27 | name: python-package-distributions
28 | path: dist/
29 |
30 | publish-to-pypi:
31 | name: >-
32 | Publish Python 🐍 distribution 📦 to PyPI
33 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
34 | needs:
35 | - build
36 | runs-on: ubuntu-latest
37 | environment:
38 | name: pypi
39 | url: https://pypi.org/p/rdflib-hdt
40 | permissions:
41 | id-token: write # IMPORTANT: mandatory for trusted publishing
42 |
43 | steps:
44 | - name: Download all the dists
45 | uses: actions/download-artifact@v4
46 | with:
47 | name: python-package-distributions
48 | path: dist/
49 | - name: Publish distribution 📦 to PyPI
50 | uses: pypa/gh-action-pypi-publish@release/v1
51 |
52 | github-release:
53 | name: >-
54 | Sign the Python 🐍 distribution 📦 with Sigstore
55 | and upload them to GitHub Release
56 | needs:
57 | - publish-to-pypi
58 | runs-on: ubuntu-latest
59 |
60 | permissions:
61 | contents: write # IMPORTANT: mandatory for making GitHub Releases
62 | id-token: write # IMPORTANT: mandatory for sigstore
63 |
64 | steps:
65 | - name: Download all the dists
66 | uses: actions/download-artifact@v4
67 | with:
68 | name: python-package-distributions
69 | path: dist/
70 | - name: Sign the dists with Sigstore
71 | uses: sigstore/gh-action-sigstore-python@v3.0.0
72 | with:
73 | inputs: >-
74 | ./dist/*.tar.gz
75 | ./dist/*.whl
76 | - name: Create GitHub Release
77 | env:
78 | GITHUB_TOKEN: ${{ github.token }}
79 | run: >-
80 | gh release create
81 | "$GITHUB_REF_NAME"
82 | --repo "$GITHUB_REPOSITORY"
83 | --notes ""
84 | - name: Upload artifact signatures to GitHub Release
85 | env:
86 | GITHUB_TOKEN: ${{ github.token }}
87 | # Upload to GitHub Release using the `gh` CLI.
88 | # `dist/` contains the built packages, and the
89 | # sigstore-produced signatures and certificates.
90 | run: >-
91 | gh release upload
92 | "$GITHUB_REF_NAME" dist/**
93 | --repo "$GITHUB_REPOSITORY"
94 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # C/C++
2 | # Prerequisites
3 | *.d
4 |
5 | # VSCode
6 | .vscode
7 |
8 | # Compiled Object files
9 | *.slo
10 | *.lo
11 | *.o
12 | *.obj
13 | .pytest_cache/
14 |
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 |
19 | # Compiled Dynamic libraries
20 | *.so
21 | *.dylib
22 | *.dll
23 |
24 | # Fortran module files
25 | *.mod
26 | *.smod
27 |
28 | # Compiled Static libraries
29 | *.lai
30 | *.la
31 | *.a
32 | *.lib
33 |
34 | # Executables
35 | *.exe
36 | *.out
37 | *.app
38 |
39 | # Python
40 | # Byte-compiled / optimized / DLL files
41 | __pycache__/
42 | *.py[cod]
43 | *$py.class
44 |
45 | # C extensions
46 | *.so
47 |
48 | # Distribution / packaging
49 | .Python
50 | build/
51 | develop-eggs/
52 | dist/
53 | downloads/
54 | eggs/
55 | .eggs/
56 | lib/
57 | lib64/
58 | parts/
59 | sdist/
60 | var/
61 | wheels/
62 | *.egg-info/
63 | .installed.cfg
64 | *.egg
65 | MANIFEST
66 |
67 | # PyInstaller
68 | # Usually these files are written by a python script from a template
69 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
70 | *.manifest
71 | *.spec
72 |
73 | # Installer logs
74 | pip-log.txt
75 | pip-delete-this-directory.txt
76 |
77 | # Unit test / coverage reports
78 | htmlcov/
79 | .tox/
80 | .coverage
81 | .coverage.*
82 | .cache
83 | nosetests.xml
84 | coverage.xml
85 | *.cover
86 | .hypothesis/
87 |
88 | # Translations
89 | *.mo
90 | *.pot
91 |
92 | # Django stuff:
93 | *.log
94 | .static_storage/
95 | .media/
96 | local_settings.py
97 |
98 | # Flask stuff:
99 | instance/
100 | .webassets-cache
101 |
102 | # Scrapy stuff:
103 | .scrapy
104 |
105 | # Sphinx documentation
106 | docs/_build/
107 |
108 | # PyBuilder
109 | target/
110 |
111 | # Jupyter Notebook
112 | .ipynb_checkpoints
113 |
114 | # pyenv
115 | .python-version
116 |
117 | # celery beat schedule file
118 | celerybeat-schedule
119 |
120 | # SageMath parsed files
121 | *.sage.py
122 |
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 |
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 |
136 | # Rope project settings
137 | .ropeproject
138 |
139 | # mkdocs documentation
140 | /site
141 |
142 | # mypy
143 | .mypy_cache/
144 |
145 | # HDT
146 | *.hdt.index.v*
147 | hdt-cpp-*
148 | hdt-cpp.zip
149 | v1.3.*.zip
150 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017-2019 Thomas Minier
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | graft include/
3 | graft hdt-cpp-1.3.3/libhdt/src/bitsequence
4 | graft hdt-cpp-1.3.3/libhdt/src/dictionary
5 | graft hdt-cpp-1.3.3/libhdt/src/hdt
6 | graft hdt-cpp-1.3.3/libhdt/src/header
7 | graft hdt-cpp-1.3.3/libhdt/src/huffman
8 | graft hdt-cpp-1.3.3/libhdt/src/libdcs
9 | graft hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex
10 | graft hdt-cpp-1.3.3/libhdt/src/rdf
11 | graft hdt-cpp-1.3.3/libhdt/src/sequence
12 | graft hdt-cpp-1.3.3/libhdt/src/triples
13 | graft hdt-cpp-1.3.3/libhdt/src/util
14 | graft hdt-cpp-1.3.3/libhdt/third
15 | graft hdt-cpp-1.3.3/libhdt/include/
16 | graft hdt-cpp-1.3.3/libhdt/src/dictionary/
17 | graft hdt-cpp-1.3.3/libhdt/src/sparql
18 | graft hdt-cpp-1.3.3/libcds/include/
19 | graft hdt-cpp-1.3.3/libcds/src/static/bitsequence
20 | graft hdt-cpp-1.3.3/libcds/src/static/coders
21 | graft hdt-cpp-1.3.3/libcds/src/static/mapper
22 | graft hdt-cpp-1.3.3/libcds/src/static/permutation
23 | graft hdt-cpp-1.3.3/libcds/src/static/sequence
24 | graft hdt-cpp-1.3.3/libcds/src/utils
25 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [packages]
7 | pybind11 = "==2.2.4"
8 | rdflib = "==5.0.0"
9 |
10 | [dev-packages]
11 | pytest = "==5.4.1"
12 | flake8 = "*"
13 | sphinx = "*"
14 | sphinx-rtd-theme = "*"
15 |
16 | [requires]
17 | python_version = "3.7"
18 |
--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
1 | {
2 | "_meta": {
3 | "hash": {
4 | "sha256": "b00b04968478c4faa74499d7dbeaafdd85c4e26905043d85f77051e5d088b1a1"
5 | },
6 | "pipfile-spec": 6,
7 | "requires": {
8 | "python_version": "3.7"
9 | },
10 | "sources": [
11 | {
12 | "name": "pypi",
13 | "url": "https://pypi.org/simple",
14 | "verify_ssl": true
15 | }
16 | ]
17 | },
18 | "default": {
19 | "isodate": {
20 | "hashes": [
21 | "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96",
22 | "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"
23 | ],
24 | "version": "==0.6.1"
25 | },
26 | "pybind11": {
27 | "hashes": [
28 | "sha256:642abbbd2948ed5af28e69adfae1535347c7aa9eb0cdab130e20e1f198f8e1cf",
29 | "sha256:bd68159013d20c79bf79893b174a6ee7f74af740bf60ae731565f5d8d4094403"
30 | ],
31 | "index": "pypi",
32 | "version": "==2.2.4"
33 | },
34 | "pyparsing": {
35 | "hashes": [
36 | "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
37 | "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
38 | ],
39 | "markers": "python_full_version >= '3.6.8'",
40 | "version": "==3.0.9"
41 | },
42 | "rdflib": {
43 | "hashes": [
44 | "sha256:78149dd49d385efec3b3adfbd61c87afaf1281c30d3fcaf1b323b34f603fb155",
45 | "sha256:88208ea971a87886d60ae2b1a4b2cdc263527af0454c422118d43fe64b357877"
46 | ],
47 | "index": "pypi",
48 | "version": "==5.0.0"
49 | },
50 | "six": {
51 | "hashes": [
52 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
53 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
54 | ],
55 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
56 | "version": "==1.16.0"
57 | }
58 | },
59 | "develop": {
60 | "alabaster": {
61 | "hashes": [
62 | "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359",
63 | "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02"
64 | ],
65 | "version": "==0.7.12"
66 | },
67 | "attrs": {
68 | "hashes": [
69 | "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6",
70 | "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"
71 | ],
72 | "markers": "python_version >= '3.5'",
73 | "version": "==22.1.0"
74 | },
75 | "babel": {
76 | "hashes": [
77 | "sha256:1ad3eca1c885218f6dce2ab67291178944f810a10a9b5f3cb8382a5a232b64fe",
78 | "sha256:5ef4b3226b0180dedded4229651c8b0e1a3a6a2837d45a073272f313e4cf97f6"
79 | ],
80 | "markers": "python_version >= '3.6'",
81 | "version": "==2.11.0"
82 | },
83 | "certifi": {
84 | "hashes": [
85 | "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3",
86 | "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"
87 | ],
88 | "index": "pypi",
89 | "version": "==2022.12.7"
90 | },
91 | "charset-normalizer": {
92 | "hashes": [
93 | "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
94 | "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
95 | ],
96 | "markers": "python_version >= '3.6'",
97 | "version": "==2.1.1"
98 | },
99 | "docutils": {
100 | "hashes": [
101 | "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6",
102 | "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"
103 | ],
104 | "markers": "python_version >= '3.7'",
105 | "version": "==0.19"
106 | },
107 | "entrypoints": {
108 | "hashes": [
109 | "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
110 | "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
111 | ],
112 | "markers": "python_version >= '2.7'",
113 | "version": "==0.3"
114 | },
115 | "flake8": {
116 | "hashes": [
117 | "sha256:45681a117ecc81e870cbf1262835ae4af5e7a8b08e40b944a8a6e6b895914cfb",
118 | "sha256:49356e766643ad15072a789a20915d3c91dc89fd313ccd71802303fd67e4deca"
119 | ],
120 | "index": "pypi",
121 | "version": "==3.7.9"
122 | },
123 | "idna": {
124 | "hashes": [
125 | "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
126 | "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"
127 | ],
128 | "markers": "python_version >= '3.5'",
129 | "version": "==3.4"
130 | },
131 | "imagesize": {
132 | "hashes": [
133 | "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b",
134 | "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"
135 | ],
136 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
137 | "version": "==1.4.1"
138 | },
139 | "importlib-metadata": {
140 | "hashes": [
141 | "sha256:d5059f9f1e8e41f80e9c56c2ee58811450c31984dfa625329ffd7c0dad88a73b",
142 | "sha256:d84d17e21670ec07990e1044a99efe8d615d860fd176fc29ef5c306068fda313"
143 | ],
144 | "markers": "python_version < '3.8'",
145 | "version": "==5.1.0"
146 | },
147 | "jinja2": {
148 | "hashes": [
149 | "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
150 | "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
151 | ],
152 | "markers": "python_version >= '3.7'",
153 | "version": "==3.1.2"
154 | },
155 | "markupsafe": {
156 | "hashes": [
157 | "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003",
158 | "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88",
159 | "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5",
160 | "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7",
161 | "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a",
162 | "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603",
163 | "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1",
164 | "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135",
165 | "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247",
166 | "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6",
167 | "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601",
168 | "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77",
169 | "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02",
170 | "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e",
171 | "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63",
172 | "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f",
173 | "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980",
174 | "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b",
175 | "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812",
176 | "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff",
177 | "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96",
178 | "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1",
179 | "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925",
180 | "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a",
181 | "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6",
182 | "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e",
183 | "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f",
184 | "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4",
185 | "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f",
186 | "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3",
187 | "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c",
188 | "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a",
189 | "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417",
190 | "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a",
191 | "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a",
192 | "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37",
193 | "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452",
194 | "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933",
195 | "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
196 | "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
197 | ],
198 | "markers": "python_version >= '3.7'",
199 | "version": "==2.1.1"
200 | },
201 | "mccabe": {
202 | "hashes": [
203 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
204 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
205 | ],
206 | "version": "==0.6.1"
207 | },
208 | "more-itertools": {
209 | "hashes": [
210 | "sha256:250e83d7e81d0c87ca6bd942e6aeab8cc9daa6096d12c5308f3f92fa5e5c1f41",
211 | "sha256:5a6257e40878ef0520b1803990e3e22303a41b5714006c32a3fd8304b26ea1ab"
212 | ],
213 | "markers": "python_version >= '3.7'",
214 | "version": "==9.0.0"
215 | },
216 | "packaging": {
217 | "hashes": [
218 | "sha256:2198ec20bd4c017b8f9717e00f0c8714076fc2fd93816750ab48e2c41de2cfd3",
219 | "sha256:957e2148ba0e1a3b282772e791ef1d8083648bc131c8ab0c1feba110ce1146c3"
220 | ],
221 | "markers": "python_version >= '3.7'",
222 | "version": "==22.0"
223 | },
224 | "pluggy": {
225 | "hashes": [
226 | "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
227 | "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
228 | ],
229 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
230 | "version": "==0.13.1"
231 | },
232 | "py": {
233 | "hashes": [
234 | "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
235 | "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
236 | ],
237 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
238 | "version": "==1.11.0"
239 | },
240 | "pycodestyle": {
241 | "hashes": [
242 | "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56",
243 | "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"
244 | ],
245 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
246 | "version": "==2.5.0"
247 | },
248 | "pyflakes": {
249 | "hashes": [
250 | "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0",
251 | "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"
252 | ],
253 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
254 | "version": "==2.1.1"
255 | },
256 | "pygments": {
257 | "hashes": [
258 | "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1",
259 | "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"
260 | ],
261 | "markers": "python_version >= '3.6'",
262 | "version": "==2.13.0"
263 | },
264 | "pytest": {
265 | "hashes": [
266 | "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172",
267 | "sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970"
268 | ],
269 | "index": "pypi",
270 | "version": "==5.4.1"
271 | },
272 | "pytz": {
273 | "hashes": [
274 | "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427",
275 | "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2"
276 | ],
277 | "version": "==2022.6"
278 | },
279 | "requests": {
280 | "hashes": [
281 | "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983",
282 | "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"
283 | ],
284 | "markers": "python_version >= '3.7' and python_version < '4'",
285 | "version": "==2.28.1"
286 | },
287 | "setuptools": {
288 | "hashes": [
289 | "sha256:57f6f22bde4e042978bcd50176fdb381d7c21a9efa4041202288d3737a0c6a54",
290 | "sha256:a7620757bf984b58deaf32fc8a4577a9bbc0850cf92c20e1ce41c38c19e5fb75"
291 | ],
292 | "markers": "python_version >= '3.7'",
293 | "version": "==65.6.3"
294 | },
295 | "snowballstemmer": {
296 | "hashes": [
297 | "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
298 | "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
299 | ],
300 | "version": "==2.2.0"
301 | },
302 | "sphinx": {
303 | "hashes": [
304 | "sha256:62edfd92d955b868d6c124c0942eba966d54b5f3dcb4ded39e65f74abac3f572",
305 | "sha256:f5505d74cf9592f3b997380f9bdb2d2d0320ed74dd69691e3ee0644b956b8d83"
306 | ],
307 | "index": "pypi",
308 | "version": "==3.0.3"
309 | },
310 | "sphinx-rtd-theme": {
311 | "hashes": [
312 | "sha256:00cf895504a7895ee433807c62094cf1e95f065843bf3acd17037c3e9a2becd4",
313 | "sha256:728607e34d60456d736cc7991fd236afb828b21b82f956c5ea75f94c8414040a"
314 | ],
315 | "index": "pypi",
316 | "version": "==0.4.3"
317 | },
318 | "sphinxcontrib-applehelp": {
319 | "hashes": [
320 | "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
321 | "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
322 | ],
323 | "markers": "python_version >= '3.5'",
324 | "version": "==1.0.2"
325 | },
326 | "sphinxcontrib-devhelp": {
327 | "hashes": [
328 | "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
329 | "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
330 | ],
331 | "markers": "python_version >= '3.5'",
332 | "version": "==1.0.2"
333 | },
334 | "sphinxcontrib-htmlhelp": {
335 | "hashes": [
336 | "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
337 | "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
338 | ],
339 | "markers": "python_version >= '3.6'",
340 | "version": "==2.0.0"
341 | },
342 | "sphinxcontrib-jsmath": {
343 | "hashes": [
344 | "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
345 | "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
346 | ],
347 | "markers": "python_version >= '3.5'",
348 | "version": "==1.0.1"
349 | },
350 | "sphinxcontrib-qthelp": {
351 | "hashes": [
352 | "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
353 | "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
354 | ],
355 | "markers": "python_version >= '3.5'",
356 | "version": "==1.0.3"
357 | },
358 | "sphinxcontrib-serializinghtml": {
359 | "hashes": [
360 | "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
361 | "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
362 | ],
363 | "markers": "python_version >= '3.5'",
364 | "version": "==1.1.5"
365 | },
366 | "typing-extensions": {
367 | "hashes": [
368 | "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
369 | "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
370 | ],
371 | "markers": "python_version < '3.8'",
372 | "version": "==4.4.0"
373 | },
374 | "urllib3": {
375 | "hashes": [
376 | "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc",
377 | "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"
378 | ],
379 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
380 | "version": "==1.26.13"
381 | },
382 | "wcwidth": {
383 | "hashes": [
384 | "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784",
385 | "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"
386 | ],
387 | "version": "==0.2.5"
388 | },
389 | "zipp": {
390 | "hashes": [
391 | "sha256:83a28fcb75844b5c0cdaf5aa4003c2d728c77e05f5aeabe8e95e56727005fbaa",
392 | "sha256:a7a22e05929290a67401440b39690ae6563279bced5f314609d9d03798f56766"
393 | ],
394 | "markers": "python_version >= '3.7'",
395 | "version": "==3.11.0"
396 | }
397 | }
398 | }
399 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # rdflib-hdt
4 |
5 |  [](https://badge.fury.io/py/rdflib-hdt)
6 |
7 | A Store back-end for [rdflib](https://github.com/RDFLib) to allow for reading and querying HDT documents.
8 |
9 | [Online Documentation](https://rdflib.dev/rdflib-hdt/)
10 |
11 | # Requirements
12 |
13 | * Python *version 3.6.4 or higher*
14 | * [pip](https://pip.pypa.io/en/stable/)
15 | * **gcc/clang** with **c++11 support**
16 | * **Python Development headers**
17 | > You should have the `Python.h` header available on your system.
18 | > For example, for Python 3.6, install the `python3.6-dev` package on Debian/Ubuntu systems.
19 |
20 | # Installation
21 |
22 | Installation using [pipenv](https://github.com/pypa/pipenv) or a [virtualenv](https://virtualenv.pypa.io/en/stable/) is **strongly advised!**
23 |
24 | ## PyPi installation (recommended)
25 |
26 | ```bash
27 | # you can install using pip
28 | pip install rdflib-hdt
29 |
30 | # or you can use pipenv
31 | pipenv install rdflib-hdt
32 | ```
33 |
34 | ## Manual installation
35 |
36 | **Requirement:** [pipenv](https://github.com/pypa/pipenv)
37 |
38 | ```
39 | git clone https://github.com/Callidon/pyHDT
40 | cd pyHDT/
41 | ./install.sh
42 | ```
43 |
44 | # Getting started
45 |
46 | You can use the `rdflib-hdt` library in two modes: as an rdflib Graph or as a raw HDT document.
47 |
48 | ## Graph usage (recommended)
49 |
50 | ```python
51 | from rdflib import Graph
52 | from rdflib_hdt import HDTStore
53 | from rdflib.namespace import FOAF
54 |
55 | # Load an HDT file. Missing indexes are generated automatically
56 | # You can provide the index file by putting it in the same directory as the HDT file.
57 | store = HDTStore("test.hdt")
58 |
59 | # Display some metadata about the HDT document itself
60 | print(f"Number of RDF triples: {len(store)}")
61 | print(f"Number of subjects: {store.nb_subjects}")
62 | print(f"Number of predicates: {store.nb_predicates}")
63 | print(f"Number of objects: {store.nb_objects}")
64 | print(f"Number of shared subject-object: {store.nb_shared}")
65 |
66 | # Create an RDFlib Graph with the HDT document as a backend
67 | graph = Graph(store=store)
68 |
69 | # Fetch all triples that matches { ?s foaf:name ?o }
70 | # Use None to indicates variables
71 | for s, p, o in graph.triples((None, FOAF("name"), None)):
72 | print(triple)
73 | ```
74 |
75 | Using the RDFlib API, you can also [execute SPARQL queries](https://rdflib.readthedocs.io/en/stable/intro_to_sparql.html) over an HDT document.
76 | If you do so, we recommend that you first call the `optimize_sparql` function, which optimize
77 | the RDFlib SPARQL query engine in the context of HDT documents.
78 |
79 | ```python
80 | from rdflib import Graph
81 | from rdflib_hdt import HDTStore, optimize_sparql
82 |
83 | # Calling this function optimizes the RDFlib SPARQL engine for HDT documents
84 | optimize_sparql()
85 |
86 | graph = Graph(store=HDTStore("test.hdt"))
87 |
88 | # You can execute SPARQL queries using the regular RDFlib API
89 | qres = graph.query("""
90 | PREFIX foaf:
91 | SELECT ?name ?friend WHERE {
92 | ?a foaf:knows ?b.
93 | ?a foaf:name ?name.
94 | ?b foaf:name ?friend.
95 | }""")
96 |
97 | for row in qres:
98 | print(f"{row.name} knows {row.friend}")
99 | ```
100 |
101 | ## HDT Document usage
102 |
103 | ```python
104 | from rdflib_hdt import HDTDocument
105 | from rdflib.namespace import FOAF
106 |
107 | # Load an HDT file. Missing indexes are generated automatically.
108 | # You can provide the index file by putting it in the same directory as the HDT file.
109 | document = HDTDocument("test.hdt")
110 |
111 | # Display some metadata about the HDT document itself
112 | print(f"Number of RDF triples: {document.total_triples}")
113 | print(f"Number of subjects: {document.nb_subjects}")
114 | print(f"Number of predicates: {document.nb_predicates}")
115 | print(f"Number of objects: {document.nb_objects}")
116 | print(f"Number of shared subject-object: {document.nb_shared}")
117 |
118 | # Fetch all triples that matches { ?s foaf:name ?o }
119 | # Use None to indicates variables
120 | triples, cardinality = document.search((None, FOAF("name"), None))
121 |
122 | print(f"Cardinality of (?s foaf:name ?o): {cardinality}")
123 | for s, p, o in triples:
124 | print(triple)
125 |
126 | # The search also support limit and offset
127 | triples, cardinality = document.search((None, FOAF("name"), None), limit=10, offset=100)
128 | # etc ...
129 | ```
130 |
131 | An HDT document also provides support for evaluating joins over a set of triples patterns.
132 |
133 | ```python
134 | from rdflib_hdt import HDTDocument
135 | from rdflib import Variable
136 | from rdflib.namespace import FOAF, RDF
137 |
138 | document = HDTDocument("test.hdt")
139 |
140 | # find the names of two entities that know each other
141 | tp_a = (Variable("a"), FOAF("knows"), Variable("b"))
142 | tp_b = (Variable("a"), FOAF("name"), Variable("name"))
143 | tp_c = (Variable("b"), FOAF("name"), Variable("friend"))
144 | query = set([tp_a, tp_b, tp_c])
145 |
146 | iterator = document.search_join(query)
147 | print(f"Estimated join cardinality: {len(iterator)}")
148 |
149 | # Join results are produced as ResultRow, like in the RDFlib SPARQL API
150 | for row in iterator:
151 | print(f"{row.name} knows {row.friend}")
152 | ```
153 |
154 | # Handling non UTF-8 strings in python
155 |
156 | If the HDT document has been encoded with a non UTF-8 encoding the previous code won't work correctly and will result in a `UnicodeDecodeError`.
157 | More details on how to convert string to str from C++ to Python [here](https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html)
158 |
159 | To handle this, we doubled the API of the HDT document by adding:
160 | - `search_triples_bytes(...)` return an iterator of triples as `(py::bytes, py::bytes, py::bytes)`
161 | - `search_join_bytes(...)` return an iterator of sets of solutions mapping as `py::set(py::bytes, py::bytes)`
162 | - `convert_tripleid_bytes(...)` return a triple as: `(py::bytes, py::bytes, py::bytes)`
163 | - `convert_id_bytes(...)` return a `py::bytes`
164 |
165 | **Parameters and documentation are the same as the standard version**
166 |
167 | ```python
168 | from rdflib_hdt import HDTDocument
169 |
170 | document = HDTDocument("test.hdt")
171 | it = document.search_triple_bytes("", "", "")
172 |
173 | for s, p, o in it:
174 | print(s, p, o) # print b'...', b'...', b'...'
175 | # now decode it, or handle any error
176 | try:
177 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8')
178 | except UnicodeDecodeError as err:
179 | # try another other codecs, ignore error, etc
180 | pass
181 | ```
182 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | |rdflib-htd logo|
2 |
3 | |Build Status| |PyPI version|
4 |
5 | A Store back-end for `rdflib `_ to allow for reading and querying HDT documents.
6 |
7 | `Online Documentation `_
8 |
9 | Requirements
10 | ============
11 |
12 |
13 | * Python *version 3.6.4 or higher*
14 | * `pip `_
15 | * **gcc/clang** with **c++11 support**
16 | * **Python Development headers**
17 | ..
18 |
19 | You should have the ``Python.h`` header available on your system.\
20 | For example, for Python 3.6, install the ``python3.6-dev`` package on Debian/Ubuntu systems.
21 |
22 |
23 | Installation
24 | ============
25 |
26 | Installation using `pipenv `_ or a `virtualenv `_ is **strongly advised!**
27 |
28 | PyPi installation (recommended)
29 | -------------------------------
30 |
31 | .. code-block:: bash
32 |
33 | # you can install using pip
34 | pip install rdflib-hdt
35 |
36 | # or you can use pipenv
37 | pipenv install rdflib-hdt
38 |
39 | Manual installation
40 | -------------------
41 |
42 | **Requirement:** `pipenv `_
43 |
44 | .. code-block:: bash
45 |
46 | git clone https://github.com/Callidon/pyHDT
47 | cd pyHDT/
48 | ./install.sh
49 |
50 | Getting started
51 | ===============
52 |
53 | You can use the ``rdflib-hdt`` library in two modes: as an rdflib Graph or as a raw HDT document.
54 |
55 | Graph usage (recommended)
56 | -------------------------
57 |
58 | .. code-block:: python
59 |
60 | from rdflib import Graph
61 | from rdflib_hdt import HDTStore
62 | from rdflib.namespace import FOAF
63 |
64 | # Load an HDT file. Missing indexes are generated automatically
65 | # You can provide the index file by putting them in the same directory than the HDT file.
66 | store = HDTStore("test.hdt")
67 |
68 | # Display some metadata about the HDT document itself
69 | print(f"Number of RDF triples: {len(store)}")
70 | print(f"Number of subjects: {store.nb_subjects}")
71 | print(f"Number of predicates: {store.nb_predicates}")
72 | print(f"Number of objects: {store.nb_objects}")
73 | print(f"Number of shared subject-object: {store.nb_shared}")
74 |
75 |
76 | Using the RDFlib API, you can also `execute SPARQL queries `_ over an HDT document.
77 | If you do so, we recommend that you first call the ``optimize_sparql`` function, which optimize
78 | the RDFlib SPARQL query engine in the context of HDT documents.
79 |
80 | .. code-block:: python
81 |
82 | from rdflib import Graph
83 | from rdflib_hdt import HDTStore, optimize_sparql
84 |
85 | # Calling this function optimizes the RDFlib SPARQL engine for HDT documents
86 | optimize_sparql()
87 |
88 | graph = Graph(store=HDTStore("test.hdt"))
89 |
90 | # You can execute SPARQL queries using the regular RDFlib API
91 | qres = graph.query("""
92 | PREFIX foaf:
93 | SELECT ?name ?friend WHERE {
94 | ?a foaf:knows ?b.
95 | ?a foaf:name ?name.
96 | ?b foaf:name ?friend.
97 | }""")
98 |
99 | for row in qres:
100 | print(f"{row.name} knows {row.friend}")
101 |
102 | HDT Document usage
103 | ------------------
104 |
105 | .. code-block:: python
106 |
107 | from rdflib_hdt import HDTDocument
108 |
109 | # Load an HDT file. Missing indexes are generated automatically.
110 | # You can provide the index file by putting them in the same directory than the HDT file.
111 | document = HDTDocument("test.hdt")
112 |
113 | # Display some metadata about the HDT document itself
114 | print(f"Number of RDF triples: {document.total_triples}")
115 | print(f"Number of subjects: {document.nb_subjects}")
116 | print(f"Number of predicates: {document.nb_predicates}")
117 | print(f"Number of objects: {document.nb_objects}")
118 | print(f"Number of shared subject-object: {document.nb_shared}")
119 |
120 | # Fetch all triples that matches { ?s foaf:name ?o }
121 | # Use None to indicates variables
122 | triples, cardinality = document.search_triples((None, FOAF("name"), None))
123 |
124 | print(f"Cardinality of (?s foaf:name ?o): {cardinality}")
125 | for s, p, o in triples:
126 | print(triple)
127 |
128 | # The search also support limit and offset
129 | triples, cardinality = document.search_triples((None, FOAF("name"), None), limit=10, offset=100)
130 | # etc ...
131 |
132 | An HDT document also provides support for evaluating joins over a set of triples patterns.
133 |
134 | .. code-block:: python
135 |
136 | from rdflib_hdt import HDTDocument
137 | from rdflib import Variable
138 | from rdflib.namespace import FOAF, RDF
139 |
140 | document = HDTDocument("test.hdt")
141 |
142 | # find the names of two entities that know each other
143 | tp_a = (Variable("a"), FOAF("knows"), Variable("b"))
144 | tp_b = (Variable("a"), FOAF("name"), Variable("name"))
145 | tp_c = (Variable("b"), FOAF("name"), Variable("friend"))
146 | query = set([tp_a, tp_b, tp_c])
147 |
148 | iterator = document.search_join(query)
149 | print(f"Estimated join cardinality: {len(iterator)}")
150 |
151 | # Join results are produced as ResultRow, like in the RDFlib SPARQL API
152 | for row in iterator:
153 | print(f"{row.name} knows {row.friend}")
154 |
155 | Handling non UTF-8 strings in python
156 | ====================================
157 |
158 | If the HDT document has been encoded with a non UTF-8 encoding the previous code won't work correctly and will result in a ``UnicodeDecodeError``.
159 | More details on how to convert string to str from C++ to Python `here `_
160 |
161 | To handle this, we doubled the API of the HDT document by adding:
162 |
163 |
164 | * ``search_triples_bytes(...)`` return an iterator of triples as ``(py::bytes, py::bytes, py::bytes)``
165 | * ``search_join_bytes(...)`` return an iterator of sets of solutions mapping as ``py::set(py::bytes, py::bytes)``
166 | * ``convert_tripleid_bytes(...)`` return a triple as: ``(py::bytes, py::bytes, py::bytes)``
167 | * ``convert_id_bytes(...)`` return a ``py::bytes``
168 |
169 | **Parameters and documentation are the same as the standard version**
170 |
171 | .. code-block:: python
172 |
173 | from rdflib_hdt import HDTDocument
174 |
175 | document = HDTDocument("test.hdt")
176 | it = document.search_triple_bytes("", "", "")
177 |
178 | for s, p, o in it:
179 | print(s, p, o) # print b'...', b'...', b'...'
180 | # now decode it, or handle any error
181 | try:
182 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8')
183 | except UnicodeDecodeError as err:
184 | # try another other codecs, ignore error, etc
185 | pass
186 |
187 | .. |Build Status| image:: https://github.com/RDFLib/rdflib-hdt/workflows/Python%20tests/badge.svg
188 | :target: https://github.com/RDFLib/rdflib-hdt/actions?query=workflow%3A%22Python+tests%22
189 | .. |PyPI version| image:: https://badge.fury.io/py/rdflib-hdt.svg
190 | :target: https://badge.fury.io/py/rdflib-hdt
191 | .. |rdflib-htd logo| image:: https://raw.githubusercontent.com/RDFLib/rdflib-hdt/master/docs/source/_static/rdflib-hdt-250.png
192 | :target: https://rdflib.dev/rdflib-hdt/
193 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = pyHDT
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | cp source/_config.yml build/html/_config.yml
22 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=pyHDT
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/source/_config.yml:
--------------------------------------------------------------------------------
1 | baseurl: /
2 | include: [ "_static", "_static/*" ]
3 |
--------------------------------------------------------------------------------
/docs/source/_static/rdflib-hdt-250.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/docs/source/_static/rdflib-hdt-250.png
--------------------------------------------------------------------------------
/docs/source/_static/rdflib-hdt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/docs/source/_static/rdflib-hdt.png
--------------------------------------------------------------------------------
/docs/source/_static/rdflib-hdt.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
65 |
--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
1 | API documentation
2 | =================
3 |
4 | .. currentmodule:: rdflib_hdt
5 |
6 | Global functions
7 | ----------------
8 |
9 | .. autofunction:: optimize_sparql
10 |
11 | HDTStore
12 | -----------
13 |
14 | .. autoclass:: HDTStore
15 | :show-inheritance:
16 | :members:
17 |
18 | HDTDocument
19 | -----------
20 |
21 | .. autoclass:: HDTDocument
22 | :members:
23 |
24 | .. autoattribute:: nb_subjects
25 |
26 | .. autoattribute:: nb_predicates
27 |
28 | .. autoattribute:: nb_objects
29 |
30 | .. autoattribute:: nb_shared
31 |
32 |
33 | HDTIterator
34 | -----------
35 |
36 | .. autoclass:: HDTIterator
37 | :members:
38 |
39 |
40 | HDTJoinIterator
41 | ---------------
42 |
43 | .. autoclass:: HDTJoinIterator
44 | :members:
45 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # pyHDT documentation build configuration file, created by
5 | # sphinx-quickstart on Mon Jan 22 10:41:42 2018.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = ['sphinx.ext.autodoc']
35 |
36 | # Add any paths that contain templates here, relative to this directory.
37 | templates_path = ['_templates']
38 |
39 | # The suffix(es) of source filenames.
40 | # You can specify multiple suffix as a list of string:
41 | #
42 | # source_suffix = ['.rst', '.md']
43 | source_suffix = '.rst'
44 |
45 | # The master toctree document.
46 | master_doc = 'index'
47 |
48 | # General information about the project.
49 | project = 'rdflib-hdt'
50 | copyright = '2018-2020, Thomas Minier'
51 | author = 'Thomas Minier'
52 |
53 | # The version info for the project you're documenting, acts as replacement for
54 | # |version| and |release|, also used in various other places throughout the
55 | # built documents.
56 | #
57 | # The short X.Y version.
58 | version = '1.0.0'
59 | # The full version, including alpha/beta/rc tags.
60 | release = '1.0.0'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | language = None
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # This patterns also effect to html_static_path and html_extra_path
72 | exclude_patterns = []
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = 'sphinx'
76 |
77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
78 | todo_include_todos = False
79 |
80 |
81 | # -- Options for HTML output ----------------------------------------------
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'sphinx_rtd_theme'
87 |
88 | # Theme options are theme-specific and customize the look and feel of a theme
89 | # further. For a list of options available for each theme, see the
90 | # documentation.
91 | #
92 | # html_theme_options = { 'show_related': True}
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_static_path = ['_static']
98 |
99 | # If given, this must be the name of an image file (path relative to the configuration directory) that is the favicon of the docs.
100 | # Modern browsers use this as the icon for tabs, windows and bookmarks.
101 | html_favicon = '_static/rdflib-hdt-250.png'
102 |
103 | # Custom sidebar templates, must be a dictionary that maps document names
104 | # to template names.
105 | #
106 | # This is required for the alabaster theme
107 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
108 | html_sidebars = {
109 | '**': [
110 | 'globaltoc.html',
111 | 'relations.html', # needs 'show_related': True theme option to display
112 | 'sourcelink.html',
113 | 'searchbox.html',
114 | ]
115 | }
116 |
117 |
118 | # -- Options for HTMLHelp output ------------------------------------------
119 |
120 | # Output file base name for HTML help builder.
121 | htmlhelp_basename = 'rdflibHDTdoc'
122 |
123 |
124 | # -- Options for LaTeX output ---------------------------------------------
125 |
126 | latex_elements = {
127 | # The paper size ('letterpaper' or 'a4paper').
128 | #
129 | # 'papersize': 'letterpaper',
130 |
131 | # The font size ('10pt', '11pt' or '12pt').
132 | #
133 | # 'pointsize': '10pt',
134 |
135 | # Additional stuff for the LaTeX preamble.
136 | #
137 | # 'preamble': '',
138 |
139 | # Latex figure (float) alignment
140 | #
141 | # 'figure_align': 'htbp',
142 | }
143 |
144 | # Grouping the document tree into LaTeX files. List of tuples
145 | # (source start file, target name, title,
146 | # author, documentclass [howto, manual, or own class]).
147 | latex_documents = [
148 | (master_doc, 'rdflibHDT.tex', 'pyHDT Documentation',
149 | 'Thomas Minier', 'manual'),
150 | ]
151 |
152 |
153 | # -- Options for manual page output ---------------------------------------
154 |
155 | # One entry per manual page. List of tuples
156 | # (source start file, name, description, authors, manual section).
157 | man_pages = [
158 | (master_doc, 'rdflibHDT', 'pyHDT Documentation',
159 | [author], 1)
160 | ]
161 |
162 |
163 | # -- Options for Texinfo output -------------------------------------------
164 |
165 | # Grouping the document tree into Texinfo files. List of tuples
166 | # (source start file, target name, title, author,
167 | # dir menu entry, description, category)
168 | texinfo_documents = [
169 | (master_doc, 'rdflibHDT', 'rdflib-dht Documentation',
170 | author, 'rdflibHDT', 'One line description of project.',
171 | 'Miscellaneous'),
172 | ]
173 |
174 | autodoc_member_order = 'groupwise'
175 |
--------------------------------------------------------------------------------
/docs/source/hdtdocument.rst:
--------------------------------------------------------------------------------
1 | Low-level Usage
2 | ===============
3 |
4 | Loading HDT files
5 | ^^^^^^^^^^^^^^^^^
6 |
7 | The main class for directly manipulating HDT document using rdflib_hdt is :py:class:`rdflib_hdt.HDTDocument`.
8 | Upon creation, it searches for an index file in the same directory than the HDT file you wish to load.
9 | For example, if you load a file */home/awesome-user/test.hdt*, :py:class:`rdflib_hdt.HDTDocument` will look for the index file
10 | */home/awesome-user/test.hdt.index.v1-1*.
11 |
12 | .. warning:: By default, an HDTDocument discards RDF Terms with invalid UTF-8 encoding. You can change this behavior with the `safe_mode` parameter of the constructor.
13 |
14 | .. note:: Missing indexes are generated automatically, but be careful, as it requires to load all HDT triples in memory!
15 |
16 |
17 | .. code-block:: python
18 |
19 | from rdflib_hdt import HDTDocument
20 |
21 | # Load an HDT file.
22 | # Missing indexes are generated automatically, add False as the second argument to disable them
23 | document = HDTDocument("test.hdt")
24 |
25 | # Display some metadata about the HDT document itself
26 | print(f"Number of RDF triples: {document.total_triples}")
27 | print(f"Number of subjects: {document.nb_subjects}")
28 | print(f"Number of predicates: {document.nb_predicates}")
29 | print(f"Number of objects: {document.nb_objects}")
30 | print(f"Number of shared subject-object: {document.nb_shared}")
31 |
32 |
33 | Searching for triples
34 | ^^^^^^^^^^^^^^^^^^^^^^
35 |
36 | You can search for all RDF triples in the HDT file matching a triple pattern using :py:meth:`rdflib_hdt.HDTDocument.search`.
37 | It returns a 2-element tuple: an :py:class:`rdflib_hdt.HDTIterator` over the matching RDF triples and the estimated triple pattern *cardinality*.
38 |
39 | .. note:: The :py:meth:`rdflib_hdt.HDTDocument.search` method also accepts ``limit`` and ``offset`` parameters, to perform range queries over the matchinf RDF triples.
40 |
41 | .. code-block:: python
42 |
43 | from rdflib.namespace import FOAF
44 | from rdflib_hdt import HDTDocument
45 | document = HDTDocument("test.hdt")
46 |
47 | # Fetch all triples that matches { ?s foaf:name ?o }
48 | # Use None to indicates variables
49 | triples, cardinality = document.search((None, FOAF("name"), None))
50 |
51 | print(f"Cardinality of (?s foaf:name ?o): {cardinality}")
52 | for s, p, o in triples:
53 | print(triple)
54 |
55 | # The search also support limit and offset
56 | triples, cardinality = document.search((None, FOAF("name"), None), limit=10, offset=100)
57 | # etc ...
58 |
59 | Searching for triple IDs
60 | ^^^^^^^^^^^^^^^^^^^^^^^^^
61 |
62 | A typical HDT document encodes a triple's subject, predicate, and object as unique integers, named **TripleID**.
63 | For example, the triple ``("ex:Toto", "ex:type", "ex:Person")`` can be encoded as ``(1, 2, 3)``.
64 | An :py:class:`rdflib_hdt.HDTDocument` allows for searching RDF triples and retrieving them in this format, using the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method, which takes the same parameters as the :py:meth:`rdflib_hdt.HDTDocument.search` method.
65 |
66 | .. note:: You can transform RDF terms or RDF triples to/from TripleIDs using the :meth:`rdflib_hdt.HDTDocument.from_tripleid`, :py:meth:`rdflib_hdt.HDTDocument.to_tripleid`, :meth:`rdflib_hdt.HDTDocument.term_to_id`, and :meth:`rdflib_hdt.HDTDocument.id_to_term` methods.
67 |
68 | .. code-block:: python
69 |
70 | from rdflib_hdt import HDTDocument
71 | document = HDTDocument("test.hdt")
72 |
73 | (triples, cardinality) = document.search_ids((None, None, None))
74 |
75 | for s, p, o in triples:
76 | print(s, p, o) # will print 3-element tuples of integers
77 |
78 | # convert a triple ID to a string format
79 | print(f"TripleID {(s, p, o)} = RDF Triple {document.from_tripleid(s, p, o)}")
80 |
81 | # print only the subject
82 | print(f"ID {s} = Term {document.id_to_term(s, 0)}")
83 |
84 | Join evaluation
85 | ^^^^^^^^^^^^^^^
86 |
87 | An HDT document also provides support for evaluating joins over a set of triples patterns.
88 |
89 | .. code-block:: python
90 |
91 | from rdflib_hdt import HDTDocument
92 | from rdflib import Variable
93 | from rdflib.namespace import FOAF, RDF
94 |
95 | document = HDTDocument("test.hdt")
96 |
97 | # find the names of two entities that know each other
98 | tp_a = (Variable("a"), FOAF("knows"), Variable("b"))
99 | tp_b = (Variable("a"), FOAF("name"), Variable("name"))
100 | tp_c = (Variable("b"), FOAF("name"), Variable("friend"))
101 | query = set([tp_a, tp_b, tp_c])
102 |
103 | iterator = document.search_join(query)
104 | print(f"Estimated join cardinality: {len(iterator)}")
105 |
106 | # Join results are produced as ResultRow, like in the RDFlib SPARQL API
107 | for row in iterator:
108 | print(f"{row.name} knows {row.friend}")
109 |
110 | Ordering
111 | ^^^^^^^^^^^
112 |
113 | When searching for triples (either in a string or TripleID format), results are ordered by (subject, predicate, object).
114 | However, this order is **not** an order on string values, but an order on **triple ids**.
115 | For example, ``("ex:2", "ex:type", "ex:Person") < ("ex:1", "ex:type", "ex:Person")``,
116 | because their triple ids counterparts are ``(1, 2, 3)`` and ``(2, 2, 3)``.
117 |
118 | For more details about this topic, please refer to the `HDT journal article `_.
119 |
120 | Handling non UTF-8 strings in Python
121 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
122 |
123 | If the HDT document is encoded without valid UTF-8 encoding and it does not run in *safe mode*, querying the document may raise an
124 | ``UnicodeDecodeError``. More details on how to convert string
125 | from C++ to Python `here`_
126 |
127 | To allow fine control over string conversion in this case, we doubled the API of the HDT document by adding the following methods (Parameters and documentation are the same as their standard counterparts).
128 |
129 | - :py:meth:`rdflib_hdt.HDTDocument.search_triples_bytes` returns an iterator of triples as ``(py::bytes, py::bytes, py::bytes)``
130 | - :py:meth:`rdflib_hdt.HDTDocument.search_join_bytes` returns an iterator of sets of solutions mapping as ``py::set(py::bytes, py::bytes)``
131 | - :py:meth:`rdflib_hdt.HDTDocument.convert_tripleid_bytes` returns a triple as: ``(py::bytes, py::bytes, py::bytes)``
132 | - :py:meth:`rdflib_hdt.HDTDocument.convert_id_bytes` returns a ``py::bytes``
133 |
134 | .. code:: python
135 |
136 | from rdflib_hdt import HDTDocument
137 |
138 | # Load an HDT file.
139 | # Missing indexes are generated automatically, add False as the second argument to disable them
140 | document = HDTDocument("test.hdt")
141 | it = document.search_triple_bytes("", "", "")
142 |
143 | for s, p, o in it:
144 | print(s, p, o) # print b'...', b'...', b'...'
145 | # now decode it, or handle any error
146 | try:
147 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8')
148 | except UnicodeDecodeError as err:
149 | # try another other codecs
150 | pass
151 |
152 | .. _here: https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html
153 |
--------------------------------------------------------------------------------
/docs/source/hdtstore.rst:
--------------------------------------------------------------------------------
1 | Querying HDT documents
2 | ======================
3 |
4 | Getting started
5 | ---------------
6 |
7 | The primary way of using ``rdflib-hdt`` is the :py:class:`rdflib_hdt.HDTStore` class.
8 | Upon creation, it searches for an index file in the same directory than the HDT file you wish to load.
9 | For example, if you load a file */home/awesome-user/test.hdt*, :py:class:`rdflib_hdt.HDTDocument` will look for the index file
10 | */home/awesome-user/test.hdt.index.v1-1*.
11 |
12 | .. warning:: By default, an HDTStore discards RDF Terms with invalid UTF-8 encoding. You can change this behavior with the `safe_mode` parameter of the constructor.
13 |
14 | .. note:: Missing indexes are generated automatically, but be careful, as it requires to load all HDT triples in memory!
15 |
16 | .. code-block:: python
17 |
18 | from rdflib import Graph
19 | from rdflib_hdt import HDTStore
20 | from rdflib.namespace import FOAF
21 |
22 | # Load an HDT file. Missing indexes are generated automatically
23 | # You can provide the index file by putting them in the same directory than the HDT file.
24 | store = HDTStore("test.hdt")
25 |
26 | # Display some metadata about the HDT document itself
27 | print(f"Number of RDF triples: {len(store)}")
28 | print(f"Number of subjects: {store.nb_subjects}")
29 | print(f"Number of predicates: {store.nb_predicates}")
30 | print(f"Number of objects: {store.nb_objects}")
31 | print(f"Number of shared subject-object: {store.nb_shared}")
32 |
33 | Executing SPARQL queries
34 | ------------------------
35 |
36 | Using the RDFlib API, you can also `execute SPARQL queries `_ over an HDT document.
37 | If you do so, we recommend that you first call the :py:func:`rdflib_hdt.optimize_sparql` function, which optimize
38 | the RDFlib SPARQL query engine in the context of HDT documents.
39 |
40 | .. code-block:: python
41 |
42 | from rdflib import Graph
43 | from rdflib_hdt import HDTStore, optimize_sparql
44 |
45 | # Calling this function optimizes the RDFlib SPARQL engine for HDT documents
46 | optimize_sparql()
47 |
48 | graph = Graph(store=HDTStore("test.hdt"))
49 |
50 | # You can execute SPARQL queries using the regular RDFlib API
51 | qres = graph.query("""
52 | PREFIX foaf:
53 | SELECT ?name ?friend WHERE {
54 | ?a foaf:knows ?b.
55 | ?a foaf:name ?name.
56 | ?b foaf:name ?friend.
57 | }""")
58 |
59 | for row in qres:
60 | print(f"{row.name} knows {row.friend}")
61 |
62 | .. note::
63 | Calling the :py:func:`rdflib_hdt.optimize_sparql` function triggers a global modification of the RDFlib SPARQL engine.
64 | However, executing SPARQL queries using other RDFlib stores will continue to work as before,
65 | so you can safely call this function at the beginning of your code.
66 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | |rdflib-htd logo|
2 |
3 | Read and query HDT document with rdflib
4 | ======================================================
5 |
6 | |Build Status| |PyPI version|
7 |
8 | A Store back-end for `rdflib `_ to allow for reading and querying HDT documents.
9 |
10 | Getting started
11 | ==================
12 |
13 | .. toctree::
14 | :maxdepth: 3
15 |
16 | installation
17 | hdtstore
18 | hdtdocument
19 | api
20 |
21 | Indices and tables
22 | ==================
23 |
24 | * :ref:`genindex`
25 | * :ref:`modindex`
26 | * :ref:`search`
27 |
28 | .. |Build Status| image:: https://github.com/RDFLib/rdflib-hdt/workflows/Python%20tests/badge.svg
29 | :target: https://github.com/RDFLib/rdflib-hdt/actions?query=workflow%3A%22Python+tests%22
30 | .. |PyPI version| image:: https://badge.fury.io/py/rdflib-hdt.svg
31 | :target: https://badge.fury.io/py/rdflib-hdt
32 | .. |rdflib-htd logo| image:: https://raw.githubusercontent.com/RDFLib/rdflib-hdt/master/docs/source/_static/rdflib-hdt-250.png
33 | :target: https://rdflib.dev/rdflib-hdt/
34 |
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | =============
3 |
4 | Requirements
5 | ^^^^^^^^^^^^
6 |
7 | * Python *version 3.6.4 or higher*
8 | * `pip `_
9 | * **gcc/clang** with **c++11 support**
10 | * **Python Development headers**
11 |
12 | .. note::
13 | You must have the `Python.h` header available on your system.
14 | For example, for Python 3.4, install the `python3.4-dev` package on Debian/Ubuntu systems.
15 |
16 | Installation
17 | ^^^^^^^^^^^^^
18 |
19 | Installation using `pipenv `_ or a `virtualenv `_ is **strongly advised!**
20 |
21 | PyPi installation (recommended)
22 | -------------------------------
23 |
24 | .. code-block:: bash
25 |
26 | # you can install using pip
27 | pip install rdflib-hdt
28 |
29 | # or you can use pipenv
30 | pipenv install rdflib-hdt
31 |
32 | Manual installation
33 | -------------------
34 |
35 | **Requirement:** `pipenv `_
36 |
37 | .. code-block:: bash
38 |
39 | git clone https://github.com/Callidon/pyHDT
40 | cd pyHDT/
41 | ./install.sh
42 |
--------------------------------------------------------------------------------
/include/docstrings.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * docstrings.hpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #ifndef PYHDT_DOCSTRINGS_HPP
7 | #define PYHDT_DOCSTRINGS_HPP
8 |
9 | #include
10 |
11 | const char *MODULE_DOC = R"(
12 | The hdt module enables to load and query HDT files with ease.
13 | )";
14 |
15 | /**
16 | * Enums docstrings
17 | */
18 |
19 | const char *IDENTIFIER_POSITION_DOC = R"(
20 | An enum used to indicate the position (subject, predicate or object) of an Object identifier.
21 |
22 | Possibles values:
23 | - ``IdentifierPosition.Subject``: the subject position
24 | - ``IdentifierPosition.Predicate``: the subject position
25 | - ``IdentifierPosition.Object``: the object position
26 |
27 | .. code-block:: python
28 |
29 | from hdt import IdentifierPosition
30 | print(IdentifierPosition.Subject)
31 | print(IdentifierPosition.Predicate)
32 | print(IdentifierPosition.Object)
33 |
34 | )";
35 |
36 | /**
37 | * HDT Document docstrings
38 | */
39 |
40 | const char *HDT_DOCUMENT_CLASS_DOC = R"(
41 | An HDTDocument enables to load and query a HDT file.
42 |
43 | Constructor:
44 | - file ``str``: Path to the HDT file to load.
45 | - predicate ``boolean``: True if additional indexes must be loaded, False otherwise.
46 | )";
47 |
48 | const char *HDT_DOCUMENT_GETFILEPATH_DOC = R"(
49 | Return the path to the HDT file currently loaded
50 | )";
51 |
52 | const char *HDT_DOCUMENT_GETNBTRIPLES_DOC = R"(
53 | Return the total number of triples in the HDT document
54 | )";
55 |
56 | const char *HDT_DOCUMENT_GETNBSUBJECTS_DOC = R"(
57 | Return the number of subjects in the HDT document
58 | )";
59 |
60 | const char *HDT_DOCUMENT_GETNBPREDICATES_DOC = R"(
61 | Return the number of predicates in the HDT document
62 | )";
63 |
64 | const char *HDT_DOCUMENT_GETNBOBJECTS_DOC = R"(
65 | Return the number of objects in the HDT document
66 | )";
67 |
68 | const char *HDT_DOCUMENT_GETNBSHARED_DOC = R"(
69 | Return the number of shared subject-object in the HDT document
70 | )";
71 |
72 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_DOC = R"(
73 | Search for RDF triples matching the triple pattern { ``subject`` ``predicate`` ``object`` },
74 | with an optional ``limit`` and ``offset``.
75 | Use empty strings (``""``) to indicate wildcards.
76 |
77 | Args:
78 | - subject ``str``: The subject of the triple pattern to seach for.
79 | - predicate ``str``: The predicate of the triple pattern to seach for.
80 | - obj ``str``: The object of the triple pattern ot seach for.
81 | - limit ``int`` ``optional``: Maximum number of triples to search for.
82 | - offset ``int`` ``optional``: Number of matching triples to skip before returning results.
83 |
84 | Return:
85 | A 2-elements ``tuple`` (:class:`hdt.TripleIterator`, estimated pattern cardinality), where
86 | the TripleIterator iterates over matching RDF triples.
87 |
88 | A RDF triple itself is a 3-elements ``tuple`` (subject, predicate, object).
89 |
90 | .. code-block:: python
91 |
92 | from hdt import HDTDocument
93 | document = HDTDocument("test.hdt")
94 |
95 | # Fetch all triples that matches { ?s ?p ?o }
96 | (triples, cardinality) = document.search_triples("", "", "")
97 |
98 | print("cardinality of { ?s ?p ?o }: %i" % cardinality)
99 | for triple in triples:
100 | print(triple)
101 |
102 | )";
103 |
104 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC = R"(
105 | Same as :meth:`hdt.HDTDocument.search_triples`, but RDF triples are represented as unique ids (from the HDT Dictionnary).
106 | Use the integer `0` to indicate wildcards.
107 |
108 | Mapping between ids and RDF terms is done using :meth:`hdt.HDTDocument.convert_id`, :meth:`hdt.HDTDocument.convert_term` and :meth:`hdt.HDTDocument.convert_tripleid`.
109 |
110 | Args:
111 | - subject ``int``: The Object identifier of the triple pattern's subject.
112 | - predicate ``int``: The Object identifier of the triple pattern's predicate.
113 | - obj ``int``: The Object identifier of the triple pattern's object.
114 | - limit ``int`` ``optional``: Maximum number of triples to search for.
115 | - offset ``int`` ``optional``: Number of matching triples to skip before returning results.
116 |
117 | Return:
118 | A 2-elements ``tuple`` (:class:`hdt.TripleIDIterator`, estimated pattern cardinality), where
119 | the TripleIDIterator iterates over matching RDF triples IDs.
120 |
121 | A RDF triple ID itself is a 3-elements ``tuple`` (subjectID, predicateID, objectID).
122 |
123 | .. code-block:: python
124 |
125 | from hdt import HDTDocument
126 | document = HDTDocument("test.hdt")
127 |
128 | pred = document.convert_term("http://xmlns.com/foaf/0.1/")
129 | # Fetch all RDF triples that matches { ?s foaf:name ?o }
130 | (triples, cardinality) = document.search_triples_ids(0, pred, 0)
131 |
132 | print("cardinality of { ?s foaf:name ?o }: %i" % cardinality)
133 | for triple in triples:
134 | print(triple)
135 |
136 | )";
137 |
138 | const char *HDT_DOCUMENT_SEARCH_JOIN_DOC = R"(
139 | Evaluate a join between a set of triple patterns using an iterator.
140 | A triple pattern itself is a 3-elements ``tuple`` (subject, predicate, object), where SPARQL variables, i.e., join predicates, are prefixed by a ``?``.
141 |
142 | Args:
143 | - patterns ``set``: set of triple patterns.
144 |
145 | Return:
146 | A :class:`hdt.JoinIterator`, which can be consumed as a Python iterator to evaluates the join.
147 |
148 | .. code-block:: python
149 |
150 | from hdt import HDTDocument
151 | document = HDTDocument("test.hdt")
152 |
153 | # find all actors with their names in the HDT document
154 | tp_a = ("?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://example.org#Actor")
155 | tp_b = ("?s", "http://xmlns.com/foaf/0.1/name", "?name")
156 | iterator = document.search_join(set([tp_a, tp_b]))
157 |
158 | print("estimated join cardinality : %i" % len(iterator))
159 | for mappings in iterator:
160 | print(mappings)
161 |
162 | )";
163 |
164 | const char *HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC = R"(
165 | Transform a RDF triple from a TripleID representation to a string representation.
166 |
167 | Args:
168 | - subject ``int``: unique ID of the subject.
169 | - predicate ``int``: unique ID of the predicate.
170 | - obj ``int``: unique ID of the object.
171 |
172 | Return:
173 | A triple in string representation, i.e., a 3-elements ``tuple`` (subject, predicate, object)
174 |
175 | .. code-block:: python
176 |
177 | from hdt import HDTDocument
178 | document = HDTDocument("test.hdt")
179 |
180 | # Fetch all triples that matches { ?s foaf:name ?o }
181 | pred = document.convert_term("http://xmlns.com/foaf/0.1/")
182 | (triples, cardinality) = document.search_triples_ids(0, pred, 0)
183 |
184 | for s, p, o in triples:
185 | print(s, p, o) # will print Object identifiers, i.e., integers
186 | # convert a triple ID to a string format
187 | print(document.convert_tripleid(s, p, o))
188 |
189 | )";
190 |
191 | const char *HDT_DOCUMENT_CONVERT_ID_DOC = R"(
192 | Transform an Object Identifier to a RDF term.
193 | Such identifier are used in TripleID.
194 |
195 | Args:
196 | - id ``int``: Object identifier.
197 | - position :class:`hdt.IdentifierPosition`: Identifier position.
198 |
199 | Return:
200 | The RDF term associated with the Object Identifier, i.e., either an URI or a RDF literal.
201 |
202 | .. code-block:: python
203 |
204 | from hdt import HDTDocument, IdentifierPosition
205 | document = HDTDocument("test.hdt")
206 | print(document.convert_id(10, IdentifierPosition.Subject))
207 |
208 | )";
209 |
210 | const char *HDT_DOCUMENT_CONVERT_TERM_DOC = R"(
211 | Transform an RDF Term to the associated Object Identifier.
212 | Such identifier are used in TripleID.
213 |
214 | Args:
215 | - term ``str``: RDF Term.
216 | - position :class:`hdt.IdentifierPosition`: Identifier position.
217 |
218 | Return:
219 | The Object Identifier associated with the RDF Term
220 |
221 | .. code-block:: python
222 |
223 | from hdt import HDTDocument, IdentifierPosition
224 | document = HDTDocument("test.hdt")
225 | print(document.convert_term("http://example.org#Alice", IdentifierPosition.Subject))
226 |
227 | )";
228 |
229 | /**
230 | * TripleIterator & TripleIDIterator docstrings
231 | */
232 |
233 | const char *TRIPLE_ITERATOR_CLASS_DOC = R"(
234 | A TripleIterator iterates over triples in a HDT file matching a triple pattern, with an optional limit & offset.
235 |
236 | Such iterator is returned by :meth:`hdt.HDTDocument.search_triples`.
237 | )";
238 |
239 | const char *TRIPLE_ID_ITERATOR_CLASS_DOC = R"(
240 | A TripleIDIterator iterates over triples' IDs in a HDT file matching a triple pattern, with an optional limit & offset.
241 |
242 | Such iterator is returned by :meth:`hdt.HDTDocument.search_triples_ids`
243 |
244 | Conversion from a tuple of triple ids into a RDF triple is done using :meth:`hdt.HDTDocument.convert_tripleid`.
245 | )";
246 |
247 | const char *TRIPLE_ITERATOR_NEXT_DOC = R"(
248 | Return the next matching triple read by the iterator, or raise ``StopIterator`` if there is no more items to yield.
249 | )";
250 |
251 | const char *TRIPLE_ITERATOR_PEEK_DOC = R"(
252 | Return the next matching triple read by the iterator without advancing it, or raise ``StopIterator`` if there is no more items to yield.
253 | )";
254 |
255 | const char *TRIPLE_ITERATOR_HASNEXT_DOC = R"(
256 | Return true if the iterator still has items to yield, false otherwise.
257 | )";
258 |
259 | const char *TRIPLE_ITERATOR_GETSUBJECT_DOC = R"(
260 | Return the subject of the triple pattern currently evaluated.
261 | )";
262 |
263 | const char *TRIPLE_ITERATOR_GETPREDICATE_DOC = R"(
264 | Return the predicate of the triple pattern currently evaluated.
265 | )";
266 |
267 | const char *TRIPLE_ITERATOR_GETOBJECT_DOC = R"(
268 | Return the object of the triple pattern currently evaluated.
269 | )";
270 |
271 | const char *TRIPLE_ITERATOR_GETLIMIT_DOC = R"(
272 | Return the limit of the iterator, i.e., the maximum number of items the iterator will yield.
273 | A limit of 0 indicates that the iterator limit is the cardinality of the triple pattern currently evaluated.
274 | )";
275 |
276 | const char *TRIPLE_ITERATOR_GETOFFSET_DOC = R"(
277 | Return the offset of the iterator, i.e., the number of items the iterator will first skip before yielding.
278 | An offset of 0 indicates that the iterator will not skip any items.
279 | )";
280 |
281 | const char *TRIPLE_ITERATOR_NBREADS_DOC = R"(
282 | Return the number of items read by the iterator until now.
283 | Do not include any offset, thus the real position of the iterator in the collection of triples can be computed as offset + nb_reads
284 | )";
285 |
286 | const char *TRIPLE_ITERATOR_SIZE_DOC = R"(
287 | Get a hint on the cardinality of the triple pattern currently evaluated.
288 | The iterator's limit and offset are not taken into account.
289 |
290 | Return:
291 | A 2-element ``tuple`` (integer, boolean), where the left member is the estimated cardinality,
292 | and the right member is True is the estimation is accurate, False otherwise
293 | )";
294 |
295 | const char *TRIPLE_ITERATOR_ACC_ESTIMATION_DOC = R"(
296 | Return True if the iterator can accuratly estimate the cardinality of the triple pattern, False otherwise.
297 | )";
298 |
299 | const char *JOIN_ITERATOR_CLASS_DOC = R"(
300 | A JoinIterator iterates over the set of solution mappings for a join between several triple patterns. It implements the Python iterator protocol and yields sets of solutions mappings.
301 |
302 | Such iterator is returned by :meth:`hdt.HDTDocument.search_join`
303 | )";
304 |
305 | const char *JOIN_ITERATOR_NEXT_DOC = R"(
306 | Return the next set of solution mappings read by the iterator, or raise ``StopIterator`` if there is no more items to yield.
307 | )";
308 |
309 | const char *JOIN_ITERATOR_HAS_NEXT_DOC = R"(
310 | Return true if the iterator still has items to yield, false otherwise.
311 | )";
312 |
313 | const char *JOIN_ITERATOR_SIZE_DOC = R"(
314 | Return the estimated join cardinality.
315 | )";
316 |
317 | const char *JOIN_ITERATOR_RESET_DOC = R"(
318 | Reset the join, i.e., move the iterator back to its initial state.
319 | )";
320 |
321 | #endif /* PYHDT_DOCSTRINGS_HPP */
322 |
--------------------------------------------------------------------------------
/include/hdt_document.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * hdt_document.hpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #ifndef PYHDT_DOCUMENT_HPP
7 | #define PYHDT_DOCUMENT_HPP
8 |
9 | #include
10 | #include "HDT.hpp"
11 | #include "QueryProcessor.hpp"
12 | #include "pyhdt_types.hpp"
13 | #include "triple_iterator.hpp"
14 | #include "triple_iterator_bytes.hpp"
15 | #include "tripleid_iterator.hpp"
16 | #include "join_iterator.hpp"
17 | #include "join_iterator_bytes.hpp"
18 | #include
19 | #include
20 | #include
21 | namespace py = pybind11;
22 |
23 | // The result of a search for a triple pattern in a HDT document:
24 | // a tuple (matching RDF triples, nb of matching RDF triples)
25 | typedef std::tuple search_results;
26 |
27 | // The result of a search for a triple pattern in a HDT document:
28 | // a tuple (matching RDF triples, nb of matching RDF triples)
29 | typedef std::tuple search_results_bytes;
30 |
31 | // Same as seach_results, but for an iterator over triple ids
32 | typedef std::tuple search_results_ids;
33 |
34 | /*!
35 | * HDTDocument is the main entry to manage an hdt document
36 | * \author Thomas Minier
37 | */
38 | class HDTDocument {
39 | private:
40 | std::string hdt_file;
41 | hdt::HDT *hdt;
42 | hdt::QueryProcessor *processor;
43 | HDTDocument(std::string file, bool map, bool indexed);
44 |
45 | public:
46 | /*!
47 | * Destructor
48 | */
49 | ~HDTDocument();
50 |
51 | /*!
52 | * Get the path to the HDT file currently loaded
53 | * @return The path to the HDT file currently loaded
54 | */
55 | std::string getFilePath();
56 |
57 | /*!
58 | * Implementation for Python function "__repr__"
59 | * @return A string representation of the object
60 | */
61 | std::string python_repr();
62 |
63 | /*!
64 | * Get the total number of triples in the HDT document
65 | * @return The total number of triples in the HDT document
66 | */
67 | unsigned int getNbTriples();
68 |
69 | /*!
70 | * Get the number of distinct subjects in the HDT document
71 | * @return The number of distinct subjects in the HDT document
72 | */
73 | unsigned int getNbSubjects();
74 |
75 | /*!
76 | * Get the number of distinct predicates in the HDT document
77 | * @return The number of distinct predicates in the HDT document
78 | */
79 | unsigned int getNbPredicates();
80 |
81 | /*!
82 | * Get the number of distinct objects in the HDT document
83 | * @return The number of distinct objects in the HDT document
84 | */
85 | unsigned int getNbObjects();
86 |
87 | /*!
88 | * Get the number of shared subjects-objects in the HDT document
89 | * @return The number of shared subjects-objects in the HDT document
90 | */
91 | unsigned int getNbShared();
92 |
93 | /*!
94 | * Static factory method used to create a new HDT Document
95 | * @param file - Path to the HDT file
96 | * @param map - True maps the HDT file (faster), False loads everything in memory
97 | * @param indexed - True if the HDT must be loaded with indexes, False otherwise
98 | */
99 | static HDTDocument create(std::string file, bool map, bool indexed) {
100 | return HDTDocument(file, map, indexed);
101 | }
102 |
103 | /*!
104 | * Convert a TripleID to a string RDF triple
105 | * @param subject - Triple's subject
106 | * @param predicate - Triple's predicate
107 | * @param object - Triple's object
108 | * @return The associated RDF triple
109 | */
110 | triple convertTripleID(unsigned int subject, unsigned int predicate,
111 | unsigned int object);
112 |
113 | /**
114 | * Convert an Object Identifier into the equivalent an RDF term
115 | * @param id - Object Identifier
116 | * @param pos - Identifier position (subject, predicate or object)
117 | * @return The an RDF term equivalent to the Object Identifier
118 | */
119 | string convertID(unsigned int id, IdentifierPosition pos);
120 |
121 | /**
122 | * Convert an RDF term into the associated an Object Identifier.
123 | * @param term - RDF Term in string format
124 | * @param pos - Identifier position (subject, predicate or object)
125 | * @return The Object Identifier associated with the RDF term
126 | */
127 | unsigned int convertTerm(std::string term, IdentifierPosition pos);
128 |
129 | /*!
130 | * Search all matching triples for a triple pattern, whith an optional limit and offset.
131 | * Returns a tuple
132 | * @param subject - Triple pattern's subject
133 | * @param predicate - Triple pattern's predicate
134 | * @param object - Triple pattern's object
135 | * @param limit - (Optional) Maximum number of matching triples to read
136 | * @param offset - (Optional) Number of matching triples to skip
137 | * @return A tuple (TripleIterator*, cardinality)
138 | */
139 | search_results search(std::string subject, std::string predicate,
140 | std::string object, unsigned int limit = 0,
141 | unsigned int offset = 0);
142 |
143 | /*!
144 | * Same as HDTDocument#search, but search for TripleIDs instead.
145 | * Returns a tuple
146 | * @param subject - Triple pattern's subject identifier
147 | * @param predicate - Triple pattern's predicate identifier
148 | * @param object - Triple pattern's object identifier
149 | * @param limit - (Optional) Maximum number of matching triples to read
150 | * @param offset - (Optional) Number of matching triples to skip
151 | * @return A tuple (TripleIDIterator*, cardinality)
152 | */
153 | search_results_ids searchIDs(unsigned int subject, unsigned int predicate,
154 | unsigned int object, unsigned int limit = 0,
155 | unsigned int offset = 0);
156 |
157 | /**
158 | * Evaluate a join between a set of triple patterns using a JoinIterator.
159 | * @param patterns - Set of triple patterns
160 | * @return A JoinIterator* used to evaluated the join.
161 | */
162 | JoinIterator * searchJoin(std::vector patterns);
163 |
164 | // ============== BYTES REPRESENTATION ==============
165 | // Author: Arnaud GRALL - MIT License 2017-2019
166 | /*!
167 | * Search all matching triples for a triple pattern, whith an optional limit and offset. Returns bytes instead of string
168 | * Returns a tuple
169 | * @param subject - Triple pattern's subject
170 | * @param predicate - Triple pattern's predicate
171 | * @param object - Triple pattern's object
172 | * @param limit - (Optional) Maximum number of matching triples to read
173 | * @param offset - (Optional) Number of matching triples to skip
174 | * @return A tuple (TripleIterator*, cardinality)
175 | */
176 | search_results_bytes searchBytes(std::string subject, std::string predicate,
177 | std::string object, unsigned int limit = 0,
178 | unsigned int offset = 0);
179 | /**
180 | * Evaluate a join between a set of triple patterns using a JoinIterator.
181 | * @param patterns - Set of triple patterns
182 | * @return A JoinIterator* used to evaluated the join.
183 | */
184 | JoinIteratorBytes * searchJoinBytes(std::vector patterns);
185 | /*!
186 | * Convert a TripleID to a RDF triple as bytes
187 | * @param subject - Triple's subject
188 | * @param predicate - Triple's predicate
189 | * @param object - Triple's object
190 | * @return The associated RDF triple
191 | */
192 | triple_bytes convertTripleIDBytes(unsigned int subject, unsigned int predicate,
193 | unsigned int object);
194 |
195 | /**
196 | * Convert an Object Identifier into the equivalent an RDF term as bytes
197 | * @param id - Object Identifier
198 | * @param pos - Identifier position (subject, predicate or object)
199 | * @return The an RDF term equivalent to the Object Identifier
200 | */
201 | py::bytes convertIDBytes(unsigned int id, IdentifierPosition pos);
202 | };
203 |
204 | #endif /* PYHDT_DOCUMENT_HPP */
205 |
--------------------------------------------------------------------------------
/include/join_iterator.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * join_iterator.hpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #ifndef JOIN_ITERATOR_HPP
7 | #define JOIN_ITERATOR_HPP
8 |
9 | #include "pyhdt_types.hpp"
10 | #include "QueryProcessor.hpp"
11 | #include
12 |
13 | /*!
14 | * JoinIterator iterates over solution bindings of a join
15 | * @author Thomas Minier
16 | */
17 | class JoinIterator {
18 | private:
19 | hdt::VarBindingString *iterator;
20 | bool hasNextSolution = true;
21 |
22 | public:
23 | /*!
24 | * Constructor
25 | * @param iterator [description]
26 | */
27 | JoinIterator(hdt::VarBindingString *_it);
28 |
29 | /*!
30 | * Destructor
31 | */
32 | ~JoinIterator();
33 |
34 | /*!
35 | * Implementation for Python function "__repr__"
36 | * @return [description]
37 | */
38 | std::string python_repr();
39 |
40 | /*!
41 | * Implementation for Python function "__iter__"
42 | * @return [description]
43 | */
44 | JoinIterator *python_iter();
45 |
46 | /**
47 | * Get the estimated join cardinality
48 | * @return [description]
49 | */
50 | size_t estimatedCardinality();
51 |
52 | /**
53 | * Reset the iterator into its initial state and restart join processing.
54 | */
55 | void reset();
56 |
57 | /*!
58 | * Return true if the iterator still has items available, False otherwise.
59 | * @return [description]
60 | */
61 | bool hasNext();
62 |
63 | /**
64 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
65 | * has ended. Used to implement Python Itertor protocol.
66 | * @return [description]
67 | */
68 | solution_bindings next();
69 |
70 | };
71 |
72 | #endif /* JOIN_ITERATOR_HPP */
73 |
--------------------------------------------------------------------------------
/include/join_iterator_bytes.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * join_iterator.hpp
3 | * Author: Arnaud Grall - MIT License 2017-2019
4 | */
5 |
6 | #ifndef JOIN_ITERATOR_BYTES_HPP
7 | #define JOIN_ITERATOR_BYTES_HPP
8 |
9 | #include "pyhdt_types.hpp"
10 | #include "QueryProcessor.hpp"
11 | #include
12 |
13 | /*!
14 | * JoinIterator iterates over solution bindings of a join
15 | * @author Arnaud Grall
16 | */
17 | class JoinIteratorBytes {
18 | private:
19 | hdt::VarBindingString *iterator;
20 | bool hasNextSolution = true;
21 |
22 | public:
23 | /*!
24 | * Constructor
25 | * @param iterator [description]
26 | */
27 | JoinIteratorBytes(hdt::VarBindingString *_it);
28 |
29 | /*!
30 | * Destructor
31 | */
32 | ~JoinIteratorBytes();
33 |
34 | /*!
35 | * Implementation for Python function "__repr__"
36 | * @return [description]
37 | */
38 | std::string python_repr();
39 |
40 | /*!
41 | * Implementation for Python function "__iter__"
42 | * @return [description]
43 | */
44 | JoinIteratorBytes *python_iter();
45 |
46 | /**
47 | * Get the estimated join cardinality
48 | * @return [description]
49 | */
50 | size_t estimatedCardinality();
51 |
52 | /**
53 | * Reset the iterator into its initial state and restart join processing.
54 | */
55 | void reset();
56 |
57 | /*!
58 | * Return true if the iterator still has items available, False otherwise.
59 | * @return [description]
60 | */
61 | bool hasNext();
62 |
63 | /**
64 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
65 | * has ended. Used to implement Python Itertor protocol.
66 | * @return [description]
67 | */
68 | py::set next();
69 |
70 | };
71 |
72 | #endif /* JOIN_ITERATOR_BYTES_HPP */
73 |
--------------------------------------------------------------------------------
/include/pyhdt_types.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * hdt_types.hpp
3 | * Author: Thomas MINIER, Arnaud Grall - MIT License 2017-2019
4 | */
5 |
6 | #ifndef PYHDT_TYPES_HPP
7 | #define PYHDT_TYPES_HPP
8 |
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | namespace py = pybind11;
15 |
16 | /**
17 | * Indictates the position of an Object Identifier
18 | */
19 | enum IdentifierPosition {
20 | Subject = 1,
21 | Predicate = 2,
22 | Object = 3
23 | };
24 |
25 | // A RDF Triple. RDF terms are represented as simple strings by HDT.
26 | typedef std::tuple triple;
27 |
28 | // A RDF triple composed of IDs from HDT dictionnary
29 | typedef std::tuple triple_id;
30 |
31 | // A list of RDF triples
32 | typedef std::list triple_list;
33 |
34 | // A list of RDF triples IDs
35 | typedef std::list triple_ids_list;
36 |
37 | // A hint over the cardinality of a triple pattern
38 | // The right element of the tuple is True if the hint is accurate, False otherwise
39 | typedef std::tuple size_hint;
40 |
41 | typedef std::tuple single_binding;
42 |
43 | typedef std::set *solution_bindings;
44 |
45 | // ============== BYTES REPRESENTATION ==============
46 | // A RDF Triple. RDF terms are represented as simple bytes by HDT.
47 | typedef std::tuple triple_bytes;
48 | // A Set of solutions bindings for the join iterator
49 | typedef py::set solution_bindings_bytes;
50 |
51 | #endif /* PYHDT_TYPES_HPP */
52 |
--------------------------------------------------------------------------------
/include/triple_iterator.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * triple_iterator.hpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #ifndef TRIPLE_ITERATOR_HPP
7 | #define TRIPLE_ITERATOR_HPP
8 |
9 | #include "tripleid_iterator.hpp"
10 | #include "pyhdt_types.hpp"
11 | #include "Dictionary.hpp"
12 | #include
13 |
14 | /*!
15 | * TripleIterator iterates over RDF triples of an HDT document which match a
16 | * triple pattern + limit + offset \author Thomas Minier
17 | */
18 | class TripleIterator {
19 | private:
20 | TripleIDIterator *iterator;
21 | hdt::Dictionary *dictionary;
22 |
23 | public:
24 | /*!
25 | * Constructor
26 | * @param iterator [description]
27 | */
28 | TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict);
29 |
30 | /*!
31 | * Destructor
32 | */
33 | ~TripleIterator();
34 |
35 | /*!
36 | * Implementation for Python function "__repr__"
37 | * @return [description]
38 | */
39 | std::string python_repr();
40 |
41 | /*!
42 | * Get the subject of the triple pattern currently evaluated.
43 | * An empty string represents a variable
44 | * @return [description]
45 | */
46 | std::string getSubject();
47 |
48 | /*!
49 | * Get the predicate of the triple pattern currently evaluated.
50 | * An empty string represents a variable
51 | * @return [description]
52 | */
53 | std::string getPredicate();
54 |
55 | /*!
56 | * Get the object of the triple pattern currently evaluated.
57 | * An empty string represents a variable
58 | * @return [description]
59 | */
60 | std::string getObject();
61 |
62 | /*!
63 | * Get the limit of the current iterator
64 | * @return [description]
65 | */
66 | unsigned int getLimit();
67 |
68 | /*!
69 | * Get the offset of the current iterator
70 | * @return [description]
71 | */
72 | unsigned int getOffset();
73 |
74 | /*!
75 | * Get the number of results read by the iterator
76 | * @return [description]
77 | */
78 | unsigned int getNbResultsRead();
79 |
80 | /*!
81 | * Implementation for Python function "__iter__"
82 | * @return [description]
83 | */
84 | TripleIterator *python_iter();
85 |
86 | /*!
87 | * Get the estimated cardinality of the pattern currently evaluated.
88 | * Offset & limit are not taken into account.
89 | * @return [description]
90 | */
91 | size_hint sizeHint();
92 |
93 | /*!
94 | * Return true if the iterator still has items available, False otherwise.
95 | * @return [description]
96 | */
97 | bool hasNext();
98 |
99 | /**
100 | * Get the next item in the iterator, or raise py::StopIteration if the
101 | * iterator has ended
102 | * @return [description]
103 | */
104 | triple next();
105 |
106 | /**
107 | * Get the next item in the iterator, or raise py::StopIteration if the
108 | * iterator has ended, but without advancing the iterator.
109 | * @return [description]
110 | */
111 | triple peek();
112 | };
113 |
114 | #endif /* TRIPLE_ITERATOR_HPP */
115 |
--------------------------------------------------------------------------------
/include/triple_iterator_bytes.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * triple_iterator_bytes.hpp
3 | * Author: Arnaud GRALL - MIT License 2017-2019
4 | */
5 |
6 | #ifndef TRIPLE_ITERATOR_BYTES_HPP
7 | #define TRIPLE_ITERATOR_BYTES_HPP
8 |
9 | #include "tripleid_iterator.hpp"
10 | #include "pyhdt_types.hpp"
11 | #include "Dictionary.hpp"
12 | #include
13 |
14 | /*!
15 | * TripleIterator iterates over RDF triples of an HDT document which match a
16 | * triple pattern + limit + offset \author Thomas Minier
17 | */
18 | class TripleIteratorBytes {
19 | private:
20 | TripleIDIterator *iterator;
21 | hdt::Dictionary *dictionary;
22 |
23 | public:
24 | /*!
25 | * Constructor
26 | * @param iterator [description]
27 | */
28 | TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict);
29 |
30 | /*!
31 | * Destructor
32 | */
33 | ~TripleIteratorBytes();
34 |
35 | /*!
36 | * Implementation for Python function "__repr__"
37 | * @return [description]
38 | */
39 | std::string python_repr();
40 |
41 | /*!
42 | * Get the subject of the triple pattern currently evaluated.
43 | * An empty string represents a variable
44 | * @return [description]
45 | */
46 | std::string getSubject();
47 |
48 | /*!
49 | * Get the predicate of the triple pattern currently evaluated.
50 | * An empty string represents a variable
51 | * @return [description]
52 | */
53 | std::string getPredicate();
54 |
55 | /*!
56 | * Get the object of the triple pattern currently evaluated.
57 | * An empty string represents a variable
58 | * @return [description]
59 | */
60 | std::string getObject();
61 |
62 | /*!
63 | * Get the limit of the current iterator
64 | * @return [description]
65 | */
66 | unsigned int getLimit();
67 |
68 | /*!
69 | * Get the offset of the current iterator
70 | * @return [description]
71 | */
72 | unsigned int getOffset();
73 |
74 | /*!
75 | * Get the number of results read by the iterator
76 | * @return [description]
77 | */
78 | unsigned int getNbResultsRead();
79 |
80 | /*!
81 | * Implementation for Python function "__iter__"
82 | * @return [description]
83 | */
84 | TripleIteratorBytes *python_iter();
85 |
86 | /*!
87 | * Get the estimated cardinality of the pattern currently evaluated.
88 | * Offset & limit are not taken into account.
89 | * @return [description]
90 | */
91 | size_hint sizeHint();
92 |
93 | /*!
94 | * Return true if the iterator still has items available, False otherwise.
95 | * @return [description]
96 | */
97 | bool hasNext();
98 |
99 | /**
100 | * Get the next item in the iterator, or raise py::StopIteration if the
101 | * iterator has ended
102 | * @return [description]
103 | */
104 | triple_bytes next();
105 |
106 | /**
107 | * Get the next item in the iterator, or raise py::StopIteration if the
108 | * iterator has ended, but without advancing the iterator.
109 | * @return [description]
110 | */
111 | triple_bytes peek();
112 | };
113 |
114 | #endif /* TRIPLE_ITERATOR_BYTES_HPP */
115 |
--------------------------------------------------------------------------------
/include/tripleid_iterator.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * tripleid_iterator.hpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #ifndef TRIPLEID_ITERATOR_HPP
7 | #define TRIPLEID_ITERATOR_HPP
8 |
9 | #include "pyhdt_types.hpp"
10 | #include
11 | #include
12 |
13 | /*!
14 | * TripleIDIterator iterates over IDs of RDF triples of an HDT document which
15 | * match a triple pattern + limit + offset \author Thomas Minier
16 | */
17 | class TripleIDIterator {
18 | private:
19 | std::string subject;
20 | std::string predicate;
21 | std::string object;
22 | unsigned int limit;
23 | unsigned int offset;
24 | hdt::IteratorTripleID *iterator;
25 | triple_id _bufferedTriple;
26 | bool hasBufferedTriple = false;
27 | unsigned int resultsRead = 0;
28 |
29 | public:
30 | /*!
31 | * Constructor
32 | * @param iterator [description]
33 | */
34 | TripleIDIterator(hdt::IteratorTripleID *_it, std::string _subj,
35 | std::string _pred, std::string _obj, unsigned int _limit,
36 | unsigned int _offset);
37 |
38 | /*!
39 | * Destructor
40 | */
41 | ~TripleIDIterator();
42 |
43 | /*!
44 | * Implementation for Python function "__repr__"
45 | * @return [description]
46 | */
47 | std::string python_repr();
48 |
49 | /*!
50 | * Get the subject of the triple pattern currently evaluated.
51 | * @return [description]
52 | */
53 | std::string getSubject();
54 |
55 | /*!
56 | * Get the predicate of the triple pattern currently evaluated.
57 | * @return [description]
58 | */
59 | std::string getPredicate();
60 |
61 | /*!
62 | * Get the object of the triple pattern currently evaluated.
63 | * @return [description]
64 | */
65 | std::string getObject();
66 |
67 | /*!
68 | * Get the limit of the current iterator
69 | * @return [description]
70 | */
71 | unsigned int getLimit();
72 |
73 | /*!
74 | * Get the offset of the current iterator
75 | * @return [description]
76 | */
77 | unsigned int getOffset();
78 |
79 | /*!
80 | * Get the number of results read by the iterator
81 | * @return [description]
82 | */
83 | unsigned int getNbResultsRead();
84 |
85 | /*!
86 | * Implementation for Python function "__iter__"
87 | * @return [description]
88 | */
89 | TripleIDIterator *python_iter();
90 |
91 | /*!
92 | * Get the estimated cardinality of the pattern currently evaluated.
93 | * Offset & limit are not taken into account.
94 | * @return [description]
95 | */
96 | size_hint sizeHint();
97 |
98 | /*!
99 | * Return true if the iterator still has items available, False otherwise.
100 | * @return [description]
101 | */
102 | bool hasNext();
103 |
104 | /**
105 | * Get the next item in the iterator, or raise py::StopIteration if the
106 | * iterator has ended
107 | * @return [description]
108 | */
109 | triple_id next();
110 |
111 | /**
112 | * Get the next item in the iterator, or raise py::StopIteration if the
113 | * iterator has ended, but without advancing the iterator.
114 | * @return [description]
115 | */
116 | triple_id peek();
117 | };
118 |
119 | #endif /* TRIPLEID_ITERATOR_HPP */
120 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # scripts for automated installation
3 |
4 | echo "Validating dependencies..."
5 | command -v python >/dev/null 2>&1 || { echo >&2 "Python is required for the installation of rdflib_hdt! Aborting installation..."; exit 1; }
6 | command -v pip >/dev/null 2>&1 || { echo >&2 "pip is required for the installation of rdflib_hdt! Aborting installation..."; exit 1; }
7 | command -v curl >/dev/null 2>&1 || { echo >&2 "curl is required for the installation of rdflib_hdt! Aborting installation..."; exit 1; }
8 | command -v unzip >/dev/null 2>&1 || { echo >&2 "unzip is required for the installation of rdflib_hdt! Aborting installation..."; exit 1; }
9 |
10 | echo "Installing dependencies..."
11 | pipenv install
12 |
13 | echo "Installing pyHDT..."
14 | pipenv run python setup.py install
15 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "rdflib_hdt"
3 | authors = [{ name = "Thomas Minier", email = "tminier01@gmail.com" }]
4 | description = "A Store back-end for rdflib to allow for reading and querying HDT documents"
5 | keywords = ["rdflib", "hdt", "rdf", "semantic web", "search"]
6 | readme = "README.rst"
7 | license = { text = "MIT License" }
8 | dynamic = ["version"]
9 | dependencies = ["rdflib>=4.2", "pybind11>=2.2.4"]
10 |
11 | [project.urls]
12 | homepage = "https://rdflib.dev/rdflib-hdt"
13 | repository = "https://github.com/RDFLib/rdflib-hdt.git"
14 |
15 | [build-system]
16 | requires = ["pybind11", "setuptools >= 40.8.0", "wheel"]
17 | build-backend = "setuptools.build_meta"
18 |
--------------------------------------------------------------------------------
/rdflib_hdt/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "HDTDocument",
3 | "HDTStore",
4 | "HDTIterator",
5 | "HDTJoinIterator",
6 | "optimize_sparql"
7 | ]
8 |
9 | from rdflib_hdt.hdt_document import HDTDocument
10 | from rdflib_hdt.iterators import HDTIterator, HDTJoinIterator
11 | from rdflib_hdt.hdt_store import HDTStore
12 | from rdflib_hdt.sparql_op import optimize_sparql
13 |
--------------------------------------------------------------------------------
/rdflib_hdt/hdt_document.py:
--------------------------------------------------------------------------------
1 | """
2 | rdflib_hdt.hdt_document
3 | =======================
4 | """
5 | from typing import Optional, Tuple, Union
6 |
7 | import hdt
8 | from rdflib_hdt.iterators import HDTIterator, HDTJoinIterator
9 | from rdflib_hdt.mapping import rdflib_to_hdt, term_to_rdflib
10 | from rdflib_hdt.types import BGP, SearchQuery, Term
11 |
12 |
13 | class TermKindError(NameError):
14 | """An error raised when an invalid Term position identifier is used"""
15 | pass
16 |
17 |
18 | class HDTDocument(hdt.HDTDocument):
19 | """An HDT document, in read-only mode.
20 |
21 | This class is a wrapper over the original hdt.HDTDocument class,
22 | which aligns it with the RDFlib data model.
23 |
24 | .. warning:: By default, an HDTDocument discards RDF Terms with invalid UTF-8 encoding. You can change this behavior with the `safe_mode` parameter of the constructor.
25 |
26 | Args:
27 | - path: Absolute path to the HDT file to load.
28 | - mapped: True if the document must be mapped on disk, False to load it in memory.
29 | - indexed: True if the document must be indexed. Indexed must be located in the same directory as the HDT file. Missing indexes are automatically generated at startup.
30 | - safe_mode: True if Unicode errors should be ignored, False otherwise.
31 | """
32 | def __init__(self, path: str, mapped: bool = True, indexed: bool = True, safe_mode=True):
33 | super(HDTDocument, self).__init__(path, mapped, indexed)
34 | self._safe_mode = safe_mode
35 |
36 | def is_safe(self) -> bool:
37 | """Return True if the HDT document ignores Unicode errors, False otherwise."""
38 | return self._safe_mode
39 |
40 | def from_tripleid(self, triple_id: Union[int, int, int]) -> Term:
41 | """Transform an RDF triple from a TripleID representation to an RDFlib representation.
42 |
43 | Argument:
44 | - triple_id: 3-tuple of IDs (s, p, o)
45 |
46 | Return:
47 | A triple in RDFlib representation, i.e., a 3-tuple of RDFlib terms.
48 | """
49 | s, p, o = super().convert_tripleid(triple_id[0], triple_id[1], triple_id[2])
50 | return (term_to_rdflib(s), term_to_rdflib(p), term_to_rdflib(o))
51 |
52 | def to_tripleid(self, triple: SearchQuery) -> Tuple[int, int, int]:
53 | """Transform a triple (pattern) from an RDFlib representation to a TripleID.
54 |
55 | It can be used to transform an RDFlib query before feeding it
56 | into the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method.
57 |
58 | Argument:
59 | - triple: 3-tuple of RDF Terms. Use `None` to indicate wildcards.
60 |
61 | Return:
62 | A triple in TripleID representation, i.e., a 3-tuple of integers
63 | """
64 | subj = super().convert_term(rdflib_to_hdt(triple[0]), hdt.Subject) if triple[0] is not None else 0
65 | pred = super().convert_term(rdflib_to_hdt(triple[1]), hdt.Predicate) if triple[1] is not None else 0
66 | obj = super().convert_term(rdflib_to_hdt(triple[2]), hdt.Object) if triple[2] is not None else 0
67 | return (subj, pred, obj)
68 |
69 | def term_to_id(self, term: Term, kind: int) -> int:
70 | """Transform a RDF term from an RDFlib representation to an unique ID, as used in a TripleID.
71 |
72 | It can be used in interaction with the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method.
73 |
74 | Argument:
75 | - term: The RDF term to transform.
76 | - kind: The term position: `0` for subjects, `1` for predicates and `2` for objects.
77 |
78 | Return:
79 | An ID representation of the RDF Term.
80 | """
81 | str_term = rdflib_to_hdt(term) if term is not None else 0
82 | if kind == 0:
83 | return super().convert_term(str_term, hdt.IdentifierPosition.Subject)
84 | elif kind == 1:
85 | return super().convert_term(str_term, hdt.IdentifierPosition.Predicate)
86 | elif kind == 2:
87 | return super().convert_term(str_term, hdt.IdentifierPosition.Object)
88 | else:
89 | raise TermKindError(f"The position {kind} is not a valid Term kind (0 for subjects, 1 for predicates and 2 for objects)")
90 |
91 | def id_to_term(self, term_id: int, kind: int) -> Term:
92 | """Transform a RDF term from an unique ID, as used in a TripleID, to an RDFlib representation.
93 |
94 | It can be used in interaction with the :py:meth:`rdflib_hdt.HDTDocument.search_ids` method.
95 |
96 | Argument:
97 | - term_id: The Term ID to transform.
98 | - kind: The term position: `0` for subjects, `1` for predicates and `2` for objects.
99 |
100 | Return:
101 | An RDFlib representation of the RDF Term.
102 | """
103 | term = None
104 | if kind == 0:
105 | term = super().convert_id(term_id, hdt.IdentifierPosition.Subject)
106 | elif kind == 1:
107 | term = super().convert_id(term_id, hdt.IdentifierPosition.Predicate)
108 | elif kind == 2:
109 | term = super().convert_id(term_id, hdt.IdentifierPosition.Object)
110 | else:
111 | raise TermKindError(f"The position {kind} is not a valid Term kind (0 for subjects, 1 for predicates and 2 for objects)")
112 | return term_to_rdflib(term)
113 |
114 | def search(self, query: SearchQuery, limit=0, offset=0) -> Tuple[HDTIterator, int]:
115 | """Search for RDF triples matching the query triple pattern, with an optional limit and offset. Use `None` for wildcards/variables.
116 |
117 | Args:
118 | - query: The triple pattern (s, p, o) to search. Use `None` to indicate wildcards/variables.
119 | - limit: (optional) Maximum number of triples to search.
120 | - offset: (optional) Number of matching triples to skip before returning results.
121 |
122 | Return:
123 | A 2-elements tuple (iterator, estimated pattern cardinality), where
124 | the iterator is a generator of matching RDF triples. An RDF triple itself is a 3-elements tuple (subject, predicate, object) of RDF terms (in rdflib format).
125 | """
126 | subj = rdflib_to_hdt(query[0]) if query[0] is not None else ""
127 | pred = rdflib_to_hdt(query[1]) if query[1] is not None else ""
128 | obj = rdflib_to_hdt(query[2]) if query[2] is not None else ""
129 | triples, cardinality = super().search_triples(subj, pred, obj, limit=limit, offset=offset)
130 | iterator = HDTIterator(triples, safe_mode=self._safe_mode)
131 | return iterator, cardinality
132 |
133 | def search_ids(self, query: Union[Optional[int], Optional[int], Optional[int]], limit=0, offset=0) -> Tuple[hdt.TripleIDIterator, int]:
134 | """Same as :meth:`rdflib_hdt.HDTDocument.search_triples`, but RDF triples are represented as unique ids (from the HDT Dictionnary). Use `None` or `0` to indicate wildcards/variables.
135 |
136 | Mapping between ids and RDF terms is done using the :meth:`rdflib_hdt.HDTDocument.from_tripleid`, :py:meth:`rdflib_hdt.HDTDocument.to_tripleid`, :meth:`rdflib_hdt.HDTDocument.term_to_id`, and :meth:`rdflib_hdt.HDTDocument.id_to_term` methods.
137 |
138 | Args:
139 | - query: A tuple of triple patterns IDs (s, p, o) to search. Use `None` or `0` to indicate wildcards/variables.
140 | - limit: (optional) Maximum number of triples to search.
141 | - offset: (optional) Number of matching triples to skip before returning results.
142 |
143 | Return:
144 | A 2-elements tuple (iterator, estimated pattern cardinality), where
145 | the iterator is a generator of matching RDF triples. An RDF triple itself is a 3-elements tuple (subject, predicate, object) of IDs (positive integers from the HDT Dictionnary).
146 | """
147 | subj = query[0] if query[0] is not None else 0
148 | pred = query[1] if query[1] is not None else 0
149 | obj = query[2] if query[2] is not None else 0
150 | return super().search_triples_ids(subj, pred, obj, limit=limit, offset=offset)
151 |
152 | def search_join(self, patterns: BGP) -> hdt.JoinIterator:
153 | """Evaluate a join between a set of triple patterns using an iterator.
154 | A triple pattern itself is a 3-elements ``tuple`` (subject, predicate, object) of RDFlib terms with at least one SPARQL variable.
155 |
156 | Argument: A set of triple patterns.
157 |
158 | Return:
159 | A :py:class:`rdflib_hdt.HDTJoinIterator` which produces :py:class:`rdflib.query.Results`, per the Python iteration protocol.
160 | """
161 | bgp = [(rdflib_to_hdt(s), rdflib_to_hdt(p), rdflib_to_hdt(o)) for s, p, o in patterns]
162 | join_iterator = super().search_join(bgp)
163 | return HDTJoinIterator(join_iterator, safe_mode=self._safe_mode)
164 |
--------------------------------------------------------------------------------
/rdflib_hdt/hdt_store.py:
--------------------------------------------------------------------------------
1 | """
2 | rdflib_hdt.hdt_store
3 | =======================
4 | """
5 | from typing import Iterable
6 |
7 | from rdflib.store import Store
8 |
9 | from rdflib_hdt.hdt_document import HDTDocument
10 | from rdflib_hdt.types import Triple
11 |
12 |
13 | class HDTStore(Store):
14 | """An implementation of a Store over a HDT document.
15 |
16 | It is heavily inspired by the work from @FlorianLudwig (https://github.com/RDFLib/rdflib/issues/894)
17 |
18 | .. warning:: By default, an HDTStore discards RDF Terms with invalid UTF-8 encoding. You can change this behavior with the `safe_mode` parameter of the constructor.
19 |
20 | Args:
21 | - path: Absolute path to the HDT file to load.
22 | - mapped: True if the document must be mapped on disk, False to load it in memory.
23 | - indexed: True if the document must be indexed. Indexed must be located in the same directory as the HDT file. Missing indexes are automatically generated at startup.
24 | - safe_mode: True if Unicode errors should be ignored, False otherwise.
25 | """
26 | def __init__(self, path: str, mapped: bool = True, indexed: bool = True, safe_mode=True, configuration=None, identifier=None):
27 | super(HDTStore, self).__init__(configuration=configuration, identifier=identifier)
28 | self._hdt_document = HDTDocument(path, mapped=mapped, indexed=indexed, safe_mode=safe_mode)
29 |
30 | @property
31 | def hdt_document(self) -> HDTDocument:
32 | """The HDT document used to read and query the HDT file."""
33 | return self._hdt_document
34 |
35 | def is_safe(self) -> bool:
36 | """Return True if the HDT store ignores Unicode errors, False otherwise."""
37 | return self._hdt_document.is_safe()
38 |
39 | def __len__(self, context) -> int:
40 | """The number of RDF triples in the HDT store."""
41 | return self._hdt_document.total_triples
42 |
43 | @property
44 | def nb_subjects(self) -> int:
45 | """The number of subjects in the HDT store."""
46 | return self._hdt_document.nb_subjects
47 |
48 | @property
49 | def nb_predicates(self) -> int:
50 | """The number of predicates in the HDT store."""
51 | return self._hdt_document.nb_predicates
52 |
53 | @property
54 | def nb_objects(self) -> int:
55 | """The number of objects in the HDT store."""
56 | return self._hdt_document.nb_objects
57 |
58 | @property
59 | def nb_shared(self) -> int:
60 | """The number of shared subject-object in the HDT store."""
61 | return self._hdt_document.nb_shared
62 |
63 | def triples(self, pattern, context) -> Iterable[Triple]:
64 | """Search for a triple pattern in a HDT store.
65 |
66 | Args:
67 | - pattern: The triple pattern (s, p, o) to search.
68 | - context: The query execution context.
69 |
70 | Returns: An iterator that produces RDF triples matching the input triple pattern.
71 | """
72 | iterator, cardinality = self._hdt_document.search(pattern)
73 | for triple in iterator:
74 | yield triple, None
75 | return
76 |
77 | def create(self, configuration):
78 | raise TypeError('The HDT store is read only!')
79 |
80 | def destroy(self, configuration):
81 | raise TypeError('The HDT store is read only!')
82 |
83 | def commit(self):
84 | raise TypeError('The HDT store is read only!')
85 |
86 | def rollback(self):
87 | raise TypeError('The HDT store is read only!')
88 |
89 | def add(self, _, context=None, quoted=False):
90 | raise TypeError('The HDT store is read only!')
91 |
92 | def addN(self, quads):
93 | raise TypeError('The HDT store is read only!')
94 |
95 | def remove(self, _, context):
96 | raise TypeError('The HDT store is read only!')
97 |
--------------------------------------------------------------------------------
/rdflib_hdt/iterators.py:
--------------------------------------------------------------------------------
1 | """
2 | rdflib_hdt.iterators
3 | =======================
4 |
5 | This module contains iterators that wraps native HDT iterators to the RDFlib data model.
6 | """
7 |
8 | from rdflib.query import ResultRow
9 |
10 | from hdt import JoinIterator, TripleIterator
11 | from rdflib_hdt.mapping import term_to_rdflib
12 | from rdflib_hdt.types import Triple
13 |
14 |
15 | class HDTIterator:
16 | """An iterator that converts HDT matching triples to the RDFlib data model.
17 |
18 | Args:
19 | - input: Input iterator that produces RDF triples with RDF terms in string format.
20 | - safe_mode: True if Unicode errors should be ignored, False otherwise.
21 | """
22 | def __init__(self, input: TripleIterator, safe_mode=True):
23 | super(HDTIterator, self).__init__()
24 | self._input = input
25 | self._safe_mode = safe_mode
26 |
27 | def __len__(self):
28 | """The estimated number of matching RDF triples."""
29 | return len(self._input)
30 |
31 | def __iter__(self):
32 | return self
33 |
34 | def __next__(self) -> Triple:
35 | """Fallback implementation for the Python 2.x iterator protocol."""
36 | return self.next()
37 |
38 | def next(self) -> Triple:
39 | """Produce a new RDF triple, per the Python iterator protocol."""
40 | try:
41 | triple = next(self._input)
42 | if triple is None:
43 | raise StopIteration()
44 | s, p, o = triple
45 | return (term_to_rdflib(s), term_to_rdflib(p), term_to_rdflib(o))
46 | except UnicodeDecodeError as e:
47 | # crash if safe mode is off
48 | if not self._safe_mode:
49 | raise e
50 | # otherwise, try to read a valid RDF triple from the input
51 | return self.next()
52 | except StopIteration as e:
53 | raise e
54 |
55 |
56 | class HDTJoinIterator:
57 | """An iterator that converts HDT join results to the RDFlib data model.
58 |
59 | Args:
60 | - input: Input iterator that yields join results
61 | - safe_mode: True if Unicode errors should be ignored, False otherwise.
62 | """
63 | def __init__(self, input: JoinIterator, safe_mode=True):
64 | super(HDTJoinIterator, self).__init__()
65 | self._input = input
66 | self._safe_mode = safe_mode
67 |
68 | def __len__(self):
69 | """The estimated number of join results."""
70 | return len(self._input)
71 |
72 | def __iter__(self):
73 | return self
74 |
75 | def __next__(self) -> ResultRow:
76 | """Fallback implementation for the Python 2.x iterator protocol."""
77 | return self.next()
78 |
79 | def next(self) -> ResultRow:
80 | """Produce a new row of results, per the Python iterator protocol."""
81 | try:
82 | row = dict()
83 | variables = list()
84 | # convert all solution mappings to the RDFlib data model
85 | for key, value in next(self._input):
86 | rdf_key = term_to_rdflib(key)
87 | rdf_value = term_to_rdflib(value)
88 | variables.append(rdf_key)
89 | row[rdf_key] = rdf_value
90 | return ResultRow(row, variables)
91 | except UnicodeDecodeError as e:
92 | # crash if safe mode is off
93 | if not self._safe_mode:
94 | raise e
95 | # otherwise, try to read a valid RDF triple from the input
96 | return self.next()
97 | except StopIteration as e:
98 | raise e
99 |
--------------------------------------------------------------------------------
/rdflib_hdt/mapping.py:
--------------------------------------------------------------------------------
1 | """
2 | rdflib_hdt.mapping
3 | =======================
4 | Mapping functions between string RDF terms and the RDFlib data model.
5 | """
6 | from rdflib import URIRef, Variable
7 | from rdflib.util import from_n3
8 |
9 | from rdflib_hdt.types import Term
10 |
11 |
12 | def term_to_rdflib(term: str) -> Term:
13 | """Convert an HDT term into its RDFlib representation."""
14 | if term.startswith('?'):
15 | return Variable(term[1:])
16 | elif term.startswith("\""):
17 | return from_n3(term)
18 | else:
19 | return URIRef(term)
20 |
21 |
22 | def rdflib_to_hdt(term: Term) -> str:
23 | """Convert an RDFlib term into an HDT term."""
24 | value = term.n3()
25 | if value.startswith('<') and value.endswith('>'):
26 | return value[1: len(value) - 1]
27 | return value
28 |
--------------------------------------------------------------------------------
/rdflib_hdt/sparql_op.py:
--------------------------------------------------------------------------------
1 | """
2 | rdflib_hdt.sparql_opt
3 | =======================
4 | Provides functions to overrides the RDFlib SPARQL evaluator for HDT documents.
5 | """
6 | import rdflib.plugins.sparql.evaluate as sparql_evaluate
7 | from rdflib import Variable
8 | from rdflib.plugins.sparql.sparql import FrozenBindings, QueryContext
9 |
10 | from rdflib_hdt.hdt_store import HDTStore
11 | from rdflib_hdt.types import BGP
12 |
13 |
14 | def optimize_sparql():
15 | """Overrides the RDFlib SPARQL engine to optimize SPARQL query execution over HDT documents.
16 |
17 | .. note::
18 | Calling this function triggers a global modification of the RDFlib SPARQL engine.
19 | However, executing SPARQL queries using other RDFlib stores will continue to work as before,
20 | so you can safely call this function at the beginning of your code.
21 | """
22 | # copy the default RDFlib function for evaluating Basic Graph Patterns
23 | rdflib_evalBGP = sparql_evaluate.evalBGP
24 |
25 | def __evalBGP__(ctx: QueryContext, bgp: BGP):
26 | # A SPARQL query executed over a non HDTStore is evaluated as usual
27 | if not isinstance(ctx.graph.store, HDTStore):
28 | return rdflib_evalBGP(ctx, bgp)
29 | if not bgp:
30 | yield ctx.solution()
31 | return
32 |
33 | # delegate the join evaluation to HDT
34 | store: HDTStore = ctx.graph.store
35 | for row in store.hdt_document.search_join(set(bgp)):
36 | # convert the ResultRow into a FrozenBindings object
37 | bindings = dict()
38 | for key in row.labels:
39 | bindings[Variable(key)] = row[key]
40 | yield FrozenBindings(ctx, bindings)
41 | return
42 | # overrides RDFlib evalBGP function
43 | sparql_evaluate.evalBGP = __evalBGP__
44 |
--------------------------------------------------------------------------------
/rdflib_hdt/types.py:
--------------------------------------------------------------------------------
1 | """
2 | rdflib_hdt.types
3 | =======================
4 | All commons types found in the rdflib_hdt package
5 | """
6 | from typing import Optional, Set, Tuple, Union
7 | from rdflib import Literal, URIRef, Variable
8 |
9 | Term = Union[URIRef, Literal]
10 | Triple = Tuple[Term, Term, Term]
11 | TriplePattern = Union[URIRef, Literal, Variable]
12 | SearchQuery = Tuple[Optional[Term], Optional[Term], Optional[Term]]
13 | BGP = Set[TriplePattern]
14 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pybind11==2.2.4
2 | rdflib==4.2.2
3 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description_file = README.rst
3 |
4 | [flake8]
5 | ignore = E501
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | # Author: Thomas MINIER - MIT License 2017-2019
3 | from setuptools import find_packages, setup, Extension
4 | from os import listdir, remove
5 | from shutil import unpack_archive, move, rmtree
6 | import urllib.request
7 | import pybind11
8 |
9 | __rdflib_hdt_version__ = "3.2"
10 |
11 | def download_hdt_and_unzip():
12 | print("Downloading HDT...")
13 | urllib.request.urlretrieve("https://github.com/rdfhdt/hdt-cpp/archive/v1.3.3.zip", "v1.3.3.zip")
14 | unpack_archive("v1.3.3.zip", "tmp")
15 | move("tmp/hdt-cpp-1.3.3", "hdt-cpp-1.3.3")
16 | rmtree("tmp")
17 |
18 | download_hdt_and_unzip()
19 |
20 | def list_files(path: str, extension=".cpp", exclude="S.cpp"):
21 | """List paths to all files that ends with a given extension"""
22 | return ["%s/%s" % (path, f) for f in listdir(path) if f.endswith(extension) and (not f.endswith(exclude))]
23 |
24 |
25 | # pyHDT source files
26 | sources = [
27 | "src/hdt.cpp",
28 | "src/hdt_document.cpp",
29 | "src/triple_iterator.cpp",
30 | "src/triple_iterator_bytes.cpp",
31 | "src/tripleid_iterator.cpp",
32 | "src/join_iterator.cpp",
33 | "src/join_iterator_bytes.cpp"
34 | ]
35 |
36 | # HDT source files
37 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/bitsequence")
38 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/coders")
39 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/mapper")
40 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/sequence")
41 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/permutation")
42 | sources += list_files("hdt-cpp-1.3.3/libcds/src/utils")
43 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/bitsequence")
44 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/dictionary")
45 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/hdt")
46 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/header")
47 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/huffman")
48 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/libdcs")
49 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex")
50 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/rdf")
51 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/sequence")
52 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/triples")
53 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/util")
54 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/sparql")
55 |
56 | # pybind11 + pyHDT + libcds + HDT-lib headers
57 | include_dirs = [
58 | pybind11.get_include(),
59 | pybind11.get_include(True),
60 | "include/",
61 | "hdt-cpp-1.3.3/libhdt/include/",
62 | "hdt-cpp-1.3.3/libhdt/src/dictionary/",
63 | "hdt-cpp-1.3.3/libhdt/src/sparql/",
64 | "hdt-cpp-1.3.3/libcds/include/",
65 | "hdt-cpp-1.3.3/libcds/src/static/bitsequence",
66 | "hdt-cpp-1.3.3/libcds/src/static/coders",
67 | "hdt-cpp-1.3.3/libcds/src/static/mapper",
68 | "hdt-cpp-1.3.3/libcds/src/static/permutation",
69 | "hdt-cpp-1.3.3/libcds/src/static/sequence",
70 | "hdt-cpp-1.3.3/libcds/src/utils"
71 | ]
72 |
73 | # Need to build in c++11 minimum
74 | # TODO add a check to use c++14 or c++17 if available
75 | extra_compile_args = ["-std=c++11"]
76 |
77 | # build HDT extension
78 | hdt_extension = Extension("hdt",
79 | sources=sources,
80 | include_dirs=include_dirs,
81 | extra_compile_args=extra_compile_args,
82 | language='c++')
83 |
84 | setup(
85 | version=__rdflib_hdt_version__,
86 | packages=find_packages(exclude=["tests"]),
87 | ext_modules=[hdt_extension]
88 | )
89 |
90 | print("Cleaning up...")
91 | rmtree("hdt-cpp-1.3.3")
92 | remove("v1.3.3.zip")
93 |
--------------------------------------------------------------------------------
/src/hdt.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * hdt.cpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #include
7 | #include
8 |
9 | #include "docstrings.hpp"
10 | #include "hdt_document.hpp"
11 | #include "triple_iterator.hpp"
12 | #include "triple_iterator_bytes.hpp"
13 | #include "tripleid_iterator.hpp"
14 | #include "join_iterator.hpp"
15 | #include "join_iterator_bytes.hpp"
16 |
17 | namespace py = pybind11;
18 |
19 | PYBIND11_MODULE(hdt, m) {
20 | m.doc() = MODULE_DOC;
21 |
22 | py::enum_(m, "IdentifierPosition", IDENTIFIER_POSITION_DOC)
23 | .value("Subject", IdentifierPosition::Subject)
24 | .value("Predicate", IdentifierPosition::Predicate)
25 | .value("Object", IdentifierPosition::Object)
26 | .export_values();
27 |
28 | py::class_(m, "TripleIterator", TRIPLE_ITERATOR_CLASS_DOC)
29 | .def("next", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
30 | .def("__next__", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
31 | .def("peek", &TripleIterator::peek, TRIPLE_ITERATOR_PEEK_DOC)
32 | .def("has_next", &TripleIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC)
33 | .def("size_hint", &TripleIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
34 | .def("__len__", &TripleIterator::sizeHint,
35 | TRIPLE_ITERATOR_SIZE_DOC)
36 | .def("__iter__", &TripleIterator::python_iter)
37 | .def_property_readonly("subject", &TripleIterator::getSubject,
38 | TRIPLE_ITERATOR_GETSUBJECT_DOC)
39 | .def_property_readonly("predicate", &TripleIterator::getPredicate,
40 | TRIPLE_ITERATOR_GETPREDICATE_DOC)
41 | .def_property_readonly("object", &TripleIterator::getObject,
42 | TRIPLE_ITERATOR_GETOBJECT_DOC)
43 | .def_property_readonly("limit", &TripleIterator::getLimit,
44 | TRIPLE_ITERATOR_GETLIMIT_DOC)
45 | .def_property_readonly("offset", &TripleIterator::getOffset,
46 | TRIPLE_ITERATOR_GETOFFSET_DOC)
47 | .def_property_readonly("nb_reads", &TripleIterator::getNbResultsRead,
48 | TRIPLE_ITERATOR_NBREADS_DOC)
49 | .def("__repr__", &TripleIterator::python_repr);
50 |
51 | py::class_(m, "TripleIteratorBytes", TRIPLE_ITERATOR_CLASS_DOC)
52 | .def("next", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC)
53 | .def("__next__", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC)
54 | .def("peek", &TripleIteratorBytes::peek, TRIPLE_ITERATOR_PEEK_DOC)
55 | .def("has_next", &TripleIteratorBytes::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC)
56 | .def("size_hint", &TripleIteratorBytes::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
57 | .def("__len__", &TripleIteratorBytes::sizeHint,
58 | TRIPLE_ITERATOR_SIZE_DOC)
59 | .def("__iter__", &TripleIteratorBytes::python_iter)
60 | .def_property_readonly("subject", &TripleIteratorBytes::getSubject,
61 | TRIPLE_ITERATOR_GETSUBJECT_DOC)
62 | .def_property_readonly("predicate", &TripleIteratorBytes::getPredicate,
63 | TRIPLE_ITERATOR_GETPREDICATE_DOC)
64 | .def_property_readonly("object", &TripleIteratorBytes::getObject,
65 | TRIPLE_ITERATOR_GETOBJECT_DOC)
66 | .def_property_readonly("limit", &TripleIteratorBytes::getLimit,
67 | TRIPLE_ITERATOR_GETLIMIT_DOC)
68 | .def_property_readonly("offset", &TripleIteratorBytes::getOffset,
69 | TRIPLE_ITERATOR_GETOFFSET_DOC)
70 | .def_property_readonly("nb_reads", &TripleIteratorBytes::getNbResultsRead,
71 | TRIPLE_ITERATOR_NBREADS_DOC)
72 | .def("__repr__", &TripleIteratorBytes::python_repr);
73 |
74 | py::class_(m, "TripleIDIterator", TRIPLE_ID_ITERATOR_CLASS_DOC)
75 | .def("next", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
76 | .def("__next__", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
77 | .def("peek", &TripleIDIterator::peek, TRIPLE_ITERATOR_PEEK_DOC)
78 | .def("has_next", &TripleIDIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC)
79 | .def("size_hint", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
80 | .def("__len__", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
81 | .def("__iter__", &TripleIDIterator::python_iter)
82 | .def_property_readonly("subject", &TripleIDIterator::getSubject,
83 | TRIPLE_ITERATOR_GETSUBJECT_DOC)
84 | .def_property_readonly("predicate", &TripleIDIterator::getPredicate,
85 | TRIPLE_ITERATOR_GETPREDICATE_DOC)
86 | .def_property_readonly("object", &TripleIDIterator::getObject,
87 | TRIPLE_ITERATOR_GETOBJECT_DOC)
88 | .def_property_readonly("limit", &TripleIDIterator::getLimit,
89 | TRIPLE_ITERATOR_GETLIMIT_DOC)
90 | .def_property_readonly("offset", &TripleIDIterator::getOffset,
91 | TRIPLE_ITERATOR_GETOFFSET_DOC)
92 | .def_property_readonly("nb_reads", &TripleIDIterator::getNbResultsRead,
93 | TRIPLE_ITERATOR_NBREADS_DOC)
94 | .def("__repr__", &TripleIDIterator::python_repr);
95 |
96 | py::class_(m, "JoinIterator", JOIN_ITERATOR_CLASS_DOC)
97 | .def("next", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC)
98 | .def("has_next", &JoinIterator::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC)
99 | .def("cardinality", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
100 | .def("reset", &JoinIterator::reset, JOIN_ITERATOR_RESET_DOC)
101 | .def("__len__", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
102 | .def("__next__", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC)
103 | .def("__iter__", &JoinIterator::python_iter)
104 | .def("__repr__", &JoinIterator::python_repr);
105 |
106 | py::class_(m, "JoinIteratorBytes", JOIN_ITERATOR_CLASS_DOC)
107 | .def("next", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC)
108 | .def("has_next", &JoinIteratorBytes::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC)
109 | .def("cardinality", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
110 | .def("reset", &JoinIteratorBytes::reset, JOIN_ITERATOR_RESET_DOC)
111 | .def("__len__", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
112 | .def("__next__", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC)
113 | .def("__iter__", &JoinIteratorBytes::python_iter)
114 | .def("__repr__", &JoinIteratorBytes::python_repr);
115 |
116 | py::class_(m, "HDTDocument", HDT_DOCUMENT_CLASS_DOC)
117 | .def(py::init(&HDTDocument::create), py::arg("file"),
118 | py::arg("map") = true,
119 | py::arg("indexed") = true)
120 | .def_property_readonly("file_path", &HDTDocument::getFilePath,
121 | HDT_DOCUMENT_GETFILEPATH_DOC)
122 | .def_property_readonly("total_triples", &HDTDocument::getNbTriples,
123 | HDT_DOCUMENT_GETNBTRIPLES_DOC)
124 | .def_property_readonly("nb_subjects", &HDTDocument::getNbSubjects,
125 | HDT_DOCUMENT_GETNBSUBJECTS_DOC)
126 | .def_property_readonly("nb_predicates", &HDTDocument::getNbPredicates,
127 | HDT_DOCUMENT_GETNBPREDICATES_DOC)
128 | .def_property_readonly("nb_objects", &HDTDocument::getNbObjects,
129 | HDT_DOCUMENT_GETNBOBJECTS_DOC)
130 | .def_property_readonly("nb_shared", &HDTDocument::getNbShared,
131 | HDT_DOCUMENT_GETNBSHARED_DOC)
132 | .def("search_triples", &HDTDocument::search,
133 | HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"),
134 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0,
135 | py::arg("offset") = 0)
136 | .def("search_join", &HDTDocument::searchJoin, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns"))
137 | .def("search_triples_ids", &HDTDocument::searchIDs,
138 | HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC, py::arg("subject"),
139 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0,
140 | py::arg("offset") = 0)
141 | .def("convert_tripleid", &HDTDocument::convertTripleID,
142 | HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC,
143 | py::arg("subject"), py::arg("predicate"), py::arg("object"))
144 | .def("convert_id", &HDTDocument::convertID, HDT_DOCUMENT_CONVERT_ID_DOC,
145 | py::arg("id"), py::arg("position"))
146 | .def("convert_term", &HDTDocument::convertTerm, HDT_DOCUMENT_CONVERT_TERM_DOC,
147 | py::arg("term"), py::arg("position"))
148 | // ========= BYTES REPRESENTATION =========
149 | .def("search_triples_bytes", &HDTDocument::searchBytes,
150 | HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"),
151 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0,
152 | py::arg("offset") = 0)
153 | .def("search_join_bytes", &HDTDocument::searchJoinBytes, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns"))
154 | .def("convert_tripleid_bytes", &HDTDocument::convertTripleIDBytes,
155 | HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC,
156 | py::arg("subject"), py::arg("predicate"), py::arg("object"))
157 | .def("convert_id_bytes", &HDTDocument::convertIDBytes, HDT_DOCUMENT_CONVERT_ID_DOC,
158 | py::arg("id"), py::arg("position"))
159 | .def("__len__", &HDTDocument::getNbTriples, HDT_DOCUMENT_GETNBTRIPLES_DOC)
160 | .def("__repr__", &HDTDocument::python_repr);
161 |
162 | }
163 |
--------------------------------------------------------------------------------
/src/hdt_document.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * hdt_document.cpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #include "hdt_document.hpp"
7 | #include "triple_iterator.hpp"
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | namespace py = pybind11;
15 | using namespace hdt;
16 |
17 | /*!
18 | * Skip `offset` items from an iterator, optimized for HDT iterators.
19 | * @param it - Iterator which should skip items
20 | * @param offset - How many items to skip
21 | * @param cardinality - (Estimated) number of results
22 | */
23 | template
24 | inline void applyOffset(T *it, unsigned int offset, unsigned int cardinality) {
25 | if (offset > 0 && offset >= cardinality) {
26 | // hdt does not allow to skip past beyond the estimated nb of results,
27 | // so we may have a few results to skip manually
28 | unsigned int remainingSteps = offset - cardinality + 1;
29 | it->skip(cardinality - 1);
30 | while (it->hasNext() && remainingSteps > 0) {
31 | it->next();
32 | remainingSteps--;
33 | }
34 | } else if (offset > 0) {
35 | it->skip(offset);
36 | }
37 | }
38 |
39 | /*!
40 | * Returns true if a file is readable, false otherwise
41 | * @param name - Path to the file to test
42 | * @return true if the file is readable, false otherwise
43 | */
44 | inline bool file_exists(const std::string &name) {
45 | std::ifstream f(name.c_str());
46 | bool result = f.good();
47 | f.close();
48 | return result;
49 | }
50 |
51 | /*!
52 | * Constructor
53 | * @param file - Path to HDT file to load
54 | * @param map - True maps the HDT file (faster), False loads everything in memory
55 | * @param indexed - True if the HDT must be loaded with indexes, False otherwise
56 | */
57 | HDTDocument::HDTDocument(std::string file, bool map, bool indexed) {
58 | hdt_file = file;
59 | if (!file_exists(file)) {
60 | throw std::runtime_error("Cannot open HDT file '" + file + "': Not Found!");
61 | }
62 |
63 | if(!map && indexed) {
64 | hdt = HDTManager::loadIndexedHDT(file.c_str());
65 | } else if(!map && !indexed) {
66 | hdt = HDTManager::loadHDT(file.c_str());
67 | } else if(map && indexed){
68 | hdt = HDTManager::mapIndexedHDT(file.c_str());
69 | } else {
70 | hdt = HDTManager::mapHDT(file.c_str());
71 | }
72 | processor = new QueryProcessor(hdt);
73 | }
74 |
75 | /*!
76 | * Destructor
77 | */
78 | HDTDocument::~HDTDocument() {}
79 |
80 | /*!
81 | * Get the path to the HDT file currently loaded
82 | * @return The path to the HDT file currently loaded
83 | */
84 | std::string HDTDocument::getFilePath() { return hdt_file; }
85 |
86 | /*!
87 | * Implementation for Python function "__repr__"
88 | * @return A string representation of the object
89 | */
90 | std::string HDTDocument::python_repr() {
91 | return "";
93 | }
94 |
95 | /*!
96 | * Search all matching triples for a triple pattern, whith an optional limit and offset.
97 | * Returns a tuple
98 | * @param subject - Triple pattern's subject
99 | * @param predicate - Triple pattern's predicate
100 | * @param object - Triple pattern's object
101 | * @param limit - (Optional) Maximum number of matching triples to read
102 | * @param offset - (Optional) Number of matching triples to skip
103 | * @return A tuple (TripleIterator*, cardinality)
104 | */
105 | search_results HDTDocument::search(std::string subject,
106 | std::string predicate,
107 | std::string object,
108 | unsigned int limit,
109 | unsigned int offset) {
110 | unsigned int idSubject = 0;
111 | unsigned int idPredicate = 0;
112 | unsigned int idObject = 0;
113 |
114 | if (!subject.empty()) {
115 | idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT);
116 | }
117 |
118 | if (!predicate.empty()) {
119 | idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE);
120 | }
121 |
122 | if (!object.empty()) {
123 | idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT);
124 | }
125 |
126 | TripleIDIterator *it;
127 | size_t cardinality = 0;
128 |
129 | // if a non-variable term was not found in the dictionnary, then the search yield nothing
130 | if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) {
131 | it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset);
132 | } else {
133 | // build a TripleIDIterator to fetch results
134 | TripleID tp(idSubject, idPredicate, idObject);
135 | IteratorTripleID *source = hdt->getTriples()->search(tp);
136 | cardinality = source->estimatedNumResults();
137 | applyOffset(source, offset, cardinality);
138 | it = new TripleIDIterator(source, subject, predicate, object, limit, offset);
139 | }
140 | // wraps the TripleIDIterator in order to convert OID triples back to RDF triples
141 | TripleIterator *resultIterator = new TripleIterator(it, hdt->getDictionary());
142 | return std::make_tuple(resultIterator, cardinality);
143 | }
144 |
145 | /*!
146 | * Same as HDTDocument#search, but search for a TripleIDs instead.
147 | * Returns a tuple
148 | * @param subject - Triple pattern's subject identifier
149 | * @param predicate - Triple pattern's predicate identifier
150 | * @param object - Triple pattern's object identifier
151 | * @param limit - (Optional) Maximum number of matching triples to read
152 | * @param offset - (Optional) Number of matching triples to skip
153 | * @return A tuple (TripleIDIterator*, cardinality)
154 | */
155 | search_results_ids HDTDocument::searchIDs(unsigned int subject,
156 | unsigned int predicate,
157 | unsigned int object,
158 | unsigned int limit,
159 | unsigned int offset) {
160 | TripleID tp(subject, predicate, object);
161 | // get RDF terms associated with each ID for metadata
162 | std::string strSubject = std::string("?s");
163 | std::string strPredicate = std::string("?p");
164 | std::string strObject = std::string("?o");
165 |
166 | if (subject != 0) {
167 | strSubject = hdt->getDictionary()->idToString(subject, hdt::SUBJECT);
168 | }
169 | if (predicate != 0) {
170 | strPredicate = hdt->getDictionary()->idToString(predicate, hdt::PREDICATE);
171 | }
172 | if (object != 0) {
173 | strObject = hdt->getDictionary()->idToString(object, hdt::OBJECT);
174 | }
175 |
176 | IteratorTripleID *it;
177 | size_t cardinality = 0;
178 |
179 | // if a non-variable term was not found in the dictionnary, then the search yield nothing
180 | if ((strSubject.empty() && subject != 0) || (strPredicate.empty() && predicate != 0) || (strObject.empty() && object != 0)) {
181 | it = new IteratorTripleID();
182 | } else {
183 | // build iterator
184 | it = hdt->getTriples()->search(tp);
185 | cardinality = it->estimatedNumResults();
186 | // apply offset
187 | applyOffset(it, offset, cardinality);
188 | }
189 | TripleIDIterator *resultIterator = new TripleIDIterator(it, strSubject, strPredicate, strObject, limit, offset);
190 | return std::make_tuple(resultIterator, cardinality);
191 | }
192 |
193 | /*!
194 | * Get the total number of triples in the HDT document
195 | * @return The total number of triples in the HDT document
196 | */
197 | unsigned int HDTDocument::getNbTriples() {
198 | return hdt->getTriples()->getNumberOfElements();
199 | }
200 |
201 | /*!
202 | * Get the number of distinct subjects in the HDT document
203 | * @return The number of distinct subjects in the HDT document
204 | */
205 | unsigned int HDTDocument::getNbSubjects() {
206 | return hdt->getDictionary()->getNsubjects();
207 | }
208 |
209 | /*!
210 | * Get the number of distinct predicates in the HDT document
211 | * @return The number of distinct predicates in the HDT document
212 | */
213 | unsigned int HDTDocument::getNbPredicates() {
214 | return hdt->getDictionary()->getNpredicates();
215 | }
216 |
217 | /*!
218 | * Get the number of distinct objects in the HDT document
219 | * @return The number of distinct objects in the HDT document
220 | */
221 | unsigned int HDTDocument::getNbObjects() {
222 | return hdt->getDictionary()->getNobjects();
223 | }
224 |
225 | /*!
226 | * Get the number of shared subjects-objects in the HDT document
227 | * @return The number of shared subjects-objects in the HDT document
228 | */
229 | unsigned int HDTDocument::getNbShared() {
230 | return hdt->getDictionary()->getNshared();
231 | }
232 |
233 | /*!
234 | * Convert a TripleID to a string RDF triple
235 | * @param subject - Triple's subject
236 | * @param predicate - Triple's predicate
237 | * @param object - Triple's object
238 | * @return The associated RDF triple
239 | */
240 | triple HDTDocument::convertTripleID(unsigned int subject, unsigned int predicate,
241 | unsigned int object) {
242 | return std::make_tuple(
243 | hdt->getDictionary()->idToString(subject, hdt::SUBJECT),
244 | hdt->getDictionary()->idToString(predicate, hdt::PREDICATE),
245 | hdt->getDictionary()->idToString(object, hdt::OBJECT));
246 | }
247 |
248 | /**
249 | * Convert an Object Identifier into the equivalent URI/Literal value
250 | * @param id - Object Identifier
251 | * @param pos - Identifier position (subject, predicate or object)
252 | * @return The URI/Literal equivalent to the Object Identifier
253 | */
254 | string HDTDocument::convertID(unsigned int id, IdentifierPosition pos) {
255 | switch (pos) {
256 | case IdentifierPosition::Subject:
257 | return hdt->getDictionary()->idToString(id, hdt::SUBJECT);
258 | case IdentifierPosition::Predicate:
259 | return hdt->getDictionary()->idToString(id, hdt::PREDICATE);
260 | case IdentifierPosition::Object:
261 | return hdt->getDictionary()->idToString(id, hdt::OBJECT);
262 | default:
263 | throw std::runtime_error("Invalid Object Identifier exception");
264 | }
265 | }
266 |
267 | /**
268 | * Convert an RDF term into the associated an Object Identifier.
269 | * @param term - RDF Term in string format
270 | * @param pos - Identifier position (subject, predicate or object)
271 | * @return The Object Identifier associated with the RDF term
272 | */
273 | unsigned int HDTDocument::convertTerm(std::string term, IdentifierPosition pos) {
274 | switch (pos) {
275 | case IdentifierPosition::Subject:
276 | return hdt->getDictionary()->stringToId(term, hdt::SUBJECT);
277 | case IdentifierPosition::Predicate:
278 | return hdt->getDictionary()->stringToId(term, hdt::PREDICATE);
279 | case IdentifierPosition::Object:
280 | return hdt->getDictionary()->stringToId(term, hdt::OBJECT);
281 | default:
282 | throw std::runtime_error("Invalid Object Identifier exception");
283 | }
284 | }
285 |
286 | /**
287 | * Evaluate a join between a set of triple patterns using a JoinIterator.
288 | * @param patterns - Set of triple patterns
289 | * @return A JoinIterator* used to evaluated the join.
290 | */
291 | JoinIterator * HDTDocument::searchJoin(std::vector patterns) {
292 | set vars {};
293 | vector joinPatterns {};
294 | std::string subj, pred, obj;
295 |
296 | for (auto it = patterns.begin(); it != patterns.end(); it++) {
297 | // unpack pattern
298 | std::tie(subj, pred, obj) = *it;
299 | // add variables
300 | if (subj.at(0) == '?') {
301 | vars.insert(subj);
302 | }
303 | if (pred.at(0) == '?') {
304 | vars.insert(pred);
305 | }
306 | if (obj.at(0) == '?') {
307 | vars.insert(obj);
308 | }
309 | // build join pattern
310 | TripleString pattern(subj, pred, obj);
311 | joinPatterns.push_back(pattern);
312 | }
313 |
314 | VarBindingString *iterator = processor->searchJoin(joinPatterns, vars);
315 | return new JoinIterator(iterator);
316 | }
317 |
318 | // ============= BYTES REPRSENTATION ============
319 | /*!
320 | * Search all matching triples for a triple pattern, whith an optional limit and offset. Triple as bytes triples (b'...', b'...', b'...')
321 | * Returns a tuple
322 | * @param subject - Triple pattern's subject
323 | * @param predicate - Triple pattern's predicate
324 | * @param object - Triple pattern's object
325 | * @param limit - (Optional) Maximum number of matching triples to read
326 | * @param offset - (Optional) Number of matching triples to skip
327 | * @return A tuple (TripleIterator*, cardinality)
328 | */
329 | search_results_bytes HDTDocument::searchBytes(std::string subject,
330 | std::string predicate,
331 | std::string object,
332 | unsigned int limit,
333 | unsigned int offset) {
334 | unsigned int idSubject = 0;
335 | unsigned int idPredicate = 0;
336 | unsigned int idObject = 0;
337 |
338 | if (!subject.empty()) {
339 | idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT);
340 | }
341 |
342 | if (!predicate.empty()) {
343 | idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE);
344 | }
345 |
346 | if (!object.empty()) {
347 | idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT);
348 | }
349 |
350 | TripleIDIterator *it;
351 | size_t cardinality = 0;
352 |
353 | // if a non-variable term was not found in the dictionnary, then the search yield nothing
354 | if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) {
355 | it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset);
356 | } else {
357 | // build a TripleIDIterator to fetch results
358 | TripleID tp(idSubject, idPredicate, idObject);
359 | IteratorTripleID *source = hdt->getTriples()->search(tp);
360 | cardinality = source->estimatedNumResults();
361 | applyOffset(source, offset, cardinality);
362 | it = new TripleIDIterator(source, subject, predicate, object, limit, offset);
363 | }
364 | // wraps the TripleIDIterator in order to convert OID triples back to RDF triples
365 | TripleIteratorBytes *resultIterator = new TripleIteratorBytes(it, hdt->getDictionary());
366 | return std::make_tuple(resultIterator, cardinality);
367 | }
368 |
369 | /**
370 | * Evaluate a join between a set of triple patterns using a JoinIterator.
371 | * @param patterns - Set of triple patterns
372 | * @return A JoinIterator* used to evaluated the join.
373 | */
374 | JoinIteratorBytes * HDTDocument::searchJoinBytes(std::vector patterns) {
375 | set vars {};
376 | vector joinPatterns {};
377 | std::string subj, pred, obj;
378 |
379 | for (auto it = patterns.begin(); it != patterns.end(); it++) {
380 | // unpack pattern
381 | std::tie(subj, pred, obj) = *it;
382 | // add variables
383 | if (subj.at(0) == '?') {
384 | vars.insert(subj);
385 | }
386 | if (pred.at(0) == '?') {
387 | vars.insert(pred);
388 | }
389 | if (obj.at(0) == '?') {
390 | vars.insert(obj);
391 | }
392 | // build join pattern
393 | TripleString pattern(subj, pred, obj);
394 | joinPatterns.push_back(pattern);
395 | }
396 |
397 | VarBindingString *iterator = processor->searchJoin(joinPatterns, vars);
398 | return new JoinIteratorBytes(iterator);
399 | }
400 |
401 | /**
402 | * Convert an Object Identifier into the equivalent URI/Literal value
403 | * @param id - Object Identifier
404 | * @param pos - Identifier position (subject, predicate or object)
405 | * @return The URI/Literal equivalent to the Object Identifier
406 | */
407 | py::bytes HDTDocument::convertIDBytes(unsigned int id, IdentifierPosition pos) {
408 | return py::bytes(HDTDocument::convertID(id, pos));
409 | }
410 |
411 | /*!
412 | * Convert a TripleID to a string RDF triple
413 | * @param subject - Triple's subject
414 | * @param predicate - Triple's predicate
415 | * @param object - Triple's object
416 | * @return The associated RDF triple
417 | */
418 | triple_bytes HDTDocument::convertTripleIDBytes(unsigned int subject, unsigned int predicate,
419 | unsigned int object) {
420 | return std::make_tuple(
421 | py::bytes(hdt->getDictionary()->idToString(subject, hdt::SUBJECT)),
422 | py::bytes(hdt->getDictionary()->idToString(predicate, hdt::PREDICATE)),
423 | py::bytes(hdt->getDictionary()->idToString(object, hdt::OBJECT)));
424 | }
425 |
--------------------------------------------------------------------------------
/src/join_iterator.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * join_iterator.cpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #include "join_iterator.hpp"
7 | #include
8 | #include
9 |
10 | /*!
11 | * Constructor
12 | * @param _it [description]
13 | */
14 | JoinIterator::JoinIterator(hdt::VarBindingString *_it) : iterator(_it) {}
15 |
16 | /*!
17 | * Destructor
18 | */
19 | JoinIterator::~JoinIterator() {
20 | delete iterator;
21 | }
22 |
23 | /*!
24 | * Implementation for Python function "__repr__"
25 | * @return [description]
26 | */
27 | std::string JoinIterator::python_repr() {
28 | return "JoinIterator";
29 | }
30 |
31 |
32 | /*!
33 | * Implementation for Python function "__iter__"
34 | * @return [description]
35 | */
36 | JoinIterator *JoinIterator::python_iter() { return this; }
37 |
38 | /**
39 | * Get the estimated join cardinality
40 | * @return [description]
41 | */
42 | size_t JoinIterator::estimatedCardinality() {
43 | return iterator->estimatedNumResults();
44 | }
45 |
46 | /**
47 | * Reset the iterator into its initial state and restart join processing.
48 | */
49 | void JoinIterator::reset() {
50 | iterator->goToStart();
51 | }
52 |
53 | /*!
54 | * Return true if the iterator still has items available, False otherwise.
55 | * @return [description]
56 | */
57 | bool JoinIterator::hasNext() {
58 | return hasNextSolution;
59 | }
60 |
61 | /**
62 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
63 | * has ended. Used to implement Python Itertor protocol.
64 | * @return [description]
65 | */
66 | solution_bindings JoinIterator::next() {
67 | hasNextSolution = iterator->findNext();
68 | // stop iteration if the iterator has ended
69 | if (!hasNextSolution) {
70 | throw pybind11::stop_iteration();
71 | }
72 | solution_bindings solutions = new std::set();
73 | // build solution bindings
74 | for(unsigned int i = 0; i < iterator->getNumVars(); i++) {
75 | solutions->insert(std::make_tuple(iterator->getVarName(i), iterator->getVar(i)));
76 | }
77 | return solutions;
78 | }
79 |
--------------------------------------------------------------------------------
/src/join_iterator_bytes.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * join_iterator_bytes.cpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #include "join_iterator_bytes.hpp"
7 | #include
8 | #include
9 |
10 | /*!
11 | * Constructor
12 | * @param _it [description]
13 | */
14 | JoinIteratorBytes::JoinIteratorBytes(hdt::VarBindingString *_it) : iterator(_it) {}
15 |
16 | /*!
17 | * Destructor
18 | */
19 | JoinIteratorBytes::~JoinIteratorBytes() {
20 | delete iterator;
21 | }
22 |
23 | /*!
24 | * Implementation for Python function "__repr__"
25 | * @return [description]
26 | */
27 | std::string JoinIteratorBytes::python_repr() {
28 | return "JoinIteratorBytes";
29 | }
30 |
31 |
32 | /*!
33 | * Implementation for Python function "__iter__"
34 | * @return [description]
35 | */
36 | JoinIteratorBytes *JoinIteratorBytes::python_iter() { return this; }
37 |
38 | /**
39 | * Get the estimated join cardinality
40 | * @return [description]
41 | */
42 | size_t JoinIteratorBytes::estimatedCardinality() {
43 | return iterator->estimatedNumResults();
44 | }
45 |
46 | /**
47 | * Reset the iterator into its initial state and restart join processing.
48 | */
49 | void JoinIteratorBytes::reset() {
50 | iterator->goToStart();
51 | }
52 |
53 | /*!
54 | * Return true if the iterator still has items available, False otherwise.
55 | * @return [description]
56 | */
57 | bool JoinIteratorBytes::hasNext() {
58 | return hasNextSolution;
59 | }
60 |
61 | /**
62 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
63 | * has ended. Used to implement Python Itertor protocol.
64 | * @return [description]
65 | */
66 | py::set JoinIteratorBytes::next() {
67 | hasNextSolution = iterator->findNext();
68 | // stop iteration if the iterator has ended
69 | if (!hasNextSolution) {
70 | throw pybind11::stop_iteration();
71 | }
72 | solution_bindings_bytes solutions_bytes;
73 | // build solution bindings
74 | for(unsigned int i = 0; i < iterator->getNumVars(); i++) {
75 | std::string varname = iterator->getVarName(i);
76 | std::string value = iterator->getVar(i);
77 | solutions_bytes.add(std::make_tuple(py::bytes(varname), py::bytes(value)));
78 | }
79 | return solutions_bytes;
80 | }
81 |
--------------------------------------------------------------------------------
/src/triple_iterator.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * triple_iterator.cpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #include "triple_iterator.hpp"
7 | #include
8 | #include
9 | #include
10 |
11 | /*!
12 | * Constructor
13 | * @param iterator [description]
14 | */
15 | TripleIterator::TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict)
16 | : iterator(_it), dictionary(_dict) {};
17 |
18 | /*!
19 | * Destructor
20 | */
21 | TripleIterator::~TripleIterator() { delete iterator; };
22 |
23 | /*!
24 | * Implementation for Python function "__repr__"
25 | * @return [description]
26 | */
27 | std::string TripleIterator::python_repr() {
28 | if (getLimit() != 0 && getOffset() > 0) {
29 | return "";
32 | } else if (getLimit() != 0) {
33 | return "";
35 | } else if (getOffset() > 0) {
36 | return "";
38 | }
39 | return "";
40 | }
41 |
42 | /*!
43 | * Get the subject of the triple pattern currently evaluated.
44 | * An empty string represents a variable
45 | * @return [description]
46 | */
47 | std::string TripleIterator::getSubject() { return iterator->getSubject(); }
48 |
49 | /*!
50 | * Get the predicate of the triple pattern currently evaluated.
51 | * An empty string represents a variable
52 | * @return [description]
53 | */
54 | std::string TripleIterator::getPredicate() { return iterator->getPredicate(); }
55 |
56 | /*!
57 | * Get the object of the triple pattern currently evaluated.
58 | * An empty string represents a variable
59 | * @return [description]
60 | */
61 | std::string TripleIterator::getObject() { return iterator->getObject(); }
62 |
63 | /*!
64 | * Get the limit of the current iterator
65 | * @return [description]
66 | */
67 | unsigned int TripleIterator::getLimit() { return iterator->getLimit(); }
68 |
69 | /*!
70 | * Get the offset of the current iterator
71 | * @return [description]
72 | */
73 | unsigned int TripleIterator::getOffset() { return iterator->getOffset(); }
74 |
75 | /*!
76 | * Get the number of results read by the iterator
77 | * @return [description]
78 | */
79 | unsigned int TripleIterator::getNbResultsRead() { return iterator->getNbResultsRead(); }
80 |
81 | /*!
82 | * Implementation for Python function "__iter__"
83 | * @return [description]
84 | */
85 | TripleIterator *TripleIterator::python_iter() { return this; }
86 |
87 | /*!
88 | * Get a hint over the cardinality of the triple pattern evaluated.
89 | * Offset & limit are not taken into account.
90 | * @return [description]
91 | */
92 | size_hint TripleIterator::sizeHint() {
93 | return iterator->sizeHint();
94 | }
95 |
96 | /*!
97 | * Return true if the iterator still has items available, False otherwise.
98 | * @return [description]
99 | */
100 | bool TripleIterator::hasNext() {
101 | return iterator->hasNext();
102 | }
103 |
104 | /**
105 | * Get the next item in the iterator, or raise py::StopIteration if the iterator
106 | * has ended. Used to implement Python Itertor protocol.
107 | * @return [description]
108 | */
109 | triple TripleIterator::next() {
110 | triple_id t = iterator->next();
111 | return std::make_tuple(
112 | dictionary->idToString(std::get<0>(t), hdt::SUBJECT),
113 | dictionary->idToString(std::get<1>(t), hdt::PREDICATE),
114 | dictionary->idToString(std::get<2>(t), hdt::OBJECT));
115 | }
116 |
117 | /**
118 | * Get the next item in the iterator, or raise py::StopIteration if the iterator
119 | * has ended, but without advancing the iterator.
120 | * @return [description]
121 | */
122 | triple TripleIterator::peek() {
123 | triple_id t = iterator->peek();
124 | return std::make_tuple(
125 | dictionary->idToString(std::get<0>(t), hdt::SUBJECT),
126 | dictionary->idToString(std::get<1>(t), hdt::PREDICATE),
127 | dictionary->idToString(std::get<2>(t), hdt::OBJECT));
128 | }
129 |
--------------------------------------------------------------------------------
/src/triple_iterator_bytes.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * triple_iterator_bytes.cpp
3 | * Author: Arnaud GRALL - MIT License 2017-2019
4 | */
5 |
6 | #include "triple_iterator_bytes.hpp"
7 | #include
8 | #include
9 | #include
10 | namespace py = pybind11;
11 |
12 | /*!
13 | * Constructor
14 | * @param iterator [description]
15 | */
16 | TripleIteratorBytes::TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict)
17 | : iterator(_it), dictionary(_dict) {};
18 |
19 | /*!
20 | * Destructor
21 | */
22 | TripleIteratorBytes::~TripleIteratorBytes() { delete iterator; };
23 |
24 | /*!
25 | * Implementation for Python function "__repr__"
26 | * @return [description]
27 | */
28 | std::string TripleIteratorBytes::python_repr() {
29 | if (getLimit() != 0 && getOffset() > 0) {
30 | return "";
33 | } else if (getLimit() != 0) {
34 | return "";
36 | } else if (getOffset() > 0) {
37 | return "";
39 | }
40 | return "";
41 | }
42 |
43 | /*!
44 | * Get the subject of the triple pattern currently evaluated.
45 | * An empty string represents a variable
46 | * @return [description]
47 | */
48 | std::string TripleIteratorBytes::getSubject() { return iterator->getSubject(); }
49 |
50 | /*!
51 | * Get the predicate of the triple pattern currently evaluated.
52 | * An empty string represents a variable
53 | * @return [description]
54 | */
55 | std::string TripleIteratorBytes::getPredicate() { return iterator->getPredicate(); }
56 |
57 | /*!
58 | * Get the object of the triple pattern currently evaluated.
59 | * An empty string represents a variable
60 | * @return [description]
61 | */
62 | std::string TripleIteratorBytes::getObject() { return iterator->getObject(); }
63 |
64 | /*!
65 | * Get the limit of the current iterator
66 | * @return [description]
67 | */
68 | unsigned int TripleIteratorBytes::getLimit() { return iterator->getLimit(); }
69 |
70 | /*!
71 | * Get the offset of the current iterator
72 | * @return [description]
73 | */
74 | unsigned int TripleIteratorBytes::getOffset() { return iterator->getOffset(); }
75 |
76 | /*!
77 | * Get the number of results read by the iterator
78 | * @return [description]
79 | */
80 | unsigned int TripleIteratorBytes::getNbResultsRead() { return iterator->getNbResultsRead(); }
81 |
82 | /*!
83 | * Implementation for Python function "__iter__"
84 | * @return [description]
85 | */
86 | TripleIteratorBytes *TripleIteratorBytes::python_iter() { return this; }
87 |
88 | /*!
89 | * Get a hint over the cardinality of the triple pattern evaluated.
90 | * Offset & limit are not taken into account.
91 | * @return [description]
92 | */
93 | size_hint TripleIteratorBytes::sizeHint() {
94 | return iterator->sizeHint();
95 | }
96 |
97 | /*!
98 | * Return true if the iterator still has items available, False otherwise.
99 | * @return [description]
100 | */
101 | bool TripleIteratorBytes::hasNext() {
102 | return iterator->hasNext();
103 | }
104 |
105 | /**
106 | * Get the next item in the iterator, or raise py::StopIteration if the iterator
107 | * has ended. Used to implement Python Itertor protocol.
108 | * @return [description]
109 | */
110 | triple_bytes TripleIteratorBytes::next() {
111 | triple_id t = iterator->next();
112 | return std::make_tuple(
113 | py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)),
114 | py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)),
115 | py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT)));
116 | }
117 |
118 | /**
119 | * Get the next item in the iterator, or raise py::StopIteration if the iterator
120 | * has ended, but without advancing the iterator.
121 | * @return [description]
122 | */
123 | triple_bytes TripleIteratorBytes::peek() {
124 | triple_id t = iterator->peek();
125 | return std::make_tuple(
126 | py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)),
127 | py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)),
128 | py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT)));
129 | }
130 |
--------------------------------------------------------------------------------
/src/tripleid_iterator.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * tripleid_iterator.cpp
3 | * Author: Thomas MINIER - MIT License 2017-2019
4 | */
5 |
6 | #include "tripleid_iterator.hpp"
7 | #include
8 | #include
9 | #include
10 |
11 | /*!
12 | * Constructor
13 | * @param iterator [description]
14 | */
15 | TripleIDIterator::TripleIDIterator(hdt::IteratorTripleID *_it,
16 | std::string _subj, std::string _pred,
17 | std::string _obj, unsigned int _limit,
18 | unsigned int _offset)
19 | : subject((_subj.compare("") == 0) ? "?s" : _subj),
20 | predicate((_pred.compare("") == 0) ? "?p" : _pred),
21 | object((_obj.compare("") == 0) ? "?o" : _obj), limit(_limit),
22 | offset(_offset), iterator(_it){};
23 |
24 | /*!
25 | * Destructor
26 | */
27 | TripleIDIterator::~TripleIDIterator() { delete iterator; };
28 |
29 | /*!
30 | * Implementation for Python function "__repr__"
31 | * @return [description]
32 | */
33 | std::string TripleIDIterator::python_repr() {
34 | if (limit != 0 && offset > 0) {
35 | return "";
38 | } else if (limit != 0) {
39 | return "";
41 | } else if (offset > 0) {
42 | return "";
44 | }
45 | return "";
46 | }
47 |
48 | /*!
49 | * Get the subject of the triple pattern currently evaluated.
50 | * An empty string represents a variable
51 | * @return [description]
52 | */
53 | std::string TripleIDIterator::getSubject() { return subject; }
54 |
55 | /*!
56 | * Get the predicate of the triple pattern currently evaluated.
57 | * An empty string represents a variable
58 | * @return [description]
59 | */
60 | std::string TripleIDIterator::getPredicate() { return predicate; }
61 |
62 | /*!
63 | * Get the object of the triple pattern currently evaluated.
64 | * An empty string represents a variable
65 | * @return [description]
66 | */
67 | std::string TripleIDIterator::getObject() { return object; }
68 |
69 | /*!
70 | * Get the limit of the current iterator
71 | * @return [description]
72 | */
73 | unsigned int TripleIDIterator::getLimit() { return limit; }
74 |
75 | /*!
76 | * Get the offset of the current iterator
77 | * @return [description]
78 | */
79 | unsigned int TripleIDIterator::getOffset() { return offset; }
80 |
81 | /*!
82 | * Get the number of results read by the iterator
83 | * @return [description]
84 | */
85 | unsigned int TripleIDIterator::getNbResultsRead() { return resultsRead; }
86 |
87 | /*!
88 | * Implementation for Python function "__iter__"
89 | * @return [description]
90 | */
91 | TripleIDIterator *TripleIDIterator::python_iter() { return this; }
92 |
93 | /*!
94 | * Get a hint over the cardinality of the triple pattern evaluated.
95 | * Offset & limit are not taken into account.
96 | * @return [description]
97 | */
98 | size_hint TripleIDIterator::sizeHint() {
99 | return std::make_tuple(iterator->estimatedNumResults(), iterator->numResultEstimation() == hdt::EXACT);
100 | }
101 |
102 | /*!
103 | * Return true if the iterator still has items available, False otherwise.
104 | * @return [description]
105 | */
106 | bool TripleIDIterator::hasNext() {
107 | bool noLimit = limit == 0;
108 | return iterator->hasNext() && (noLimit || limit > resultsRead);
109 | }
110 |
111 | /**
112 | * Get the next item in the iterator, or raise py::StopIteration if the iterator
113 | * has ended. Used to implement Python Itertor protocol.
114 | * @return [description]
115 | */
116 | triple_id TripleIDIterator::next() {
117 | // return any previously peeked value
118 | if (hasBufferedTriple) {
119 | hasBufferedTriple = false;
120 | resultsRead++;
121 | return _bufferedTriple;
122 | }
123 | bool noLimit = limit == 0;
124 | if (iterator->hasNext() && (noLimit || limit > resultsRead)) {
125 | resultsRead++;
126 | hdt::TripleID *ts = iterator->next();
127 | return std::make_tuple(ts->getSubject(), ts->getPredicate(),
128 | ts->getObject());
129 | }
130 | throw pybind11::stop_iteration();
131 | }
132 |
133 | /**
134 | * Get the next item in the iterator, or raise py::StopIteration if the iterator
135 | * has ended, but without advancing the iterator.
136 | * @return [description]
137 | */
138 | triple_id TripleIDIterator::peek() {
139 | if (hasBufferedTriple) {
140 | return _bufferedTriple;
141 | }
142 | _bufferedTriple = next();
143 | hasBufferedTriple = true;
144 | resultsRead--;
145 | return _bufferedTriple;
146 | }
147 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/tests/__init__.py
--------------------------------------------------------------------------------
/tests/hdt_document_test.py:
--------------------------------------------------------------------------------
1 | # hdt_document_test.py
2 | # Author: Thomas MINIER - MIT License 2017-2019
3 | import pytest
4 | from hdt import HDTDocument, IdentifierPosition
5 |
6 | path = "tests/test.hdt"
7 | document = HDTDocument(path, True, False)
8 | nbTotalTriples = 132
9 |
10 |
11 | def test_missing_file():
12 | with pytest.raises(RuntimeError):
13 | HDTDocument("/home/dtrump/wall.hdt")
14 |
15 |
16 | def test_file_path():
17 | assert document.file_path == path, f"THe HDT Document filepath should be {path}"
18 |
19 |
20 | def test_total_triples():
21 | assert document.total_triples == nbTotalTriples, f"The HDT Document should contains {nbTotalTriples} RDF triples"
22 | assert len(document) == nbTotalTriples, f"The HDT Document __len__ magic function should returns {nbTotalTriples} RDF triples"
23 |
24 |
25 | def test_nb_subjects():
26 | assert document.nb_subjects == 4, f"The HDT Document should contains 4 subjects"
27 |
28 |
29 | def tests_nb_predicates():
30 | assert document.nb_predicates == 3, f"The HDT Document should contains 3 predicates"
31 |
32 |
33 | def tests_nb_objects():
34 | assert document.nb_objects == 112, f"The HDT Document should contains 112 objects"
35 |
36 |
37 | def tests_nb_shared():
38 | assert document.nb_shared == 0, f"The HDT Document should contains 0 shared subject-objects"
39 |
40 |
41 | def test_ids_to_string():
42 | (triples, triplesCard) = document.search_triples("", "", "")
43 | (ids, idsCard) = document.search_triples_ids(0, 0, 0)
44 | assert triplesCard == idsCard
45 | assert triplesCard == nbTotalTriples
46 | for subj, pred, obj in triples:
47 | sid, pid, oid = next(ids)
48 | s, p, o = document.convert_tripleid(sid, pid, oid)
49 | assert subj == s
50 | assert pred == p
51 | assert obj == o
52 |
53 |
54 | def test_ids_to_string_bytes():
55 | (triples, triplesCard) = document.search_triples_bytes("", "", "")
56 | (ids, idsCard) = document.search_triples_ids(0, 0, 0)
57 | assert triplesCard == idsCard
58 | assert triplesCard == nbTotalTriples
59 | for subj, pred, obj in triples:
60 | sid, pid, oid = next(ids)
61 | s, p, o = document.convert_tripleid_bytes(sid, pid, oid)
62 | assert subj.decode('utf-8') == s.decode('utf-8')
63 | assert pred.decode('utf-8') == p.decode('utf-8')
64 | assert obj.decode('utf-8') == o.decode('utf-8')
65 |
66 |
67 | def test_convert_id():
68 | (triples, triplesCard) = document.search_triples("", "", "")
69 | (ids, idsCard) = document.search_triples_ids(0, 0, 0)
70 | assert triplesCard == idsCard
71 | assert triplesCard == nbTotalTriples
72 | for subj, pred, obj in triples:
73 | sid, pid, oid = next(ids)
74 | s, p, o = (
75 | document.convert_id(sid, IdentifierPosition.Subject),
76 | document.convert_id(pid, IdentifierPosition.Predicate),
77 | document.convert_id(oid, IdentifierPosition.Object)
78 | )
79 | assert subj == s
80 | assert pred == p
81 | assert obj == o
82 |
83 |
84 | def test_convert_id_bytes():
85 | (triples, triplesCard) = document.search_triples_bytes("", "", "")
86 | (ids, idsCard) = document.search_triples_ids(0, 0, 0)
87 | assert triplesCard == idsCard
88 | assert triplesCard == nbTotalTriples
89 | for subj, pred, obj in triples:
90 | sid, pid, oid = next(ids)
91 | s, p, o = (
92 | document.convert_id_bytes(sid, IdentifierPosition.Subject),
93 | document.convert_id_bytes(pid, IdentifierPosition.Predicate),
94 | document.convert_id_bytes(oid, IdentifierPosition.Object)
95 | )
96 | assert subj == s
97 | assert pred == p
98 | assert obj == o
99 |
--------------------------------------------------------------------------------
/tests/hdt_iterators_test.py:
--------------------------------------------------------------------------------
1 | # hdt_iterators_test.py
2 | # Author: Thomas MINIER - MIT License 2017-2019
3 | import pytest
4 | from hdt import HDTDocument
5 |
6 | path = "tests/test.hdt"
7 | document = HDTDocument(path)
8 | nbTotalTriples = 132
9 |
10 |
11 | def test_read_document_base():
12 | (triples, cardinality) = document.search_triples("", "", "")
13 | assert triples.subject == "?s"
14 | assert triples.predicate == "?p"
15 | assert triples.object == "?o"
16 | assert cardinality == nbTotalTriples
17 | for subj, pred, obj in triples:
18 | assert subj is not None
19 | assert pred is not None
20 | assert obj is not None
21 | assert triples.nb_reads == cardinality
22 |
23 | def test_read_document_base_bytes():
24 | (triples, cardinality) = document.search_triples_bytes("", "", "")
25 | assert triples.subject == "?s"
26 | assert triples.predicate == "?p"
27 | assert triples.object == "?o"
28 | assert cardinality == nbTotalTriples
29 | for subj, pred, obj in triples:
30 | assert isinstance(subj, bytes)
31 | assert isinstance(pred, bytes)
32 | assert isinstance(obj, bytes)
33 | try:
34 | s, p, o = subj.decode('utf-8'), pred.decode('utf-8'), obj.decode('utf-8')
35 | except Exception as err:
36 | # with the test.hdt file we shouldnt have any problem
37 | raise err
38 | assert subj is not None
39 | assert pred is not None
40 | assert obj is not None
41 | assert triples.nb_reads == cardinality
42 |
43 |
44 | empty_triples = [
45 | ("http://example.org#toto", "", ""),
46 | ("", "http://example.org#toto", ""),
47 | ("", "http://example.org#toto", "")
48 | ]
49 |
50 | empty_triples_ids = [
51 | (155, 0, 0),
52 | (0, 155, 0),
53 | (0, 0, 155)
54 | ]
55 |
56 |
57 | @pytest.mark.parametrize("triple", empty_triples)
58 | def test_search_triples_empty(triple):
59 | s, p, o = triple
60 | (iterator, cardinality) = document.search_triples(s, p, o)
61 | assert cardinality == 0
62 | assert not iterator.has_next()
63 |
64 |
65 | @pytest.mark.parametrize("triple", empty_triples_ids)
66 | def test_search_ids_empty(triple):
67 | s, p, o = triple
68 | (iterator, cardinality) = document.search_triples_ids(s, p, o)
69 | assert cardinality == 0
70 | assert not iterator.has_next()
71 |
72 |
73 | def test_read_document_limit():
74 | nbItems = 0
75 | (triples, cardinality) = document.search_triples("", "", "", limit=10)
76 | assert triples.limit == 10
77 | assert cardinality == nbTotalTriples
78 | for subj, pred, obj in triples:
79 | nbItems += 1
80 | assert subj is not None
81 | assert pred is not None
82 | assert obj is not None
83 | assert nbItems == 10
84 | assert triples.nb_reads == 10
85 |
86 |
87 | def test_read_document_bytes_peek():
88 | nbItems = 0
89 | (triples, cardinality) = document.search_triples_bytes("", "", "", limit=10)
90 | assert triples.limit == 10
91 | assert cardinality == nbTotalTriples
92 | peek = triples.peek()
93 | for subj, pred, obj in triples:
94 | nbItems += 1
95 | assert isinstance(subj, bytes)
96 | assert isinstance(pred, bytes)
97 | assert isinstance(obj, bytes)
98 | assert subj == peek[0]
99 | assert pred == peek[1]
100 | assert obj == peek[2]
101 | assert subj is not None
102 | assert pred is not None
103 | assert obj is not None
104 | try:
105 | peek = triples.peek()
106 | except StopIteration:
107 | pass
108 | assert nbItems == 10
109 | assert triples.nb_reads == 10
110 |
111 |
112 | def test_read_document_offset():
113 | nbItems = 0
114 | (triples, cardinality) = document.search_triples("", "", "", offset=10)
115 | assert triples.offset == 10
116 | assert cardinality == nbTotalTriples
117 | for subj, pred, obj in triples:
118 | nbItems += 1
119 | assert subj is not None
120 | assert pred is not None
121 | assert obj is not None
122 | assert nbItems == cardinality - 10
123 | assert triples.nb_reads == cardinality - 10
124 |
125 |
126 | def test_read_document_ids():
127 | (triples, cardinality) = document.search_triples_ids(0, 0, 0)
128 | assert triples.subject == "?s"
129 | assert triples.predicate == "?p"
130 | assert triples.object, "?o"
131 | assert cardinality, nbTotalTriples
132 | for subj, pred, obj in triples:
133 | assert subj is not None
134 | assert pred is not None
135 | assert obj is not None
136 | assert triples.nb_reads == cardinality
137 |
138 |
139 | def test_string_iterator_peek():
140 | expected = ('http://example.org/s1', 'http://example.org/p1', 'http://example.org/o001')
141 | (triples, cardinality) = document.search_triples("", "", "")
142 | v = triples.peek()
143 | assert v == expected
144 | assert triples.nb_reads == 0
145 | v = next(triples)
146 | assert v == expected
147 | assert triples.nb_reads == 1
148 |
149 |
150 | def test_ids_iterator_peek():
151 | expected = (1, 1, 13)
152 | (triples, cardinality) = document.search_triples_ids(0, 0, 0)
153 | v = triples.peek()
154 | assert v == expected
155 | assert triples.nb_reads == 0
156 | v = next(triples)
157 | assert v == expected
158 | assert triples.nb_reads == 1
159 |
160 |
161 | def test_string_iterator_big_offset():
162 | nbItems = 0
163 | (triples, cardinality) = document.search_triples("", "", "", offset=nbTotalTriples + 1)
164 | for s, p, o in triples:
165 | nbItems += 1
166 | assert nbItems == 0
167 |
168 |
169 | def test_ids_iterator_big_offset():
170 | nbItems = 0
171 | (triples, cardinality) = document.search_triples_ids(0, 0, 0, offset=nbTotalTriples + 1)
172 | for s, p, o in triples:
173 | nbItems += 1
174 | assert nbItems == 0
175 |
--------------------------------------------------------------------------------
/tests/hdt_store_test.py:
--------------------------------------------------------------------------------
1 | # hdt_store_test.py
2 | # Author: Thomas MINIER - MIT License 2017-2020
3 | import pytest
4 | from rdflib_hdt import HDTStore, optimize_sparql
5 | from rdflib import Graph, URIRef, Literal
6 |
7 | path = "tests/test.hdt"
8 | store = HDTStore(path)
9 |
10 | fixtures = [
11 | # pattern ?s ?p ?o
12 | ((None, None, None), 128),
13 | # pattern s p o
14 | ((URIRef('http://example.org/s1'), URIRef('http://example.org/p1'), URIRef('http://example.org/o002')), 1),
15 | ((URIRef('http://example.org/s5'), URIRef('http://example.org/p1'), URIRef('http://example.org/o002')), 0),
16 | # pattern s ?p ?o
17 | ((URIRef('http://example.org/s1'), None, None), 100),
18 | ((URIRef('http://example.org/s2'), None, None), 10),
19 | ((URIRef('http://example.org/s3'), None, None), 10),
20 | ((URIRef('http://example.org/s4'), None, None), 8),
21 | ((URIRef('http://example.org/s5'), None, None), 0),
22 | # pattern ?s p ?o
23 | ((None, URIRef('http://example.org/p1'), None), 110),
24 | ((None, URIRef('http://example.org/p2'), None), 10),
25 | ((None, URIRef('http://example.org/p3'), None), 8),
26 | ((None, URIRef('http://example.org/p99'), None), 0),
27 | # pattern ?s ?p o
28 | ((None, None, URIRef('http://example.org/o002')), 3),
29 | ((None, None, URIRef('http://example.org/o004')), 3),
30 | ((None, None, Literal('a')), 1),
31 | ((None, None, Literal('a', lang='en')), 1),
32 | ((None, None, Literal('', lang='en')), 1),
33 | ((None, None, Literal('', datatype=URIRef('http://example.org/literal'))), 1),
34 | ((None, None, URIRef('http://example.org/o999')), 0),
35 | # pattern s ?p o
36 | ((URIRef('http://example.org/s1'), None, URIRef('http://example.org/o002')), 1),
37 | ((URIRef('http://example.org/s2'), None, URIRef('http://example.org/o004')), 1),
38 | ((URIRef('http://example.org/s3'), None, URIRef('http://example.org/o004')), 1),
39 | ((URIRef('http://example.org/s99'), None, URIRef('http://example.org/o004')), 0),
40 | # pattern s p ?o
41 | ((URIRef('http://example.org/s1'), URIRef('http://example.org/p1'), None), 100),
42 | ((URIRef('http://example.org/s2'), URIRef('http://example.org/p1'), None), 10),
43 | ((URIRef('http://example.org/s3'), URIRef('http://example.org/p2'), None), 10),
44 | ((URIRef('http://example.org/s3'), URIRef('http://example.org/p999'), None), 0),
45 | # pattern ?s p o
46 | ((None, URIRef('http://example.org/p1'), URIRef('http://example.org/o002')), 2),
47 | ((None, URIRef('http://example.org/p2'), URIRef('http://example.org/o004')), 1),
48 | ((None, URIRef('http://example.org/p2'), URIRef('http://example.org/o999')), 0)
49 | ]
50 |
51 |
52 | @pytest.mark.parametrize("query,expected_length", fixtures)
53 | def test_rdflib_graph_search(query, expected_length):
54 | query_s, query_p, query_o = query
55 | graph = Graph(store=store)
56 | nb_triples = 0
57 | for s, p, o in graph.triples(query):
58 | nb_triples += 1
59 | assert isinstance(s, URIRef), f"The subject of an RDF triple must be an URI"
60 | assert isinstance(p, URIRef), f"The predicate of an RDF triple must be an URI"
61 | assert isinstance(o, URIRef) or isinstance(o, Literal), f"The object of an RDF triple must be an URI or a Literal"
62 | if query_s is not None:
63 | assert s == query_s, f"The expected RDF triple subject's is {query_s}"
64 | if query_p is not None:
65 | assert p == query_p, f"The expected RDF triple predicate's is {query_p}"
66 | if query_o is not None:
67 | assert o == query_o, f"The expected RDF triple subject's is {query_o}"
68 | assert nb_triples == expected_length, f"The expected number of matches for {query} is {expected_length}"
69 |
70 |
71 | def test_rdflib_sparql_query():
72 | optimize_sparql()
73 | graph = Graph(store=store)
74 | sparql_query = """
75 | PREFIX ex:
76 | SELECT * WHERE {
77 | ?s ex:p1 ?o
78 | }
79 | """
80 | qres = graph.query(sparql_query)
81 |
82 | nb_bindings = 0
83 | for row in qres:
84 | nb_bindings += 1
85 |
86 | assert nb_bindings == 110, f"The query should yield 110 set of solution mappings"
87 |
--------------------------------------------------------------------------------
/tests/join_iterator_test.py:
--------------------------------------------------------------------------------
1 | # hdt_iterators_test.py
2 | # Author: Thomas MINIER - MIT License 2017-2019
3 | from hdt import HDTDocument
4 |
5 | path = "tests/test.hdt"
6 | document = HDTDocument(path)
7 |
8 |
9 | def test_basic_join():
10 | join_iter = document.search_join([
11 | ("?s", "http://example.org/p1", "http://example.org/o001"),
12 | ("?s", "http://example.org/p1", "http://example.org/o001")
13 | ])
14 | cpt = 0
15 | for b in join_iter:
16 | cpt += 1
17 | assert len(b) == 1
18 | assert ('?s', 'http://example.org/s1') in b or ('?s', 'http://example.org/s2') in b
19 | assert cpt == 2
20 |
21 |
22 | def test_basic_join_bytes():
23 | join_iter = document.search_join_bytes([
24 | ("?s", "http://example.org/p1", "http://example.org/o001"),
25 | ("?s", "http://example.org/p1", "http://example.org/o001")
26 | ])
27 | cpt = 0
28 | for b in join_iter:
29 | cpt += 1
30 | assert len(b) == 1
31 | assert (b'?s', b'http://example.org/s1') in b or (b'?s', b'http://example.org/s2') in b
32 | assert cpt == 2
33 |
--------------------------------------------------------------------------------
/tests/test.hdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RDFLib/rdflib-hdt/1bf6560e453cc4df0071d171c39fcbd7d851a041/tests/test.hdt
--------------------------------------------------------------------------------
/tests/wrappers_test.py:
--------------------------------------------------------------------------------
1 | # wrappers_test.py
2 | # Author: Thomas MINIER - MIT License 2017-2020
3 | from rdflib_hdt import HDTDocument
4 | from rdflib import URIRef, Variable
5 |
6 | path = "tests/test.hdt"
7 | document = HDTDocument(path)
8 |
9 |
10 | def test_search_join_rdflib():
11 | expected_nb = 2
12 | join_iter = document.search_join([
13 | (Variable("s"), URIRef("http://example.org/p1"), URIRef("http://example.org/o001")),
14 | (Variable("s"), URIRef("http://example.org/p1"), URIRef("http://example.org/o001"))
15 | ])
16 | assert len(join_iter) == expected_nb
17 | cpt = 0
18 | for row in join_iter:
19 | cpt += 1
20 | assert row.s == URIRef('http://example.org/s1') or row.s == URIRef('http://example.org/s2')
21 | assert cpt == expected_nb
22 |
--------------------------------------------------------------------------------