├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── docs ├── Makefile ├── make.bat └── source │ ├── _config.yml │ ├── api.rst │ ├── conf.py │ ├── hdtdocument.rst │ ├── index.rst │ └── installation.rst ├── include ├── docstrings.hpp ├── hdt_document.hpp ├── join_iterator.hpp ├── join_iterator_bytes.hpp ├── pyhdt_types.hpp ├── triple_iterator.hpp ├── triple_iterator_bytes.hpp └── tripleid_iterator.hpp ├── install.sh ├── requirements.txt ├── setup.cfg ├── setup.py ├── src ├── hdt.cpp ├── hdt_document.cpp ├── join_iterator.cpp ├── join_iterator_bytes.cpp ├── triple_iterator.cpp ├── triple_iterator_bytes.cpp └── tripleid_iterator.cpp └── tests ├── __init__.py ├── hdt_document_test.py ├── hdt_iterators_test.py ├── join_iterator_test.py └── test.hdt /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # C/C++ 2 | # Prerequisites 3 | *.d 4 | 5 | # Compiled Object files 6 | *.slo 7 | *.lo 8 | *.o 9 | *.obj 10 | .pytest_cache/ 11 | 12 | # Precompiled Headers 13 | *.gch 14 | *.pch 15 | 16 | # Compiled Dynamic libraries 17 | *.so 18 | *.dylib 19 | *.dll 20 | 21 | # Fortran module files 22 | *.mod 23 | *.smod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | 36 | # Python 37 | # Byte-compiled / optimized / DLL files 38 | __pycache__/ 39 | *.py[cod] 40 | *$py.class 41 | 42 | # C extensions 43 | *.so 44 | 45 | # Distribution / packaging 46 | .Python 47 | build/ 48 | develop-eggs/ 49 | dist/ 50 | downloads/ 51 | eggs/ 52 | .eggs/ 53 | lib/ 54 | lib64/ 55 | parts/ 56 | sdist/ 57 | var/ 58 | wheels/ 59 | *.egg-info/ 60 | .installed.cfg 61 | *.egg 62 | MANIFEST 63 | 64 | # PyInstaller 65 | # Usually these files are written by a 
python script from a template 66 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 67 | *.manifest 68 | *.spec 69 | 70 | # Installer logs 71 | pip-log.txt 72 | pip-delete-this-directory.txt 73 | 74 | # Unit test / coverage reports 75 | htmlcov/ 76 | .tox/ 77 | .coverage 78 | .coverage.* 79 | .cache 80 | nosetests.xml 81 | coverage.xml 82 | *.cover 83 | .hypothesis/ 84 | 85 | # Translations 86 | *.mo 87 | *.pot 88 | 89 | # Django stuff: 90 | *.log 91 | .static_storage/ 92 | .media/ 93 | local_settings.py 94 | 95 | # Flask stuff: 96 | instance/ 97 | .webassets-cache 98 | 99 | # Scrapy stuff: 100 | .scrapy 101 | 102 | # Sphinx documentation 103 | docs/_build/ 104 | 105 | # PyBuilder 106 | target/ 107 | 108 | # Jupyter Notebook 109 | .ipynb_checkpoints 110 | 111 | # pyenv 112 | .python-version 113 | 114 | # celery beat schedule file 115 | celerybeat-schedule 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | 142 | # HDT 143 | *.hdt.index.v* 144 | hdt-cpp-* 145 | hdt-cpp.zip 146 | v1.3.*.zip 147 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | - '3.6' 5 | notifications: 6 | email: false 7 | addons: 8 | apt: 9 | sources: 10 | - ubuntu-toolchain-r-test 11 | packages: 12 | - g++-4.8 13 | before_install: 14 | - if [ $TRAVIS_OS_NAME == linux ]; then export CXX=g++-4.8; fi 15 | install: 16 | - bash install.sh 17 | script: 18 | - pytest 19 | before_deploy: 20 | - rm -rf build/ 21 | - pip install pytest sphinx sphinx_rtd_theme 22 | - cd 
docs && make html 23 | deploy: 24 | - provider: pypi 25 | skip_cleanup: true 26 | user: callidon 27 | password: $PYPI_PASSWD 28 | distributions: "sdist bdist_wheel" 29 | on: 30 | tags: true 31 | - provider: pages 32 | skip_cleanup: true 33 | github_token: $GH_PAGES 34 | keep_history: true 35 | local_dir: docs/build/html 36 | on: 37 | branch: master 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-2019 Thomas Minier 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | graft include/ 3 | graft hdt-cpp-1.3.3/libhdt/src/bitsequence 4 | graft hdt-cpp-1.3.3/libhdt/src/dictionary 5 | graft hdt-cpp-1.3.3/libhdt/src/hdt 6 | graft hdt-cpp-1.3.3/libhdt/src/header 7 | graft hdt-cpp-1.3.3/libhdt/src/huffman 8 | graft hdt-cpp-1.3.3/libhdt/src/libdcs 9 | graft hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex 10 | graft hdt-cpp-1.3.3/libhdt/src/rdf 11 | graft hdt-cpp-1.3.3/libhdt/src/sequence 12 | graft hdt-cpp-1.3.3/libhdt/src/triples 13 | graft hdt-cpp-1.3.3/libhdt/src/util 14 | graft hdt-cpp-1.3.3/libhdt/third 15 | graft hdt-cpp-1.3.3/libhdt/include/ 16 | graft hdt-cpp-1.3.3/libhdt/src/dictionary/ 17 | graft hdt-cpp-1.3.3/libhdt/src/sparql 18 | graft hdt-cpp-1.3.3/libcds/include/ 19 | graft hdt-cpp-1.3.3/libcds/src/static/bitsequence 20 | graft hdt-cpp-1.3.3/libcds/src/static/coders 21 | graft hdt-cpp-1.3.3/libcds/src/static/mapper 22 | graft hdt-cpp-1.3.3/libcds/src/static/permutation 23 | graft hdt-cpp-1.3.3/libcds/src/static/sequence 24 | graft hdt-cpp-1.3.3/libcds/src/utils 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyHDT 2 | 3 | [![Build Status](https://travis-ci.org/Callidon/pyHDT.svg?branch=master)](https://travis-ci.org/Callidon/pyHDT) [![Documentation Status](https://readthedocs.org/projects/pyhdt/badge/?version=latest)](https://callidon.github.io/pyHDT) [![PyPI version](https://badge.fury.io/py/hdt.svg)](https://badge.fury.io/py/hdt) 4 | 5 | **pyHDT is joining the RDFlib family as part of the rdflib 6.0 release! 
The development continues at [rdflib-hdt](https://github.com/RDFLib/rdflib-hdt), and this repository is going into archive.** 6 | 7 | Read and query HDT document with ease in Python 8 | 9 | [Online Documentation](https://callidon.github.io/pyHDT) 10 | 11 | # Requirements 12 | 13 | * Python *version 3.6.4 or higher* 14 | * [pip](https://pip.pypa.io/en/stable/) 15 | * **gcc/clang** with **c++11 support** 16 | * **Python Development headers** 17 | > You should have the `Python.h` header available on your system. 18 | > For example, for Python 3.6, install the `python3.6-dev` package on Debian/Ubuntu systems. 19 | 20 | Then, install the [pybind11 library](http://pybind11.readthedocs.io/en/stable/) 21 | ``` 22 | pip install pybind11 23 | ``` 24 | 25 | # Installation 26 | 27 | Installation in a [virtualenv](https://virtualenv.pypa.io/en/stable/) is **strongly advised!** 28 | 29 | ## Pip install (recommended) 30 | 31 | ``` 32 | pip install hdt 33 | ``` 34 | 35 | ## Manual installation 36 | 37 | ``` 38 | git clone https://github.com/Callidon/pyHDT 39 | cd pyHDT/ 40 | ./install.sh 41 | ``` 42 | 43 | # Getting started 44 | 45 | ```python 46 | from hdt import HDTDocument 47 | 48 | # Load an HDT file. 
49 | # Missing indexes are generated automatically, add False as the second argument to disable them 50 | document = HDTDocument("test.hdt") 51 | 52 | # Display some metadata about the HDT document itself 53 | print("nb triples: %i" % document.total_triples) 54 | print("nb subjects: %i" % document.nb_subjects) 55 | print("nb predicates: %i" % document.nb_predicates) 56 | print("nb objects: %i" % document.nb_objects) 57 | print("nb shared subject-object: %i" % document.nb_shared) 58 | 59 | # Fetch all triples that matches { ?s ?p ?o } 60 | # Use empty strings ("") to indicates variables 61 | triples, cardinality = document.search_triples("", "", "") 62 | 63 | print("cardinality of { ?s ?p ?o }: %i" % cardinality) 64 | for triple in triples: 65 | print(triple) 66 | 67 | # Search also support limit and offset 68 | triples, cardinality = document.search_triples("", "", "", limit=10, offset=100) 69 | # etc ... 70 | ``` 71 | 72 | # Handling non UTF-8 strings in python 73 | 74 | If the HDT document has been encoded with a non UTF-8 encoding the previous code won't work correctly and will result in a `UnicodeDecodeError`. 75 | More details on how to convert string to str from c++ to python [here](https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html) 76 | 77 | To handle this we doubled the API of the HDT document by adding: 78 | - `search_triples_bytes(...)` return an iterator of triples as `(py::bytes, py::bytes, py::bytes)` 79 | - `search_join_bytes(...)` return an iterator of sets of solutions mapping as `py::set(py::bytes, py::bytes)` 80 | - `convert_tripleid_bytes(...)` return a triple as: `(py::bytes, py::bytes, py::bytes)` 81 | - `convert_id_bytes(...)` return a `py::bytes` 82 | 83 | **Parameters and documentation are the same as the standard version** 84 | 85 | ```python 86 | from hdt import HDTDocument 87 | 88 | # Load an HDT file. 
89 | # Missing indexes are generated automatically, add False as the second argument to disable them 90 | document = HDTDocument("test.hdt") 91 | it = document.search_triples_bytes("", "", "") 92 | 93 | for s, p, o in it: 94 | print(s, p, o) # print b'...', b'...', b'...' 95 | # now decode it, or handle any error 96 | try: 97 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8') 98 | except UnicodeDecodeError as err: 99 | # try another codec 100 | pass 101 | ``` 102 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |Documentation Status| |PyPI version| 2 | 3 | Read and query HDT document with ease in Python 4 | 5 | `Online Documentation <https://callidon.github.io/pyHDT>`__ 6 | 7 | Requirements 8 | ============ 9 | 10 | - Python *version 3.6.4 or higher* 11 | - `pip <https://pip.pypa.io/en/stable/>`__ 12 | - **gcc/clang** with **c++11 support** 13 | - **Python Development headers** > You should have the ``Python.h`` 14 | header available on your system. 15 | > For example, for Python 3.6, install the ``python3.6-dev`` package 16 | on Debian/Ubuntu systems. 17 | 18 | Then, install the `pybind11 19 | library <http://pybind11.readthedocs.io/en/stable/>`__ 20 | 21 | :: 22 | 23 | pip install pybind11 24 | 25 | Installation 26 | ============ 27 | 28 | Installation in a `virtualenv <https://virtualenv.pypa.io/en/stable/>`__ 29 | is **strongly advised!** 30 | 31 | Pip install (recommended) 32 | ------------------------- 33 | 34 | :: 35 | 36 | pip install hdt 37 | 38 | Manual installation 39 | ------------------- 40 | 41 | :: 42 | 43 | git clone https://github.com/Callidon/pyHDT 44 | cd pyHDT/ 45 | ./install.sh 46 | 47 | Getting started 48 | =============== 49 | 50 | .. code:: python 51 | 52 | from hdt import HDTDocument 53 | 54 | # Load an HDT file. 
55 | # Missing indexes are generated automatically, add False as the second argument to disable them 56 | document = HDTDocument("test.hdt") 57 | 58 | # Display some metadata about the HDT document itself 59 | print("nb triples: %i" % document.total_triples) 60 | print("nb subjects: %i" % document.nb_subjects) 61 | print("nb predicates: %i" % document.nb_predicates) 62 | print("nb objects: %i" % document.nb_objects) 63 | print("nb shared subject-object: %i" % document.nb_shared) 64 | 65 | # Fetch all triples that matches { ?s ?p ?o } 66 | # Use empty strings ("") to indicates variables 67 | triples, cardinality = document.search_triples("", "", "") 68 | 69 | print("cardinality of { ?s ?p ?o }: %i" % cardinality) 70 | for triple in triples: 71 | print(triple) 72 | 73 | # Search also support limit and offset 74 | triples, cardinality = document.search_triples("", "", "", limit=10, offset=100) 75 | # etc ... 76 | 77 | .. |Build Status| image:: https://travis-ci.org/Callidon/pyHDT.svg?branch=master 78 | :target: https://travis-ci.org/Callidon/pyHDT 79 | .. |Documentation Status| image:: https://readthedocs.org/projects/pyhdt/badge/?version=latest 80 | :target: https://callidon.github.io/pyHDT 81 | .. |PyPI version| image:: https://badge.fury.io/py/hdt.svg 82 | :target: https://badge.fury.io/py/hdt 83 | 84 | Handling non UTF-8 strings in python 85 | ==================================== 86 | 87 | If the HDT document has been encoded with a non UTF-8 encoding the 88 | previous code won’t work correctly and will result in a 89 | ``UnicodeDecodeError``. 
More details on how to convert string to str 90 | from c++ to python `here`_ 91 | 92 | To handle this we doubled the API of the HDT document by adding: 93 | 94 | - ``search_triples_bytes(...)`` returns an iterator of triples as ``(py::bytes, py::bytes, py::bytes)`` 95 | - ``search_join_bytes(...)`` returns an iterator of sets of solution mappings as ``py::set(py::bytes, py::bytes)`` 96 | - ``convert_tripleid_bytes(...)`` returns a triple as: ``(py::bytes, py::bytes, py::bytes)`` 97 | - ``convert_id_bytes(...)`` returns a ``py::bytes`` 98 | 99 | **Parameters and documentation are the same as the standard version** 100 | 101 | .. code:: python 102 | 103 | from hdt import HDTDocument 104 | 105 | # Load an HDT file. 106 | # Missing indexes are generated automatically, add False as the second argument to disable them 107 | document = HDTDocument("test.hdt") 108 | it = document.search_triples_bytes("", "", "") 109 | 110 | for s, p, o in it: 111 | print(s, p, o) # print b'...', b'...', b'...' 112 | # now decode it, or handle any error 113 | try: 114 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8') 115 | except UnicodeDecodeError as err: 116 | # try another codec 117 | pass 118 | 119 | .. _here: https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html 120 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pyHDT 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | cp source/_config.yml build/html/_config.yml 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=pyHDT 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/_config.yml: -------------------------------------------------------------------------------- 1 | baseurl: / 2 | include: [ "_static", "_static/*" ] 3 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. currentmodule:: hdt 5 | 6 | HDTDocument 7 | ----------- 8 | 9 | .. autoclass:: HDTDocument 10 | :members: 11 | 12 | .. method:: __init__(self, filePath) -> hdt.HDTDocument 13 | 14 | Build a new :class:`hdt.HDTDocument` by loading the HDT file located in ``filePath``. 15 | 16 | Args: 17 | - filePath ``str``: the path to the HDT file to load. 18 | 19 | .. code-block:: python 20 | 21 | from hdt import HDTDocument 22 | 23 | # Load HDT file. Missing indexes are generated automatically 24 | document = HDTDocument("test.hdt") 25 | 26 | # Display some metadata about the HDT document itself 27 | print("nb triples: %i" % document.total_triples) 28 | print("nb subjects: %i" % document.nb_subjects) 29 | print("nb predicates: %i" % document.nb_predicates) 30 | print("nb objects: %i" % document.nb_objects) 31 | print("nb shared subject-object: %i" % document.nb_shared) 32 | 33 | 34 | TripleIterator 35 | -------------- 36 | 37 | .. autoclass:: TripleIterator 38 | :inherited-members: 39 | :members: 40 | 41 | TripleIDIterator 42 | ---------------- 43 | 44 | .. autoclass:: TripleIDIterator 45 | :inherited-members: 46 | :members: 47 | 48 | JoinIterator 49 | -------------- 50 | 51 | .. 
autoclass:: JoinIterator 52 | :inherited-members: 53 | :members: 54 | 55 | 56 | Enumerations 57 | ------------- 58 | 59 | IdentifierPosition 60 | ^^^^^^^^^^^^^^^^^^^ 61 | 62 | .. autoclass:: IdentifierPosition 63 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # pyHDT documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Jan 22 10:41:42 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc'] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 
40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'pyHDT' 50 | copyright = '2018, Thomas Minier' 51 | author = 'Thomas Minier' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '1.0.0' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '1.0.0' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = [] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 
91 | # 92 | # html_theme_options = { 'show_related': True} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | # Custom sidebar templates, must be a dictionary that maps document names 100 | # to template names. 101 | # 102 | # This is required for the alabaster theme 103 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 104 | html_sidebars = { 105 | '**': [ 106 | 'globaltoc.html', 107 | 'relations.html', # needs 'show_related': True theme option to display 108 | 'sourcelink.html', 109 | 'searchbox.html', 110 | ] 111 | } 112 | 113 | 114 | # -- Options for HTMLHelp output ------------------------------------------ 115 | 116 | # Output file base name for HTML help builder. 117 | htmlhelp_basename = 'pyHDTdoc' 118 | 119 | 120 | # -- Options for LaTeX output --------------------------------------------- 121 | 122 | latex_elements = { 123 | # The paper size ('letterpaper' or 'a4paper'). 124 | # 125 | # 'papersize': 'letterpaper', 126 | 127 | # The font size ('10pt', '11pt' or '12pt'). 128 | # 129 | # 'pointsize': '10pt', 130 | 131 | # Additional stuff for the LaTeX preamble. 132 | # 133 | # 'preamble': '', 134 | 135 | # Latex figure (float) alignment 136 | # 137 | # 'figure_align': 'htbp', 138 | } 139 | 140 | # Grouping the document tree into LaTeX files. List of tuples 141 | # (source start file, target name, title, 142 | # author, documentclass [howto, manual, or own class]). 143 | latex_documents = [ 144 | (master_doc, 'pyHDT.tex', 'pyHDT Documentation', 145 | 'Thomas Minier', 'manual'), 146 | ] 147 | 148 | 149 | # -- Options for manual page output --------------------------------------- 150 | 151 | # One entry per manual page. 
List of tuples 152 | # (source start file, name, description, authors, manual section). 153 | man_pages = [ 154 | (master_doc, 'pyhdt', 'pyHDT Documentation', 155 | [author], 1) 156 | ] 157 | 158 | 159 | # -- Options for Texinfo output ------------------------------------------- 160 | 161 | # Grouping the document tree into Texinfo files. List of tuples 162 | # (source start file, target name, title, author, 163 | # dir menu entry, description, category) 164 | texinfo_documents = [ 165 | (master_doc, 'pyHDT', 'pyHDT Documentation', 166 | author, 'pyHDT', 'One line description of project.', 167 | 'Miscellaneous'), 168 | ] 169 | -------------------------------------------------------------------------------- /docs/source/hdtdocument.rst: -------------------------------------------------------------------------------- 1 | HDTDocument 2 | =========== 3 | 4 | Loading HDT files 5 | ^^^^^^^^^^^^^^^^^ 6 | 7 | The main class for manipulating HDT documents using pyHDT is ``HDTDocument``. 8 | Upon creation, it searches for an index file in the same directory as the HDT file you wish to load. 9 | 10 | For example, if you load a file */home/awesome-user/test.hdt*, HDTDocument will look for the index file 11 | */home/awesome-user/test.hdt.index.v1-1*. 12 | 13 | Missing indexes are generated automatically, but be careful, as this requires loading all HDT triples in memory! 14 | 15 | .. code-block:: python 16 | 17 | from hdt import HDTDocument 18 | 19 | # Load an HDT file. 
20 | # Missing indexes are generated automatically, add False as the second argument to disable them 21 | document = HDTDocument("test.hdt") 22 | 23 | # Display some metadata about the HDT document itself 24 | print("nb triples: %i" % document.total_triples) 25 | print("nb subjects: %i" % document.nb_subjects) 26 | print("nb predicates: %i" % document.nb_predicates) 27 | print("nb objects: %i" % document.nb_objects) 28 | print("nb shared subject-object: %i" % document.nb_shared) 29 | 30 | 31 | Searching for triples 32 | ^^^^^^^^^^^^^^^^^^^^^^ 33 | 34 | You can search for all RDF triples in the HDT file matching a triple pattern using ``search_triples``. 35 | It returns a 2-element tuple, with an *iterator* over the matching RDF triples and the estimated triple pattern *cardinality*. 36 | 37 | .. code-block:: python 38 | 39 | from hdt import HDTDocument 40 | document = HDTDocument("test.hdt") 41 | 42 | # Fetch all triples that match { ?s ?p ?o } 43 | # Use empty strings ("") to indicate variables 44 | (triples, cardinality) = document.search_triples("", "", "") 45 | 46 | print("cardinality of { ?s ?p ?o }: %i" % cardinality) 47 | for triple in triples: 48 | print(triple) 49 | 50 | # Search also supports limit and offset 51 | (triples, cardinality) = document.search_triples("", "", "", limit=10, offset=100) 52 | # etc ... 53 | 54 | Searching for triple IDs 55 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 56 | 57 | A typical HDT document encodes a triple's subject, predicate and object as unique integers, named **TripleID**. 58 | For example, the triple ``("ex:Toto", "ex:type", "ex:Person")`` can be encoded as ``(1, 2, 3)``. 59 | An ``HDTDocument`` allows for searching RDF triples in this format, using the ``search_triples_ids`` method, which works exactly like the classic ``search_triples``. 60 | 61 | .. 
code-block:: python 62 | 63 | from hdt import HDTDocument 64 | document = HDTDocument("test.hdt") 65 | 66 | (triples, cardinality) = document.search_triples_ids("", "", "") 67 | 68 | for s, p, o in triples: 69 | print(s, p, o) # will print 3-element tuples of integers 70 | 71 | # convert a triple ID to a string format 72 | print(document.convert_tripleid(s, p, o)) 73 | 74 | Join evaluation 75 | ^^^^^^^^^^^^^^^ 76 | 77 | An HDT document also provides support for evaluating joins over a set of triple patterns. 78 | 79 | .. code-block:: python 80 | 81 | from hdt import HDTDocument 82 | document = HDTDocument("test.hdt") 83 | 84 | # find all actors with their names in the HDT document 85 | tp_a = ("?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://example.org#Actor") 86 | tp_b = ("?s", "http://xmlns.com/foaf/0.1/name", "?name") 87 | iterator = document.search_join(set([tp_a, tp_b])) 88 | 89 | print("estimated join cardinality : %i" % len(iterator)) 90 | for mappings in iterator: 91 | print(mappings) 92 | 93 | Ordering 94 | ^^^^^^^^^^^ 95 | 96 | When searching for triples (either in string or triple id format), results are returned ordered by (subject, predicate, object). 97 | However, this order is **not** an order on string values, but an order on **triple ids**. 98 | For example, ``("ex:2", "ex:type", "ex:Person") < ("ex:1", "ex:type", "ex:Person")``, 99 | because their triple id counterparts are ``(1, 2, 3)`` and ``(2, 2, 3)``. 100 | 101 | For more details about this topic, please refer to the `HDT journal article `_. 102 | 103 | Handling non UTF-8 strings in python 104 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 105 | 106 | If the HDT document has been encoded with a non UTF-8 encoding the 107 | previous code won’t work correctly and will result in a 108 | ``UnicodeDecodeError``. 
More details on how to convert string to str 109 | from c++ to python `here`_ 110 | 111 | To handle this we doubled the API of the HDT document by adding: 112 | 113 | - ``search_triples_bytes(...)`` returns an iterator of triples as ``(py::bytes, py::bytes, py::bytes)`` 114 | - ``search_join_bytes(...)`` returns an iterator of sets of solution mappings as ``py::set(py::bytes, py::bytes)`` 115 | - ``convert_tripleid_bytes(...)`` returns a triple as: ``(py::bytes, py::bytes, py::bytes)`` 116 | - ``convert_id_bytes(...)`` returns a ``py::bytes`` 117 | 118 | **Parameters and documentation are the same as the standard version** 119 | 120 | .. code:: python 121 | 122 | from hdt import HDTDocument 123 | 124 | # Load an HDT file. 125 | # Missing indexes are generated automatically, add False as the second argument to disable them 126 | document = HDTDocument("test.hdt") 127 | it = document.search_triples_bytes("", "", "") 128 | 129 | for s, p, o in it: 130 | print(s, p, o) # print b'...', b'...', b'...' 131 | # now decode it, or handle any error 132 | try: 133 | s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8') 134 | except UnicodeDecodeError as err: 135 | # try another codec 136 | pass 137 | 138 | .. _here: https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html 139 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | pyHDT: Read and query HDT document with ease in Python 2 | ====================================================== 3 | 4 | |Build Status| |Documentation Status| |PyPI version| 5 | 6 | Getting started 7 | ================== 8 | 9 | .. toctree:: 10 | :maxdepth: 3 11 | 12 | installation 13 | hdtdocument 14 | api 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | .. 
|Build Status| image:: https://travis-ci.org/Callidon/pyHDT.svg?branch=master 24 | :target: https://travis-ci.org/Callidon/pyHDT 25 | .. |Documentation Status| image:: https://readthedocs.org/projects/pyhdt/badge/?version=latest 26 | :target: https://callidon.github.io/pyHDT 27 | .. |PyPI version| image:: https://badge.fury.io/py/hdt.svg 28 | :target: https://badge.fury.io/py/hdt 29 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============= 3 | 4 | Requirements 5 | ^^^^^^^^^^^^ 6 | 7 | * Python *version 3.6.4 or higher* 8 | * `pip `_ 9 | * **gcc/clang** with **c++11 support** 10 | * **Python Development headers** 11 | 12 | You must have the `Python.h` header available on your system. 13 | For example, for Python 3.4, install the `python3.4-dev` package on Debian/Ubuntu systems. 14 | 15 | Then, install the `pybind11 16 | library `__ 17 | 18 | :: 19 | 20 | pip install pybind11 21 | 22 | Installation 23 | ^^^^^^^^^^^^^ 24 | 25 | Installation in a `virtualenv `_ is **strongly advised!** 26 | 27 | Installation with pip 28 | ------------------------- 29 | 30 | :: 31 | 32 | pip install hdt 33 | 34 | 35 | Manual installation 36 | ------------------------- 37 | 38 | .. code-block:: bash 39 | 40 | git clone --recursive https://github.com/Callidon/pyHDT 41 | cd pyHDT/ 42 | pip install -r requirements.txt 43 | python setup.py install 44 | -------------------------------------------------------------------------------- /include/docstrings.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * docstrings.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef PYHDT_DOCSTRINGS_HPP 7 | #define PYHDT_DOCSTRINGS_HPP 8 | 9 | #include 10 | 11 | const char *MODULE_DOC = R"( 12 | The hdt module enables to load and query HDT files with ease. 
13 | )"; 14 | 15 | /** 16 | * Enums docstrings 17 | */ 18 | 19 | const char *IDENTIFIER_POSITION_DOC = R"( 20 | An enum used to indicate the position (subject, predicate or object) of an Object identifier. 21 | 22 | Possible values: 23 | - ``IdentifierPosition.Subject``: the subject position 24 | - ``IdentifierPosition.Predicate``: the predicate position 25 | - ``IdentifierPosition.Object``: the object position 26 | 27 | .. code-block:: python 28 | 29 | from hdt import IdentifierPosition 30 | print(IdentifierPosition.Subject) 31 | print(IdentifierPosition.Predicate) 32 | print(IdentifierPosition.Object) 33 | 34 | )"; 35 | 36 | /** 37 | * HDT Document docstrings 38 | */ 39 | 40 | const char *HDT_DOCUMENT_CLASS_DOC = R"( 41 | An HDTDocument enables to load and query a HDT file. 42 | 43 | Constructor: 44 | - file ``str``: Path to the HDT file to load. 45 | - predicate ``boolean``: True if additional indexes must be loaded, False otherwise. 46 | )"; 47 | 48 | const char *HDT_DOCUMENT_GETFILEPATH_DOC = R"( 49 | Return the path to the HDT file currently loaded 50 | )"; 51 | 52 | const char *HDT_DOCUMENT_GETNBTRIPLES_DOC = R"( 53 | Return the total number of triples in the HDT document 54 | )"; 55 | 56 | const char *HDT_DOCUMENT_GETNBSUBJECTS_DOC = R"( 57 | Return the number of subjects in the HDT document 58 | )"; 59 | 60 | const char *HDT_DOCUMENT_GETNBPREDICATES_DOC = R"( 61 | Return the number of predicates in the HDT document 62 | )"; 63 | 64 | const char *HDT_DOCUMENT_GETNBOBJECTS_DOC = R"( 65 | Return the number of objects in the HDT document 66 | )"; 67 | 68 | const char *HDT_DOCUMENT_GETNBSHARED_DOC = R"( 69 | Return the number of shared subject-object in the HDT document 70 | )"; 71 | 72 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_DOC = R"( 73 | Search for RDF triples matching the triple pattern { ``subject`` ``predicate`` ``object`` }, 74 | with an optional ``limit`` and ``offset``. 75 | Use empty strings (``""``) to indicate wildcards.
76 | 77 | Args: 78 | - subject ``str``: The subject of the triple pattern to search for. 79 | - predicate ``str``: The predicate of the triple pattern to search for. 80 | - obj ``str``: The object of the triple pattern to search for. 81 | - limit ``int`` ``optional``: Maximum number of triples to search for. 82 | - offset ``int`` ``optional``: Number of matching triples to skip before returning results. 83 | 84 | Return: 85 | A 2-elements ``tuple`` (:class:`hdt.TripleIterator`, estimated pattern cardinality), where 86 | the TripleIterator iterates over matching RDF triples. 87 | 88 | A RDF triple itself is a 3-elements ``tuple`` (subject, predicate, object). 89 | 90 | .. code-block:: python 91 | 92 | from hdt import HDTDocument 93 | document = HDTDocument("test.hdt") 94 | 95 | # Fetch all triples that match { ?s ?p ?o } 96 | (triples, cardinality) = document.search_triples("", "", "") 97 | 98 | print("cardinality of { ?s ?p ?o }: %i" % cardinality) 99 | for triple in triples: 100 | print(triple) 101 | 102 | )"; 103 | 104 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC = R"( 105 | Same as :meth:`hdt.HDTDocument.search_triples`, but RDF triples are represented as unique ids (from the HDT Dictionary). 106 | Use the integer ``0`` to indicate wildcards. 107 | 108 | Mapping between ids and RDF terms is done using :meth:`hdt.HDTDocument.convert_id`, :meth:`hdt.HDTDocument.convert_term` and :meth:`hdt.HDTDocument.convert_tripleid`. 109 | 110 | Args: 111 | - subject ``int``: The Object identifier of the triple pattern's subject. 112 | - predicate ``int``: The Object identifier of the triple pattern's predicate. 113 | - obj ``int``: The Object identifier of the triple pattern's object. 114 | - limit ``int`` ``optional``: Maximum number of triples to search for. 115 | - offset ``int`` ``optional``: Number of matching triples to skip before returning results.
116 | 117 | Return: 118 | A 2-elements ``tuple`` (:class:`hdt.TripleIDIterator`, estimated pattern cardinality), where 119 | the TripleIDIterator iterates over matching RDF triples IDs. 120 | 121 | A RDF triple ID itself is a 3-elements ``tuple`` (subjectID, predicateID, objectID). 122 | 123 | .. code-block:: python 124 | 125 | from hdt import HDTDocument, IdentifierPosition 126 | document = HDTDocument("test.hdt") 127 | 128 | pred = document.convert_term("http://xmlns.com/foaf/0.1/name", IdentifierPosition.Predicate) 129 | # Fetch all RDF triples that match { ?s foaf:name ?o } 130 | (triples, cardinality) = document.search_triples_ids(0, pred, 0) 131 | 132 | print("cardinality of { ?s foaf:name ?o }: %i" % cardinality) 133 | for triple in triples: 134 | print(triple) 135 | 136 | )"; 137 | 138 | const char *HDT_DOCUMENT_SEARCH_JOIN_DOC = R"( 139 | Evaluate a join between a set of triple patterns using an iterator. 140 | A triple pattern itself is a 3-elements ``tuple`` (subject, predicate, object), where SPARQL variables, i.e., join predicates, are prefixed by a ``?``. 141 | 142 | Args: 143 | - patterns ``set``: set of triple patterns. 144 | 145 | Return: 146 | A :class:`hdt.JoinIterator`, which can be consumed as a Python iterator to evaluate the join. 147 | 148 | .. code-block:: python 149 | 150 | from hdt import HDTDocument 151 | document = HDTDocument("test.hdt") 152 | 153 | # find all actors with their names in the HDT document 154 | tp_a = ("?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://example.org#Actor") 155 | tp_b = ("?s", "http://xmlns.com/foaf/0.1/name", "?name") 156 | iterator = document.search_join(set([tp_a, tp_b])) 157 | 158 | print("estimated join cardinality : %i" % len(iterator)) 159 | for mappings in iterator: 160 | print(mappings) 161 | 162 | )"; 163 | 164 | const char *HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC = R"( 165 | Transform a RDF triple from a TripleID representation to a string representation. 166 | 167 | Args: 168 | - subject ``int``: unique ID of the subject.
169 | - predicate ``int``: unique ID of the predicate. 170 | - obj ``int``: unique ID of the object. 171 | 172 | Return: 173 | A triple in string representation, i.e., a 3-elements ``tuple`` (subject, predicate, object) 174 | 175 | .. code-block:: python 176 | 177 | from hdt import HDTDocument, IdentifierPosition 178 | document = HDTDocument("test.hdt") 179 | 180 | # Fetch all triples that match { ?s foaf:name ?o } 181 | pred = document.convert_term("http://xmlns.com/foaf/0.1/name", IdentifierPosition.Predicate) 182 | (triples, cardinality) = document.search_triples_ids(0, pred, 0) 183 | 184 | for s, p, o in triples: 185 | print(s, p, o) # will print Object identifiers, i.e., integers 186 | # convert a triple ID to a string format 187 | print(document.convert_tripleid(s, p, o)) 188 | 189 | )"; 190 | 191 | const char *HDT_DOCUMENT_CONVERT_ID_DOC = R"( 192 | Transform an Object Identifier to a RDF term. 193 | Such identifiers are used in TripleID. 194 | 195 | Args: 196 | - id ``int``: Object identifier. 197 | - position :class:`hdt.IdentifierPosition`: Identifier position. 198 | 199 | Return: 200 | The RDF term associated with the Object Identifier, i.e., either an URI or a RDF literal. 201 | 202 | .. code-block:: python 203 | 204 | from hdt import HDTDocument, IdentifierPosition 205 | document = HDTDocument("test.hdt") 206 | print(document.convert_id(10, IdentifierPosition.Subject)) 207 | 208 | )"; 209 | 210 | const char *HDT_DOCUMENT_CONVERT_TERM_DOC = R"( 211 | Transform an RDF Term to the associated Object Identifier. 212 | Such identifiers are used in TripleID. 213 | 214 | Args: 215 | - term ``str``: RDF Term. 216 | - position :class:`hdt.IdentifierPosition`: Identifier position. 217 | 218 | Return: 219 | The Object Identifier associated with the RDF Term 220 | 221 | .. 
code-block:: python 222 | 223 | from hdt import HDTDocument, IdentifierPosition 224 | document = HDTDocument("test.hdt") 225 | print(document.convert_term("http://example.org#Alice", IdentifierPosition.Subject)) 226 | 227 | )"; 228 | 229 | /** 230 | * TripleIterator & TripleIDIterator docstrings 231 | */ 232 | 233 | const char *TRIPLE_ITERATOR_CLASS_DOC = R"( 234 | A TripleIterator iterates over triples in a HDT file matching a triple pattern, with an optional limit & offset. 235 | 236 | Such iterator is returned by :meth:`hdt.HDTDocument.search_triples`. 237 | )"; 238 | 239 | const char *TRIPLE_ID_ITERATOR_CLASS_DOC = R"( 240 | A TripleIDIterator iterates over triples' IDs in a HDT file matching a triple pattern, with an optional limit & offset. 241 | 242 | Such iterator is returned by :meth:`hdt.HDTDocument.search_triples_ids` 243 | 244 | Conversion from a tuple of triple ids into a RDF triple is done using :meth:`hdt.HDTDocument.convert_tripleid`. 245 | )"; 246 | 247 | const char *TRIPLE_ITERATOR_NEXT_DOC = R"( 248 | Return the next matching triple read by the iterator, or raise ``StopIteration`` if there are no more items to yield. 249 | )"; 250 | 251 | const char *TRIPLE_ITERATOR_PEEK_DOC = R"( 252 | Return the next matching triple read by the iterator without advancing it, or raise ``StopIteration`` if there are no more items to yield. 253 | )"; 254 | 255 | const char *TRIPLE_ITERATOR_HASNEXT_DOC = R"( 256 | Return true if the iterator still has items to yield, false otherwise. 257 | )"; 258 | 259 | const char *TRIPLE_ITERATOR_GETSUBJECT_DOC = R"( 260 | Return the subject of the triple pattern currently evaluated. 261 | )"; 262 | 263 | const char *TRIPLE_ITERATOR_GETPREDICATE_DOC = R"( 264 | Return the predicate of the triple pattern currently evaluated. 265 | )"; 266 | 267 | const char *TRIPLE_ITERATOR_GETOBJECT_DOC = R"( 268 | Return the object of the triple pattern currently evaluated.
269 | )"; 270 | 271 | const char *TRIPLE_ITERATOR_GETLIMIT_DOC = R"( 272 | Return the limit of the iterator, i.e., the maximum number of items the iterator will yield. 273 | A limit of 0 indicates that the iterator limit is the cardinality of the triple pattern currently evaluated. 274 | )"; 275 | 276 | const char *TRIPLE_ITERATOR_GETOFFSET_DOC = R"( 277 | Return the offset of the iterator, i.e., the number of items the iterator will first skip before yielding. 278 | An offset of 0 indicates that the iterator will not skip any items. 279 | )"; 280 | 281 | const char *TRIPLE_ITERATOR_NBREADS_DOC = R"( 282 | Return the number of items read by the iterator until now. 283 | Does not include any offset, thus the real position of the iterator in the collection of triples can be computed as offset + nb_reads 284 | )"; 285 | 286 | const char *TRIPLE_ITERATOR_SIZE_DOC = R"( 287 | Get a hint on the cardinality of the triple pattern currently evaluated. 288 | The iterator's limit and offset are not taken into account. 289 | 290 | Return: 291 | A 2-element ``tuple`` (integer, boolean), where the left member is the estimated cardinality, 292 | and the right member is True if the estimation is accurate, False otherwise 293 | )"; 294 | 295 | const char *TRIPLE_ITERATOR_ACC_ESTIMATION_DOC = R"( 296 | Return True if the iterator can accurately estimate the cardinality of the triple pattern, False otherwise. 297 | )"; 298 | 299 | const char *JOIN_ITERATOR_CLASS_DOC = R"( 300 | A JoinIterator iterates over the set of solution mappings for a join between several triple patterns. It implements the Python iterator protocol and yields sets of solutions mappings. 301 | 302 | Such iterator is returned by :meth:`hdt.HDTDocument.search_join` 303 | )"; 304 | 305 | const char *JOIN_ITERATOR_NEXT_DOC = R"( 306 | Return the next set of solution mappings read by the iterator, or raise ``StopIteration`` if there are no more items to yield.
307 | )"; 308 | 309 | const char *JOIN_ITERATOR_HAS_NEXT_DOC = R"( 310 | Return true if the iterator still has items to yield, false otherwise. 311 | )"; 312 | 313 | const char *JOIN_ITERATOR_SIZE_DOC = R"( 314 | Return the estimated join cardinality. 315 | )"; 316 | 317 | const char *JOIN_ITERATOR_RESET_DOC = R"( 318 | Reset the join, i.e., move the iterator back to its initial state. 319 | )"; 320 | 321 | #endif /* PYHDT_DOCSTRINGS_HPP */ 322 | -------------------------------------------------------------------------------- /include/hdt_document.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt_document.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef PYHDT_DOCUMENT_HPP 7 | #define PYHDT_DOCUMENT_HPP 8 | 9 | #include 10 | #include "HDT.hpp" 11 | #include "QueryProcessor.hpp" 12 | #include "pyhdt_types.hpp" 13 | #include "triple_iterator.hpp" 14 | #include "triple_iterator_bytes.hpp" 15 | #include "tripleid_iterator.hpp" 16 | #include "join_iterator.hpp" 17 | #include "join_iterator_bytes.hpp" 18 | #include 19 | #include 20 | #include 21 | namespace py = pybind11; 22 | 23 | // The result of a search for a triple pattern in a HDT document: 24 | // a tuple (matching RDF triples, nb of matching RDF triples) 25 | typedef std::tuple search_results; 26 | 27 | // The result of a search for a triple pattern in a HDT document: 28 | // a tuple (matching RDF triples, nb of matching RDF triples) 29 | typedef std::tuple search_results_bytes; 30 | 31 | // Same as seach_results, but for an iterator over triple ids 32 | typedef std::tuple search_results_ids; 33 | 34 | /*! 35 | * HDTDocument is the main entry to manage an hdt document 36 | * \author Thomas Minier 37 | */ 38 | class HDTDocument { 39 | private: 40 | std::string hdt_file; 41 | hdt::HDT *hdt; 42 | hdt::QueryProcessor *processor; 43 | HDTDocument(std::string file, bool map, bool indexed); 44 | 45 | public: 46 | /*! 
47 | * Destructor 48 | */ 49 | ~HDTDocument(); 50 | 51 | /*! 52 | * Get the path to the HDT file currently loaded 53 | * @return The path to the HDT file currently loaded 54 | */ 55 | std::string getFilePath(); 56 | 57 | /*! 58 | * Implementation for Python function "__repr__" 59 | * @return A string representation of the object 60 | */ 61 | std::string python_repr(); 62 | 63 | /*! 64 | * Get the total number of triples in the HDT document 65 | * @return The total number of triples in the HDT document 66 | */ 67 | unsigned int getNbTriples(); 68 | 69 | /*! 70 | * Get the number of distinct subjects in the HDT document 71 | * @return The number of distinct subjects in the HDT document 72 | */ 73 | unsigned int getNbSubjects(); 74 | 75 | /*! 76 | * Get the number of distinct predicates in the HDT document 77 | * @return The number of distinct predicates in the HDT document 78 | */ 79 | unsigned int getNbPredicates(); 80 | 81 | /*! 82 | * Get the number of distinct objects in the HDT document 83 | * @return The number of distinct objects in the HDT document 84 | */ 85 | unsigned int getNbObjects(); 86 | 87 | /*! 88 | * Get the number of shared subjects-objects in the HDT document 89 | * @return The number of shared subjects-objects in the HDT document 90 | */ 91 | unsigned int getNbShared(); 92 | 93 | /*! 94 | * Static factory method used to create a new HDT Document 95 | * @param file - Path to the HDT file 96 | * @param map - True maps the HDT file (faster), False loads everything in memory 97 | * @param indexed - True if the HDT must be loaded with indexes, False otherwise 98 | */ 99 | static HDTDocument create(std::string file, bool map, bool indexed) { 100 | return HDTDocument(file, map, indexed); 101 | } 102 | 103 | /*! 
104 | * Convert a TripleID to a string RDF triple 105 | * @param subject - Triple's subject 106 | * @param predicate - Triple's predicate 107 | * @param object - Triple's object 108 | * @return The associated RDF triple 109 | */ 110 | triple convertTripleID(unsigned int subject, unsigned int predicate, 111 | unsigned int object); 112 | 113 | /** 114 | * Convert an Object Identifier into the equivalent an RDF term 115 | * @param id - Object Identifier 116 | * @param pos - Identifier position (subject, predicate or object) 117 | * @return The an RDF term equivalent to the Object Identifier 118 | */ 119 | string convertID(unsigned int id, IdentifierPosition pos); 120 | 121 | /** 122 | * Convert an RDF term into the associated an Object Identifier. 123 | * @param term - RDF Term in string format 124 | * @param pos - Identifier position (subject, predicate or object) 125 | * @return The Object Identifier associated with the RDF term 126 | */ 127 | unsigned int convertTerm(std::string term, IdentifierPosition pos); 128 | 129 | /*! 130 | * Search all matching triples for a triple pattern, whith an optional limit and offset. 131 | * Returns a tuple 132 | * @param subject - Triple pattern's subject 133 | * @param predicate - Triple pattern's predicate 134 | * @param object - Triple pattern's object 135 | * @param limit - (Optional) Maximum number of matching triples to read 136 | * @param offset - (Optional) Number of matching triples to skip 137 | * @return A tuple (TripleIterator*, cardinality) 138 | */ 139 | search_results search(std::string subject, std::string predicate, 140 | std::string object, unsigned int limit = 0, 141 | unsigned int offset = 0); 142 | 143 | /*! 144 | * Same as HDTDocument#search, but search for TripleIDs instead. 
145 | * Returns a tuple 146 | * @param subject - Triple pattern's subject identifier 147 | * @param predicate - Triple pattern's predicate identifier 148 | * @param object - Triple pattern's object identifier 149 | * @param limit - (Optional) Maximum number of matching triples to read 150 | * @param offset - (Optional) Number of matching triples to skip 151 | * @return A tuple (TripleIDIterator*, cardinality) 152 | */ 153 | search_results_ids searchIDs(unsigned int subject, unsigned int predicate, 154 | unsigned int object, unsigned int limit = 0, 155 | unsigned int offset = 0); 156 | 157 | /** 158 | * Evaluate a join between a set of triple patterns using a JoinIterator. 159 | * @param patterns - Set of triple patterns 160 | * @return A JoinIterator* used to evaluated the join. 161 | */ 162 | JoinIterator * searchJoin(std::vector patterns); 163 | 164 | // ============== BYTES REPRESENTATION ============== 165 | // Author: Arnaud GRALL - MIT License 2017-2019 166 | /*! 167 | * Search all matching triples for a triple pattern, whith an optional limit and offset. Returns bytes instead of string 168 | * Returns a tuple 169 | * @param subject - Triple pattern's subject 170 | * @param predicate - Triple pattern's predicate 171 | * @param object - Triple pattern's object 172 | * @param limit - (Optional) Maximum number of matching triples to read 173 | * @param offset - (Optional) Number of matching triples to skip 174 | * @return A tuple (TripleIterator*, cardinality) 175 | */ 176 | search_results_bytes searchBytes(std::string subject, std::string predicate, 177 | std::string object, unsigned int limit = 0, 178 | unsigned int offset = 0); 179 | /** 180 | * Evaluate a join between a set of triple patterns using a JoinIterator. 181 | * @param patterns - Set of triple patterns 182 | * @return A JoinIterator* used to evaluated the join. 183 | */ 184 | JoinIteratorBytes * searchJoinBytes(std::vector patterns); 185 | /*! 
186 | * Convert a TripleID to a RDF triple as bytes 187 | * @param subject - Triple's subject 188 | * @param predicate - Triple's predicate 189 | * @param object - Triple's object 190 | * @return The associated RDF triple 191 | */ 192 | triple_bytes convertTripleIDBytes(unsigned int subject, unsigned int predicate, 193 | unsigned int object); 194 | 195 | /** 196 | * Convert an Object Identifier into the equivalent an RDF term as bytes 197 | * @param id - Object Identifier 198 | * @param pos - Identifier position (subject, predicate or object) 199 | * @return The an RDF term equivalent to the Object Identifier 200 | */ 201 | py::bytes convertIDBytes(unsigned int id, IdentifierPosition pos); 202 | }; 203 | 204 | #endif /* PYHDT_DOCUMENT_HPP */ 205 | -------------------------------------------------------------------------------- /include/join_iterator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef JOIN_ITERATOR_HPP 7 | #define JOIN_ITERATOR_HPP 8 | 9 | #include "pyhdt_types.hpp" 10 | #include "QueryProcessor.hpp" 11 | #include 12 | 13 | /*! 14 | * JoinIterator iterates over solution bindings of a join 15 | * @author Thomas Minier 16 | */ 17 | class JoinIterator { 18 | private: 19 | hdt::VarBindingString *iterator; 20 | bool hasNextSolution = true; 21 | 22 | public: 23 | /*! 24 | * Constructor 25 | * @param iterator [description] 26 | */ 27 | JoinIterator(hdt::VarBindingString *_it); 28 | 29 | /*! 30 | * Destructor 31 | */ 32 | ~JoinIterator(); 33 | 34 | /*! 35 | * Implementation for Python function "__repr__" 36 | * @return [description] 37 | */ 38 | std::string python_repr(); 39 | 40 | /*! 
41 | * Implementation for Python function "__iter__" 42 | * @return [description] 43 | */ 44 | JoinIterator *python_iter(); 45 | 46 | /** 47 | * Get the estimated join cardinality 48 | * @return [description] 49 | */ 50 | size_t estimatedCardinality(); 51 | 52 | /** 53 | * Reset the iterator into its initial state and restart join processing. 54 | */ 55 | void reset(); 56 | 57 | /*! 58 | * Return true if the iterator still has items available, False otherwise. 59 | * @return [description] 60 | */ 61 | bool hasNext(); 62 | 63 | /** 64 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator 65 | * has ended. Used to implement Python Itertor protocol. 66 | * @return [description] 67 | */ 68 | solution_bindings next(); 69 | 70 | }; 71 | 72 | #endif /* JOIN_ITERATOR_HPP */ 73 | -------------------------------------------------------------------------------- /include/join_iterator_bytes.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator.hpp 3 | * Author: Arnaud Grall - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef JOIN_ITERATOR_BYTES_HPP 7 | #define JOIN_ITERATOR_BYTES_HPP 8 | 9 | #include "pyhdt_types.hpp" 10 | #include "QueryProcessor.hpp" 11 | #include 12 | 13 | /*! 14 | * JoinIterator iterates over solution bindings of a join 15 | * @author Arnaud Grall 16 | */ 17 | class JoinIteratorBytes { 18 | private: 19 | hdt::VarBindingString *iterator; 20 | bool hasNextSolution = true; 21 | 22 | public: 23 | /*! 24 | * Constructor 25 | * @param iterator [description] 26 | */ 27 | JoinIteratorBytes(hdt::VarBindingString *_it); 28 | 29 | /*! 30 | * Destructor 31 | */ 32 | ~JoinIteratorBytes(); 33 | 34 | /*! 35 | * Implementation for Python function "__repr__" 36 | * @return [description] 37 | */ 38 | std::string python_repr(); 39 | 40 | /*! 
41 | * Implementation for Python function "__iter__" 42 | * @return [description] 43 | */ 44 | JoinIteratorBytes *python_iter(); 45 | 46 | /** 47 | * Get the estimated join cardinality 48 | * @return [description] 49 | */ 50 | size_t estimatedCardinality(); 51 | 52 | /** 53 | * Reset the iterator into its initial state and restart join processing. 54 | */ 55 | void reset(); 56 | 57 | /*! 58 | * Return true if the iterator still has items available, False otherwise. 59 | * @return [description] 60 | */ 61 | bool hasNext(); 62 | 63 | /** 64 | * Return the next set of solutions bindings, or raise py::StopIteration if the iterator 65 | * has ended. Used to implement Python Itertor protocol. 66 | * @return [description] 67 | */ 68 | py::set next(); 69 | 70 | }; 71 | 72 | #endif /* JOIN_ITERATOR_BYTES_HPP */ 73 | -------------------------------------------------------------------------------- /include/pyhdt_types.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt_types.hpp 3 | * Author: Thomas MINIER, Arnaud Grall - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef PYHDT_TYPES_HPP 7 | #define PYHDT_TYPES_HPP 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | namespace py = pybind11; 15 | 16 | /** 17 | * Indictates the position of an Object Identifier 18 | */ 19 | enum IdentifierPosition { 20 | Subject = 1, 21 | Predicate = 2, 22 | Object = 3 23 | }; 24 | 25 | // A RDF Triple. RDF terms are represented as simple strings by HDT. 
26 | typedef std::tuple triple; 27 | 28 | // A RDF triple composed of IDs from HDT dictionnary 29 | typedef std::tuple triple_id; 30 | 31 | // A list of RDF triples 32 | typedef std::list triple_list; 33 | 34 | // A list of RDF triples IDs 35 | typedef std::list triple_ids_list; 36 | 37 | // A hint over the cardinality of a triple pattern 38 | // The right element of the tuple is True if the hint is accurate, False otherwise 39 | typedef std::tuple size_hint; 40 | 41 | typedef std::tuple single_binding; 42 | 43 | typedef std::set *solution_bindings; 44 | 45 | // ============== BYTES REPRESENTATION ============== 46 | // A RDF Triple. RDF terms are represented as simple bytes by HDT. 47 | typedef std::tuple triple_bytes; 48 | // A Set of solutions bindings for the join iterator 49 | typedef py::set solution_bindings_bytes; 50 | 51 | #endif /* PYHDT_TYPES_HPP */ 52 | -------------------------------------------------------------------------------- /include/triple_iterator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * triple_iterator.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef TRIPLE_ITERATOR_HPP 7 | #define TRIPLE_ITERATOR_HPP 8 | 9 | #include "tripleid_iterator.hpp" 10 | #include "pyhdt_types.hpp" 11 | #include "Dictionary.hpp" 12 | #include 13 | 14 | /*! 15 | * TripleIterator iterates over RDF triples of an HDT document which match a 16 | * triple pattern + limit + offset \author Thomas Minier 17 | */ 18 | class TripleIterator { 19 | private: 20 | TripleIDIterator *iterator; 21 | hdt::Dictionary *dictionary; 22 | 23 | public: 24 | /*! 25 | * Constructor 26 | * @param iterator [description] 27 | */ 28 | TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict); 29 | 30 | /*! 31 | * Destructor 32 | */ 33 | ~TripleIterator(); 34 | 35 | /*! 36 | * Implementation for Python function "__repr__" 37 | * @return [description] 38 | */ 39 | std::string python_repr(); 40 | 41 | /*! 
42 | * Get the subject of the triple pattern currently evaluated. 43 | * An empty string represents a variable 44 | * @return [description] 45 | */ 46 | std::string getSubject(); 47 | 48 | /*! 49 | * Get the predicate of the triple pattern currently evaluated. 50 | * An empty string represents a variable 51 | * @return [description] 52 | */ 53 | std::string getPredicate(); 54 | 55 | /*! 56 | * Get the object of the triple pattern currently evaluated. 57 | * An empty string represents a variable 58 | * @return [description] 59 | */ 60 | std::string getObject(); 61 | 62 | /*! 63 | * Get the limit of the current iterator 64 | * @return [description] 65 | */ 66 | unsigned int getLimit(); 67 | 68 | /*! 69 | * Get the offset of the current iterator 70 | * @return [description] 71 | */ 72 | unsigned int getOffset(); 73 | 74 | /*! 75 | * Get the number of results read by the iterator 76 | * @return [description] 77 | */ 78 | unsigned int getNbResultsRead(); 79 | 80 | /*! 81 | * Implementation for Python function "__iter__" 82 | * @return [description] 83 | */ 84 | TripleIterator *python_iter(); 85 | 86 | /*! 87 | * Get the estimated cardinality of the pattern currently evaluated. 88 | * Offset & limit are not taken into account. 89 | * @return [description] 90 | */ 91 | size_hint sizeHint(); 92 | 93 | /*! 94 | * Return true if the iterator still has items available, False otherwise. 95 | * @return [description] 96 | */ 97 | bool hasNext(); 98 | 99 | /** 100 | * Get the next item in the iterator, or raise py::StopIteration if the 101 | * iterator has ended 102 | * @return [description] 103 | */ 104 | triple next(); 105 | 106 | /** 107 | * Get the next item in the iterator, or raise py::StopIteration if the 108 | * iterator has ended, but without advancing the iterator. 
109 | * @return [description] 110 | */ 111 | triple peek(); 112 | }; 113 | 114 | #endif /* TRIPLE_ITERATOR_HPP */ 115 | -------------------------------------------------------------------------------- /include/triple_iterator_bytes.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * triple_iterator_bytes.hpp 3 | * Author: Arnaud GRALL - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef TRIPLE_ITERATOR_BYTES_HPP 7 | #define TRIPLE_ITERATOR_BYTES_HPP 8 | 9 | #include "tripleid_iterator.hpp" 10 | #include "pyhdt_types.hpp" 11 | #include "Dictionary.hpp" 12 | #include 13 | 14 | /*! 15 | * TripleIterator iterates over RDF triples of an HDT document which match a 16 | * triple pattern + limit + offset \author Thomas Minier 17 | */ 18 | class TripleIteratorBytes { 19 | private: 20 | TripleIDIterator *iterator; 21 | hdt::Dictionary *dictionary; 22 | 23 | public: 24 | /*! 25 | * Constructor 26 | * @param iterator [description] 27 | */ 28 | TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict); 29 | 30 | /*! 31 | * Destructor 32 | */ 33 | ~TripleIteratorBytes(); 34 | 35 | /*! 36 | * Implementation for Python function "__repr__" 37 | * @return [description] 38 | */ 39 | std::string python_repr(); 40 | 41 | /*! 42 | * Get the subject of the triple pattern currently evaluated. 43 | * An empty string represents a variable 44 | * @return [description] 45 | */ 46 | std::string getSubject(); 47 | 48 | /*! 49 | * Get the predicate of the triple pattern currently evaluated. 50 | * An empty string represents a variable 51 | * @return [description] 52 | */ 53 | std::string getPredicate(); 54 | 55 | /*! 56 | * Get the object of the triple pattern currently evaluated. 57 | * An empty string represents a variable 58 | * @return [description] 59 | */ 60 | std::string getObject(); 61 | 62 | /*! 63 | * Get the limit of the current iterator 64 | * @return [description] 65 | */ 66 | unsigned int getLimit(); 67 | 68 | /*! 
69 | * Get the offset of the current iterator 70 | * @return [description] 71 | */ 72 | unsigned int getOffset(); 73 | 74 | /*! 75 | * Get the number of results read by the iterator 76 | * @return [description] 77 | */ 78 | unsigned int getNbResultsRead(); 79 | 80 | /*! 81 | * Implementation for Python function "__iter__" 82 | * @return [description] 83 | */ 84 | TripleIteratorBytes *python_iter(); 85 | 86 | /*! 87 | * Get the estimated cardinality of the pattern currently evaluated. 88 | * Offset & limit are not taken into account. 89 | * @return [description] 90 | */ 91 | size_hint sizeHint(); 92 | 93 | /*! 94 | * Return true if the iterator still has items available, False otherwise. 95 | * @return [description] 96 | */ 97 | bool hasNext(); 98 | 99 | /** 100 | * Get the next item in the iterator, or raise py::StopIteration if the 101 | * iterator has ended 102 | * @return [description] 103 | */ 104 | triple_bytes next(); 105 | 106 | /** 107 | * Get the next item in the iterator, or raise py::StopIteration if the 108 | * iterator has ended, but without advancing the iterator. 109 | * @return [description] 110 | */ 111 | triple_bytes peek(); 112 | }; 113 | 114 | #endif /* TRIPLE_ITERATOR_BYTES_HPP */ 115 | -------------------------------------------------------------------------------- /include/tripleid_iterator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * tripleid_iterator.hpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #ifndef TRIPLEID_ITERATOR_HPP 7 | #define TRIPLEID_ITERATOR_HPP 8 | 9 | #include "pyhdt_types.hpp" 10 | #include 11 | #include 12 | 13 | /*! 
14 | * TripleIDIterator iterates over IDs of RDF triples of an HDT document which 15 | * match a triple pattern + limit + offset \author Thomas Minier 16 | */ 17 | class TripleIDIterator { 18 | private: 19 | std::string subject; 20 | std::string predicate; 21 | std::string object; 22 | unsigned int limit; 23 | unsigned int offset; 24 | hdt::IteratorTripleID *iterator; 25 | triple_id _bufferedTriple; 26 | bool hasBufferedTriple = false; 27 | unsigned int resultsRead = 0; 28 | 29 | public: 30 | /*! 31 | * Constructor 32 | * @param _it - HDT iterator to wrap; _subj/_pred/_obj, _limit and _offset describe the evaluated pattern 33 | */ 34 | TripleIDIterator(hdt::IteratorTripleID *_it, std::string _subj, 35 | std::string _pred, std::string _obj, unsigned int _limit, 36 | unsigned int _offset); 37 | 38 | /*! 39 | * Destructor 40 | */ 41 | ~TripleIDIterator(); 42 | 43 | /*! 44 | * Implementation for Python function "__repr__" 45 | * @return A string representation of the iterator 46 | */ 47 | std::string python_repr(); 48 | 49 | /*! 50 | * Get the subject of the triple pattern currently evaluated. 51 | * @return The subject of the triple pattern 52 | */ 53 | std::string getSubject(); 54 | 55 | /*! 56 | * Get the predicate of the triple pattern currently evaluated. 57 | * @return The predicate of the triple pattern 58 | */ 59 | std::string getPredicate(); 60 | 61 | /*! 62 | * Get the object of the triple pattern currently evaluated. 63 | * @return The object of the triple pattern 64 | */ 65 | std::string getObject(); 66 | 67 | /*! 68 | * Get the limit of the current iterator 69 | * @return The limit of the current iterator 70 | */ 71 | unsigned int getLimit(); 72 | 73 | /*! 74 | * Get the offset of the current iterator 75 | * @return The offset of the current iterator 76 | */ 77 | unsigned int getOffset(); 78 | 79 | /*! 80 | * Get the number of results read by the iterator 81 | * @return The number of results read so far 82 | */ 83 | unsigned int getNbResultsRead(); 84 | 85 | /*! 86 | * Implementation for Python function "__iter__" 87 | * @return The TripleIDIterator to iterate on from Python 88 | */ 89 | TripleIDIterator *python_iter(); 90 | 91 | /*! 92 | * Get the estimated cardinality of the pattern currently evaluated.
93 | * Offset & limit are not taken into account. 94 | * @return The estimated cardinality, as a size_hint 95 | */ 96 | size_hint sizeHint(); 97 | 98 | /*! 99 | * Return true if the iterator still has items available, False otherwise. 100 | * @return true if items are still available, false otherwise 101 | */ 102 | bool hasNext(); 103 | 104 | /** 105 | * Get the next item in the iterator, or raise py::StopIteration if the 106 | * iterator has ended 107 | * @return The next triple, as a triple_id 108 | */ 109 | triple_id next(); 110 | 111 | /** 112 | * Get the next item in the iterator, or raise py::StopIteration if the 113 | * iterator has ended, but without advancing the iterator. 114 | * @return The next triple, as a triple_id (the iterator is not advanced) 115 | */ 116 | triple_id peek(); 117 | }; 118 | 119 | #endif /* TRIPLEID_ITERATOR_HPP */ 120 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Script for automated installation; stops at the first failing step instead of carrying on 3 | set -e 4 | echo "Validating dependencies..." 5 | command -v python >/dev/null 2>&1 || { echo >&2 "Python is required for the installation of pyHDT! Aborting installation..."; exit 1; } 6 | command -v pip >/dev/null 2>&1 || { echo >&2 "pip is required for the installation of pyHDT! Aborting installation..."; exit 1; } 7 | command -v curl >/dev/null 2>&1 || { echo >&2 "curl is required for the installation of pyHDT! Aborting installation..."; exit 1; } 8 | command -v unzip >/dev/null 2>&1 || { echo >&2 "unzip is required for the installation of pyHDT! Aborting installation..."; exit 1; } 9 | 10 | echo "Downloading HDT..." 11 | curl -fLO https://github.com/rdfhdt/hdt-cpp/archive/v1.3.3.zip 12 | unzip -qq v1.3.3.zip 13 | 14 | echo "Installing pybind11..." 15 | pip install -r requirements.txt 16 | 17 | echo "Installing pyHDT..." 18 | python setup.py install 19 | 20 | echo "Cleaning up..."
21 | rm v1.3.3.zip 22 | rm -rf hdt-cpp-1.3.3/ 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11==2.2.4 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | from setuptools import setup, Extension 4 | from os import listdir 5 | import pybind11 6 | 7 | __pyhdt_version__ = "2.3" 8 | 9 | PYBIND_VERSION = 'pybind11==2.2.4' 10 | 11 | with open('README.rst') as file: 12 | long_description = file.read() 13 | 14 | 15 | def list_files(path, extension=".cpp", exclude="S.cpp"): 16 | """List paths to all files that ends with a given extension""" 17 | return ["%s/%s" % (path, f) for f in listdir(path) if f.endswith(extension) and (not f.endswith(exclude))] 18 | 19 | 20 | # pyHDT source files 21 | sources = [ 22 | "src/hdt.cpp", 23 | "src/hdt_document.cpp", 24 | "src/triple_iterator.cpp", 25 | "src/triple_iterator_bytes.cpp", 26 | "src/tripleid_iterator.cpp", 27 | "src/join_iterator.cpp", 28 | "src/join_iterator_bytes.cpp" 29 | ] 30 | 31 | # HDT source files 32 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/bitsequence") 33 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/coders") 34 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/mapper") 35 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/sequence") 36 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/permutation") 37 | sources += list_files("hdt-cpp-1.3.3/libcds/src/utils") 38 | sources += 
list_files("hdt-cpp-1.3.3/libhdt/src/bitsequence") 39 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/dictionary") 40 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/hdt") 41 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/header") 42 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/huffman") 43 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/libdcs") 44 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex") 45 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/rdf") 46 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/sequence") 47 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/triples") 48 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/util") 49 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/sparql") 50 | 51 | # pybind11 + pyHDT + libcds + HDT-lib headers 52 | include_dirs = [ 53 | pybind11.get_include(), 54 | pybind11.get_include(True), 55 | "include/", 56 | "hdt-cpp-1.3.3/libhdt/include/", 57 | "hdt-cpp-1.3.3/libhdt/src/dictionary/", 58 | "hdt-cpp-1.3.3/libhdt/src/sparql/", 59 | "hdt-cpp-1.3.3/libcds/include/", 60 | "hdt-cpp-1.3.3/libcds/src/static/bitsequence", 61 | "hdt-cpp-1.3.3/libcds/src/static/coders", 62 | "hdt-cpp-1.3.3/libcds/src/static/mapper", 63 | "hdt-cpp-1.3.3/libcds/src/static/permutation", 64 | "hdt-cpp-1.3.3/libcds/src/static/sequence", 65 | "hdt-cpp-1.3.3/libcds/src/utils" 66 | ] 67 | 68 | # Need to build in c++11 minimum 69 | # TODO add a check to use c++14 or c++17 if available 70 | extra_compile_args = ["-std=c++11"] 71 | 72 | # build HDT extension 73 | hdt_extension = Extension("hdt", sources=sources, include_dirs=include_dirs, 74 | extra_compile_args=extra_compile_args, language='c++') 75 | 76 | setup( 77 | name="hdt", 78 | version=__pyhdt_version__, 79 | author="Thomas Minier", 80 | author_email="thomas.minier@univ-nantes.fr", 81 | url="https://github.com/Callidon/pyHDT", 82 | description="Read and query HDT document with ease in Python", 83 | long_description=long_description, 84 | keywords=["hdt", "rdf", 
"semantic web", "search"], 85 | license="MIT", 86 | install_requires=[PYBIND_VERSION], 87 | setup_requires=[PYBIND_VERSION], 88 | ext_modules=[hdt_extension] 89 | ) 90 | -------------------------------------------------------------------------------- /src/hdt.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include 7 | #include 8 | 9 | #include "docstrings.hpp" 10 | #include "hdt_document.hpp" 11 | #include "triple_iterator.hpp" 12 | #include "triple_iterator_bytes.hpp" 13 | #include "tripleid_iterator.hpp" 14 | #include "join_iterator.hpp" 15 | #include "join_iterator_bytes.hpp" 16 | 17 | namespace py = pybind11; 18 | 19 | PYBIND11_MODULE(hdt, m) { 20 | m.doc() = MODULE_DOC; 21 | 22 | py::enum_(m, "IdentifierPosition", IDENTIFIER_POSITION_DOC) 23 | .value("Subject", IdentifierPosition::Subject) 24 | .value("Predicate", IdentifierPosition::Predicate) 25 | .value("Object", IdentifierPosition::Object) 26 | .export_values(); 27 | 28 | py::class_(m, "TripleIterator", TRIPLE_ITERATOR_CLASS_DOC) 29 | .def("next", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 30 | .def("__next__", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 31 | .def("peek", &TripleIterator::peek, TRIPLE_ITERATOR_PEEK_DOC) 32 | .def("has_next", &TripleIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC) 33 | .def("size_hint", &TripleIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 34 | .def("__len__", &TripleIterator::sizeHint, 35 | TRIPLE_ITERATOR_SIZE_DOC) 36 | .def("__iter__", &TripleIterator::python_iter) 37 | .def_property_readonly("subject", &TripleIterator::getSubject, 38 | TRIPLE_ITERATOR_GETSUBJECT_DOC) 39 | .def_property_readonly("predicate", &TripleIterator::getPredicate, 40 | TRIPLE_ITERATOR_GETPREDICATE_DOC) 41 | .def_property_readonly("object", &TripleIterator::getObject, 42 | TRIPLE_ITERATOR_GETOBJECT_DOC) 43 | .def_property_readonly("limit", 
&TripleIterator::getLimit, 44 | TRIPLE_ITERATOR_GETLIMIT_DOC) 45 | .def_property_readonly("offset", &TripleIterator::getOffset, 46 | TRIPLE_ITERATOR_GETOFFSET_DOC) 47 | .def_property_readonly("nb_reads", &TripleIterator::getNbResultsRead, 48 | TRIPLE_ITERATOR_NBREADS_DOC) 49 | .def("__repr__", &TripleIterator::python_repr); 50 | 51 | py::class_(m, "TripleIteratorBytes", TRIPLE_ITERATOR_CLASS_DOC) 52 | .def("next", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC) 53 | .def("__next__", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC) 54 | .def("peek", &TripleIteratorBytes::peek, TRIPLE_ITERATOR_PEEK_DOC) 55 | .def("has_next", &TripleIteratorBytes::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC) 56 | .def("size_hint", &TripleIteratorBytes::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 57 | .def("__len__", &TripleIteratorBytes::sizeHint, 58 | TRIPLE_ITERATOR_SIZE_DOC) 59 | .def("__iter__", &TripleIteratorBytes::python_iter) 60 | .def_property_readonly("subject", &TripleIteratorBytes::getSubject, 61 | TRIPLE_ITERATOR_GETSUBJECT_DOC) 62 | .def_property_readonly("predicate", &TripleIteratorBytes::getPredicate, 63 | TRIPLE_ITERATOR_GETPREDICATE_DOC) 64 | .def_property_readonly("object", &TripleIteratorBytes::getObject, 65 | TRIPLE_ITERATOR_GETOBJECT_DOC) 66 | .def_property_readonly("limit", &TripleIteratorBytes::getLimit, 67 | TRIPLE_ITERATOR_GETLIMIT_DOC) 68 | .def_property_readonly("offset", &TripleIteratorBytes::getOffset, 69 | TRIPLE_ITERATOR_GETOFFSET_DOC) 70 | .def_property_readonly("nb_reads", &TripleIteratorBytes::getNbResultsRead, 71 | TRIPLE_ITERATOR_NBREADS_DOC) 72 | .def("__repr__", &TripleIteratorBytes::python_repr); 73 | 74 | py::class_(m, "TripleIDIterator", TRIPLE_ID_ITERATOR_CLASS_DOC) 75 | .def("next", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 76 | .def("__next__", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC) 77 | .def("peek", &TripleIDIterator::peek, TRIPLE_ITERATOR_PEEK_DOC) 78 | .def("has_next", &TripleIDIterator::hasNext, 
TRIPLE_ITERATOR_HASNEXT_DOC) 79 | .def("size_hint", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 80 | .def("__len__", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC) 81 | .def("__iter__", &TripleIDIterator::python_iter) 82 | .def_property_readonly("subject", &TripleIDIterator::getSubject, 83 | TRIPLE_ITERATOR_GETSUBJECT_DOC) 84 | .def_property_readonly("predicate", &TripleIDIterator::getPredicate, 85 | TRIPLE_ITERATOR_GETPREDICATE_DOC) 86 | .def_property_readonly("object", &TripleIDIterator::getObject, 87 | TRIPLE_ITERATOR_GETOBJECT_DOC) 88 | .def_property_readonly("limit", &TripleIDIterator::getLimit, 89 | TRIPLE_ITERATOR_GETLIMIT_DOC) 90 | .def_property_readonly("offset", &TripleIDIterator::getOffset, 91 | TRIPLE_ITERATOR_GETOFFSET_DOC) 92 | .def_property_readonly("nb_reads", &TripleIDIterator::getNbResultsRead, 93 | TRIPLE_ITERATOR_NBREADS_DOC) 94 | .def("__repr__", &TripleIDIterator::python_repr); 95 | 96 | py::class_(m, "JoinIterator", JOIN_ITERATOR_CLASS_DOC) 97 | .def("next", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC) 98 | .def("has_next", &JoinIterator::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC) 99 | .def("cardinality", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 100 | .def("reset", &JoinIterator::reset, JOIN_ITERATOR_RESET_DOC) 101 | .def("__len__", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 102 | .def("__next__", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC) 103 | .def("__iter__", &JoinIterator::python_iter) 104 | .def("__repr__", &JoinIterator::python_repr); 105 | 106 | py::class_(m, "JoinIteratorBytes", JOIN_ITERATOR_CLASS_DOC) 107 | .def("next", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC) 108 | .def("has_next", &JoinIteratorBytes::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC) 109 | .def("cardinality", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC) 110 | .def("reset", &JoinIteratorBytes::reset, JOIN_ITERATOR_RESET_DOC) 111 | .def("__len__", &JoinIteratorBytes::estimatedCardinality, 
JOIN_ITERATOR_SIZE_DOC) 112 | .def("__next__", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC) 113 | .def("__iter__", &JoinIteratorBytes::python_iter) 114 | .def("__repr__", &JoinIteratorBytes::python_repr); 115 | 116 | py::class_(m, "HDTDocument", HDT_DOCUMENT_CLASS_DOC) 117 | .def(py::init(&HDTDocument::create), py::arg("file"), 118 | py::arg("map") = true, 119 | py::arg("indexed") = true) 120 | .def_property_readonly("file_path", &HDTDocument::getFilePath, 121 | HDT_DOCUMENT_GETFILEPATH_DOC) 122 | .def_property_readonly("total_triples", &HDTDocument::getNbTriples, 123 | HDT_DOCUMENT_GETNBTRIPLES_DOC) 124 | .def_property_readonly("nb_subjects", &HDTDocument::getNbSubjects, 125 | HDT_DOCUMENT_GETNBSUBJECTS_DOC) 126 | .def_property_readonly("nb_predicates", &HDTDocument::getNbPredicates, 127 | HDT_DOCUMENT_GETNBPREDICATES_DOC) 128 | .def_property_readonly("nb_objects", &HDTDocument::getNbObjects, 129 | HDT_DOCUMENT_GETNBOBJECTS_DOC) 130 | .def_property_readonly("nb_shared", &HDTDocument::getNbShared, 131 | HDT_DOCUMENT_GETNBSHARED_DOC) 132 | .def("search_triples", &HDTDocument::search, 133 | HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"), 134 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0, 135 | py::arg("offset") = 0) 136 | .def("search_join", &HDTDocument::searchJoin, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns")) 137 | .def("search_triples_ids", &HDTDocument::searchIDs, 138 | HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC, py::arg("subject"), 139 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0, 140 | py::arg("offset") = 0) 141 | .def("convert_tripleid", &HDTDocument::convertTripleID, 142 | HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC, 143 | py::arg("subject"), py::arg("predicate"), py::arg("object")) 144 | .def("convert_id", &HDTDocument::convertID, HDT_DOCUMENT_CONVERT_ID_DOC, 145 | py::arg("id"), py::arg("position")) 146 | .def("convert_term", &HDTDocument::convertTerm, HDT_DOCUMENT_CONVERT_TERM_DOC, 147 | py::arg("term"), 
py::arg("position")) 148 | // ========= BYTES REPRESENTATION ========= 149 | .def("search_triples_bytes", &HDTDocument::searchBytes, 150 | HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"), 151 | py::arg("predicate"), py::arg("object"), py::arg("limit") = 0, 152 | py::arg("offset") = 0) 153 | .def("search_join_bytes", &HDTDocument::searchJoinBytes, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns")) 154 | .def("convert_tripleid_bytes", &HDTDocument::convertTripleIDBytes, 155 | HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC, 156 | py::arg("subject"), py::arg("predicate"), py::arg("object")) 157 | .def("convert_id_bytes", &HDTDocument::convertIDBytes, HDT_DOCUMENT_CONVERT_ID_DOC, 158 | py::arg("id"), py::arg("position")) 159 | .def("__len__", &HDTDocument::getNbTriples, HDT_DOCUMENT_GETNBTRIPLES_DOC) 160 | .def("__repr__", &HDTDocument::python_repr); 161 | 162 | } 163 | -------------------------------------------------------------------------------- /src/hdt_document.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * hdt_document.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "hdt_document.hpp" 7 | #include "triple_iterator.hpp" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | namespace py = pybind11; 15 | using namespace hdt; 16 | 17 | /*! 18 | * Skip `offset` items from an iterator, optimized for HDT iterators. 
19 | * @param it - Iterator which should skip items 20 | * @param offset - How many items to skip 21 | * @param cardinality - (Estimated) number of results, may be 0 22 | */ 23 | template <typename T> 24 | inline void applyOffset(T *it, unsigned int offset, unsigned int cardinality) { 25 | if (offset > 0 && offset >= cardinality) { 26 | // hdt cannot skip beyond the estimated nb of results: skip what it allows, then step manually 27 | unsigned int skippable = (cardinality > 0) ? cardinality - 1 : 0; // no unsigned underflow when cardinality is 0 28 | unsigned int remainingSteps = offset - skippable; 29 | if (skippable > 0) { it->skip(skippable); } 30 | while (it->hasNext() && remainingSteps > 0) { 31 | it->next(); 32 | remainingSteps--; 33 | } 34 | } else if (offset > 0) { 35 | it->skip(offset); 36 | } 37 | } 38 | 39 | /*! 40 | * Returns true if a file is readable, false otherwise 41 | * @param name - Path to the file to test 42 | * @return true if the file is readable, false otherwise 43 | */ 44 | inline bool file_exists(const std::string &name) { 45 | std::ifstream f(name.c_str()); 46 | bool result = f.good(); 47 | f.close(); 48 | return result; 49 | } 50 | 51 | /*! 52 | * Constructor (throws std::runtime_error if the file is not readable) 53 | * @param file - Path to HDT file to load 54 | * @param map - True maps the HDT file (faster), False loads everything in memory 55 | * @param indexed - True if the HDT must be loaded with indexes, False otherwise 56 | */ 57 | HDTDocument::HDTDocument(std::string file, bool map, bool indexed) { 58 | hdt_file = file; 59 | if (!file_exists(file)) { 60 | throw std::runtime_error("Cannot open HDT file '" + file + "': Not Found!"); 61 | } 62 | 63 | if(!map && indexed) { 64 | hdt = HDTManager::loadIndexedHDT(file.c_str()); 65 | } else if(!map && !indexed) { 66 | hdt = HDTManager::loadHDT(file.c_str()); 67 | } else if(map && indexed){ 68 | hdt = HDTManager::mapIndexedHDT(file.c_str()); 69 | } else { 70 | hdt = HDTManager::mapHDT(file.c_str()); 71 | } 72 | processor = new QueryProcessor(hdt); 73 | } 74 | 75 | /*! 76 | * Destructor. NOTE(review): `hdt` and `processor` set by the constructor are not released here - confirm the intended lifetime 77 | */ 78 | HDTDocument::~HDTDocument() {} 79 | 80 | /*!
81 | * Get the path to the HDT file currently loaded 82 | * @return The path to the HDT file currently loaded 83 | */ 84 | std::string HDTDocument::getFilePath() { return hdt_file; } 85 | 86 | /*! 87 | * Implementation for Python function "__repr__" 88 | * @return A string representation of the object 89 | */ 90 | std::string HDTDocument::python_repr() { 91 | return ""; 93 | } 94 | 95 | /*! 96 | * Search all matching triples for a triple pattern, with an optional limit and offset. 97 | * Returns a tuple 98 | * @param subject - Triple pattern's subject (an empty string represents a variable) 99 | * @param predicate - Triple pattern's predicate (an empty string represents a variable) 100 | * @param object - Triple pattern's object (an empty string represents a variable) 101 | * @param limit - (Optional) Maximum number of matching triples to read 102 | * @param offset - (Optional) Number of matching triples to skip 103 | * @return A tuple (TripleIterator*, cardinality) 104 | */ 105 | search_results HDTDocument::search(std::string subject, 106 | std::string predicate, 107 | std::string object, 108 | unsigned int limit, 109 | unsigned int offset) { 110 | unsigned int idSubject = 0; 111 | unsigned int idPredicate = 0; 112 | unsigned int idObject = 0; 113 | 114 | if (!subject.empty()) { 115 | idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT); 116 | } 117 | 118 | if (!predicate.empty()) { 119 | idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE); 120 | } 121 | 122 | if (!object.empty()) { 123 | idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT); 124 | } 125 | 126 | TripleIDIterator *it; 127 | size_t cardinality = 0; 128 | 129 | // if a non-variable term was not found in the dictionary (ID == 0), then the search yields nothing 130 | if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) { 131 | it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset); 132 | } else { 133 | // build a TripleIDIterator to fetch results 134 | TripleID
tp(idSubject, idPredicate, idObject); 135 | IteratorTripleID *source = hdt->getTriples()->search(tp); 136 | cardinality = source->estimatedNumResults(); 137 | applyOffset(source, offset, cardinality); 138 | it = new TripleIDIterator(source, subject, predicate, object, limit, offset); 139 | } 140 | // wraps the TripleIDIterator in order to convert OID triples back to RDF triples 141 | TripleIterator *resultIterator = new TripleIterator(it, hdt->getDictionary()); 142 | return std::make_tuple(resultIterator, cardinality); 143 | } 144 | 145 | /*! 146 | * Same as HDTDocument#search, but searches with and for TripleIDs instead. 147 | * Returns a tuple 148 | * @param subject - Triple pattern's subject identifier (0 represents a variable) 149 | * @param predicate - Triple pattern's predicate identifier (0 represents a variable) 150 | * @param object - Triple pattern's object identifier (0 represents a variable) 151 | * @param limit - (Optional) Maximum number of matching triples to read 152 | * @param offset - (Optional) Number of matching triples to skip 153 | * @return A tuple (TripleIDIterator*, cardinality) 154 | */ 155 | search_results_ids HDTDocument::searchIDs(unsigned int subject, 156 | unsigned int predicate, 157 | unsigned int object, 158 | unsigned int limit, 159 | unsigned int offset) { 160 | TripleID tp(subject, predicate, object); 161 | // get RDF terms associated with each ID for metadata ("?s", "?p", "?o" stand for variables) 162 | std::string strSubject = std::string("?s"); 163 | std::string strPredicate = std::string("?p"); 164 | std::string strObject = std::string("?o"); 165 | 166 | if (subject != 0) { 167 | strSubject = hdt->getDictionary()->idToString(subject, hdt::SUBJECT); 168 | } 169 | if (predicate != 0) { 170 | strPredicate = hdt->getDictionary()->idToString(predicate, hdt::PREDICATE); 171 | } 172 | if (object != 0) { 173 | strObject = hdt->getDictionary()->idToString(object, hdt::OBJECT); 174 | } 175 | 176 | IteratorTripleID *it; 177 | size_t cardinality = 0; 178 | 179 | // if a non-variable ID has no term in the dictionary (empty string), then the search yields nothing 180 | if
((strSubject.empty() && subject != 0) || (strPredicate.empty() && predicate != 0) || (strObject.empty() && object != 0)) { 181 | it = new IteratorTripleID(); 182 | } else { 183 | // build iterator 184 | it = hdt->getTriples()->search(tp); 185 | cardinality = it->estimatedNumResults(); 186 | // apply offset 187 | applyOffset(it, offset, cardinality); 188 | } 189 | TripleIDIterator *resultIterator = new TripleIDIterator(it, strSubject, strPredicate, strObject, limit, offset); 190 | return std::make_tuple(resultIterator, cardinality); 191 | } 192 | 193 | /*! 194 | * Get the total number of triples in the HDT document 195 | * @return The total number of triples in the HDT document 196 | */ 197 | unsigned int HDTDocument::getNbTriples() { 198 | return hdt->getTriples()->getNumberOfElements(); 199 | } 200 | 201 | /*! 202 | * Get the number of distinct subjects in the HDT document 203 | * @return The number of distinct subjects in the HDT document 204 | */ 205 | unsigned int HDTDocument::getNbSubjects() { 206 | return hdt->getDictionary()->getNsubjects(); 207 | } 208 | 209 | /*! 210 | * Get the number of distinct predicates in the HDT document 211 | * @return The number of distinct predicates in the HDT document 212 | */ 213 | unsigned int HDTDocument::getNbPredicates() { 214 | return hdt->getDictionary()->getNpredicates(); 215 | } 216 | 217 | /*! 218 | * Get the number of distinct objects in the HDT document 219 | * @return The number of distinct objects in the HDT document 220 | */ 221 | unsigned int HDTDocument::getNbObjects() { 222 | return hdt->getDictionary()->getNobjects(); 223 | } 224 | 225 | /*! 226 | * Get the number of shared subjects-objects in the HDT document 227 | * @return The number of shared subjects-objects in the HDT document 228 | */ 229 | unsigned int HDTDocument::getNbShared() { 230 | return hdt->getDictionary()->getNshared(); 231 | } 232 | 233 | /*! 
234 | * Convert a TripleID to a string RDF triple 235 | * @param subject - Triple's subject 236 | * @param predicate - Triple's predicate 237 | * @param object - Triple's object 238 | * @return The associated RDF triple 239 | */ 240 | triple HDTDocument::convertTripleID(unsigned int subject, unsigned int predicate, 241 | unsigned int object) { 242 | return std::make_tuple( 243 | hdt->getDictionary()->idToString(subject, hdt::SUBJECT), 244 | hdt->getDictionary()->idToString(predicate, hdt::PREDICATE), 245 | hdt->getDictionary()->idToString(object, hdt::OBJECT)); 246 | } 247 | 248 | /** 249 | * Convert an Object Identifier into the equivalent URI/Literal value 250 | * @param id - Object Identifier 251 | * @param pos - Identifier position (subject, predicate or object) 252 | * @return The URI/Literal equivalent to the Object Identifier 253 | */ 254 | string HDTDocument::convertID(unsigned int id, IdentifierPosition pos) { 255 | switch (pos) { 256 | case IdentifierPosition::Subject: 257 | return hdt->getDictionary()->idToString(id, hdt::SUBJECT); 258 | case IdentifierPosition::Predicate: 259 | return hdt->getDictionary()->idToString(id, hdt::PREDICATE); 260 | case IdentifierPosition::Object: 261 | return hdt->getDictionary()->idToString(id, hdt::OBJECT); 262 | default: 263 | throw std::runtime_error("Invalid Object Identifier exception"); 264 | } 265 | } 266 | 267 | /** 268 | * Convert an RDF term into the associated an Object Identifier. 
269 | * @param term - RDF Term in string format 270 | * @param pos - Identifier position (subject, predicate or object) 271 | * @return The Object Identifier associated with the RDF term 272 | */ 273 | unsigned int HDTDocument::convertTerm(std::string term, IdentifierPosition pos) { 274 | switch (pos) { 275 | case IdentifierPosition::Subject: 276 | return hdt->getDictionary()->stringToId(term, hdt::SUBJECT); 277 | case IdentifierPosition::Predicate: 278 | return hdt->getDictionary()->stringToId(term, hdt::PREDICATE); 279 | case IdentifierPosition::Object: 280 | return hdt->getDictionary()->stringToId(term, hdt::OBJECT); 281 | default: 282 | throw std::runtime_error("Invalid Object Identifier exception"); 283 | } 284 | } 285 | 286 | /** 287 | * Evaluate a join between a set of triple patterns using a JoinIterator. 288 | * @param patterns - Set of triple patterns; terms starting with '?' are variables 289 | * @return A JoinIterator* used to evaluate the join. 290 | */ 291 | JoinIterator * HDTDocument::searchJoin(std::vector patterns) { 292 | set vars {}; 293 | vector joinPatterns {}; 294 | std::string subj, pred, obj; 295 | 296 | for (auto it = patterns.begin(); it != patterns.end(); it++) { 297 | // unpack pattern 298 | std::tie(subj, pred, obj) = *it; 299 | // collect variables (terms starting with '?'); note that at(0) throws std::out_of_range on an empty term 300 | if (subj.at(0) == '?') { 301 | vars.insert(subj); 302 | } 303 | if (pred.at(0) == '?') { 304 | vars.insert(pred); 305 | } 306 | if (obj.at(0) == '?') { 307 | vars.insert(obj); 308 | } 309 | // build join pattern 310 | TripleString pattern(subj, pred, obj); 311 | joinPatterns.push_back(pattern); 312 | } 313 | 314 | VarBindingString *iterator = processor->searchJoin(joinPatterns, vars); 315 | return new JoinIterator(iterator); 316 | } 317 | 318 | // ============= BYTES REPRESENTATION ============ 319 | /*! 320 | * Search all matching triples for a triple pattern, with an optional limit and offset.
Triples are returned as bytes triples (b'...', b'...', b'...') 321 | * Returns a tuple 322 | * @param subject - Triple pattern's subject (an empty string represents a variable) 323 | * @param predicate - Triple pattern's predicate (an empty string represents a variable) 324 | * @param object - Triple pattern's object (an empty string represents a variable) 325 | * @param limit - (Optional) Maximum number of matching triples to read 326 | * @param offset - (Optional) Number of matching triples to skip 327 | * @return A tuple (TripleIteratorBytes*, cardinality) 328 | */ 329 | search_results_bytes HDTDocument::searchBytes(std::string subject, 330 | std::string predicate, 331 | std::string object, 332 | unsigned int limit, 333 | unsigned int offset) { 334 | unsigned int idSubject = 0; 335 | unsigned int idPredicate = 0; 336 | unsigned int idObject = 0; 337 | 338 | if (!subject.empty()) { 339 | idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT); 340 | } 341 | 342 | if (!predicate.empty()) { 343 | idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE); 344 | } 345 | 346 | if (!object.empty()) { 347 | idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT); 348 | } 349 | 350 | TripleIDIterator *it; 351 | size_t cardinality = 0; 352 | 353 | // if a non-variable term was not found in the dictionary (ID == 0), then the search yields nothing 354 | if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) { 355 | it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset); 356 | } else { 357 | // build a TripleIDIterator to fetch results 358 | TripleID tp(idSubject, idPredicate, idObject); 359 | IteratorTripleID *source = hdt->getTriples()->search(tp); 360 | cardinality = source->estimatedNumResults(); 361 | applyOffset(source, offset, cardinality); 362 | it = new TripleIDIterator(source, subject, predicate, object, limit, offset); 363 | } 364 | // wraps the TripleIDIterator in order to convert OID triples back to RDF triples (as bytes) 365 | TripleIteratorBytes *resultIterator = new
TripleIteratorBytes(it, hdt->getDictionary()); 366 | return std::make_tuple(resultIterator, cardinality); 367 | } 368 | 369 | /** 370 | * Evaluate a join between a set of triple patterns using a JoinIteratorBytes. 371 | * @param patterns - Set of triple patterns; terms starting with '?' are variables 372 | * @return A JoinIteratorBytes* used to evaluate the join. 373 | */ 374 | JoinIteratorBytes * HDTDocument::searchJoinBytes(std::vector patterns) { 375 | set vars {}; 376 | vector joinPatterns {}; 377 | std::string subj, pred, obj; 378 | 379 | for (auto it = patterns.begin(); it != patterns.end(); it++) { 380 | // unpack pattern 381 | std::tie(subj, pred, obj) = *it; 382 | // collect variables (terms starting with '?'); note that at(0) throws std::out_of_range on an empty term 383 | if (subj.at(0) == '?') { 384 | vars.insert(subj); 385 | } 386 | if (pred.at(0) == '?') { 387 | vars.insert(pred); 388 | } 389 | if (obj.at(0) == '?') { 390 | vars.insert(obj); 391 | } 392 | // build join pattern 393 | TripleString pattern(subj, pred, obj); 394 | joinPatterns.push_back(pattern); 395 | } 396 | 397 | VarBindingString *iterator = processor->searchJoin(joinPatterns, vars); 398 | return new JoinIteratorBytes(iterator); 399 | } 400 | 401 | /** 402 | * Convert an Object Identifier into the equivalent URI/Literal value, as bytes 403 | * @param id - Object Identifier 404 | * @param pos - Identifier position (subject, predicate or object) 405 | * @return The URI/Literal equivalent to the Object Identifier, wrapped in py::bytes 406 | */ 407 | py::bytes HDTDocument::convertIDBytes(unsigned int id, IdentifierPosition pos) { 408 | return py::bytes(HDTDocument::convertID(id, pos)); 409 | } 410 | 411 | /*!
412 | * Convert a TripleID to a bytes RDF triple 413 | * @param subject - Triple's subject identifier 414 | * @param predicate - Triple's predicate identifier 415 | * @param object - Triple's object identifier 416 | * @return The associated RDF triple, each term wrapped in py::bytes 417 | */ 418 | triple_bytes HDTDocument::convertTripleIDBytes(unsigned int subject, unsigned int predicate, 419 | unsigned int object) { 420 | return std::make_tuple( 421 | py::bytes(hdt->getDictionary()->idToString(subject, hdt::SUBJECT)), 422 | py::bytes(hdt->getDictionary()->idToString(predicate, hdt::PREDICATE)), 423 | py::bytes(hdt->getDictionary()->idToString(object, hdt::OBJECT))); 424 | } 425 | -------------------------------------------------------------------------------- /src/join_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "join_iterator.hpp" 7 | #include 8 | #include 9 | 10 | /*! 11 | * Constructor 12 | * @param _it - HDT join iterator to wrap; it is deleted by the destructor 13 | */ 14 | JoinIterator::JoinIterator(hdt::VarBindingString *_it) : iterator(_it) {} 15 | 16 | /*! 17 | * Destructor: deletes the wrapped HDT iterator 18 | */ 19 | JoinIterator::~JoinIterator() { 20 | delete iterator; 21 | } 22 | 23 | /*! 24 | * Implementation for Python function "__repr__" 25 | * @return The constant string "JoinIterator" 26 | */ 27 | std::string JoinIterator::python_repr() { 28 | return "JoinIterator"; 29 | } 30 | 31 | 32 | /*! 33 | * Implementation for Python function "__iter__" 34 | * @return This iterator 35 | */ 36 | JoinIterator *JoinIterator::python_iter() { return this; } 37 | 38 | /** 39 | * Get the estimated join cardinality 40 | * @return The estimated number of results reported by the wrapped iterator 41 | */ 42 | size_t JoinIterator::estimatedCardinality() { 43 | return iterator->estimatedNumResults(); 44 | } 45 | 46 | /** 47 | * Reset the iterator into its initial state and restart join processing. 48 | */ 49 | void JoinIterator::reset() { 50 | iterator->goToStart(); 51 | } 52 | 53 | /*!
54 | * Return true if the iterator still has items available, False otherwise. 55 | * @return [description] 56 | */ 57 | bool JoinIterator::hasNext() { 58 | return hasNextSolution; 59 | } 60 | 61 | /** 62 | * Return the next set of solution bindings, or raise py::StopIteration if the iterator 63 | * has ended. Used to implement the Python Iterator protocol. 64 | * @return [description] 65 | */ 66 | solution_bindings JoinIterator::next() { 67 | hasNextSolution = iterator->findNext(); 68 | // stop iteration if the iterator has ended 69 | if (!hasNextSolution) { 70 | throw pybind11::stop_iteration(); 71 | } 72 | solution_bindings solutions = new std::set(); 73 | // build solution bindings 74 | for(unsigned int i = 0; i < iterator->getNumVars(); i++) { 75 | solutions->insert(std::make_tuple(iterator->getVarName(i), iterator->getVar(i))); 76 | } 77 | return solutions; 78 | } 79 | -------------------------------------------------------------------------------- /src/join_iterator_bytes.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * join_iterator_bytes.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "join_iterator_bytes.hpp" 7 | #include 8 | #include 9 | 10 | /*! 11 | * Constructor 12 | * @param _it [description] 13 | */ 14 | JoinIteratorBytes::JoinIteratorBytes(hdt::VarBindingString *_it) : iterator(_it) {} 15 | 16 | /*! 17 | * Destructor 18 | */ 19 | JoinIteratorBytes::~JoinIteratorBytes() { 20 | delete iterator; 21 | } 22 | 23 | /*! 24 | * Implementation for Python function "__repr__" 25 | * @return [description] 26 | */ 27 | std::string JoinIteratorBytes::python_repr() { 28 | return "JoinIteratorBytes"; 29 | } 30 | 31 | 32 | /*! 
33 | * Implementation for Python function "__iter__" 34 | * @return [description] 35 | */ 36 | JoinIteratorBytes *JoinIteratorBytes::python_iter() { return this; } 37 | 38 | /** 39 | * Get the estimated join cardinality 40 | * @return [description] 41 | */ 42 | size_t JoinIteratorBytes::estimatedCardinality() { 43 | return iterator->estimatedNumResults(); 44 | } 45 | 46 | /** 47 | * Reset the iterator into its initial state and restart join processing. 48 | */ 49 | void JoinIteratorBytes::reset() { 50 | iterator->goToStart(); 51 | } 52 | 53 | /*! 54 | * Return true if the iterator still has items available, False otherwise. 55 | * @return [description] 56 | */ 57 | bool JoinIteratorBytes::hasNext() { 58 | return hasNextSolution; 59 | } 60 | 61 | /** 62 | * Return the next set of solution bindings, or raise py::StopIteration if the iterator 63 | * has ended. Used to implement the Python Iterator protocol. 64 | * @return [description] 65 | */ 66 | py::set JoinIteratorBytes::next() { 67 | hasNextSolution = iterator->findNext(); 68 | // stop iteration if the iterator has ended 69 | if (!hasNextSolution) { 70 | throw pybind11::stop_iteration(); 71 | } 72 | solution_bindings_bytes solutions_bytes; 73 | // build solution bindings 74 | for(unsigned int i = 0; i < iterator->getNumVars(); i++) { 75 | std::string varname = iterator->getVarName(i); 76 | std::string value = iterator->getVar(i); 77 | solutions_bytes.add(std::make_tuple(py::bytes(varname), py::bytes(value))); 78 | } 79 | return solutions_bytes; 80 | } 81 | -------------------------------------------------------------------------------- /src/triple_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * triple_iterator.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "triple_iterator.hpp" 7 | #include 8 | #include 9 | #include 10 | 11 | /*! 
12 | * Constructor 13 | * @param iterator [description] 14 | */ 15 | TripleIterator::TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict) 16 | : iterator(_it), dictionary(_dict) {}; 17 | 18 | /*! 19 | * Destructor 20 | */ 21 | TripleIterator::~TripleIterator() { delete iterator; }; 22 | 23 | /*! 24 | * Implementation for Python function "__repr__" 25 | * @return [description] 26 | */ 27 | std::string TripleIterator::python_repr() { 28 | if (getLimit() != 0 && getOffset() > 0) { 29 | return ""; 32 | } else if (getLimit() != 0) { 33 | return ""; 35 | } else if (getOffset() > 0) { 36 | return ""; 38 | } 39 | return ""; 40 | } 41 | 42 | /*! 43 | * Get the subject of the triple pattern currently evaluated. 44 | * An empty string represents a variable 45 | * @return [description] 46 | */ 47 | std::string TripleIterator::getSubject() { return iterator->getSubject(); } 48 | 49 | /*! 50 | * Get the predicate of the triple pattern currently evaluated. 51 | * An empty string represents a variable 52 | * @return [description] 53 | */ 54 | std::string TripleIterator::getPredicate() { return iterator->getPredicate(); } 55 | 56 | /*! 57 | * Get the object of the triple pattern currently evaluated. 58 | * An empty string represents a variable 59 | * @return [description] 60 | */ 61 | std::string TripleIterator::getObject() { return iterator->getObject(); } 62 | 63 | /*! 64 | * Get the limit of the current iterator 65 | * @return [description] 66 | */ 67 | unsigned int TripleIterator::getLimit() { return iterator->getLimit(); } 68 | 69 | /*! 70 | * Get the offset of the current iterator 71 | * @return [description] 72 | */ 73 | unsigned int TripleIterator::getOffset() { return iterator->getOffset(); } 74 | 75 | /*! 76 | * Get the number of results read by the iterator 77 | * @return [description] 78 | */ 79 | unsigned int TripleIterator::getNbResultsRead() { return iterator->getNbResultsRead(); } 80 | 81 | /*! 
82 | * Implementation for Python function "__iter__" 83 | * @return [description] 84 | */ 85 | TripleIterator *TripleIterator::python_iter() { return this; } 86 | 87 | /*! 88 | * Get a hint over the cardinality of the triple pattern evaluated. 89 | * Offset & limit are not taken into account. 90 | * @return [description] 91 | */ 92 | size_hint TripleIterator::sizeHint() { 93 | return iterator->sizeHint(); 94 | } 95 | 96 | /*! 97 | * Return true if the iterator still has items available, False otherwise. 98 | * @return [description] 99 | */ 100 | bool TripleIterator::hasNext() { 101 | return iterator->hasNext(); 102 | } 103 | 104 | /** 105 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 106 | * has ended. Used to implement the Python Iterator protocol. 107 | * @return [description] 108 | */ 109 | triple TripleIterator::next() { 110 | triple_id t = iterator->next(); 111 | return std::make_tuple( 112 | dictionary->idToString(std::get<0>(t), hdt::SUBJECT), 113 | dictionary->idToString(std::get<1>(t), hdt::PREDICATE), 114 | dictionary->idToString(std::get<2>(t), hdt::OBJECT)); 115 | } 116 | 117 | /** 118 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 119 | * has ended, but without advancing the iterator. 
120 | * @return [description] 121 | */ 122 | triple TripleIterator::peek() { 123 | triple_id t = iterator->peek(); 124 | return std::make_tuple( 125 | dictionary->idToString(std::get<0>(t), hdt::SUBJECT), 126 | dictionary->idToString(std::get<1>(t), hdt::PREDICATE), 127 | dictionary->idToString(std::get<2>(t), hdt::OBJECT)); 128 | } 129 | -------------------------------------------------------------------------------- /src/triple_iterator_bytes.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * triple_iterator_bytes.cpp 3 | * Author: Arnaud GRALL - MIT License 2017-2019 4 | */ 5 | 6 | #include "triple_iterator_bytes.hpp" 7 | #include 8 | #include 9 | #include 10 | namespace py = pybind11; 11 | 12 | /*! 13 | * Constructor 14 | * @param iterator [description] 15 | */ 16 | TripleIteratorBytes::TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict) 17 | : iterator(_it), dictionary(_dict) {}; 18 | 19 | /*! 20 | * Destructor 21 | */ 22 | TripleIteratorBytes::~TripleIteratorBytes() { delete iterator; }; 23 | 24 | /*! 25 | * Implementation for Python function "__repr__" 26 | * @return [description] 27 | */ 28 | std::string TripleIteratorBytes::python_repr() { 29 | if (getLimit() != 0 && getOffset() > 0) { 30 | return ""; 33 | } else if (getLimit() != 0) { 34 | return ""; 36 | } else if (getOffset() > 0) { 37 | return ""; 39 | } 40 | return ""; 41 | } 42 | 43 | /*! 44 | * Get the subject of the triple pattern currently evaluated. 45 | * An empty string represents a variable 46 | * @return [description] 47 | */ 48 | std::string TripleIteratorBytes::getSubject() { return iterator->getSubject(); } 49 | 50 | /*! 51 | * Get the predicate of the triple pattern currently evaluated. 52 | * An empty string represents a variable 53 | * @return [description] 54 | */ 55 | std::string TripleIteratorBytes::getPredicate() { return iterator->getPredicate(); } 56 | 57 | /*! 
58 | * Get the object of the triple pattern currently evaluated. 59 | * An empty string represents a variable 60 | * @return [description] 61 | */ 62 | std::string TripleIteratorBytes::getObject() { return iterator->getObject(); } 63 | 64 | /*! 65 | * Get the limit of the current iterator 66 | * @return [description] 67 | */ 68 | unsigned int TripleIteratorBytes::getLimit() { return iterator->getLimit(); } 69 | 70 | /*! 71 | * Get the offset of the current iterator 72 | * @return [description] 73 | */ 74 | unsigned int TripleIteratorBytes::getOffset() { return iterator->getOffset(); } 75 | 76 | /*! 77 | * Get the number of results read by the iterator 78 | * @return [description] 79 | */ 80 | unsigned int TripleIteratorBytes::getNbResultsRead() { return iterator->getNbResultsRead(); } 81 | 82 | /*! 83 | * Implementation for Python function "__iter__" 84 | * @return [description] 85 | */ 86 | TripleIteratorBytes *TripleIteratorBytes::python_iter() { return this; } 87 | 88 | /*! 89 | * Get a hint over the cardinality of the triple pattern evaluated. 90 | * Offset & limit are not taken into account. 91 | * @return [description] 92 | */ 93 | size_hint TripleIteratorBytes::sizeHint() { 94 | return iterator->sizeHint(); 95 | } 96 | 97 | /*! 98 | * Return true if the iterator still has items available, False otherwise. 99 | * @return [description] 100 | */ 101 | bool TripleIteratorBytes::hasNext() { 102 | return iterator->hasNext(); 103 | } 104 | 105 | /** 106 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 107 | * has ended. Used to implement the Python Iterator protocol. 
108 | * @return [description] 109 | */ 110 | triple_bytes TripleIteratorBytes::next() { 111 | triple_id t = iterator->next(); 112 | return std::make_tuple( 113 | py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)), 114 | py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)), 115 | py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT))); 116 | } 117 | 118 | /** 119 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 120 | * has ended, but without advancing the iterator. 121 | * @return [description] 122 | */ 123 | triple_bytes TripleIteratorBytes::peek() { 124 | triple_id t = iterator->peek(); 125 | return std::make_tuple( 126 | py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)), 127 | py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)), 128 | py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT))); 129 | } 130 | -------------------------------------------------------------------------------- /src/tripleid_iterator.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * tripleid_iterator.cpp 3 | * Author: Thomas MINIER - MIT License 2017-2019 4 | */ 5 | 6 | #include "tripleid_iterator.hpp" 7 | #include 8 | #include 9 | #include 10 | 11 | /*! 12 | * Constructor 13 | * @param iterator [description] 14 | */ 15 | TripleIDIterator::TripleIDIterator(hdt::IteratorTripleID *_it, 16 | std::string _subj, std::string _pred, 17 | std::string _obj, unsigned int _limit, 18 | unsigned int _offset) 19 | : subject((_subj.compare("") == 0) ? "?s" : _subj), 20 | predicate((_pred.compare("") == 0) ? "?p" : _pred), 21 | object((_obj.compare("") == 0) ? "?o" : _obj), limit(_limit), 22 | offset(_offset), iterator(_it){}; 23 | 24 | /*! 25 | * Destructor 26 | */ 27 | TripleIDIterator::~TripleIDIterator() { delete iterator; }; 28 | 29 | /*! 
30 | * Implementation for Python function "__repr__" 31 | * @return [description] 32 | */ 33 | std::string TripleIDIterator::python_repr() { 34 | if (limit != 0 && offset > 0) { 35 | return ""; 38 | } else if (limit != 0) { 39 | return ""; 41 | } else if (offset > 0) { 42 | return ""; 44 | } 45 | return ""; 46 | } 47 | 48 | /*! 49 | * Get the subject of the triple pattern currently evaluated. 50 | * An empty string represents a variable 51 | * @return [description] 52 | */ 53 | std::string TripleIDIterator::getSubject() { return subject; } 54 | 55 | /*! 56 | * Get the predicate of the triple pattern currently evaluated. 57 | * An empty string represents a variable 58 | * @return [description] 59 | */ 60 | std::string TripleIDIterator::getPredicate() { return predicate; } 61 | 62 | /*! 63 | * Get the object of the triple pattern currently evaluated. 64 | * An empty string represents a variable 65 | * @return [description] 66 | */ 67 | std::string TripleIDIterator::getObject() { return object; } 68 | 69 | /*! 70 | * Get the limit of the current iterator 71 | * @return [description] 72 | */ 73 | unsigned int TripleIDIterator::getLimit() { return limit; } 74 | 75 | /*! 76 | * Get the offset of the current iterator 77 | * @return [description] 78 | */ 79 | unsigned int TripleIDIterator::getOffset() { return offset; } 80 | 81 | /*! 82 | * Get the number of results read by the iterator 83 | * @return [description] 84 | */ 85 | unsigned int TripleIDIterator::getNbResultsRead() { return resultsRead; } 86 | 87 | /*! 88 | * Implementation for Python function "__iter__" 89 | * @return [description] 90 | */ 91 | TripleIDIterator *TripleIDIterator::python_iter() { return this; } 92 | 93 | /*! 94 | * Get a hint over the cardinality of the triple pattern evaluated. 95 | * Offset & limit are not taken into account. 
96 | * @return [description] 97 | */ 98 | size_hint TripleIDIterator::sizeHint() { 99 | return std::make_tuple(iterator->estimatedNumResults(), iterator->numResultEstimation() == hdt::EXACT); 100 | } 101 | 102 | /*! 103 | * Return true if the iterator still has items available, False otherwise. 104 | * @return [description] 105 | */ 106 | bool TripleIDIterator::hasNext() { 107 | bool noLimit = limit == 0; 108 | return iterator->hasNext() && (noLimit || limit > resultsRead); 109 | } 110 | 111 | /** 112 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 113 | * has ended. Used to implement the Python Iterator protocol. 114 | * @return [description] 115 | */ 116 | triple_id TripleIDIterator::next() { 117 | // return any previously peeked value 118 | if (hasBufferedTriple) { 119 | hasBufferedTriple = false; 120 | resultsRead++; 121 | return _bufferedTriple; 122 | } 123 | bool noLimit = limit == 0; 124 | if (iterator->hasNext() && (noLimit || limit > resultsRead)) { 125 | resultsRead++; 126 | hdt::TripleID *ts = iterator->next(); 127 | return std::make_tuple(ts->getSubject(), ts->getPredicate(), 128 | ts->getObject()); 129 | } 130 | throw pybind11::stop_iteration(); 131 | } 132 | 133 | /** 134 | * Get the next item in the iterator, or raise py::StopIteration if the iterator 135 | * has ended, but without advancing the iterator. 
136 | * @return [description] 137 | */ 138 | triple_id TripleIDIterator::peek() { 139 | if (hasBufferedTriple) { 140 | return _bufferedTriple; 141 | } 142 | _bufferedTriple = next(); 143 | hasBufferedTriple = true; 144 | resultsRead--; 145 | return _bufferedTriple; 146 | } 147 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Callidon/pyHDT/56370143e707c1b69bdb054bd811660e6611cae1/tests/__init__.py -------------------------------------------------------------------------------- /tests/hdt_document_test.py: -------------------------------------------------------------------------------- 1 | # hdt_document_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | import pytest 4 | from hdt import HDTDocument, IdentifierPosition 5 | 6 | path = "tests/test.hdt" 7 | document = HDTDocument(path, True, False) 8 | nbTotalTriples = 132 9 | 10 | 11 | def test_missing_file(): 12 | with pytest.raises(RuntimeError): 13 | HDTDocument("/home/dtrump/wall.hdt") 14 | 15 | 16 | def test_file_path(): 17 | assert document.file_path == path 18 | 19 | 20 | def test_total_triples(): 21 | assert document.total_triples == nbTotalTriples 22 | assert len(document) == nbTotalTriples 23 | 24 | 25 | def test_nb_subjects(): 26 | assert document.nb_subjects == 4 27 | 28 | 29 | def tests_nb_predicates(): 30 | assert document.nb_predicates == 3 31 | 32 | 33 | def tests_nb_objects(): 34 | assert document.nb_objects == 112 35 | 36 | 37 | def tests_nb_shared(): 38 | assert document.nb_shared == 0 39 | 40 | 41 | def test_ids_to_string(): 42 | (triples, triplesCard) = document.search_triples("", "", "") 43 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 44 | assert triplesCard == idsCard 45 | assert triplesCard == nbTotalTriples 46 | for subj, pred, obj in triples: 47 | sid, pid, oid = next(ids) 48 | s, p, o = 
document.convert_tripleid(sid, pid, oid) 49 | assert subj == s 50 | assert pred == p 51 | assert obj == o 52 | 53 | def test_ids_to_string_bytes(): 54 | (triples, triplesCard) = document.search_triples_bytes("", "", "") 55 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 56 | assert triplesCard == idsCard 57 | assert triplesCard == nbTotalTriples 58 | for subj, pred, obj in triples: 59 | print(subj, pred, obj) 60 | sid, pid, oid = next(ids) 61 | s, p, o = document.convert_tripleid_bytes(sid, pid, oid) 62 | assert subj.decode('utf-8') == s.decode('utf-8') 63 | assert pred.decode('utf-8') == p.decode('utf-8') 64 | assert obj.decode('utf-8') == o.decode('utf-8') 65 | 66 | 67 | def test_convert_id(): 68 | (triples, triplesCard) = document.search_triples("", "", "") 69 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 70 | assert triplesCard == idsCard 71 | assert triplesCard == nbTotalTriples 72 | for subj, pred, obj in triples: 73 | sid, pid, oid = next(ids) 74 | s, p, o = ( 75 | document.convert_id(sid, IdentifierPosition.Subject), 76 | document.convert_id(pid, IdentifierPosition.Predicate), 77 | document.convert_id(oid, IdentifierPosition.Object) 78 | ) 79 | assert subj == s 80 | assert pred == p 81 | assert obj == o 82 | 83 | def test_convert_id_bytes(): 84 | (triples, triplesCard) = document.search_triples_bytes("", "", "") 85 | (ids, idsCard) = document.search_triples_ids(0, 0, 0) 86 | assert triplesCard == idsCard 87 | assert triplesCard == nbTotalTriples 88 | for subj, pred, obj in triples: 89 | sid, pid, oid = next(ids) 90 | s, p, o = ( 91 | document.convert_id_bytes(sid, IdentifierPosition.Subject), 92 | document.convert_id_bytes(pid, IdentifierPosition.Predicate), 93 | document.convert_id_bytes(oid, IdentifierPosition.Object) 94 | ) 95 | assert subj == s 96 | assert pred == p 97 | assert obj == o 98 | -------------------------------------------------------------------------------- /tests/hdt_iterators_test.py: 
-------------------------------------------------------------------------------- 1 | # hdt_iterators_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | import pytest 4 | from hdt import HDTDocument 5 | 6 | path = "tests/test.hdt" 7 | document = HDTDocument(path) 8 | nbTotalTriples = 132 9 | 10 | 11 | def test_read_document_base(): 12 | (triples, cardinality) = document.search_triples("", "", "") 13 | assert triples.subject == "?s" 14 | assert triples.predicate == "?p" 15 | assert triples.object == "?o" 16 | assert cardinality == nbTotalTriples 17 | for subj, pred, obj in triples: 18 | assert subj is not None 19 | assert pred is not None 20 | assert obj is not None 21 | assert triples.nb_reads == cardinality 22 | 23 | def test_read_document_base_bytes(): 24 | (triples, cardinality) = document.search_triples_bytes("", "", "") 25 | assert triples.subject == "?s" 26 | assert triples.predicate == "?p" 27 | assert triples.object == "?o" 28 | assert cardinality == nbTotalTriples 29 | for subj, pred, obj in triples: 30 | assert isinstance(subj, bytes) 31 | assert isinstance(pred, bytes) 32 | assert isinstance(obj, bytes) 33 | try: 34 | s, p, o = subj.decode('utf-8'), pred.decode('utf-8'), obj.decode('utf-8') 35 | except Exception as err: 36 | # with the test.hdt file we shouldn't have any problem 37 | raise err 38 | assert subj is not None 39 | assert pred is not None 40 | assert obj is not None 41 | assert triples.nb_reads == cardinality 42 | 43 | 44 | empty_triples = [ 45 | ("http://example.org#toto", "", ""), 46 | ("", "http://example.org#toto", ""), 47 | ("", "http://example.org#toto", "") 48 | ] 49 | 50 | empty_triples_ids = [ 51 | (155, 0, 0), 52 | (0, 155, 0), 53 | (0, 0, 155) 54 | ] 55 | 56 | 57 | @pytest.mark.parametrize("triple", empty_triples) 58 | def test_search_triples_empty(triple): 59 | s, p, o = triple 60 | (iterator, cardinality) = document.search_triples(s, p, o) 61 | assert cardinality == 0 62 | assert not iterator.has_next() 63 | 64 | 65 | 
@pytest.mark.parametrize("triple", empty_triples_ids) 66 | def test_search_ids_empty(triple): 67 | s, p, o = triple 68 | (iterator, cardinality) = document.search_triples_ids(s, p, o) 69 | assert cardinality == 0 70 | assert not iterator.has_next() 71 | 72 | 73 | def test_read_document_limit(): 74 | nbItems = 0 75 | (triples, cardinality) = document.search_triples("", "", "", limit=10) 76 | assert triples.limit == 10 77 | assert cardinality == nbTotalTriples 78 | for subj, pred, obj in triples: 79 | nbItems += 1 80 | assert subj is not None 81 | assert pred is not None 82 | assert obj is not None 83 | assert nbItems == 10 84 | assert triples.nb_reads == 10 85 | 86 | def test_read_document_bytes_peek(): 87 | nbItems = 0 88 | (triples, cardinality) = document.search_triples_bytes("", "", "", limit=10) 89 | assert triples.limit == 10 90 | assert cardinality == nbTotalTriples 91 | peek = triples.peek() 92 | for subj, pred, obj in triples: 93 | nbItems += 1 94 | assert isinstance(subj, bytes) 95 | assert isinstance(pred, bytes) 96 | assert isinstance(obj, bytes) 97 | assert subj == peek[0] 98 | assert pred == peek[1] 99 | assert obj == peek[2] 100 | assert subj is not None 101 | assert pred is not None 102 | assert obj is not None 103 | try: 104 | peek = triples.peek() 105 | except: 106 | pass 107 | assert nbItems == 10 108 | assert triples.nb_reads == 10 109 | 110 | 111 | def test_read_document_offset(): 112 | nbItems = 0 113 | (triples, cardinality) = document.search_triples("", "", "", offset=10) 114 | assert triples.offset == 10 115 | assert cardinality == nbTotalTriples 116 | for subj, pred, obj in triples: 117 | nbItems += 1 118 | assert subj is not None 119 | assert pred is not None 120 | assert obj is not None 121 | assert nbItems == cardinality - 10 122 | assert triples.nb_reads == cardinality - 10 123 | 124 | 125 | def test_read_document_ids(): 126 | (triples, cardinality) = document.search_triples_ids(0, 0, 0) 127 | assert triples.subject == "?s" 128 | assert 
triples.predicate == "?p" 129 | assert triples.object, "?o" 130 | assert cardinality, nbTotalTriples 131 | for subj, pred, obj in triples: 132 | assert subj is not None 133 | assert pred is not None 134 | assert obj is not None 135 | assert triples.nb_reads == cardinality 136 | 137 | 138 | def test_string_iterator_peek(): 139 | expected = ('http://example.org/s1', 'http://example.org/p1', 'http://example.org/o001') 140 | (triples, cardinality) = document.search_triples("", "", "") 141 | v = triples.peek() 142 | assert v == expected 143 | assert triples.nb_reads == 0 144 | v = next(triples) 145 | assert v == expected 146 | assert triples.nb_reads == 1 147 | 148 | 149 | def test_ids_iterator_peek(): 150 | expected = (1, 1, 13) 151 | (triples, cardinality) = document.search_triples_ids(0, 0, 0) 152 | v = triples.peek() 153 | assert v == expected 154 | assert triples.nb_reads == 0 155 | v = next(triples) 156 | assert v == expected 157 | assert triples.nb_reads == 1 158 | 159 | 160 | def test_string_iterator_big_offset(): 161 | nbItems = 0 162 | (triples, cardinality) = document.search_triples("", "", "", offset=nbTotalTriples + 1) 163 | for s, p, o in triples: 164 | nbItems += 1 165 | assert nbItems == 0 166 | 167 | 168 | def test_ids_iterator_big_offset(): 169 | nbItems = 0 170 | (triples, cardinality) = document.search_triples_ids(0, 0, 0, offset=nbTotalTriples + 1) 171 | for s, p, o in triples: 172 | nbItems += 1 173 | assert nbItems == 0 174 | -------------------------------------------------------------------------------- /tests/join_iterator_test.py: -------------------------------------------------------------------------------- 1 | # join_iterator_test.py 2 | # Author: Thomas MINIER - MIT License 2017-2019 3 | from hdt import HDTDocument 4 | 5 | path = "tests/test.hdt" 6 | document = HDTDocument(path) 7 | 8 | 9 | def test_basic_join(): 10 | join_iter = document.search_join([ 11 | ("?s", "http://example.org/p1", "http://example.org/o001"), 12 | ("?s", 
"http://example.org/p1", "http://example.org/o001") 13 | ]) 14 | cpt = 0 15 | for b in join_iter: 16 | cpt += 1 17 | assert len(b) == 1 18 | assert ('?s', 'http://example.org/s1') in b or ('?s', 'http://example.org/s2') in b 19 | assert cpt == 2 20 | 21 | def test_basic_join_bytes(): 22 | join_iter = document.search_join_bytes([ 23 | ("?s", "http://example.org/p1", "http://example.org/o001"), 24 | ("?s", "http://example.org/p1", "http://example.org/o001") 25 | ]) 26 | cpt = 0 27 | for b in join_iter: 28 | cpt += 1 29 | assert len(b) == 1 30 | assert (b'?s', b'http://example.org/s1') in b or (b'?s', b'http://example.org/s2') in b 31 | assert cpt == 2 32 | -------------------------------------------------------------------------------- /tests/test.hdt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Callidon/pyHDT/56370143e707c1b69bdb054bd811660e6611cae1/tests/test.hdt --------------------------------------------------------------------------------