├── .github
    └── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── README.rst
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── _config.yml
    │   ├── api.rst
    │   ├── conf.py
    │   ├── hdtdocument.rst
    │   ├── index.rst
    │   └── installation.rst
├── include
    ├── docstrings.hpp
    ├── hdt_document.hpp
    ├── join_iterator.hpp
    ├── join_iterator_bytes.hpp
    ├── pyhdt_types.hpp
    ├── triple_iterator.hpp
    ├── triple_iterator_bytes.hpp
    └── tripleid_iterator.hpp
├── install.sh
├── requirements.txt
├── setup.cfg
├── setup.py
├── src
    ├── hdt.cpp
    ├── hdt_document.cpp
    ├── join_iterator.cpp
    ├── join_iterator_bytes.cpp
    ├── triple_iterator.cpp
    ├── triple_iterator_bytes.cpp
    └── tripleid_iterator.cpp
└── tests
    ├── __init__.py
    ├── hdt_document_test.py
    ├── hdt_iterators_test.py
    ├── join_iterator_test.py
    └── test.hdt


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # C/C++
  2 | # Prerequisites
  3 | *.d
  4 | 
  5 | # Compiled Object files
  6 | *.slo
  7 | *.lo
  8 | *.o
  9 | *.obj
 10 | .pytest_cache/
 11 | 
 12 | # Precompiled Headers
 13 | *.gch
 14 | *.pch
 15 | 
 16 | # Compiled Dynamic libraries
 17 | *.so
 18 | *.dylib
 19 | *.dll
 20 | 
 21 | # Fortran module files
 22 | *.mod
 23 | *.smod
 24 | 
 25 | # Compiled Static libraries
 26 | *.lai
 27 | *.la
 28 | *.a
 29 | *.lib
 30 | 
 31 | # Executables
 32 | *.exe
 33 | *.out
 34 | *.app
 35 | 
 36 | # Python
 37 | # Byte-compiled / optimized / DLL files
 38 | __pycache__/
 39 | *.py[cod]
 40 | *$py.class
 41 | 
 42 | # C extensions
 43 | *.so
 44 | 
 45 | # Distribution / packaging
 46 | .Python
 47 | build/
 48 | develop-eggs/
 49 | dist/
 50 | downloads/
 51 | eggs/
 52 | .eggs/
 53 | lib/
 54 | lib64/
 55 | parts/
 56 | sdist/
 57 | var/
 58 | wheels/
 59 | *.egg-info/
 60 | .installed.cfg
 61 | *.egg
 62 | MANIFEST
 63 | 
 64 | # PyInstaller
 65 | #  Usually these files are written by a python script from a template
 66 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 67 | *.manifest
 68 | *.spec
 69 | 
 70 | # Installer logs
 71 | pip-log.txt
 72 | pip-delete-this-directory.txt
 73 | 
 74 | # Unit test / coverage reports
 75 | htmlcov/
 76 | .tox/
 77 | .coverage
 78 | .coverage.*
 79 | .cache
 80 | nosetests.xml
 81 | coverage.xml
 82 | *.cover
 83 | .hypothesis/
 84 | 
 85 | # Translations
 86 | *.mo
 87 | *.pot
 88 | 
 89 | # Django stuff:
 90 | *.log
 91 | .static_storage/
 92 | .media/
 93 | local_settings.py
 94 | 
 95 | # Flask stuff:
 96 | instance/
 97 | .webassets-cache
 98 | 
 99 | # Scrapy stuff:
100 | .scrapy
101 | 
102 | # Sphinx documentation
103 | docs/_build/
104 | 
105 | # PyBuilder
106 | target/
107 | 
108 | # Jupyter Notebook
109 | .ipynb_checkpoints
110 | 
111 | # pyenv
112 | .python-version
113 | 
114 | # celery beat schedule file
115 | celerybeat-schedule
116 | 
117 | # SageMath parsed files
118 | *.sage.py
119 | 
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 | 
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 | 
133 | # Rope project settings
134 | .ropeproject
135 | 
136 | # mkdocs documentation
137 | /site
138 | 
139 | # mypy
140 | .mypy_cache/
141 | 
142 | # HDT
143 | *.hdt.index.v*
144 | hdt-cpp-*
145 | hdt-cpp.zip
146 | v1.3.*.zip
147 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | cache: pip
 3 | python:
 4 | - '3.6'
 5 | notifications:
 6 |   email: false
 7 | addons:
 8 |   apt:
 9 |     sources:
10 |     - ubuntu-toolchain-r-test
11 |     packages:
12 |     - g++-4.8
13 | before_install:
14 |   - if [ $TRAVIS_OS_NAME == linux ]; then export CXX=g++-4.8; fi
15 | install:
16 |   - bash install.sh
17 | script:
18 |   - pytest
19 | before_deploy:
20 |   - rm -rf build/
21 |   - pip install pytest sphinx sphinx_rtd_theme
22 |   - cd docs && make html
23 | deploy:
24 | - provider: pypi
25 |   skip_cleanup: true
26 |   user: callidon
27 |   password: $PYPI_PASSWD
28 |   distributions: "sdist bdist_wheel"
29 |   on:
30 |     tags: true
31 | - provider: pages
32 |   skip_cleanup: true
33 |   github_token: $GH_PAGES
34 |   keep_history: true
35 |   local_dir: docs/build/html
36 |   on:
37 |     branch: master
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017-2019 Thomas Minier
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include requirements.txt
 2 | graft include/
 3 | graft hdt-cpp-1.3.3/libhdt/src/bitsequence
 4 | graft hdt-cpp-1.3.3/libhdt/src/dictionary
 5 | graft hdt-cpp-1.3.3/libhdt/src/hdt
 6 | graft hdt-cpp-1.3.3/libhdt/src/header
 7 | graft hdt-cpp-1.3.3/libhdt/src/huffman
 8 | graft hdt-cpp-1.3.3/libhdt/src/libdcs
 9 | graft hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex
10 | graft hdt-cpp-1.3.3/libhdt/src/rdf
11 | graft hdt-cpp-1.3.3/libhdt/src/sequence
12 | graft hdt-cpp-1.3.3/libhdt/src/triples
13 | graft hdt-cpp-1.3.3/libhdt/src/util
14 | graft hdt-cpp-1.3.3/libhdt/third
15 | graft hdt-cpp-1.3.3/libhdt/include/
16 | graft hdt-cpp-1.3.3/libhdt/src/dictionary/
17 | graft hdt-cpp-1.3.3/libhdt/src/sparql
18 | graft hdt-cpp-1.3.3/libcds/include/
19 | graft hdt-cpp-1.3.3/libcds/src/static/bitsequence
20 | graft hdt-cpp-1.3.3/libcds/src/static/coders
21 | graft hdt-cpp-1.3.3/libcds/src/static/mapper
22 | graft hdt-cpp-1.3.3/libcds/src/static/permutation
23 | graft hdt-cpp-1.3.3/libcds/src/static/sequence
24 | graft hdt-cpp-1.3.3/libcds/src/utils
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # pyHDT
  2 | 
  3 | [![Build Status](https://travis-ci.org/Callidon/pyHDT.svg?branch=master)](https://travis-ci.org/Callidon/pyHDT) [![Documentation Status](https://readthedocs.org/projects/pyhdt/badge/?version=latest)](https://callidon.github.io/pyHDT) [![PyPI version](https://badge.fury.io/py/hdt.svg)](https://badge.fury.io/py/hdt)
  4 | 
  5 | **pyHDT is joining the RDFlib family as part of the rdflib 6.0 release! The development continues at [rdflib-hdt](https://github.com/RDFLib/rdflib-hdt), and this repository is going into archive.**
  6 | 
  7 | Read and query HDT document with ease in Python
  8 | 
  9 | [Online Documentation](https://callidon.github.io/pyHDT)
 10 | 
 11 | # Requirements
 12 | 
 13 | * Python *version 3.6.4 or higher*
 14 | * [pip](https://pip.pypa.io/en/stable/)
 15 | * **gcc/clang** with **c++11 support**
 16 | * **Python Development headers**
 17 | > You should have the `Python.h` header available on your system.   
 18 | > For example, for Python 3.6, install the `python3.6-dev` package on Debian/Ubuntu systems.
 19 | 
 20 | Then, install the [pybind11 library](http://pybind11.readthedocs.io/en/stable/)
 21 | ```
 22 | pip install pybind11
 23 | ```
 24 | 
 25 | # Installation
 26 | 
 27 | Installation in a [virtualenv](https://virtualenv.pypa.io/en/stable/) is **strongly advised!**
 28 | 
 29 | ## Pip install (recommended)
 30 | 
 31 | ```
 32 | pip install hdt
 33 | ```
 34 | 
 35 | ## Manual installation
 36 | 
 37 | ```
 38 | git clone https://github.com/Callidon/pyHDT
 39 | cd pyHDT/
 40 | ./install.sh
 41 | ```
 42 | 
 43 | # Getting started
 44 | 
 45 | ```python
 46 | from hdt import HDTDocument
 47 | 
 48 |  # Load an HDT file.
 49 |  # Missing indexes are generated automatically, add False as the second argument to disable them
 50 | document = HDTDocument("test.hdt")
 51 | 
 52 | # Display some metadata about the HDT document itself
 53 | print("nb triples: %i" % document.total_triples)
 54 | print("nb subjects: %i" % document.nb_subjects)
 55 | print("nb predicates: %i" % document.nb_predicates)
 56 | print("nb objects: %i" % document.nb_objects)
 57 | print("nb shared subject-object: %i" % document.nb_shared)
 58 | 
 59 | # Fetch all triples that match { ?s ?p ?o }
 60 | # Use empty strings ("") to indicate variables
 61 | triples, cardinality = document.search_triples("", "", "")
 62 | 
 63 | print("cardinality of { ?s ?p ?o }: %i" % cardinality)
 64 | for triple in triples:
 65 |   print(triple)
 66 | 
 67 | # Search also supports limit and offset
 68 | triples, cardinality = document.search_triples("", "", "", limit=10, offset=100)
 69 | # etc ...
 70 | ```
 71 | 
 72 | # Handling non UTF-8 strings in python
 73 | 
 74 | If the HDT document has been encoded with a non UTF-8 encoding the previous code won't work correctly and will result in a `UnicodeDecodeError`.
 75 | More details on how to convert string to str from c++ to python [here](https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html)
 76 | 
 77 | To handle this we doubled the API of the HDT document by adding:
 78 | - `search_triples_bytes(...)` return an iterator of triples as `(py::bytes, py::bytes, py::bytes)`
 79 | - `search_join_bytes(...)` return an iterator of sets of solutions mapping as `py::set(py::bytes, py::bytes)`
 80 | - `convert_tripleid_bytes(...)` return a triple as: `(py::bytes, py::bytes, py::bytes)`
 81 | - `convert_id_bytes(...)` return a `py::bytes`
 82 | 
 83 | **Parameters and documentation are the same as the standard version**
 84 | 
 85 | ```python
 86 | from hdt import HDTDocument
 87 | 
 88 |  # Load an HDT file.
 89 |  # Missing indexes are generated automatically, add False as the second argument to disable them
 90 | document = HDTDocument("test.hdt")
 91 | it = document.search_triples_bytes("", "", "")
 92 | 
 93 | for s, p, o in it:
 94 |   print(s, p, o) # print b'...', b'...', b'...'
 95 |   # now decode it, or handle any error
 96 |   try:
 97 |     s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8')
 98 |   except UnicodeDecodeError as err:
 99 |     # try another codec
100 |     pass
101 | ```
102 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | |Build Status| |Documentation Status| |PyPI version|
  2 | 
  3 | Read and query HDT document with ease in Python
  4 | 
  5 | `Online Documentation <https://callidon.github.io/pyHDT>`__
  6 | 
  7 | Requirements
  8 | ============
  9 | 
 10 | -  Python *version 3.6.4 or higher*
 11 | -  `pip <https://pip.pypa.io/en/stable/>`__
 12 | -  **gcc/clang** with **c++11 support**
 13 | -  **Python Development headers**: you should have the ``Python.h``
 14 |    header available on your system.
 15 |    For example, for Python 3.6, install the ``python3.6-dev`` package
 16 |    on Debian/Ubuntu systems.
 17 | 
 18 | Then, install the `pybind11
 19 | library <http://pybind11.readthedocs.io/en/stable/>`__
 20 | 
 21 | ::
 22 | 
 23 |     pip install pybind11
 24 | 
 25 | Installation
 26 | ============
 27 | 
 28 | Installation in a `virtualenv <https://virtualenv.pypa.io/en/stable/>`__
 29 | is **strongly advised!**
 30 | 
 31 | Pip install (recommended)
 32 | -------------------------
 33 | 
 34 | ::
 35 | 
 36 |     pip install hdt
 37 | 
 38 | Manual installation
 39 | -------------------
 40 | 
 41 | ::
 42 | 
 43 |     git clone https://github.com/Callidon/pyHDT
 44 |     cd pyHDT/
 45 |     ./install.sh
 46 | 
 47 | Getting started
 48 | ===============
 49 | 
 50 | .. code:: python
 51 | 
 52 |     from hdt import HDTDocument
 53 | 
 54 |     # Load an HDT file.
 55 |     # Missing indexes are generated automatically, add False as the second argument to disable them
 56 |     document = HDTDocument("test.hdt")
 57 | 
 58 |     # Display some metadata about the HDT document itself
 59 |     print("nb triples: %i" % document.total_triples)
 60 |     print("nb subjects: %i" % document.nb_subjects)
 61 |     print("nb predicates: %i" % document.nb_predicates)
 62 |     print("nb objects: %i" % document.nb_objects)
 63 |     print("nb shared subject-object: %i" % document.nb_shared)
 64 | 
 65 |     # Fetch all triples that match { ?s ?p ?o }
 66 |     # Use empty strings ("") to indicate variables
 67 |     triples, cardinality = document.search_triples("", "", "")
 68 | 
 69 |     print("cardinality of { ?s ?p ?o }: %i" % cardinality)
 70 |     for triple in triples:
 71 |       print(triple)
 72 | 
 73 |     # Search also supports limit and offset
 74 |     triples, cardinality = document.search_triples("", "", "", limit=10, offset=100)
 75 |     # etc ...
 76 | 
 77 | .. |Build Status| image:: https://travis-ci.org/Callidon/pyHDT.svg?branch=master
 78 |    :target: https://travis-ci.org/Callidon/pyHDT
 79 | .. |Documentation Status| image:: https://readthedocs.org/projects/pyhdt/badge/?version=latest
 80 |    :target: https://callidon.github.io/pyHDT
 81 | .. |PyPI version| image:: https://badge.fury.io/py/hdt.svg
 82 |    :target: https://badge.fury.io/py/hdt
 83 | 
 84 | Handling non UTF-8 strings in python
 85 | ====================================
 86 | 
 87 | If the HDT document has been encoded with a non UTF-8 encoding the
 88 | previous code won’t work correctly and will result in a
 89 | ``UnicodeDecodeError``. More details on how to convert string to str
 90 | from c++ to python `here`_
 91 | 
 92 | To handle this we doubled the API of the HDT document by adding:
 93 | 
 94 | - ``search_triples_bytes(...)`` return an iterator of triples as ``(py::bytes, py::bytes, py::bytes)``
 95 | - ``search_join_bytes(...)`` return an iterator of sets of solutions mapping as ``py::set(py::bytes, py::bytes)``
 96 | - ``convert_tripleid_bytes(...)`` return a triple as: ``(py::bytes, py::bytes, py::bytes)``
 97 | - ``convert_id_bytes(...)`` return a ``py::bytes``
 98 | 
 99 | **Parameters and documentation are the same as the standard version**
100 | 
101 | .. code:: python
102 | 
103 |   from hdt import HDTDocument
104 | 
105 |    # Load an HDT file.
106 |    # Missing indexes are generated automatically, add False as the second argument to disable them
107 |   document = HDTDocument("test.hdt")
108 |   it = document.search_triples_bytes("", "", "")
109 | 
110 |   for s, p, o in it:
111 |     print(s, p, o) # print b'...', b'...', b'...'
112 |     # now decode it, or handle any error
113 |     try:
114 |       s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8')
115 |     except UnicodeDecodeError as err:
116 |       # try another codec
117 |       pass
118 | 
119 | .. _here: https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html
120 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = pyHDT
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 	cp source/_config.yml build/html/_config.yml
22 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=pyHDT
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | 	echo.installed, then set the SPHINXBUILD environment variable to point
21 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | 	echo.may add the Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/source/_config.yml:
--------------------------------------------------------------------------------
1 | baseurl: /
2 | include: [ "_static", "_static/*" ]
3 | 


--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
 1 | API documentation
 2 | =================
 3 | 
 4 | .. currentmodule:: hdt
 5 | 
 6 | HDTDocument
 7 | -----------
 8 | 
 9 | .. autoclass:: HDTDocument
10 |   :members:
11 | 
12 |   .. method:: __init__(self, filePath) -> hdt.HDTDocument
13 | 
14 |     Build a new :class:`hdt.HDTDocument` by loading the HDT file located in ``filePath``.
15 | 
16 |     Args:
17 |       - filePath ``str``: the path to the HDT file to load.
18 | 
19 |     .. code-block:: python
20 | 
21 |       from hdt import HDTDocument
22 | 
23 |       # Load HDT file. Missing indexes are generated automatically
24 |       document = HDTDocument("test.hdt")
25 | 
26 |       # Display some metadata about the HDT document itself
27 |       print("nb triples: %i" % document.total_triples)
28 |       print("nb subjects: %i" % document.nb_subjects)
29 |       print("nb predicates: %i" % document.nb_predicates)
30 |       print("nb objects: %i" % document.nb_objects)
31 |       print("nb shared subject-object: %i" % document.nb_shared)
32 | 
33 | 
34 | TripleIterator
35 | --------------
36 | 
37 | .. autoclass:: TripleIterator
38 |   :inherited-members:
39 |   :members:
40 | 
41 | TripleIDIterator
42 | ----------------
43 | 
44 | .. autoclass:: TripleIDIterator
45 |   :inherited-members:
46 |   :members:
47 | 
48 | JoinIterator
49 | --------------
50 | 
51 | .. autoclass:: JoinIterator
52 |   :inherited-members:
53 |   :members:
54 | 
55 | 
56 | Enumerations
57 | -------------
58 | 
59 | IdentifierPosition
60 | ^^^^^^^^^^^^^^^^^^^
61 | 
62 | .. autoclass:: IdentifierPosition
63 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # pyHDT documentation build configuration file, created by
  5 | # sphinx-quickstart on Mon Jan 22 10:41:42 2018.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | # import os
 21 | # import sys
 22 | # sys.path.insert(0, os.path.abspath('.'))
 23 | 
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #
 29 | # needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = ['sphinx.ext.autodoc']
 35 | 
 36 | # Add any paths that contain templates here, relative to this directory.
 37 | templates_path = ['_templates']
 38 | 
 39 | # The suffix(es) of source filenames.
 40 | # You can specify multiple suffix as a list of string:
 41 | #
 42 | # source_suffix = ['.rst', '.md']
 43 | source_suffix = '.rst'
 44 | 
 45 | # The master toctree document.
 46 | master_doc = 'index'
 47 | 
 48 | # General information about the project.
 49 | project = 'pyHDT'
 50 | copyright = '2018, Thomas Minier'
 51 | author = 'Thomas Minier'
 52 | 
 53 | # The version info for the project you're documenting, acts as replacement for
 54 | # |version| and |release|, also used in various other places throughout the
 55 | # built documents.
 56 | #
 57 | # The short X.Y version.
 58 | version = '1.0.0'
 59 | # The full version, including alpha/beta/rc tags.
 60 | release = '1.0.0'
 61 | 
 62 | # The language for content autogenerated by Sphinx. Refer to documentation
 63 | # for a list of supported languages.
 64 | #
 65 | # This is also used if you do content translation via gettext catalogs.
 66 | # Usually you set "language" from the command line for these cases.
 67 | language = None
 68 | 
 69 | # List of patterns, relative to source directory, that match files and
 70 | # directories to ignore when looking for source files.
 71 | # These patterns also affect html_static_path and html_extra_path
 72 | exclude_patterns = []
 73 | 
 74 | # The name of the Pygments (syntax highlighting) style to use.
 75 | pygments_style = 'sphinx'
 76 | 
 77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 78 | todo_include_todos = False
 79 | 
 80 | 
 81 | # -- Options for HTML output ----------------------------------------------
 82 | 
 83 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 84 | # a list of builtin themes.
 85 | #
 86 | html_theme = 'sphinx_rtd_theme'
 87 | 
 88 | # Theme options are theme-specific and customize the look and feel of a theme
 89 | # further.  For a list of options available for each theme, see the
 90 | # documentation.
 91 | #
 92 | # html_theme_options = { 'show_related': True}
 93 | 
 94 | # Add any paths that contain custom static files (such as style sheets) here,
 95 | # relative to this directory. They are copied after the builtin static files,
 96 | # so a file named "default.css" will overwrite the builtin "default.css".
 97 | html_static_path = ['_static']
 98 | 
 99 | # Custom sidebar templates, must be a dictionary that maps document names
100 | # to template names.
101 | #
102 | # This is required for the alabaster theme
103 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
104 | html_sidebars = {
105 |     '**': [
106 |         'globaltoc.html',
107 |         'relations.html',  # needs 'show_related': True theme option to display
108 |         'sourcelink.html',
109 |         'searchbox.html',
110 |     ]
111 | }
112 | 
113 | 
114 | # -- Options for HTMLHelp output ------------------------------------------
115 | 
116 | # Output file base name for HTML help builder.
117 | htmlhelp_basename = 'pyHDTdoc'
118 | 
119 | 
120 | # -- Options for LaTeX output ---------------------------------------------
121 | 
122 | latex_elements = {
123 |     # The paper size ('letterpaper' or 'a4paper').
124 |     #
125 |     # 'papersize': 'letterpaper',
126 | 
127 |     # The font size ('10pt', '11pt' or '12pt').
128 |     #
129 |     # 'pointsize': '10pt',
130 | 
131 |     # Additional stuff for the LaTeX preamble.
132 |     #
133 |     # 'preamble': '',
134 | 
135 |     # Latex figure (float) alignment
136 |     #
137 |     # 'figure_align': 'htbp',
138 | }
139 | 
140 | # Grouping the document tree into LaTeX files. List of tuples
141 | # (source start file, target name, title,
142 | #  author, documentclass [howto, manual, or own class]).
143 | latex_documents = [
144 |     (master_doc, 'pyHDT.tex', 'pyHDT Documentation',
145 |      'Thomas Minier', 'manual'),
146 | ]
147 | 
148 | 
149 | # -- Options for manual page output ---------------------------------------
150 | 
151 | # One entry per manual page. List of tuples
152 | # (source start file, name, description, authors, manual section).
153 | man_pages = [
154 |     (master_doc, 'pyhdt', 'pyHDT Documentation',
155 |      [author], 1)
156 | ]
157 | 
158 | 
159 | # -- Options for Texinfo output -------------------------------------------
160 | 
161 | # Grouping the document tree into Texinfo files. List of tuples
162 | # (source start file, target name, title, author,
163 | #  dir menu entry, description, category)
164 | texinfo_documents = [
165 |     (master_doc, 'pyHDT', 'pyHDT Documentation',
166 |      author, 'pyHDT', 'One line description of project.',
167 |      'Miscellaneous'),
168 | ]
169 | 


--------------------------------------------------------------------------------
/docs/source/hdtdocument.rst:
--------------------------------------------------------------------------------
  1 | HDTDocument
  2 | ===========
  3 | 
  4 | Loading HDT files
  5 | ^^^^^^^^^^^^^^^^^
  6 | 
  7 | The main class for manipulating HDT documents using pyHDT is ``HDTDocument``.
  8 | Upon creation, it searches for an index file in the same directory as the HDT file you wish to load.
  9 | 
 10 | For example, if you load a file */home/awesome-user/test.hdt*, HDTDocument will look for the index file
 11 | */home/awesome-user/test.hdt.index.v1-1*.
 12 | 
 13 | Missing indexes are generated automatically, but be careful, as this requires loading all HDT triples in memory!
 14 | 
 15 | .. code-block:: python
 16 | 
 17 |   from hdt import HDTDocument
 18 | 
 19 |   # Load an HDT file.
 20 |   # Missing indexes are generated automatically, add False as the second argument to disable them
 21 |   document = HDTDocument("test.hdt")
 22 | 
 23 |   # Display some metadata about the HDT document itself
 24 |   print("nb triples: %i" % document.total_triples)
 25 |   print("nb subjects: %i" % document.nb_subjects)
 26 |   print("nb predicates: %i" % document.nb_predicates)
 27 |   print("nb objects: %i" % document.nb_objects)
 28 |   print("nb shared subject-object: %i" % document.nb_shared)
 29 | 
 30 | 
 31 | Searching for triples
 32 | ^^^^^^^^^^^^^^^^^^^^^^
 33 | 
 34 | You can search for all RDF triples in the HDT file matching a triple pattern using ``search_triples``.
 35 | It returns a 2-element tuple, with an *iterator* over the matching RDF triples and the estimated triple pattern *cardinality*.
 36 | 
 37 | .. code-block:: python
 38 | 
 39 |   from hdt import HDTDocument
 40 |   document = HDTDocument("test.hdt")
 41 | 
 42 |   # Fetch all triples that match { ?s ?p ?o }
 43 |   # Use empty strings ("") to indicate variables
 44 |   (triples, cardinality) = document.search_triples("", "", "")
 45 | 
 46 |   print("cardinality of { ?s ?p ?o }: %i" % cardinality)
 47 |   for triple in triples:
 48 |     print(triple)
 49 | 
 50 |   # Search also supports limit and offset
 51 |   (triples, cardinality) = document.search_triples("", "", "", limit=10, offset=100)
 52 |   # etc ...
 53 | 
 54 | Searching for triple IDs
 55 | ^^^^^^^^^^^^^^^^^^^^^^^^^
 56 | 
 57 | A typical HDT document encodes a triple's subject, predicate and object as unique integers, named **TripleID**.
 58 | For example, the triple ``("ex:Toto", "ex:type", "ex:Person")`` can be encoded as ``(1, 2, 3)``.
 59 | An ``HDTDocument`` allows for searching RDF triples in this format, using the ``search_triples_ids`` method, which works exactly like the classic ``search_triples``.
 60 | 
 61 | .. code-block:: python
 62 | 
 63 |   from hdt import HDTDocument
 64 |   document = HDTDocument("test.hdt")
 65 | 
 66 |   (triples, cardinality) = document.search_triples_ids("", "", "")
 67 | 
 68 |   for s, p, o in triples:
 69 |     print(s, p, o) # will print 3-element tuples of integers
 70 | 
 71 |     # convert a triple ID to a string format
 72 |     print(document.convert_tripleid(s, p, o))
 73 | 
 74 | Join evaluation
 75 | ^^^^^^^^^^^^^^^
 76 | 
 77 | An HDT document also provides support for evaluating joins over a set of triple patterns.
 78 | 
 79 | .. code-block:: python
 80 | 
 81 |   from hdt import HDTDocument
 82 |   document = HDTDocument("test.hdt")
 83 | 
 84 |   # find all actors with their names in the HDT document
 85 |   tp_a = ("?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://example.org#Actor")
 86 |   tp_b = ("?s", "http://xmlns.com/foaf/0.1/name", "?name")
 87 |   iterator = document.search_join(set([tp_a, tp_b]))
 88 | 
 89 |   print("estimated join cardinality : %i" % len(iterator))
 90 |   for mappings in iterator:
 91 |     print(mappings)
 92 | 
 93 | Ordering
 94 | ^^^^^^^^^^^
 95 | 
 96 | When searching for triples (either in string or triple id format), results are returned ordered by (subject, predicate, object).
 97 | However, this order is **not** an order on string values, but an order on **triple ids**.
 98 | For example, ``("ex:2", "ex:type", "ex:Person") < ("ex:1", "ex:type", "ex:Person")``,
 99 | because their triple ids counterparts are ``(1, 2, 3)`` and ``(2, 2, 3)``.
100 | 
101 | For more details about this topic, please refer to the `HDT journal article <http://www.imap.websemanticsjournal.org/preprints/index.php/ps/article/viewFile/328/333>`_.
102 | 
103 | Handling non UTF-8 strings in python
104 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
105 | 
106 | If the HDT document has been encoded with a non UTF-8 encoding, the
107 | previous code won’t work correctly and will result in a
108 | ``UnicodeDecodeError``. More details on how strings are converted
109 | from C++ to Python are available `here`_.
110 | 
111 | To handle this we doubled the API of the HDT document by adding:
112 | 
113 | - ``search_triples_bytes(...)`` returns an iterator of triples as ``(py::bytes, py::bytes, py::bytes)``
114 | - ``search_join_bytes(...)`` returns an iterator of sets of solution mappings as ``py::set(py::bytes, py::bytes)``
115 | - ``convert_tripleid_bytes(...)`` returns a triple as ``(py::bytes, py::bytes, py::bytes)``
116 | - ``convert_id_bytes(...)`` returns a ``py::bytes``
117 | 
118 | **Parameters and documentation are the same as the standard version**
119 | 
120 | .. code:: python
121 | 
122 |   from hdt import HDTDocument
123 | 
124 |   # Load an HDT file.
125 |   # Missing indexes are generated automatically, add False as the second argument to disable them
126 |   document = HDTDocument("test.hdt")
127 |   (it, cardinality) = document.search_triples_bytes("", "", "")
128 | 
129 |   for s, p, o in it:
130 |     print(s, p, o) # print b'...', b'...', b'...'
131 |     # now decode it, or handle any error
132 |     try:
133 |       s, p, o = s.decode('UTF-8'), p.decode('UTF-8'), o.decode('UTF-8')
134 |     except UnicodeDecodeError as err:
135 |       # try other codecs
136 |       pass
137 | 
138 | .. _here: https://pybind11.readthedocs.io/en/stable/advanced/cast/strings.html
139 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | pyHDT: Read and query HDT document with ease in Python
 2 | ======================================================
 3 | 
 4 | |Build Status| |Documentation Status| |PyPI version|
 5 | 
 6 | Getting started
 7 | ==================
 8 | 
 9 | .. toctree::
10 |    :maxdepth: 3
11 | 
12 |    installation
13 |    hdtdocument
14 |    api
15 | 
16 | Indices and tables
17 | ==================
18 | 
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 | 
23 | .. |Build Status| image:: https://travis-ci.org/Callidon/pyHDT.svg?branch=master
24 |    :target: https://travis-ci.org/Callidon/pyHDT
25 | .. |Documentation Status| image:: https://readthedocs.org/projects/pyhdt/badge/?version=latest
26 |    :target: https://callidon.github.io/pyHDT
27 | .. |PyPI version| image:: https://badge.fury.io/py/hdt.svg
28 |    :target: https://badge.fury.io/py/hdt
29 | 


--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | =============
 3 | 
 4 | Requirements
 5 | ^^^^^^^^^^^^
 6 | 
 7 | * Python *version 3.6.4 or higher*
 8 | * `pip <https://pip.pypa.io/en/stable/>`_
 9 | * **gcc/clang** with **c++11 support**
10 | * **Python Development headers**
11 | 
12 | You must have the `Python.h` header available on your system.
13 | For example, for Python 3.6, install the `python3.6-dev` package on Debian/Ubuntu systems.
14 | 
15 | Then, install the `pybind11
16 | library <http://pybind11.readthedocs.io/en/stable/>`__
17 | 
18 | ::
19 | 
20 |     pip install pybind11
21 | 
22 | Installation
23 | ^^^^^^^^^^^^^
24 | 
25 | Installation in a `virtualenv <https://virtualenv.pypa.io/en/stable/>`_ is **strongly advised!**
26 | 
27 | Installation with pip
28 | -------------------------
29 | 
30 | ::
31 | 
32 |     pip install hdt
33 | 
34 | 
35 | Manual installation
36 | -------------------------
37 | 
38 | .. code-block:: bash
39 | 
40 |   git clone --recursive https://github.com/Callidon/pyHDT
41 |   cd pyHDT/
42 |   pip install -r requirements.txt
43 |   python setup.py install
44 | 


--------------------------------------------------------------------------------
/include/docstrings.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * docstrings.hpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #ifndef PYHDT_DOCSTRINGS_HPP
  7 | #define PYHDT_DOCSTRINGS_HPP
  8 | 
  9 | #include <string>
 10 | 
 11 | const char *MODULE_DOC = R"(
 12 |   The hdt module enables to load and query HDT files with ease.
 13 | )";
 14 | 
 15 | /**
 16 |  * Enums docstrings
 17 |  */
 18 | 
 19 | const char *IDENTIFIER_POSITION_DOC = R"(
 20 |  An enum used to indicate the position (subject, predicate or object) of an Object identifier.
 21 | 
 22 |  Possibles values:
 23 |   - ``IdentifierPosition.Subject``: the subject position
 24 |   - ``IdentifierPosition.Predicate``: the predicate position
 25 |   - ``IdentifierPosition.Object``: the object position
 26 | 
 27 |  .. code-block:: python
 28 | 
 29 |    from hdt import IdentifierPosition
 30 |    print(IdentifierPosition.Subject)
 31 |    print(IdentifierPosition.Predicate)
 32 |    print(IdentifierPosition.Object)
 33 | 
 34 | )";
 35 | 
 36 | /**
 37 |  * HDT Document docstrings
 38 |  */
 39 | 
 40 | const char *HDT_DOCUMENT_CLASS_DOC = R"(
 41 |   An HDTDocument enables to load and query a HDT file.
 42 | 
 43 |   Constructor:
 44 |     - file ``str``: Path to the HDT file to load.
 45 |     - predicate ``boolean``: True if additional indexes must be loaded, False otherwise.
 46 | )";
 47 | 
 48 | const char *HDT_DOCUMENT_GETFILEPATH_DOC = R"(
 49 |   Return the path to the HDT file currently loaded
 50 | )";
 51 | 
 52 | const char *HDT_DOCUMENT_GETNBTRIPLES_DOC = R"(
 53 |   Return the total number of triples in the HDT document
 54 | )";
 55 | 
 56 | const char *HDT_DOCUMENT_GETNBSUBJECTS_DOC = R"(
 57 |   Return the number of subjects in the HDT document
 58 | )";
 59 | 
 60 | const char *HDT_DOCUMENT_GETNBPREDICATES_DOC = R"(
 61 |   Return the number of predicates in the HDT document
 62 | )";
 63 | 
 64 | const char *HDT_DOCUMENT_GETNBOBJECTS_DOC = R"(
 65 |   Return the number of objects in the HDT document
 66 | )";
 67 | 
 68 | const char *HDT_DOCUMENT_GETNBSHARED_DOC = R"(
 69 |   Return the number of shared subject-object in the HDT document
 70 | )";
 71 | 
 72 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_DOC = R"(
 73 |   Search for RDF triples matching the triple pattern { ``subject`` ``predicate`` ``object`` },
 74 |   with an optional ``limit`` and ``offset``.
 75 |   Use empty strings (``""``) to indicate wildcards.
 76 | 
 77 |   Args:
 78 |     - subject ``str``: The subject of the triple pattern to search for.
 79 |     - predicate ``str``: The predicate of the triple pattern to search for.
 80 |     - obj ``str``: The object of the triple pattern to search for.
 81 |     - limit ``int`` ``optional``: Maximum number of triples to search for.
 82 |     - offset ``int`` ``optional``: Number of matching triples to skip before returning results.
 83 | 
 84 |   Return:
 85 |     A 2-elements ``tuple`` (:class:`hdt.TripleIterator`, estimated pattern cardinality), where
 86 |     the TripleIterator iterates over matching RDF triples.
 87 | 
 88 |     A RDF triple itself is a 3-elements ``tuple`` (subject, predicate, object).
 89 | 
 90 |     .. code-block:: python
 91 | 
 92 |       from hdt import HDTDocument
 93 |       document = HDTDocument("test.hdt")
 94 | 
 95 |       # Fetch all triples that matches { ?s ?p ?o }
 96 |       (triples, cardinality) = document.search_triples("", "", "")
 97 | 
 98 |       print("cardinality of { ?s ?p ?o }: %i" % cardinality)
 99 |       for triple in triples:
100 |         print(triple)
101 | 
102 | )";
103 | 
104 | const char *HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC = R"(
105 |   Same as :meth:`hdt.HDTDocument.search_triples`, but RDF triples are represented as unique ids (from the HDT Dictionary).
106 |   Use the integer `0` to indicate wildcards.
107 | 
108 |   Mapping between ids and RDF terms is done using :meth:`hdt.HDTDocument.convert_id`, :meth:`hdt.HDTDocument.convert_term` and :meth:`hdt.HDTDocument.convert_tripleid`.
109 | 
110 |   Args:
111 |     - subject ``int``: The Object identifier of the triple pattern's subject.
112 |     - predicate ``int``: The Object identifier of the triple pattern's predicate.
113 |     - obj ``int``: The Object identifier of the triple pattern's object.
114 |     - limit ``int`` ``optional``: Maximum number of triples to search for.
115 |     - offset ``int`` ``optional``: Number of matching triples to skip before returning results.
116 | 
117 |   Return:
118 |     A 2-elements ``tuple`` (:class:`hdt.TripleIDIterator`, estimated pattern cardinality), where
119 |     the TripleIDIterator iterates over matching RDF triples IDs.
120 | 
121 |     A RDF triple ID itself is a 3-elements ``tuple`` (subjectID, predicateID, objectID).
122 | 
123 |     .. code-block:: python
124 | 
125 |       from hdt import HDTDocument, IdentifierPosition
126 |       document = HDTDocument("test.hdt")
127 | 
128 |       pred = document.convert_term("http://xmlns.com/foaf/0.1/name", IdentifierPosition.Predicate)
129 |       # Fetch all RDF triples that match { ?s foaf:name ?o }
130 |       (triples, cardinality) = document.search_triples_ids(0, pred, 0)
131 | 
132 |       print("cardinality of { ?s foaf:name ?o }: %i" % cardinality)
133 |       for triple in triples:
134 |         print(triple)
135 | 
136 | )";
137 | 
138 | const char *HDT_DOCUMENT_SEARCH_JOIN_DOC = R"(
139 |   Evaluate a join between a set of triple patterns using an iterator.
140 |   A triple pattern itself is a 3-elements ``tuple`` (subject, predicate, object), where SPARQL variables, i.e., join predicates, are prefixed by a ``?``.
141 | 
142 |   Args:
143 |     - patterns ``set``: set of triple patterns.
144 | 
145 |   Return:
146 |     A :class:`hdt.JoinIterator`, which can be consumed as a Python iterator to evaluate the join.
147 | 
148 |     .. code-block:: python
149 | 
150 |       from hdt import HDTDocument
151 |       document = HDTDocument("test.hdt")
152 | 
153 |       # find all actors with their names in the HDT document
154 |       tp_a = ("?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://example.org#Actor")
155 |       tp_b = ("?s", "http://xmlns.com/foaf/0.1/name", "?name")
156 |       iterator = document.search_join(set([tp_a, tp_b]))
157 | 
158 |       print("estimated join cardinality : %i" % len(iterator))
159 |       for mappings in iterator:
160 |         print(mappings)
161 | 
162 | )";
163 | 
164 | const char *HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC = R"(
165 |   Transform a RDF triple from a TripleID representation to a string representation.
166 | 
167 |   Args:
168 |     - subject ``int``: unique ID of the subject.
169 |     - predicate ``int``: unique ID of the predicate.
170 |     - obj ``int``: unique ID of the object.
171 | 
172 |   Return:
173 |     A triple in string representation, i.e., a 3-elements ``tuple`` (subject, predicate, object)
174 | 
175 |     .. code-block:: python
176 | 
177 |       from hdt import HDTDocument, IdentifierPosition
178 |       document = HDTDocument("test.hdt")
179 | 
180 |       # Fetch all triples that match { ?s foaf:name ?o }
181 |       pred = document.convert_term("http://xmlns.com/foaf/0.1/name", IdentifierPosition.Predicate)
182 |       (triples, cardinality) = document.search_triples_ids(0, pred, 0)
183 | 
184 |       for s, p, o in triples:
185 |         print(s, p, o) # will print Object identifiers, i.e., integers
186 |         # convert a triple ID to a string format
187 |         print(document.convert_tripleid(s, p, o))
188 | 
189 | )";
190 | 
191 | const char *HDT_DOCUMENT_CONVERT_ID_DOC = R"(
192 |   Transform an Object Identifier to a RDF term.
193 |   Such identifier are used in TripleID.
194 | 
195 |   Args:
196 |     - id ``int``: Object identifier.
197 |     - position :class:`hdt.IdentifierPosition`: Identifier position.
198 | 
199 |   Return:
200 |     The RDF term associated with the Object Identifier, i.e., either an URI or a RDF literal.
201 | 
202 |     .. code-block:: python
203 | 
204 |       from hdt import HDTDocument, IdentifierPosition
205 |       document = HDTDocument("test.hdt")
206 |       print(document.convert_id(10, IdentifierPosition.Subject))
207 | 
208 | )";
209 | 
210 | const char *HDT_DOCUMENT_CONVERT_TERM_DOC = R"(
211 |   Transform an RDF Term to the associated Object Identifier.
212 |   Such identifier are used in TripleID.
213 | 
214 |   Args:
215 |     - term ``str``: RDF Term.
216 |     - position :class:`hdt.IdentifierPosition`: Identifier position.
217 | 
218 |   Return:
219 |     The Object Identifier associated with the RDF Term
220 | 
221 |     .. code-block:: python
222 | 
223 |       from hdt import HDTDocument, IdentifierPosition
224 |       document = HDTDocument("test.hdt")
225 |       print(document.convert_term("http://example.org#Alice", IdentifierPosition.Subject))
226 | 
227 | )";
228 | 
229 | /**
230 |  * TripleIterator & TripleIDIterator docstrings
231 |  */
232 | 
233 | const char *TRIPLE_ITERATOR_CLASS_DOC = R"(
234 |   A TripleIterator iterates over triples in a HDT file matching a triple pattern, with an optional limit & offset.
235 | 
236 |   Such iterator is returned by :meth:`hdt.HDTDocument.search_triples`.
237 | )";
238 | 
239 | const char *TRIPLE_ID_ITERATOR_CLASS_DOC = R"(
240 |   A TripleIDIterator iterates over triples' IDs in a HDT file matching a triple pattern, with an optional limit & offset.
241 | 
242 |   Such iterator is returned by :meth:`hdt.HDTDocument.search_triples_ids`
243 | 
244 |   Conversion from a tuple of triple ids into a RDF triple is done using :meth:`hdt.HDTDocument.convert_tripleid`.
245 | )";
246 | 
247 | const char *TRIPLE_ITERATOR_NEXT_DOC = R"(
248 |   Return the next matching triple read by the iterator, or raise ``StopIteration`` if there are no more items to yield.
249 | )";
250 | 
251 | const char *TRIPLE_ITERATOR_PEEK_DOC = R"(
252 |   Return the next matching triple read by the iterator without advancing it, or raise ``StopIteration`` if there are no more items to yield.
253 | )";
254 | 
255 | const char *TRIPLE_ITERATOR_HASNEXT_DOC = R"(
256 |   Return true if the iterator still has items to yield, false otherwise.
257 | )";
258 | 
259 | const char *TRIPLE_ITERATOR_GETSUBJECT_DOC = R"(
260 |   Return the subject of the triple pattern currently evaluated.
261 | )";
262 | 
263 | const char *TRIPLE_ITERATOR_GETPREDICATE_DOC = R"(
264 |   Return the predicate of the triple pattern currently evaluated.
265 | )";
266 | 
267 | const char *TRIPLE_ITERATOR_GETOBJECT_DOC = R"(
268 |   Return the object of the triple pattern currently evaluated.
269 | )";
270 | 
271 | const char *TRIPLE_ITERATOR_GETLIMIT_DOC = R"(
272 |   Return the limit of the iterator, i.e., the maximum number of items the iterator will yield.
273 |   A limit of 0 indicates that the iterator limit is the cardinality of the triple pattern currently evaluated.
274 | )";
275 | 
276 | const char *TRIPLE_ITERATOR_GETOFFSET_DOC = R"(
277 |   Return the offset of the iterator, i.e., the number of items the iterator will first skip before yielding.
278 |   An offset of 0 indicates that the iterator will not skip any items.
279 | )";
280 | 
281 | const char *TRIPLE_ITERATOR_NBREADS_DOC = R"(
282 |   Return the number of items read by the iterator until now.
283 |   Do not include any offset, thus the real position of the iterator in the collection of triples can be computed as offset + nb_reads
284 | )";
285 | 
286 | const char *TRIPLE_ITERATOR_SIZE_DOC = R"(
287 |   Get a hint on the cardinality of the triple pattern currently evaluated.
288 |   The iterator's limit and offset are not taken into account.
289 | 
290 |   Return:
291 |     A 2-element ``tuple`` (integer, boolean), where the left member is the estimated cardinality,
292 |     and the right member is True if the estimation is accurate, False otherwise
293 | )";
294 | 
295 | const char *TRIPLE_ITERATOR_ACC_ESTIMATION_DOC = R"(
296 |   Return True if the iterator can accurately estimate the cardinality of the triple pattern, False otherwise.
297 | )";
298 | 
299 | const char *JOIN_ITERATOR_CLASS_DOC = R"(
300 |   A JoinIterator iterates over the set of solution mappings for a join between several triple patterns. It implements the Python iterator protocol and yields sets of solutions mappings.
301 | 
302 |   Such iterator is returned by :meth:`hdt.HDTDocument.search_join`
303 | )";
304 | 
305 | const char *JOIN_ITERATOR_NEXT_DOC = R"(
306 |   Return the next set of solution mappings read by the iterator, or raise ``StopIteration`` if there are no more items to yield.
307 | )";
308 | 
309 | const char *JOIN_ITERATOR_HAS_NEXT_DOC = R"(
310 |   Return true if the iterator still has items to yield, false otherwise.
311 | )";
312 | 
313 | const char *JOIN_ITERATOR_SIZE_DOC = R"(
314 |   Return the estimated join cardinality.
315 | )";
316 | 
317 | const char *JOIN_ITERATOR_RESET_DOC = R"(
318 |   Reset the join, i.e., move the iterator back to its initial state.
319 | )";
320 | 
321 | #endif /* PYHDT_DOCSTRINGS_HPP */
322 | 


--------------------------------------------------------------------------------
/include/hdt_document.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * hdt_document.hpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #ifndef PYHDT_DOCUMENT_HPP
  7 | #define PYHDT_DOCUMENT_HPP
  8 | 
  9 | #include <pybind11/pybind11.h>
 10 | #include "HDT.hpp"
 11 | #include "QueryProcessor.hpp"
 12 | #include "pyhdt_types.hpp"
 13 | #include "triple_iterator.hpp"
 14 | #include "triple_iterator_bytes.hpp"
 15 | #include "tripleid_iterator.hpp"
 16 | #include "join_iterator.hpp"
 17 | #include "join_iterator_bytes.hpp"
 18 | #include <list>
 19 | #include <string>
 20 | #include <vector>
 21 | namespace py = pybind11;
 22 | 
 23 | // The result of a search for a triple pattern in a HDT document:
 24 | // a tuple (matching RDF triples, nb of matching RDF triples)
 25 | typedef std::tuple<TripleIterator *, size_t> search_results;
 26 | 
 27 | // Same as search_results, but for an iterator over triples whose
 28 | // RDF terms are represented as bytes
 29 | typedef std::tuple<TripleIteratorBytes *, size_t> search_results_bytes;
 30 | 
 31 | // Same as search_results, but for an iterator over triple ids
 32 | typedef std::tuple<TripleIDIterator *, size_t> search_results_ids;
 33 | 
 34 | /*!
 35 |  * HDTDocument is the main entry point to manage an HDT document
 36 |  * \author Thomas Minier
 37 |  */
 38 | class HDTDocument {
 39 | private:
 40 |   std::string hdt_file;
 41 |   hdt::HDT *hdt;
 42 |   hdt::QueryProcessor *processor;
 43 |   HDTDocument(std::string file, bool map, bool indexed);
 44 | 
 45 | public:
 46 |   /*!
 47 |    * Destructor
 48 |    */
 49 |   ~HDTDocument();
 50 | 
 51 |   /*!
 52 |    * Get the path to the HDT file currently loaded
 53 |    * @return The path to the HDT file currently loaded
 54 |    */
 55 |   std::string getFilePath();
 56 | 
 57 |   /*!
 58 |    * Implementation for Python function "__repr__"
 59 |    * @return A string representation of the object
 60 |    */
 61 |   std::string python_repr();
 62 | 
 63 |   /*!
 64 |    * Get the total number of triples in the HDT document
 65 |    * @return The total number of triples in the HDT document
 66 |    */
 67 |   unsigned int getNbTriples();
 68 | 
 69 |   /*!
 70 |    * Get the number of distinct subjects in the HDT document
 71 |    * @return The number of distinct subjects in the HDT document
 72 |    */
 73 |   unsigned int getNbSubjects();
 74 | 
 75 |   /*!
 76 |    * Get the number of distinct predicates in the HDT document
 77 |    * @return The number of distinct predicates in the HDT document
 78 |    */
 79 |   unsigned int getNbPredicates();
 80 | 
 81 |   /*!
 82 |    * Get the number of distinct objects in the HDT document
 83 |    * @return The number of distinct objects in the HDT document
 84 |    */
 85 |   unsigned int getNbObjects();
 86 | 
 87 |   /*!
 88 |    * Get the number of shared subjects-objects in the HDT document
 89 |    * @return The number of shared subjects-objects in the HDT document
 90 |    */
 91 |   unsigned int getNbShared();
 92 | 
 93 |   /*!
 94 |    * Static factory method used to create a new HDT Document
 95 |    * @param file - Path to the HDT file
 96 |    * @param map - True maps the HDT file (faster), False loads everything in memory
 97 |    * @param indexed - True if the HDT must be loaded with indexes, False otherwise
 98 |    */
 99 |   static HDTDocument create(std::string file, bool map, bool indexed) {
100 |     return HDTDocument(file, map, indexed);
101 |   }
102 | 
103 |   /*!
104 |    * Convert a TripleID to a string RDF triple
105 |    * @param  subject   - Triple's subject
106 |    * @param  predicate - Triple's predicate
107 |    * @param  object    - Triple's object
108 |    * @return The associated RDF triple
109 |    */
110 |   triple convertTripleID(unsigned int subject, unsigned int predicate,
111 |                      unsigned int object);
112 | 
113 |   /**
114 |    * Convert an Object Identifier into the equivalent RDF term
115 |    * @param  id  - Object Identifier
116 |    * @param  pos - Identifier position (subject, predicate or object)
117 |    * @return The RDF term equivalent to the Object Identifier
118 |    */
119 |   string convertID(unsigned int id, IdentifierPosition pos);
120 | 
121 |   /**
122 |    * Convert an RDF term into the associated Object Identifier.
123 |    * @param  term  - RDF Term in string format
124 |    * @param  pos - Identifier position (subject, predicate or object)
125 |    * @return The Object Identifier associated with the RDF term
126 |    */
127 |   unsigned int convertTerm(std::string term, IdentifierPosition pos);
128 | 
129 |   /*!
130 |    * Search all matching triples for a triple pattern, with an optional limit and offset.
131 |    * Returns a tuple<TripleIterator*, cardinality>
132 |    * @param subject   - Triple pattern's subject
133 |    * @param predicate - Triple pattern's predicate
134 |    * @param object    - Triple pattern's object
135 |    * @param limit     - (Optional) Maximum number of matching triples to read
136 |    * @param offset    - (Optional) Number of matching triples to skip
137 |    * @return A tuple (TripleIterator*, cardinality)
138 |    */
139 |   search_results search(std::string subject, std::string predicate,
140 |                         std::string object, unsigned int limit = 0,
141 |                         unsigned int offset = 0);
142 | 
143 |   /*!
144 |    * Same as HDTDocument#search, but search for TripleIDs instead.
145 |    * Returns a tuple<TripleIDIterator*, cardinality>
146 |    * @param subject   - Triple pattern's subject identifier
147 |    * @param predicate - Triple pattern's predicate identifier
148 |    * @param object    - Triple pattern's object identifier
149 |    * @param limit     - (Optional) Maximum number of matching triples to read
150 |    * @param offset    - (Optional) Number of matching triples to skip
151 |    * @return A tuple (TripleIDIterator*, cardinality)
152 |    */
153 |   search_results_ids searchIDs(unsigned int subject, unsigned int predicate,
154 |                                unsigned int object, unsigned int limit = 0,
155 |                                unsigned int offset = 0);
156 | 
157 |   /**
158 |    * Evaluate a join between a set of triple patterns using a JoinIterator.
159 |    * @param  patterns - Set of triple patterns
160 |    * @return A JoinIterator* used to evaluate the join.
161 |    */
162 |   JoinIterator * searchJoin(std::vector<triple> patterns);
163 | 
164 |   // ============== BYTES REPRESENTATION ==============
165 |   // Author: Arnaud GRALL - MIT License 2017-2019
166 |   /*!
167 |    * Search all matching triples for a triple pattern, with an optional limit and offset. Returns bytes instead of string
168 |    * Returns a tuple<TripleIteratorBytes*, cardinality>
169 |    * @param subject   - Triple pattern's subject
170 |    * @param predicate - Triple pattern's predicate
171 |    * @param object    - Triple pattern's object
172 |    * @param limit     - (Optional) Maximum number of matching triples to read
173 |    * @param offset    - (Optional) Number of matching triples to skip
174 |    * @return A tuple (TripleIteratorBytes*, cardinality)
175 |    */
176 |   search_results_bytes searchBytes(std::string subject, std::string predicate,
177 |                         std::string object, unsigned int limit = 0,
178 |                         unsigned int offset = 0);
179 |   /**
180 |    * Evaluate a join between a set of triple patterns using a JoinIteratorBytes.
181 |    * @param  patterns - Set of triple patterns
182 |    * @return A JoinIteratorBytes* used to evaluate the join.
183 |    */
184 |   JoinIteratorBytes * searchJoinBytes(std::vector<triple> patterns);
185 |   /*!
186 |    * Convert a TripleID to a RDF triple as bytes
187 |    * @param  subject   - Triple's subject
188 |    * @param  predicate - Triple's predicate
189 |    * @param  object    - Triple's object
190 |    * @return The associated RDF triple, as a tuple of bytes
191 |    */
192 |   triple_bytes convertTripleIDBytes(unsigned int subject, unsigned int predicate,
193 |                      unsigned int object);
194 | 
195 |   /**
196 |    * Convert an Object Identifier into the equivalent RDF term as bytes
197 |    * @param  id  - Object Identifier
198 |    * @param  pos - Identifier position (subject, predicate or object)
199 |    * @return The RDF term equivalent to the Object Identifier, as bytes
200 |    */
201 |   py::bytes convertIDBytes(unsigned int id, IdentifierPosition pos);
202 | };
203 | 
204 | #endif /* PYHDT_DOCUMENT_HPP */
205 | 


--------------------------------------------------------------------------------
/include/join_iterator.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * join_iterator.hpp
 3 |  * Author: Thomas MINIER - MIT License 2017-2019
 4 |  */
 5 | 
 6 | #ifndef JOIN_ITERATOR_HPP
 7 | #define JOIN_ITERATOR_HPP
 8 | 
 9 | #include "pyhdt_types.hpp"
10 | #include "QueryProcessor.hpp"
11 | #include <string>
12 | 
13 | /*!
14 |  * JoinIterator iterates over solution bindings of a join
15 |  * @author Thomas Minier
16 |  */
17 | class JoinIterator {
18 | private:
19 |   hdt::VarBindingString *iterator;
20 |   bool hasNextSolution = true;
21 | 
22 | public:
23 |   /*!
24 |    * Constructor
25 |    * @param _it - hdt::VarBindingString to wrap (presumably the join results to iterate over; confirm in join_iterator.cpp)
26 |    */
27 |   JoinIterator(hdt::VarBindingString *_it);
28 | 
29 |   /*!
30 |    * Destructor
31 |    */
32 |   ~JoinIterator();
33 | 
34 |   /*!
35 |    * Implementation for Python function "__repr__"
36 |    * @return A string representation of the object
37 |    */
38 |   std::string python_repr();
39 | 
40 |   /*!
41 |    * Implementation for Python function "__iter__"
42 |    * @return A pointer to a JoinIterator (presumably this iterator itself; confirm in join_iterator.cpp)
43 |    */
44 |   JoinIterator *python_iter();
45 | 
46 |   /**
47 |    * Get the estimated join cardinality
48 |    * @return The estimated join cardinality
49 |    */
50 |   size_t estimatedCardinality();
51 | 
52 |   /**
53 |    * Reset the iterator into its initial state and restart join processing.
54 |    */
55 |   void reset();
56 | 
57 |   /*!
58 |    * Return true if the iterator still has items available, False otherwise.
59 |    * @return True if the iterator still has items available, False otherwise
60 |    */
61 |   bool hasNext();
62 | 
63 |   /**
64 |    * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
65 |    * has ended. Used to implement Python Iterator protocol.
66 |    * @return The next set of solution bindings
67 |    */
68 |   solution_bindings next();
69 | 
70 | };
71 | 
72 | #endif /* JOIN_ITERATOR_HPP */
73 | 


--------------------------------------------------------------------------------
/include/join_iterator_bytes.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * join_iterator_bytes.hpp
 3 |  * Author: Arnaud Grall - MIT License 2017-2019
 4 |  */
 5 | 
 6 | #ifndef JOIN_ITERATOR_BYTES_HPP
 7 | #define JOIN_ITERATOR_BYTES_HPP
 8 | 
 9 | #include "pyhdt_types.hpp"
10 | #include "QueryProcessor.hpp"
11 | #include <string>
12 | 
13 | /*!
14 |  * JoinIteratorBytes iterates over solution bindings of a join, yielded as a py::set
15 |  * @author Arnaud Grall
16 |  */
17 | class JoinIteratorBytes {
18 | private:
19 |   hdt::VarBindingString *iterator;
20 |   bool hasNextSolution = true;
21 | 
22 | public:
23 |   /*!
24 |    * Constructor
25 |    * @param _it - hdt::VarBindingString to wrap (presumably the join results to iterate over; confirm in join_iterator_bytes.cpp)
26 |    */
27 |   JoinIteratorBytes(hdt::VarBindingString *_it);
28 | 
29 |   /*!
30 |    * Destructor
31 |    */
32 |   ~JoinIteratorBytes();
33 | 
34 |   /*!
35 |    * Implementation for Python function "__repr__"
36 |    * @return A string representation of the object
37 |    */
38 |   std::string python_repr();
39 | 
40 |   /*!
41 |    * Implementation for Python function "__iter__"
42 |    * @return A pointer to a JoinIteratorBytes (presumably this iterator itself; confirm in join_iterator_bytes.cpp)
43 |    */
44 |   JoinIteratorBytes *python_iter();
45 | 
46 |   /**
47 |    * Get the estimated join cardinality
48 |    * @return The estimated join cardinality
49 |    */
50 |   size_t estimatedCardinality();
51 | 
52 |   /**
53 |    * Reset the iterator into its initial state and restart join processing.
54 |    */
55 |   void reset();
56 | 
57 |   /*!
58 |    * Return true if the iterator still has items available, False otherwise.
59 |    * @return True if the iterator still has items available, False otherwise
60 |    */
61 |   bool hasNext();
62 | 
63 |   /**
64 |    * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
65 |    * has ended. Used to implement Python Iterator protocol.
66 |    * @return The next set of solution bindings, as a py::set
67 |    */
68 |   py::set next();
69 | 
70 | };
71 | 
72 | #endif /* JOIN_ITERATOR_BYTES_HPP */
73 | 


--------------------------------------------------------------------------------
/include/pyhdt_types.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * pyhdt_types.hpp
 3 |  * Author: Thomas MINIER, Arnaud Grall - MIT License 2017-2019
 4 |  */
 5 | 
 6 | #ifndef PYHDT_TYPES_HPP
 7 | #define PYHDT_TYPES_HPP
 8 | 
 9 | #include <list>
10 | #include <string>
11 | #include <tuple>
12 | #include <set>
13 | #include <pybind11/pybind11.h>
14 | namespace py = pybind11;
15 | 
16 | /**
17 |  * Indicates the position (subject, predicate or object) of an Object Identifier
18 |  */
19 | enum IdentifierPosition {
20 |   Subject = 1,
21 |   Predicate = 2,
22 |   Object = 3
23 | };
24 | 
25 | // A RDF Triple. RDF terms are represented as simple strings by HDT.
26 | typedef std::tuple<std::string, std::string, std::string> triple;
27 | 
28 | // A RDF triple composed of IDs from HDT dictionnary
29 | typedef std::tuple<unsigned int, unsigned int, unsigned int> triple_id;
30 | 
31 | // A list of RDF triples
32 | typedef std::list<triple> triple_list;
33 | 
34 | // A list of RDF triples IDs
35 | typedef std::list<triple_id> triple_ids_list;
36 | 
37 | // A hint over the cardinality of a triple pattern
38 | // The right element of the tuple is True if the hint is accurate, False otherwise
39 | typedef std::tuple<size_t, bool> size_hint;
40 | 
41 | typedef std::tuple<std::string, std::string> single_binding;
42 | 
43 | typedef std::set<single_binding> *solution_bindings;
44 | 
45 | // ============== BYTES REPRESENTATION ==============
46 | // An RDF triple. RDF terms are represented as simple bytes by HDT.
47 | typedef std::tuple<py::bytes, py::bytes, py::bytes> triple_bytes;
48 | // A Set of solutions bindings for the join iterator
49 | typedef py::set solution_bindings_bytes;
50 | 
51 | #endif /* PYHDT_TYPES_HPP */
52 | 


--------------------------------------------------------------------------------
/include/triple_iterator.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * triple_iterator.hpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #ifndef TRIPLE_ITERATOR_HPP
  7 | #define TRIPLE_ITERATOR_HPP
  8 | 
  9 | #include "tripleid_iterator.hpp"
 10 | #include "pyhdt_types.hpp"
 11 | #include "Dictionary.hpp"
 12 | #include <string>
 13 | 
 14 | /*!
 15 |  * TripleIterator iterates over RDF triples of an HDT document which match a
 16 |  * triple pattern + limit + offset \author Thomas Minier
 17 |  */
 18 | class TripleIterator {
 19 | private:
 20 |   TripleIDIterator *iterator;
 21 |   hdt::Dictionary *dictionary;
 22 | 
 23 | public:
 24 |   /*!
 25 |    * Constructor
 26 |    * @param iterator [description]
 27 |    */
 28 |   TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict);
 29 | 
 30 |   /*!
 31 |    * Destructor
 32 |    */
 33 |   ~TripleIterator();
 34 | 
 35 |   /*!
 36 |    * Implementation for Python function "__repr__"
 37 |    * @return [description]
 38 |    */
 39 |   std::string python_repr();
 40 | 
 41 |   /*!
 42 |    * Get the subject of the triple pattern currently evaluated.
 43 |    * An empty string represents a variable
 44 |    * @return [description]
 45 |    */
 46 |   std::string getSubject();
 47 | 
 48 |   /*!
 49 |    * Get the predicate of the triple pattern currently evaluated.
 50 |    * An empty string represents a variable
 51 |    * @return [description]
 52 |    */
 53 |   std::string getPredicate();
 54 | 
 55 |   /*!
 56 |    * Get the object of the triple pattern currently evaluated.
 57 |    * An empty string represents a variable
 58 |    * @return [description]
 59 |    */
 60 |   std::string getObject();
 61 | 
 62 |   /*!
 63 |    * Get the limit of the current iterator
 64 |    * @return [description]
 65 |    */
 66 |   unsigned int getLimit();
 67 | 
 68 |   /*!
 69 |    * Get the offset of the current iterator
 70 |    * @return [description]
 71 |    */
 72 |   unsigned int getOffset();
 73 | 
 74 |   /*!
 75 |    * Get the number of results read by the iterator
 76 |    * @return [description]
 77 |    */
 78 |   unsigned int getNbResultsRead();
 79 | 
 80 |   /*!
 81 |    * Implementation for Python function "__iter__"
 82 |    * @return [description]
 83 |    */
 84 |   TripleIterator *python_iter();
 85 | 
 86 |   /*!
 87 |    * Get the estimated cardinality of the pattern currently evaluated.
 88 |    * Offset & limit are not taken into account.
 89 |    * @return [description]
 90 |    */
 91 |   size_hint sizeHint();
 92 | 
 93 |   /*!
 94 |    * Return true if the iterator still has items available, False otherwise.
 95 |    * @return [description]
 96 |    */
 97 |   bool hasNext();
 98 | 
 99 |   /**
100 |    * Get the next item in the iterator, or raise py::StopIteration if the
101 |    * iterator has ended
102 |    * @return [description]
103 |    */
104 |   triple next();
105 | 
106 |   /**
107 |    * Get the next item in the iterator, or raise py::StopIteration if the
108 |    * iterator has ended, but without advancing the iterator.
109 |    * @return [description]
110 |    */
111 |   triple peek();
112 | };
113 | 
114 | #endif /* TRIPLE_ITERATOR_HPP */
115 | 


--------------------------------------------------------------------------------
/include/triple_iterator_bytes.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * triple_iterator_bytes.hpp
  3 |  * Author: Arnaud GRALL - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #ifndef TRIPLE_ITERATOR_BYTES_HPP
  7 | #define TRIPLE_ITERATOR_BYTES_HPP
  8 | 
  9 | #include "tripleid_iterator.hpp"
 10 | #include "pyhdt_types.hpp"
 11 | #include "Dictionary.hpp"
 12 | #include <string>
 13 | 
 14 | /*!
 15 |  * TripleIteratorBytes iterates over RDF triples of an HDT document which match a
 16 |  * triple pattern + limit + offset \author Thomas Minier
 17 |  */
 18 | class TripleIteratorBytes {
 19 | private:
 20 |   TripleIDIterator *iterator;
 21 |   hdt::Dictionary *dictionary;
 22 | 
 23 | public:
 24 |   /*!
 25 |    * Constructor
 26 |    * @param iterator [description]
 27 |    */
 28 |   TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict);
 29 | 
 30 |   /*!
 31 |    * Destructor
 32 |    */
 33 |   ~TripleIteratorBytes();
 34 | 
 35 |   /*!
 36 |    * Implementation for Python function "__repr__"
 37 |    * @return [description]
 38 |    */
 39 |   std::string python_repr();
 40 | 
 41 |   /*!
 42 |    * Get the subject of the triple pattern currently evaluated.
 43 |    * An empty string represents a variable
 44 |    * @return [description]
 45 |    */
 46 |   std::string getSubject();
 47 | 
 48 |   /*!
 49 |    * Get the predicate of the triple pattern currently evaluated.
 50 |    * An empty string represents a variable
 51 |    * @return [description]
 52 |    */
 53 |   std::string getPredicate();
 54 | 
 55 |   /*!
 56 |    * Get the object of the triple pattern currently evaluated.
 57 |    * An empty string represents a variable
 58 |    * @return [description]
 59 |    */
 60 |   std::string getObject();
 61 | 
 62 |   /*!
 63 |    * Get the limit of the current iterator
 64 |    * @return [description]
 65 |    */
 66 |   unsigned int getLimit();
 67 | 
 68 |   /*!
 69 |    * Get the offset of the current iterator
 70 |    * @return [description]
 71 |    */
 72 |   unsigned int getOffset();
 73 | 
 74 |   /*!
 75 |    * Get the number of results read by the iterator
 76 |    * @return [description]
 77 |    */
 78 |   unsigned int getNbResultsRead();
 79 | 
 80 |   /*!
 81 |    * Implementation for Python function "__iter__"
 82 |    * @return [description]
 83 |    */
 84 |   TripleIteratorBytes *python_iter();
 85 | 
 86 |   /*!
 87 |    * Get the estimated cardinality of the pattern currently evaluated.
 88 |    * Offset & limit are not taken into account.
 89 |    * @return [description]
 90 |    */
 91 |   size_hint sizeHint();
 92 | 
 93 |   /*!
 94 |    * Return true if the iterator still has items available, False otherwise.
 95 |    * @return [description]
 96 |    */
 97 |   bool hasNext();
 98 | 
 99 |   /**
100 |    * Get the next item in the iterator, or raise py::StopIteration if the
101 |    * iterator has ended
102 |    * @return [description]
103 |    */
104 |   triple_bytes next();
105 | 
106 |   /**
107 |    * Get the next item in the iterator, or raise py::StopIteration if the
108 |    * iterator has ended, but without advancing the iterator.
109 |    * @return [description]
110 |    */
111 |   triple_bytes peek();
112 | };
113 | 
114 | #endif /* TRIPLE_ITERATOR_BYTES_HPP */
115 | 


--------------------------------------------------------------------------------
/include/tripleid_iterator.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * tripleid_iterator.hpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #ifndef TRIPLEID_ITERATOR_HPP
  7 | #define TRIPLEID_ITERATOR_HPP
  8 | 
  9 | #include "pyhdt_types.hpp"
 10 | #include <Iterator.hpp>
 11 | #include <string>
 12 | 
 13 | /*!
 14 |  * TripleIDIterator iterates over IDs of RDF triples of an HDT document which
 15 |  * match a triple pattern + limit + offset \author Thomas Minier
 16 |  */
 17 | class TripleIDIterator {
 18 | private:
 19 |   std::string subject;
 20 |   std::string predicate;
 21 |   std::string object;
 22 |   unsigned int limit;
 23 |   unsigned int offset;
 24 |   hdt::IteratorTripleID *iterator;
 25 |   triple_id _bufferedTriple;
 26 |   bool hasBufferedTriple = false;
 27 |   unsigned int resultsRead = 0;
 28 | 
 29 | public:
 30 |   /*!
 31 |    * Constructor
 32 |    * @param iterator [description]
 33 |    */
 34 |   TripleIDIterator(hdt::IteratorTripleID *_it, std::string _subj,
 35 |                    std::string _pred, std::string _obj, unsigned int _limit,
 36 |                    unsigned int _offset);
 37 | 
 38 |   /*!
 39 |    * Destructor
 40 |    */
 41 |   ~TripleIDIterator();
 42 | 
 43 |   /*!
 44 |    * Implementation for Python function "__repr__"
 45 |    * @return [description]
 46 |    */
 47 |   std::string python_repr();
 48 | 
 49 |   /*!
 50 |    * Get the subject of the triple pattern currently evaluated.
 51 |    * @return [description]
 52 |    */
 53 |   std::string getSubject();
 54 | 
 55 |   /*!
 56 |    * Get the predicate of the triple pattern currently evaluated.
 57 |    * @return [description]
 58 |    */
 59 |   std::string getPredicate();
 60 | 
 61 |   /*!
 62 |    * Get the object of the triple pattern currently evaluated.
 63 |    * @return [description]
 64 |    */
 65 |   std::string getObject();
 66 | 
 67 |   /*!
 68 |    * Get the limit of the current iterator
 69 |    * @return [description]
 70 |    */
 71 |   unsigned int getLimit();
 72 | 
 73 |   /*!
 74 |    * Get the offset of the current iterator
 75 |    * @return [description]
 76 |    */
 77 |   unsigned int getOffset();
 78 | 
 79 |   /*!
 80 |    * Get the number of results read by the iterator
 81 |    * @return [description]
 82 |    */
 83 |   unsigned int getNbResultsRead();
 84 | 
 85 |   /*!
 86 |    * Implementation for Python function "__iter__"
 87 |    * @return [description]
 88 |    */
 89 |   TripleIDIterator *python_iter();
 90 | 
 91 |   /*!
 92 |    * Get the estimated cardinality of the pattern currently evaluated.
 93 |    * Offset & limit are not taken into account.
 94 |    * @return [description]
 95 |    */
 96 |   size_hint sizeHint();
 97 | 
 98 |   /*!
 99 |    * Return true if the iterator still has items available, False otherwise.
100 |    * @return [description]
101 |    */
102 |   bool hasNext();
103 | 
104 |   /**
105 |    * Get the next item in the iterator, or raise py::StopIteration if the
106 |    * iterator has ended
107 |    * @return [description]
108 |    */
109 |   triple_id next();
110 | 
111 |   /**
112 |    * Get the next item in the iterator, or raise py::StopIteration if the
113 |    * iterator has ended, but without advancing the iterator.
114 |    * @return [description]
115 |    */
116 |   triple_id peek();
117 | };
118 | 
119 | #endif /* TRIPLEID_ITERATOR_HPP */
120 | 


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # scripts for automated installation
 3 | 
 4 | echo "Validating dependencies..."
 5 | command -v python >/dev/null 2>&1 || { echo >&2 "Python is required for the installation of pyHDT! Aborting installation..."; exit 1; }
 6 | command -v pip >/dev/null 2>&1 || { echo >&2 "pip is required for the installation of pyHDT! Aborting installation..."; exit 1; }
 7 | command -v curl >/dev/null 2>&1 || { echo >&2 "curl is required for the installation of pyHDT! Aborting installation..."; exit 1; }
 8 | command -v unzip >/dev/null 2>&1 || { echo >&2 "unzip is required for the installation of pyHDT! Aborting installation..."; exit 1; }
 9 | 
10 | echo "Downloading HDT..."
11 | curl -LO https://github.com/rdfhdt/hdt-cpp/archive/v1.3.3.zip
12 | unzip -qq v1.3.3.zip
13 | 
14 | echo "Installing pybind11..."
15 | pip install -r requirements.txt
16 | 
17 | echo "Installing pyHDT..."
18 | python setup.py install
19 | 
20 | echo "Cleaning up..."
21 | rm v1.3.3.zip
22 | rm -rf hdt-cpp-1.3.3/
23 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pybind11==2.2.4
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # setup.py
 2 | # Author: Thomas MINIER - MIT License 2017-2019
 3 | from setuptools import setup, Extension
 4 | from os import listdir
 5 | import pybind11
 6 | 
 7 | __pyhdt_version__ = "2.3"
 8 | 
 9 | PYBIND_VERSION = 'pybind11==2.2.4'
10 | 
11 | with open('README.rst') as file:
12 |     long_description = file.read()
13 | 
14 | 
15 | def list_files(path, extension=".cpp", exclude="S.cpp"):
16 |     """List paths to all files that ends with a given extension"""
17 |     return ["%s/%s" % (path, f) for f in listdir(path) if f.endswith(extension) and (not f.endswith(exclude))]
18 | 
19 | 
20 | # pyHDT source files
21 | sources = [
22 |     "src/hdt.cpp",
23 |     "src/hdt_document.cpp",
24 |     "src/triple_iterator.cpp",
25 |     "src/triple_iterator_bytes.cpp",
26 |     "src/tripleid_iterator.cpp",
27 |     "src/join_iterator.cpp",
28 |     "src/join_iterator_bytes.cpp"
29 | ]
30 | 
31 | # HDT source files
32 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/bitsequence")
33 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/coders")
34 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/mapper")
35 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/sequence")
36 | sources += list_files("hdt-cpp-1.3.3/libcds/src/static/permutation")
37 | sources += list_files("hdt-cpp-1.3.3/libcds/src/utils")
38 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/bitsequence")
39 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/dictionary")
40 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/hdt")
41 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/header")
42 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/huffman")
43 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/libdcs")
44 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/libdcs/fmindex")
45 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/rdf")
46 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/sequence")
47 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/triples")
48 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/util")
49 | sources += list_files("hdt-cpp-1.3.3/libhdt/src/sparql")
50 | 
51 | # pybind11 + pyHDT + libcds +  HDT-lib headers
52 | include_dirs = [
53 |     pybind11.get_include(),
54 |     pybind11.get_include(True),
55 |     "include/",
56 |     "hdt-cpp-1.3.3/libhdt/include/",
57 |     "hdt-cpp-1.3.3/libhdt/src/dictionary/",
58 |     "hdt-cpp-1.3.3/libhdt/src/sparql/",
59 |     "hdt-cpp-1.3.3/libcds/include/",
60 |     "hdt-cpp-1.3.3/libcds/src/static/bitsequence",
61 |     "hdt-cpp-1.3.3/libcds/src/static/coders",
62 |     "hdt-cpp-1.3.3/libcds/src/static/mapper",
63 |     "hdt-cpp-1.3.3/libcds/src/static/permutation",
64 |     "hdt-cpp-1.3.3/libcds/src/static/sequence",
65 |     "hdt-cpp-1.3.3/libcds/src/utils"
66 | ]
67 | 
68 | # Need to build in c++11 minimum
69 | # TODO add a check to use c++14 or c++17 if available
70 | extra_compile_args = ["-std=c++11"]
71 | 
72 | # build HDT extension
73 | hdt_extension = Extension("hdt", sources=sources, include_dirs=include_dirs,
74 |                           extra_compile_args=extra_compile_args, language='c++')
75 | 
76 | setup(
77 |     name="hdt",
78 |     version=__pyhdt_version__,
79 |     author="Thomas Minier",
80 |     author_email="thomas.minier@univ-nantes.fr",
81 |     url="https://github.com/Callidon/pyHDT",
82 |     description="Read and query HDT document with ease in Python",
83 |     long_description=long_description,
84 |     keywords=["hdt", "rdf", "semantic web", "search"],
85 |     license="MIT",
86 |     install_requires=[PYBIND_VERSION],
87 |     setup_requires=[PYBIND_VERSION],
88 |     ext_modules=[hdt_extension]
89 | )
90 | 


--------------------------------------------------------------------------------
/src/hdt.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * hdt.cpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #include <pybind11/pybind11.h>
  7 | #include <pybind11/stl.h>
  8 | 
  9 | #include "docstrings.hpp"
 10 | #include "hdt_document.hpp"
 11 | #include "triple_iterator.hpp"
 12 | #include "triple_iterator_bytes.hpp"
 13 | #include "tripleid_iterator.hpp"
 14 | #include "join_iterator.hpp"
 15 | #include "join_iterator_bytes.hpp"
 16 | 
 17 | namespace py = pybind11;
 18 | 
 19 | PYBIND11_MODULE(hdt, m) {
 20 |   m.doc() = MODULE_DOC;
 21 | 
 22 |   py::enum_<IdentifierPosition>(m, "IdentifierPosition", IDENTIFIER_POSITION_DOC)
 23 |     .value("Subject", IdentifierPosition::Subject)
 24 |     .value("Predicate", IdentifierPosition::Predicate)
 25 |     .value("Object", IdentifierPosition::Object)
 26 |     .export_values();
 27 | 
 28 |   py::class_<TripleIterator>(m, "TripleIterator", TRIPLE_ITERATOR_CLASS_DOC)
 29 |       .def("next", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
 30 |       .def("__next__", &TripleIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
 31 |       .def("peek", &TripleIterator::peek, TRIPLE_ITERATOR_PEEK_DOC)
 32 |       .def("has_next", &TripleIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC)
 33 |       .def("size_hint", &TripleIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
 34 |       .def("__len__", &TripleIterator::sizeHint,
 35 |            TRIPLE_ITERATOR_SIZE_DOC)
 36 |       .def("__iter__", &TripleIterator::python_iter)
 37 |       .def_property_readonly("subject", &TripleIterator::getSubject,
 38 |                              TRIPLE_ITERATOR_GETSUBJECT_DOC)
 39 |       .def_property_readonly("predicate", &TripleIterator::getPredicate,
 40 |                              TRIPLE_ITERATOR_GETPREDICATE_DOC)
 41 |       .def_property_readonly("object", &TripleIterator::getObject,
 42 |                              TRIPLE_ITERATOR_GETOBJECT_DOC)
 43 |       .def_property_readonly("limit", &TripleIterator::getLimit,
 44 |                              TRIPLE_ITERATOR_GETLIMIT_DOC)
 45 |       .def_property_readonly("offset", &TripleIterator::getOffset,
 46 |                              TRIPLE_ITERATOR_GETOFFSET_DOC)
 47 |       .def_property_readonly("nb_reads", &TripleIterator::getNbResultsRead,
 48 |                     TRIPLE_ITERATOR_NBREADS_DOC)
 49 |       .def("__repr__", &TripleIterator::python_repr);
 50 | 
 51 |   py::class_<TripleIteratorBytes>(m, "TripleIteratorBytes", TRIPLE_ITERATOR_CLASS_DOC)
 52 |       .def("next", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC)
 53 |       .def("__next__", &TripleIteratorBytes::next, TRIPLE_ITERATOR_NEXT_DOC)
 54 |       .def("peek", &TripleIteratorBytes::peek, TRIPLE_ITERATOR_PEEK_DOC)
 55 |       .def("has_next", &TripleIteratorBytes::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC)
 56 |       .def("size_hint", &TripleIteratorBytes::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
 57 |       .def("__len__", &TripleIteratorBytes::sizeHint,
 58 |            TRIPLE_ITERATOR_SIZE_DOC)
 59 |       .def("__iter__", &TripleIteratorBytes::python_iter)
 60 |       .def_property_readonly("subject", &TripleIteratorBytes::getSubject,
 61 |                              TRIPLE_ITERATOR_GETSUBJECT_DOC)
 62 |       .def_property_readonly("predicate", &TripleIteratorBytes::getPredicate,
 63 |                              TRIPLE_ITERATOR_GETPREDICATE_DOC)
 64 |       .def_property_readonly("object", &TripleIteratorBytes::getObject,
 65 |                              TRIPLE_ITERATOR_GETOBJECT_DOC)
 66 |       .def_property_readonly("limit", &TripleIteratorBytes::getLimit,
 67 |                              TRIPLE_ITERATOR_GETLIMIT_DOC)
 68 |       .def_property_readonly("offset", &TripleIteratorBytes::getOffset,
 69 |                              TRIPLE_ITERATOR_GETOFFSET_DOC)
 70 |       .def_property_readonly("nb_reads", &TripleIteratorBytes::getNbResultsRead,
 71 |                     TRIPLE_ITERATOR_NBREADS_DOC)
 72 |       .def("__repr__", &TripleIteratorBytes::python_repr);
 73 | 
 74 |   py::class_<TripleIDIterator>(m, "TripleIDIterator", TRIPLE_ID_ITERATOR_CLASS_DOC)
 75 |       .def("next", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
 76 |       .def("__next__", &TripleIDIterator::next, TRIPLE_ITERATOR_NEXT_DOC)
 77 |       .def("peek", &TripleIDIterator::peek, TRIPLE_ITERATOR_PEEK_DOC)
 78 |       .def("has_next", &TripleIDIterator::hasNext, TRIPLE_ITERATOR_HASNEXT_DOC)
 79 |       .def("size_hint", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
 80 |       .def("__len__", &TripleIDIterator::sizeHint, TRIPLE_ITERATOR_SIZE_DOC)
 81 |       .def("__iter__", &TripleIDIterator::python_iter)
 82 |       .def_property_readonly("subject", &TripleIDIterator::getSubject,
 83 |                              TRIPLE_ITERATOR_GETSUBJECT_DOC)
 84 |       .def_property_readonly("predicate", &TripleIDIterator::getPredicate,
 85 |                              TRIPLE_ITERATOR_GETPREDICATE_DOC)
 86 |       .def_property_readonly("object", &TripleIDIterator::getObject,
 87 |                              TRIPLE_ITERATOR_GETOBJECT_DOC)
 88 |       .def_property_readonly("limit", &TripleIDIterator::getLimit,
 89 |                              TRIPLE_ITERATOR_GETLIMIT_DOC)
 90 |       .def_property_readonly("offset", &TripleIDIterator::getOffset,
 91 |                              TRIPLE_ITERATOR_GETOFFSET_DOC)
 92 |       .def_property_readonly("nb_reads", &TripleIDIterator::getNbResultsRead,
 93 |                     TRIPLE_ITERATOR_NBREADS_DOC)
 94 |       .def("__repr__", &TripleIDIterator::python_repr);
 95 | 
 96 |   py::class_<JoinIterator>(m, "JoinIterator", JOIN_ITERATOR_CLASS_DOC)
 97 |     .def("next", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC)
 98 |     .def("has_next", &JoinIterator::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC)
 99 |     .def("cardinality", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
100 |     .def("reset", &JoinIterator::reset, JOIN_ITERATOR_RESET_DOC)
101 |     .def("__len__", &JoinIterator::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
102 |     .def("__next__", &JoinIterator::next, JOIN_ITERATOR_NEXT_DOC)
103 |     .def("__iter__", &JoinIterator::python_iter)
104 |     .def("__repr__", &JoinIterator::python_repr);
105 | 
106 |   py::class_<JoinIteratorBytes>(m, "JoinIteratorBytes", JOIN_ITERATOR_CLASS_DOC)
107 |     .def("next", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC)
108 |     .def("has_next", &JoinIteratorBytes::hasNext, JOIN_ITERATOR_HAS_NEXT_DOC)
109 |     .def("cardinality", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
110 |     .def("reset", &JoinIteratorBytes::reset, JOIN_ITERATOR_RESET_DOC)
111 |     .def("__len__", &JoinIteratorBytes::estimatedCardinality, JOIN_ITERATOR_SIZE_DOC)
112 |     .def("__next__", &JoinIteratorBytes::next, JOIN_ITERATOR_NEXT_DOC)
113 |     .def("__iter__", &JoinIteratorBytes::python_iter)
114 |     .def("__repr__", &JoinIteratorBytes::python_repr);
115 | 
116 |   py::class_<HDTDocument>(m, "HDTDocument", HDT_DOCUMENT_CLASS_DOC)
117 |       .def(py::init(&HDTDocument::create), py::arg("file"),
118 |                                            py::arg("map") = true,
119 |                                            py::arg("indexed") = true)
120 |       .def_property_readonly("file_path", &HDTDocument::getFilePath,
121 |                              HDT_DOCUMENT_GETFILEPATH_DOC)
122 |       .def_property_readonly("total_triples", &HDTDocument::getNbTriples,
123 |                              HDT_DOCUMENT_GETNBTRIPLES_DOC)
124 |       .def_property_readonly("nb_subjects", &HDTDocument::getNbSubjects,
125 |                              HDT_DOCUMENT_GETNBSUBJECTS_DOC)
126 |       .def_property_readonly("nb_predicates", &HDTDocument::getNbPredicates,
127 |                              HDT_DOCUMENT_GETNBPREDICATES_DOC)
128 |       .def_property_readonly("nb_objects", &HDTDocument::getNbObjects,
129 |                              HDT_DOCUMENT_GETNBOBJECTS_DOC)
130 |       .def_property_readonly("nb_shared", &HDTDocument::getNbShared,
131 |                              HDT_DOCUMENT_GETNBSHARED_DOC)
132 |       .def("search_triples", &HDTDocument::search,
133 |            HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"),
134 |            py::arg("predicate"), py::arg("object"), py::arg("limit") = 0,
135 |            py::arg("offset") = 0)
136 |       .def("search_join", &HDTDocument::searchJoin, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns"))
137 |      .def("search_triples_ids", &HDTDocument::searchIDs,
138 |           HDT_DOCUMENT_SEARCH_TRIPLES_IDS_DOC, py::arg("subject"),
139 |           py::arg("predicate"), py::arg("object"), py::arg("limit") = 0,
140 |           py::arg("offset") = 0)
141 |       .def("convert_tripleid", &HDTDocument::convertTripleID,
142 |            HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC,
143 |            py::arg("subject"), py::arg("predicate"), py::arg("object"))
144 |       .def("convert_id", &HDTDocument::convertID, HDT_DOCUMENT_CONVERT_ID_DOC,
145 |            py::arg("id"), py::arg("position"))
146 |      .def("convert_term", &HDTDocument::convertTerm, HDT_DOCUMENT_CONVERT_TERM_DOC,
147 |           py::arg("term"), py::arg("position"))
148 |       // ========= BYTES REPRESENTATION =========
149 |       .def("search_triples_bytes", &HDTDocument::searchBytes,
150 |            HDT_DOCUMENT_SEARCH_TRIPLES_DOC, py::arg("subject"),
151 |            py::arg("predicate"), py::arg("object"), py::arg("limit") = 0,
152 |            py::arg("offset") = 0)
153 |       .def("search_join_bytes", &HDTDocument::searchJoinBytes, HDT_DOCUMENT_SEARCH_JOIN_DOC, py::arg("patterns"))
154 |       .def("convert_tripleid_bytes", &HDTDocument::convertTripleIDBytes,
155 |            HDT_DOCUMENT_TRIPLES_IDS_TO_STRING_DOC,
156 |            py::arg("subject"), py::arg("predicate"), py::arg("object"))
157 |       .def("convert_id_bytes", &HDTDocument::convertIDBytes, HDT_DOCUMENT_CONVERT_ID_DOC,
158 |            py::arg("id"), py::arg("position"))
159 |       .def("__len__", &HDTDocument::getNbTriples, HDT_DOCUMENT_GETNBTRIPLES_DOC)
160 |       .def("__repr__", &HDTDocument::python_repr);
161 | 
162 | }
163 | 


--------------------------------------------------------------------------------
/src/hdt_document.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * hdt_document.cpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #include "hdt_document.hpp"
  7 | #include "triple_iterator.hpp"
  8 | #include <HDTEnums.hpp>
  9 | #include <HDTManager.hpp>
 10 | #include <SingleTriple.hpp>
 11 | #include <fstream>
 12 | #include <pybind11/stl.h>
 13 | #include <pybind11/pybind11.h>
 14 | namespace py = pybind11;
 15 | using namespace hdt;
 16 | 
 17 | /*!
 18 |  * Skip `offset` items from an iterator, optimized for HDT iterators.
 19 |  * @param it          - Iterator which should skip items
 20 |  * @param offset      - How many items to skip
 21 |  * @param cardinality - (Estimated) number of results
 22 |  */
 23 | template <typename T>
 24 | inline void applyOffset(T *it, unsigned int offset, unsigned int cardinality) {
 25 |   if (offset > 0 && offset >= cardinality) {
 26 |     // hdt does not allow skipping beyond the estimated number of results,
 27 |     // so we may have a few results to skip manually
 28 |     unsigned int remainingSteps = offset - cardinality + 1;
 29 |     it->skip(cardinality - 1);
 30 |     while (it->hasNext() && remainingSteps > 0) {
 31 |       it->next();
 32 |       remainingSteps--;
 33 |     }
 34 |   } else if (offset > 0) {
 35 |     it->skip(offset);
 36 |   }
 37 | }
 38 | 
 39 | /*!
 40 |  * Returns true if a file is readable, false otherwise
 41 |  * @param  name - Path to the file to test
 42 |  * @return true if the file is readable, false otherwise
 43 |  */
 44 | inline bool file_exists(const std::string &name) {
 45 |   std::ifstream f(name.c_str());
 46 |   bool result = f.good();
 47 |   f.close();
 48 |   return result;
 49 | }
 50 | 
 51 | /*!
 52 |  * Constructor
 53 |  * @param file - Path to HDT file to load
 54 |  * @param map - True maps the HDT file (faster), False loads everything in memory
 55 |  * @param indexed -  True if the HDT must be loaded with indexes, False otherwise
 56 |  */
 57 | HDTDocument::HDTDocument(std::string file, bool map, bool indexed) {
 58 |   hdt_file = file;
 59 |   if (!file_exists(file)) {
 60 |     throw std::runtime_error("Cannot open HDT file '" + file + "': Not Found!");
 61 |   }
 62 | 
 63 |   if(!map && indexed) {
 64 |     hdt = HDTManager::loadIndexedHDT(file.c_str());
 65 |   } else if(!map && !indexed) {
 66 |     hdt = HDTManager::loadHDT(file.c_str());
 67 |   } else if(map && indexed){
 68 |     hdt = HDTManager::mapIndexedHDT(file.c_str());
 69 |   } else {
 70 |     hdt = HDTManager::mapHDT(file.c_str());
 71 |   }
 72 |   processor = new QueryProcessor(hdt);
 73 | }
 74 | 
 75 | /*!
 76 |  * Destructor
 77 |  */
 78 | HDTDocument::~HDTDocument() {}
 79 | 
 80 | /*!
 81 |  * Get the path to the HDT file currently loaded
 82 |  * @return The path to the HDT file currently loaded
 83 |  */
 84 | std::string HDTDocument::getFilePath() { return hdt_file; }
 85 | 
 86 | /*!
 87 |  * Implementation for Python function "__repr__"
 88 |  * @return A string representation of the object
 89 |  */
 90 | std::string HDTDocument::python_repr() {
 91 |   return "<HDTDocument " + hdt_file + " (~" + std::to_string(getNbTriples()) +
 92 |          " RDF triples)>";
 93 | }
 94 | 
 95 | /*!
 96 |  * Search all matching triples for a triple pattern, with an optional limit and offset.
 97 |  * Returns a tuple<TripleIterator*, cardinality>
 98 |  * @param subject   - Triple pattern's subject
 99 |  * @param predicate - Triple pattern's predicate
100 |  * @param object    - Triple pattern's object
101 |  * @param limit     - (Optional) Maximum number of matching triples to read
102 |  * @param offset    - (Optional) Number of matching triples to skip
103 |  * @return A tuple (TripleIterator*, cardinality)
104 |  */
105 | search_results HDTDocument::search(std::string subject,
106 |                                    std::string predicate,
107 |                                    std::string object,
108 |                                    unsigned int limit,
109 |                                    unsigned int offset) {
110 |   unsigned int idSubject = 0;
111 |   unsigned int idPredicate = 0;
112 |   unsigned int idObject = 0;
113 | 
114 |   if (!subject.empty()) {
115 |     idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT);
116 |   }
117 | 
118 |   if (!predicate.empty()) {
119 |     idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE);
120 |   }
121 | 
122 |   if (!object.empty()) {
123 |     idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT);
124 |   }
125 | 
126 |   TripleIDIterator *it;
127 |   size_t cardinality = 0;
128 | 
129 |   // if a non-variable term was not found in the dictionary, then the search yields nothing
130 |   if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) {
131 |     it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset);
132 |   } else {
133 |     // build a TripleIDIterator to fetch results
134 |     TripleID tp(idSubject, idPredicate, idObject);
135 |     IteratorTripleID *source = hdt->getTriples()->search(tp);
136 |     cardinality = source->estimatedNumResults();
137 |     applyOffset<IteratorTripleID>(source, offset, cardinality);
138 |     it = new TripleIDIterator(source, subject, predicate, object, limit, offset);
139 |   }
140 |   // wraps the TripleIDIterator in order to convert OID triples back to RDF triples
141 |   TripleIterator *resultIterator = new TripleIterator(it, hdt->getDictionary());
142 |   return std::make_tuple(resultIterator, cardinality);
143 | }
144 | 
145 | /*!
146 |  * Same as HDTDocument#search, but search for a TripleIDs instead.
147 |  * Returns a tuple<TripleIDIterator*, cardinality>
148 |  * @param subject   - Triple pattern's subject identifier
149 |  * @param predicate - Triple pattern's predicate identifier
150 |  * @param object    - Triple pattern's object identifier
151 |  * @param limit     - (Optional) Maximum number of matching triples to read
152 |  * @param offset    - (Optional) Number of matching triples to skip
153 |  * @return A tuple (TripleIDIterator*, cardinality)
154 |  */
155 | search_results_ids HDTDocument::searchIDs(unsigned int subject,
156 |                                           unsigned int predicate,
157 |                                           unsigned int object,
158 |                                           unsigned int limit,
159 |                                           unsigned int offset) {
160 |   TripleID tp(subject, predicate, object);
161 |   // get RDF terms associated with each ID for metadata
162 |   std::string strSubject = std::string("?s");
163 |   std::string strPredicate = std::string("?p");
164 |   std::string strObject = std::string("?o");
165 | 
166 |   if (subject != 0) {
167 |     strSubject = hdt->getDictionary()->idToString(subject, hdt::SUBJECT);
168 |   }
169 |   if (predicate != 0) {
170 |     strPredicate = hdt->getDictionary()->idToString(predicate, hdt::PREDICATE);
171 |   }
172 |   if (object != 0) {
173 |     strObject = hdt->getDictionary()->idToString(object, hdt::OBJECT);
174 |   }
175 | 
176 |   IteratorTripleID *it;
177 |   size_t cardinality = 0;
178 | 
179 |   // if a non-variable term was not found in the dictionnary, then the search yield nothing
180 |   if ((strSubject.empty() && subject != 0) || (strPredicate.empty() && predicate != 0) || (strObject.empty() && object != 0)) {
181 |     it = new IteratorTripleID();
182 |   } else {
183 |     // build iterator
184 |     it = hdt->getTriples()->search(tp);
185 |     cardinality = it->estimatedNumResults();
186 |     // apply offset
187 |     applyOffset<IteratorTripleID>(it, offset, cardinality);
188 |   }
189 |   TripleIDIterator *resultIterator = new TripleIDIterator(it, strSubject, strPredicate, strObject, limit, offset);
190 |   return std::make_tuple(resultIterator, cardinality);
191 | }
192 | 
193 | /*!
194 |  * Get the total number of triples in the HDT document
195 |  * @return The total number of triples in the HDT document
196 |  */
197 | unsigned int HDTDocument::getNbTriples() {
198 |   return hdt->getTriples()->getNumberOfElements();
199 | }
200 | 
201 | /*!
202 |  * Get the number of distinct subjects in the HDT document
203 |  * @return The number of distinct subjects in the HDT document
204 |  */
205 | unsigned int HDTDocument::getNbSubjects() {
206 |   return hdt->getDictionary()->getNsubjects();
207 | }
208 | 
209 | /*!
210 |  * Get the number of distinct predicates in the HDT document
211 |  * @return The number of distinct predicates in the HDT document
212 |  */
213 | unsigned int HDTDocument::getNbPredicates() {
214 |   return hdt->getDictionary()->getNpredicates();
215 | }
216 | 
217 | /*!
218 |  * Get the number of distinct objects in the HDT document
219 |  * @return The number of distinct objects in the HDT document
220 |  */
221 | unsigned int HDTDocument::getNbObjects() {
222 |   return hdt->getDictionary()->getNobjects();
223 | }
224 | 
225 | /*!
226 |  * Get the number of shared subjects-objects in the HDT document
227 |  * @return The number of shared subjects-objects in the HDT document
228 |  */
229 | unsigned int HDTDocument::getNbShared() {
230 |   return hdt->getDictionary()->getNshared();
231 | }
232 | 
233 | /*!
234 |  * Convert a TripleID to a string RDF triple
235 |  * @param  subject   - Triple's subject
236 |  * @param  predicate - Triple's predicate
237 |  * @param  object    - Triple's object
238 |  * @return The associated RDF triple
239 |  */
240 | triple HDTDocument::convertTripleID(unsigned int subject, unsigned int predicate,
241 |                                 unsigned int object) {
242 |   return std::make_tuple(
243 |       hdt->getDictionary()->idToString(subject, hdt::SUBJECT),
244 |       hdt->getDictionary()->idToString(predicate, hdt::PREDICATE),
245 |       hdt->getDictionary()->idToString(object, hdt::OBJECT));
246 | }
247 | 
248 | /**
249 |  * Convert an Object Identifier into the equivalent URI/Literal value
250 |  * @param  id  - Object Identifier
251 |  * @param  pos - Identifier position (subject, predicate or object)
252 |  * @return The URI/Literal equivalent to the Object Identifier
253 |  */
254 | string HDTDocument::convertID(unsigned int id, IdentifierPosition pos) {
255 |   switch (pos) {
256 |     case IdentifierPosition::Subject:
257 |       return hdt->getDictionary()->idToString(id, hdt::SUBJECT);
258 |     case IdentifierPosition::Predicate:
259 |       return hdt->getDictionary()->idToString(id, hdt::PREDICATE);
260 |     case IdentifierPosition::Object:
261 |       return hdt->getDictionary()->idToString(id, hdt::OBJECT);
262 |     default:
263 |       throw std::runtime_error("Invalid Object Identifier exception");
264 |   }
265 | }
266 | 
267 | /**
268 |  * Convert an RDF term into the associated an Object Identifier.
269 |  * @param  term  - RDF Term in string format
270 |  * @param  pos - Identifier position (subject, predicate or object)
271 |  * @return The Object Identifier associated with the RDF term
272 |  */
273 | unsigned int HDTDocument::convertTerm(std::string term, IdentifierPosition pos) {
274 |   switch (pos) {
275 |     case IdentifierPosition::Subject:
276 |       return hdt->getDictionary()->stringToId(term, hdt::SUBJECT);
277 |     case IdentifierPosition::Predicate:
278 |       return hdt->getDictionary()->stringToId(term, hdt::PREDICATE);
279 |     case IdentifierPosition::Object:
280 |       return hdt->getDictionary()->stringToId(term, hdt::OBJECT);
281 |     default:
282 |       throw std::runtime_error("Invalid Object Identifier exception");
283 |   }
284 | }
285 | 
286 | /**
287 |  * Evaluate a join between a set of triple patterns using a JoinIterator.
288 |  * @param  patterns - Set of triple patterns
289 |  * @return A JoinIterator* used to evaluated the join.
290 |  */
291 | JoinIterator * HDTDocument::searchJoin(std::vector<triple> patterns) {
292 |   set<string> vars {};
293 |   vector<TripleString> joinPatterns {};
294 |   std::string subj, pred, obj;
295 | 
296 |   for (auto it = patterns.begin(); it != patterns.end(); it++) {
297 |     // unpack pattern
298 |     std::tie(subj, pred, obj) = *it;
299 |     // add variables
300 |     if (subj.at(0) == '?') {
301 |       vars.insert(subj);
302 |     }
303 |     if (pred.at(0) == '?') {
304 |       vars.insert(pred);
305 |     }
306 |     if (obj.at(0) == '?') {
307 |       vars.insert(obj);
308 |     }
309 |     // build join pattern
310 |     TripleString pattern(subj, pred, obj);
311 |     joinPatterns.push_back(pattern);
312 |   }
313 | 
314 |   VarBindingString *iterator = processor->searchJoin(joinPatterns, vars);
315 |   return new JoinIterator(iterator);
316 | }
317 | 
318 | // ============= BYTES REPRSENTATION ============
319 | /*!
320 |  * Search all matching triples for a triple pattern, whith an optional limit and offset. Triple as bytes triples (b'...', b'...', b'...')
321 |  * Returns a tuple<TripleIterator*, cardinality>
322 |  * @param subject   - Triple pattern's subject
323 |  * @param predicate - Triple pattern's predicate
324 |  * @param object    - Triple pattern's object
325 |  * @param limit     - (Optional) Maximum number of matching triples to read
326 |  * @param offset    - (Optional) Number of matching triples to skip
327 |  * @return A tuple (TripleIterator*, cardinality)
328 |  */
329 | search_results_bytes HDTDocument::searchBytes(std::string subject,
330 |                                    std::string predicate,
331 |                                    std::string object,
332 |                                    unsigned int limit,
333 |                                    unsigned int offset) {
334 |   unsigned int idSubject = 0;
335 |   unsigned int idPredicate = 0;
336 |   unsigned int idObject = 0;
337 | 
338 |   if (!subject.empty()) {
339 |     idSubject = hdt->getDictionary()->stringToId(subject, hdt::SUBJECT);
340 |   }
341 | 
342 |   if (!predicate.empty()) {
343 |     idPredicate = hdt->getDictionary()->stringToId(predicate, hdt::PREDICATE);
344 |   }
345 | 
346 |   if (!object.empty()) {
347 |     idObject = hdt->getDictionary()->stringToId(object, hdt::OBJECT);
348 |   }
349 | 
350 |   TripleIDIterator *it;
351 |   size_t cardinality = 0;
352 | 
353 |   // if a non-variable term was not found in the dictionnary, then the search yield nothing
354 |   if (((!subject.empty()) && idSubject == 0) || ((!predicate.empty()) && idPredicate == 0) || ((!object.empty()) && idObject == 0)) {
355 |     it = new TripleIDIterator(new IteratorTripleID(), subject, predicate, object, limit, offset);
356 |   } else {
357 |     // build a TripleIDIterator to fetch results
358 |     TripleID tp(idSubject, idPredicate, idObject);
359 |     IteratorTripleID *source = hdt->getTriples()->search(tp);
360 |     cardinality = source->estimatedNumResults();
361 |     applyOffset<IteratorTripleID>(source, offset, cardinality);
362 |     it = new TripleIDIterator(source, subject, predicate, object, limit, offset);
363 |   }
364 |   // wraps the TripleIDIterator in order to convert OID triples back to RDF triples
365 |   TripleIteratorBytes *resultIterator = new TripleIteratorBytes(it, hdt->getDictionary());
366 |   return std::make_tuple(resultIterator, cardinality);
367 | }
368 | 
369 | /**
370 |  * Evaluate a join between a set of triple patterns using a JoinIterator.
371 |  * @param  patterns - Set of triple patterns
372 |  * @return A JoinIterator* used to evaluated the join.
373 |  */
374 | JoinIteratorBytes * HDTDocument::searchJoinBytes(std::vector<triple> patterns) {
375 |   set<string> vars {};
376 |   vector<TripleString> joinPatterns {};
377 |   std::string subj, pred, obj;
378 | 
379 |   for (auto it = patterns.begin(); it != patterns.end(); it++) {
380 |     // unpack pattern
381 |     std::tie(subj, pred, obj) = *it;
382 |     // add variables
383 |     if (subj.at(0) == '?') {
384 |       vars.insert(subj);
385 |     }
386 |     if (pred.at(0) == '?') {
387 |       vars.insert(pred);
388 |     }
389 |     if (obj.at(0) == '?') {
390 |       vars.insert(obj);
391 |     }
392 |     // build join pattern
393 |     TripleString pattern(subj, pred, obj);
394 |     joinPatterns.push_back(pattern);
395 |   }
396 | 
397 |   VarBindingString *iterator = processor->searchJoin(joinPatterns, vars);
398 |   return new JoinIteratorBytes(iterator);
399 | }
400 | 
401 | /**
402 |  * Convert an Object Identifier into the equivalent URI/Literal value
403 |  * @param  id  - Object Identifier
404 |  * @param  pos - Identifier position (subject, predicate or object)
405 |  * @return The URI/Literal equivalent to the Object Identifier
406 |  */
407 | py::bytes HDTDocument::convertIDBytes(unsigned int id, IdentifierPosition pos) {
408 |   return  py::bytes(HDTDocument::convertID(id, pos));
409 | }
410 | 
411 | /*!
412 |  * Convert a TripleID to a string RDF triple
413 |  * @param  subject   - Triple's subject
414 |  * @param  predicate - Triple's predicate
415 |  * @param  object    - Triple's object
416 |  * @return The associated RDF triple
417 |  */
418 | triple_bytes HDTDocument::convertTripleIDBytes(unsigned int subject, unsigned int predicate,
419 |                                 unsigned int object) {
420 |   return std::make_tuple(
421 |       py::bytes(hdt->getDictionary()->idToString(subject, hdt::SUBJECT)),
422 |       py::bytes(hdt->getDictionary()->idToString(predicate, hdt::PREDICATE)),
423 |       py::bytes(hdt->getDictionary()->idToString(object, hdt::OBJECT)));
424 | }
425 | 


--------------------------------------------------------------------------------
/src/join_iterator.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * join_iterator.cpp
 3 |  * Author: Thomas MINIER - MIT License 2017-2019
 4 |  */
 5 | 
 6 | #include "join_iterator.hpp"
 7 | #include <pybind11/pybind11.h>
 8 | #include <pybind11/stl.h>
 9 | 
10 | /*!
11 |  * Constructor
12 |  * @param _it [description]
13 |  */
14 | JoinIterator::JoinIterator(hdt::VarBindingString *_it) : iterator(_it) {}
15 | 
16 | /*!
17 |  * Destructor
18 |  */
19 | JoinIterator::~JoinIterator() {
20 |   delete iterator;
21 | }
22 | 
23 | /*!
24 |  * Implementation for Python function "__repr__"
25 |  * @return [description]
26 |  */
27 | std::string JoinIterator::python_repr() {
28 |   return "JoinIterator";
29 | }
30 | 
31 | 
32 | /*!
33 |  * Implementation for Python function "__iter__"
34 |  * @return [description]
35 |  */
36 | JoinIterator *JoinIterator::python_iter() { return this; }
37 | 
38 | /**
39 |  * Get the estimated join cardinality
40 |  * @return [description]
41 |  */
42 | size_t JoinIterator::estimatedCardinality() {
43 |   return iterator->estimatedNumResults();
44 | }
45 | 
46 | /**
47 |  * Reset the iterator into its initial state and restart join processing.
48 |  */
49 | void JoinIterator::reset() {
50 |   iterator->goToStart();
51 | }
52 | 
53 | /*!
54 |  * Return true if the iterator still has items available, False otherwise.
55 |  * @return [description]
56 |  */
57 | bool JoinIterator::hasNext() {
58 |   return hasNextSolution;
59 | }
60 | 
61 | /**
62 |  * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
63 |  * has ended. Used to implement Python Itertor protocol.
64 |  * @return [description]
65 |  */
66 | solution_bindings JoinIterator::next() {
67 |   hasNextSolution = iterator->findNext();
68 |   // stop iteration if the iterator has ended
69 |   if (!hasNextSolution) {
70 |     throw pybind11::stop_iteration();
71 |   }
72 |   solution_bindings solutions = new std::set<single_binding>();
73 |   // build solution bindings
74 |   for(unsigned int i = 0; i < iterator->getNumVars(); i++) {
75 |     solutions->insert(std::make_tuple(iterator->getVarName(i), iterator->getVar(i)));
76 |   }
77 |   return solutions;
78 | }
79 | 


--------------------------------------------------------------------------------
/src/join_iterator_bytes.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * join_iterator_bytes.cpp
 3 |  * Author: Thomas MINIER - MIT License 2017-2019
 4 |  */
 5 | 
 6 | #include "join_iterator_bytes.hpp"
 7 | #include <pybind11/pybind11.h>
 8 | #include <pybind11/stl.h>
 9 | 
10 | /*!
11 |  * Constructor
12 |  * @param _it [description]
13 |  */
14 | JoinIteratorBytes::JoinIteratorBytes(hdt::VarBindingString *_it) : iterator(_it) {}
15 | 
16 | /*!
17 |  * Destructor
18 |  */
19 | JoinIteratorBytes::~JoinIteratorBytes() {
20 |   delete iterator;
21 | }
22 | 
23 | /*!
24 |  * Implementation for Python function "__repr__"
25 |  * @return [description]
26 |  */
27 | std::string JoinIteratorBytes::python_repr() {
28 |   return "JoinIteratorBytes";
29 | }
30 | 
31 | 
32 | /*!
33 |  * Implementation for Python function "__iter__"
34 |  * @return [description]
35 |  */
36 | JoinIteratorBytes *JoinIteratorBytes::python_iter() { return this; }
37 | 
38 | /**
39 |  * Get the estimated join cardinality
40 |  * @return [description]
41 |  */
42 | size_t JoinIteratorBytes::estimatedCardinality() {
43 |   return iterator->estimatedNumResults();
44 | }
45 | 
46 | /**
47 |  * Reset the iterator into its initial state and restart join processing.
48 |  */
49 | void JoinIteratorBytes::reset() {
50 |   iterator->goToStart();
51 | }
52 | 
53 | /*!
54 |  * Return true if the iterator still has items available, False otherwise.
55 |  * @return [description]
56 |  */
57 | bool JoinIteratorBytes::hasNext() {
58 |   return hasNextSolution;
59 | }
60 | 
61 | /**
62 |  * Return the next set of solutions bindings, or raise py::StopIteration if the iterator
63 |  * has ended. Used to implement Python Itertor protocol.
64 |  * @return [description]
65 |  */
66 | py::set JoinIteratorBytes::next() {
67 |   hasNextSolution = iterator->findNext();
68 |   // stop iteration if the iterator has ended
69 |   if (!hasNextSolution) {
70 |     throw pybind11::stop_iteration();
71 |   }
72 |   solution_bindings_bytes solutions_bytes;
73 |   // build solution bindings
74 |   for(unsigned int i = 0; i < iterator->getNumVars(); i++) {
75 |     std::string varname = iterator->getVarName(i);
76 |     std::string value = iterator->getVar(i);
77 |     solutions_bytes.add(std::make_tuple(py::bytes(varname), py::bytes(value)));
78 |   }
79 |   return solutions_bytes;
80 | }
81 | 


--------------------------------------------------------------------------------
/src/triple_iterator.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * triple_iterator.cpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #include "triple_iterator.hpp"
  7 | #include <HDTEnums.hpp>
  8 | #include <SingleTriple.hpp>
  9 | #include <pybind11/pybind11.h>
 10 | 
 11 | /*!
 12 |  * Constructor
 13 |  * @param iterator [description]
 14 |  */
 15 | TripleIterator::TripleIterator(TripleIDIterator *_it, hdt::Dictionary *_dict)
 16 |     : iterator(_it), dictionary(_dict) {};
 17 | 
 18 | /*!
 19 |  * Destructor
 20 |  */
 21 | TripleIterator::~TripleIterator() { delete iterator; };
 22 | 
 23 | /*!
 24 |  * Implementation for Python function "__repr__"
 25 |  * @return [description]
 26 |  */
 27 | std::string TripleIterator::python_repr() {
 28 |   if (getLimit() != 0 && getOffset() > 0) {
 29 |     return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() +
 30 |            "} LIMIT " + std::to_string(getLimit()) + " OFFSET " +
 31 |            std::to_string(getOffset()) + " >";
 32 |   } else if (getLimit() != 0) {
 33 |     return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() +
 34 |            "} LIMIT " + std::to_string(getLimit()) + " >";
 35 |   } else if (getOffset() > 0) {
 36 |     return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() +
 37 |            "} OFFSET " + std::to_string(getOffset()) + ">";
 38 |   }
 39 |   return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() + "}>";
 40 | }
 41 | 
 42 | /*!
 43 |  * Get the subject of the triple pattern currently evaluated.
 44 |  * An empty string represents a variable
 45 |  * @return [description]
 46 |  */
 47 | std::string TripleIterator::getSubject() { return iterator->getSubject(); }
 48 | 
 49 | /*!
 50 |  * Get the predicate of the triple pattern currently evaluated.
 51 |  * An empty string represents a variable
 52 |  * @return [description]
 53 |  */
 54 | std::string TripleIterator::getPredicate() { return iterator->getPredicate(); }
 55 | 
 56 | /*!
 57 |  * Get the object of the triple pattern currently evaluated.
 58 |  * An empty string represents a variable
 59 |  * @return [description]
 60 |  */
 61 | std::string TripleIterator::getObject() { return iterator->getObject(); }
 62 | 
 63 | /*!
 64 |  * Get the limit of the current iterator
 65 |  * @return [description]
 66 |  */
 67 | unsigned int TripleIterator::getLimit() { return iterator->getLimit(); }
 68 | 
 69 | /*!
 70 |  * Get the offset of the current iterator
 71 |  * @return [description]
 72 |  */
 73 | unsigned int TripleIterator::getOffset() { return iterator->getOffset(); }
 74 | 
 75 | /*!
 76 |  * Get the number of results read by the iterator
 77 |  * @return [description]
 78 |  */
 79 | unsigned int TripleIterator::getNbResultsRead() { return iterator->getNbResultsRead(); }
 80 | 
 81 | /*!
 82 |  * Implementation for Python function "__iter__"
 83 |  * @return [description]
 84 |  */
 85 | TripleIterator *TripleIterator::python_iter() { return this; }
 86 | 
 87 | /*!
 88 |  * Get a hint over the cardinality of the triple pattern evaluated.
 89 |  * Offset & limit are not taken into account.
 90 |  * @return [description]
 91 |  */
 92 | size_hint TripleIterator::sizeHint() {
 93 |   return iterator->sizeHint();
 94 | }
 95 | 
 96 | /*!
 97 |  * Return true if the iterator still has items available, False otherwise.
 98 |  * @return [description]
 99 |  */
100 | bool TripleIterator::hasNext() {
101 |   return iterator->hasNext();
102 | }
103 | 
104 | /**
105 |  * Get the next item in the iterator, or raise py::StopIteration if the iterator
106 |  * has ended. Used to implement Python Itertor protocol.
107 |  * @return [description]
108 |  */
109 | triple TripleIterator::next() {
110 |   triple_id t = iterator->next();
111 |   return std::make_tuple(
112 |     dictionary->idToString(std::get<0>(t), hdt::SUBJECT),
113 |     dictionary->idToString(std::get<1>(t), hdt::PREDICATE),
114 |     dictionary->idToString(std::get<2>(t), hdt::OBJECT));
115 | }
116 | 
117 | /**
118 |  * Get the next item in the iterator, or raise py::StopIteration if the iterator
119 |  * has ended, but without advancing the iterator.
120 |  * @return [description]
121 |  */
122 | triple TripleIterator::peek() {
123 |   triple_id t = iterator->peek();
124 |   return std::make_tuple(
125 |     dictionary->idToString(std::get<0>(t), hdt::SUBJECT),
126 |     dictionary->idToString(std::get<1>(t), hdt::PREDICATE),
127 |     dictionary->idToString(std::get<2>(t), hdt::OBJECT));
128 | }
129 | 


--------------------------------------------------------------------------------
/src/triple_iterator_bytes.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * triple_iterator_bytes.cpp
  3 |  * Author: Arnaud GRALL - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #include "triple_iterator_bytes.hpp"
  7 | #include <HDTEnums.hpp>
  8 | #include <SingleTriple.hpp>
  9 | #include <pybind11/pybind11.h>
 10 | namespace py = pybind11;
 11 | 
 12 | /*!
 13 |  * Constructor
 14 |  * @param iterator [description]
 15 |  */
 16 | TripleIteratorBytes::TripleIteratorBytes(TripleIDIterator *_it, hdt::Dictionary *_dict)
 17 |     : iterator(_it), dictionary(_dict) {};
 18 | 
 19 | /*!
 20 |  * Destructor
 21 |  */
 22 | TripleIteratorBytes::~TripleIteratorBytes() { delete iterator; };
 23 | 
 24 | /*!
 25 |  * Implementation for Python function "__repr__"
 26 |  * @return [description]
 27 |  */
 28 | std::string TripleIteratorBytes::python_repr() {
 29 |   if (getLimit() != 0 && getOffset() > 0) {
 30 |     return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() +
 31 |            "} LIMIT " + std::to_string(getLimit()) + " OFFSET " +
 32 |            std::to_string(getOffset()) + " >";
 33 |   } else if (getLimit() != 0) {
 34 |     return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() +
 35 |            "} LIMIT " + std::to_string(getLimit()) + " >";
 36 |   } else if (getOffset() > 0) {
 37 |     return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() +
 38 |            "} OFFSET " + std::to_string(getOffset()) + ">";
 39 |   }
 40 |   return "<Iterator {" + getSubject() + " " + getPredicate() + " " + getObject() + "}>";
 41 | }
 42 | 
 43 | /*!
 44 |  * Get the subject of the triple pattern currently evaluated.
 45 |  * An empty string represents a variable
 46 |  * @return [description]
 47 |  */
 48 | std::string TripleIteratorBytes::getSubject() { return iterator->getSubject(); }
 49 | 
 50 | /*!
 51 |  * Get the predicate of the triple pattern currently evaluated.
 52 |  * An empty string represents a variable
 53 |  * @return [description]
 54 |  */
 55 | std::string TripleIteratorBytes::getPredicate() { return iterator->getPredicate(); }
 56 | 
 57 | /*!
 58 |  * Get the object of the triple pattern currently evaluated.
 59 |  * An empty string represents a variable
 60 |  * @return [description]
 61 |  */
 62 | std::string TripleIteratorBytes::getObject() { return iterator->getObject(); }
 63 | 
 64 | /*!
 65 |  * Get the limit of the current iterator
 66 |  * @return [description]
 67 |  */
 68 | unsigned int TripleIteratorBytes::getLimit() { return iterator->getLimit(); }
 69 | 
 70 | /*!
 71 |  * Get the offset of the current iterator
 72 |  * @return [description]
 73 |  */
 74 | unsigned int TripleIteratorBytes::getOffset() { return iterator->getOffset(); }
 75 | 
 76 | /*!
 77 |  * Get the number of results read by the iterator
 78 |  * @return [description]
 79 |  */
 80 | unsigned int TripleIteratorBytes::getNbResultsRead() { return iterator->getNbResultsRead(); }
 81 | 
 82 | /*!
 83 |  * Implementation for Python function "__iter__"
 84 |  * @return [description]
 85 |  */
 86 | TripleIteratorBytes *TripleIteratorBytes::python_iter() { return this; }
 87 | 
 88 | /*!
 89 |  * Get a hint over the cardinality of the triple pattern evaluated.
 90 |  * Offset & limit are not taken into account.
 91 |  * @return [description]
 92 |  */
 93 | size_hint TripleIteratorBytes::sizeHint() {
 94 |   return iterator->sizeHint();
 95 | }
 96 | 
 97 | /*!
 98 |  * Return true if the iterator still has items available, False otherwise.
 99 |  * @return [description]
100 |  */
101 | bool TripleIteratorBytes::hasNext() {
102 |   return iterator->hasNext();
103 | }
104 | 
105 | /**
106 |  * Get the next item in the iterator, or raise py::StopIteration if the iterator
107 |  * has ended. Used to implement Python Itertor protocol.
108 |  * @return [description]
109 |  */
110 | triple_bytes TripleIteratorBytes::next() {
111 |   triple_id t = iterator->next();
112 |   return std::make_tuple(
113 |     py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)),
114 |     py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)),
115 |     py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT)));
116 | }
117 | 
118 | /**
119 |  * Get the next item in the iterator, or raise py::StopIteration if the iterator
120 |  * has ended, but without advancing the iterator.
121 |  * @return [description]
122 |  */
123 | triple_bytes TripleIteratorBytes::peek() {
124 |   triple_id t = iterator->peek();
125 |   return std::make_tuple(
126 |     py::bytes(dictionary->idToString(std::get<0>(t), hdt::SUBJECT)),
127 |     py::bytes(dictionary->idToString(std::get<1>(t), hdt::PREDICATE)),
128 |     py::bytes(dictionary->idToString(std::get<2>(t), hdt::OBJECT)));
129 | }
130 | 


--------------------------------------------------------------------------------
/src/tripleid_iterator.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * tripleid_iterator.cpp
  3 |  * Author: Thomas MINIER - MIT License 2017-2019
  4 |  */
  5 | 
  6 | #include "tripleid_iterator.hpp"
  7 | #include <HDTEnums.hpp>
  8 | #include <SingleTriple.hpp>
  9 | #include <pybind11/pybind11.h>
 10 | 
 11 | /*!
 12 |  * Constructor
 13 |  * @param iterator [description]
 14 |  */
 15 | TripleIDIterator::TripleIDIterator(hdt::IteratorTripleID *_it,
 16 |                                    std::string _subj, std::string _pred,
 17 |                                    std::string _obj, unsigned int _limit,
 18 |                                    unsigned int _offset)
 19 |     : subject((_subj.compare("") == 0) ? "?s" : _subj),
 20 |       predicate((_pred.compare("") == 0) ? "?p" : _pred),
 21 |       object((_obj.compare("") == 0) ? "?o" : _obj), limit(_limit),
 22 |       offset(_offset), iterator(_it){};
 23 | 
 24 | /*!
 25 |  * Destructor
 26 |  */
 27 | TripleIDIterator::~TripleIDIterator() { delete iterator; };
 28 | 
 29 | /*!
 30 |  * Implementation for Python function "__repr__"
 31 |  * @return [description]
 32 |  */
 33 | std::string TripleIDIterator::python_repr() {
 34 |   if (limit != 0 && offset > 0) {
 35 |     return "<Iterator {" + subject + " " + predicate + " " + object +
 36 |            "} LIMIT " + std::to_string(limit) + " OFFSET " +
 37 |            std::to_string(offset) + " >";
 38 |   } else if (limit != 0) {
 39 |     return "<Iterator {" + subject + " " + predicate + " " + object +
 40 |            "} LIMIT " + std::to_string(limit) + " >";
 41 |   } else if (offset > 0) {
 42 |     return "<Iterator {" + subject + " " + predicate + " " + object +
 43 |            "} OFFSET " + std::to_string(offset) + ">";
 44 |   }
 45 |   return "<Iterator {" + subject + " " + predicate + " " + object + "}>";
 46 | }
 47 | 
 48 | /*!
 49 |  * Get the subject of the triple pattern currently evaluated.
 50 |  * An empty string represents a variable
 51 |  * @return [description]
 52 |  */
 53 | std::string TripleIDIterator::getSubject() { return subject; }
 54 | 
 55 | /*!
 56 |  * Get the predicate of the triple pattern currently evaluated.
 57 |  * An empty string represents a variable
 58 |  * @return [description]
 59 |  */
 60 | std::string TripleIDIterator::getPredicate() { return predicate; }
 61 | 
 62 | /*!
 63 |  * Get the object of the triple pattern currently evaluated.
 64 |  * An empty string represents a variable
 65 |  * @return [description]
 66 |  */
 67 | std::string TripleIDIterator::getObject() { return object; }
 68 | 
 69 | /*!
 70 |  * Get the limit of the current iterator
 71 |  * @return [description]
 72 |  */
 73 | unsigned int TripleIDIterator::getLimit() { return limit; }
 74 | 
 75 | /*!
 76 |  * Get the offset of the current iterator
 77 |  * @return [description]
 78 |  */
 79 | unsigned int TripleIDIterator::getOffset() { return offset; }
 80 | 
 81 | /*!
 82 |  * Get the number of results read by the iterator
 83 |  * @return [description]
 84 |  */
 85 | unsigned int TripleIDIterator::getNbResultsRead() { return resultsRead; }
 86 | 
 87 | /*!
 88 |  * Implementation for Python function "__iter__"
 89 |  * @return [description]
 90 |  */
 91 | TripleIDIterator *TripleIDIterator::python_iter() { return this; }
 92 | 
 93 | /*!
 94 |  * Get a hint over the cardinality of the triple pattern evaluated.
 95 |  * Offset & limit are not taken into account.
 96 |  * @return [description]
 97 |  */
 98 | size_hint TripleIDIterator::sizeHint() {
 99 |   return std::make_tuple(iterator->estimatedNumResults(), iterator->numResultEstimation() == hdt::EXACT);
100 | }
101 | 
102 | /*!
103 |  * Return true if the iterator still has items available, False otherwise.
104 |  * @return [description]
105 |  */
106 | bool TripleIDIterator::hasNext() {
107 |   bool noLimit = limit == 0;
108 |   return iterator->hasNext() && (noLimit || limit > resultsRead);
109 | }
110 | 
111 | /**
112 |  * Next item for the Python Iterator protocol; a triple buffered by peek() is served first.
113 |  * @return The next triple, built from the TripleID's subject, predicate and object
114 |  * @throws pybind11::stop_iteration when the source is exhausted or the limit is reached
115 |  */
116 | triple_id TripleIDIterator::next() {
117 |   // a triple cached by peek() is handed out before the source is touched
118 |   if (hasBufferedTriple) {
119 |     hasBufferedTriple = false;
120 |     ++resultsRead;
121 |     return _bufferedTriple;
122 |   }
123 |   const bool limitReached = limit != 0 && limit <= resultsRead;
124 |   if (!iterator->hasNext() || limitReached) {
125 |     throw pybind11::stop_iteration();
126 |   }
127 |   ++resultsRead;
128 |   hdt::TripleID *current = iterator->next();
129 |   return std::make_tuple(current->getSubject(), current->getPredicate(),
130 |                          current->getObject());
131 | }
132 | 
133 | /**
134 |  * Look at the next item without consuming it: the triple is fetched through next(),
135 |  * buffered, and the read counter is restored. Whatever next() throws propagates.
136 |  * @return The triple that the following call to next() will return
137 |  */
138 | triple_id TripleIDIterator::peek() {
139 |   if (!hasBufferedTriple) {
140 |     // pull one triple through next(), then undo its read-count increment
141 |     _bufferedTriple = next();
142 |     hasBufferedTriple = true;
143 |     --resultsRead;
144 |   }
145 |   return _bufferedTriple;
146 | }
147 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Callidon/pyHDT/56370143e707c1b69bdb054bd811660e6611cae1/tests/__init__.py


--------------------------------------------------------------------------------
/tests/hdt_document_test.py:
--------------------------------------------------------------------------------
 1 | # hdt_document_test.py
 2 | # Author: Thomas MINIER - MIT License 2017-2019
 3 | import pytest
 4 | from hdt import HDTDocument, IdentifierPosition
 5 | 
 6 | path = "tests/test.hdt"  # fixture path; presumably resolved from the repository root
 7 | document = HDTDocument(path, True, False)  # shared by every test in this module
 8 | nbTotalTriples = 132  # triple count the tests expect for the fixture
 9 | 
10 | 
11 | def test_missing_file():
12 |     with pytest.raises(RuntimeError):  # opening a nonexistent file is expected to fail
13 |         HDTDocument("/home/dtrump/wall.hdt")
14 | 
15 | 
16 | def test_file_path():
17 |     assert document.file_path == path  # the path given at construction is reported unchanged
18 | 
19 | 
20 | def test_total_triples():
21 |     assert document.total_triples == nbTotalTriples
22 |     assert len(document) == nbTotalTriples  # len() must agree with total_triples
23 | 
24 | 
25 | def test_nb_subjects():
26 |     assert document.nb_subjects == 4  # subject count expected for the fixture
27 | 
28 | 
29 | def tests_nb_predicates():
30 |     assert document.nb_predicates == 3  # predicate count expected for the fixture
31 | 
32 | 
33 | def tests_nb_objects():
34 |     assert document.nb_objects == 112  # object count expected for the fixture
35 | 
36 | 
37 | def tests_nb_shared():
38 |     assert document.nb_shared == 0  # shared-term count expected for the fixture
39 | 
40 | 
41 | def test_ids_to_string():
42 |     (triples, triplesCard) = document.search_triples("", "", "")
43 |     (ids, idsCard) = document.search_triples_ids(0, 0, 0)
44 |     assert triplesCard == idsCard
45 |     assert triplesCard == nbTotalTriples
46 |     for subj, pred, obj in triples:
47 |         sid, pid, oid = next(ids)
48 |         s, p, o = document.convert_tripleid(sid, pid, oid)
49 |         assert subj == s
50 |         assert pred == p
51 |         assert obj == o
52 | 
53 | def test_ids_to_string_bytes():
54 |     (triples, triplesCard) = document.search_triples_bytes("", "", "")
55 |     (ids, idsCard) = document.search_triples_ids(0, 0, 0)
56 |     assert triplesCard == idsCard
57 |     assert triplesCard == nbTotalTriples
58 |     for subj, pred, obj in triples:
59 |         print(subj, pred, obj)
60 |         sid, pid, oid = next(ids)
61 |         s, p, o = document.convert_tripleid_bytes(sid, pid, oid)
62 |         assert subj.decode('utf-8') == s.decode('utf-8')
63 |         assert pred.decode('utf-8') == p.decode('utf-8')
64 |         assert obj.decode('utf-8') == o.decode('utf-8')
65 | 
66 | 
67 | def test_convert_id():
68 |     (triples, triplesCard) = document.search_triples("", "", "")
69 |     (ids, idsCard) = document.search_triples_ids(0, 0, 0)
70 |     assert triplesCard == idsCard
71 |     assert triplesCard == nbTotalTriples
72 |     for subj, pred, obj in triples:
73 |         sid, pid, oid = next(ids)
74 |         s, p, o = (
75 |             document.convert_id(sid, IdentifierPosition.Subject),
76 |             document.convert_id(pid, IdentifierPosition.Predicate),
77 |             document.convert_id(oid, IdentifierPosition.Object)
78 |             )
79 |         assert subj == s
80 |         assert pred == p
81 |         assert obj == o
82 | 
83 | def test_convert_id_bytes():
84 |     (triples, triplesCard) = document.search_triples_bytes("", "", "")
85 |     (ids, idsCard) = document.search_triples_ids(0, 0, 0)
86 |     assert triplesCard == idsCard
87 |     assert triplesCard == nbTotalTriples
88 |     for subj, pred, obj in triples:
89 |         sid, pid, oid = next(ids)
90 |         s, p, o = (
91 |             document.convert_id_bytes(sid, IdentifierPosition.Subject),
92 |             document.convert_id_bytes(pid, IdentifierPosition.Predicate),
93 |             document.convert_id_bytes(oid, IdentifierPosition.Object)
94 |             )
95 |         assert subj == s
96 |         assert pred == p
97 |         assert obj == o
98 | 


--------------------------------------------------------------------------------
/tests/hdt_iterators_test.py:
--------------------------------------------------------------------------------
  1 | # hdt_iterators_test.py
  2 | # Author: Thomas MINIER - MIT License 2017-2019
  3 | import pytest
  4 | from hdt import HDTDocument
  5 | 
  6 | path = "tests/test.hdt"  # fixture path; presumably resolved from the repository root
  7 | document = HDTDocument(path)  # shared by every test in this module
  8 | nbTotalTriples = 132  # triple count the tests expect for the fixture
  9 | 
 10 | 
 11 | def test_read_document_base():
 12 |     (triples, cardinality) = document.search_triples("", "", "")
 13 |     assert triples.subject == "?s"
 14 |     assert triples.predicate == "?p"
 15 |     assert triples.object == "?o"
 16 |     assert cardinality == nbTotalTriples
 17 |     for subj, pred, obj in triples:
 18 |         assert subj is not None
 19 |         assert pred is not None
 20 |         assert obj is not None
 21 |     assert triples.nb_reads == cardinality
 22 | 
 23 | def test_read_document_base_bytes():
 24 |     (triples, cardinality) = document.search_triples_bytes("", "", "")
 25 |     assert triples.subject == "?s"
 26 |     assert triples.predicate == "?p"
 27 |     assert triples.object == "?o"
 28 |     assert cardinality == nbTotalTriples
 29 |     for subj, pred, obj in triples:
 30 |         assert isinstance(subj, bytes)
 31 |         assert isinstance(pred, bytes)
 32 |         assert isinstance(obj, bytes)
 33 |         try:
 34 |             s, p, o = subj.decode('utf-8'), pred.decode('utf-8'), obj.decode('utf-8')
 35 |         except Exception as err:
 36 |             # with the test.hdt file we shouldnt have any problem
 37 |             raise err
 38 |         assert subj is not None
 39 |         assert pred is not None
 40 |         assert obj is not None
 41 |     assert triples.nb_reads == cardinality
 42 | 
 43 | 
 44 | empty_triples = [
 45 |     ("http://example.org#toto", "", ""),
 46 |     ("", "http://example.org#toto", ""),
 47 |     ("", "http://example.org#toto", "")
 48 | ]
 49 | 
 50 | empty_triples_ids = [  # ID patterns the tests expect to match nothing (0 = unbound position)
 51 |     (155, 0, 0),  # subject ID presumably absent from the fixture
 52 |     (0, 155, 0),  # predicate ID presumably absent from the fixture
 53 |     (0, 0, 155)  # object ID presumably absent from the fixture
 54 | ]
 55 | 
 56 | 
 57 | @pytest.mark.parametrize("triple", empty_triples)
 58 | def test_search_triples_empty(triple):
 59 |     s, p, o = triple
 60 |     (iterator, cardinality) = document.search_triples(s, p, o)
 61 |     assert cardinality == 0
 62 |     assert not iterator.has_next()
 63 | 
 64 | 
 65 | @pytest.mark.parametrize("triple", empty_triples_ids)
 66 | def test_search_ids_empty(triple):
 67 |     s, p, o = triple
 68 |     (iterator, cardinality) = document.search_triples_ids(s, p, o)
 69 |     assert cardinality == 0
 70 |     assert not iterator.has_next()
 71 | 
 72 | 
 73 | def test_read_document_limit():
 74 |     nbItems = 0
 75 |     (triples, cardinality) = document.search_triples("", "", "", limit=10)
 76 |     assert triples.limit == 10
 77 |     assert cardinality == nbTotalTriples
 78 |     for subj, pred, obj in triples:
 79 |         nbItems += 1
 80 |         assert subj is not None
 81 |         assert pred is not None
 82 |         assert obj is not None
 83 |     assert nbItems == 10
 84 |     assert triples.nb_reads == 10
 85 | 
 86 | def test_read_document_bytes_peek():
 87 |     nbItems = 0
 88 |     (triples, cardinality) = document.search_triples_bytes("", "", "", limit=10)
 89 |     assert triples.limit == 10
 90 |     assert cardinality == nbTotalTriples
 91 |     peek = triples.peek()
 92 |     for subj, pred, obj in triples:
 93 |         nbItems += 1
 94 |         assert isinstance(subj, bytes)
 95 |         assert isinstance(pred, bytes)
 96 |         assert isinstance(obj, bytes)
 97 |         assert subj == peek[0]
 98 |         assert pred == peek[1]
 99 |         assert obj == peek[2]
100 |         assert subj is not None
101 |         assert pred is not None
102 |         assert obj is not None
103 |         try:
104 |             peek = triples.peek()
105 |         except:
106 |             pass
107 |     assert nbItems == 10
108 |     assert triples.nb_reads == 10
109 | 
110 | 
111 | def test_read_document_offset():
112 |     nbItems = 0
113 |     (triples, cardinality) = document.search_triples("", "", "", offset=10)
114 |     assert triples.offset == 10
115 |     assert cardinality == nbTotalTriples
116 |     for subj, pred, obj in triples:
117 |         nbItems += 1
118 |         assert subj is not None
119 |         assert pred is not None
120 |         assert obj is not None
121 |     assert nbItems == cardinality - 10
122 |     assert triples.nb_reads == cardinality - 10
123 | 
124 | 
125 | def test_read_document_ids():
126 |     (triples, cardinality) = document.search_triples_ids(0, 0, 0)
127 |     assert triples.subject == "?s"
128 |     assert triples.predicate == "?p"
129 |     assert triples.object, "?o"
130 |     assert cardinality, nbTotalTriples
131 |     for subj, pred, obj in triples:
132 |         assert subj is not None
133 |         assert pred is not None
134 |         assert obj is not None
135 |     assert triples.nb_reads == cardinality
136 | 
137 | 
138 | def test_string_iterator_peek():
139 |     expected = ('http://example.org/s1', 'http://example.org/p1', 'http://example.org/o001')
140 |     (triples, cardinality) = document.search_triples("", "", "")
141 |     v = triples.peek()
142 |     assert v == expected
143 |     assert triples.nb_reads == 0
144 |     v = next(triples)
145 |     assert v == expected
146 |     assert triples.nb_reads == 1
147 | 
148 | 
149 | def test_ids_iterator_peek():
150 |     expected = (1, 1, 13)
151 |     (triples, cardinality) = document.search_triples_ids(0, 0, 0)
152 |     v = triples.peek()
153 |     assert v == expected
154 |     assert triples.nb_reads == 0
155 |     v = next(triples)
156 |     assert v == expected
157 |     assert triples.nb_reads == 1
158 | 
159 | 
160 | def test_string_iterator_big_offset():
161 |     nbItems = 0
162 |     (triples, cardinality) = document.search_triples("", "", "", offset=nbTotalTriples + 1)
163 |     for s, p, o in triples:
164 |         nbItems += 1
165 |     assert nbItems == 0
166 | 
167 | 
168 | def test_ids_iterator_big_offset():
169 |     nbItems = 0
170 |     (triples, cardinality) = document.search_triples_ids(0, 0, 0, offset=nbTotalTriples + 1)
171 |     for s, p, o in triples:
172 |         nbItems += 1
173 |     assert nbItems == 0
174 | 


--------------------------------------------------------------------------------
/tests/join_iterator_test.py:
--------------------------------------------------------------------------------
 1 | # join_iterator_test.py
 2 | # Author: Thomas MINIER - MIT License 2017-2019
 3 | from hdt import HDTDocument
 4 | 
 5 | path = "tests/test.hdt"  # fixture path; presumably resolved from the repository root
 6 | document = HDTDocument(path)  # shared by every test in this module
 7 | 
 8 | 
 9 | def test_basic_join():
10 |     join_iter = document.search_join([
11 |         ("?s", "http://example.org/p1", "http://example.org/o001"),
12 |         ("?s", "http://example.org/p1", "http://example.org/o001")
13 |     ])
14 |     cpt = 0
15 |     for b in join_iter:
16 |         cpt += 1
17 |         assert len(b) == 1
18 |         assert ('?s', 'http://example.org/s1') in b or ('?s', 'http://example.org/s2') in b
19 |     assert cpt == 2
20 | 
21 | def test_basic_join_bytes():
22 |     join_iter = document.search_join_bytes([
23 |         ("?s", "http://example.org/p1", "http://example.org/o001"),
24 |         ("?s", "http://example.org/p1", "http://example.org/o001")
25 |     ])
26 |     cpt = 0
27 |     for b in join_iter:
28 |         cpt += 1
29 |         assert len(b) == 1
30 |         assert (b'?s', b'http://example.org/s1') in b or (b'?s', b'http://example.org/s2') in b
31 |     assert cpt == 2
32 | 


--------------------------------------------------------------------------------
/tests/test.hdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Callidon/pyHDT/56370143e707c1b69bdb054bd811660e6611cae1/tests/test.hdt


--------------------------------------------------------------------------------