├── src ├── pyorc │ ├── py.typed │ ├── errors.py │ ├── enums.py │ ├── __init__.py │ ├── predicates.py │ ├── converters.py │ ├── writer.py │ ├── reader.py │ ├── _pyorc.pyi │ └── typedescription.py └── _pyorc │ ├── verguard.h │ ├── SearchArgument.h │ ├── Converter.h │ ├── PyORCStream.h │ ├── Writer.h │ ├── Reader.h │ ├── PyORCStream.cpp │ ├── _pyorc.cpp │ ├── Writer.cpp │ ├── SearchArgument.cpp │ └── Reader.cpp ├── docs ├── changelog.rst ├── requirements.txt ├── Makefile ├── make.bat ├── index.rst ├── install.rst ├── conf.py └── tutorial.rst ├── .clang-format ├── .gitignore ├── MANIFEST.in ├── .readthedocs.yaml ├── pyproject.toml ├── tests ├── conftest.py ├── test_orc_ver.py ├── test_predicates.py ├── test_stripe.py ├── test_typedescription.py ├── compare │ ├── test_writer_cmp.py │ └── test_reader_cmp.py ├── test_column.py └── test_writer.py ├── .azure-pipelines ├── prepare-and-push-wheels.yml └── build-run-tests.yml ├── README.rst ├── CHANGELOG.rst ├── azure-pipelines.yml ├── setup.py └── LICENSE /src/pyorc/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | 3 | .. 
include:: ../CHANGELOG.rst -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Mozilla 2 | IndentWidth: 4 3 | --- 4 | Language: Cpp 5 | Standard: Cpp11 6 | ColumnLimit: 88 -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==8.1.3 2 | sphinx_rtd_theme==3.0.2 3 | furo==2024.8.6 4 | readthedocs-sphinx-search==0.3.2 5 | 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .eggs/ 3 | build/ 4 | deps/ 5 | dist/ 6 | docs/_build/ 7 | pyorc.egg-info/ 8 | .pytest_cache/ 9 | __pycache__/ 10 | tests/__pycache__ 11 | .mypy_cache/ 12 | -------------------------------------------------------------------------------- /src/pyorc/errors.py: -------------------------------------------------------------------------------- 1 | class ORCError(Exception): 2 | """ General pyorc error. """ 3 | 4 | 5 | class ParseError(ORCError): 6 | """ Parse error while processing an ORC file. 
""" 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src *.cpp *.h *.py 2 | recursive-include tests *.py 3 | recursive-include docs *.rst *.py *.css Makefile make.bat 4 | include README.rst 5 | include LICENSE 6 | include CHANGELOG.rst 7 | recursive-exclude docs/_build * -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-24.04" 5 | tools: 6 | python: "3.12" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "setuptools<72.2; implementation_name == 'pypy'", # https://github.com/pypa/distutils/issues/283 5 | "wheel", 6 | "pybind11>2.6.0,<3.0" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | -------------------------------------------------------------------------------- /src/_pyorc/verguard.h: -------------------------------------------------------------------------------- 1 | #ifndef VERGUARD_H 2 | #define VERGUARD_H 3 | 4 | #if ORC_VERSION_MINOR > 255 || ORC_VERSION_PATCH > 255 5 | #error "ORC version number component is higher than 255, version guard macro will fail" 6 | #endif 7 | 8 | #define ORC_VERSION_AT_LEAST(ma, mi, pa) \ 9 | (((ORC_VERSION_MAJOR)<<16)+((ORC_VERSION_MINOR)<<8)+(ORC_VERSION_PATCH) >= (((ma)<<16)+((mi)<<8)+(pa)) ? 
1 : 0) 10 | 11 | #endif 12 | 13 | -------------------------------------------------------------------------------- /src/_pyorc/SearchArgument.h: -------------------------------------------------------------------------------- 1 | #ifndef SEARCHARGUMENT_H 2 | #define SEARCHARGUMENT_H 3 | 4 | #include 5 | #include 6 | 7 | namespace py = pybind11; 8 | 9 | std::unique_ptr createSearchArgument(py::object, 10 | py::dict, 11 | py::object); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def output_file(): 9 | testfile = tempfile.NamedTemporaryFile( 10 | mode="wb", delete=False, prefix="pyorc_", suffix=".orc" 11 | ) 12 | yield testfile 13 | if not testfile.closed: 14 | testfile.close() 15 | os.remove(testfile.name) 16 | 17 | 18 | class NullValue: 19 | _instance = None 20 | 21 | def __new__(cls): 22 | if cls._instance is not None: 23 | return cls._instance 24 | cls._instance = super().__new__(cls) 25 | return cls._instance 26 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/test_orc_ver.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import pytest 3 | 4 | from pyorc import orc_version, orc_version_info 5 | 6 | 7 | def test_orc_version(): 8 | assert isinstance(orc_version, str) 9 | assert len(orc_version.split(".")) == 3 10 | 11 | 12 | def test_orc_version_info(): 13 | assert isinstance(orc_version_info, tuple) 14 | assert isinstance(orc_version_info.major, int) 15 | assert isinstance(orc_version_info.minor, int) 16 | assert isinstance(orc_version_info.patch, int) 17 | assert isinstance(orc_version_info.releaselevel, str) 18 | inf = orc_version_info 19 | assert ( 20 | orc_version 21 | == f"{inf.major}.{inf.minor}.{inf.patch}{'-' if inf.releaselevel else ''}{inf.releaselevel}" 22 | ) 23 | 24 | -------------------------------------------------------------------------------- /src/_pyorc/Converter.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERTER_H 2 | #define CONVERTER_H 3 | 4 | #include 5 | 6 | #include "orc/OrcFile.hh" 7 | 8 | #include 9 | 10 | namespace py = pybind11; 11 | class Converter 12 | { 13 | protected: 14 | bool hasNulls; 15 | const char* notNull = nullptr; 16 | py::object nullValue = py::none(); 17 | 18 | public: 19 | Converter(py::object nv) 20 | : nullValue(nv){}; 21 | virtual ~Converter() = default; 22 | virtual py::object toPython(uint64_t) = 0; 23 | virtual void write(orc::ColumnVectorBatch*, uint64_t, py::object) = 0; 24 | virtual void reset(const orc::ColumnVectorBatch&); 25 | virtual void clear(){}; 26 | }; 27 | 28 | std::unique_ptr 29 | createConverter(const orc::Type*, unsigned int, py::dict, py::object, py::object); 30 | 31 | #endif 32 | 
-------------------------------------------------------------------------------- /.azure-pipelines/prepare-and-push-wheels.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: qemu 3 | type: boolean 4 | default: false 5 | - name: cibwStep 6 | type: step 7 | 8 | steps: 9 | - task: UsePythonVersion@0 10 | - bash: | 11 | set -o errexit 12 | python3 -m pip install --upgrade pip 13 | python3 -m pip install cibuildwheel==2.22.0 14 | displayName: Install cibuildwheel dependencies 15 | - bash: | 16 | echo "Worker Arch: ${ARCH}; OS: ${OS};" 17 | env: 18 | ARCH: $(Agent.OSArchitecture) 19 | OS: $(Agent.OS) 20 | displayName: Worker info 21 | 22 | 23 | - ${{ if eq(parameters.qemu, true) }}: 24 | 25 | - script: docker run --privileged --rm tonistiigi/binfmt --install all 26 | displayName: Register QEMU 27 | 28 | - ${{ parameters.cibwStep }} 29 | - task: PublishBuildArtifacts@1 30 | inputs: 31 | pathtoPublish: 'wheelhouse' 32 | artifactName: wheels 33 | 34 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /src/_pyorc/PyORCStream.h: -------------------------------------------------------------------------------- 1 | #ifndef PY_ORC_STREAM_H 2 | #define PY_ORC_STREAM_H 3 | 4 | #include 5 | 6 | #include "orc/OrcFile.hh" 7 | #include "verguard.h" 8 | 9 | namespace py = pybind11; 10 | 11 | class PyORCInputStream : public orc::InputStream 12 | { 13 | private: 14 | std::string filename; 15 | py::object pyread; 16 | py::object pyseek; 17 | uint64_t totalLength; 18 | 19 | public: 20 | PyORCInputStream(py::object); 21 | ~PyORCInputStream() override; 22 | uint64_t getLength() const override; 23 | uint64_t getNaturalReadSize() const override; 24 | void read(void*, uint64_t, uint64_t) override; 25 | const std::string& getName() const override; 26 | }; 27 | 28 | class PyORCOutputStream : public orc::OutputStream 29 | { 30 | private: 31 | std::string filename; 32 | py::object pywrite; 33 | py::object pyflush; 34 | uint64_t bytesWritten; 35 | bool closed; 36 | 37 | public: 38 | PyORCOutputStream(py::object); 39 | ~PyORCOutputStream() override; 40 | uint64_t getLength() const override; 41 | uint64_t getNaturalWriteSize() const override; 42 | const std::string& getName() const override; 43 | void write(const void*, size_t) override; 44 | void close() override; 45 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 46 | void flush() override; 47 | #endif 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
PyORC documentation master file, created by 2 | sphinx-quickstart on Tue Nov 12 22:14:39 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyORC's documentation 7 | ********************* 8 | 9 | PyORC is a Python module for reading and writing `Apache ORC`_ file format. 10 | It uses the Apache ORC's Core C++ API under the hood, and provides a similar 11 | interface as the `csv module`_ in the Python standard library. 12 | 13 | .. note:: 14 | The module is compatible with Python 3.9 or newer releases. 15 | 16 | 17 | Features 18 | -------- 19 | 20 | - Reading ORC files. 21 | - Writing ORC files. 22 | - While using Python's stream/file-like object IO interface. 23 | 24 | .. toctree:: 25 | :maxdepth: 2 26 | :caption: Contents: 27 | 28 | install 29 | tutorial 30 | api 31 | changelog 32 | 33 | 34 | Indices and tables 35 | ================== 36 | 37 | * :ref:`genindex` 38 | * :ref:`modindex` 39 | * :ref:`search` 40 | 41 | Contribution 42 | ============ 43 | 44 | Any contributions are welcome. If you would like to help in development fork 45 | or report issue on the project's `GitHub site`_. You can also help in 46 | improving the documentation. 47 | 48 | .. _github site: https://github.com/noirello/pyorc 49 | .. _Apache ORC: https://orc.apache.org/ 50 | .. 
_csv module: https://docs.python.org/3/library/csv.html 51 | -------------------------------------------------------------------------------- /src/_pyorc/Writer.h: -------------------------------------------------------------------------------- 1 | #ifndef WRITER_H 2 | #define WRITER_H 3 | 4 | #include 5 | #include 6 | 7 | #include "orc/OrcFile.hh" 8 | 9 | #include "Converter.h" 10 | #include "verguard.h" 11 | 12 | namespace py = pybind11; 13 | 14 | class Writer 15 | { 16 | private: 17 | std::unique_ptr outStream; 18 | std::unique_ptr writer; 19 | std::unique_ptr batch; 20 | std::unique_ptr converter; 21 | uint64_t batchSize; 22 | uint64_t batchItem; 23 | 24 | public: 25 | uint64_t currentRow; 26 | 27 | Writer(py::object, 28 | py::object, 29 | uint64_t = 1024, 30 | uint64_t = 67108864, 31 | uint64_t = 10000, 32 | int = 1, 33 | int = 0, 34 | uint64_t = 65536, 35 | std::set = {}, 36 | double = 0.05, 37 | py::object = py::none(), 38 | unsigned int = 0, 39 | py::object = py::none(), 40 | double = 0.0, 41 | double = 0.0, 42 | py::object = py::none(), 43 | unsigned int = 65536); 44 | void addUserMetadata(py::str, py::bytes); 45 | void write(py::object); 46 | uint64_t writerows(py::iterable); 47 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 48 | uint64_t writeIntermediateFooter(); 49 | #endif 50 | void close(); 51 | ~Writer(){}; 52 | }; 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/pyorc/enums.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | class CompressionKind(enum.IntEnum): 5 | """ The compression kind for the ORC file. """ 6 | 7 | NONE = 0 8 | ZLIB = 1 9 | SNAPPY = 2 10 | LZO = 3 11 | LZ4 = 4 12 | ZSTD = 5 13 | 14 | 15 | class CompressionStrategy(enum.IntEnum): 16 | """ Compression strategy for the ORC file. """ 17 | 18 | SPEED = 0 19 | COMPRESSION = 1 20 | 21 | 22 | class TypeKind(enum.IntEnum): 23 | """ The type kinds for an ORC schema. 
""" 24 | 25 | BOOLEAN = 0 26 | BYTE = 1 27 | SHORT = 2 28 | INT = 3 29 | LONG = 4 30 | FLOAT = 5 31 | DOUBLE = 6 32 | STRING = 7 33 | BINARY = 8 34 | TIMESTAMP = 9 35 | LIST = 10 36 | MAP = 11 37 | STRUCT = 12 38 | UNION = 13 39 | DECIMAL = 14 40 | DATE = 15 41 | VARCHAR = 16 42 | CHAR = 17 43 | TIMESTAMP_INSTANT = 18 44 | 45 | @classmethod 46 | def has_value(cls, value: int) -> bool: 47 | return any(member.value == value for member in cls) 48 | 49 | 50 | class StructRepr(enum.IntEnum): 51 | """ Enumeration for ORC struct representation. """ 52 | 53 | TUPLE = 0 #: For tuple. 54 | DICT = 1 #: For dictionary. 55 | 56 | 57 | class WriterVersion(enum.IntEnum): 58 | """ Writer version for an ORC file. """ 59 | 60 | ORIGINAL = 0 61 | HIVE_8732 = 1 62 | HIVE_4243 = 2 63 | HIVE_12055 = 3 64 | HIVE_13083 = 4 65 | ORC_101 = 5 66 | ORC_135 = 6 67 | ORC_517 = 7 68 | ORC_203 = 8 69 | ORC_14 = 9 70 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyORC 2 | ===== 3 | 4 | .. image:: https://dev.azure.com/noirello/pyorc/_apis/build/status/noirello.pyorc?branchName=master 5 | :target: https://dev.azure.com/noirello/pyorc/_build?definitionId=1 6 | :alt: Azure Pipelines Status 7 | 8 | .. image:: https://codecov.io/gh/noirello/pyorc/branch/master/graph/badge.svg 9 | :target: https://codecov.io/gh/noirello/pyorc 10 | :alt: Codecov code coverage 11 | 12 | .. image:: https://readthedocs.org/projects/pyorc/badge/?version=latest 13 | :target: https://pyorc.readthedocs.io/en/latest/?badge=latest 14 | :alt: Documentation Status 15 | 16 | Python module for reading and writing `Apache ORC`_ file format. It uses the Apache ORC's Core C++ API 17 | under the hood, and provides a similar interface as the `csv module`_ in the Python standard library. 18 | 19 | Supports only Python 3.9 or newer and ORC 1.7. 20 | 21 | Features 22 | -------- 23 | 24 | - Reading ORC files. 
25 | - Writing ORC files. 26 | - While using Python's stream/file-like object IO interface. 27 | 28 | That sums up quite well the purpose of this project. 29 | 30 | Example 31 | ------- 32 | 33 | Minimal example for reading an ORC file: 34 | 35 | .. code:: python 36 | 37 | import pyorc 38 | 39 | with open("./data.orc", "rb") as data: 40 | reader = pyorc.Reader(data) 41 | for row in reader: 42 | print(row) 43 | 44 | And another for writing one: 45 | 46 | .. code:: python 47 | 48 | import pyorc 49 | 50 | with open("./new_data.orc", "wb") as data: 51 | with pyorc.Writer(data, "struct") as writer: 52 | writer.write((1, "ORC from Python")) 53 | 54 | Contribution 55 | ============ 56 | 57 | Any contributions are welcome. If you would like to help in development fork 58 | or report issue here on Github. You can also help in improving the 59 | documentation. 60 | 61 | .. _Apache ORC: https://orc.apache.org/ 62 | .. _csv module: https://docs.python.org/3/library/csv.html 63 | -------------------------------------------------------------------------------- /src/pyorc/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import NamedTuple 4 | 5 | if sys.platform.startswith("win32") and "TZDIR" not in os.environ: 6 | # Windows does not come with a standard IANA time zone database, 7 | # but the ORC lib requires it. Set the TZDIR environment variable 8 | # to the tzdata module's data directory. 
9 | import tzdata 10 | 11 | os.environ["TZDIR"] = os.path.join(os.path.dirname(tzdata.__file__), "zoneinfo") 12 | 13 | from pyorc._pyorc import _orc_version 14 | 15 | from .enums import * 16 | from .errors import * 17 | from .predicates import PredicateColumn 18 | from .reader import Column, Reader, Stripe 19 | from .typedescription import * 20 | from .writer import Writer 21 | 22 | __version__ = "0.11.0" 23 | 24 | orc_version = _orc_version() 25 | 26 | ORCVersionInfo = NamedTuple( 27 | "ORCVersionInfo", 28 | [("major", int), ("minor", int), ("patch", int), ("releaselevel", str)], 29 | ) 30 | 31 | 32 | def __extract_version_info() -> ORCVersionInfo: 33 | splitted = _orc_version().split("-") 34 | ver = splitted[0] 35 | rel_level = splitted[1] if len(splitted) > 1 else "" 36 | major, minor, patch = map(int, ver.split(".")) 37 | return ORCVersionInfo(major, minor, patch, rel_level) 38 | 39 | 40 | orc_version_info = __extract_version_info() 41 | 42 | __all__ = [ 43 | "Column", 44 | "PredicateColumn", 45 | "Reader", 46 | "Stripe", 47 | "Writer", 48 | # Enums 49 | "CompressionKind", 50 | "CompressionStrategy", 51 | "TypeKind", 52 | "StructRepr", 53 | "WriterVersion", 54 | # Errors 55 | "ORCError", 56 | "ParseError", 57 | # Type descriptiona 58 | "TypeDescription", 59 | "Boolean", 60 | "TinyInt", 61 | "SmallInt", 62 | "Int", 63 | "BigInt", 64 | "Float", 65 | "Double", 66 | "String", 67 | "Binary", 68 | "Timestamp", 69 | "TimestampInstant", 70 | "Date", 71 | "Char", 72 | "VarChar", 73 | "Decimal", 74 | "Union", 75 | "Array", 76 | "Map", 77 | "Struct", 78 | # Version info 79 | "orc_version", 80 | "orc_version_info", 81 | ] 82 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installing 2 | ========== 3 | 4 | Using pip 5 | --------- 6 | 7 | For Linux and Mac, you can simply use pip that will install a wheel bundled 8 | with the required 
libraries:: 9 | 10 | $ pip3 install pyorc 11 | 12 | .. note:: 13 | To install on Linux, you need *pip 19.0* or newer. Earlier versions are 14 | unable to handle the ``manylinux2010`` wheels, thus they try to install 15 | the package from source. 16 | 17 | There could be some drawbacks of the bundled libraries in the package, when 18 | using together with other Python modules. If another module is loaded into 19 | the Python runtime besides PyORC that also pre-bundles one of the required 20 | C/C++ libraries but a slightly different version, then the two libraries 21 | will collide, and the interpreter will crash with segmentation fault at some 22 | point during the execution. 23 | 24 | It's easy to run into this situation. For example, ``libprotobuf`` is 25 | one of required library for ORC, and it's quite popular for other projects 26 | as well. To avoid this, you have to make sure that the very same version 27 | of the common library is used by both of the modules, and therefore 28 | you might need to build PyORC from source. 29 | 30 | 31 | Install from source 32 | ------------------- 33 | 34 | To install from source, the module requires the Apache ORC C++ Core library. 35 | During the extension build step, the module will build the ORC core library 36 | before building the extension module itself. It requires `cmake` -- in 37 | addition of a suitable C++ compiler. The following steps take place during 38 | the `build_ext` command: 39 | 40 | 1. Downloading the Apache ORC release package. 41 | 2. Extracting the package to a directory named `deps` into the project's 42 | root directory. 43 | 3. Running cmake to configure the ORC C++ library. 44 | 4. Running the ``make package`` command. 45 | 5. Finally, moving the C++ headers, ORC example files and ORC tools 46 | to the top level of the `deps` directory for the `setup.py` and tests 47 | to find. 48 | 6. Building the C++ extension part of PyORC. 49 | 50 | .. 
note:: 51 | The ``build_ext`` command has a ``--orc-version`` and a ``--source-url`` 52 | parameter for changing the default ORC library version or the URL of the 53 | source zip to download respectively. It also has a ``--skip-orc-build`` 54 | flag to skip ORC library build steps. 55 | 56 | You also need the `pybind11` Python package to be installed before running 57 | the installation:: 58 | 59 | $ pip3 install pybind11 60 | $ python3 setup.py install 61 | 62 | After the installation completes without errors, you have the module ready 63 | to use. 64 | -------------------------------------------------------------------------------- /src/pyorc/predicates.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from typing import Any, Optional 3 | 4 | from .enums import TypeKind 5 | 6 | 7 | class Operator(enum.IntEnum): 8 | NOT = 0 9 | OR = 1 10 | AND = 2 11 | EQ = 3 12 | LT = 4 13 | LE = 5 14 | 15 | 16 | class Predicate: 17 | def __init__(self, operator: Operator, left, right) -> None: 18 | self.values = (operator, left, right) 19 | 20 | def __or__(self, other) -> "Predicate": 21 | self.values = (Operator.OR, self.values, other.values) 22 | return self 23 | 24 | def __and__(self, other) -> "Predicate": 25 | self.values = (Operator.AND, self.values, other.values) 26 | return self 27 | 28 | def __invert__(self) -> "Predicate": 29 | self.values = (Operator.NOT, self.values) 30 | return self 31 | 32 | 33 | class PredicateColumn: 34 | def __init__( 35 | self, 36 | type_kind: TypeKind, 37 | name: Optional[str] = None, 38 | index: Optional[int] = None, 39 | precision: Optional[int] = None, 40 | scale: Optional[int] = None, 41 | ) -> None: 42 | if not TypeKind.has_value(type_kind) or type_kind in ( 43 | TypeKind.BINARY, 44 | TypeKind.LIST, 45 | TypeKind.MAP, 46 | TypeKind.UNION, 47 | TypeKind.STRUCT, 48 | ): 49 | raise TypeError("Invalid type for PredicateColumn: %s" % type_kind) 50 | self.type_kind = type_kind 51 | if 
self.type_kind == TypeKind.DECIMAL and (precision is None or scale is None): 52 | raise ValueError("Both precision and scale must be set for Decimal type") 53 | if name is not None and index is not None: 54 | raise TypeError("Only one of the name or index parameter must be given") 55 | if name is not None and not isinstance(name, str): 56 | raise TypeError("Name parameter must be string") 57 | if index is not None and not isinstance(index, int): 58 | raise TypeError("Index parameter must be int") 59 | self.name = name 60 | self.index = index 61 | self.precision = precision if precision is not None else 0 62 | self.scale = scale if scale is not None else 0 63 | 64 | def __eq__(self, other: Any) -> Predicate: 65 | return Predicate(Operator.EQ, self, other) 66 | 67 | def __ne__(self, other: Any) -> Predicate: 68 | return ~Predicate(Operator.EQ, self, other) 69 | 70 | def __lt__(self, other: Any) -> Predicate: 71 | return Predicate(Operator.LT, self, other) 72 | 73 | def __le__(self, other: Any) -> Predicate: 74 | return Predicate(Operator.LE, self, other) 75 | 76 | def __gt__(self, other: Any) -> Predicate: 77 | return ~Predicate(Operator.LE, self, other) 78 | 79 | def __ge__(self, other: Any) -> Predicate: 80 | return ~Predicate(Operator.LT, self, other) 81 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import sys 14 | import os 15 | 16 | sys.path[0:0] = [os.path.abspath("..")] 17 | 18 | # For read-the-docs: mocking the _pyorc module. 19 | from unittest.mock import MagicMock 20 | 21 | 22 | class Mock(MagicMock): 23 | @classmethod 24 | def __getattr__(cls, name): 25 | # For pyorc 26 | if name == "typedescription": 27 | return object 28 | elif name == "reader": 29 | return object 30 | elif name == "writer": 31 | return object 32 | elif name == "stripe": 33 | return object 34 | elif name == "_orc_version": 35 | return lambda: "0.0.0-DUMMY" 36 | # For zoneinfo 37 | elif name == "ZoneInfo": 38 | return lambda key: object 39 | 40 | 41 | MOCK_MODULES = ["src.pyorc._pyorc", "pyorc", "pyorc._pyorc", "zoneinfo"] 42 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 43 | 44 | import src.pyorc as pyorc 45 | 46 | sys.modules["pyorc"] = pyorc 47 | 48 | 49 | # -- Project information ----------------------------------------------------- 50 | 51 | project = "PyORC" 52 | copyright = "2019-2025, noirello" 53 | author = "noirello" 54 | 55 | # The full version, including alpha/beta/rc tags 56 | release = pyorc.__version__ 57 | 58 | 59 | # -- General configuration --------------------------------------------------- 60 | 61 | # Add any Sphinx extension module names here, as strings. They can be 62 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 63 | # ones. 64 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.doctest"] 65 | 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This pattern also affects html_static_path and html_extra_path. 
72 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 73 | 74 | 75 | # -- Options for HTML output ------------------------------------------------- 76 | 77 | # The theme to use for HTML and HTML Help pages. See the documentation for 78 | # a list of builtin themes. 79 | # 80 | html_theme = "furo" 81 | 82 | # Add any paths that contain custom static files (such as style sheets) here, 83 | # relative to this directory. They are copied after the builtin static files, 84 | # so a file named "default.css" will overwrite the builtin "default.css". 85 | html_static_path = ["_static"] 86 | 87 | master_doc = "index" 88 | -------------------------------------------------------------------------------- /src/pyorc/converters.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from datetime import date, datetime, timedelta, timezone as tz 3 | from decimal import Decimal, localcontext 4 | import math 5 | from typing import Dict, Tuple, Type, Any 6 | 7 | from .enums import TypeKind 8 | 9 | try: 10 | import zoneinfo 11 | except ImportError: 12 | from backports import zoneinfo 13 | 14 | 15 | class ORCConverter(ABC): 16 | @staticmethod 17 | @abstractmethod 18 | def from_orc(*args): 19 | pass 20 | 21 | @staticmethod 22 | @abstractmethod 23 | def to_orc(*args): 24 | pass 25 | 26 | 27 | class TimestampConverter(ORCConverter): 28 | @staticmethod 29 | def from_orc( 30 | seconds: int, nanoseconds: int, timezone: zoneinfo.ZoneInfo, 31 | ) -> datetime: 32 | epoch = datetime(1970, 1, 1, 0, 0, 0, tzinfo=tz.utc) 33 | return ( 34 | epoch + timedelta(seconds=seconds, microseconds=nanoseconds // 1000) 35 | ).astimezone(timezone) 36 | 37 | @staticmethod 38 | def to_orc(obj: datetime, timezone: zoneinfo.ZoneInfo) -> Tuple[int, int]: 39 | return math.floor(obj.timestamp()), obj.microsecond * 1000 40 | 41 | 42 | class DateConverter(ORCConverter): 43 | @staticmethod 44 | def from_orc(days: int) -> date: 45 | return 
date(1970, 1, 1) + timedelta(days=days) 46 | 47 | @staticmethod 48 | def to_orc(obj: date) -> int: 49 | return (obj - date(1970, 1, 1)).days 50 | 51 | 52 | class DecimalConverter(ORCConverter): 53 | @staticmethod 54 | def from_orc(decimal: str) -> Decimal: 55 | return Decimal(decimal) 56 | 57 | @staticmethod 58 | def to_orc(precision: int, scale: int, obj: Decimal) -> int: 59 | """ 60 | Adjust the Decimal number to the given precision and scale. 61 | Return the integer value of the number. 62 | 63 | :param int precision: the precision of the decimal 64 | :param int scale: the scale of decimal 65 | :param decimal.Decimal obj: the number as Python decimal 66 | :return: an integer (interpreted with the given scale) 67 | :rtype: int 68 | """ 69 | with localcontext() as ctx: 70 | try: 71 | ctx.prec = precision 72 | coefficient = Decimal("1.{0}".format("0" * scale)) 73 | dec = obj.quantize(coefficient) 74 | dec_tup = dec.as_tuple() 75 | integer = sum( 76 | dig * 10 ** exp for exp, dig in enumerate(reversed(dec_tup.digits)) 77 | ) 78 | if dec_tup.exponent > 0: 79 | integer = integer * 10 ** dec_tup.exponent 80 | if dec_tup.sign == 1: 81 | return integer * -1 82 | else: 83 | return integer 84 | except AttributeError: 85 | raise TypeError( 86 | "Item {0} cannot be cast as a decimal".format(type(obj)) 87 | ) from None 88 | 89 | 90 | DEFAULT_CONVERTERS: Dict[TypeKind, Type[ORCConverter]] = { 91 | TypeKind.DATE: DateConverter, 92 | TypeKind.DECIMAL: DecimalConverter, 93 | TypeKind.TIMESTAMP: TimestampConverter, 94 | } 95 | -------------------------------------------------------------------------------- /.azure-pipelines/build-run-tests.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: windows 3 | type: boolean 4 | default: false 5 | - name: orc_version 6 | type: string 7 | default: "" 8 | 9 | steps: 10 | - script: | 11 | which python 12 | python -V 13 | displayName: Check Python $(python.version) path 14 | 15 | - 
script: python -m pip install --user -U pytest pytest-cov pytest-xdist[psutil] coverage codecov 16 | displayName: Install test dependencies 17 | 18 | - ${{ if eq(parameters.windows, true) }}: 19 | - ${{ if ne(parameters.orc_version, '') }}: 20 | - script: | 21 | set PYORC_DEBUG=1 22 | set PYORC_LIB_VERSION=${{ parameters.orc_version }} 23 | python -m pip install -vvv --user . 24 | displayName: Install package 25 | - ${{ else }}: 26 | - script: | 27 | set PYORC_DEBUG=1 28 | python -m pip install -vvv --user . 29 | displayName: Install package 30 | 31 | - powershell: | 32 | $files = Get-ChildItem -path .\deps\orc-*\build\*\*\*-stamp\*.log 33 | $output = ".\\buildlogs\\logs-$env:AGENT_OS-$env:PYTHON_VER.zip" 34 | New-Item -Path .\ -Name "buildlogs" -ItemType "directory" 35 | Compress-Archive -LiteralPath $files -Destination $output 36 | env: 37 | PYTHON_VER: $(python.version) 38 | AGENT_OS: $(Agent.OS) 39 | displayName: Collect build logs 40 | condition: succeededOrFailed() 41 | 42 | - script: | 43 | FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "import os, pyorc; print(os.path.dirname(pyorc.__file__))"`) DO (SET INSTALLDIR=%%F) 44 | echo %INSTALLDIR% 45 | python -m pytest -vs --junitxml=./test-results.xml -n auto --cov=%INSTALLDIR% tests/ 46 | displayName: Run pytest (Windows) 47 | 48 | - ${{ else }}: 49 | - ${{ if ne(parameters.orc_version, '') }}: 50 | - script: | 51 | PYORC_DEBUG=1 PYORC_LIB_VERSION=${{ parameters.orc_version }} CFLAGS="-coverage" python -m pip install -vvv --user . 52 | displayName: Install package 53 | - ${{ else }}: 54 | - script: | 55 | PYORC_DEBUG=1 CFLAGS="-coverage" python -m pip install -vvv --user . 
56 | displayName: Install package 57 | 58 | - script: | 59 | mkdir ./buildlogs 60 | mkdir "$AGENT_OS-$PYTHON_VER" 61 | cp $(ls deps/orc-1.*/build/*/*/*-stamp/*.log) "$AGENT_OS-$PYTHON_VER" 62 | tar -czvf "buildlogs/logs-$AGENT_OS-$PYTHON_VER.tar.gz" "$AGENT_OS-$PYTHON_VER" 63 | env: 64 | PYTHON_VER: $(python.version) 65 | AGENT_OS: $(Agent.OS) 66 | displayName: Collect build logs 67 | condition: succeededOrFailed() 68 | 69 | - script: | 70 | INSTALLDIR=$(python -c "import os, pyorc; print(os.path.dirname(pyorc.__file__))") 71 | echo $INSTALLDIR 72 | python -m pytest -vs --junitxml=./test-results.xml -n auto --cov="$INSTALLDIR" tests/ 73 | displayName: Run pytest (Unix) 74 | 75 | - task: PublishTestResults@2 76 | inputs: 77 | testResultsFiles: test-results.xml 78 | testRunTitle: Tests on $(Agent.OS) with Python $(python.version) 79 | condition: succeededOrFailed() 80 | 81 | - task: PublishBuildArtifacts@1 82 | inputs: 83 | pathtoPublish: 'buildlogs' 84 | artifactName: logs 85 | condition: succeededOrFailed() 86 | 87 | - script: python -m codecov 88 | env: 89 | CODECOV_TOKEN: $(codecov) 90 | displayName: Report Coverage 91 | condition: succeeded() 92 | -------------------------------------------------------------------------------- /src/_pyorc/Reader.h: -------------------------------------------------------------------------------- 1 | #ifndef READER_H 2 | #define READER_H 3 | 4 | #include 5 | #include 6 | 7 | #include "orc/OrcFile.hh" 8 | 9 | #include "Converter.h" 10 | 11 | namespace py = pybind11; 12 | 13 | py::object 14 | createTypeDescription(const orc::Type&); 15 | 16 | class ORCFileLikeObject 17 | { 18 | private: 19 | py::object convertTimestampMillis(int64_t) const; 20 | 21 | protected: 22 | uint64_t batchItem; 23 | orc::RowReaderOptions rowReaderOpts; 24 | std::unique_ptr rowReader; 25 | std::unique_ptr batch; 26 | std::unique_ptr converter; 27 | py::dict convDict; 28 | py::object timezoneInfo; 29 | py::dict buildStatistics(const orc::Type*, const 
orc::ColumnStatistics*) const; 30 | const orc::Type* findColumnType(const orc::Type*, uint64_t) const; 31 | 32 | public: 33 | uint64_t currentRow; 34 | uint64_t firstRowOfStripe; 35 | virtual uint64_t len() const = 0; 36 | py::object next(); 37 | py::list read(int64_t = -1); 38 | uint64_t seek(int64_t, uint16_t = 0); 39 | const orc::RowReaderOptions getRowReaderOptions() const { return rowReaderOpts; }; 40 | const py::dict getConverterDict() const { return convDict; } 41 | const py::object getTimeZoneInfo() const { return timezoneInfo; } 42 | virtual ~ORCFileLikeObject(){}; 43 | }; 44 | 45 | class Stripe; /* Forward declaration */ 46 | 47 | class Reader : public ORCFileLikeObject 48 | { 49 | private: 50 | std::unique_ptr reader; 51 | uint64_t batchSize; 52 | unsigned int structKind; 53 | py::object nullValue; 54 | 55 | public: 56 | Reader(py::object, 57 | uint64_t = 1024, 58 | std::list = {}, 59 | std::list = {}, 60 | py::object = py::none(), 61 | unsigned int = 0, 62 | py::object = py::none(), 63 | py::object = py::none(), 64 | py::object = py::none()); 65 | py::dict bytesLengths() const; 66 | uint64_t compression() const; 67 | uint64_t compressionBlockSize() const; 68 | uint64_t rowIndexStride() const; 69 | py::tuple formatVersion() const; 70 | uint64_t len() const override; 71 | uint64_t numberOfStripes() const; 72 | uint32_t writerId() const; 73 | uint32_t writerVersion() const; 74 | std::string softwareVersion() const; 75 | py::object schema(); 76 | py::object selectedSchema(); 77 | std::unique_ptr readStripe(uint64_t); 78 | py::tuple statistics(uint64_t); 79 | py::dict userMetadata(); 80 | 81 | const orc::Reader& getORCReader() const { return *reader; } 82 | const uint64_t getBatchSize() const { return batchSize; } 83 | const unsigned int getStructKind() const { return structKind; } 84 | const py::object getNullValue() const { return nullValue; } 85 | ~Reader(){}; 86 | }; 87 | 88 | class Stripe : public ORCFileLikeObject 89 | { 90 | private: 91 | uint64_t 
class Writer(writer):
    """High-level ORC writer wrapping the ``_pyorc.writer`` extension type.

    Validates the schema and bloom filter settings in Python before
    delegating the actual encoding work to the C++ extension.
    """

    def __init__(
        self,
        fileo: BinaryIO,
        schema: Union[str, TypeDescription],
        batch_size: int = 1024,
        stripe_size: int = 67108864,
        row_index_stride: int = 10000,
        compression: CompressionKind = CompressionKind.ZLIB,
        compression_strategy: CompressionStrategy = CompressionStrategy.SPEED,
        compression_block_size: int = 65536,
        bloom_filter_columns: Optional[List[Union[str, int]]] = None,
        bloom_filter_fpp: float = 0.05,
        timezone: zoneinfo.ZoneInfo = zoneinfo.ZoneInfo("UTC"),
        struct_repr: StructRepr = StructRepr.TUPLE,
        converters: Optional[Dict[TypeKind, Type[ORCConverter]]] = None,
        padding_tolerance: float = 0.0,
        dict_key_size_threshold: float = 0.0,
        null_value: Any = None,
        memory_block_size: int = 65536,  # annotation added; was untyped
    ) -> None:
        if isinstance(schema, str):
            schema = TypeDescription.from_string(schema)
        elif not isinstance(schema, TypeDescription):
            raise TypeError("Invalid `schema` type, must be string or TypeDescription")
        # The false-positive probability only makes sense strictly inside (0, 1).
        if not 0.0 < bloom_filter_fpp < 1.0:
            raise ValueError("False positive probability should be > 0.0 & < 1.0")
        self.__schema = schema
        self.__user_metadata: Dict[str, bytes] = {}
        comp = CompressionKind(compression)
        comp_strat = CompressionStrategy(compression_strategy)
        bf_set: set = set()
        if bloom_filter_columns:
            if any(not isinstance(item, (int, str)) for item in bloom_filter_columns):
                # BUG FIX: message previously read "mut be".
                raise ValueError(
                    "All items in `bloom_filter_columns` must be string or int"
                )
            for item in bloom_filter_columns:
                if isinstance(item, int):
                    bf_set.add(item)
                else:  # str, guaranteed by the validation above
                    bf_set.add(self.__schema.find_column_id(item))
        # Merge user converters over the defaults; pass the original value
        # through untouched (None or {}) when no converters were given.
        conv = converters
        if converters:
            conv = {**DEFAULT_CONVERTERS, **converters}
        super().__init__(
            fileo,
            self.__schema,
            batch_size,
            stripe_size,
            row_index_stride,
            comp,
            comp_strat,
            compression_block_size,
            bf_set,
            bloom_filter_fpp,
            timezone,
            struct_repr,
            conv,
            padding_tolerance,
            dict_key_size_threshold,
            null_value,
            memory_block_size,
        )

    def __enter__(self) -> "Writer":
        return self

    def __exit__(self, *exc: Any) -> None:
        self.close()

    def close(self) -> None:
        """Flush pending user metadata and close the underlying writer."""
        for key, val in self.__user_metadata.items():
            super()._add_user_metadata(key, val)
        super().close()

    @property
    def schema(self) -> TypeDescription:
        # Deep copy so callers cannot mutate the schema of an open writer.
        return copy.deepcopy(self.__schema)

    def set_user_metadata(self, **kwargs: bytes) -> None:
        """Store metadata key/value pairs; written to the file on close().

        :raises TypeError: if any value is not a bytes object.
        """
        for key, val in kwargs.items():
            if not isinstance(val, bytes):
                raise TypeError(
                    "All values must be bytes, key '{0}' is {1}".format(key, type(val))
                )
            self.__user_metadata[key] = val
def test_greater_than():
    """`col > x` is encoded as NOT(col <= x)."""
    col = PredicateColumn(TypeKind.DOUBLE, "colname")
    pred = col > 5.0
    assert isinstance(pred, Predicate)
    assert pred.values == (Operator.NOT, (Operator.LE, col, 5.0))


def test_greater_than_or_equals():
    """`col >= x` is encoded as NOT(col < x)."""
    col = PredicateColumn(TypeKind.FLOAT, "colname")
    pred = col >= 10.0
    assert isinstance(pred, Predicate)
    assert pred.values == (Operator.NOT, (Operator.LT, col, 10.0))


def test_and():
    """`&` joins two predicates under Operator.AND."""
    col0 = PredicateColumn(TypeKind.LONG, "colname0")
    col1 = PredicateColumn(TypeKind.TIMESTAMP, "colname1")
    pred = (col0 < 100) & (col1 == datetime(2021, 3, 20))
    assert isinstance(pred, Predicate)
    assert pred.values == (
        Operator.AND,
        (Operator.LT, col0, 100),
        (Operator.EQ, col1, datetime(2021, 3, 20)),
    )


def test_or():
    """`|` joins two predicates under Operator.OR.

    BUG FIX: this test previously used `&` and asserted Operator.AND,
    making it a duplicate of test_and and leaving `|` untested.
    """
    col0 = PredicateColumn(TypeKind.SHORT, name="colname0")
    col1 = PredicateColumn(TypeKind.DECIMAL, name="colname1", precision=2, scale=2)
    pred = (col0 < 100) | (col1 >= Decimal("20.00"))
    assert isinstance(pred, Predicate)
    assert pred.values == (
        Operator.OR,
        (Operator.LT, col0, 100),
        (Operator.NOT, (Operator.LT, col1, Decimal("20.00"))),
    )


def test_not():
    """`~` wraps a predicate tree under Operator.NOT."""
    col = PredicateColumn(TypeKind.FLOAT, "colname")
    pred = ~((col < 1.0) & (col > -1.0))
    assert isinstance(pred, Predicate)
    assert pred.values == (
        Operator.NOT,
        (
            Operator.AND,
            (Operator.LT, col, 1.0),
            (Operator.NOT, (Operator.LE, col, -1.0)),
        ),
    )


def test_decimal():
    """DECIMAL predicate columns require precision and scale."""
    with pytest.raises(ValueError):
        _ = PredicateColumn(TypeKind.DECIMAL, "something")
    col = PredicateColumn(TypeKind.DECIMAL, "colname", precision=10, scale=3)
    assert col is not None
class Column:
    """Aggregated statistics view over one column of a Reader or Stripe."""

    def __init__(self, stream: Union["Reader", "Stripe"], index: int):
        self.index = index
        self.stream = stream
        # One statistics dict per stripe, fetched eagerly from the stream.
        self._stats = self.stream._statistics(self.index)

    @property
    def statistics(self) -> Dict[str, Any]:
        """Merge the per-stripe statistic dicts into a single summary dict."""
        collected: Dict[str, list] = defaultdict(list)
        for stat in self._stats:
            for key, val in stat.items():
                collected[key].append(val)
        additive = (
            "number_of_values",
            "sum",
            "false_count",
            "true_count",
            "total_length",
        )
        merged: Dict[str, Any] = {}
        for key, values in collected.items():
            if key in additive:
                merged[key] = sum(values)
            elif key in ("minimum", "lower_bound"):
                merged[key] = min(values)
            elif key in ("maximum", "upper_bound"):
                merged[key] = max(values)
            elif key == "has_null":
                merged[key] = any(values)
        merged["kind"] = TypeKind(collected["kind"][0])
        return merged


class Stripe(stripe):
    def __getitem__(self, col_idx: int) -> "Column":
        return Column(self, col_idx)


class Reader(reader):
    """High-level ORC reader wrapping the ``_pyorc.reader`` extension type."""

    def __init__(
        self,
        fileo: BinaryIO,
        batch_size: int = 1024,
        column_indices: Optional[List[int]] = None,
        column_names: Optional[List[str]] = None,
        timezone: zoneinfo.ZoneInfo = zoneinfo.ZoneInfo("UTC"),
        struct_repr: StructRepr = StructRepr.TUPLE,
        converters: Optional[Dict[TypeKind, Type[ORCConverter]]] = None,
        predicate: Optional[Predicate] = None,
        null_value: Any = None,
    ) -> None:
        column_indices = [] if column_indices is None else column_indices
        column_names = [] if column_names is None else column_names
        struct_repr = StructRepr(struct_repr)
        # Merge user converters over the defaults; pass the original value
        # through untouched (None or {}) when no converters were given.
        conv = converters
        if converters:
            conv = {**DEFAULT_CONVERTERS, **converters}
        super().__init__(
            fileo,
            batch_size,
            column_indices,
            column_names,
            timezone,
            struct_repr,
            conv,
            predicate,
            null_value,
        )

    def __getitem__(self, col_idx: int) -> Column:
        return Column(self, col_idx)

    def read_stripe(self, stripe_idx: int) -> Stripe:
        return Stripe(self, stripe_idx)

    def iter_stripes(self) -> Iterator[Stripe]:
        for idx in range(self.num_of_stripes):
            yield self.read_stripe(idx)

    @property
    def compression(self) -> CompressionKind:
        return CompressionKind(super().compression)

    @property
    def writer_id(self) -> str:
        """Symbolic name of the writer implementation that produced the file."""
        known_writers = {
            0: "ORC_JAVA_WRITER",
            1: "ORC_CPP_WRITER",
            2: "PRESTO_WRITER",
            3: "SCRITCHLEY_GO",
            4: "TRINO_WRITER",
            5: "CUDF_WRITER",
        }
        return known_writers.get(super().writer_id, "UNKNOWN_WRITER")

    @property
    def writer_version(self) -> WriterVersion:
        return WriterVersion(super().writer_version)
must be a file-like object, but `" + 7 | (std::string)(py::str(fp.get_type())) + "` was provided"); 8 | } 9 | pyread = fp.attr("read"); 10 | pyseek = fp.attr("seek"); 11 | py::object isSeekable(fp.attr("seekable")); 12 | if (py::cast(isSeekable()) == false) { 13 | throw py::type_error("File-like object must be seekable"); 14 | } 15 | if (py::hasattr(fp, "name")) { 16 | filename = py::cast(py::str(fp.attr("name"))); 17 | } else { 18 | filename = py::cast(py::repr(fp)); 19 | } 20 | py::object pytell(fp.attr("tell")); 21 | uint64_t currPos = py::cast(pytell()); 22 | totalLength = py::cast(pyseek(0, 2)); 23 | pyseek(currPos); 24 | } 25 | 26 | uint64_t 27 | PyORCInputStream::getLength() const 28 | { 29 | return totalLength; 30 | } 31 | 32 | uint64_t 33 | PyORCInputStream::getNaturalReadSize() const 34 | { 35 | return 128 * 1024; 36 | } 37 | 38 | const std::string& 39 | PyORCInputStream::getName() const 40 | { 41 | return filename; 42 | } 43 | 44 | void 45 | PyORCInputStream::read(void* buf, uint64_t length, uint64_t offset) 46 | { 47 | char* src; 48 | Py_ssize_t bytesRead; 49 | if (!buf) { 50 | throw orc::ParseError("Buffer is null"); 51 | } 52 | 53 | pyseek(offset); 54 | py::object data = pyread(length); 55 | int rc = PyBytes_AsStringAndSize(data.ptr(), &src, &bytesRead); 56 | if (rc == -1) { 57 | PyErr_Clear(); 58 | throw orc::ParseError( 59 | "Failed to read content as bytes. 
Stream might not be opened as binary"); 60 | } 61 | 62 | if (static_cast(bytesRead) != length) { 63 | throw orc::ParseError("Short read of " + filename); 64 | } 65 | 66 | std::memcpy(buf, src, length); 67 | } 68 | 69 | PyORCInputStream::~PyORCInputStream() {} 70 | 71 | PyORCOutputStream::PyORCOutputStream(py::object fp) 72 | { 73 | bytesWritten = 0; 74 | if (!(py::hasattr(fp, "write") && py::hasattr(fp, "flush"))) { 75 | throw py::type_error("Parameter must be a file-like object, but `" + 76 | (std::string)(py::str(fp.get_type())) + "` was provided"); 77 | } 78 | pywrite = fp.attr("write"); 79 | pyflush = fp.attr("flush"); 80 | if (py::hasattr(fp, "name")) { 81 | filename = py::cast(py::str(fp.attr("name"))); 82 | } else { 83 | filename = py::cast(py::repr(fp)); 84 | } 85 | closed = py::cast(fp.attr("closed")); 86 | } 87 | 88 | uint64_t 89 | PyORCOutputStream::getLength() const 90 | { 91 | return bytesWritten; 92 | } 93 | 94 | uint64_t 95 | PyORCOutputStream::getNaturalWriteSize() const 96 | { 97 | return 128 * 1024; 98 | } 99 | 100 | const std::string& 101 | PyORCOutputStream::getName() const 102 | { 103 | return filename; 104 | } 105 | 106 | void 107 | PyORCOutputStream::write(const void* buf, size_t length) 108 | { 109 | if (closed) { 110 | throw std::logic_error("Cannot write to closed stream"); 111 | } 112 | try { 113 | py::bytes data = py::bytes(static_cast(buf), length); 114 | size_t count = py::cast(pywrite(data)); 115 | pyflush(); 116 | 117 | if (count != length) { 118 | throw orc::ParseError("Shorter write of " + filename); 119 | } 120 | bytesWritten += static_cast(count); 121 | } catch (py::error_already_set& err) { 122 | if (!err.matches(PyExc_TypeError)) { 123 | throw; 124 | } 125 | throw orc::ParseError( 126 | "Failed to write content as bytes. 
Stream might not be opened as binary"); 127 | } 128 | } 129 | 130 | void 131 | PyORCOutputStream::close() 132 | { 133 | if (!closed) { 134 | try { 135 | pyflush(); 136 | } catch (py::error_already_set& err) { 137 | if (!err.matches(PyExc_ValueError)) { 138 | throw; 139 | } 140 | // ValueError is raised when try to flush on a closed file, let's ignore. 141 | PyErr_Clear(); 142 | } 143 | closed = true; 144 | } 145 | } 146 | 147 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 148 | void 149 | PyORCOutputStream::flush() 150 | { 151 | if (!closed) { 152 | pyflush(); 153 | } 154 | } 155 | #endif 156 | 157 | PyORCOutputStream::~PyORCOutputStream() 158 | { 159 | close(); 160 | } 161 | -------------------------------------------------------------------------------- /src/pyorc/_pyorc.pyi: -------------------------------------------------------------------------------- 1 | """_pyorc c++ extension""" 2 | import typing 3 | 4 | from .enums import CompressionKind, CompressionStrategy, StructRepr 5 | from .typedescription import TypeDescription 6 | 7 | __all__ = ["reader", "stripe", "writer"] 8 | 9 | class reader: 10 | def __init__( 11 | self, 12 | fileo: object, 13 | batch_size: int = 1024, 14 | col_indices: typing.Optional[typing.List[int]] = None, 15 | col_names: typing.Optional[typing.List[str]] = None, 16 | timezone: object = None, 17 | struct_repr: int = StructRepr.TUPLE, 18 | conv: object = None, 19 | predicate: object = None, 20 | null_value: object = None, 21 | ) -> None: ... 22 | def __iter__(self) -> reader: ... 23 | def __len__(self) -> int: ... 24 | def __next__(self) -> object: ... 25 | def _statistics(self, col_idx: int) -> tuple: ... 26 | def read(self, num: int = -1) -> list: ... 27 | def seek(self, row: int, whence: int = 0) -> int: ... 
28 | @property 29 | def bytes_lengths(self) -> typing.Dict[str, int]: 30 | """ 31 | :type: dict 32 | """ 33 | @property 34 | def compression(self) -> int: 35 | """ 36 | :type: int 37 | """ 38 | @property 39 | def compression_block_size(self) -> int: 40 | """ 41 | :type: int 42 | """ 43 | @property 44 | def current_row(self) -> int: 45 | """ 46 | :type: int 47 | """ 48 | @property 49 | def format_version(self) -> typing.Tuple[int, int]: 50 | """ 51 | :type: tuple 52 | """ 53 | @property 54 | def num_of_stripes(self) -> int: 55 | """ 56 | :type: int 57 | """ 58 | @property 59 | def row_index_stride(self) -> int: 60 | """ 61 | :type: int 62 | """ 63 | @property 64 | def schema(self) -> object: 65 | """ 66 | :type: object 67 | """ 68 | @property 69 | def selected_schema(self) -> object: 70 | """ 71 | :type: object 72 | """ 73 | @property 74 | def software_version(self) -> str: 75 | """ 76 | :type: str 77 | """ 78 | @property 79 | def user_metadata(self) -> typing.Dict[str, bytes]: 80 | """ 81 | :type: dict 82 | """ 83 | @property 84 | def writer_id(self) -> int: 85 | """ 86 | :type: int 87 | """ 88 | @property 89 | def writer_version(self) -> int: 90 | """ 91 | :type: int 92 | """ 93 | pass 94 | 95 | class stripe: 96 | def __init__(self, reader: reader, stripe_idx: int) -> None: ... 97 | def __iter__(self) -> stripe: ... 98 | def __len__(self) -> int: ... 99 | def __next__(self) -> object: ... 100 | def _statistics(self, col_idx: int) -> tuple: ... 101 | def read(self, num: int = -1) -> list: ... 102 | def seek(self, row: int, whence: int = 0) -> int: ... 
class writer:
    def __init__(
        self,
        fileo: object,
        schema: object,
        batch_size: int = 1024,
        stripe_size: int = 67108864,
        row_index_stride: int = 10000,
        compression: int = CompressionKind.ZLIB,
        compression_strategy: int = CompressionStrategy.SPEED,
        compression_block_size: int = 65536,
        bloom_filter_columns: typing.Optional[typing.Set[int]] = None,
        bloom_filter_fpp: float = 0.05,
        timezone: object = None,
        struct_repr: int = StructRepr.TUPLE,
        conv: object = None,
        padding_tolerance: float = 0.0,
        dict_key_size_threshold: float = 0.0,
        null_value: object = None,
        # BUG FIX: this parameter was missing from the stub although the
        # extension accepts it and pyorc.Writer passes it positionally
        # (added in 0.10.0: initial block size of the output buffer).
        memory_block_size: int = 65536,
    ) -> None: ...
    def _add_user_metadata(self, key: str, value: bytes) -> None: ...
    def close(self) -> None: ...
    def write(self, row: object) -> None: ...
    def writerows(self, rows: typing.Iterable) -> int: ...
159 | @property 160 | def current_row(self) -> int: 161 | """ 162 | :type: int 163 | """ 164 | pass 165 | 166 | def _orc_version() -> str: 167 | pass 168 | 169 | def _schema_from_string(arg0: str) -> TypeDescription: 170 | pass 171 | -------------------------------------------------------------------------------- /tests/test_stripe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import io 4 | import sys 5 | 6 | from datetime import datetime, timedelta, timezone 7 | 8 | from pyorc import ( 9 | Reader, 10 | Writer, 11 | Stripe, 12 | orc_version_info, 13 | ) 14 | 15 | 16 | @pytest.fixture 17 | def striped_orc_data(): 18 | def _init(schema, rows, bfc=tuple()): 19 | data = io.BytesIO() 20 | with Writer( 21 | data, 22 | schema, 23 | batch_size=65535, 24 | stripe_size=128, 25 | compression_block_size=128, 26 | bloom_filter_columns=bfc, 27 | memory_block_size=64, 28 | ) as writer: 29 | writer.writerows(rows) 30 | data.seek(0) 31 | return data 32 | 33 | return _init 34 | 35 | 36 | def test_init(striped_orc_data): 37 | data = striped_orc_data("int", (i for i in range(100000))) 38 | reader = Reader(data) 39 | with pytest.raises(TypeError): 40 | _ = Stripe(None, 0) 41 | with pytest.raises(TypeError): 42 | _ = Stripe("reader", 0) 43 | with pytest.raises(IndexError): 44 | _ = Stripe(reader, 3) 45 | with pytest.raises(TypeError): 46 | _ = Stripe(reader, "col") 47 | assert Stripe(reader, 0) is not None 48 | 49 | 50 | def test_len(striped_orc_data): 51 | data = striped_orc_data("int", (i for i in range(100000))) 52 | reader = Reader(data) 53 | stripe = Stripe(reader, 0) 54 | 55 | assert len(reader) != len(stripe) 56 | assert len(stripe) == 65535 57 | 58 | 59 | def test_bytes_length(striped_orc_data): 60 | expected_bytes_length = ( 61 | 392 if orc_version_info.major == 1 and orc_version_info.minor < 8 else 359 62 | ) # Bold, hardcoded length values. 
def test_bytes_offset(striped_orc_data):
    """Stripe.bytes_offset matches the known layout and is read-only."""
    expected_bytes_offset = (
        658 if orc_version_info.major == 1 and orc_version_info.minor < 8 else 614
    )  # Bold, hardcoded offset value.

    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 1)

    assert stripe.bytes_offset == expected_bytes_offset
    # The property must reject assignment.
    with pytest.raises(AttributeError):
        stripe.bytes_offset = 5


def test_bloom_filter_columns(striped_orc_data):
    """Bloom filter column ids round-trip through every stripe; read-only."""
    expected = (0, 1)
    # NOTE(review): schema string looks truncated ("struct" with no field
    # list, likely lost angle-bracket content) — verify against VCS.
    data = striped_orc_data(
        "struct",
        ((i, "Test {}".format(i + 1)) for i in range(100000)),
        bfc=expected,
    )
    reader = Reader(data)
    assert Stripe(reader, 0).bloom_filter_columns == expected
    assert Stripe(reader, 1).bloom_filter_columns == expected

    # Without bloom filter columns the tuple is empty.
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 0)
    assert stripe.bloom_filter_columns == tuple()
    with pytest.raises(AttributeError):
        stripe.bloom_filter_columns = (0,)


def test_row_offset(striped_orc_data):
    """Stripe 1 starts exactly where stripe 0 ends; row_offset is read-only."""
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe0 = Stripe(reader, 0)

    assert stripe0.row_offset == 0
    assert Stripe(reader, 1).row_offset == len(stripe0)
    with pytest.raises(AttributeError):
        stripe0.row_offset = 5
timedelta(seconds=10) 124 | 125 | data = striped_orc_data("timestamp", get_dt()) 126 | reader = Reader(data) 127 | stripe = Stripe(reader, 1) 128 | 129 | assert stripe.writer_timezone == "UTC" 130 | with pytest.raises(AttributeError): 131 | stripe.writer_timezone = "UTC-9:00" 132 | 133 | 134 | @pytest.mark.skipif(sys.platform == "win32", reason="Seeking fails on Windows") 135 | def test_seek_and_read(striped_orc_data): 136 | data = striped_orc_data( 137 | "struct", 138 | ((i, "Test {}".format(i + 1)) for i in range(100000)), 139 | ) 140 | reader = Reader(data) 141 | stripe = reader.read_stripe(1) 142 | assert next(stripe) == (65535, "Test 65536") 143 | stripe.seek(10000) 144 | assert next(stripe) == (75535, "Test 75536") 145 | stripe.seek(-1, 2) 146 | assert next(stripe) == (99999, "Test 100000") 147 | stripe = reader.read_stripe(0) 148 | stripe.seek(-1, 2) 149 | assert next(stripe) == (65534, "Test 65535") 150 | stripe.seek(0) 151 | next(stripe) 152 | stripe.seek(10000, 1) 153 | assert next(stripe) == (10001, "Test 10002") 154 | expected = reader.read() 155 | result = stripe.read() 156 | assert result == expected[10002:65535] 157 | stripe = reader.read_stripe(1) 158 | assert stripe.read() == expected[65535:] 159 | -------------------------------------------------------------------------------- /tests/test_typedescription.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pyorc.typedescription import * 4 | from pyorc.enums import TypeKind 5 | 6 | 7 | def test_from_str_schema(): 8 | descr = TypeDescription.from_string( 9 | "struct,c:struct>" 10 | ) 11 | assert descr.kind == TypeKind.STRUCT 12 | assert len(descr.fields) == 3 13 | assert tuple(descr.fields.keys()) == ("a", "b", "c") 14 | assert descr.fields["a"].kind == TypeKind.INT 15 | assert descr.fields["b"].column_id == 2 16 | assert descr.fields["b"].key.kind == TypeKind.VARCHAR 17 | assert descr.fields["b"].key.max_length == 20 18 | assert 
descr.fields["b"].value.column_id == 4 19 | assert tuple(descr.fields["c"].fields.keys()) == ("d", "e", "f") 20 | assert descr.fields["c"].fields["d"].kind == TypeKind.LONG 21 | assert descr.fields["c"].fields["e"].column_id == 7 22 | assert descr.fields["c"].fields["f"].kind == TypeKind.CHAR 23 | assert descr.fields["c"].fields["f"].max_length == 12 24 | assert descr.fields["c"].fields["f"].column_id == 8 25 | 26 | 27 | def test_from_str_schema_fail(): 28 | with pytest.raises(ValueError): 29 | _ = TypeDescription.from_string( 30 | "struct,c:struct>" 31 | ) 32 | with pytest.raises(ValueError): 33 | _ = TypeDescription.from_string("struct,e:int>>" 41 | ) 42 | assert descr.find_column_id("a") == 1 43 | assert descr.find_column_id("a.b.c") == 3 44 | assert descr.find_column_id("a.e") == 5 45 | with pytest.raises(TypeError): 46 | _ = descr.find_column_id(True) 47 | with pytest.raises(KeyError): 48 | _ = descr.find_column_id("f.z") 49 | descr = Struct(**{"a.b": Struct(c=Int()), "d": String()}) 50 | assert descr.find_column_id("`a.b`") == 1 51 | assert descr.find_column_id("`a.b`.c") == 2 52 | assert descr.find_column_id("d") == 3 53 | with pytest.raises(KeyError): 54 | _ = descr.find_column_id("a.b") 55 | 56 | 57 | TESTDATA = [ 58 | (Boolean(), TypeKind.BOOLEAN, "boolean"), 59 | (TinyInt(), TypeKind.BYTE, "tinyint"), 60 | (SmallInt(), TypeKind.SHORT, "smallint"), 61 | (Int(), TypeKind.INT, "int"), 62 | (BigInt(), TypeKind.LONG, "bigint"), 63 | (Float(), TypeKind.FLOAT, "float"), 64 | (Double(), TypeKind.DOUBLE, "double"), 65 | (Date(), TypeKind.DATE, "date"), 66 | (Timestamp(), TypeKind.TIMESTAMP, "timestamp"), 67 | (TimestampInstant(), TypeKind.TIMESTAMP_INSTANT, "timestamp with local time zone"), 68 | (String(), TypeKind.STRING, "string"), 69 | (Binary(), TypeKind.BINARY, "binary"), 70 | (Decimal(precision=10, scale=3), TypeKind.DECIMAL, "decimal(10,3)"), 71 | (Char(16), TypeKind.CHAR, "char(16)"), 72 | (VarChar(140), TypeKind.VARCHAR, "varchar(140)"), 73 | ( 74 | 
Union(Int(), Double(), Char(20)), 75 | TypeKind.UNION, 76 | "uniontype", 77 | ), 78 | (Array(Int()), TypeKind.LIST, "array"), 79 | (Map(key=String(), value=Double()), TypeKind.MAP, "map"), 80 | (Struct(a=String(), b=Date()), TypeKind.STRUCT, "struct"), 81 | ( 82 | Struct(a=Timestamp(), b=Struct(c=Int(), b=Array(Double()))), 83 | TypeKind.STRUCT, 84 | "struct>>", 85 | ), 86 | ] 87 | 88 | 89 | @pytest.mark.parametrize("orc_schema,kind,expected", TESTDATA) 90 | def test_str(orc_schema, kind, expected): 91 | assert str(orc_schema) == expected 92 | 93 | 94 | @pytest.mark.parametrize("orc_schema,kind,expected", TESTDATA) 95 | def test_kind(orc_schema, kind, expected): 96 | assert orc_schema.kind == kind 97 | 98 | 99 | def test_decimal(): 100 | descr = Decimal(precision=5, scale=3) 101 | assert descr.precision == 5 102 | assert descr.scale == 3 103 | assert str(descr) == "decimal(5,3)" 104 | 105 | 106 | def test_varchar(): 107 | descr = TypeDescription.from_string("varchar(30)") 108 | assert descr.max_length == 30 109 | descr.max_length = 15 110 | assert descr.max_length == 15 111 | assert str(descr) == "varchar(15)" 112 | 113 | 114 | def test_char(): 115 | descr = Char(10) 116 | assert descr.max_length == 10 117 | descr.max_length = 1 118 | assert str(descr) == "char(1)" 119 | 120 | 121 | TESTDATA = [ 122 | lambda: Struct(field0=Int(), field1=True), 123 | lambda: Map(key=Int(), value=True), 124 | lambda: Map(key=0, value=Double()), 125 | lambda: Array("test"), 126 | lambda: Union(Int(), 0, Double()), 127 | ] 128 | 129 | 130 | @pytest.mark.parametrize("orc_schema", TESTDATA) 131 | def test_failed_complex_types(orc_schema): 132 | with pytest.raises(TypeError): 133 | _ = orc_schema() 134 | 135 | 136 | def test_struct(): 137 | schema = Struct(a0=Int(), b0=Double(), c0=Struct(a1=Date(), b1=Timestamp())) 138 | assert isinstance(schema["a0"], Int) 139 | assert schema["b0"].kind == TypeKind.DOUBLE 140 | assert schema["c0"].column_id == 3 141 | assert schema["c0"]["b1"].kind == 
TypeKind.TIMESTAMP 142 | 143 | 144 | def test_union(): 145 | schema = TypeDescription.from_string("uniontype") 146 | assert schema[1].kind == TypeKind.DOUBLE 147 | with pytest.raises(IndexError): 148 | _ = schema[10] 149 | schema = Union(Float(), VarChar(120)) 150 | assert schema[0].kind == TypeKind.FLOAT 151 | 152 | 153 | def test_attributes(): 154 | schema = Boolean() 155 | with pytest.raises(TypeError): 156 | _ = schema.set_attributes(0) 157 | with pytest.raises(TypeError): 158 | _ = schema.set_attributes({0: "1"}) 159 | with pytest.raises(TypeError): 160 | _ = schema.set_attributes({"a": 1}) 161 | attrs = {"a": "1", "b": "2"} 162 | schema.set_attributes(attrs) 163 | assert schema.attributes == attrs 164 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========== 3 | [0.11.0] - UNRELEASED 4 | --------------------- 5 | 6 | Added 7 | ~~~~~ 8 | 9 | Changed 10 | ~~~~~~~ 11 | 12 | Fixed 13 | ~~~~~ 14 | 15 | 16 | [0.10.0] - 2025-02-18 17 | --------------------- 18 | 19 | Added 20 | ~~~~~ 21 | 22 | - New parameter to Writer: memory_block_size for setting initial block 23 | size of original input buffer. 24 | - Add CUDF_Writer to Reader's writer_id. 25 | - Python 3.13 wheels. 26 | 27 | Changed 28 | ~~~~~~~ 29 | 30 | - Dropped support for Python 3.8. 31 | - ORC C++ Core updated to 2.1.0. 32 | - Pinned setuptools<72.2 for PyPy to avoid TypeError during build. 33 | 34 | 35 | [0.9.0] - 2023-11-04 36 | -------------------- 37 | 38 | Added 39 | ~~~~~ 40 | 41 | - Writer.write_intermediate_footer method for ORC library 1.9.0 and newer. 42 | - Python 3.12 wheels. 43 | 44 | Changed 45 | ~~~~~~~ 46 | 47 | - Dropped support for Python 3.7. 48 | - ORC C++ Core updated to 1.9.1. 49 | 50 | 51 | [0.8.0] - 2022-11-19 52 | -------------------- 53 | 54 | Added 55 | ~~~~~ 56 | 57 | - Python 3.11 wheels. 
(PR #58, contribution of @dbaxa) 58 | 59 | Changed 60 | ~~~~~~~ 61 | 62 | - ORC C++ Core updated to 1.7.7. 63 | - Improved type annotations, set module's __all__ variable. 64 | 65 | 66 | [0.7.0] - 2022-07-16 67 | -------------------- 68 | 69 | Added 70 | ~~~~~ 71 | 72 | - Universal2 wheels for MacOS. (PR #55, contribution of @dbaxa) 73 | - ORC-517, ORC-203, and ORC-14 versions to WriterVersion enum. 74 | 75 | Changed 76 | ~~~~~~~ 77 | 78 | - Dropped support for Python 3.6. 79 | - ORC C++ Core updated to 1.7.5. 80 | 81 | 82 | [0.6.0] - 2022-02-18 83 | -------------------- 84 | 85 | Added 86 | ~~~~~ 87 | 88 | - New parameter to Writer: dict_key_size_threshold for setting threshold 89 | for dictionary encoding. (PR #46, contribution of @dirtysalt) 90 | - New parameter to Writer: padding_tolerance for block padding. 91 | - New parameter to Reader and Writer: null_value for changing representation 92 | of ORC null value. The value must be a singleton object. 93 | - Type stubs for classes implemented in C++. 94 | - Experimental musllinux and PyPy wheels. 95 | 96 | Changed 97 | ~~~~~~~ 98 | 99 | - Writer.writerows method reimplemented in C++. 100 | - Improved type annotations. 101 | - ORC C++ Core updated to 1.7.3. 102 | - Removed build_orc setup.py command, moved the same functionality to 103 | build_ext command. 104 | 105 | Fixed 106 | ~~~~~ 107 | 108 | - Unnecessary string casting of values when writing user metadata. (Issue #45) 109 | 110 | 111 | [0.5.0] - 2021-10-22 112 | -------------------- 113 | 114 | Added 115 | ~~~~~ 116 | 117 | - Module level variables for the ORC library version: orc_version string and 118 | orc_version_info namedtuple. 119 | - New parameter for Writer: row_index_stride. 120 | - New read-only properties for Reader: row_index_stride and software_version. 121 | - Trino and Scritchley writer ids. 122 | - Type annotations support for ORC types. 123 | - Support for `timestamp with local time zone` type. 
124 | - New parameter for Reader and Writer: timezone. 125 | - The backported zoneinfo module dependency prior to Python 3.9. 126 | - Predicate (SearchArgument) support for filtering row groups during ORC file 127 | reads. New classes: Predicate and PredicateColumn. 128 | - New parameter for Reader: predicate. 129 | - Build for aarch64 wheels. (PR #43, contribution of @odidev) 130 | 131 | Changed 132 | ~~~~~~~ 133 | 134 | - ORC C++ Core updated to 1.7.0, and because many of the new features are not 135 | backported to the 1.6 branch, currently this is the minimum required lib 136 | version. 137 | - TimestampConverter's to_orc and from_orc methods got an extra timezone 138 | parameter, that will be bound to the same ZoneInfo object passed to the 139 | Reader or Writer via their timezone parameters during type convert. 140 | - Renamed Reader.metadata property and Writer.set_metadata method to 141 | user_metadata and set_user_metadata respectively to avoid confusion. 142 | 143 | 144 | [0.4.0] - 2021-01-11 145 | -------------------- 146 | 147 | Added 148 | ~~~~~ 149 | 150 | - Experimental Windows support. 151 | - tzdata package dependency on Windows. Automatically setting TZDIR 152 | to the path of the tzdata package's data dir after importing PyORC. 153 | 154 | Changed 155 | ~~~~~~~ 156 | - Create ORC Type from TypeDescription directly (instead of string parsing) 157 | for Writer. (PR #26, contribution of @blkerby) 158 | - Dotted column names are allowed to be used in TypeDescription.find_column_id 159 | method by escaping them with backticks. 160 | - ORC C++ Core updated to 1.6.6. 161 | 162 | Fixed 163 | ~~~~~ 164 | 165 | - Handling large negative seconds on Windows for TimestampConverter.from_orc. 166 | 167 | 168 | [0.3.0] - 2020-05-24 169 | -------------------- 170 | 171 | Added 172 | ~~~~~ 173 | 174 | - Metadata property for Reader and set_metadata for Writer to 175 | handle ORC file's metadata.
176 | - Meta info attributes like writer_id, writer_version, bytes_length, 177 | compression and compression_block_size for Reader. 178 | - New TypeDescription subclasses to represent ORC types. 179 | 180 | Changed 181 | ~~~~~~~ 182 | 183 | - Reimplemented TypeDescription in Python. 184 | - ORC C++ Core updated to 1.6.3. 185 | 186 | Fixed 187 | ~~~~~ 188 | 189 | - Converting date from ORC on systems where the system's timezone 190 | has a negative UTC offset (Issues #5) 191 | 192 | 193 | [0.2.0] - 2020-01-01 194 | -------------------- 195 | 196 | Added 197 | ~~~~~ 198 | 199 | - Converters for date, decimal and timestamp ORC types in Python and 200 | option to change them via Reader's and Writer's converters parameter. 201 | - Column object for accessing statistics about ORC columns. 202 | - An attribute to Reader for selected schema. 203 | 204 | Changed 205 | ~~~~~~~ 206 | 207 | - Use timezone-aware datetime objects (in UTC) for ORC timestamps by default. 208 | - Wrapped C++ stripe object to Python Stripe. 209 | 210 | Fixed 211 | ~~~~~ 212 | 213 | - Decrementing reference for bytes object after reading from file stream. 214 | 215 | [0.1.0] - 2019-11-16 216 | -------------------- 217 | 218 | Added 219 | ~~~~~ 220 | 221 | - A Reader object to read ORC files. 222 | - A stripe object to read only a stripe in an ORC file. 223 | - A Writer object to write ORC files. 224 | - A typedescription object to represent the ORC schema. 225 | - Support to represent a struct type either a Python tuple or a dictionary. 
226 | -------------------------------------------------------------------------------- /src/_pyorc/_pyorc.cpp: -------------------------------------------------------------------------------- 1 | #include "Reader.h" 2 | #include "Writer.h" 3 | #include "verguard.h" 4 | 5 | #include 6 | 7 | namespace py = pybind11; 8 | 9 | PYBIND11_MODULE(_pyorc, m) 10 | { 11 | m.doc() = "_pyorc c++ extension"; 12 | m.def("_orc_version", []() -> py::object { return py::cast(ORC_VERSION); }); 13 | m.def("_schema_from_string", [](std::string schema) { 14 | try { 15 | auto orcType = orc::Type::buildTypeFromString(schema); 16 | return createTypeDescription(*orcType); 17 | } catch (std::logic_error& err) { 18 | throw py::value_error(err.what()); 19 | } 20 | }); 21 | py::register_exception_translator([](std::exception_ptr p) { 22 | try { 23 | if (p) { 24 | std::rethrow_exception(p); 25 | } 26 | } catch (const orc::ParseError& e) { 27 | py::object err = py::module::import("pyorc.errors").attr("ParseError"); 28 | PyErr_SetString(err.ptr(), e.what()); 29 | } 30 | }); 31 | py::class_(m, "stripe") 32 | .def( 33 | py::init([](Reader& reader, uint64_t num) { return reader.readStripe(num); }), 34 | py::keep_alive<0, 2>()) 35 | .def("__next__", [](Stripe& s) -> py::object { return s.next(); }) 36 | .def("__iter__", [](Stripe& s) -> Stripe& { return s; }) 37 | .def("__len__", &Stripe::len) 38 | .def("read", &Stripe::read, py::arg_v("num", -1, "-1")) 39 | .def("seek", &Stripe::seek, py::arg("row"), py::arg_v("whence", 0, "0")) 40 | .def("_statistics", &Stripe::statistics) 41 | .def_property_readonly("bytes_length", [](Stripe& s) { return s.length(); }) 42 | .def_property_readonly("bytes_offset", [](Stripe& s) { return s.offset(); }) 43 | .def_property_readonly("bloom_filter_columns", 44 | [](Stripe& s) { return s.bloomFilterColumns(); }) 45 | .def_property_readonly("writer_timezone", 46 | [](Stripe& s) { return s.writerTimezone(); }) 47 | .def_readonly("current_row", &Stripe::currentRow) 48 | 
.def_readonly("row_offset", &Stripe::firstRowOfStripe); 49 | py::class_(m, "reader") 50 | .def(py::init, 53 | std::list, 54 | py::object, 55 | unsigned int, 56 | py::object, 57 | py::object, 58 | py::object>(), 59 | py::arg("fileo"), 60 | py::arg_v("batch_size", 1024, "1024"), 61 | py::arg_v("col_indices", std::list{}, "None"), 62 | py::arg_v("col_names", std::list{}, "None"), 63 | py::arg_v("timezone", py::none(), "None"), 64 | py::arg_v("struct_repr", 0, "StructRepr.TUPLE"), 65 | py::arg_v("conv", py::none(), "None"), 66 | py::arg_v("predicate", py::none(), "None"), 67 | py::arg_v("null_value", py::none(), "None")) 68 | .def("__next__", [](Reader& r) -> py::object { return r.next(); }) 69 | .def("__iter__", [](Reader& r) -> Reader& { return r; }) 70 | .def("__len__", &Reader::len) 71 | .def("read", &Reader::read, py::arg_v("num", -1, "-1")) 72 | .def("seek", &Reader::seek, py::arg("row"), py::arg_v("whence", 0, "0")) 73 | .def("_statistics", &Reader::statistics) 74 | .def_property_readonly("bytes_lengths", &Reader::bytesLengths) 75 | .def_property_readonly("compression", &Reader::compression) 76 | .def_property_readonly("compression_block_size", &Reader::compressionBlockSize) 77 | .def_property_readonly("row_index_stride", &Reader::rowIndexStride) 78 | .def_property_readonly("format_version", &Reader::formatVersion) 79 | .def_property_readonly("user_metadata", &Reader::userMetadata) 80 | .def_property_readonly("schema", &Reader::schema) 81 | .def_property_readonly("selected_schema", &Reader::selectedSchema) 82 | .def_property_readonly("num_of_stripes", 83 | [](Reader& r) { return r.numberOfStripes(); }) 84 | .def_property_readonly("writer_id", &Reader::writerId) 85 | .def_property_readonly("writer_version", &Reader::writerVersion) 86 | .def_property_readonly("software_version", &Reader::softwareVersion) 87 | .def_readonly("current_row", &Reader::currentRow); 88 | py::class_(m, "writer") 89 | .def(py::init, 98 | double, 99 | py::object, 100 | unsigned int, 101 | 
py::object, 102 | double, 103 | double, 104 | py::object, 105 | unsigned int>(), 106 | py::arg("fileo"), 107 | py::arg("schema"), 108 | py::arg_v("batch_size", 1024, "1024"), 109 | py::arg_v("stripe_size", 67108864, "67108864"), 110 | py::arg_v("row_index_stride", 10000, "10000"), 111 | py::arg_v("compression", 1, "CompressionKind.ZLIB"), 112 | py::arg_v("compression_strategy", 0, "CompressionStrategy.SPEED"), 113 | py::arg_v("compression_block_size", 65536, "65536"), 114 | py::arg_v("bloom_filter_columns", std::set{}, "None"), 115 | py::arg_v("bloom_filter_fpp", 0.05, "0.05"), 116 | py::arg_v("timezone", py::none(), "None"), 117 | py::arg_v("struct_repr", 0, "StructRepr.TUPLE"), 118 | py::arg_v("conv", py::none(), "None"), 119 | py::arg_v("padding_tolerance", 0.0, "0.0"), 120 | py::arg_v("dict_key_size_threshold", 0.0, "0.0"), 121 | py::arg_v("null_value", py::none(), "None"), 122 | py::arg_v("memory_block_size", 65536, "65536")) 123 | .def("_add_user_metadata", &Writer::addUserMetadata) 124 | .def("write", &Writer::write) 125 | .def("writerows", &Writer::writerows) 126 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 127 | .def("write_intermediate_footer", &Writer::writeIntermediateFooter) 128 | #endif 129 | .def("close", &Writer::close) 130 | .def_readonly("current_row", &Writer::currentRow); 131 | } 132 | -------------------------------------------------------------------------------- /tests/compare/test_writer_cmp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import gzip 4 | import json 5 | import os 6 | import subprocess 7 | import sys 8 | 9 | from decimal import Decimal 10 | from datetime import datetime, timezone 11 | 12 | import pyorc._pyorc 13 | from pyorc.enums import TypeKind, StructRepr 14 | from pyorc.typedescription import TypeDescription, Timestamp 15 | 16 | from conftest import output_file 17 | 18 | pytestmark = pytest.mark.skipif( 19 | sys.platform == "win32", reason="No orc-tools on Windows" 20 
| ) 21 | 22 | ORC_CONTENTS_PATH = "deps/bin/orc-contents" 23 | 24 | 25 | def transform(schema, value): 26 | if schema.kind < 8: 27 | # Primitive types, no transformation. 28 | return value 29 | elif schema.kind == TypeKind.STRUCT: 30 | return {col: transform(schema[col], field) for col, field in value.items()} 31 | elif schema.kind == TypeKind.MAP: 32 | return { 33 | keypair["key"]: transform(schema.value, keypair["value"]) 34 | for keypair in value 35 | } 36 | elif schema.kind == TypeKind.LIST: 37 | return [transform(schema.type, item) for item in value] 38 | elif schema.kind == TypeKind.TIMESTAMP: 39 | if value is None: 40 | return value 41 | try: 42 | ts = datetime.strptime(value[:26], "%Y-%m-%d %H:%M:%S") 43 | except ValueError: 44 | ts = datetime.strptime(value[:26], "%Y-%m-%d %H:%M:%S.%f") 45 | return ts.replace(tzinfo=timezone.utc) 46 | elif schema.kind == TypeKind.DATE: 47 | return datetime.strptime(value, "%Y-%m-%d").date() 48 | elif schema.kind == TypeKind.BINARY: 49 | return bytes(value) 50 | elif schema.kind == TypeKind.DECIMAL: 51 | if value is None: 52 | return value 53 | elif isinstance(value, float): 54 | return Decimal.from_float(value) 55 | else: 56 | return Decimal(value) 57 | else: 58 | return value 59 | 60 | 61 | def read_expected_json_record(path): 62 | with gzip.open(path, "rb") as fileo: 63 | for line in fileo: 64 | yield json.loads(line) 65 | 66 | 67 | def get_full_path(path): 68 | curdir = os.path.abspath(os.path.dirname(__file__)) 69 | projdir = os.path.abspath(os.path.join(curdir, os.pardir, os.pardir)) 70 | return os.path.join(projdir, "deps", "examples", "expected", path) 71 | 72 | 73 | def idfn(val): 74 | return val[:40] 75 | 76 | 77 | def create_orc_output_for_test(schema, file_out, file_in): 78 | writer = pyorc._pyorc.writer(file_out, schema, struct_repr=StructRepr.DICT) 79 | num = 0 80 | for row in read_expected_json_record(get_full_path(file_in)): 81 | orc_row = transform(schema, row) 82 | writer.write(orc_row) 83 | num += 1 84 | 
assert num == writer.current_row 85 | writer.close() 86 | 87 | 88 | TESTDATA = [ 89 | ( 90 | "TestOrcFile.test1.jsn.gz", 91 | "struct>>,list:array>,map:map>>", 92 | ), 93 | ("TestOrcFile.testDate1900.jsn.gz", "struct"), 94 | ("TestOrcFile.testDate2038.jsn.gz", "struct"), 95 | ( 96 | "TestOrcFile.testSeek.jsn.gz", 97 | "struct>>,list:array>,map:map>>", 98 | ), 99 | ("TestOrcFile.testSnappy.jsn.gz", "struct"), 100 | ( 101 | "nulls-at-end-snappy.jsn.gz", 102 | "struct<_col0:tinyint,_col1:smallint,_col2:int,_col3:bigint,_col4:float,_col5:double,_col6:boolean>", 103 | ), 104 | ( 105 | "demo-12-zlib.jsn.gz", 106 | "struct<_col0:int,_col1:string,_col2:string,_col3:string,_col4:int,_col5:string,_col6:int,_col7:int,_col8:int>", 107 | ), 108 | ] 109 | 110 | 111 | @pytest.mark.parametrize("expected,schema", TESTDATA, ids=idfn) 112 | def test_write(expected, schema, output_file): 113 | create_orc_output_for_test( 114 | TypeDescription.from_string(schema), output_file, expected 115 | ) 116 | exp_res = read_expected_json_record(get_full_path(expected)) 117 | with subprocess.Popen( 118 | [ORC_CONTENTS_PATH, output_file.name], stdout=subprocess.PIPE 119 | ) as proc: 120 | for line in proc.stdout: 121 | assert json.loads(line) == next(exp_res) 122 | with pytest.raises(StopIteration): 123 | next(exp_res) 124 | 125 | 126 | def test_write_decimal(output_file): 127 | input_filename = "decimal.jsn.gz" 128 | create_orc_output_for_test( 129 | TypeDescription.from_string("struct<_col0:decimal(10,5)>"), 130 | output_file, 131 | input_filename, 132 | ) 133 | exp_res = read_expected_json_record(get_full_path(input_filename)) 134 | with subprocess.Popen( 135 | [ORC_CONTENTS_PATH, output_file.name], stdout=subprocess.PIPE 136 | ) as proc: 137 | for line in proc.stdout: 138 | data = next(exp_res) 139 | if pyorc.orc_version_info.major >= 2 and pyorc.orc_version_info.minor > 0: 140 | # From 2.1.0, orc-content returns decimals as string to the output, 141 | # whilte the example json has floats in 
it. 142 | data["_col0"] = ( 143 | data["_col0"] if data["_col0"] is None else str(data["_col0"]) 144 | ) 145 | assert json.loads(line) == data 146 | with pytest.raises(StopIteration): 147 | next(exp_res) 148 | 149 | 150 | def test_write_timestamp(output_file): 151 | input_filename = "TestOrcFile.testTimestamp.jsn.gz" 152 | create_orc_output_for_test(Timestamp(), output_file, input_filename) 153 | exp_res = read_expected_json_record(get_full_path(input_filename)) 154 | with subprocess.Popen( 155 | [ORC_CONTENTS_PATH, output_file.name], stdout=subprocess.PIPE 156 | ) as proc: 157 | for line in proc.stdout: 158 | assert datetime.strptime( 159 | json.loads(line)[:26], "%Y-%m-%d %H:%M:%S.%f" 160 | ) == datetime.strptime(next(exp_res)[:26], "%Y-%m-%d %H:%M:%S.%f") 161 | with pytest.raises(StopIteration): 162 | next(exp_res) 163 | -------------------------------------------------------------------------------- /src/_pyorc/Writer.cpp: -------------------------------------------------------------------------------- 1 | #include "Writer.h" 2 | #include "PyORCStream.h" 3 | 4 | void 5 | setTypeAttributes(orc::Type* type, py::handle schema) 6 | { 7 | py::dict attributes(py::getattr(schema, "attributes")); 8 | for (auto attr : attributes) { 9 | type->setAttribute(py::cast(attr.first), 10 | py::cast(attr.second)); 11 | } 12 | } 13 | 14 | ORC_UNIQUE_PTR 15 | createType(py::handle schema) 16 | { 17 | orc::TypeKind kind = orc::TypeKind(py::cast(getattr(schema, "kind"))); 18 | switch (kind) { 19 | case orc::TypeKind::BOOLEAN: 20 | case orc::TypeKind::BYTE: 21 | case orc::TypeKind::SHORT: 22 | case orc::TypeKind::INT: 23 | case orc::TypeKind::LONG: 24 | case orc::TypeKind::FLOAT: 25 | case orc::TypeKind::DOUBLE: 26 | case orc::TypeKind::STRING: 27 | case orc::TypeKind::BINARY: 28 | case orc::TypeKind::TIMESTAMP: 29 | case orc::TypeKind::TIMESTAMP_INSTANT: 30 | case orc::TypeKind::DATE: { 31 | ORC_UNIQUE_PTR type = orc::createPrimitiveType(kind); 32 | setTypeAttributes(type.get(), 
schema); 33 | return type; 34 | } 35 | case orc::TypeKind::VARCHAR: 36 | case orc::TypeKind::CHAR: { 37 | ORC_UNIQUE_PTR type = orc::createCharType( 38 | kind, py::cast(getattr(schema, "max_length"))); 39 | setTypeAttributes(type.get(), schema); 40 | return type; 41 | } 42 | case orc::TypeKind::DECIMAL: { 43 | uint64_t precision = py::cast(getattr(schema, "precision")); 44 | uint64_t scale = py::cast(getattr(schema, "scale")); 45 | ORC_UNIQUE_PTR type = orc::createDecimalType(precision, scale); 46 | setTypeAttributes(type.get(), schema); 47 | return type; 48 | } 49 | case orc::TypeKind::LIST: { 50 | py::handle child = getattr(schema, "type"); 51 | ORC_UNIQUE_PTR type = orc::createListType(createType(child)); 52 | setTypeAttributes(type.get(), schema); 53 | return type; 54 | } 55 | case orc::TypeKind::MAP: { 56 | py::handle key = getattr(schema, "key"); 57 | py::handle value = getattr(schema, "value"); 58 | ORC_UNIQUE_PTR type = 59 | orc::createMapType(createType(key), createType(value)); 60 | setTypeAttributes(type.get(), schema); 61 | return type; 62 | } 63 | case orc::TypeKind::STRUCT: { 64 | ORC_UNIQUE_PTR type = orc::createStructType(); 65 | py::dict fields = getattr(schema, "fields"); 66 | for (auto item : fields) { 67 | type->addStructField((py::str)item.first, createType(item.second)); 68 | } 69 | setTypeAttributes(type.get(), schema); 70 | return type; 71 | } 72 | case orc::TypeKind::UNION: { 73 | ORC_UNIQUE_PTR type = orc::createUnionType(); 74 | py::list cont_types = getattr(schema, "cont_types"); 75 | for (auto child : cont_types) { 76 | type->addUnionChild(createType(child)); 77 | } 78 | setTypeAttributes(type.get(), schema); 79 | return type; 80 | } 81 | default: 82 | throw py::type_error("Invalid TypeKind"); 83 | } 84 | } 85 | 86 | Writer::Writer(py::object fileo, 87 | py::object schema, 88 | uint64_t batch_size, 89 | uint64_t stripe_size, 90 | uint64_t row_index_stride, 91 | int compression, 92 | int compression_strategy, 93 | uint64_t 
compression_block_size, 94 | std::set bloom_filter_columns, 95 | double bloom_filter_fpp, 96 | py::object tzone, 97 | unsigned int struct_repr, 98 | py::object conv, 99 | double padding_tolerance, 100 | double dict_key_size_threshold, 101 | py::object null_value, 102 | unsigned int memory_block_size) 103 | { 104 | currentRow = 0; 105 | batchItem = 0; 106 | ORC_UNIQUE_PTR type = createType(schema); 107 | orc::WriterOptions options; 108 | py::dict converters; 109 | 110 | if (conv.is_none()) { 111 | py::dict defaultConv = 112 | py::module::import("pyorc.converters").attr("DEFAULT_CONVERTERS"); 113 | converters = py::dict(defaultConv); 114 | } else { 115 | converters = conv; 116 | } 117 | 118 | options = options.setCompression(static_cast(compression)); 119 | options = options.setCompressionStrategy( 120 | static_cast(compression_strategy)); 121 | options = options.setCompressionBlockSize(compression_block_size); 122 | options = options.setStripeSize(stripe_size); 123 | options = options.setRowIndexStride(row_index_stride); 124 | options = options.setColumnsUseBloomFilter(bloom_filter_columns); 125 | options = options.setBloomFilterFPP(bloom_filter_fpp); 126 | options = options.setDictionaryKeySizeThreshold(dict_key_size_threshold); 127 | options = options.setPaddingTolerance(padding_tolerance); 128 | #if ORC_VERSION_AT_LEAST(2, 1, 0) 129 | options = options.setMemoryBlockSize(memory_block_size); 130 | #endif 131 | if (!tzone.is_none()) { 132 | std::string tzKey = py::cast(tzone.attr("key")); 133 | options = options.setTimezoneName(tzKey); 134 | } 135 | 136 | outStream = std::unique_ptr(new PyORCOutputStream(fileo)); 137 | writer = orc::createWriter(*type, outStream.get(), options); 138 | batchSize = batch_size; 139 | batch = writer->createRowBatch(batchSize); 140 | converter = createConverter(type.get(), struct_repr, converters, tzone, null_value); 141 | } 142 | 143 | void 144 | Writer::write(py::object row) 145 | { 146 | converter->write(batch.get(), batchItem, row); 
147 | currentRow++; 148 | batchItem++; 149 | 150 | if (batchItem == batchSize) { 151 | writer->add(*batch); 152 | converter->clear(); 153 | batchItem = 0; 154 | } 155 | } 156 | 157 | uint64_t 158 | Writer::writerows(py::iterable iter) 159 | { 160 | uint64_t rows = 0; 161 | for (auto handle : iter) { 162 | auto obj = py::cast(handle); 163 | this->write(obj); 164 | ++rows; 165 | } 166 | return rows; 167 | } 168 | 169 | void 170 | Writer::close() 171 | { 172 | if (batchItem != 0) { 173 | writer->add(*batch); 174 | converter->clear(); 175 | batchItem = 0; 176 | } 177 | writer->close(); 178 | } 179 | 180 | void 181 | Writer::addUserMetadata(py::str key, py::bytes value) 182 | { 183 | writer->addUserMetadata(key, value); 184 | } 185 | 186 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 187 | uint64_t 188 | Writer::writeIntermediateFooter() 189 | { 190 | return writer->writeIntermediateFooter(); 191 | } 192 | #endif 193 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | 4 | At this point you have an installed pyorc module. 5 | 6 | Reading 7 | ------- 8 | 9 | Let's use one of the example ORC files to open in Python:: 10 | 11 | >>> import pyorc 12 | >>> example = open("./deps/examples/demo-12-zlib.orc", "rb") 13 | >>> reader = pyorc.Reader(example) 14 | 15 | See the schema of the selected file:: 16 | 17 | >>> reader.schema 18 | 19 | 20 | The Reader's schema read-only property is a :class:`TypeDescription` object, 21 | representing the ORC file's type hierarchy. 
We can get a more human-friendly 22 | interpretation if we print its string format:: 23 | 24 | >>> str(reader.schema) 25 | 'struct<_col0:int,_col1:string,_col2:string,_col3:string,_col4:int,_col5:string,_col6:int,_col7:int,_col8:int>' 26 | 27 | We can check the number of rows in the file by calling len() on the Reader:: 28 | 29 | >>> len(reader) 30 | 1920800 31 | 32 | The Reader is an iterable object, yielding a new row after every 33 | iteration:: 34 | 35 | >>> next(reader) 36 | (1, 'M', 'M', 'Primary', 500, 'Good', 0, 0, 0) 37 | >>> next(reader) 38 | (2, 'F', 'M', 'Primary', 500, 'Good', 0, 0, 0) 39 | 40 | Iterating over the file's content to process its rows is the preferred way, 41 | but we can also read the entire file into memory with the read method. 42 | This method has an optional parameter to control the maximum number of rows 43 | to read:: 44 | 45 | >>> rows = reader.read(10000) 46 | >>> rows 47 | ... (10000, 'F', 'U', 'Advanced Degree', 1500, 'Unknown', 1, 0, 0), (10001, 'M', 'M', 'Unknown', 1500, 'Unknown', 1, 0, 0), (10002, 'F', 'M', 'Unknown', 1500, 'Unknown', 1, 0, 0)] 48 | >>> reader.read() # This call froze the interpreter for several minutes! 49 | ... (1920799, 'M', 'U', 'Unknown', 10000, 'Unknown', 6, 6, 6), (1920800, 'F', 'U', 'Unknown', 10000, 'Unknown', 6, 6, 6)] 50 | 51 | Using this optional parameter for larger ORC files is highly recommended! 52 | 53 | After all the rows are read, the Reader object has no more rows to yield.
54 | There's a seek method to jump to a specific row in the file and continue the 55 | read from that point:: 56 | 57 | >>> reader.seek(1000) 58 | 1000 59 | >>> next(reader) 60 | (1001, 'M', 'M', 'College', 7500, 'Good', 0, 0, 0) 61 | 62 | By default all fields are loaded from an ORC file, but that can be changed 63 | by passing either `column_indices` or `column_names` parameter to Reader:: 64 | 65 | >>> reader = pyorc.Reader(example, column_names=("_col0", "_col5")) 66 | >>> next(reader) 67 | (1, 'Good') 68 | 69 | We can also change the representation of a struct from tuple to dictionary:: 70 | 71 | >>> from pyorc.enums import StructRepr 72 | >>> reader = pyorc.Reader(example, column_indices=(1, 5), struct_repr=StructRepr.DICT) 73 | >>> next(reader) 74 | {'_col1': 'M', '_col5': 'Good'} 75 | 76 | Stripes 77 | ------- 78 | 79 | ORC files are divided into stripes. Stripes are independent of each other. 80 | Let's open another ORC file that has multiple stripes in it:: 81 | 82 | >>> example = open("./deps/examples/TestOrcFile.testStripeLevelStats.orc", "rb") 83 | >>> reader = pyorc.Reader(example) 84 | >>> reader.num_of_stripes 85 | 3 86 | 87 | The `num_of_stripes` property of the Reader shows how many stripes are in 88 | the file. We can read a certain stripe using the `read_stripe` method:: 89 | 90 | >>> stripe2 = reader.read_stripe(2) 91 | >>> stripe2 92 | 93 | 94 | The stripe object is also an iterable object and has the same methods for 95 | reading and seeking rows, but only within the boundaries of the selected 96 | stripe:: 97 | 98 | >>> next(stripe2) 99 | (3, 'three') 100 | >>> len(stripe2) 101 | 1000 102 | >>> len(reader) 103 | 11000 104 | >>> stripe2.row_offset 105 | 10000 106 | 107 | The `row_offset` returns the absolute position of the first row in the 108 | stripe. 109 | 110 | Filtering row groups 111 | -------------------- 112 | 113 | It is possible to skip certain records in an ORC file using simple filter 114 | predicates (or search arguments).
Setting a predicate expression to the 115 | Reader can help to exclude row groups that don't satisfy the condition 116 | during reading:: 117 | 118 | >>> example = open("./deps/examples/TestStringDictionary.testRowIndex.orc", "rb") 119 | >>> reader = pyorc.Reader(example) 120 | >>> next(reader) 121 | ('row 000000',) 122 | >>> reader = pyorc.Reader(example, predicate=pyorc.predicates.PredicateColumn(pyorc.TypeKind.STRING, "str") > "row 004096") 123 | >>> next(reader) 124 | ('row 004096',) 125 | 126 | The predicate can be used to select a single row group, but not an 127 | individual record. The size of the row group is determined by the 128 | `row_index_stride`, set during writing of the file. You can create a more 129 | complex predicate using logical expressions:: 130 | 131 | >>> pred = (PredicateColumn(TypeKind.INT, "c0") > 300) & (PredicateColumn(TypeKind.STRING, "c1") == "A") 132 | 133 | One of the comparands must always be a literal value (cannot compare two 134 | columns to each other). 135 | 136 | Writing 137 | ------- 138 | 139 | To write a new ORC file we need to open a binary file-like object and pass 140 | it to a Writer object with an ORC schema description. The schema can be a 141 | TypeDescription or a simple string ORC schema definition:: 142 | 143 | >>> output = open("./new.orc", "wb") 144 | >>> writer = pyorc.Writer(output, "struct") 145 | >>> writer 146 | 147 | 148 | We can add rows to the file with the `write` method:: 149 | 150 | >>> writer.write((0, "Test 0")) 151 | >>> writer.write((1, "Test 1")) 152 | 153 | Don't forget to close the writer to write out the necessary metadata, 154 | otherwise it won't be a valid ORC file. 155 | 156 | >>> writer.close() 157 | 158 | For simpler use, the Writer object can be used as a context manager and you 159 | can also change the struct representation to use dictionaries as rows instead 160 | of tuples as well: 161 | 162 | ..
code-block:: python 163 | 164 | with open("./new.orc", "wb") as output: 165 | with pyorc.Writer(output, "struct", struct_repr=StructRepr.DICT) as writer: 166 | writer.write({"col0": 0, "col1": "Test 0"}) 167 | 168 | 169 | Using custom converters 170 | ----------------------- 171 | 172 | It's possible to change the default converters that handle the transformations 173 | from ORC `date`, `decimal`, and `timestamp` types to Python objects, and back. 174 | To create your own converter you need to implement the :class:`ORCConverter` 175 | abstract class with two methods: ``from_orc`` and ``to_orc``. The following 176 | example returns the ORC timestamp values as seconds and nanoseconds pair: 177 | 178 | .. code-block:: python 179 | 180 | import pyorc 181 | from pyorc.converters import ORCConverter 182 | 183 | class TSConverter(ORCConverter): 184 | @staticmethod 185 | def to_orc(*args): 186 | seconds, nanoseconds, timezone = args 187 | return (seconds, nanoseconds) 188 | 189 | @staticmethod 190 | def from_orc(seconds, nanoseconds, timezone): 191 | return (seconds, nanoseconds) 192 | 193 | To use the converter you have to set the Reader's or Writer's converters 194 | parameter as a dictionary with one of the supported types as key:: 195 | 196 | data = open("./timestamps.orc", "rb") 197 | reader = pyorc.Reader(data, converters={TypeKind.TIMESTAMP: TSConverter}) 198 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | branches: 3 | include: 4 | - '*' 5 | 6 | stages: 7 | - stage: test 8 | jobs: 9 | - job: ubuntu 10 | pool: 11 | vmImage: "ubuntu-latest" 12 | strategy: 13 | matrix: 14 | Python39: 15 | python.version: '3.9' 16 | Python310: 17 | python.version: '3.10' 18 | Python311: 19 | python.version: '3.11' 20 | Python312: 21 | python.version: '3.12' 22 | Python313: 23 | python.version: '3.13' 24 | PyPy3: 25 | python.version: 
'pypy3' 26 | steps: 27 | - task: UsePythonVersion@0 28 | inputs: 29 | versionSpec: '$(python.version)' 30 | architecture: 'x64' 31 | allowUnstable: true 32 | - template: .azure-pipelines/build-run-tests.yml 33 | 34 | - job: ubuntu_eastern_timezone 35 | pool: 36 | vmImage: "ubuntu-latest" 37 | strategy: 38 | matrix: 39 | Python38: 40 | python.version: '3.11' 41 | steps: 42 | - bash: sudo timedatectl set-timezone America/New_York 43 | displayName: Set timezone 44 | - bash: date 45 | - task: UsePythonVersion@0 46 | inputs: 47 | versionSpec: '$(python.version)' 48 | architecture: 'x64' 49 | - template: .azure-pipelines/build-run-tests.yml 50 | 51 | - job: previous_orc_versions 52 | pool: 53 | vmImage: "ubuntu-latest" 54 | strategy: 55 | matrix: 56 | ORC17: 57 | orc.version: '1.7.11' 58 | ORC18: 59 | orc.version: '1.8.8' 60 | ORC19: 61 | orc.version: '1.9.5' 62 | steps: 63 | - task: UsePythonVersion@0 64 | inputs: 65 | versionSpec: '3.11' 66 | architecture: 'x64' 67 | - template: .azure-pipelines/build-run-tests.yml 68 | parameters: 69 | orc_version: '$(orc.version)' 70 | 71 | - job: macos 72 | pool: 73 | vmImage: 'macOS-latest' 74 | strategy: 75 | matrix: 76 | Python39: 77 | python.version: '3.9' 78 | Python310: 79 | python.version: '3.10' 80 | Python311: 81 | python.version: '3.11' 82 | Python312: 83 | python.version: '3.12' 84 | Python313: 85 | python.version: '3.13' 86 | steps: 87 | - task: UsePythonVersion@0 88 | inputs: 89 | versionSpec: '$(python.version)' 90 | architecture: 'x64' 91 | allowUnstable: true 92 | - template: .azure-pipelines/build-run-tests.yml 93 | 94 | - job: windows 95 | pool: 96 | vmImage: 'windows-2019' 97 | strategy: 98 | matrix: 99 | Python39: 100 | python.version: '3.9' 101 | Python310: 102 | python.version: '3.10' 103 | Python311: 104 | python.version: '3.11' 105 | Python312: 106 | python.version: '3.12' 107 | Python313: 108 | python.version: '3.13' 109 | 110 | steps: 111 | - task: UsePythonVersion@0 112 | inputs: 113 | versionSpec: 
'$(python.version)' 114 | architecture: 'x64' 115 | allowUnstable: true 116 | - template: .azure-pipelines/build-run-tests.yml 117 | parameters: 118 | windows: true 119 | 120 | - stage: build_wheels 121 | jobs: 122 | - job: manylinux_x86_64 123 | pool: 124 | vmImage: 'ubuntu-latest' 125 | steps: 126 | - template: .azure-pipelines/prepare-and-push-wheels.yml 127 | parameters: 128 | cibwStep: 129 | bash: cibuildwheel --output-dir wheelhouse . 130 | env: 131 | CIBW_BUILD_VERBOSITY: 3 132 | CIBW_ARCHS: x86_64 133 | CIBW_BUILD: "*-manylinux_*" 134 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 135 | displayName: Build wheels 136 | 137 | - job: musllinux_x86_64 138 | pool: 139 | vmImage: 'ubuntu-latest' 140 | steps: 141 | - template: .azure-pipelines/prepare-and-push-wheels.yml 142 | parameters: 143 | cibwStep: 144 | bash: cibuildwheel --output-dir wheelhouse . 145 | env: 146 | CIBW_BUILD_VERBOSITY: 3 147 | CIBW_ARCHS: x86_64 148 | CIBW_BUILD: "*-musllinux_*" 149 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 150 | displayName: Build wheels 151 | 152 | - job: manylinux_cpy_aarch64 153 | timeoutInMinutes: 165 154 | pool: 155 | vmImage: 'ubuntu-latest' 156 | steps: 157 | - template: .azure-pipelines/prepare-and-push-wheels.yml 158 | parameters: 159 | qemu: true 160 | cibwStep: 161 | bash: cibuildwheel --output-dir wheelhouse . 162 | env: 163 | CIBW_BUILD_VERBOSITY: 3 164 | CIBW_ARCHS: aarch64 165 | CIBW_BUILD: "*-manylinux_*" 166 | CIBW_SKIP: "pp* cp36-* cp37-* cp38-*" 167 | displayName: Build wheels 168 | 169 | - job: manylinux_pypy_aarch64 170 | timeoutInMinutes: 165 171 | pool: 172 | vmImage: 'ubuntu-latest' 173 | steps: 174 | - template: .azure-pipelines/prepare-and-push-wheels.yml 175 | parameters: 176 | qemu: true 177 | cibwStep: 178 | bash: cibuildwheel --output-dir wheelhouse . 
179 | env: 180 | CIBW_BUILD_VERBOSITY: 3 181 | CIBW_ARCHS: aarch64 182 | CIBW_BUILD: "*-manylinux_*" 183 | CIBW_SKIP: "cp* pp37-* pp38-*" 184 | displayName: Build wheels 185 | 186 | - job: musllinux_aarch64 187 | timeoutInMinutes: 165 188 | pool: 189 | vmImage: 'ubuntu-latest' 190 | steps: 191 | - template: .azure-pipelines/prepare-and-push-wheels.yml 192 | parameters: 193 | qemu: true 194 | cibwStep: 195 | bash: cibuildwheel --output-dir wheelhouse . 196 | env: 197 | CIBW_BUILD_VERBOSITY: 3 198 | CIBW_ARCHS: aarch64 199 | CIBW_BUILD: "*-musllinux_*" 200 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 201 | displayName: Build wheels 202 | 203 | - job: macos 204 | pool: 205 | vmImage: 'macOS-latest' 206 | steps: 207 | - template: .azure-pipelines/prepare-and-push-wheels.yml 208 | parameters: 209 | cibwStep: 210 | bash: cibuildwheel --output-dir wheelhouse . 211 | env: 212 | CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=10.13" 213 | CIBW_BUILD_VERBOSITY: 3 214 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 215 | displayName: Build wheels 216 | 217 | - job: macos_universal2 218 | pool: 219 | vmImage: 'macOS-latest' 220 | steps: 221 | - template: .azure-pipelines/prepare-and-push-wheels.yml 222 | parameters: 223 | cibwStep: 224 | bash: cibuildwheel --output-dir wheelhouse . 225 | env: 226 | CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=10.13" 227 | CIBW_BUILD_VERBOSITY: 3 228 | CMAKE_OSX_ARCHITECTURES: 'x86_64;arm64' 229 | CIBW_ARCHS_MACOS: universal2 230 | CIBW_SKIP: "cp36-* cp37-* cp38-*" 231 | displayName: Build wheels 232 | 233 | - job: windows_amd64 234 | pool: 235 | vmImage: 'windows-2019' 236 | steps: 237 | - template: .azure-pipelines/prepare-and-push-wheels.yml 238 | parameters: 239 | cibwStep: 240 | bash: cibuildwheel --output-dir wheelhouse . 
241 | env: 242 | CIBW_BUILD_VERBOSITY: 3 243 | CIBW_ARCHS: AMD64 244 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 245 | displayName: Build wheels 246 | -------------------------------------------------------------------------------- /src/pyorc/typedescription.py: -------------------------------------------------------------------------------- 1 | import re 2 | from types import MappingProxyType 3 | from typing import Dict, Mapping, Tuple 4 | 5 | from pyorc._pyorc import _schema_from_string 6 | 7 | from .enums import TypeKind 8 | 9 | 10 | class TypeDescription: 11 | name = "" 12 | kind = -1 13 | 14 | def __init__(self) -> None: 15 | self._column_id = 0 16 | self._attributes: Dict[str, str] = {} 17 | 18 | def __str__(self) -> str: 19 | return self.name 20 | 21 | @property 22 | def attributes(self) -> Dict[str, str]: 23 | return self._attributes 24 | 25 | def set_attributes(self, val) -> None: 26 | if isinstance(val, dict): 27 | if all( 28 | isinstance(key, str) and isinstance(val, str) 29 | for key, val in val.items() 30 | ): 31 | self._attributes = val 32 | else: 33 | raise TypeError( 34 | "The all keys and values in the attributes dictionary must be string" 35 | ) 36 | else: 37 | raise TypeError("The attributes must be a dictionary") 38 | 39 | @property 40 | def column_id(self) -> int: 41 | return self._column_id 42 | 43 | def set_column_id(self, val: int) -> int: 44 | self._column_id = val 45 | return self._column_id 46 | 47 | def find_column_id(self, dotted_key: str) -> int: 48 | raise KeyError(dotted_key) 49 | 50 | @staticmethod 51 | def from_string(schema: str) -> "TypeDescription": 52 | return _schema_from_string(schema) 53 | 54 | 55 | class Boolean(TypeDescription): 56 | name = "boolean" 57 | kind = TypeKind.BOOLEAN 58 | 59 | 60 | class TinyInt(TypeDescription): 61 | name = "tinyint" 62 | kind = TypeKind.BYTE 63 | 64 | 65 | class SmallInt(TypeDescription): 66 | name = "smallint" 67 | kind = TypeKind.SHORT 68 | 69 | 70 | class Int(TypeDescription): 71 | 
name = "int" 72 | kind = TypeKind.INT 73 | 74 | 75 | class BigInt(TypeDescription): 76 | name = "bigint" 77 | kind = TypeKind.LONG 78 | 79 | 80 | class Float(TypeDescription): 81 | name = "float" 82 | kind = TypeKind.FLOAT 83 | 84 | 85 | class Double(TypeDescription): 86 | name = "double" 87 | kind = TypeKind.DOUBLE 88 | 89 | 90 | class String(TypeDescription): 91 | name = "string" 92 | kind = TypeKind.STRING 93 | 94 | 95 | class Binary(TypeDescription): 96 | name = "binary" 97 | kind = TypeKind.BINARY 98 | 99 | 100 | class Timestamp(TypeDescription): 101 | name = "timestamp" 102 | kind = TypeKind.TIMESTAMP 103 | 104 | 105 | class TimestampInstant(TypeDescription): 106 | name = "timestamp with local time zone" 107 | kind = TypeKind.TIMESTAMP_INSTANT 108 | 109 | 110 | class Date(TypeDescription): 111 | name = "date" 112 | kind = TypeKind.DATE 113 | 114 | 115 | class Char(TypeDescription): 116 | name = "char" 117 | kind = TypeKind.CHAR 118 | 119 | def __init__(self, max_length: int) -> None: 120 | self.max_length = max_length 121 | super().__init__() 122 | 123 | def __str__(self) -> str: 124 | return "{name}({len})".format(name=Char.name, len=self.max_length) 125 | 126 | 127 | class VarChar(TypeDescription): 128 | name = "varchar" 129 | kind = TypeKind.VARCHAR 130 | 131 | def __init__(self, max_length: int) -> None: 132 | super().__init__() 133 | self.max_length = max_length 134 | 135 | def __str__(self) -> str: 136 | return "{name}({len})".format(name=VarChar.name, len=self.max_length) 137 | 138 | 139 | class Decimal(TypeDescription): 140 | name = "decimal" 141 | kind = TypeKind.DECIMAL 142 | 143 | def __init__(self, precision: int, scale: int) -> None: 144 | super().__init__() 145 | self.precision = precision 146 | self.scale = scale 147 | 148 | def __str__(self) -> str: 149 | return "{name}({prc},{scl})".format( 150 | name=Decimal.name, prc=self.precision, scl=self.scale 151 | ) 152 | 153 | 154 | class Union(TypeDescription): 155 | name = "uniontype" 156 | kind = 
TypeKind.UNION 157 | 158 | def __init__(self, *cont_types: TypeDescription) -> None: 159 | super().__init__() 160 | for c_types in cont_types: 161 | if not isinstance(c_types, TypeDescription): 162 | raise TypeError("Invalid container type for Union") 163 | self.__cont_types = cont_types 164 | 165 | def __str__(self): 166 | return "{name}<{types}>".format( 167 | name=Union.name, types=",".join(str(typ) for typ in self.__cont_types), 168 | ) 169 | 170 | def __getitem__(self, idx: int) -> TypeDescription: 171 | return self.__cont_types[idx] 172 | 173 | def set_column_id(self, val: int) -> int: 174 | self._column_id = val 175 | for c_type in self.__cont_types: 176 | val = c_type.set_column_id(val + 1) 177 | return val 178 | 179 | @property 180 | def cont_types(self) -> Tuple[TypeDescription, ...]: 181 | return self.__cont_types 182 | 183 | 184 | class Array(TypeDescription): 185 | name = "array" 186 | kind = TypeKind.LIST 187 | 188 | def __init__(self, cont_type: TypeDescription) -> None: 189 | super().__init__() 190 | if not isinstance(cont_type, TypeDescription): 191 | raise TypeError("Array's container type must be a TypeDescription instance") 192 | self.__type = cont_type 193 | 194 | def __str__(self) -> str: 195 | return "{name}<{type}>".format(name=Array.name, type=str(self.__type)) 196 | 197 | def set_column_id(self, val: int) -> int: 198 | self._column_id = val 199 | val = self.__type.set_column_id(val + 1) 200 | return val 201 | 202 | @property 203 | def type(self) -> TypeDescription: 204 | return self.__type 205 | 206 | 207 | class Map(TypeDescription): 208 | name = "map" 209 | kind = TypeKind.MAP 210 | 211 | def __init__(self, key: TypeDescription, value: TypeDescription) -> None: 212 | super().__init__() 213 | if not isinstance(key, TypeDescription): 214 | raise TypeError("Map's key type must be a TypeDescription instance") 215 | if not isinstance(value, TypeDescription): 216 | raise TypeError("Map's value type must be a TypeDescription instance") 217 | 
self.__key = key 218 | self.__value = value 219 | 220 | def __str__(self) -> str: 221 | return "{name}<{key},{val}>".format( 222 | name=Map.name, key=str(self.__key), val=str(self.__value) 223 | ) 224 | 225 | def set_column_id(self, val: int) -> int: 226 | self._column_id = val 227 | val = self.__key.set_column_id(val + 1) 228 | val = self.__value.set_column_id(val + 1) 229 | return val 230 | 231 | @property 232 | def key(self) -> TypeDescription: 233 | return self.__key 234 | 235 | @property 236 | def value(self) -> TypeDescription: 237 | return self.__value 238 | 239 | 240 | class Struct(TypeDescription): 241 | name = "struct" 242 | kind = TypeKind.STRUCT 243 | 244 | def __init__(self, **fields: TypeDescription) -> None: 245 | super().__init__() 246 | for fld in fields.values(): 247 | if not isinstance(fld, TypeDescription): 248 | raise TypeError( 249 | "Struct's field type must be a TypeDescription instance" 250 | ) 251 | self.__fields = fields 252 | self.set_column_id(0) 253 | 254 | def __str__(self) -> str: 255 | return "{name}<{fields}>".format( 256 | name=Struct.name, 257 | fields=",".join( 258 | "{field}:{type}".format(field=key, type=str(val)) 259 | for key, val in self.__fields.items() 260 | ), 261 | ) 262 | 263 | def __getitem__(self, key: str) -> TypeDescription: 264 | return self.__fields[key] 265 | 266 | def set_column_id(self, val: int) -> int: 267 | self._column_id = val 268 | for fld in self.__fields.values(): 269 | val = fld.set_column_id(val + 1) 270 | return val 271 | 272 | def find_column_id(self, dotted_key: str) -> int: 273 | this = self 274 | # Allow to use backtick for escaping column names with dot. 
275 | for key in re.findall(r"[^\.`]+|`[^`]*`", dotted_key): 276 | this = this[key.replace("`", "")] 277 | return this.column_id 278 | 279 | @property 280 | def fields(self) -> Mapping[str, TypeDescription]: 281 | return MappingProxyType(self.__fields) 282 | -------------------------------------------------------------------------------- /tests/test_column.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import io 4 | import math 5 | 6 | from datetime import date, datetime, timedelta, timezone 7 | from decimal import Decimal 8 | 9 | from pyorc import ( 10 | Reader, 11 | Writer, 12 | TypeKind, 13 | StructRepr, 14 | ParseError, 15 | Column, 16 | Stripe, 17 | ) 18 | 19 | 20 | @pytest.fixture 21 | def striped_orc_data(): 22 | def _init(schema, rows, bfc=tuple()): 23 | data = io.BytesIO() 24 | with Writer( 25 | data, 26 | schema, 27 | batch_size=65535, 28 | stripe_size=128, 29 | compression_block_size=128, 30 | bloom_filter_columns=bfc, 31 | memory_block_size=64, 32 | ) as writer: 33 | writer.writerows(rows) 34 | data.seek(0) 35 | return data 36 | 37 | return _init 38 | 39 | 40 | def test_init(striped_orc_data): 41 | data = striped_orc_data("struct", ((i, i * 5) for i in range(100000))) 42 | reader = Reader(data, column_indices=(1,)) 43 | stripe = Stripe(reader, 0) 44 | with pytest.raises(TypeError): 45 | _ = Column(stripe, "0") 46 | with pytest.raises(IndexError): 47 | _ = Column(stripe, 100) 48 | with pytest.raises(IndexError): 49 | _ = Column(reader, 100) 50 | with pytest.raises(IndexError): 51 | _ = Column(reader, 1) 52 | col = Column(stripe, 0) 53 | assert col is not None 54 | col = Column(reader, 0) 55 | assert col is not None 56 | 57 | 58 | def test_getitem(striped_orc_data): 59 | data = striped_orc_data("int", (i for i in range(100000))) 60 | reader = Reader(data) 61 | stripe = Stripe(reader, 0) 62 | col = reader[0] 63 | assert col is not None 64 | col = stripe[0] 65 | assert col is not None 66 | 67 | 
68 | def test_statistics_bool(striped_orc_data): 69 | data = striped_orc_data( 70 | "struct", (((True, False, None)[i % 3],) for i in range(100000)) 71 | ) 72 | reader = Reader(data) 73 | stripe = Stripe(reader, 0) 74 | stat = stripe[0].statistics 75 | assert stat["has_null"] is False 76 | assert stat["number_of_values"] == 65535 77 | assert stat["kind"] == TypeKind.STRUCT 78 | stat = stripe[1].statistics 79 | assert stat["has_null"] is True 80 | assert stat["kind"] == TypeKind.BOOLEAN 81 | assert stat["number_of_values"] == 43690 82 | assert stat["false_count"] == 21845 83 | assert stat["true_count"] == len([i for i, in stripe if i is True]) 84 | stat = reader[1].statistics 85 | assert stat["has_null"] is True 86 | assert stat["number_of_values"] == 66667 87 | assert stat["false_count"] == len([i for i, in reader if i is False]) 88 | assert stat["true_count"] == 33334 89 | assert reader[0].statistics["number_of_values"] == 100000 90 | 91 | 92 | def test_statistics_int(striped_orc_data): 93 | data = striped_orc_data("int", (i for i in range(100000))) 94 | reader = Reader(data) 95 | stripe = Stripe(reader, 0) 96 | stat = stripe[0].statistics 97 | assert stat["has_null"] is False 98 | assert stat["number_of_values"] == 65535 99 | assert stat["kind"] == TypeKind.INT 100 | assert stat["minimum"] == 0 101 | assert stat["maximum"] == 65534 102 | assert stat["sum"] == sum(i for i in range(len(stripe))) 103 | stat = reader[0].statistics 104 | assert stat["minimum"] == 0 105 | assert stat["maximum"] == 99999 106 | assert stat["sum"] == sum(i for i in range(100000)) 107 | assert reader.read_stripe(1)[0].statistics["minimum"] == 65535 108 | 109 | 110 | def test_statistics_double(striped_orc_data): 111 | data = striped_orc_data("double", (i * 0.1 for i in range(100000))) 112 | reader = Reader(data) 113 | stripe = Stripe(reader, 0) 114 | stat = stripe[0].statistics 115 | assert stat["has_null"] is False 116 | assert stat["number_of_values"] == 65535 117 | assert stat["kind"] == 
TypeKind.DOUBLE 118 | assert stat["minimum"] == 0 119 | assert math.isclose(stat["maximum"], 6553.4) 120 | assert stat["sum"] == sum(i * 0.1 for i in range(len(stripe))) 121 | stat = reader[0].statistics 122 | assert stat["minimum"] == 0 123 | assert math.isclose(stat["maximum"], 9999.9) 124 | assert stat["sum"] == sum(i * 0.1 for i in range(100000)) 125 | assert reader.read_stripe(1)[0].statistics["minimum"] == 6553.5 126 | 127 | 128 | def test_statistics_binary(striped_orc_data): 129 | data = striped_orc_data("binary", (b"\x4D\x45\x34\x01" for i in range(100000))) 130 | reader = Reader(data) 131 | stripe = Stripe(reader, 0) 132 | stat = stripe[0].statistics 133 | assert stat["has_null"] is False 134 | assert stat["kind"] == TypeKind.BINARY 135 | assert stat["number_of_values"] == 65535 136 | assert stat["total_length"] == sum(len(i) for i in stripe) 137 | stat = reader[0].statistics 138 | assert stat["total_length"] == sum(len(i) for i in reader) 139 | 140 | 141 | def test_statistics_string(striped_orc_data): 142 | data = striped_orc_data( 143 | "string", ("Test String {0}".format(i + 1) for i in range(100000)) 144 | ) 145 | reader = Reader(data) 146 | stripe = Stripe(reader, 0) 147 | stat = stripe[0].statistics 148 | assert stat["has_null"] is False 149 | assert stat["kind"] == TypeKind.STRING 150 | assert stat["number_of_values"] == 65535 151 | assert stat["total_length"] == sum(len(i) for i in stripe) 152 | assert stat["minimum"] == "Test String 1" 153 | assert stat["maximum"] == max(i for i in Stripe(reader, 0)) 154 | stat = reader[0].statistics 155 | assert stat["maximum"] == max(i for i in reader) 156 | assert reader.read_stripe(1)[0].statistics["minimum"] == "Test String 100000" 157 | 158 | 159 | def test_statistics_date(striped_orc_data): 160 | data = striped_orc_data( 161 | "date", (date(1900, 1, 1) + timedelta(days=i) for i in range(100000)) 162 | ) 163 | reader = Reader(data) 164 | stripe = Stripe(reader, 0) 165 | stat = stripe[0].statistics 166 | 
assert stat["kind"] == TypeKind.DATE 167 | assert stat["has_null"] is False 168 | assert stat["number_of_values"] == 65535 169 | assert stat["minimum"] == date(1900, 1, 1) 170 | assert stat["maximum"] == date(2079, 6, 5) 171 | stat = reader[0].statistics 172 | assert stat["maximum"] == max(i for i in reader) 173 | 174 | 175 | def test_statistics_timestamp(striped_orc_data): 176 | data = striped_orc_data( 177 | "timestamp", 178 | ( 179 | datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) + timedelta(minutes=i) 180 | for i in range(100000) 181 | ), 182 | ) 183 | reader = Reader(data) 184 | stripe = Stripe(reader, 0) 185 | stat = stripe[0].statistics 186 | assert stat["kind"] == TypeKind.TIMESTAMP 187 | assert stat["has_null"] is False 188 | assert stat["number_of_values"] == len(stripe) 189 | assert stat["minimum"] == datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) 190 | assert stat["maximum"] == max(i for i in stripe) 191 | assert stat["lower_bound"] == datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) 192 | assert stat["upper_bound"] == datetime( 193 | 2000, 2, 16, 0, 14, 0, 1000, tzinfo=timezone.utc 194 | ) 195 | stat = reader[0].statistics 196 | assert stat["maximum"] == max(i for i in reader) 197 | assert stat["upper_bound"] == datetime( 198 | 2000, 3, 10, 22, 39, 0, 1000, tzinfo=timezone.utc 199 | ) 200 | 201 | 202 | def test_statistics_decimal(striped_orc_data): 203 | data = striped_orc_data( 204 | "decimal(10,3)", 205 | (Decimal("1000.1") + Decimal((i + 100) * 0.1) for i in range(100000)), 206 | ) 207 | reader = Reader(data) 208 | stripe = Stripe(reader, 0) 209 | stat = stripe[0].statistics 210 | assert stat["kind"] == TypeKind.DECIMAL 211 | assert stat["has_null"] is False 212 | assert stat["number_of_values"] == len(stripe) 213 | assert stat["minimum"] == Decimal("1010.100") 214 | assert stat["maximum"] == Decimal("7563.500") 215 | assert stat["sum"] == sum( 216 | Decimal("1000.1") + Decimal((i + 100) * 0.1) for i in range(len(stripe)) 217 | 
).quantize(Decimal("1.000")) 218 | stat = reader[0].statistics 219 | assert stat["sum"] == sum( 220 | Decimal("1000.1") + Decimal((i + 100) * 0.1) for i in range(100000) 221 | ).quantize(Decimal("1.000")) 222 | assert reader.read_stripe(1)[0].statistics["minimum"] == Decimal("7563.600") 223 | 224 | 225 | def test_statistics_array_int(striped_orc_data): 226 | data = striped_orc_data( 227 | "struct>", 228 | (([j + i for j in range(30)],) for i in range(100000)), 229 | ) 230 | reader = Reader(data) 231 | stripe = reader.read_stripe(0) 232 | stat = stripe[2].statistics 233 | assert stripe[1].statistics["kind"] == TypeKind.LIST 234 | assert stat["kind"] == TypeKind.INT 235 | assert sum(i for col in reader.read_stripe(0) for i in col[0]) == stat["sum"] 236 | assert min(i for col in reader.read_stripe(0) for i in col[0]) == stat["minimum"] 237 | assert max(i for col in reader.read_stripe(0) for i in col[0]) == stat["maximum"] 238 | stat = reader[2].statistics 239 | assert max(i for col in reader for i in col[0]) == stat["maximum"] 240 | -------------------------------------------------------------------------------- /src/_pyorc/SearchArgument.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "SearchArgument.h" 4 | 5 | std::tuple 6 | buildLiteral(py::object column, 7 | py::object value, 8 | py::dict convDict, 9 | py::object timezoneInfo) 10 | { 11 | int colType = py::cast(column.attr("type_kind")); 12 | switch (colType) { 13 | case orc::TypeKind::BOOLEAN: 14 | if (value.is_none()) { 15 | return std::make_tuple(orc::PredicateDataType::BOOLEAN, 16 | orc::Literal(orc::PredicateDataType::BOOLEAN)); 17 | } else { 18 | return std::make_tuple(orc::PredicateDataType::BOOLEAN, 19 | orc::Literal(py::cast(value))); 20 | } 21 | case orc::TypeKind::BYTE: 22 | case orc::TypeKind::SHORT: 23 | case orc::TypeKind::INT: 24 | case orc::TypeKind::LONG: 25 | if (value.is_none()) { 26 | return 
std::make_tuple(orc::PredicateDataType::LONG, 27 | orc::Literal(orc::PredicateDataType::LONG)); 28 | } else { 29 | return std::make_tuple(orc::PredicateDataType::LONG, 30 | orc::Literal(py::cast(value))); 31 | } 32 | case orc::TypeKind::FLOAT: 33 | case orc::TypeKind::DOUBLE: 34 | if (value.is_none()) { 35 | return std::make_tuple(orc::PredicateDataType::FLOAT, 36 | orc::Literal(orc::PredicateDataType::FLOAT)); 37 | } else { 38 | return std::make_tuple(orc::PredicateDataType::FLOAT, 39 | orc::Literal(py::cast(value))); 40 | } 41 | case orc::TypeKind::CHAR: 42 | case orc::TypeKind::VARCHAR: 43 | case orc::TypeKind::STRING: { 44 | if (value.is_none()) { 45 | return std::make_tuple(orc::PredicateDataType::STRING, 46 | orc::Literal(orc::PredicateDataType::STRING)); 47 | } else { 48 | std::string str = py::cast(value); 49 | return std::make_tuple(orc::PredicateDataType::STRING, 50 | orc::Literal(str.c_str(), str.size())); 51 | } 52 | } 53 | case orc::TypeKind::DATE: { 54 | if (value.is_none()) { 55 | return std::make_tuple(orc::PredicateDataType::DATE, 56 | orc::Literal(orc::PredicateDataType::DATE)); 57 | } else { 58 | py::object idx(py::int_(static_cast(orc::TypeKind::DATE))); 59 | py::object to_orc = convDict[idx].attr("to_orc"); 60 | return std::make_tuple(orc::PredicateDataType::DATE, 61 | orc::Literal(orc::PredicateDataType::DATE, 62 | py::cast(to_orc(value)))); 63 | } 64 | } 65 | case orc::TypeKind::TIMESTAMP: 66 | case orc::TypeKind::TIMESTAMP_INSTANT: { 67 | if (value.is_none()) { 68 | return std::make_tuple(orc::PredicateDataType::TIMESTAMP, 69 | orc::Literal(orc::PredicateDataType::TIMESTAMP)); 70 | } else { 71 | py::object idx(py::int_(static_cast(orc::TypeKind::TIMESTAMP))); 72 | py::object to_orc = convDict[idx].attr("to_orc"); 73 | py::tuple res = to_orc(value, timezoneInfo); 74 | return std::make_tuple( 75 | orc::PredicateDataType::TIMESTAMP, 76 | orc::Literal(py::cast(res[0]), py::cast(res[1]))); 77 | } 78 | } 79 | case orc::TypeKind::DECIMAL: { 80 | if 
(value.is_none()) { 81 | return std::make_tuple(orc::PredicateDataType::DECIMAL, 82 | orc::Literal(orc::PredicateDataType::DECIMAL)); 83 | } else { 84 | py::object idx(py::int_(static_cast(orc::TypeKind::DECIMAL))); 85 | uint64_t precision = py::cast(column.attr("precision")); 86 | uint64_t scale = py::cast(column.attr("scale")); 87 | py::object to_orc = convDict[idx].attr("to_orc"); 88 | py::object res = to_orc(precision, scale, value); 89 | std::string strRes = py::cast(py::str(res)); 90 | return std::make_tuple(orc::PredicateDataType::DECIMAL, 91 | orc::Literal(orc::Int128(strRes), 92 | static_cast(precision), 93 | static_cast(scale))); 94 | } 95 | } 96 | default: 97 | throw py::type_error("Unsupported type for ORC Literal in predicate"); 98 | } 99 | } 100 | 101 | orc::SearchArgumentBuilder& 102 | buildSearchArgument(orc::SearchArgumentBuilder& sarg, 103 | py::tuple predVals, 104 | py::dict convDict, 105 | py::object timezoneInfo) 106 | { 107 | int opCode = py::cast(predVals[0]); 108 | switch (opCode) { 109 | case 0: /* NOT */ 110 | return buildSearchArgument( 111 | sarg.startNot(), predVals[1], convDict, timezoneInfo) 112 | .end(); 113 | case 1: /* OR */ 114 | return buildSearchArgument( 115 | buildSearchArgument( 116 | sarg.startOr(), predVals[1], convDict, timezoneInfo), 117 | predVals[2], 118 | convDict, 119 | timezoneInfo) 120 | .end(); 121 | case 2: /* AND */ 122 | return buildSearchArgument( 123 | buildSearchArgument( 124 | sarg.startAnd(), predVals[1], convDict, timezoneInfo), 125 | predVals[2], 126 | convDict, 127 | timezoneInfo) 128 | .end(); 129 | case 3: { /* EQ */ 130 | py::object colName = predVals[1].attr("name"); 131 | py::object colIdx = predVals[1].attr("index"); 132 | std::tuple res = 133 | buildLiteral(predVals[1], predVals[2], convDict, timezoneInfo); 134 | if (!colName.is_none()) { 135 | return sarg.equals( 136 | py::cast(colName), std::get<0>(res), std::get<1>(res)); 137 | } else if (!colIdx.is_none()) { 138 | return sarg.equals( 139 | 
py::cast(colIdx), std::get<0>(res), std::get<1>(res)); 140 | } else { 141 | throw py::type_error("Either name or index parameter must be set"); 142 | } 143 | } 144 | case 4: { /* LT */ 145 | py::object colName = predVals[1].attr("name"); 146 | py::object colIdx = predVals[1].attr("index"); 147 | std::tuple res = 148 | buildLiteral(predVals[1], predVals[2], convDict, timezoneInfo); 149 | if (!colName.is_none()) { 150 | return sarg.lessThan( 151 | py::cast(colName), std::get<0>(res), std::get<1>(res)); 152 | } else if (!colIdx.is_none()) { 153 | return sarg.lessThan( 154 | py::cast(colIdx), std::get<0>(res), std::get<1>(res)); 155 | } else { 156 | throw py::type_error("Either name or index parameter must be set"); 157 | } 158 | } 159 | case 5: { /* LE */ 160 | py::object colName = predVals[1].attr("name"); 161 | py::object colIdx = predVals[1].attr("index"); 162 | std::tuple res = 163 | buildLiteral(predVals[1], predVals[2], convDict, timezoneInfo); 164 | if (!colName.is_none()) { 165 | return sarg.lessThanEquals( 166 | py::cast(colName), std::get<0>(res), std::get<1>(res)); 167 | } else if (!colIdx.is_none()) { 168 | return sarg.lessThanEquals( 169 | py::cast(colIdx), std::get<0>(res), std::get<1>(res)); 170 | } else { 171 | throw py::type_error("Either name or index parameter must be set"); 172 | } 173 | } 174 | default: 175 | throw py::type_error("Invalid operation on Literal in predicate"); 176 | } 177 | return sarg; 178 | } 179 | 180 | std::unique_ptr 181 | createSearchArgument(py::object predicate, py::dict convDict, py::object timezoneInfo) 182 | { 183 | std::unique_ptr builder = 184 | orc::SearchArgumentFactory::newBuilder(); 185 | try { 186 | py::tuple predVals = predicate.attr("values"); 187 | buildSearchArgument(*builder.get(), predVals, convDict, timezoneInfo); 188 | return builder->build(); 189 | } catch (py::error_already_set& err) { 190 | if (err.matches(PyExc_AttributeError)) { 191 | std::string strbuf("Invalid predicate: "); 192 | 
strbuf.append(py::cast(py::repr(predicate)).c_str()); 193 | throw py::type_error(strbuf.c_str()); 194 | } else { 195 | throw; 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /tests/compare/test_reader_cmp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import json 4 | import gzip 5 | import os 6 | import math 7 | import subprocess 8 | import sys 9 | import platform 10 | 11 | try: 12 | import zoneinfo 13 | except ImportError: 14 | from backports import zoneinfo 15 | 16 | from pyorc import TypeKind, StructRepr 17 | import pyorc._pyorc 18 | 19 | ORC_METADATA_PATH = "deps/bin/orc-metadata" 20 | 21 | 22 | def traverse_json_row(schema, value, parent=""): 23 | if schema.kind < 8: 24 | # Primitive types, no transformation. 25 | yield schema.kind, parent, value 26 | elif schema.kind == TypeKind.STRUCT: 27 | for key, val in value.items(): 28 | yield from traverse_json_row( 29 | schema[key], val, "{0}.{1}".format(parent, key) 30 | ) 31 | elif schema.kind == TypeKind.MAP: 32 | for keypair in value: 33 | yield from traverse_json_row( 34 | schema.value, 35 | keypair["value"], 36 | "{0}['{1}']".format(parent, keypair["key"]), 37 | ) 38 | elif schema.kind == TypeKind.LIST: 39 | for idx, item in enumerate(value): 40 | yield from traverse_json_row( 41 | schema.type, item, "{0}[{1}]".format(parent, idx) 42 | ) 43 | elif schema.kind == TypeKind.UNION: 44 | yield schema.kind, parent, value["value"] if value is not None else None 45 | 46 | 47 | def traverse_orc_row(schema, value, parent=""): 48 | if schema.kind < 8 or schema.kind == TypeKind.UNION: 49 | # Primitive types, no transformation. 
def get_full_path(path):
    """Resolve *path* against the project's ``deps/examples`` directory."""
    here = os.path.abspath(os.path.dirname(__file__))
    project_root = os.path.abspath(os.path.join(here, os.pardir, os.pardir))
    return os.path.join(project_root, "deps", "examples", path)
exp_res = gzip.open(get_full_path(expected), "rb") 99 | with open(get_full_path(example), "rb") as fileo: 100 | orc_res = pyorc._pyorc.reader( 101 | fileo, timezone=zoneinfo.ZoneInfo("UTC"), struct_repr=StructRepr.DICT 102 | ) 103 | length = 0 104 | for num, line in enumerate(exp_res): 105 | json_row = traverse_json_row(orc_res.schema, json.loads(line)) 106 | orc_row = traverse_orc_row(orc_res.schema, next(orc_res)) 107 | for _, exp_path, exp_val in json_row: 108 | otype, act_path, act_val = next(orc_row) 109 | assert exp_path == act_path 110 | if exp_val is None: 111 | assert act_val is None 112 | elif otype == TypeKind.BINARY: 113 | assert exp_val == [ 114 | int(i) for i in act_val 115 | ], "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 116 | elif otype == TypeKind.DOUBLE or otype == TypeKind.FLOAT: 117 | assert math.isclose( 118 | exp_val, 119 | act_val, 120 | abs_tol=0.005, # Extermely permissive float comparing. 121 | ), "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 122 | elif otype == TypeKind.TIMESTAMP: 123 | assert exp_val == act_val.strftime("%Y-%m-%d %H:%M:%S.%f").rstrip( 124 | "0" 125 | ), "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 126 | elif otype == TypeKind.DATE: 127 | assert ( 128 | exp_val == act_val.isoformat() 129 | ), "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 130 | elif otype == TypeKind.DECIMAL: 131 | assert exp_val == float( 132 | act_val 133 | ), "Row #{num}, Column: `{path}`".format( # Not the best comparing. 
def test_metadata_read():
    """User metadata: the empty file has none; metaData example carries two keys."""
    with open(get_full_path("TestOrcFile.emptyFile.orc"), "rb") as fobj:
        reader = pyorc._pyorc.reader(fobj, struct_repr=StructRepr.DICT)
        assert reader.user_metadata == {}
    with open(get_full_path("TestOrcFile.metaData.orc"), "rb") as fobj:
        reader = pyorc._pyorc.reader(fobj, struct_repr=StructRepr.DICT)
        metadata = reader.user_metadata
        assert metadata["clobber"] == b"\x05\x07\x0b\r\x11\x13"
        assert metadata["my.meta"] == b"\x01\x02\x03\x04\x05\x06\x07\xff\xfe\x7f\x80"


def test_format_version():
    """ORC file format version is exposed as a (major, minor) tuple."""
    for name, version in (("demo-11-zlib.orc", (0, 11)), ("demo-12-zlib.orc", (0, 12))):
        with open(get_full_path(name), "rb") as fobj:
            assert pyorc._pyorc.reader(fobj).format_version == version


def test_writer_id():
    """Files produced by the Java writer report ORC_JAVA_WRITER."""
    with open(get_full_path("demo-12-zlib.orc"), "rb") as fobj:
        assert pyorc.reader.Reader(fobj).writer_id == "ORC_JAVA_WRITER"


def test_writer_version():
    """The writer version number differs between example files."""
    for name, expected in (("demo-12-zlib.orc", 1), ("decimal.orc", 0)):
        with open(get_full_path(name), "rb") as fobj:
            assert pyorc.reader.Reader(fobj).writer_version == expected
stdout=subprocess.PIPE, 189 | ) as proc: 190 | expected_metadata = json.load(proc.stdout) 191 | with open(test_data, "rb") as fileo: 192 | res = pyorc.reader.Reader(fileo) 193 | assert str(res.schema) == expected_metadata["type"] 194 | assert len(res) == expected_metadata["rows"] 195 | assert res.num_of_stripes == expected_metadata["stripe count"] 196 | assert ( 197 | f"{res.format_version[0]}.{res.format_version[1]}" 198 | == expected_metadata["format"] 199 | ) 200 | assert res.software_version == expected_metadata["software version"] 201 | assert res.compression.name.lower() == expected_metadata["compression"] 202 | assert res.compression_block_size == expected_metadata["compression block"] 203 | assert res.row_index_stride == expected_metadata["row index stride"] 204 | assert res.user_metadata == expected_metadata["user metadata"] 205 | assert res.bytes_lengths["content_length"] == expected_metadata["content"] 206 | assert res.bytes_lengths["file_footer_length"] == expected_metadata["footer"] 207 | assert res.bytes_lengths["file_length"] == expected_metadata["file length"] 208 | assert ( 209 | res.bytes_lengths["file_postscript_length"] 210 | == expected_metadata["postscript"] 211 | ) 212 | assert ( 213 | res.bytes_lengths["stripe_statistics_length"] 214 | == expected_metadata["stripe stats"] 215 | ) 216 | assert ( 217 | res.read_stripe(0).bytes_length == expected_metadata["stripes"][0]["length"] 218 | ) 219 | assert ( 220 | res.read_stripe(0).bytes_offset == expected_metadata["stripes"][0]["offset"] 221 | ) 222 | for col, expected_attr in expected_metadata["attributes"].items(): 223 | col_type = res.schema 224 | for item in col.split("."): 225 | if item == "_elem": 226 | col_type = col_type.type 227 | elif item == "_key": 228 | col_type = col_type.key 229 | elif item == "_value": 230 | col_type = col_type.value 231 | else: 232 | col_type = col_type[item] 233 | assert col_type.attributes == expected_attr 234 | 
import io
import os
import pathlib
import platform
import sys
import shutil
import subprocess
import urllib.request
import tarfile
import logging

from setuptools import setup

from pybind11.setup_helpers import Pybind11Extension, build_ext


logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)


# C++ translation units of the native `pyorc._pyorc` extension module.
SOURCES = [
    "_pyorc.cpp",
    "Converter.cpp",
    "PyORCStream.cpp",
    "Reader.cpp",
    "SearchArgument.cpp",
    "Writer.cpp",
]

# Headers listed as dependencies so a header change triggers a rebuild.
HEADERS = [
    "Converter.h",
    "PyORCStream.h",
    "Reader.h",
    "SearchArgument.h",
    "Writer.h",
    "verguard.h",
]

# Static libraries to link against; names differ between MSVC and Unix builds.
if sys.platform.startswith("win32"):
    LIBS = [
        "orc",
        "libprotobuf",
        "libprotoc",
        "lz4",
        "zstd_static",
        "zlibstatic",
        "snappy",
    ]
else:
    LIBS = ["orc", "protobuf", "protoc", "lz4", "zstd", "z", "snappy", "pthread"]

# Allow overriding the link list with a comma-separated environment variable.
LIBS = os.getenv("PYORC_LIBRARIES", ",".join(LIBS)).split(",")

EXT_MODULES = [
    Pybind11Extension(
        "pyorc._pyorc",
        sources=[os.path.join("src", "_pyorc", src) for src in SOURCES],
        depends=[os.path.join("src", "_pyorc", hdr) for hdr in HEADERS],
        libraries=LIBS,
        include_dirs=[os.path.join("deps", "include")],
        library_dirs=[os.path.join("deps", "lib")],
    )
]


class BuildExt(build_ext):
    """
    A custom build extension for build ORC Core library and handling
    debug build on Windows.
    """

    user_options = build_ext.user_options + [
        ("orc-version=", None, "the version of the ORC C++ Core library"),
        ("output-dir=", None, "the output directory"),
        ("source-url=", None, "the HTTP url for downloading the ORC source"),
        ("download-only", None, "just download and extract the ORC source"),
        ("skip-orc-build", None, "skip building ORC C++ Core library"),
    ]

    boolean_options = build_ext.boolean_options + [
        "download-only",
        "skip-orc-build",
    ]

    def initialize_options(self) -> None:
        """Set default values for options."""
        super().initialize_options()
        self.orc_version = "2.1.0"
        self.output_dir = "deps"
        self.source_url = "https://archive.apache.org/dist/orc/"
        self.download_only = False
        self.skip_orc_build = False

    def finalize_options(self) -> None:
        # Workaround to set options with environment variables,
        # because pip fails to pass parameters to build_ext.
        if os.getenv("PYORC_DEBUG", 0):
            self.debug = True
        if os.getenv("PYORC_SKIP_ORC_BUILD", 0):
            self.skip_orc_build = True
        self.orc_version = os.getenv("PYORC_LIB_VERSION", self.orc_version)
        super().finalize_options()

    def _download_source(self) -> None:
        """Download the ORC release tarball and extract it into output_dir.

        The archive is buffered in memory before extraction; the tarball is
        always closed, even when extraction fails.
        """
        tmp_tar = io.BytesIO()
        url = "{url}orc-{ver}/orc-{ver}.tar.gz".format(
            url=self.source_url, ver=self.orc_version
        )
        with urllib.request.urlopen(url) as src:
            logging.info("Download ORC release from: %s", url)
            tmp_tar.write(src.read())
        tmp_tar.seek(0)
        logging.info("Extract archives in: %s", self.output_dir)
        # Use a context manager so the tarfile is closed on extraction errors,
        # and the PEP 706 "data" filter (Python 3.12+) to guard against
        # path-traversal entries in the downloaded archive.
        with tarfile.open(fileobj=tmp_tar, mode="r:gz") as tar_src:
            if hasattr(tarfile, "data_filter"):
                tar_src.extractall(self.output_dir, filter="data")
            else:
                tar_src.extractall(self.output_dir)

    @staticmethod
    def _get_build_envs() -> dict:
        """Return a copy of the environment with PIC flags on non-Windows."""
        env = os.environ.copy()

        if sys.platform != "win32":
            # Static ORC libs get linked into a shared extension module.
            env["CFLAGS"] = "-fPIC"
            env["CXXFLAGS"] = "-fPIC"

        return env
    def _build_with_cmake(self) -> str:
        """Configure and build the ORC C++ library with cmake.

        Returns the build directory path so _build_orc_lib can locate the
        CPack output.
        """
        build_type = "DEBUG" if self.debug else "RELEASE"

        cmake_args = [
            f"-DCMAKE_BUILD_TYPE={build_type}",
            "-DBUILD_JAVA=OFF",
            "-DBUILD_LIBHDFSPP=OFF",
            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
        ]
        if sys.platform == "win32":
            # Link the static MSVC runtime into the ORC libraries.
            cmake_args.append("-DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded")
        if not self.debug or sys.platform == "win32":
            # Skip building tools and tests.
            cmake_args.append("-DBUILD_TOOLS=OFF")
            cmake_args.append("-DBUILD_CPP_TESTS=OFF")
        env = self._get_build_envs()
        build_dir = os.path.join(
            self.output_dir, "orc-{ver}".format(ver=self.orc_version), "build"
        )
        if not os.path.exists(build_dir):
            os.makedirs(build_dir)
        logging.info("Build libraries with cmake")
        cmake_cmd = ["cmake", ".."] + cmake_args
        logging.info("Cmake command: %s" % cmake_cmd)
        subprocess.check_call(cmake_cmd, cwd=build_dir, env=env)
        if sys.platform == "win32":
            # MSVC builds go through cmake's build driver; PACKAGE runs CPack.
            subprocess.check_call(
                [
                    "cmake",
                    "--build",
                    ".",
                    "--config",
                    build_type,
                    "--target",
                    "PACKAGE",
                ],
                cwd=build_dir,
                env=env,
            )
        else:
            # Parallel make; os.cpu_count() may return None in containers.
            j_flag = f"-j{os.cpu_count() or 1}"
            subprocess.check_call(["make", j_flag, "package"], cwd=build_dir, env=env)
        return build_dir

    def _build_orc_lib(self):
        """Build the ORC library and move the packaged artifacts to output_dir."""
        logging.info("Build ORC C++ Core library")
        build_dir = self._build_with_cmake()
        plat = (
            sys.platform.title()
            if not sys.platform.startswith("win32")
            # Change platform title on Windows depending on arch (32/64bit)
            else sys.platform.title().replace("32", platform.architecture()[0][:2])
        )
        # CPack stages its TGZ payload under _CPack_Packages/<plat>/TGZ/.
        pack_dir = os.path.join(
            build_dir,
            "_CPack_Packages",
            plat,
            "TGZ",
            f"ORC-{self.orc_version}-{plat}",
        )
        logging.info(
            "Move artifacts from '%s' to the '%s' folder" % (pack_dir, self.output_dir)
        )
        try:
            shutil.move(os.path.join(pack_dir, "include"), self.output_dir)
            # Some distros package the static libs under lib64 instead of lib.
            lib_dir = (
                "lib64" if os.path.exists(os.path.join(pack_dir, "lib64")) else "lib"
            )
            shutil.move(
                os.path.join(pack_dir, lib_dir), os.path.join(self.output_dir, "lib")
            )
            if self.debug and not sys.platform.startswith("win32"):
                # Debug builds also keep the ORC tools and example files,
                # which the test suite reads from deps/.
                shutil.move(os.path.join(pack_dir, "bin"), self.output_dir)
                shutil.move(
                    os.path.join(
                        self.output_dir,
                        f"orc-{self.orc_version}",
                        "examples",
                    ),
                    self.output_dir,
                )
        except Exception as exc:
            # Best-effort: a partially-moved tree is logged, not fatal.
            logging.warning(exc)

    def get_version_macros(self):
        """Return (name, int) macro pairs for the ORC major/minor/patch version."""
        parts = self.orc_version.split(".")
        return (
            ("ORC_VERSION_MAJOR", int(parts[0])),
            ("ORC_VERSION_MINOR", int(parts[1])),
            ("ORC_VERSION_PATCH", int(parts[2])),
        )

    def build_extensions(self):
        """Download/build the ORC library as needed, then build the extension."""
        if not self.skip_orc_build:
            orc_lib = os.path.join(
                self.output_dir,
                "lib",
                "orc.lib" if sys.platform.startswith("win32") else "liborc.a",
            )
            # Only download when the extracted source tree is missing.
            if not os.path.isdir(
                os.path.join(self.output_dir, "orc-{ver}".format(ver=self.orc_version))
            ):
                self._download_source()

            if self.download_only:
                logging.info("Only downloaded the ORC library source. Skip build_ext")
                return

            # Only build when the static library is not already present.
            if not os.path.exists(orc_lib):
                self._build_orc_lib()

        if sys.platform.startswith("win32") and self.debug:
            # MSVC debug zlib carries a 'd' suffix.
            self.extensions[0].libraries = [
                lib if lib != "zlibstatic" else "zlibstaticd"
                for lib in self.extensions[0].libraries
            ]
        self.extensions[0].define_macros.extend(self.get_version_macros())
        super().build_extensions()


CURRDIR = pathlib.Path(__file__).resolve().parent
with open(CURRDIR / "README.rst") as file:
    LONG_DESC = file.read()

# Get version number from the module's __init__.py file.
# Take the value of the first `__version__ = "..."` assignment.
with open(CURRDIR / "src" / "pyorc" / "__init__.py") as src:
    VER = [
        line.split('"')[1] for line in src.readlines() if line.startswith("__version__")
    ][0]

setup(
    name="pyorc",
    version=VER,
    description="Python module for reading and writing Apache ORC file format.",
    author="noirello",
    author_email="noirello@gmail.com",
    url="https://github.com/noirello/pyorc",
    long_description=LONG_DESC,
    long_description_content_type="text/x-rst",
    license="Apache License, Version 2.0",
    ext_modules=EXT_MODULES,
    package_dir={"pyorc": "src/pyorc"},
    packages=["pyorc"],
    package_data={"pyorc": ["py.typed", "_pyorc.pyi"]},
    include_package_data=True,
    cmdclass={"build_ext": BuildExt},
    keywords=["python3", "orc", "apache-orc"],
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: System Administrators",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: C++",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
    ],
    # NOTE(review): python_requires says >=3.6 while the classifiers list only
    # 3.9-3.13; the backports.zoneinfo marker below implies <3.9 is still
    # supported - confirm which floor is intended.
    python_requires=">=3.6",
    install_requires=[
        'tzdata >= 2020.5 ; sys_platform == "win32"',
        'backports.zoneinfo >= 0.2.1 ; python_version < "3.9"',
    ],
)
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /tests/test_writer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import io 4 | import math 5 | import os 6 | from datetime import date, datetime, timezone 7 | from decimal import Decimal 8 | 9 | try: 10 | import zoneinfo as zi 11 | except ImportError: 12 | from backports import zoneinfo as zi 13 | 14 | from pyorc import ( 15 | Writer, 16 | Reader, 17 | TypeDescription, 18 | ParseError, 19 | TypeKind, 20 | StructRepr, 21 | CompressionKind, 22 | orc_version, 23 | orc_version_info, 24 | ) 25 | from pyorc.converters import ORCConverter 26 | 27 | from conftest import output_file, NullValue 28 | 29 | 30 | def test_open_file(output_file): 31 | output_file.close() 32 | with open(output_file.name, mode="wt") as fp: 33 | with pytest.raises(ParseError): 34 | _ = Writer(fp, "int") 35 | with open(output_file.name, "rb") as fp: 36 | with pytest.raises(io.UnsupportedOperation): 37 | _ = Writer(fp, "int") 38 | with open(output_file.name, mode="wb") as fp: 39 | writer = Writer(fp, "int") 40 | 
assert isinstance(writer, Writer) 41 | with pytest.raises(TypeError): 42 | _ = Writer(0, "int") 43 | 44 | 45 | def test_init(): 46 | data = io.BytesIO() 47 | with pytest.raises(TypeError): 48 | _ = Writer(data, 0) 49 | with pytest.raises(TypeError): 50 | _ = Writer(data, "int", batch_size=-1) 51 | with pytest.raises(TypeError): 52 | _ = Writer(data, "int", batch_size="fail") 53 | with pytest.raises(TypeError): 54 | _ = Writer(data, "int", batch_size=1000, stripe_size=-1) 55 | with pytest.raises(TypeError): 56 | _ = Writer(data, "int", batch_size=1000, stripe_size="fail") 57 | with pytest.raises(ValueError): 58 | _ = Writer(data, "int", batch_size=1000, stripe_size=5000, compression=-1) 59 | with pytest.raises(ValueError): 60 | _ = Writer(data, "int", batch_size=1000, stripe_size=5000, compression="wrong") 61 | with pytest.raises(ValueError): 62 | _ = Writer( 63 | data, 64 | "int", 65 | batch_size=1000, 66 | stripe_size=5000, 67 | compression=0, 68 | compression_strategy=-1, 69 | ) 70 | with pytest.raises(ValueError): 71 | _ = Writer( 72 | data, 73 | "int", 74 | batch_size=1000, 75 | stripe_size=5000, 76 | compression=0, 77 | compression_strategy="fail", 78 | ) 79 | with pytest.raises(TypeError): 80 | _ = Writer( 81 | data, 82 | "int", 83 | batch_size=1000, 84 | stripe_size=5000, 85 | compression=0, 86 | compression_strategy=0, 87 | compression_block_size=-1, 88 | ) 89 | with pytest.raises(ValueError): 90 | _ = Writer( 91 | data, 92 | "int", 93 | batch_size=1000, 94 | stripe_size=5000, 95 | compression=0, 96 | compression_strategy=0, 97 | compression_block_size=1, 98 | bloom_filter_columns=["0", 1, 3.4], 99 | ) 100 | with pytest.raises(KeyError): 101 | _ = Writer( 102 | data, 103 | "int", 104 | batch_size=1000, 105 | stripe_size=5000, 106 | compression=0, 107 | compression_strategy=0, 108 | compression_block_size=1, 109 | bloom_filter_columns=["0"], 110 | ) 111 | with pytest.raises(TypeError): 112 | _ = Writer( 113 | data, 114 | "int", 115 | batch_size=1000, 116 | 
def test_write():
    """Round-trip a few struct rows through Writer and read them back."""
    data = io.BytesIO()
    # NOTE(review): this schema string looks truncated in this copy ("struct"
    # without a field list); the records imply something like
    # struct<col0:int,col1:string,col2:double> - confirm against the original.
    writer = Writer(data, "struct")
    records = [(1, "Test A", 2.13), (2, "Test B", 0.123213), (3, "Test C", 123.011234)]
    for rec in records:
        writer.write(rec)
    writer.close()
    data.seek(0)
    reader = Reader(data)
    assert reader.read() == records


# (orc_type, value) pairs where `value` does not match the column type.
TESTDATA = [
    ("string", 0),
    ("string", b"\x10\x13"),
    ("int", "str example"),
    ("bigint", 3.14),
    ("binary", "str example"),
    ("binary", 12),
    ("float", "str example"),
    ("double", b"\x42\x32"),
    ("boolean", "str example"),
    ("timestamp", "str example"),
    ("timestamp", 102112),
    ("date", "str example"),
    ("date", 123),
    ("decimal(10,5)", "str example"),
    ("decimal(36,8)", 1024),
]


@pytest.mark.parametrize("orc_type,value", TESTDATA)
def test_write_wrong_primitive_type(orc_type, value):
    """Writing a Python value of the wrong type must raise TypeError."""
    data = io.BytesIO()
    writer = Writer(data, orc_type)
    with pytest.raises(TypeError):
        writer.write(value)
text", "Another text", None, "Onemore"]), 193 | ("binary", [b"\x10\x13\x45\x95\xa4", b"\x34\x56\x45", None, b"\44\x23\x34\xa2"]), 194 | ("int", [100, None, 1231, 1234]), 195 | ("bigint", [3123213123, 12321344, 1231238384, None]), 196 | ("float", [3.14, 2.1, None, 5.5]), 197 | ("double", [3.14159265359, None, 4.12345678, 4.863723423]), 198 | ("boolean", [None, False, True, False]), 199 | ( 200 | "timestamp", 201 | [ 202 | datetime(2019, 4, 19, 12, 58, 59, tzinfo=timezone.utc), 203 | datetime(1914, 6, 28, 10, 45, 0, tzinfo=timezone.utc), 204 | None, 205 | datetime(2001, 3, 12, 10, 45, 21, 12, tzinfo=timezone.utc), 206 | ], 207 | ), 208 | ("date", [date(1909, 12, 8), None, date(2038, 10, 11), date(2019, 11, 11)]), 209 | ( 210 | "decimal(10,7)", 211 | [None, Decimal("0.999999"), Decimal("123.4567890"), Decimal("99.1780000")], 212 | ), 213 | ( 214 | "decimal(38,6)", 215 | [Decimal("999989898.1234"), Decimal("1.245678e24"), None, Decimal("1.2145e28")], 216 | ), 217 | ] 218 | 219 | 220 | @pytest.mark.parametrize("orc_type,values", TESTDATA) 221 | def test_write_primitive_type(orc_type, values): 222 | data = io.BytesIO() 223 | writer = Writer(data, orc_type) 224 | for rec in values: 225 | writer.write(rec) 226 | writer.close() 227 | 228 | data.seek(0) 229 | reader = Reader(data) 230 | if orc_type == "float": 231 | result = reader.read() 232 | assert len(result) == len(values) 233 | for res, exp in zip(result, values): 234 | if exp is None: 235 | assert res is None 236 | else: 237 | assert math.isclose(res, exp, rel_tol=1e-07, abs_tol=0.0) 238 | else: 239 | assert reader.read() == values 240 | 241 | 242 | TESTDATA = [ 243 | ("map", "string"), 244 | ("map", False), 245 | ("map", ["a", "b", "c"]), 246 | ("map", {"0": 0, "1": 1}), 247 | ("array", 0), 248 | ("array", [False, True, False]), 249 | ("array", "false"), 250 | ("uniontype", "string"), 251 | ("uniontype", 2.4), 252 | ("uniontype", [0, 2]), 253 | ("struct", "string"), 254 | ("struct", 0), 255 | ("struct", [0, 1, 2]), 
256 | ("struct", (0,)), 257 | ("struct", {"col0": "a", "col1": 0}), 258 | ] 259 | 260 | 261 | @pytest.mark.parametrize("orc_type,value", TESTDATA) 262 | def test_write_wrong_complex_type(orc_type, value): 263 | data = io.BytesIO() 264 | writer = Writer(data, orc_type) 265 | with pytest.raises( 266 | (TypeError, ValueError) 267 | ): # Dict construction might raise ValueError as well. 268 | writer.write(value) 269 | 270 | 271 | TESTDATA = [ 272 | ( 273 | "map", 274 | [{"a": "b", "c": "d"}, {"e": "f", "g": "h", "i": "j"}, None, {"k": "l"}], 275 | ), 276 | ( 277 | "map", 278 | [ 279 | {"zero": 0, "one": 1}, 280 | None, 281 | {"two": 2, "tree": 3}, 282 | {"one": 1, "two": 2, "nill": None}, 283 | ], 284 | ), 285 | ("array", [[0, 1, 2, 3], [4, 5, 6, 7, 8], None, [9, 10, 11, 12]]), 286 | ( 287 | "array", 288 | [ 289 | ["First text", "Second text", "Third text", None], 290 | None, 291 | ["Fourth text", "Fifth text", "Sixth text"], 292 | ["Seventh text", "Last text"], 293 | ], 294 | ), 295 | ("uniontype", ["string", 1, "text", 2, None]), 296 | ( 297 | "struct", 298 | [ 299 | {"col0": 0, "col1": "String"}, 300 | {"col0": 1, "col1": "String 2"}, 301 | None, 302 | {"col0": 2, "col1": None}, 303 | ], 304 | ), 305 | ] 306 | 307 | 308 | @pytest.mark.parametrize("orc_type,values", TESTDATA) 309 | def test_write_complex_type(orc_type, values): 310 | data = io.BytesIO() 311 | writer = Writer(data, orc_type, struct_repr=StructRepr.DICT) 312 | for rec in values: 313 | writer.write(rec) 314 | writer.close() 315 | 316 | data.seek(0) 317 | reader = Reader(data, struct_repr=StructRepr.DICT) 318 | assert reader.read() == values 319 | 320 | 321 | TESTDATA = [ 322 | ("int", 42), 323 | ("bigint", 560000000000001), 324 | ("float", 3.14), 325 | ("double", math.e), 326 | ("string", "test"), 327 | ("binary", b"\x23\x45\x45"), 328 | ("varchar(4)", "four"), 329 | ("timestamp", datetime(2019, 11, 10, 12, 59, 59, 100, tzinfo=timezone.utc)), 330 | ("date", date(2010, 9, 1)), 331 | ("decimal(10,0)", 
Decimal("1000000000")), 332 | ("array", [0, 1, 2, 3]), 333 | ("map", {"test": "example"}), 334 | ("struct", (0, "test")), 335 | ] 336 | 337 | 338 | @pytest.mark.parametrize("orc_type,value", TESTDATA) 339 | def test_write_nones(orc_type, value): 340 | data = io.BytesIO() 341 | writer = Writer(data, orc_type, batch_size=20) 342 | for _ in range(100): 343 | writer.write(value) 344 | for _ in range(100): 345 | writer.write(None) 346 | writer.close() 347 | 348 | data.seek(0) 349 | reader = Reader(data, batch_size=30) 350 | non_nones = reader.read(100) 351 | nones = reader.read(100) 352 | assert len(reader) == 200 353 | if orc_type in ("float", "double"): 354 | assert math.isclose(non_nones[0], value, rel_tol=1e-07, abs_tol=0.0) 355 | assert math.isclose(non_nones[-1], value, rel_tol=1e-07, abs_tol=0.0) 356 | else: 357 | assert non_nones[0] == value 358 | assert non_nones[-1] == value 359 | assert all(row is not None for row in non_nones) 360 | assert all(row is None for row in nones) 361 | 362 | 363 | def test_context_manager(): 364 | data = io.BytesIO() 365 | records = [ 366 | {"col0": 1, "col1": "Test A", "col2": 2.13}, 367 | {"col0": 2, "col1": "Test B", "col2": 0.123213}, 368 | {"col0": 3, "col1": "Test C", "col2": 123.011234}, 369 | ] 370 | with Writer( 371 | data, "struct", struct_repr=StructRepr.DICT 372 | ) as writer: 373 | for rec in records: 374 | writer.write(rec) 375 | data.seek(0) 376 | reader = Reader(data, struct_repr=StructRepr.DICT) 377 | assert reader.read() == records 378 | 379 | 380 | def test_current_row(): 381 | data = io.BytesIO() 382 | writer = Writer(data, "struct") 383 | assert writer.current_row == 0 384 | writer.write((0, "Test A", 0.0001)) 385 | assert writer.current_row == 1 386 | for i in range(10): 387 | writer.write((i, "Test A", 0.0001)) 388 | assert writer.current_row == 11 389 | writer.close() 390 | data.seek(0) 391 | reader = Reader(data) 392 | assert writer.current_row == len(reader) 393 | 394 | 395 | def test_schema(): 396 | 
schema_str = "struct" 397 | data = io.BytesIO() 398 | writer = Writer(data, schema_str) 399 | 400 | assert str(writer.schema) == schema_str 401 | with pytest.raises(AttributeError): 402 | writer.schema = "fail" 403 | with pytest.raises(AttributeError): 404 | del writer.schema 405 | 406 | schema = writer.schema 407 | del writer 408 | assert isinstance(schema, TypeDescription) 409 | assert schema.kind == TypeKind.STRUCT 410 | 411 | 412 | def test_writerows(): 413 | data = io.BytesIO() 414 | writer = Writer(data, "int") 415 | res = writer.writerows([]) 416 | assert res == 0 417 | rows = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) 418 | res = writer.writerows(rows) 419 | writer.close() 420 | assert res == len(rows) 421 | 422 | data.seek(0) 423 | reader = Reader(data) 424 | assert list(rows) == reader.read() 425 | 426 | 427 | def test_struct_repr(): 428 | data = io.BytesIO() 429 | writer = Writer(data, "struct") 430 | with pytest.raises(TypeError): 431 | writer.write({"a": 1}) 432 | writer = Writer(data, "struct", struct_repr=StructRepr.DICT) 433 | with pytest.raises(TypeError): 434 | writer.write((1,)) 435 | with pytest.raises(TypeError): 436 | writer.write({"a": "b"}) 437 | 438 | 439 | class TestConverter(ORCConverter): 440 | @staticmethod 441 | def to_orc(obj, timezone): 442 | seconds, nanoseconds = obj 443 | return (seconds, nanoseconds) 444 | 445 | @staticmethod 446 | def from_orc(seconds, nanoseconds, timezone): 447 | pass 448 | 449 | 450 | def test_converter(): 451 | data = io.BytesIO() 452 | seconds = 1500000 453 | nanoseconds = 101000 454 | exp_date = date(2000, 1, 1) 455 | record = ((seconds, nanoseconds), exp_date) 456 | with Writer( 457 | data, 458 | "struct", 459 | converters={TypeKind.TIMESTAMP: TestConverter}, 460 | ) as writer: 461 | writer.write(record) 462 | 463 | data.seek(0) 464 | reader = Reader(data) 465 | assert next(reader) == ( 466 | datetime.fromtimestamp(seconds, timezone.utc).replace( 467 | microsecond=nanoseconds // 1000 468 | ), 469 | exp_date, 470 | ) 
471 | 472 | 473 | def test_user_metadata(): 474 | random_val = os.urandom(64) 475 | data = io.BytesIO() 476 | with Writer(data, "int") as writer: 477 | writer.set_user_metadata( 478 | test="test1".encode("UTF-8"), meta=b"\x30\x40\x50\x60", val=random_val 479 | ) 480 | writer.set_user_metadata(test="test2".encode("UTF-8")) 481 | with pytest.raises(TypeError): 482 | writer.set_user_metadata(meta="string") 483 | reader = Reader(data) 484 | assert len(reader) == 0 485 | assert reader.user_metadata == { 486 | "test": "test2".encode("UTF-8"), 487 | "meta": b"\x30\x40\x50\x60", 488 | "val": random_val, 489 | } 490 | 491 | 492 | @pytest.mark.parametrize( 493 | "kind", (CompressionKind.NONE, CompressionKind.ZLIB, CompressionKind.ZSTD) 494 | ) 495 | def test_compression(kind): 496 | data = io.BytesIO() 497 | with Writer(data, "struct", compression=kind) as writer: 498 | writer.writerows((num, "ABCDEFG", 0.12) for num in range(50000)) 499 | data.seek(0) 500 | reader = Reader(data) 501 | assert reader.compression == kind 502 | for idx, row in enumerate(reader): 503 | assert row == (idx, "ABCDEFG", 0.12) 504 | 505 | 506 | @pytest.mark.parametrize( 507 | "schema,attrs", 508 | ( 509 | (TypeDescription.from_string("int"), {"a": "1", "b": "2"}), 510 | (TypeDescription.from_string("struct"), {"test": "attribute"}), 511 | ), 512 | ) 513 | def test_attributes(schema, attrs): 514 | data = io.BytesIO() 515 | schema.set_attributes(attrs) 516 | writer = Writer(data, schema) 517 | writer.close() 518 | reader = Reader(data) 519 | assert len(reader) == 0 520 | assert reader.schema.attributes == attrs 521 | 522 | 523 | @pytest.mark.parametrize( 524 | "schema,writer_tz,reader_tz,input,expected", 525 | [ 526 | ( 527 | "struct", 528 | zi.ZoneInfo("UTC"), 529 | zi.ZoneInfo("UTC"), 530 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 531 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 532 | ), 533 | ( 534 | "struct", 535 | zi.ZoneInfo("Asia/Tokyo"), 536 | 
zi.ZoneInfo("UTC"), 537 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("Asia/Tokyo")), 538 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 539 | ), 540 | ( 541 | "struct", 542 | zi.ZoneInfo("America/Los_Angeles"), 543 | zi.ZoneInfo("America/New_York"), 544 | datetime(2014, 12, 12, 6, 0, 0, tzinfo=zi.ZoneInfo("America/Los_Angeles")), 545 | datetime(2014, 12, 12, 6, 0, 0, tzinfo=zi.ZoneInfo("America/New_York")), 546 | ), 547 | ( 548 | "struct", 549 | zi.ZoneInfo("America/Los_Angeles"), 550 | zi.ZoneInfo("America/New_York"), 551 | datetime(2014, 12, 12, 6, 0, 0, tzinfo=zi.ZoneInfo("America/Los_Angeles")), 552 | datetime(2014, 12, 12, 9, 0, 0, tzinfo=zi.ZoneInfo("America/New_York")), 553 | ), 554 | ( 555 | "struct", 556 | zi.ZoneInfo("UTC"), 557 | zi.ZoneInfo("UTC"), 558 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 559 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 560 | ), 561 | ( 562 | "struct", 563 | zi.ZoneInfo("Asia/Tokyo"), 564 | zi.ZoneInfo("UTC"), 565 | datetime(2021, 10, 10, 3, 0, 0, tzinfo=zi.ZoneInfo("Asia/Tokyo")), 566 | datetime(2021, 10, 9, 18, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 567 | ), 568 | ( 569 | "struct", 570 | zi.ZoneInfo("Europe/Berlin"), 571 | zi.ZoneInfo("Europe/London"), 572 | datetime(2021, 10, 31, 3, 0, 0, tzinfo=zi.ZoneInfo("Europe/Berlin")), 573 | datetime(2021, 10, 31, 2, 0, 0, tzinfo=zi.ZoneInfo("Europe/London")), 574 | ), 575 | ], 576 | ) 577 | def test_timestamp_with_timezones(schema, writer_tz, reader_tz, input, expected): 578 | data = io.BytesIO() 579 | with Writer(data, schema, timezone=writer_tz) as writer: 580 | writer.write((input,)) 581 | reader = Reader(data, timezone=reader_tz) 582 | output = next(reader)[0] 583 | assert output == expected 584 | 585 | 586 | TESTDATA = [ 587 | ("int", 42), 588 | ("bigint", 560000000000001), 589 | ("float", 3.14), 590 | ("double", math.e), 591 | ("string", "test"), 592 | ("binary", b"\x23\x45\x45"), 593 | ("varchar(4)", "four"), 594 | 
("timestamp", datetime(2019, 11, 10, 12, 59, 59, 100, tzinfo=timezone.utc)), 595 | ("date", date(2010, 9, 1)), 596 | ("decimal(10,0)", Decimal("1000000000")), 597 | ("array", [0, 1, 2, 3]), 598 | ("map", {"test": "example"}), 599 | ("struct", (0, "test")), 600 | ] 601 | 602 | 603 | @pytest.mark.parametrize("orc_type,value", TESTDATA) 604 | def test_write_custom_null_value(orc_type, value): 605 | data = io.BytesIO() 606 | with Writer(data, orc_type, null_value=NullValue()) as writer: 607 | writer.write(value) 608 | writer.write(NullValue()) 609 | reader = Reader(data) 610 | if orc_type in ("float", "double"): 611 | assert math.isclose(next(reader), value, rel_tol=1e-07, abs_tol=0.0) 612 | else: 613 | assert next(reader) == value 614 | assert next(reader) is None 615 | 616 | 617 | @pytest.mark.skipif( 618 | orc_version_info.major <= 1 and orc_version_info.minor < 9, 619 | reason=f"write_intermediate_footer is unsupported for {orc_version}", 620 | ) 621 | def test_write_intermediate_footer(): 622 | data = io.BytesIO() 623 | writer = Writer( 624 | data, 625 | "int", 626 | stripe_size=1024, 627 | compression_block_size=1024, 628 | memory_block_size=512, 629 | ) 630 | writer.writerows(range(65536)) 631 | with pytest.raises(ParseError): 632 | _ = Reader(data) 633 | offset = writer.write_intermediate_footer() 634 | assert isinstance(offset, int) 635 | assert offset > 0 636 | reader = Reader(data) 637 | assert reader.bytes_lengths["file_length"] == offset 638 | assert len(reader) == 65536 639 | assert reader.read()[-1] == 65535 640 | data.seek(offset) 641 | writer.close() 642 | reader = Reader(data) 643 | assert len(reader) == 65536 644 | assert reader.bytes_lengths["file_length"] >= offset 645 | -------------------------------------------------------------------------------- /src/_pyorc/Reader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "PyORCStream.h" 4 | #include "Reader.h" 5 | #include 
"SearchArgument.h" 6 | 7 | using namespace py::literals; 8 | 9 | py::dict 10 | createAttributeDict(const orc::Type& orcType) 11 | { 12 | py::dict result; 13 | for (std::string key : orcType.getAttributeKeys()) { 14 | result[key.c_str()] = py::str(orcType.getAttributeValue(key).c_str()); 15 | } 16 | return result; 17 | } 18 | 19 | py::object 20 | createTypeDescription(const orc::Type& orcType) 21 | { 22 | py::object typeModule = py::module::import("pyorc.typedescription"); 23 | int kind = static_cast(orcType.getKind()); 24 | py::object attrDict = createAttributeDict(orcType); 25 | switch (kind) { 26 | case orc::BOOLEAN: { 27 | py::object typeDesc = typeModule.attr("Boolean")(); 28 | typeDesc.attr("set_attributes")(attrDict); 29 | return typeDesc; 30 | } 31 | case orc::BYTE: { 32 | py::object typeDesc = typeModule.attr("TinyInt")(); 33 | typeDesc.attr("set_attributes")(attrDict); 34 | return typeDesc; 35 | } 36 | case orc::SHORT: { 37 | py::object typeDesc = typeModule.attr("SmallInt")(); 38 | typeDesc.attr("set_attributes")(attrDict); 39 | return typeDesc; 40 | } 41 | case orc::INT: { 42 | py::object typeDesc = typeModule.attr("Int")(); 43 | typeDesc.attr("set_attributes")(attrDict); 44 | return typeDesc; 45 | } 46 | case orc::LONG: { 47 | py::object typeDesc = typeModule.attr("BigInt")(); 48 | typeDesc.attr("set_attributes")(attrDict); 49 | return typeDesc; 50 | } 51 | case orc::FLOAT: { 52 | py::object typeDesc = typeModule.attr("Float")(); 53 | typeDesc.attr("set_attributes")(attrDict); 54 | return typeDesc; 55 | } 56 | case orc::DOUBLE: { 57 | py::object typeDesc = typeModule.attr("Double")(); 58 | typeDesc.attr("set_attributes")(attrDict); 59 | return typeDesc; 60 | } 61 | case orc::STRING: { 62 | py::object typeDesc = typeModule.attr("String")(); 63 | typeDesc.attr("set_attributes")(attrDict); 64 | return typeDesc; 65 | } 66 | case orc::BINARY: { 67 | py::object typeDesc = typeModule.attr("Binary")(); 68 | typeDesc.attr("set_attributes")(attrDict); 69 | return 
typeDesc; 70 | } 71 | case orc::TIMESTAMP: { 72 | py::object typeDesc = typeModule.attr("Timestamp")(); 73 | typeDesc.attr("set_attributes")(attrDict); 74 | return typeDesc; 75 | } 76 | case orc::TIMESTAMP_INSTANT: { 77 | py::object typeDesc = typeModule.attr("TimestampInstant")(); 78 | typeDesc.attr("set_attributes")(attrDict); 79 | return typeDesc; 80 | } 81 | case orc::DATE: { 82 | py::object typeDesc = typeModule.attr("Date")(); 83 | typeDesc.attr("set_attributes")(attrDict); 84 | return typeDesc; 85 | } 86 | case orc::CHAR: { 87 | py::object typeDesc = 88 | typeModule.attr("Char")(py::cast(orcType.getMaximumLength())); 89 | typeDesc.attr("set_attributes")(attrDict); 90 | return typeDesc; 91 | } 92 | case orc::VARCHAR: { 93 | py::object typeDesc = 94 | typeModule.attr("VarChar")(py::cast(orcType.getMaximumLength())); 95 | typeDesc.attr("set_attributes")(attrDict); 96 | return typeDesc; 97 | } 98 | case orc::DECIMAL: { 99 | py::object typeDesc = typeModule.attr("Decimal")( 100 | "precision"_a = py::cast(orcType.getPrecision()), 101 | "scale"_a = py::cast(orcType.getScale())); 102 | typeDesc.attr("set_attributes")(attrDict); 103 | return typeDesc; 104 | } 105 | case orc::LIST: { 106 | py::object typeDesc = 107 | typeModule.attr("Array")(createTypeDescription(*orcType.getSubtype(0))); 108 | typeDesc.attr("set_attributes")(attrDict); 109 | return typeDesc; 110 | } 111 | case orc::MAP: { 112 | py::object typeDesc = typeModule.attr("Map")( 113 | "key"_a = createTypeDescription(*orcType.getSubtype(0)), 114 | "value"_a = createTypeDescription(*orcType.getSubtype(1))); 115 | typeDesc.attr("set_attributes")(attrDict); 116 | return typeDesc; 117 | } 118 | case orc::UNION: { 119 | py::tuple args(orcType.getSubtypeCount()); 120 | for (size_t i = 0; i < orcType.getSubtypeCount(); ++i) { 121 | args[i] = createTypeDescription(*orcType.getSubtype(i)); 122 | } 123 | py::object typeDesc = typeModule.attr("Union")(*args); 124 | typeDesc.attr("set_attributes")(attrDict); 125 | 
return typeDesc; 126 | } 127 | case orc::STRUCT: { 128 | py::dict fields; 129 | for (size_t i = 0; i < orcType.getSubtypeCount(); ++i) { 130 | auto key = orcType.getFieldName(i); 131 | fields[key.c_str()] = createTypeDescription(*orcType.getSubtype(i)); 132 | } 133 | py::object typeDesc = typeModule.attr("Struct")(**fields); 134 | typeDesc.attr("set_attributes")(attrDict); 135 | return typeDesc; 136 | } 137 | default: 138 | throw py::type_error("Invalid TypeKind"); 139 | } 140 | } 141 | 142 | py::object 143 | ORCFileLikeObject::next() 144 | { 145 | while (true) { 146 | if (batchItem == 0) { 147 | if (!rowReader->next(*batch)) { 148 | throw py::stop_iteration(); 149 | } 150 | converter->reset(*batch); 151 | } 152 | if (batchItem < batch->numElements) { 153 | py::object val = converter->toPython(batchItem); 154 | ++batchItem; 155 | ++currentRow; 156 | return val; 157 | } else { 158 | batchItem = 0; 159 | } 160 | } 161 | } 162 | 163 | py::list 164 | ORCFileLikeObject::read(int64_t num) 165 | { 166 | int64_t i = 0; 167 | py::list res; 168 | if (num < -1) { 169 | throw py::value_error("Read length must be positive or -1"); 170 | } 171 | try { 172 | while (true) { 173 | if (num != -1 && i == num) { 174 | return res; 175 | } 176 | res.append(this->next()); 177 | ++i; 178 | } 179 | } catch (py::stop_iteration&) { 180 | return res; 181 | } 182 | } 183 | 184 | uint64_t 185 | ORCFileLikeObject::seek(int64_t row, uint16_t whence) 186 | { 187 | uint64_t start = 0; 188 | switch (whence) { 189 | case 0: 190 | start = firstRowOfStripe; 191 | if (row < 0) { 192 | throw py::value_error("Invalid value for row"); 193 | } 194 | break; 195 | case 1: 196 | start = currentRow + firstRowOfStripe; 197 | break; 198 | case 2: 199 | start = this->len() + firstRowOfStripe; 200 | break; 201 | default: 202 | throw py::value_error("Invalid value for whence"); 203 | break; 204 | } 205 | rowReader->seekToRow(start + row); 206 | batchItem = 0; 207 | currentRow = rowReader->getRowNumber() - 
firstRowOfStripe; 208 | return currentRow; 209 | } 210 | 211 | const orc::Type* 212 | ORCFileLikeObject::findColumnType(const orc::Type* type, uint64_t columnIndex) const 213 | { 214 | if (type->getColumnId() == columnIndex) { 215 | return type; 216 | } else { 217 | for (size_t i = 0; i < type->getSubtypeCount(); ++i) { 218 | auto* subtype = type->getSubtype(i); 219 | if (subtype->getColumnId() <= columnIndex && 220 | subtype->getMaximumColumnId() >= columnIndex) { 221 | return ORCFileLikeObject::findColumnType(subtype, columnIndex); 222 | } 223 | } 224 | throw py::index_error("column not found"); 225 | } 226 | } 227 | 228 | py::object 229 | ORCFileLikeObject::convertTimestampMillis(int64_t millisec) const 230 | { 231 | py::object idx(py::int_(static_cast(orc::TIMESTAMP))); 232 | py::object from_orc = convDict[idx].attr("from_orc"); 233 | int64_t seconds = millisec / 1000; 234 | int64_t nanosecs = std::abs(millisec % 1000) * 1000 * 1000; 235 | return from_orc(seconds, nanosecs, timezoneInfo); 236 | } 237 | 238 | py::dict 239 | ORCFileLikeObject::buildStatistics(const orc::Type* type, 240 | const orc::ColumnStatistics* stats) const 241 | { 242 | py::dict result; 243 | int64_t typeKind = static_cast(type->getKind()); 244 | result["kind"] = typeKind; 245 | result["has_null"] = py::cast(stats->hasNull()); 246 | result["number_of_values"] = py::cast(stats->getNumberOfValues()); 247 | switch (typeKind) { 248 | case orc::BOOLEAN: { 249 | auto* boolStat = dynamic_cast(stats); 250 | if (boolStat->hasCount()) { 251 | result["false_count"] = py::cast(boolStat->getFalseCount()); 252 | result["true_count"] = py::cast(boolStat->getTrueCount()); 253 | } 254 | return result; 255 | } 256 | case orc::BYTE: 257 | case orc::INT: 258 | case orc::LONG: 259 | case orc::SHORT: { 260 | auto* intStat = dynamic_cast(stats); 261 | if (intStat->hasMinimum()) { 262 | result["minimum"] = py::cast(intStat->getMinimum()); 263 | } 264 | if (intStat->hasMaximum()) { 265 | result["maximum"] = 
py::cast(intStat->getMaximum()); 266 | } 267 | if (intStat->hasSum()) { 268 | result["sum"] = py::cast(intStat->getSum()); 269 | } 270 | return result; 271 | } 272 | case orc::STRUCT: 273 | case orc::MAP: 274 | case orc::LIST: 275 | case orc::UNION: 276 | return result; 277 | case orc::FLOAT: 278 | case orc::DOUBLE: { 279 | auto* doubleStat = dynamic_cast(stats); 280 | if (doubleStat->hasMinimum()) { 281 | result["minimum"] = py::cast(doubleStat->getMinimum()); 282 | } 283 | if (doubleStat->hasMaximum()) { 284 | result["maximum"] = py::cast(doubleStat->getMaximum()); 285 | } 286 | if (doubleStat->hasSum()) { 287 | result["sum"] = py::cast(doubleStat->getSum()); 288 | } 289 | return result; 290 | } 291 | case orc::BINARY: { 292 | auto* binaryStat = dynamic_cast(stats); 293 | if (binaryStat->hasTotalLength()) { 294 | result["total_length"] = py::cast(binaryStat->getTotalLength()); 295 | } 296 | return result; 297 | } 298 | case orc::STRING: 299 | case orc::CHAR: 300 | case orc::VARCHAR: { 301 | auto* strStat = dynamic_cast(stats); 302 | if (strStat->hasMinimum()) { 303 | result["minimum"] = py::cast(strStat->getMinimum()); 304 | } 305 | if (strStat->hasMaximum()) { 306 | result["maximum"] = py::cast(strStat->getMaximum()); 307 | } 308 | if (strStat->hasTotalLength()) { 309 | result["total_length"] = py::cast(strStat->getTotalLength()); 310 | } 311 | return result; 312 | } 313 | case orc::DATE: { 314 | auto* dateStat = dynamic_cast(stats); 315 | py::object idx(py::int_(static_cast(orc::DATE))); 316 | py::object from_orc = convDict[idx].attr("from_orc"); 317 | if (dateStat->hasMinimum()) { 318 | result["minimum"] = from_orc(dateStat->getMinimum()); 319 | } 320 | if (dateStat->hasMaximum()) { 321 | result["maximum"] = from_orc(dateStat->getMaximum()); 322 | } 323 | return result; 324 | } 325 | case orc::TIMESTAMP: 326 | case orc::TIMESTAMP_INSTANT: { 327 | auto* timeStat = dynamic_cast(stats); 328 | if (timeStat->hasMinimum()) { 329 | result["minimum"] = 
convertTimestampMillis(timeStat->getMinimum()); 330 | } 331 | if (timeStat->hasMaximum()) { 332 | result["maximum"] = convertTimestampMillis(timeStat->getMaximum()); 333 | } 334 | if (timeStat->hasLowerBound()) { 335 | result["lower_bound"] = 336 | convertTimestampMillis(timeStat->getLowerBound()); 337 | } 338 | if (timeStat->hasUpperBound()) { 339 | result["upper_bound"] = 340 | convertTimestampMillis(timeStat->getUpperBound()); 341 | } 342 | return result; 343 | } 344 | case orc::DECIMAL: { 345 | auto* decStat = dynamic_cast(stats); 346 | py::object idx(py::int_(static_cast(orc::DECIMAL))); 347 | py::object from_orc = convDict[idx].attr("from_orc"); 348 | if (decStat->hasMinimum()) { 349 | result["minimum"] = from_orc(decStat->getMinimum().toString()); 350 | } 351 | if (decStat->hasMaximum()) { 352 | result["maximum"] = from_orc(decStat->getMaximum().toString()); 353 | } 354 | if (decStat->hasSum()) { 355 | result["sum"] = from_orc(decStat->getSum().toString()); 356 | } 357 | return result; 358 | } 359 | default: 360 | return result; 361 | } 362 | } 363 | 364 | Reader::Reader(py::object fileo, 365 | uint64_t batch_size, 366 | std::list col_indices, 367 | std::list col_names, 368 | py::object tzone, 369 | unsigned int struct_repr, 370 | py::object conv, 371 | py::object predicate, 372 | py::object null_value) 373 | { 374 | orc::ReaderOptions readerOpts; 375 | batchItem = 0; 376 | currentRow = 0; 377 | firstRowOfStripe = 0; 378 | structKind = struct_repr; 379 | nullValue = null_value; 380 | if (!col_indices.empty() && !col_names.empty()) { 381 | throw py::value_error( 382 | "Either col_indices or col_names can be set to select columns"); 383 | } 384 | if (!col_indices.empty()) { 385 | rowReaderOpts = rowReaderOpts.include(col_indices); 386 | } 387 | if (!col_names.empty()) { 388 | rowReaderOpts = rowReaderOpts.include(col_names); 389 | } 390 | if (!tzone.is_none()) { 391 | std::string tzKey = py::cast(tzone.attr("key")); 392 | rowReaderOpts = 
rowReaderOpts.setTimezoneName(tzKey); 393 | } 394 | timezoneInfo = tzone; 395 | if (conv.is_none()) { 396 | py::dict defaultConv = 397 | py::module::import("pyorc.converters").attr("DEFAULT_CONVERTERS"); 398 | convDict = py::dict(defaultConv); 399 | } else { 400 | convDict = conv; 401 | } 402 | if (!predicate.is_none()) { 403 | rowReaderOpts = rowReaderOpts.searchArgument( 404 | std::move(createSearchArgument(predicate, convDict, timezoneInfo))); 405 | } 406 | reader = orc::createReader( 407 | std::unique_ptr(new PyORCInputStream(fileo)), readerOpts); 408 | try { 409 | batchSize = batch_size; 410 | rowReader = reader->createRowReader(rowReaderOpts); 411 | batch = rowReader->createRowBatch(batchSize); 412 | converter = createConverter( 413 | &rowReader->getSelectedType(), structKind, convDict, timezoneInfo, nullValue); 414 | } catch (orc::ParseError& err) { 415 | throw py::value_error(err.what()); 416 | } 417 | } 418 | 419 | py::dict 420 | Reader::bytesLengths() const 421 | { 422 | py::dict res; 423 | res["content_length"] = reader->getContentLength(); 424 | res["file_footer_length"] = reader->getFileFooterLength(); 425 | res["file_postscript_length"] = reader->getFilePostscriptLength(); 426 | res["file_length"] = reader->getFileLength(); 427 | res["stripe_statistics_length"] = reader->getStripeStatisticsLength(); 428 | return res; 429 | } 430 | 431 | uint64_t 432 | Reader::compression() const 433 | { 434 | return static_cast(reader->getCompression()); 435 | } 436 | 437 | uint64_t 438 | Reader::compressionBlockSize() const 439 | { 440 | return reader->getCompressionSize(); 441 | } 442 | 443 | uint64_t 444 | Reader::rowIndexStride() const 445 | { 446 | return reader->getRowIndexStride(); 447 | } 448 | 449 | py::tuple 450 | Reader::formatVersion() const 451 | { 452 | py::tuple res(2); 453 | orc::FileVersion ver = reader->getFormatVersion(); 454 | res[0] = py::cast(ver.getMajor()); 455 | res[1] = py::cast(ver.getMinor()); 456 | return res; 457 | } 458 | 459 | uint64_t 
460 | Reader::len() const 461 | { 462 | return reader->getNumberOfRows(); 463 | } 464 | 465 | uint64_t 466 | Reader::numberOfStripes() const 467 | { 468 | return reader->getNumberOfStripes(); 469 | } 470 | 471 | uint32_t 472 | Reader::writerId() const 473 | { 474 | return reader->getWriterIdValue(); 475 | } 476 | 477 | uint32_t 478 | Reader::writerVersion() const 479 | { 480 | return reader->getWriterVersion(); 481 | } 482 | 483 | std::string 484 | Reader::softwareVersion() const 485 | { 486 | return reader->getSoftwareVersion(); 487 | } 488 | 489 | std::unique_ptr 490 | Reader::readStripe(uint64_t idx) 491 | { 492 | if (idx >= reader->getNumberOfStripes()) { 493 | throw py::index_error("stripe index out of range"); 494 | } 495 | return std::unique_ptr(new Stripe(*this, idx, reader->getStripe(idx))); 496 | } 497 | 498 | py::object 499 | Reader::schema() 500 | { 501 | return createTypeDescription(reader->getType()); 502 | } 503 | 504 | py::object 505 | Reader::selectedSchema() 506 | { 507 | return createTypeDescription(rowReader->getSelectedType()); 508 | } 509 | 510 | py::tuple 511 | Reader::statistics(uint64_t columnIndex) 512 | { 513 | try { 514 | py::tuple result = py::tuple(1); 515 | std::unique_ptr stats = 516 | reader->getColumnStatistics(columnIndex); 517 | result[0] = this->buildStatistics( 518 | this->findColumnType(&rowReader->getSelectedType(), columnIndex), 519 | stats.get()); 520 | return result; 521 | } catch (std::logic_error& err) { 522 | throw py::index_error(err.what()); 523 | } 524 | } 525 | 526 | py::dict 527 | Reader::userMetadata() 528 | { 529 | py::dict result; 530 | for (std::string key : reader->getMetadataKeys()) { 531 | result[key.c_str()] = py::bytes(reader->getMetadataValue(key)); 532 | } 533 | return result; 534 | } 535 | 536 | Stripe::Stripe(const Reader& reader_, 537 | uint64_t idx, 538 | std::unique_ptr stripe) 539 | : reader(reader_) 540 | { 541 | batchItem = 0; 542 | currentRow = 0; 543 | stripeIndex = idx; 544 | stripeInfo = 
std::move(stripe); 545 | convDict = reader.getConverterDict(); 546 | timezoneInfo = reader.getTimeZoneInfo(); 547 | rowReaderOpts = reader.getRowReaderOptions(); 548 | rowReaderOpts = 549 | rowReaderOpts.range(stripeInfo->getOffset(), stripeInfo->getLength()); 550 | rowReader = reader.getORCReader().createRowReader(rowReaderOpts); 551 | batch = rowReader->createRowBatch(reader.getBatchSize()); 552 | converter = createConverter(&rowReader->getSelectedType(), 553 | reader.getStructKind(), 554 | convDict, 555 | timezoneInfo, 556 | reader.getNullValue()); 557 | firstRowOfStripe = rowReader->getRowNumber() + 1; 558 | } 559 | 560 | py::tuple 561 | Stripe::bloomFilterColumns() 562 | { 563 | int64_t idx = 0; 564 | std::set empty = {}; 565 | std::map bfCols = 566 | reader.getORCReader().getBloomFilters(stripeIndex, empty); 567 | py::tuple result(bfCols.size()); 568 | for (auto const& col : bfCols) { 569 | result[idx] = py::cast(col.first); 570 | ++idx; 571 | } 572 | return result; 573 | } 574 | 575 | uint64_t 576 | Stripe::len() const 577 | { 578 | return stripeInfo->getNumberOfRows(); 579 | } 580 | 581 | uint64_t 582 | Stripe::length() const 583 | { 584 | return stripeInfo->getLength(); 585 | } 586 | 587 | uint64_t 588 | Stripe::offset() const 589 | { 590 | return stripeInfo->getOffset(); 591 | } 592 | 593 | py::tuple 594 | Stripe::statistics(uint64_t columnIndex) 595 | { 596 | if (columnIndex < 0 || 597 | columnIndex > rowReader->getSelectedType().getMaximumColumnId()) { 598 | throw py::index_error("column index out of range"); 599 | } 600 | std::unique_ptr stripeStats = 601 | reader.getORCReader().getStripeStatistics(stripeIndex); 602 | uint32_t num = stripeStats->getNumberOfRowIndexStats(columnIndex); 603 | py::tuple result = py::tuple(num); 604 | for (uint32_t i = 0; i < num; ++i) { 605 | const orc::ColumnStatistics* stats = 606 | stripeStats->getRowIndexStatistics(columnIndex, i); 607 | result[i] = this->buildStatistics( 608 | 
this->findColumnType(&rowReader->getSelectedType(), columnIndex), stats); 609 | } 610 | return result; 611 | } 612 | 613 | std::string 614 | Stripe::writerTimezone() 615 | { 616 | return stripeInfo->getWriterTimezone(); 617 | } 618 | --------------------------------------------------------------------------------