├── src ├── pyorc │ ├── py.typed │ ├── errors.py │ ├── enums.py │ ├── __init__.py │ ├── predicates.py │ ├── converters.py │ ├── writer.py │ ├── reader.py │ ├── _pyorc.pyi │ └── typedescription.py └── _pyorc │ ├── verguard.h │ ├── SearchArgument.h │ ├── Converter.h │ ├── PyORCStream.h │ ├── Writer.h │ ├── Reader.h │ ├── PyORCStream.cpp │ ├── _pyorc.cpp │ ├── Writer.cpp │ ├── SearchArgument.cpp │ └── Reader.cpp ├── docs ├── changelog.rst ├── requirements.txt ├── Makefile ├── make.bat ├── index.rst ├── install.rst ├── conf.py └── tutorial.rst ├── .clang-format ├── .gitignore ├── MANIFEST.in ├── .readthedocs.yaml ├── pyproject.toml ├── tests ├── conftest.py ├── test_orc_ver.py ├── test_predicates.py ├── test_stripe.py ├── test_typedescription.py ├── compare │ ├── test_writer_cmp.py │ └── test_reader_cmp.py ├── test_column.py └── test_writer.py ├── .azure-pipelines ├── prepare-and-push-wheels.yml └── build-run-tests.yml ├── README.rst ├── CHANGELOG.rst ├── azure-pipelines.yml ├── setup.py └── LICENSE /src/pyorc/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | 3 | .. 
include:: ../CHANGELOG.rst -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Mozilla 2 | IndentWidth: 4 3 | --- 4 | Language: Cpp 5 | Standard: Cpp11 6 | ColumnLimit: 88 -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==8.1.3 2 | sphinx_rtd_theme==3.0.2 3 | furo==2024.8.6 4 | readthedocs-sphinx-search==0.3.2 5 | 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .eggs/ 3 | build/ 4 | deps/ 5 | dist/ 6 | docs/_build/ 7 | pyorc.egg-info/ 8 | .pytest_cache/ 9 | __pycache__/ 10 | tests/__pycache__ 11 | .mypy_cache/ 12 | -------------------------------------------------------------------------------- /src/pyorc/errors.py: -------------------------------------------------------------------------------- 1 | class ORCError(Exception): 2 | """ General pyorc error. """ 3 | 4 | 5 | class ParseError(ORCError): 6 | """ Parse error while processing an ORC file. 
""" 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src *.cpp *.h *.py 2 | recursive-include tests *.py 3 | recursive-include docs *.rst *.py *.css Makefile make.bat 4 | include README.rst 5 | include LICENSE 6 | include CHANGELOG.rst 7 | recursive-exclude docs/_build * -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-24.04" 5 | tools: 6 | python: "3.12" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "setuptools<72.2; implementation_name == 'pypy'", # https://github.com/pypa/distutils/issues/283 5 | "wheel", 6 | "pybind11>2.6.0,<3.0" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | -------------------------------------------------------------------------------- /src/_pyorc/verguard.h: -------------------------------------------------------------------------------- 1 | #ifndef VERGUARD_H 2 | #define VERGUARD_H 3 | 4 | #if ORC_VERSION_MINOR > 255 || ORC_VERSION_PATCH > 255 5 | #error "ORC version number component is higher than 255, version guard macro will fail" 6 | #endif 7 | 8 | #define ORC_VERSION_AT_LEAST(ma, mi, pa) \ 9 | (((ORC_VERSION_MAJOR)<<16)+((ORC_VERSION_MINOR)<<8)+(ORC_VERSION_PATCH) >= (((ma)<<16)+((mi)<<8)+(pa)) ? 
1 : 0) 10 | 11 | #endif 12 | 13 | -------------------------------------------------------------------------------- /src/_pyorc/SearchArgument.h: -------------------------------------------------------------------------------- 1 | #ifndef SEARCHARGUMENT_H 2 | #define SEARCHARGUMENT_H 3 | 4 | #include 5 | #include 6 | 7 | namespace py = pybind11; 8 | 9 | std::unique_ptr createSearchArgument(py::object, 10 | py::dict, 11 | py::object); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def output_file(): 9 | testfile = tempfile.NamedTemporaryFile( 10 | mode="wb", delete=False, prefix="pyorc_", suffix=".orc" 11 | ) 12 | yield testfile 13 | if not testfile.closed: 14 | testfile.close() 15 | os.remove(testfile.name) 16 | 17 | 18 | class NullValue: 19 | _instance = None 20 | 21 | def __new__(cls): 22 | if cls._instance is not None: 23 | return cls._instance 24 | cls._instance = super().__new__(cls) 25 | return cls._instance 26 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/test_orc_ver.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import pytest 3 | 4 | from pyorc import orc_version, orc_version_info 5 | 6 | 7 | def test_orc_version(): 8 | assert isinstance(orc_version, str) 9 | assert len(orc_version.split(".")) == 3 10 | 11 | 12 | def test_orc_version_info(): 13 | assert isinstance(orc_version_info, tuple) 14 | assert isinstance(orc_version_info.major, int) 15 | assert isinstance(orc_version_info.minor, int) 16 | assert isinstance(orc_version_info.patch, int) 17 | assert isinstance(orc_version_info.releaselevel, str) 18 | inf = orc_version_info 19 | assert ( 20 | orc_version 21 | == f"{inf.major}.{inf.minor}.{inf.patch}{'-' if inf.releaselevel else ''}{inf.releaselevel}" 22 | ) 23 | 24 | -------------------------------------------------------------------------------- /src/_pyorc/Converter.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERTER_H 2 | #define CONVERTER_H 3 | 4 | #include 5 | 6 | #include "orc/OrcFile.hh" 7 | 8 | #include 9 | 10 | namespace py = pybind11; 11 | class Converter 12 | { 13 | protected: 14 | bool hasNulls; 15 | const char* notNull = nullptr; 16 | py::object nullValue = py::none(); 17 | 18 | public: 19 | Converter(py::object nv) 20 | : nullValue(nv){}; 21 | virtual ~Converter() = default; 22 | virtual py::object toPython(uint64_t) = 0; 23 | virtual void write(orc::ColumnVectorBatch*, uint64_t, py::object) = 0; 24 | virtual void reset(const orc::ColumnVectorBatch&); 25 | virtual void clear(){}; 26 | }; 27 | 28 | std::unique_ptr 29 | createConverter(const orc::Type*, unsigned int, py::dict, py::object, py::object); 30 | 31 | #endif 32 | 
-------------------------------------------------------------------------------- /.azure-pipelines/prepare-and-push-wheels.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: qemu 3 | type: boolean 4 | default: false 5 | - name: cibwStep 6 | type: step 7 | 8 | steps: 9 | - task: UsePythonVersion@0 10 | - bash: | 11 | set -o errexit 12 | python3 -m pip install --upgrade pip 13 | python3 -m pip install cibuildwheel==2.22.0 14 | displayName: Install cibuildwheel dependencies 15 | - bash: | 16 | echo "Worker Arch: ${ARCH}; OS: ${OS};" 17 | env: 18 | ARCH: $(Agent.OSArchitecture) 19 | OS: $(Agent.OS) 20 | displayName: Worker info 21 | 22 | 23 | - ${{ if eq(parameters.qemu, true) }}: 24 | 25 | - script: docker run --privileged --rm tonistiigi/binfmt --install all 26 | displayName: Register QEMU 27 | 28 | - ${{ parameters.cibwStep }} 29 | - task: PublishBuildArtifacts@1 30 | inputs: 31 | pathtoPublish: 'wheelhouse' 32 | artifactName: wheels 33 | 34 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /src/_pyorc/PyORCStream.h: -------------------------------------------------------------------------------- 1 | #ifndef PY_ORC_STREAM_H 2 | #define PY_ORC_STREAM_H 3 | 4 | #include 5 | 6 | #include "orc/OrcFile.hh" 7 | #include "verguard.h" 8 | 9 | namespace py = pybind11; 10 | 11 | class PyORCInputStream : public orc::InputStream 12 | { 13 | private: 14 | std::string filename; 15 | py::object pyread; 16 | py::object pyseek; 17 | uint64_t totalLength; 18 | 19 | public: 20 | PyORCInputStream(py::object); 21 | ~PyORCInputStream() override; 22 | uint64_t getLength() const override; 23 | uint64_t getNaturalReadSize() const override; 24 | void read(void*, uint64_t, uint64_t) override; 25 | const std::string& getName() const override; 26 | }; 27 | 28 | class PyORCOutputStream : public orc::OutputStream 29 | { 30 | private: 31 | std::string filename; 32 | py::object pywrite; 33 | py::object pyflush; 34 | uint64_t bytesWritten; 35 | bool closed; 36 | 37 | public: 38 | PyORCOutputStream(py::object); 39 | ~PyORCOutputStream() override; 40 | uint64_t getLength() const override; 41 | uint64_t getNaturalWriteSize() const override; 42 | const std::string& getName() const override; 43 | void write(const void*, size_t) override; 44 | void close() override; 45 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 46 | void flush() override; 47 | #endif 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
PyORC documentation master file, created by 2 | sphinx-quickstart on Tue Nov 12 22:14:39 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyORC's documentation 7 | ********************* 8 | 9 | PyORC is a Python module for reading and writing `Apache ORC`_ file format. 10 | It uses the Apache ORC's Core C++ API under the hood, and provides a similar 11 | interface as the `csv module`_ in the Python standard library. 12 | 13 | .. note:: 14 | The module is compatible with Python 3.9 or newer releases. 15 | 16 | 17 | Features 18 | -------- 19 | 20 | - Reading ORC files. 21 | - Writing ORC files. 22 | - While using Python's stream/file-like object IO interface. 23 | 24 | .. toctree:: 25 | :maxdepth: 2 26 | :caption: Contents: 27 | 28 | install 29 | tutorial 30 | api 31 | changelog 32 | 33 | 34 | Indices and tables 35 | ================== 36 | 37 | * :ref:`genindex` 38 | * :ref:`modindex` 39 | * :ref:`search` 40 | 41 | Contribution 42 | ============ 43 | 44 | Any contributions are welcome. If you would like to help in development fork 45 | or report issue on the project's `GitHub site`_. You can also help in 46 | improving the documentation. 47 | 48 | .. _github site: https://github.com/noirello/pyorc 49 | .. _Apache ORC: https://orc.apache.org/ 50 | .. 
_csv module: https://docs.python.org/3/library/csv.html 51 | -------------------------------------------------------------------------------- /src/_pyorc/Writer.h: -------------------------------------------------------------------------------- 1 | #ifndef WRITER_H 2 | #define WRITER_H 3 | 4 | #include 5 | #include 6 | 7 | #include "orc/OrcFile.hh" 8 | 9 | #include "Converter.h" 10 | #include "verguard.h" 11 | 12 | namespace py = pybind11; 13 | 14 | class Writer 15 | { 16 | private: 17 | std::unique_ptr outStream; 18 | std::unique_ptr writer; 19 | std::unique_ptr batch; 20 | std::unique_ptr converter; 21 | uint64_t batchSize; 22 | uint64_t batchItem; 23 | 24 | public: 25 | uint64_t currentRow; 26 | 27 | Writer(py::object, 28 | py::object, 29 | uint64_t = 1024, 30 | uint64_t = 67108864, 31 | uint64_t = 10000, 32 | int = 1, 33 | int = 0, 34 | uint64_t = 65536, 35 | std::set = {}, 36 | double = 0.05, 37 | py::object = py::none(), 38 | unsigned int = 0, 39 | py::object = py::none(), 40 | double = 0.0, 41 | double = 0.0, 42 | py::object = py::none(), 43 | unsigned int = 65536); 44 | void addUserMetadata(py::str, py::bytes); 45 | void write(py::object); 46 | uint64_t writerows(py::iterable); 47 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 48 | uint64_t writeIntermediateFooter(); 49 | #endif 50 | void close(); 51 | ~Writer(){}; 52 | }; 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/pyorc/enums.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | class CompressionKind(enum.IntEnum): 5 | """ The compression kind for the ORC file. """ 6 | 7 | NONE = 0 8 | ZLIB = 1 9 | SNAPPY = 2 10 | LZO = 3 11 | LZ4 = 4 12 | ZSTD = 5 13 | 14 | 15 | class CompressionStrategy(enum.IntEnum): 16 | """ Compression strategy for the ORC file. """ 17 | 18 | SPEED = 0 19 | COMPRESSION = 1 20 | 21 | 22 | class TypeKind(enum.IntEnum): 23 | """ The type kinds for an ORC schema. 
""" 24 | 25 | BOOLEAN = 0 26 | BYTE = 1 27 | SHORT = 2 28 | INT = 3 29 | LONG = 4 30 | FLOAT = 5 31 | DOUBLE = 6 32 | STRING = 7 33 | BINARY = 8 34 | TIMESTAMP = 9 35 | LIST = 10 36 | MAP = 11 37 | STRUCT = 12 38 | UNION = 13 39 | DECIMAL = 14 40 | DATE = 15 41 | VARCHAR = 16 42 | CHAR = 17 43 | TIMESTAMP_INSTANT = 18 44 | 45 | @classmethod 46 | def has_value(cls, value: int) -> bool: 47 | return any(member.value == value for member in cls) 48 | 49 | 50 | class StructRepr(enum.IntEnum): 51 | """ Enumeration for ORC struct representation. """ 52 | 53 | TUPLE = 0 #: For tuple. 54 | DICT = 1 #: For dictionary. 55 | 56 | 57 | class WriterVersion(enum.IntEnum): 58 | """ Writer version for an ORC file. """ 59 | 60 | ORIGINAL = 0 61 | HIVE_8732 = 1 62 | HIVE_4243 = 2 63 | HIVE_12055 = 3 64 | HIVE_13083 = 4 65 | ORC_101 = 5 66 | ORC_135 = 6 67 | ORC_517 = 7 68 | ORC_203 = 8 69 | ORC_14 = 9 70 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyORC 2 | ===== 3 | 4 | .. image:: https://dev.azure.com/noirello/pyorc/_apis/build/status/noirello.pyorc?branchName=master 5 | :target: https://dev.azure.com/noirello/pyorc/_build?definitionId=1 6 | :alt: Azure Pipelines Status 7 | 8 | .. image:: https://codecov.io/gh/noirello/pyorc/branch/master/graph/badge.svg 9 | :target: https://codecov.io/gh/noirello/pyorc 10 | :alt: Codecov code coverage 11 | 12 | .. image:: https://readthedocs.org/projects/pyorc/badge/?version=latest 13 | :target: https://pyorc.readthedocs.io/en/latest/?badge=latest 14 | :alt: Documentation Status 15 | 16 | Python module for reading and writing `Apache ORC`_ file format. It uses the Apache ORC's Core C++ API 17 | under the hood, and provides a similar interface as the `csv module`_ in the Python standard library. 18 | 19 | Supports only Python 3.9 or newer and ORC 1.7. 20 | 21 | Features 22 | -------- 23 | 24 | - Reading ORC files. 
25 | - Writing ORC files. 26 | - While using Python's stream/file-like object IO interface. 27 | 28 | That sums up quite well the purpose of this project. 29 | 30 | Example 31 | ------- 32 | 33 | Minimal example for reading an ORC file: 34 | 35 | .. code:: python 36 | 37 | import pyorc 38 | 39 | with open("./data.orc", "rb") as data: 40 | reader = pyorc.Reader(data) 41 | for row in reader: 42 | print(row) 43 | 44 | And another for writing one: 45 | 46 | .. code:: python 47 | 48 | import pyorc 49 | 50 | with open("./new_data.orc", "wb") as data: 51 | with pyorc.Writer(data, "struct") as writer: 52 | writer.write((1, "ORC from Python")) 53 | 54 | Contribution 55 | ============ 56 | 57 | Any contributions are welcome. If you would like to help in development fork 58 | or report issue here on Github. You can also help in improving the 59 | documentation. 60 | 61 | .. _Apache ORC: https://orc.apache.org/ 62 | .. _csv module: https://docs.python.org/3/library/csv.html 63 | -------------------------------------------------------------------------------- /src/pyorc/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import NamedTuple 4 | 5 | if sys.platform.startswith("win32") and "TZDIR" not in os.environ: 6 | # Windows does not come with a standard IANA time zone database, 7 | # but the ORC lib requires it. Set the TZDIR environment variable 8 | # to the tzdata module's data directory. 
9 | import tzdata 10 | 11 | os.environ["TZDIR"] = os.path.join(os.path.dirname(tzdata.__file__), "zoneinfo") 12 | 13 | from pyorc._pyorc import _orc_version 14 | 15 | from .enums import * 16 | from .errors import * 17 | from .predicates import PredicateColumn 18 | from .reader import Column, Reader, Stripe 19 | from .typedescription import * 20 | from .writer import Writer 21 | 22 | __version__ = "0.11.0" 23 | 24 | orc_version = _orc_version() 25 | 26 | ORCVersionInfo = NamedTuple( 27 | "ORCVersionInfo", 28 | [("major", int), ("minor", int), ("patch", int), ("releaselevel", str)], 29 | ) 30 | 31 | 32 | def __extract_version_info() -> ORCVersionInfo: 33 | splitted = _orc_version().split("-") 34 | ver = splitted[0] 35 | rel_level = splitted[1] if len(splitted) > 1 else "" 36 | major, minor, patch = map(int, ver.split(".")) 37 | return ORCVersionInfo(major, minor, patch, rel_level) 38 | 39 | 40 | orc_version_info = __extract_version_info() 41 | 42 | __all__ = [ 43 | "Column", 44 | "PredicateColumn", 45 | "Reader", 46 | "Stripe", 47 | "Writer", 48 | # Enums 49 | "CompressionKind", 50 | "CompressionStrategy", 51 | "TypeKind", 52 | "StructRepr", 53 | "WriterVersion", 54 | # Errors 55 | "ORCError", 56 | "ParseError", 57 | # Type descriptiona 58 | "TypeDescription", 59 | "Boolean", 60 | "TinyInt", 61 | "SmallInt", 62 | "Int", 63 | "BigInt", 64 | "Float", 65 | "Double", 66 | "String", 67 | "Binary", 68 | "Timestamp", 69 | "TimestampInstant", 70 | "Date", 71 | "Char", 72 | "VarChar", 73 | "Decimal", 74 | "Union", 75 | "Array", 76 | "Map", 77 | "Struct", 78 | # Version info 79 | "orc_version", 80 | "orc_version_info", 81 | ] 82 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installing 2 | ========== 3 | 4 | Using pip 5 | --------- 6 | 7 | For Linux and Mac, you can simply use pip that will install a wheel bundled 8 | with the required 
libraries:: 9 | 10 | $ pip3 install pyorc 11 | 12 | .. note:: 13 | To install on Linux, you need *pip 19.0* or newer. Earlier versions are 14 | unable to handle the ``manylinux2010`` wheels, thus they try to install 15 | the package from source. 16 | 17 | There could be some drawbacks of the bundled libraries in the package, when 18 | using together with other Python modules. If another module is loaded into 19 | the Python runtime besides PyORC that also pre-bundles one of the required 20 | C/C++ libraries but a slightly different version, then the two libraries 21 | will collide, and the interpreter will crash with segmentation fault at some 22 | point during the execution. 23 | 24 | It's easy to run into this situation. For example, ``libprotobuf`` is 25 | one of required library for ORC, and it's quite popular for other projects 26 | as well. To avoid this, you have to make sure that the very same version 27 | of the common library is used by both of the modules, and therefore 28 | you might need to build PyORC from source. 29 | 30 | 31 | Install from source 32 | ------------------- 33 | 34 | To install from source, the module requires the Apache ORC C++ Core library. 35 | During the extension build step, the module will build the ORC core library 36 | before building the extension module itself. It requires `cmake` -- in 37 | addition of a suitable C++ compiler. The following steps take place during 38 | the `build_ext` command: 39 | 40 | 1. Downloading the Apache ORC release package. 41 | 2. Extracting the package to a directory named `deps` into the project's 42 | root directory. 43 | 3. Running cmake to configure the ORC C++ library. 44 | 4. Running the ``make package`` command. 45 | 5. Finally, moving the C++ headers, ORC example files and ORC tools 46 | to the top level of the `deps` directory for the `setup.py` and tests 47 | to find. 48 | 6. Building the C++ extension part of PyORC. 49 | 50 | .. 
note:: 51 | The ``build_ext`` command has a ``--orc-version`` and a ``--source-url`` 52 | parameter for changing the default ORC library version or the URL of the 53 | source zip to download respectively. It also has a ``--skip-orc-build`` 54 | flag to skip ORC library build steps. 55 | 56 | You also need the `pybind11` Python package to be installed before running 57 | the installation:: 58 | 59 | $ pip3 install pybind11 60 | $ python3 setup.py install 61 | 62 | After the installation completes without errors, you have the module ready 63 | to use. 64 | -------------------------------------------------------------------------------- /src/pyorc/predicates.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from typing import Any, Optional 3 | 4 | from .enums import TypeKind 5 | 6 | 7 | class Operator(enum.IntEnum): 8 | NOT = 0 9 | OR = 1 10 | AND = 2 11 | EQ = 3 12 | LT = 4 13 | LE = 5 14 | 15 | 16 | class Predicate: 17 | def __init__(self, operator: Operator, left, right) -> None: 18 | self.values = (operator, left, right) 19 | 20 | def __or__(self, other) -> "Predicate": 21 | self.values = (Operator.OR, self.values, other.values) 22 | return self 23 | 24 | def __and__(self, other) -> "Predicate": 25 | self.values = (Operator.AND, self.values, other.values) 26 | return self 27 | 28 | def __invert__(self) -> "Predicate": 29 | self.values = (Operator.NOT, self.values) 30 | return self 31 | 32 | 33 | class PredicateColumn: 34 | def __init__( 35 | self, 36 | type_kind: TypeKind, 37 | name: Optional[str] = None, 38 | index: Optional[int] = None, 39 | precision: Optional[int] = None, 40 | scale: Optional[int] = None, 41 | ) -> None: 42 | if not TypeKind.has_value(type_kind) or type_kind in ( 43 | TypeKind.BINARY, 44 | TypeKind.LIST, 45 | TypeKind.MAP, 46 | TypeKind.UNION, 47 | TypeKind.STRUCT, 48 | ): 49 | raise TypeError("Invalid type for PredicateColumn: %s" % type_kind) 50 | self.type_kind = type_kind 51 | if 
self.type_kind == TypeKind.DECIMAL and (precision is None or scale is None): 52 | raise ValueError("Both precision and scale must be set for Decimal type") 53 | if name is not None and index is not None: 54 | raise TypeError("Only one of the name or index parameter must be given") 55 | if name is not None and not isinstance(name, str): 56 | raise TypeError("Name parameter must be string") 57 | if index is not None and not isinstance(index, int): 58 | raise TypeError("Index parameter must be int") 59 | self.name = name 60 | self.index = index 61 | self.precision = precision if precision is not None else 0 62 | self.scale = scale if scale is not None else 0 63 | 64 | def __eq__(self, other: Any) -> Predicate: 65 | return Predicate(Operator.EQ, self, other) 66 | 67 | def __ne__(self, other: Any) -> Predicate: 68 | return ~Predicate(Operator.EQ, self, other) 69 | 70 | def __lt__(self, other: Any) -> Predicate: 71 | return Predicate(Operator.LT, self, other) 72 | 73 | def __le__(self, other: Any) -> Predicate: 74 | return Predicate(Operator.LE, self, other) 75 | 76 | def __gt__(self, other: Any) -> Predicate: 77 | return ~Predicate(Operator.LE, self, other) 78 | 79 | def __ge__(self, other: Any) -> Predicate: 80 | return ~Predicate(Operator.LT, self, other) 81 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import sys 14 | import os 15 | 16 | sys.path[0:0] = [os.path.abspath("..")] 17 | 18 | # For read-the-docs: mocking the _pyorc module. 19 | from unittest.mock import MagicMock 20 | 21 | 22 | class Mock(MagicMock): 23 | @classmethod 24 | def __getattr__(cls, name): 25 | # For pyorc 26 | if name == "typedescription": 27 | return object 28 | elif name == "reader": 29 | return object 30 | elif name == "writer": 31 | return object 32 | elif name == "stripe": 33 | return object 34 | elif name == "_orc_version": 35 | return lambda: "0.0.0-DUMMY" 36 | # For zoneinfo 37 | elif name == "ZoneInfo": 38 | return lambda key: object 39 | 40 | 41 | MOCK_MODULES = ["src.pyorc._pyorc", "pyorc", "pyorc._pyorc", "zoneinfo"] 42 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 43 | 44 | import src.pyorc as pyorc 45 | 46 | sys.modules["pyorc"] = pyorc 47 | 48 | 49 | # -- Project information ----------------------------------------------------- 50 | 51 | project = "PyORC" 52 | copyright = "2019-2025, noirello" 53 | author = "noirello" 54 | 55 | # The full version, including alpha/beta/rc tags 56 | release = pyorc.__version__ 57 | 58 | 59 | # -- General configuration --------------------------------------------------- 60 | 61 | # Add any Sphinx extension module names here, as strings. They can be 62 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 63 | # ones. 64 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.doctest"] 65 | 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This pattern also affects html_static_path and html_extra_path. 
72 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 73 | 74 | 75 | # -- Options for HTML output ------------------------------------------------- 76 | 77 | # The theme to use for HTML and HTML Help pages. See the documentation for 78 | # a list of builtin themes. 79 | # 80 | html_theme = "furo" 81 | 82 | # Add any paths that contain custom static files (such as style sheets) here, 83 | # relative to this directory. They are copied after the builtin static files, 84 | # so a file named "default.css" will overwrite the builtin "default.css". 85 | html_static_path = ["_static"] 86 | 87 | master_doc = "index" 88 | -------------------------------------------------------------------------------- /src/pyorc/converters.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from datetime import date, datetime, timedelta, timezone as tz 3 | from decimal import Decimal, localcontext 4 | import math 5 | from typing import Dict, Tuple, Type, Any 6 | 7 | from .enums import TypeKind 8 | 9 | try: 10 | import zoneinfo 11 | except ImportError: 12 | from backports import zoneinfo 13 | 14 | 15 | class ORCConverter(ABC): 16 | @staticmethod 17 | @abstractmethod 18 | def from_orc(*args): 19 | pass 20 | 21 | @staticmethod 22 | @abstractmethod 23 | def to_orc(*args): 24 | pass 25 | 26 | 27 | class TimestampConverter(ORCConverter): 28 | @staticmethod 29 | def from_orc( 30 | seconds: int, nanoseconds: int, timezone: zoneinfo.ZoneInfo, 31 | ) -> datetime: 32 | epoch = datetime(1970, 1, 1, 0, 0, 0, tzinfo=tz.utc) 33 | return ( 34 | epoch + timedelta(seconds=seconds, microseconds=nanoseconds // 1000) 35 | ).astimezone(timezone) 36 | 37 | @staticmethod 38 | def to_orc(obj: datetime, timezone: zoneinfo.ZoneInfo) -> Tuple[int, int]: 39 | return math.floor(obj.timestamp()), obj.microsecond * 1000 40 | 41 | 42 | class DateConverter(ORCConverter): 43 | @staticmethod 44 | def from_orc(days: int) -> date: 45 | return 
date(1970, 1, 1) + timedelta(days=days) 46 | 47 | @staticmethod 48 | def to_orc(obj: date) -> int: 49 | return (obj - date(1970, 1, 1)).days 50 | 51 | 52 | class DecimalConverter(ORCConverter): 53 | @staticmethod 54 | def from_orc(decimal: str) -> Decimal: 55 | return Decimal(decimal) 56 | 57 | @staticmethod 58 | def to_orc(precision: int, scale: int, obj: Decimal) -> int: 59 | """ 60 | Adjust the Decimal number to the given precision and scale. 61 | Return the integer value of the number. 62 | 63 | :param int precision: the precision of the decimal 64 | :param int scale: the scale of decimal 65 | :param decimal.Decimal obj: the number as Python decimal 66 | :return: an integer (interpreted with the given scale) 67 | :rtype: int 68 | """ 69 | with localcontext() as ctx: 70 | try: 71 | ctx.prec = precision 72 | coefficient = Decimal("1.{0}".format("0" * scale)) 73 | dec = obj.quantize(coefficient) 74 | dec_tup = dec.as_tuple() 75 | integer = sum( 76 | dig * 10 ** exp for exp, dig in enumerate(reversed(dec_tup.digits)) 77 | ) 78 | if dec_tup.exponent > 0: 79 | integer = integer * 10 ** dec_tup.exponent 80 | if dec_tup.sign == 1: 81 | return integer * -1 82 | else: 83 | return integer 84 | except AttributeError: 85 | raise TypeError( 86 | "Item {0} cannot be cast as a decimal".format(type(obj)) 87 | ) from None 88 | 89 | 90 | DEFAULT_CONVERTERS: Dict[TypeKind, Type[ORCConverter]] = { 91 | TypeKind.DATE: DateConverter, 92 | TypeKind.DECIMAL: DecimalConverter, 93 | TypeKind.TIMESTAMP: TimestampConverter, 94 | } 95 | -------------------------------------------------------------------------------- /.azure-pipelines/build-run-tests.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: windows 3 | type: boolean 4 | default: false 5 | - name: orc_version 6 | type: string 7 | default: "" 8 | 9 | steps: 10 | - script: | 11 | which python 12 | python -V 13 | displayName: Check Python $(python.version) path 14 | 15 | - 
script: python -m pip install --user -U pytest pytest-cov pytest-xdist[psutil] coverage codecov 16 | displayName: Install test dependencies 17 | 18 | - ${{ if eq(parameters.windows, true) }}: 19 | - ${{ if ne(parameters.orc_version, '') }}: 20 | - script: | 21 | set PYORC_DEBUG=1 22 | set PYORC_LIB_VERSION=${{ parameters.orc_version }} 23 | python -m pip install -vvv --user . 24 | displayName: Install package 25 | - ${{ else }}: 26 | - script: | 27 | set PYORC_DEBUG=1 28 | python -m pip install -vvv --user . 29 | displayName: Install package 30 | 31 | - powershell: | 32 | $files = Get-ChildItem -path .\deps\orc-*\build\*\*\*-stamp\*.log 33 | $output = ".\\buildlogs\\logs-$env:AGENT_OS-$env:PYTHON_VER.zip" 34 | New-Item -Path .\ -Name "buildlogs" -ItemType "directory" 35 | Compress-Archive -LiteralPath $files -Destination $output 36 | env: 37 | PYTHON_VER: $(python.version) 38 | AGENT_OS: $(Agent.OS) 39 | displayName: Collect build logs 40 | condition: succeededOrFailed() 41 | 42 | - script: | 43 | FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "import os, pyorc; print(os.path.dirname(pyorc.__file__))"`) DO (SET INSTALLDIR=%%F) 44 | echo %INSTALLDIR% 45 | python -m pytest -vs --junitxml=./test-results.xml -n auto --cov=%INSTALLDIR% tests/ 46 | displayName: Run pytest (Windows) 47 | 48 | - ${{ else }}: 49 | - ${{ if ne(parameters.orc_version, '') }}: 50 | - script: | 51 | PYORC_DEBUG=1 PYORC_LIB_VERSION=${{ parameters.orc_version }} CFLAGS="-coverage" python -m pip install -vvv --user . 52 | displayName: Install package 53 | - ${{ else }}: 54 | - script: | 55 | PYORC_DEBUG=1 CFLAGS="-coverage" python -m pip install -vvv --user . 
56 | displayName: Install package 57 | 58 | - script: | 59 | mkdir ./buildlogs 60 | mkdir "$AGENT_OS-$PYTHON_VER" 61 | cp $(ls deps/orc-1.*/build/*/*/*-stamp/*.log) "$AGENT_OS-$PYTHON_VER" 62 | tar -czvf "buildlogs/logs-$AGENT_OS-$PYTHON_VER.tar.gz" "$AGENT_OS-$PYTHON_VER" 63 | env: 64 | PYTHON_VER: $(python.version) 65 | AGENT_OS: $(Agent.OS) 66 | displayName: Collect build logs 67 | condition: succeededOrFailed() 68 | 69 | - script: | 70 | INSTALLDIR=$(python -c "import os, pyorc; print(os.path.dirname(pyorc.__file__))") 71 | echo $INSTALLDIR 72 | python -m pytest -vs --junitxml=./test-results.xml -n auto --cov="$INSTALLDIR" tests/ 73 | displayName: Run pytest (Unix) 74 | 75 | - task: PublishTestResults@2 76 | inputs: 77 | testResultsFiles: test-results.xml 78 | testRunTitle: Tests on $(Agent.OS) with Python $(python.version) 79 | condition: succeededOrFailed() 80 | 81 | - task: PublishBuildArtifacts@1 82 | inputs: 83 | pathtoPublish: 'buildlogs' 84 | artifactName: logs 85 | condition: succeededOrFailed() 86 | 87 | - script: python -m codecov 88 | env: 89 | CODECOV_TOKEN: $(codecov) 90 | displayName: Report Coverage 91 | condition: succeeded() 92 | -------------------------------------------------------------------------------- /src/_pyorc/Reader.h: -------------------------------------------------------------------------------- 1 | #ifndef READER_H 2 | #define READER_H 3 | 4 | #include 5 | #include 6 | 7 | #include "orc/OrcFile.hh" 8 | 9 | #include "Converter.h" 10 | 11 | namespace py = pybind11; 12 | 13 | py::object 14 | createTypeDescription(const orc::Type&); 15 | 16 | class ORCFileLikeObject 17 | { 18 | private: 19 | py::object convertTimestampMillis(int64_t) const; 20 | 21 | protected: 22 | uint64_t batchItem; 23 | orc::RowReaderOptions rowReaderOpts; 24 | std::unique_ptr rowReader; 25 | std::unique_ptr batch; 26 | std::unique_ptr converter; 27 | py::dict convDict; 28 | py::object timezoneInfo; 29 | py::dict buildStatistics(const orc::Type*, const 
orc::ColumnStatistics*) const; 30 | const orc::Type* findColumnType(const orc::Type*, uint64_t) const; 31 | 32 | public: 33 | uint64_t currentRow; 34 | uint64_t firstRowOfStripe; 35 | virtual uint64_t len() const = 0; 36 | py::object next(); 37 | py::list read(int64_t = -1); 38 | uint64_t seek(int64_t, uint16_t = 0); 39 | const orc::RowReaderOptions getRowReaderOptions() const { return rowReaderOpts; }; 40 | const py::dict getConverterDict() const { return convDict; } 41 | const py::object getTimeZoneInfo() const { return timezoneInfo; } 42 | virtual ~ORCFileLikeObject(){}; 43 | }; 44 | 45 | class Stripe; /* Forward declaration */ 46 | 47 | class Reader : public ORCFileLikeObject 48 | { 49 | private: 50 | std::unique_ptr reader; 51 | uint64_t batchSize; 52 | unsigned int structKind; 53 | py::object nullValue; 54 | 55 | public: 56 | Reader(py::object, 57 | uint64_t = 1024, 58 | std::list = {}, 59 | std::list = {}, 60 | py::object = py::none(), 61 | unsigned int = 0, 62 | py::object = py::none(), 63 | py::object = py::none(), 64 | py::object = py::none()); 65 | py::dict bytesLengths() const; 66 | uint64_t compression() const; 67 | uint64_t compressionBlockSize() const; 68 | uint64_t rowIndexStride() const; 69 | py::tuple formatVersion() const; 70 | uint64_t len() const override; 71 | uint64_t numberOfStripes() const; 72 | uint32_t writerId() const; 73 | uint32_t writerVersion() const; 74 | std::string softwareVersion() const; 75 | py::object schema(); 76 | py::object selectedSchema(); 77 | std::unique_ptr readStripe(uint64_t); 78 | py::tuple statistics(uint64_t); 79 | py::dict userMetadata(); 80 | 81 | const orc::Reader& getORCReader() const { return *reader; } 82 | const uint64_t getBatchSize() const { return batchSize; } 83 | const unsigned int getStructKind() const { return structKind; } 84 | const py::object getNullValue() const { return nullValue; } 85 | ~Reader(){}; 86 | }; 87 | 88 | class Stripe : public ORCFileLikeObject 89 | { 90 | private: 91 | uint64_t 
class Writer(writer):
    """High-level ORC writer wrapping the ``_pyorc.writer`` extension type.

    Validates the schema and bloom filter settings in Python before
    delegating the actual encoding work to the C++ extension.
    """

    def __init__(
        self,
        fileo: BinaryIO,
        schema: Union[str, TypeDescription],
        batch_size: int = 1024,
        stripe_size: int = 67108864,
        row_index_stride: int = 10000,
        compression: CompressionKind = CompressionKind.ZLIB,
        compression_strategy: CompressionStrategy = CompressionStrategy.SPEED,
        compression_block_size: int = 65536,
        bloom_filter_columns: Optional[List[Union[str, int]]] = None,
        bloom_filter_fpp: float = 0.05,
        timezone: zoneinfo.ZoneInfo = zoneinfo.ZoneInfo("UTC"),
        struct_repr: StructRepr = StructRepr.TUPLE,
        converters: Optional[Dict[TypeKind, Type[ORCConverter]]] = None,
        padding_tolerance: float = 0.0,
        dict_key_size_threshold: float = 0.0,
        null_value: Any = None,
        memory_block_size: int = 65536,  # annotation added; was untyped
    ) -> None:
        if isinstance(schema, str):
            schema = TypeDescription.from_string(schema)
        elif not isinstance(schema, TypeDescription):
            raise TypeError("Invalid `schema` type, must be string or TypeDescription")
        # The false-positive probability only makes sense strictly inside (0, 1).
        if not 0.0 < bloom_filter_fpp < 1.0:
            raise ValueError("False positive probability should be > 0.0 & < 1.0")
        self.__schema = schema
        self.__user_metadata: Dict[str, bytes] = {}
        comp = CompressionKind(compression)
        comp_strat = CompressionStrategy(compression_strategy)
        bf_set: set = set()
        if bloom_filter_columns:
            if any(not isinstance(item, (int, str)) for item in bloom_filter_columns):
                # BUG FIX: message previously read "mut be".
                raise ValueError(
                    "All items in `bloom_filter_columns` must be string or int"
                )
            for item in bloom_filter_columns:
                if isinstance(item, int):
                    bf_set.add(item)
                else:  # str, guaranteed by the validation above
                    bf_set.add(self.__schema.find_column_id(item))
        # Merge user converters over the defaults; pass the original value
        # through untouched (None or {}) when no converters were given.
        conv = converters
        if converters:
            conv = {**DEFAULT_CONVERTERS, **converters}
        super().__init__(
            fileo,
            self.__schema,
            batch_size,
            stripe_size,
            row_index_stride,
            comp,
            comp_strat,
            compression_block_size,
            bf_set,
            bloom_filter_fpp,
            timezone,
            struct_repr,
            conv,
            padding_tolerance,
            dict_key_size_threshold,
            null_value,
            memory_block_size,
        )

    def __enter__(self) -> "Writer":
        return self

    def __exit__(self, *exc: Any) -> None:
        self.close()

    def close(self) -> None:
        """Flush pending user metadata and close the underlying writer."""
        for key, val in self.__user_metadata.items():
            super()._add_user_metadata(key, val)
        super().close()

    @property
    def schema(self) -> TypeDescription:
        # Deep copy so callers cannot mutate the schema of an open writer.
        return copy.deepcopy(self.__schema)

    def set_user_metadata(self, **kwargs: bytes) -> None:
        """Store metadata key/value pairs; written to the file on close().

        :raises TypeError: if any value is not a bytes object.
        """
        for key, val in kwargs.items():
            if not isinstance(val, bytes):
                raise TypeError(
                    "All values must be bytes, key '{0}' is {1}".format(key, type(val))
                )
            self.__user_metadata[key] = val
def test_greater_than():
    """`col > x` is encoded as NOT(col <= x)."""
    col = PredicateColumn(TypeKind.DOUBLE, "colname")
    pred = col > 5.0
    assert isinstance(pred, Predicate)
    assert pred.values == (Operator.NOT, (Operator.LE, col, 5.0))


def test_greater_than_or_equals():
    """`col >= x` is encoded as NOT(col < x)."""
    col = PredicateColumn(TypeKind.FLOAT, "colname")
    pred = col >= 10.0
    assert isinstance(pred, Predicate)
    assert pred.values == (Operator.NOT, (Operator.LT, col, 10.0))


def test_and():
    """`&` joins two predicates under Operator.AND."""
    col0 = PredicateColumn(TypeKind.LONG, "colname0")
    col1 = PredicateColumn(TypeKind.TIMESTAMP, "colname1")
    pred = (col0 < 100) & (col1 == datetime(2021, 3, 20))
    assert isinstance(pred, Predicate)
    assert pred.values == (
        Operator.AND,
        (Operator.LT, col0, 100),
        (Operator.EQ, col1, datetime(2021, 3, 20)),
    )


def test_or():
    """`|` joins two predicates under Operator.OR.

    BUG FIX: this test previously used `&` and asserted Operator.AND,
    making it a duplicate of test_and and leaving `|` untested.
    """
    col0 = PredicateColumn(TypeKind.SHORT, name="colname0")
    col1 = PredicateColumn(TypeKind.DECIMAL, name="colname1", precision=2, scale=2)
    pred = (col0 < 100) | (col1 >= Decimal("20.00"))
    assert isinstance(pred, Predicate)
    assert pred.values == (
        Operator.OR,
        (Operator.LT, col0, 100),
        (Operator.NOT, (Operator.LT, col1, Decimal("20.00"))),
    )


def test_not():
    """`~` wraps a predicate tree under Operator.NOT."""
    col = PredicateColumn(TypeKind.FLOAT, "colname")
    pred = ~((col < 1.0) & (col > -1.0))
    assert isinstance(pred, Predicate)
    assert pred.values == (
        Operator.NOT,
        (
            Operator.AND,
            (Operator.LT, col, 1.0),
            (Operator.NOT, (Operator.LE, col, -1.0)),
        ),
    )


def test_decimal():
    """DECIMAL predicate columns require precision and scale."""
    with pytest.raises(ValueError):
        _ = PredicateColumn(TypeKind.DECIMAL, "something")
    col = PredicateColumn(TypeKind.DECIMAL, "colname", precision=10, scale=3)
    assert col is not None
class Column:
    """Aggregated statistics view over one column of a Reader or Stripe."""

    def __init__(self, stream: Union["Reader", "Stripe"], index: int):
        self.index = index
        self.stream = stream
        # One statistics dict per stripe, fetched eagerly from the stream.
        self._stats = self.stream._statistics(self.index)

    @property
    def statistics(self) -> Dict[str, Any]:
        """Merge the per-stripe statistic dicts into a single summary dict."""
        collected: Dict[str, list] = defaultdict(list)
        for stat in self._stats:
            for key, val in stat.items():
                collected[key].append(val)
        additive = (
            "number_of_values",
            "sum",
            "false_count",
            "true_count",
            "total_length",
        )
        merged: Dict[str, Any] = {}
        for key, values in collected.items():
            if key in additive:
                merged[key] = sum(values)
            elif key in ("minimum", "lower_bound"):
                merged[key] = min(values)
            elif key in ("maximum", "upper_bound"):
                merged[key] = max(values)
            elif key == "has_null":
                merged[key] = any(values)
        merged["kind"] = TypeKind(collected["kind"][0])
        return merged


class Stripe(stripe):
    def __getitem__(self, col_idx: int) -> "Column":
        return Column(self, col_idx)


class Reader(reader):
    """High-level ORC reader wrapping the ``_pyorc.reader`` extension type."""

    def __init__(
        self,
        fileo: BinaryIO,
        batch_size: int = 1024,
        column_indices: Optional[List[int]] = None,
        column_names: Optional[List[str]] = None,
        timezone: zoneinfo.ZoneInfo = zoneinfo.ZoneInfo("UTC"),
        struct_repr: StructRepr = StructRepr.TUPLE,
        converters: Optional[Dict[TypeKind, Type[ORCConverter]]] = None,
        predicate: Optional[Predicate] = None,
        null_value: Any = None,
    ) -> None:
        column_indices = [] if column_indices is None else column_indices
        column_names = [] if column_names is None else column_names
        struct_repr = StructRepr(struct_repr)
        # Merge user converters over the defaults; pass the original value
        # through untouched (None or {}) when no converters were given.
        conv = converters
        if converters:
            conv = {**DEFAULT_CONVERTERS, **converters}
        super().__init__(
            fileo,
            batch_size,
            column_indices,
            column_names,
            timezone,
            struct_repr,
            conv,
            predicate,
            null_value,
        )

    def __getitem__(self, col_idx: int) -> Column:
        return Column(self, col_idx)

    def read_stripe(self, stripe_idx: int) -> Stripe:
        return Stripe(self, stripe_idx)

    def iter_stripes(self) -> Iterator[Stripe]:
        for idx in range(self.num_of_stripes):
            yield self.read_stripe(idx)

    @property
    def compression(self) -> CompressionKind:
        return CompressionKind(super().compression)

    @property
    def writer_id(self) -> str:
        """Symbolic name of the writer implementation that produced the file."""
        known_writers = {
            0: "ORC_JAVA_WRITER",
            1: "ORC_CPP_WRITER",
            2: "PRESTO_WRITER",
            3: "SCRITCHLEY_GO",
            4: "TRINO_WRITER",
            5: "CUDF_WRITER",
        }
        return known_writers.get(super().writer_id, "UNKNOWN_WRITER")

    @property
    def writer_version(self) -> WriterVersion:
        return WriterVersion(super().writer_version)
must be a file-like object, but `" + 7 | (std::string)(py::str(fp.get_type())) + "` was provided"); 8 | } 9 | pyread = fp.attr("read"); 10 | pyseek = fp.attr("seek"); 11 | py::object isSeekable(fp.attr("seekable")); 12 | if (py::cast(isSeekable()) == false) { 13 | throw py::type_error("File-like object must be seekable"); 14 | } 15 | if (py::hasattr(fp, "name")) { 16 | filename = py::cast(py::str(fp.attr("name"))); 17 | } else { 18 | filename = py::cast(py::repr(fp)); 19 | } 20 | py::object pytell(fp.attr("tell")); 21 | uint64_t currPos = py::cast(pytell()); 22 | totalLength = py::cast(pyseek(0, 2)); 23 | pyseek(currPos); 24 | } 25 | 26 | uint64_t 27 | PyORCInputStream::getLength() const 28 | { 29 | return totalLength; 30 | } 31 | 32 | uint64_t 33 | PyORCInputStream::getNaturalReadSize() const 34 | { 35 | return 128 * 1024; 36 | } 37 | 38 | const std::string& 39 | PyORCInputStream::getName() const 40 | { 41 | return filename; 42 | } 43 | 44 | void 45 | PyORCInputStream::read(void* buf, uint64_t length, uint64_t offset) 46 | { 47 | char* src; 48 | Py_ssize_t bytesRead; 49 | if (!buf) { 50 | throw orc::ParseError("Buffer is null"); 51 | } 52 | 53 | pyseek(offset); 54 | py::object data = pyread(length); 55 | int rc = PyBytes_AsStringAndSize(data.ptr(), &src, &bytesRead); 56 | if (rc == -1) { 57 | PyErr_Clear(); 58 | throw orc::ParseError( 59 | "Failed to read content as bytes. 
Stream might not be opened as binary"); 60 | } 61 | 62 | if (static_cast(bytesRead) != length) { 63 | throw orc::ParseError("Short read of " + filename); 64 | } 65 | 66 | std::memcpy(buf, src, length); 67 | } 68 | 69 | PyORCInputStream::~PyORCInputStream() {} 70 | 71 | PyORCOutputStream::PyORCOutputStream(py::object fp) 72 | { 73 | bytesWritten = 0; 74 | if (!(py::hasattr(fp, "write") && py::hasattr(fp, "flush"))) { 75 | throw py::type_error("Parameter must be a file-like object, but `" + 76 | (std::string)(py::str(fp.get_type())) + "` was provided"); 77 | } 78 | pywrite = fp.attr("write"); 79 | pyflush = fp.attr("flush"); 80 | if (py::hasattr(fp, "name")) { 81 | filename = py::cast(py::str(fp.attr("name"))); 82 | } else { 83 | filename = py::cast(py::repr(fp)); 84 | } 85 | closed = py::cast(fp.attr("closed")); 86 | } 87 | 88 | uint64_t 89 | PyORCOutputStream::getLength() const 90 | { 91 | return bytesWritten; 92 | } 93 | 94 | uint64_t 95 | PyORCOutputStream::getNaturalWriteSize() const 96 | { 97 | return 128 * 1024; 98 | } 99 | 100 | const std::string& 101 | PyORCOutputStream::getName() const 102 | { 103 | return filename; 104 | } 105 | 106 | void 107 | PyORCOutputStream::write(const void* buf, size_t length) 108 | { 109 | if (closed) { 110 | throw std::logic_error("Cannot write to closed stream"); 111 | } 112 | try { 113 | py::bytes data = py::bytes(static_cast(buf), length); 114 | size_t count = py::cast(pywrite(data)); 115 | pyflush(); 116 | 117 | if (count != length) { 118 | throw orc::ParseError("Shorter write of " + filename); 119 | } 120 | bytesWritten += static_cast(count); 121 | } catch (py::error_already_set& err) { 122 | if (!err.matches(PyExc_TypeError)) { 123 | throw; 124 | } 125 | throw orc::ParseError( 126 | "Failed to write content as bytes. 
Stream might not be opened as binary"); 127 | } 128 | } 129 | 130 | void 131 | PyORCOutputStream::close() 132 | { 133 | if (!closed) { 134 | try { 135 | pyflush(); 136 | } catch (py::error_already_set& err) { 137 | if (!err.matches(PyExc_ValueError)) { 138 | throw; 139 | } 140 | // ValueError is raised when try to flush on a closed file, let's ignore. 141 | PyErr_Clear(); 142 | } 143 | closed = true; 144 | } 145 | } 146 | 147 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 148 | void 149 | PyORCOutputStream::flush() 150 | { 151 | if (!closed) { 152 | pyflush(); 153 | } 154 | } 155 | #endif 156 | 157 | PyORCOutputStream::~PyORCOutputStream() 158 | { 159 | close(); 160 | } 161 | -------------------------------------------------------------------------------- /src/pyorc/_pyorc.pyi: -------------------------------------------------------------------------------- 1 | """_pyorc c++ extension""" 2 | import typing 3 | 4 | from .enums import CompressionKind, CompressionStrategy, StructRepr 5 | from .typedescription import TypeDescription 6 | 7 | __all__ = ["reader", "stripe", "writer"] 8 | 9 | class reader: 10 | def __init__( 11 | self, 12 | fileo: object, 13 | batch_size: int = 1024, 14 | col_indices: typing.Optional[typing.List[int]] = None, 15 | col_names: typing.Optional[typing.List[str]] = None, 16 | timezone: object = None, 17 | struct_repr: int = StructRepr.TUPLE, 18 | conv: object = None, 19 | predicate: object = None, 20 | null_value: object = None, 21 | ) -> None: ... 22 | def __iter__(self) -> reader: ... 23 | def __len__(self) -> int: ... 24 | def __next__(self) -> object: ... 25 | def _statistics(self, col_idx: int) -> tuple: ... 26 | def read(self, num: int = -1) -> list: ... 27 | def seek(self, row: int, whence: int = 0) -> int: ... 
28 | @property 29 | def bytes_lengths(self) -> typing.Dict[str, int]: 30 | """ 31 | :type: dict 32 | """ 33 | @property 34 | def compression(self) -> int: 35 | """ 36 | :type: int 37 | """ 38 | @property 39 | def compression_block_size(self) -> int: 40 | """ 41 | :type: int 42 | """ 43 | @property 44 | def current_row(self) -> int: 45 | """ 46 | :type: int 47 | """ 48 | @property 49 | def format_version(self) -> typing.Tuple[int, int]: 50 | """ 51 | :type: tuple 52 | """ 53 | @property 54 | def num_of_stripes(self) -> int: 55 | """ 56 | :type: int 57 | """ 58 | @property 59 | def row_index_stride(self) -> int: 60 | """ 61 | :type: int 62 | """ 63 | @property 64 | def schema(self) -> object: 65 | """ 66 | :type: object 67 | """ 68 | @property 69 | def selected_schema(self) -> object: 70 | """ 71 | :type: object 72 | """ 73 | @property 74 | def software_version(self) -> str: 75 | """ 76 | :type: str 77 | """ 78 | @property 79 | def user_metadata(self) -> typing.Dict[str, bytes]: 80 | """ 81 | :type: dict 82 | """ 83 | @property 84 | def writer_id(self) -> int: 85 | """ 86 | :type: int 87 | """ 88 | @property 89 | def writer_version(self) -> int: 90 | """ 91 | :type: int 92 | """ 93 | pass 94 | 95 | class stripe: 96 | def __init__(self, reader: reader, stripe_idx: int) -> None: ... 97 | def __iter__(self) -> stripe: ... 98 | def __len__(self) -> int: ... 99 | def __next__(self) -> object: ... 100 | def _statistics(self, col_idx: int) -> tuple: ... 101 | def read(self, num: int = -1) -> list: ... 102 | def seek(self, row: int, whence: int = 0) -> int: ... 
class writer:
    def __init__(
        self,
        fileo: object,
        schema: object,
        batch_size: int = 1024,
        stripe_size: int = 67108864,
        row_index_stride: int = 10000,
        compression: int = CompressionKind.ZLIB,
        compression_strategy: int = CompressionStrategy.SPEED,
        compression_block_size: int = 65536,
        bloom_filter_columns: typing.Optional[typing.Set[int]] = None,
        bloom_filter_fpp: float = 0.05,
        timezone: object = None,
        struct_repr: int = StructRepr.TUPLE,
        conv: object = None,
        padding_tolerance: float = 0.0,
        dict_key_size_threshold: float = 0.0,
        null_value: object = None,
        # BUG FIX: this parameter was missing from the stub although the
        # extension accepts it and pyorc.Writer passes it positionally
        # (added in 0.10.0: initial block size of the output buffer).
        memory_block_size: int = 65536,
    ) -> None: ...
    def _add_user_metadata(self, key: str, value: bytes) -> None: ...
    def close(self) -> None: ...
    def write(self, row: object) -> None: ...
    def writerows(self, rows: typing.Iterable) -> int: ...
159 | @property 160 | def current_row(self) -> int: 161 | """ 162 | :type: int 163 | """ 164 | pass 165 | 166 | def _orc_version() -> str: 167 | pass 168 | 169 | def _schema_from_string(arg0: str) -> TypeDescription: 170 | pass 171 | -------------------------------------------------------------------------------- /tests/test_stripe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import io 4 | import sys 5 | 6 | from datetime import datetime, timedelta, timezone 7 | 8 | from pyorc import ( 9 | Reader, 10 | Writer, 11 | Stripe, 12 | orc_version_info, 13 | ) 14 | 15 | 16 | @pytest.fixture 17 | def striped_orc_data(): 18 | def _init(schema, rows, bfc=tuple()): 19 | data = io.BytesIO() 20 | with Writer( 21 | data, 22 | schema, 23 | batch_size=65535, 24 | stripe_size=128, 25 | compression_block_size=128, 26 | bloom_filter_columns=bfc, 27 | memory_block_size=64, 28 | ) as writer: 29 | writer.writerows(rows) 30 | data.seek(0) 31 | return data 32 | 33 | return _init 34 | 35 | 36 | def test_init(striped_orc_data): 37 | data = striped_orc_data("int", (i for i in range(100000))) 38 | reader = Reader(data) 39 | with pytest.raises(TypeError): 40 | _ = Stripe(None, 0) 41 | with pytest.raises(TypeError): 42 | _ = Stripe("reader", 0) 43 | with pytest.raises(IndexError): 44 | _ = Stripe(reader, 3) 45 | with pytest.raises(TypeError): 46 | _ = Stripe(reader, "col") 47 | assert Stripe(reader, 0) is not None 48 | 49 | 50 | def test_len(striped_orc_data): 51 | data = striped_orc_data("int", (i for i in range(100000))) 52 | reader = Reader(data) 53 | stripe = Stripe(reader, 0) 54 | 55 | assert len(reader) != len(stripe) 56 | assert len(stripe) == 65535 57 | 58 | 59 | def test_bytes_length(striped_orc_data): 60 | expected_bytes_length = ( 61 | 392 if orc_version_info.major == 1 and orc_version_info.minor < 8 else 359 62 | ) # Bold, hardcoded length values. 
def test_bytes_offset(striped_orc_data):
    """Stripe.bytes_offset matches the known layout and is read-only."""
    expected_bytes_offset = (
        658 if orc_version_info.major == 1 and orc_version_info.minor < 8 else 614
    )  # Bold, hardcoded offset value.

    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 1)

    assert stripe.bytes_offset == expected_bytes_offset
    # The property must reject assignment.
    with pytest.raises(AttributeError):
        stripe.bytes_offset = 5


def test_bloom_filter_columns(striped_orc_data):
    """Bloom filter column ids round-trip through every stripe; read-only."""
    expected = (0, 1)
    # NOTE(review): schema string looks truncated ("struct" with no field
    # list, likely lost angle-bracket content) — verify against VCS.
    data = striped_orc_data(
        "struct",
        ((i, "Test {}".format(i + 1)) for i in range(100000)),
        bfc=expected,
    )
    reader = Reader(data)
    assert Stripe(reader, 0).bloom_filter_columns == expected
    assert Stripe(reader, 1).bloom_filter_columns == expected

    # Without bloom filter columns the tuple is empty.
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe = Stripe(reader, 0)
    assert stripe.bloom_filter_columns == tuple()
    with pytest.raises(AttributeError):
        stripe.bloom_filter_columns = (0,)


def test_row_offset(striped_orc_data):
    """Stripe 1 starts exactly where stripe 0 ends; row_offset is read-only."""
    data = striped_orc_data("int", (i for i in range(100000)))
    reader = Reader(data)
    stripe0 = Stripe(reader, 0)

    assert stripe0.row_offset == 0
    assert Stripe(reader, 1).row_offset == len(stripe0)
    with pytest.raises(AttributeError):
        stripe0.row_offset = 5
timedelta(seconds=10) 124 | 125 | data = striped_orc_data("timestamp", get_dt()) 126 | reader = Reader(data) 127 | stripe = Stripe(reader, 1) 128 | 129 | assert stripe.writer_timezone == "UTC" 130 | with pytest.raises(AttributeError): 131 | stripe.writer_timezone = "UTC-9:00" 132 | 133 | 134 | @pytest.mark.skipif(sys.platform == "win32", reason="Seeking fails on Windows") 135 | def test_seek_and_read(striped_orc_data): 136 | data = striped_orc_data( 137 | "struct", 138 | ((i, "Test {}".format(i + 1)) for i in range(100000)), 139 | ) 140 | reader = Reader(data) 141 | stripe = reader.read_stripe(1) 142 | assert next(stripe) == (65535, "Test 65536") 143 | stripe.seek(10000) 144 | assert next(stripe) == (75535, "Test 75536") 145 | stripe.seek(-1, 2) 146 | assert next(stripe) == (99999, "Test 100000") 147 | stripe = reader.read_stripe(0) 148 | stripe.seek(-1, 2) 149 | assert next(stripe) == (65534, "Test 65535") 150 | stripe.seek(0) 151 | next(stripe) 152 | stripe.seek(10000, 1) 153 | assert next(stripe) == (10001, "Test 10002") 154 | expected = reader.read() 155 | result = stripe.read() 156 | assert result == expected[10002:65535] 157 | stripe = reader.read_stripe(1) 158 | assert stripe.read() == expected[65535:] 159 | -------------------------------------------------------------------------------- /tests/test_typedescription.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pyorc.typedescription import * 4 | from pyorc.enums import TypeKind 5 | 6 | 7 | def test_from_str_schema(): 8 | descr = TypeDescription.from_string( 9 | "struct,c:struct>" 10 | ) 11 | assert descr.kind == TypeKind.STRUCT 12 | assert len(descr.fields) == 3 13 | assert tuple(descr.fields.keys()) == ("a", "b", "c") 14 | assert descr.fields["a"].kind == TypeKind.INT 15 | assert descr.fields["b"].column_id == 2 16 | assert descr.fields["b"].key.kind == TypeKind.VARCHAR 17 | assert descr.fields["b"].key.max_length == 20 18 | assert 
descr.fields["b"].value.column_id == 4 19 | assert tuple(descr.fields["c"].fields.keys()) == ("d", "e", "f") 20 | assert descr.fields["c"].fields["d"].kind == TypeKind.LONG 21 | assert descr.fields["c"].fields["e"].column_id == 7 22 | assert descr.fields["c"].fields["f"].kind == TypeKind.CHAR 23 | assert descr.fields["c"].fields["f"].max_length == 12 24 | assert descr.fields["c"].fields["f"].column_id == 8 25 | 26 | 27 | def test_from_str_schema_fail(): 28 | with pytest.raises(ValueError): 29 | _ = TypeDescription.from_string( 30 | "struct,c:struct>" 31 | ) 32 | with pytest.raises(ValueError): 33 | _ = TypeDescription.from_string("struct,e:int>>" 41 | ) 42 | assert descr.find_column_id("a") == 1 43 | assert descr.find_column_id("a.b.c") == 3 44 | assert descr.find_column_id("a.e") == 5 45 | with pytest.raises(TypeError): 46 | _ = descr.find_column_id(True) 47 | with pytest.raises(KeyError): 48 | _ = descr.find_column_id("f.z") 49 | descr = Struct(**{"a.b": Struct(c=Int()), "d": String()}) 50 | assert descr.find_column_id("`a.b`") == 1 51 | assert descr.find_column_id("`a.b`.c") == 2 52 | assert descr.find_column_id("d") == 3 53 | with pytest.raises(KeyError): 54 | _ = descr.find_column_id("a.b") 55 | 56 | 57 | TESTDATA = [ 58 | (Boolean(), TypeKind.BOOLEAN, "boolean"), 59 | (TinyInt(), TypeKind.BYTE, "tinyint"), 60 | (SmallInt(), TypeKind.SHORT, "smallint"), 61 | (Int(), TypeKind.INT, "int"), 62 | (BigInt(), TypeKind.LONG, "bigint"), 63 | (Float(), TypeKind.FLOAT, "float"), 64 | (Double(), TypeKind.DOUBLE, "double"), 65 | (Date(), TypeKind.DATE, "date"), 66 | (Timestamp(), TypeKind.TIMESTAMP, "timestamp"), 67 | (TimestampInstant(), TypeKind.TIMESTAMP_INSTANT, "timestamp with local time zone"), 68 | (String(), TypeKind.STRING, "string"), 69 | (Binary(), TypeKind.BINARY, "binary"), 70 | (Decimal(precision=10, scale=3), TypeKind.DECIMAL, "decimal(10,3)"), 71 | (Char(16), TypeKind.CHAR, "char(16)"), 72 | (VarChar(140), TypeKind.VARCHAR, "varchar(140)"), 73 | ( 74 | 
Union(Int(), Double(), Char(20)), 75 | TypeKind.UNION, 76 | "uniontype", 77 | ), 78 | (Array(Int()), TypeKind.LIST, "array"), 79 | (Map(key=String(), value=Double()), TypeKind.MAP, "map"), 80 | (Struct(a=String(), b=Date()), TypeKind.STRUCT, "struct"), 81 | ( 82 | Struct(a=Timestamp(), b=Struct(c=Int(), b=Array(Double()))), 83 | TypeKind.STRUCT, 84 | "struct>>", 85 | ), 86 | ] 87 | 88 | 89 | @pytest.mark.parametrize("orc_schema,kind,expected", TESTDATA) 90 | def test_str(orc_schema, kind, expected): 91 | assert str(orc_schema) == expected 92 | 93 | 94 | @pytest.mark.parametrize("orc_schema,kind,expected", TESTDATA) 95 | def test_kind(orc_schema, kind, expected): 96 | assert orc_schema.kind == kind 97 | 98 | 99 | def test_decimal(): 100 | descr = Decimal(precision=5, scale=3) 101 | assert descr.precision == 5 102 | assert descr.scale == 3 103 | assert str(descr) == "decimal(5,3)" 104 | 105 | 106 | def test_varchar(): 107 | descr = TypeDescription.from_string("varchar(30)") 108 | assert descr.max_length == 30 109 | descr.max_length = 15 110 | assert descr.max_length == 15 111 | assert str(descr) == "varchar(15)" 112 | 113 | 114 | def test_char(): 115 | descr = Char(10) 116 | assert descr.max_length == 10 117 | descr.max_length = 1 118 | assert str(descr) == "char(1)" 119 | 120 | 121 | TESTDATA = [ 122 | lambda: Struct(field0=Int(), field1=True), 123 | lambda: Map(key=Int(), value=True), 124 | lambda: Map(key=0, value=Double()), 125 | lambda: Array("test"), 126 | lambda: Union(Int(), 0, Double()), 127 | ] 128 | 129 | 130 | @pytest.mark.parametrize("orc_schema", TESTDATA) 131 | def test_failed_complex_types(orc_schema): 132 | with pytest.raises(TypeError): 133 | _ = orc_schema() 134 | 135 | 136 | def test_struct(): 137 | schema = Struct(a0=Int(), b0=Double(), c0=Struct(a1=Date(), b1=Timestamp())) 138 | assert isinstance(schema["a0"], Int) 139 | assert schema["b0"].kind == TypeKind.DOUBLE 140 | assert schema["c0"].column_id == 3 141 | assert schema["c0"]["b1"].kind == 
TypeKind.TIMESTAMP 142 | 143 | 144 | def test_union(): 145 | schema = TypeDescription.from_string("uniontype") 146 | assert schema[1].kind == TypeKind.DOUBLE 147 | with pytest.raises(IndexError): 148 | _ = schema[10] 149 | schema = Union(Float(), VarChar(120)) 150 | assert schema[0].kind == TypeKind.FLOAT 151 | 152 | 153 | def test_attributes(): 154 | schema = Boolean() 155 | with pytest.raises(TypeError): 156 | _ = schema.set_attributes(0) 157 | with pytest.raises(TypeError): 158 | _ = schema.set_attributes({0: "1"}) 159 | with pytest.raises(TypeError): 160 | _ = schema.set_attributes({"a": 1}) 161 | attrs = {"a": "1", "b": "2"} 162 | schema.set_attributes(attrs) 163 | assert schema.attributes == attrs 164 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========== 3 | [0.11.0] - UNRELEASED 4 | --------------------- 5 | 6 | Added 7 | ~~~~~ 8 | 9 | Changed 10 | ~~~~~~~ 11 | 12 | Fixed 13 | ~~~~~ 14 | 15 | 16 | [0.10.0] - 2025-02-18 17 | --------------------- 18 | 19 | Added 20 | ~~~~~ 21 | 22 | - New parameter to Writer: memory_block_size for setting initial block 23 | size of original input buffer. 24 | - Add CUDF_Writer to Reader's writer_id. 25 | - Python 3.13 wheels. 26 | 27 | Changed 28 | ~~~~~~~ 29 | 30 | - Dropped support for Python 3.8. 31 | - ORC C++ Core updated to 2.1.0. 32 | - Pinned setuptools<72.2 for PyPy to avoid TypeError during build. 33 | 34 | 35 | [0.9.0] - 2023-11-04 36 | -------------------- 37 | 38 | Added 39 | ~~~~~ 40 | 41 | - Writer.write_intermediate_footer method for ORC library 1.9.0 and newer. 42 | - Python 3.12 wheels. 43 | 44 | Changed 45 | ~~~~~~~ 46 | 47 | - Dropped support for Python 3.7. 48 | - ORC C++ Core updated to 1.9.1. 49 | 50 | 51 | [0.8.0] - 2022-11-19 52 | -------------------- 53 | 54 | Added 55 | ~~~~~ 56 | 57 | - Python 3.11 wheels. 
(PR #58, contribution of @dbaxa) 58 | 59 | Changed 60 | ~~~~~~~ 61 | 62 | - ORC C++ Core updated to 1.7.7. 63 | - Improved type annotations, set module's __all__ variable. 64 | 65 | 66 | [0.7.0] - 2022-07-16 67 | -------------------- 68 | 69 | Added 70 | ~~~~~ 71 | 72 | - Universal2 wheels for MacOS. (PR #55, contribution of @dbaxa) 73 | - ORC-517, ORC-203, and ORC-14 versions to WriterVersion enum. 74 | 75 | Changed 76 | ~~~~~~~ 77 | 78 | - Dropped support for Python 3.6. 79 | - ORC C++ Core updated to 1.7.5. 80 | 81 | 82 | [0.6.0] - 2022-02-18 83 | -------------------- 84 | 85 | Added 86 | ~~~~~ 87 | 88 | - New parameter to Writer: dict_key_size_threshold for setting threshold 89 | for dictionary encoding. (PR #46, contribution of @dirtysalt) 90 | - New parameter to Writer: padding_tolerance for block padding. 91 | - New parameter to Reader and Writer: null_value for changing representation 92 | of ORC null value. The value must be a singleton object. 93 | - Type stubs for classes implemented in C++. 94 | - Experimental musllinux and PyPy wheels. 95 | 96 | Changed 97 | ~~~~~~~ 98 | 99 | - Writer.writerows method reimplemented in C++. 100 | - Improved type annotations. 101 | - ORC C++ Core updated to 1.7.3. 102 | - Removed build_orc setup.py command, moved the same functionality to 103 | build_ext command. 104 | 105 | Fixed 106 | ~~~~~ 107 | 108 | - Unnecessary string casting of values when writing user metadata. (Issue #45) 109 | 110 | 111 | [0.5.0] - 2021-10-22 112 | -------------------- 113 | 114 | Added 115 | ~~~~~ 116 | 117 | - Module level variables for the ORC library version: orc_version string and 118 | orc_version_info namedtuple. 119 | - New parameter for Writer: row_index_stride. 120 | - New read-only properties for Reader: row_index_stride and software_version. 121 | - Trino and Scritchley writer ids. 122 | - Type annotations support for ORC types. 123 | - Support for `timestamp with local time zone` type. 
124 | - New parameter for Reader and Writer: timezone. 125 | - The backported zoneinfo module dependency prior to Python 3.9. 126 | - Predicate (SearchArgument) support for filtering row groups during ORC file 127 | reads. New classes: Predicate and PredicateColumn. 128 | - New parameter for Reader: predicate. 129 | - Build for aarch64 wheels. (PR #43, contribution of @odidev) 130 | 131 | Changed 132 | ~~~~~~~ 133 | 134 | - ORC C++ Core updated to 1.7.0, and because many of the new features are not 135 | backported to the 1.6 branch, currently this is the minimum required lib 136 | version. 137 | - TimestampConverter's to_orc and from_orc methods got an extra timezone 138 | parameter, that will be bound to the same ZoneInfo object passed to the 139 | Reader or Writer via their timezone parameters during type convert. 140 | - Renamed Reader.metadata property and Writer.set_metadata method to 141 | user_metadata and set_user_metadata respectively to avoid confusion. 142 | 143 | 144 | [0.4.0] - 2021-01-11 145 | -------------------- 146 | 147 | Added 148 | ~~~~~ 149 | 150 | - Experimental Windows support. 151 | - tzdata package dependency on Windows. Automatically setting TZDIR 152 | to the path of the tzdata package's data dir after importing PyORC. 153 | 154 | Changed 155 | ~~~~~~~ 156 | - Create ORC Type from TypeDescription directly (instead of string parsing) 157 | for Writer. (PR #26, contribution of @blkerby) 158 | - Dotted column names are allowed to be used in TypeDescription.find_column_id 159 | method by escaping them with backticks. 160 | - ORC C++ Core updated to 1.6.6. 161 | 162 | Fixed 163 | ~~~~~ 164 | 165 | - Handling large negative seconds on Windows for TimestampConverter.from_orc. 166 | 167 | 168 | [0.3.0] - 2020-05-24 169 | -------------------- 170 | 171 | Added 172 | ~~~~~ 173 | 174 | - Metadata property for Reader and set_metadata for Writer to 175 | handle ORC file's metadata.
176 | - Meta info attributes like writer_id, writer_version, bytes_length, 177 | compression and compression_block_size for Reader. 178 | - New TypeDescription subclasses to represent ORC types. 179 | 180 | Changed 181 | ~~~~~~~ 182 | 183 | - Reimplemented TypeDescription in Python. 184 | - ORC C++ Core updated to 1.6.3. 185 | 186 | Fixed 187 | ~~~~~ 188 | 189 | - Converting date from ORC on systems where the system's timezone 190 | has a negative UTC offset (Issues #5) 191 | 192 | 193 | [0.2.0] - 2020-01-01 194 | -------------------- 195 | 196 | Added 197 | ~~~~~ 198 | 199 | - Converters for date, decimal and timestamp ORC types in Python and 200 | option to change them via Reader's and Writer's converters parameter. 201 | - Column object for accessing statistics about ORC columns. 202 | - An attribute to Reader for selected schema. 203 | 204 | Changed 205 | ~~~~~~~ 206 | 207 | - Use timezone-aware datetime objects (in UTC) for ORC timestamps by default. 208 | - Wrapped C++ stripe object to Python Stripe. 209 | 210 | Fixed 211 | ~~~~~ 212 | 213 | - Decrementing reference for bytes object after reading from file stream. 214 | 215 | [0.1.0] - 2019-11-16 216 | -------------------- 217 | 218 | Added 219 | ~~~~~ 220 | 221 | - A Reader object to read ORC files. 222 | - A stripe object to read only a stripe in an ORC file. 223 | - A Writer object to write ORC files. 224 | - A typedescription object to represent the ORC schema. 225 | - Support to represent a struct type either a Python tuple or a dictionary. 
226 | -------------------------------------------------------------------------------- /src/_pyorc/_pyorc.cpp: -------------------------------------------------------------------------------- 1 | #include "Reader.h" 2 | #include "Writer.h" 3 | #include "verguard.h" 4 | 5 | #include 6 | 7 | namespace py = pybind11; 8 | 9 | PYBIND11_MODULE(_pyorc, m) 10 | { 11 | m.doc() = "_pyorc c++ extension"; 12 | m.def("_orc_version", []() -> py::object { return py::cast(ORC_VERSION); }); 13 | m.def("_schema_from_string", [](std::string schema) { 14 | try { 15 | auto orcType = orc::Type::buildTypeFromString(schema); 16 | return createTypeDescription(*orcType); 17 | } catch (std::logic_error& err) { 18 | throw py::value_error(err.what()); 19 | } 20 | }); 21 | py::register_exception_translator([](std::exception_ptr p) { 22 | try { 23 | if (p) { 24 | std::rethrow_exception(p); 25 | } 26 | } catch (const orc::ParseError& e) { 27 | py::object err = py::module::import("pyorc.errors").attr("ParseError"); 28 | PyErr_SetString(err.ptr(), e.what()); 29 | } 30 | }); 31 | py::class_(m, "stripe") 32 | .def( 33 | py::init([](Reader& reader, uint64_t num) { return reader.readStripe(num); }), 34 | py::keep_alive<0, 2>()) 35 | .def("__next__", [](Stripe& s) -> py::object { return s.next(); }) 36 | .def("__iter__", [](Stripe& s) -> Stripe& { return s; }) 37 | .def("__len__", &Stripe::len) 38 | .def("read", &Stripe::read, py::arg_v("num", -1, "-1")) 39 | .def("seek", &Stripe::seek, py::arg("row"), py::arg_v("whence", 0, "0")) 40 | .def("_statistics", &Stripe::statistics) 41 | .def_property_readonly("bytes_length", [](Stripe& s) { return s.length(); }) 42 | .def_property_readonly("bytes_offset", [](Stripe& s) { return s.offset(); }) 43 | .def_property_readonly("bloom_filter_columns", 44 | [](Stripe& s) { return s.bloomFilterColumns(); }) 45 | .def_property_readonly("writer_timezone", 46 | [](Stripe& s) { return s.writerTimezone(); }) 47 | .def_readonly("current_row", &Stripe::currentRow) 48 | 
.def_readonly("row_offset", &Stripe::firstRowOfStripe); 49 | py::class_(m, "reader") 50 | .def(py::init, 53 | std::list, 54 | py::object, 55 | unsigned int, 56 | py::object, 57 | py::object, 58 | py::object>(), 59 | py::arg("fileo"), 60 | py::arg_v("batch_size", 1024, "1024"), 61 | py::arg_v("col_indices", std::list{}, "None"), 62 | py::arg_v("col_names", std::list{}, "None"), 63 | py::arg_v("timezone", py::none(), "None"), 64 | py::arg_v("struct_repr", 0, "StructRepr.TUPLE"), 65 | py::arg_v("conv", py::none(), "None"), 66 | py::arg_v("predicate", py::none(), "None"), 67 | py::arg_v("null_value", py::none(), "None")) 68 | .def("__next__", [](Reader& r) -> py::object { return r.next(); }) 69 | .def("__iter__", [](Reader& r) -> Reader& { return r; }) 70 | .def("__len__", &Reader::len) 71 | .def("read", &Reader::read, py::arg_v("num", -1, "-1")) 72 | .def("seek", &Reader::seek, py::arg("row"), py::arg_v("whence", 0, "0")) 73 | .def("_statistics", &Reader::statistics) 74 | .def_property_readonly("bytes_lengths", &Reader::bytesLengths) 75 | .def_property_readonly("compression", &Reader::compression) 76 | .def_property_readonly("compression_block_size", &Reader::compressionBlockSize) 77 | .def_property_readonly("row_index_stride", &Reader::rowIndexStride) 78 | .def_property_readonly("format_version", &Reader::formatVersion) 79 | .def_property_readonly("user_metadata", &Reader::userMetadata) 80 | .def_property_readonly("schema", &Reader::schema) 81 | .def_property_readonly("selected_schema", &Reader::selectedSchema) 82 | .def_property_readonly("num_of_stripes", 83 | [](Reader& r) { return r.numberOfStripes(); }) 84 | .def_property_readonly("writer_id", &Reader::writerId) 85 | .def_property_readonly("writer_version", &Reader::writerVersion) 86 | .def_property_readonly("software_version", &Reader::softwareVersion) 87 | .def_readonly("current_row", &Reader::currentRow); 88 | py::class_(m, "writer") 89 | .def(py::init, 98 | double, 99 | py::object, 100 | unsigned int, 101 | 
py::object, 102 | double, 103 | double, 104 | py::object, 105 | unsigned int>(), 106 | py::arg("fileo"), 107 | py::arg("schema"), 108 | py::arg_v("batch_size", 1024, "1024"), 109 | py::arg_v("stripe_size", 67108864, "67108864"), 110 | py::arg_v("row_index_stride", 10000, "10000"), 111 | py::arg_v("compression", 1, "CompressionKind.ZLIB"), 112 | py::arg_v("compression_strategy", 0, "CompressionStrategy.SPEED"), 113 | py::arg_v("compression_block_size", 65536, "65536"), 114 | py::arg_v("bloom_filter_columns", std::set{}, "None"), 115 | py::arg_v("bloom_filter_fpp", 0.05, "0.05"), 116 | py::arg_v("timezone", py::none(), "None"), 117 | py::arg_v("struct_repr", 0, "StructRepr.TUPLE"), 118 | py::arg_v("conv", py::none(), "None"), 119 | py::arg_v("padding_tolerance", 0.0, "0.0"), 120 | py::arg_v("dict_key_size_threshold", 0.0, "0.0"), 121 | py::arg_v("null_value", py::none(), "None"), 122 | py::arg_v("memory_block_size", 65536, "65536")) 123 | .def("_add_user_metadata", &Writer::addUserMetadata) 124 | .def("write", &Writer::write) 125 | .def("writerows", &Writer::writerows) 126 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 127 | .def("write_intermediate_footer", &Writer::writeIntermediateFooter) 128 | #endif 129 | .def("close", &Writer::close) 130 | .def_readonly("current_row", &Writer::currentRow); 131 | } 132 | -------------------------------------------------------------------------------- /tests/compare/test_writer_cmp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import gzip 4 | import json 5 | import os 6 | import subprocess 7 | import sys 8 | 9 | from decimal import Decimal 10 | from datetime import datetime, timezone 11 | 12 | import pyorc._pyorc 13 | from pyorc.enums import TypeKind, StructRepr 14 | from pyorc.typedescription import TypeDescription, Timestamp 15 | 16 | from conftest import output_file 17 | 18 | pytestmark = pytest.mark.skipif( 19 | sys.platform == "win32", reason="No orc-tools on Windows" 20 
| ) 21 | 22 | ORC_CONTENTS_PATH = "deps/bin/orc-contents" 23 | 24 | 25 | def transform(schema, value): 26 | if schema.kind < 8: 27 | # Primitive types, no transformation. 28 | return value 29 | elif schema.kind == TypeKind.STRUCT: 30 | return {col: transform(schema[col], field) for col, field in value.items()} 31 | elif schema.kind == TypeKind.MAP: 32 | return { 33 | keypair["key"]: transform(schema.value, keypair["value"]) 34 | for keypair in value 35 | } 36 | elif schema.kind == TypeKind.LIST: 37 | return [transform(schema.type, item) for item in value] 38 | elif schema.kind == TypeKind.TIMESTAMP: 39 | if value is None: 40 | return value 41 | try: 42 | ts = datetime.strptime(value[:26], "%Y-%m-%d %H:%M:%S") 43 | except ValueError: 44 | ts = datetime.strptime(value[:26], "%Y-%m-%d %H:%M:%S.%f") 45 | return ts.replace(tzinfo=timezone.utc) 46 | elif schema.kind == TypeKind.DATE: 47 | return datetime.strptime(value, "%Y-%m-%d").date() 48 | elif schema.kind == TypeKind.BINARY: 49 | return bytes(value) 50 | elif schema.kind == TypeKind.DECIMAL: 51 | if value is None: 52 | return value 53 | elif isinstance(value, float): 54 | return Decimal.from_float(value) 55 | else: 56 | return Decimal(value) 57 | else: 58 | return value 59 | 60 | 61 | def read_expected_json_record(path): 62 | with gzip.open(path, "rb") as fileo: 63 | for line in fileo: 64 | yield json.loads(line) 65 | 66 | 67 | def get_full_path(path): 68 | curdir = os.path.abspath(os.path.dirname(__file__)) 69 | projdir = os.path.abspath(os.path.join(curdir, os.pardir, os.pardir)) 70 | return os.path.join(projdir, "deps", "examples", "expected", path) 71 | 72 | 73 | def idfn(val): 74 | return val[:40] 75 | 76 | 77 | def create_orc_output_for_test(schema, file_out, file_in): 78 | writer = pyorc._pyorc.writer(file_out, schema, struct_repr=StructRepr.DICT) 79 | num = 0 80 | for row in read_expected_json_record(get_full_path(file_in)): 81 | orc_row = transform(schema, row) 82 | writer.write(orc_row) 83 | num += 1 84 | 
assert num == writer.current_row 85 | writer.close() 86 | 87 | 88 | TESTDATA = [ 89 | ( 90 | "TestOrcFile.test1.jsn.gz", 91 | "struct>>,list:array>,map:map>>", 92 | ), 93 | ("TestOrcFile.testDate1900.jsn.gz", "struct"), 94 | ("TestOrcFile.testDate2038.jsn.gz", "struct"), 95 | ( 96 | "TestOrcFile.testSeek.jsn.gz", 97 | "struct>>,list:array>,map:map>>", 98 | ), 99 | ("TestOrcFile.testSnappy.jsn.gz", "struct"), 100 | ( 101 | "nulls-at-end-snappy.jsn.gz", 102 | "struct<_col0:tinyint,_col1:smallint,_col2:int,_col3:bigint,_col4:float,_col5:double,_col6:boolean>", 103 | ), 104 | ( 105 | "demo-12-zlib.jsn.gz", 106 | "struct<_col0:int,_col1:string,_col2:string,_col3:string,_col4:int,_col5:string,_col6:int,_col7:int,_col8:int>", 107 | ), 108 | ] 109 | 110 | 111 | @pytest.mark.parametrize("expected,schema", TESTDATA, ids=idfn) 112 | def test_write(expected, schema, output_file): 113 | create_orc_output_for_test( 114 | TypeDescription.from_string(schema), output_file, expected 115 | ) 116 | exp_res = read_expected_json_record(get_full_path(expected)) 117 | with subprocess.Popen( 118 | [ORC_CONTENTS_PATH, output_file.name], stdout=subprocess.PIPE 119 | ) as proc: 120 | for line in proc.stdout: 121 | assert json.loads(line) == next(exp_res) 122 | with pytest.raises(StopIteration): 123 | next(exp_res) 124 | 125 | 126 | def test_write_decimal(output_file): 127 | input_filename = "decimal.jsn.gz" 128 | create_orc_output_for_test( 129 | TypeDescription.from_string("struct<_col0:decimal(10,5)>"), 130 | output_file, 131 | input_filename, 132 | ) 133 | exp_res = read_expected_json_record(get_full_path(input_filename)) 134 | with subprocess.Popen( 135 | [ORC_CONTENTS_PATH, output_file.name], stdout=subprocess.PIPE 136 | ) as proc: 137 | for line in proc.stdout: 138 | data = next(exp_res) 139 | if pyorc.orc_version_info.major >= 2 and pyorc.orc_version_info.minor > 0: 140 | # From 2.1.0, orc-content returns decimals as string to the output, 141 | # whilte the example json has floats in 
it. 142 | data["_col0"] = ( 143 | data["_col0"] if data["_col0"] is None else str(data["_col0"]) 144 | ) 145 | assert json.loads(line) == data 146 | with pytest.raises(StopIteration): 147 | next(exp_res) 148 | 149 | 150 | def test_write_timestamp(output_file): 151 | input_filename = "TestOrcFile.testTimestamp.jsn.gz" 152 | create_orc_output_for_test(Timestamp(), output_file, input_filename) 153 | exp_res = read_expected_json_record(get_full_path(input_filename)) 154 | with subprocess.Popen( 155 | [ORC_CONTENTS_PATH, output_file.name], stdout=subprocess.PIPE 156 | ) as proc: 157 | for line in proc.stdout: 158 | assert datetime.strptime( 159 | json.loads(line)[:26], "%Y-%m-%d %H:%M:%S.%f" 160 | ) == datetime.strptime(next(exp_res)[:26], "%Y-%m-%d %H:%M:%S.%f") 161 | with pytest.raises(StopIteration): 162 | next(exp_res) 163 | -------------------------------------------------------------------------------- /src/_pyorc/Writer.cpp: -------------------------------------------------------------------------------- 1 | #include "Writer.h" 2 | #include "PyORCStream.h" 3 | 4 | void 5 | setTypeAttributes(orc::Type* type, py::handle schema) 6 | { 7 | py::dict attributes(py::getattr(schema, "attributes")); 8 | for (auto attr : attributes) { 9 | type->setAttribute(py::cast(attr.first), 10 | py::cast(attr.second)); 11 | } 12 | } 13 | 14 | ORC_UNIQUE_PTR 15 | createType(py::handle schema) 16 | { 17 | orc::TypeKind kind = orc::TypeKind(py::cast(getattr(schema, "kind"))); 18 | switch (kind) { 19 | case orc::TypeKind::BOOLEAN: 20 | case orc::TypeKind::BYTE: 21 | case orc::TypeKind::SHORT: 22 | case orc::TypeKind::INT: 23 | case orc::TypeKind::LONG: 24 | case orc::TypeKind::FLOAT: 25 | case orc::TypeKind::DOUBLE: 26 | case orc::TypeKind::STRING: 27 | case orc::TypeKind::BINARY: 28 | case orc::TypeKind::TIMESTAMP: 29 | case orc::TypeKind::TIMESTAMP_INSTANT: 30 | case orc::TypeKind::DATE: { 31 | ORC_UNIQUE_PTR type = orc::createPrimitiveType(kind); 32 | setTypeAttributes(type.get(), 
schema); 33 | return type; 34 | } 35 | case orc::TypeKind::VARCHAR: 36 | case orc::TypeKind::CHAR: { 37 | ORC_UNIQUE_PTR type = orc::createCharType( 38 | kind, py::cast(getattr(schema, "max_length"))); 39 | setTypeAttributes(type.get(), schema); 40 | return type; 41 | } 42 | case orc::TypeKind::DECIMAL: { 43 | uint64_t precision = py::cast(getattr(schema, "precision")); 44 | uint64_t scale = py::cast(getattr(schema, "scale")); 45 | ORC_UNIQUE_PTR type = orc::createDecimalType(precision, scale); 46 | setTypeAttributes(type.get(), schema); 47 | return type; 48 | } 49 | case orc::TypeKind::LIST: { 50 | py::handle child = getattr(schema, "type"); 51 | ORC_UNIQUE_PTR type = orc::createListType(createType(child)); 52 | setTypeAttributes(type.get(), schema); 53 | return type; 54 | } 55 | case orc::TypeKind::MAP: { 56 | py::handle key = getattr(schema, "key"); 57 | py::handle value = getattr(schema, "value"); 58 | ORC_UNIQUE_PTR type = 59 | orc::createMapType(createType(key), createType(value)); 60 | setTypeAttributes(type.get(), schema); 61 | return type; 62 | } 63 | case orc::TypeKind::STRUCT: { 64 | ORC_UNIQUE_PTR type = orc::createStructType(); 65 | py::dict fields = getattr(schema, "fields"); 66 | for (auto item : fields) { 67 | type->addStructField((py::str)item.first, createType(item.second)); 68 | } 69 | setTypeAttributes(type.get(), schema); 70 | return type; 71 | } 72 | case orc::TypeKind::UNION: { 73 | ORC_UNIQUE_PTR type = orc::createUnionType(); 74 | py::list cont_types = getattr(schema, "cont_types"); 75 | for (auto child : cont_types) { 76 | type->addUnionChild(createType(child)); 77 | } 78 | setTypeAttributes(type.get(), schema); 79 | return type; 80 | } 81 | default: 82 | throw py::type_error("Invalid TypeKind"); 83 | } 84 | } 85 | 86 | Writer::Writer(py::object fileo, 87 | py::object schema, 88 | uint64_t batch_size, 89 | uint64_t stripe_size, 90 | uint64_t row_index_stride, 91 | int compression, 92 | int compression_strategy, 93 | uint64_t 
compression_block_size, 94 | std::set bloom_filter_columns, 95 | double bloom_filter_fpp, 96 | py::object tzone, 97 | unsigned int struct_repr, 98 | py::object conv, 99 | double padding_tolerance, 100 | double dict_key_size_threshold, 101 | py::object null_value, 102 | unsigned int memory_block_size) 103 | { 104 | currentRow = 0; 105 | batchItem = 0; 106 | ORC_UNIQUE_PTR type = createType(schema); 107 | orc::WriterOptions options; 108 | py::dict converters; 109 | 110 | if (conv.is_none()) { 111 | py::dict defaultConv = 112 | py::module::import("pyorc.converters").attr("DEFAULT_CONVERTERS"); 113 | converters = py::dict(defaultConv); 114 | } else { 115 | converters = conv; 116 | } 117 | 118 | options = options.setCompression(static_cast(compression)); 119 | options = options.setCompressionStrategy( 120 | static_cast(compression_strategy)); 121 | options = options.setCompressionBlockSize(compression_block_size); 122 | options = options.setStripeSize(stripe_size); 123 | options = options.setRowIndexStride(row_index_stride); 124 | options = options.setColumnsUseBloomFilter(bloom_filter_columns); 125 | options = options.setBloomFilterFPP(bloom_filter_fpp); 126 | options = options.setDictionaryKeySizeThreshold(dict_key_size_threshold); 127 | options = options.setPaddingTolerance(padding_tolerance); 128 | #if ORC_VERSION_AT_LEAST(2, 1, 0) 129 | options = options.setMemoryBlockSize(memory_block_size); 130 | #endif 131 | if (!tzone.is_none()) { 132 | std::string tzKey = py::cast(tzone.attr("key")); 133 | options = options.setTimezoneName(tzKey); 134 | } 135 | 136 | outStream = std::unique_ptr(new PyORCOutputStream(fileo)); 137 | writer = orc::createWriter(*type, outStream.get(), options); 138 | batchSize = batch_size; 139 | batch = writer->createRowBatch(batchSize); 140 | converter = createConverter(type.get(), struct_repr, converters, tzone, null_value); 141 | } 142 | 143 | void 144 | Writer::write(py::object row) 145 | { 146 | converter->write(batch.get(), batchItem, row); 
147 | currentRow++; 148 | batchItem++; 149 | 150 | if (batchItem == batchSize) { 151 | writer->add(*batch); 152 | converter->clear(); 153 | batchItem = 0; 154 | } 155 | } 156 | 157 | uint64_t 158 | Writer::writerows(py::iterable iter) 159 | { 160 | uint64_t rows = 0; 161 | for (auto handle : iter) { 162 | auto obj = py::cast(handle); 163 | this->write(obj); 164 | ++rows; 165 | } 166 | return rows; 167 | } 168 | 169 | void 170 | Writer::close() 171 | { 172 | if (batchItem != 0) { 173 | writer->add(*batch); 174 | converter->clear(); 175 | batchItem = 0; 176 | } 177 | writer->close(); 178 | } 179 | 180 | void 181 | Writer::addUserMetadata(py::str key, py::bytes value) 182 | { 183 | writer->addUserMetadata(key, value); 184 | } 185 | 186 | #if ORC_VERSION_AT_LEAST(1, 9, 0) 187 | uint64_t 188 | Writer::writeIntermediateFooter() 189 | { 190 | return writer->writeIntermediateFooter(); 191 | } 192 | #endif 193 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | 4 | At this point you have an installed pyorc module. 5 | 6 | Reading 7 | ------- 8 | 9 | Let's use one of the example ORC files to open in Python:: 10 | 11 | >>> import pyorc 12 | >>> example = open("./deps/examples/demo-12-zlib.orc", "rb") 13 | >>> reader = pyorc.Reader(example) 14 | 15 | See the schema of the selected file:: 16 | 17 | >>> reader.schema 18 | 19 | 20 | The Reader's schema read-only property is a :class:`TypeDescription` object, 21 | representing the ORC file's type hierarchy. 
We can get a more human-friendly 22 | interpretation if we print its string format:: 23 | 24 | >>> str(reader.schema) 25 | 'struct<_col0:int,_col1:string,_col2:string,_col3:string,_col4:int,_col5:string,_col6:int,_col7:int,_col8:int>' 26 | 27 | We can check the number of rows in the file by calling len() on the Reader:: 28 | 29 | >>> len(reader) 30 | 1920800 31 | 32 | The Reader is an iterable object, yielding a new row after every 33 | iteration:: 34 | 35 | >>> next(reader) 36 | (1, 'M', 'M', 'Primary', 500, 'Good', 0, 0, 0) 37 | >>> next(reader) 38 | (2, 'F', 'M', 'Primary', 500, 'Good', 0, 0, 0) 39 | 40 | Iterating over the file's content to process its rows is the preferred way, 41 | but we can also read the entire file into memory with the read method. 42 | This method has an optional parameter to control the maximum number of rows 43 | to read:: 44 | 45 | >>> rows = reader.read(10000) 46 | >>> rows 47 | ... (10000, 'F', 'U', 'Advanced Degree', 1500, 'Unknown', 1, 0, 0), (10001, 'M', 'M', 'Unknown', 1500, 'Unknown', 1, 0, 0), (10002, 'F', 'M', 'Unknown', 1500, 'Unknown', 1, 0, 0)] 48 | >>> reader.read() # This call froze the interpreter for several minutes! 49 | ... (1920799, 'M', 'U', 'Unknown', 10000, 'Unknown', 6, 6, 6), (1920800, 'F', 'U', 'Unknown', 10000, 'Unknown', 6, 6, 6)] 50 | 51 | Using this optional parameter for larger ORC files is highly recommended! 52 | 53 | After all the rows are read, the Reader object has no more rows to yield.
54 | There's a seek method to jump to a specific row in the file and continue the 55 | read from that point:: 56 | 57 | >>> reader.seek(1000) 58 | 1000 59 | >>> next(reader) 60 | (1001, 'M', 'M', 'College', 7500, 'Good', 0, 0, 0) 61 | 62 | By default all fields are loaded from an ORC file, but that can be changed 63 | by passing either `column_indices` or `column_names` parameter to Reader:: 64 | 65 | >>> reader = pyorc.Reader(example, column_names=("_col0", "_col5")) 66 | >>> next(reader) 67 | (1, 'Good') 68 | 69 | We can also change the representation of a struct from tuple to dictionary:: 70 | 71 | >>> from pyorc.enums import StructRepr 72 | >>> reader = pyorc.Reader(example, column_indices=(1, 5), struct_repr=StructRepr.DICT) 73 | >>> next(reader) 74 | {'_col1': 'M', '_col5': 'Good'} 75 | 76 | Stripes 77 | ------- 78 | 79 | ORC files are divided into stripes. Stripes are independent of each other. 80 | Let's open another ORC file that has multiple stripes in it:: 81 | 82 | >>> example = open("./deps/examples/TestOrcFile.testStripeLevelStats.orc", "rb") 83 | >>> reader = pyorc.Reader(example) 84 | >>> reader.num_of_stripes 85 | 3 86 | 87 | The `num_of_stripes` property of the Reader shows how many stripes are in 88 | the file. We can read a certain stripe using the `read_stripe` method:: 89 | 90 | >>> stripe2 = reader.read_stripe(2) 91 | >>> stripe2 92 | 93 | 94 | The stripe object is also an iterable object and has the same methods for 95 | reading and seeking rows, but only within the boundaries of the selected 96 | stripe:: 97 | 98 | >>> next(stripe2) 99 | (3, 'three') 100 | >>> len(stripe2) 101 | 1000 102 | >>> len(reader) 103 | 11000 104 | >>> stripe2.row_offset 105 | 10000 106 | 107 | The `row_offset` returns the absolute position of the first row in the 108 | stripe. 109 | 110 | Filtering row groups 111 | -------------------- 112 | 113 | It is possible to skip certain records in an ORC file using simple filter 114 | predicates (or search arguments).
Setting a predicate expression to the 115 | Reader can help to exclude row groups that don't satisfy the condition 116 | during reading:: 117 | 118 | >>> example = open("./deps/examples/TestStringDictionary.testRowIndex.orc", "rb") 119 | >>> reader = pyorc.Reader(example) 120 | >>> next(reader) 121 | ('row 000000',) 122 | >>> reader = pyorc.Reader(example, predicate=pyorc.predicates.PredicateColumn(pyorc.TypeKind.STRING, "str") > "row 004096") 123 | >>> next(reader) 124 | ('row 004096',) 125 | 126 | The predicate can be used to select a single row group, but not an 127 | individual record. The size of the row group is determined by the 128 | `row_index_stride`, set during writing of the file. You can create a more 129 | complex predicate using logical expressions:: 130 | 131 | >>> pred = (PredicateColumn(TypeKind.INT, "c0") > 300) & (PredicateColumn(TypeKind.STRING, "c1") == "A") 132 | 133 | One of the comparands must always be a literal value (cannot compare two 134 | columns to each other). 135 | 136 | Writing 137 | ------- 138 | 139 | To write a new ORC file we need to open a binary file-like object and pass 140 | it to a Writer object with an ORC schema description. The schema can be a 141 | TypeDescription or a simple string ORC schema definition:: 142 | 143 | >>> output = open("./new.orc", "wb") 144 | >>> writer = pyorc.Writer(output, "struct") 145 | >>> writer 146 | 147 | 148 | We can add rows to the file with the `write` method:: 149 | 150 | >>> writer.write((0, "Test 0")) 151 | >>> writer.write((1, "Test 1")) 152 | 153 | Don't forget to close the writer to write out the necessary metadata, 154 | otherwise it won't be a valid ORC file. 155 | 156 | >>> writer.close() 157 | 158 | For simpler use, the Writer object can be used as a context manager and you 159 | can also change the struct representation to use dictionaries as rows instead 160 | of tuples as well: 161 | 162 | ..
code-block:: python 163 | 164 | with open("./new.orc", "wb") as output: 165 | with pyorc.Writer(output, "struct", struct_repr=StructRepr.DICT) as writer: 166 | writer.write({"col0": 0, "col1": "Test 0"}) 167 | 168 | 169 | Using custom converters 170 | ----------------------- 171 | 172 | It's possible to change the default converters that handle the transformations 173 | from ORC `date`, `decimal`, and `timestamp` types to Python objects, and back. 174 | To create your own converter you need to implement the :class:`ORCConverter` 175 | abstract class with two methods: ``from_orc`` and ``to_orc``. The following 176 | example returns the ORC timestamp values as seconds and nanoseconds pair: 177 | 178 | .. code-block:: python 179 | 180 | import pyorc 181 | from pyorc.converters import ORCConverter 182 | 183 | class TSConverter(ORCConverter): 184 | @staticmethod 185 | def to_orc(*args): 186 | seconds, nanoseconds, timezone = args 187 | return (seconds, nanoseconds) 188 | 189 | @staticmethod 190 | def from_orc(seconds, nanoseconds, timezone): 191 | return (seconds, nanoseconds) 192 | 193 | To use the converter you have to set the Reader's or Writer's converters 194 | parameter as a dictionary with one of the supported types as key:: 195 | 196 | data = open("./timestamps.orc", "rb") 197 | reader = pyorc.Reader(data, converters={TypeKind.TIMESTAMP: TSConverter}) 198 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | branches: 3 | include: 4 | - '*' 5 | 6 | stages: 7 | - stage: test 8 | jobs: 9 | - job: ubuntu 10 | pool: 11 | vmImage: "ubuntu-latest" 12 | strategy: 13 | matrix: 14 | Python39: 15 | python.version: '3.9' 16 | Python310: 17 | python.version: '3.10' 18 | Python311: 19 | python.version: '3.11' 20 | Python312: 21 | python.version: '3.12' 22 | Python313: 23 | python.version: '3.13' 24 | PyPy3: 25 | python.version: 
'pypy3' 26 | steps: 27 | - task: UsePythonVersion@0 28 | inputs: 29 | versionSpec: '$(python.version)' 30 | architecture: 'x64' 31 | allowUnstable: true 32 | - template: .azure-pipelines/build-run-tests.yml 33 | 34 | - job: ubuntu_eastern_timezone 35 | pool: 36 | vmImage: "ubuntu-latest" 37 | strategy: 38 | matrix: 39 | Python38: 40 | python.version: '3.11' 41 | steps: 42 | - bash: sudo timedatectl set-timezone America/New_York 43 | displayName: Set timezone 44 | - bash: date 45 | - task: UsePythonVersion@0 46 | inputs: 47 | versionSpec: '$(python.version)' 48 | architecture: 'x64' 49 | - template: .azure-pipelines/build-run-tests.yml 50 | 51 | - job: previous_orc_versions 52 | pool: 53 | vmImage: "ubuntu-latest" 54 | strategy: 55 | matrix: 56 | ORC17: 57 | orc.version: '1.7.11' 58 | ORC18: 59 | orc.version: '1.8.8' 60 | ORC19: 61 | orc.version: '1.9.5' 62 | steps: 63 | - task: UsePythonVersion@0 64 | inputs: 65 | versionSpec: '3.11' 66 | architecture: 'x64' 67 | - template: .azure-pipelines/build-run-tests.yml 68 | parameters: 69 | orc_version: '$(orc.version)' 70 | 71 | - job: macos 72 | pool: 73 | vmImage: 'macOS-latest' 74 | strategy: 75 | matrix: 76 | Python39: 77 | python.version: '3.9' 78 | Python310: 79 | python.version: '3.10' 80 | Python311: 81 | python.version: '3.11' 82 | Python312: 83 | python.version: '3.12' 84 | Python313: 85 | python.version: '3.13' 86 | steps: 87 | - task: UsePythonVersion@0 88 | inputs: 89 | versionSpec: '$(python.version)' 90 | architecture: 'x64' 91 | allowUnstable: true 92 | - template: .azure-pipelines/build-run-tests.yml 93 | 94 | - job: windows 95 | pool: 96 | vmImage: 'windows-2019' 97 | strategy: 98 | matrix: 99 | Python39: 100 | python.version: '3.9' 101 | Python310: 102 | python.version: '3.10' 103 | Python311: 104 | python.version: '3.11' 105 | Python312: 106 | python.version: '3.12' 107 | Python313: 108 | python.version: '3.13' 109 | 110 | steps: 111 | - task: UsePythonVersion@0 112 | inputs: 113 | versionSpec: 
'$(python.version)' 114 | architecture: 'x64' 115 | allowUnstable: true 116 | - template: .azure-pipelines/build-run-tests.yml 117 | parameters: 118 | windows: true 119 | 120 | - stage: build_wheels 121 | jobs: 122 | - job: manylinux_x86_64 123 | pool: 124 | vmImage: 'ubuntu-latest' 125 | steps: 126 | - template: .azure-pipelines/prepare-and-push-wheels.yml 127 | parameters: 128 | cibwStep: 129 | bash: cibuildwheel --output-dir wheelhouse . 130 | env: 131 | CIBW_BUILD_VERBOSITY: 3 132 | CIBW_ARCHS: x86_64 133 | CIBW_BUILD: "*-manylinux_*" 134 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 135 | displayName: Build wheels 136 | 137 | - job: musllinux_x86_64 138 | pool: 139 | vmImage: 'ubuntu-latest' 140 | steps: 141 | - template: .azure-pipelines/prepare-and-push-wheels.yml 142 | parameters: 143 | cibwStep: 144 | bash: cibuildwheel --output-dir wheelhouse . 145 | env: 146 | CIBW_BUILD_VERBOSITY: 3 147 | CIBW_ARCHS: x86_64 148 | CIBW_BUILD: "*-musllinux_*" 149 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 150 | displayName: Build wheels 151 | 152 | - job: manylinux_cpy_aarch64 153 | timeoutInMinutes: 165 154 | pool: 155 | vmImage: 'ubuntu-latest' 156 | steps: 157 | - template: .azure-pipelines/prepare-and-push-wheels.yml 158 | parameters: 159 | qemu: true 160 | cibwStep: 161 | bash: cibuildwheel --output-dir wheelhouse . 162 | env: 163 | CIBW_BUILD_VERBOSITY: 3 164 | CIBW_ARCHS: aarch64 165 | CIBW_BUILD: "*-manylinux_*" 166 | CIBW_SKIP: "pp* cp36-* cp37-* cp38-*" 167 | displayName: Build wheels 168 | 169 | - job: manylinux_pypy_aarch64 170 | timeoutInMinutes: 165 171 | pool: 172 | vmImage: 'ubuntu-latest' 173 | steps: 174 | - template: .azure-pipelines/prepare-and-push-wheels.yml 175 | parameters: 176 | qemu: true 177 | cibwStep: 178 | bash: cibuildwheel --output-dir wheelhouse . 
179 | env: 180 | CIBW_BUILD_VERBOSITY: 3 181 | CIBW_ARCHS: aarch64 182 | CIBW_BUILD: "*-manylinux_*" 183 | CIBW_SKIP: "cp* pp37-* pp38-*" 184 | displayName: Build wheels 185 | 186 | - job: musllinux_aarch64 187 | timeoutInMinutes: 165 188 | pool: 189 | vmImage: 'ubuntu-latest' 190 | steps: 191 | - template: .azure-pipelines/prepare-and-push-wheels.yml 192 | parameters: 193 | qemu: true 194 | cibwStep: 195 | bash: cibuildwheel --output-dir wheelhouse . 196 | env: 197 | CIBW_BUILD_VERBOSITY: 3 198 | CIBW_ARCHS: aarch64 199 | CIBW_BUILD: "*-musllinux_*" 200 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 201 | displayName: Build wheels 202 | 203 | - job: macos 204 | pool: 205 | vmImage: 'macOS-latest' 206 | steps: 207 | - template: .azure-pipelines/prepare-and-push-wheels.yml 208 | parameters: 209 | cibwStep: 210 | bash: cibuildwheel --output-dir wheelhouse . 211 | env: 212 | CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=10.13" 213 | CIBW_BUILD_VERBOSITY: 3 214 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 215 | displayName: Build wheels 216 | 217 | - job: macos_universal2 218 | pool: 219 | vmImage: 'macOS-latest' 220 | steps: 221 | - template: .azure-pipelines/prepare-and-push-wheels.yml 222 | parameters: 223 | cibwStep: 224 | bash: cibuildwheel --output-dir wheelhouse . 225 | env: 226 | CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=10.13" 227 | CIBW_BUILD_VERBOSITY: 3 228 | CMAKE_OSX_ARCHITECTURES: 'x86_64;arm64' 229 | CIBW_ARCHS_MACOS: universal2 230 | CIBW_SKIP: "cp36-* cp37-* cp38-*" 231 | displayName: Build wheels 232 | 233 | - job: windows_amd64 234 | pool: 235 | vmImage: 'windows-2019' 236 | steps: 237 | - template: .azure-pipelines/prepare-and-push-wheels.yml 238 | parameters: 239 | cibwStep: 240 | bash: cibuildwheel --output-dir wheelhouse . 
241 | env: 242 | CIBW_BUILD_VERBOSITY: 3 243 | CIBW_ARCHS: AMD64 244 | CIBW_SKIP: "cp36-* cp37-* cp38-* pp37-* pp38-*" 245 | displayName: Build wheels 246 | -------------------------------------------------------------------------------- /src/pyorc/typedescription.py: -------------------------------------------------------------------------------- 1 | import re 2 | from types import MappingProxyType 3 | from typing import Dict, Mapping, Tuple 4 | 5 | from pyorc._pyorc import _schema_from_string 6 | 7 | from .enums import TypeKind 8 | 9 | 10 | class TypeDescription: 11 | name = "" 12 | kind = -1 13 | 14 | def __init__(self) -> None: 15 | self._column_id = 0 16 | self._attributes: Dict[str, str] = {} 17 | 18 | def __str__(self) -> str: 19 | return self.name 20 | 21 | @property 22 | def attributes(self) -> Dict[str, str]: 23 | return self._attributes 24 | 25 | def set_attributes(self, val) -> None: 26 | if isinstance(val, dict): 27 | if all( 28 | isinstance(key, str) and isinstance(val, str) 29 | for key, val in val.items() 30 | ): 31 | self._attributes = val 32 | else: 33 | raise TypeError( 34 | "The all keys and values in the attributes dictionary must be string" 35 | ) 36 | else: 37 | raise TypeError("The attributes must be a dictionary") 38 | 39 | @property 40 | def column_id(self) -> int: 41 | return self._column_id 42 | 43 | def set_column_id(self, val: int) -> int: 44 | self._column_id = val 45 | return self._column_id 46 | 47 | def find_column_id(self, dotted_key: str) -> int: 48 | raise KeyError(dotted_key) 49 | 50 | @staticmethod 51 | def from_string(schema: str) -> "TypeDescription": 52 | return _schema_from_string(schema) 53 | 54 | 55 | class Boolean(TypeDescription): 56 | name = "boolean" 57 | kind = TypeKind.BOOLEAN 58 | 59 | 60 | class TinyInt(TypeDescription): 61 | name = "tinyint" 62 | kind = TypeKind.BYTE 63 | 64 | 65 | class SmallInt(TypeDescription): 66 | name = "smallint" 67 | kind = TypeKind.SHORT 68 | 69 | 70 | class Int(TypeDescription): 71 | 
name = "int" 72 | kind = TypeKind.INT 73 | 74 | 75 | class BigInt(TypeDescription): 76 | name = "bigint" 77 | kind = TypeKind.LONG 78 | 79 | 80 | class Float(TypeDescription): 81 | name = "float" 82 | kind = TypeKind.FLOAT 83 | 84 | 85 | class Double(TypeDescription): 86 | name = "double" 87 | kind = TypeKind.DOUBLE 88 | 89 | 90 | class String(TypeDescription): 91 | name = "string" 92 | kind = TypeKind.STRING 93 | 94 | 95 | class Binary(TypeDescription): 96 | name = "binary" 97 | kind = TypeKind.BINARY 98 | 99 | 100 | class Timestamp(TypeDescription): 101 | name = "timestamp" 102 | kind = TypeKind.TIMESTAMP 103 | 104 | 105 | class TimestampInstant(TypeDescription): 106 | name = "timestamp with local time zone" 107 | kind = TypeKind.TIMESTAMP_INSTANT 108 | 109 | 110 | class Date(TypeDescription): 111 | name = "date" 112 | kind = TypeKind.DATE 113 | 114 | 115 | class Char(TypeDescription): 116 | name = "char" 117 | kind = TypeKind.CHAR 118 | 119 | def __init__(self, max_length: int) -> None: 120 | self.max_length = max_length 121 | super().__init__() 122 | 123 | def __str__(self) -> str: 124 | return "{name}({len})".format(name=Char.name, len=self.max_length) 125 | 126 | 127 | class VarChar(TypeDescription): 128 | name = "varchar" 129 | kind = TypeKind.VARCHAR 130 | 131 | def __init__(self, max_length: int) -> None: 132 | super().__init__() 133 | self.max_length = max_length 134 | 135 | def __str__(self) -> str: 136 | return "{name}({len})".format(name=VarChar.name, len=self.max_length) 137 | 138 | 139 | class Decimal(TypeDescription): 140 | name = "decimal" 141 | kind = TypeKind.DECIMAL 142 | 143 | def __init__(self, precision: int, scale: int) -> None: 144 | super().__init__() 145 | self.precision = precision 146 | self.scale = scale 147 | 148 | def __str__(self) -> str: 149 | return "{name}({prc},{scl})".format( 150 | name=Decimal.name, prc=self.precision, scl=self.scale 151 | ) 152 | 153 | 154 | class Union(TypeDescription): 155 | name = "uniontype" 156 | kind = 
TypeKind.UNION 157 | 158 | def __init__(self, *cont_types: TypeDescription) -> None: 159 | super().__init__() 160 | for c_types in cont_types: 161 | if not isinstance(c_types, TypeDescription): 162 | raise TypeError("Invalid container type for Union") 163 | self.__cont_types = cont_types 164 | 165 | def __str__(self): 166 | return "{name}<{types}>".format( 167 | name=Union.name, types=",".join(str(typ) for typ in self.__cont_types), 168 | ) 169 | 170 | def __getitem__(self, idx: int) -> TypeDescription: 171 | return self.__cont_types[idx] 172 | 173 | def set_column_id(self, val: int) -> int: 174 | self._column_id = val 175 | for c_type in self.__cont_types: 176 | val = c_type.set_column_id(val + 1) 177 | return val 178 | 179 | @property 180 | def cont_types(self) -> Tuple[TypeDescription, ...]: 181 | return self.__cont_types 182 | 183 | 184 | class Array(TypeDescription): 185 | name = "array" 186 | kind = TypeKind.LIST 187 | 188 | def __init__(self, cont_type: TypeDescription) -> None: 189 | super().__init__() 190 | if not isinstance(cont_type, TypeDescription): 191 | raise TypeError("Array's container type must be a TypeDescription instance") 192 | self.__type = cont_type 193 | 194 | def __str__(self) -> str: 195 | return "{name}<{type}>".format(name=Array.name, type=str(self.__type)) 196 | 197 | def set_column_id(self, val: int) -> int: 198 | self._column_id = val 199 | val = self.__type.set_column_id(val + 1) 200 | return val 201 | 202 | @property 203 | def type(self) -> TypeDescription: 204 | return self.__type 205 | 206 | 207 | class Map(TypeDescription): 208 | name = "map" 209 | kind = TypeKind.MAP 210 | 211 | def __init__(self, key: TypeDescription, value: TypeDescription) -> None: 212 | super().__init__() 213 | if not isinstance(key, TypeDescription): 214 | raise TypeError("Map's key type must be a TypeDescription instance") 215 | if not isinstance(value, TypeDescription): 216 | raise TypeError("Map's value type must be a TypeDescription instance") 217 | 
self.__key = key 218 | self.__value = value 219 | 220 | def __str__(self) -> str: 221 | return "{name}<{key},{val}>".format( 222 | name=Map.name, key=str(self.__key), val=str(self.__value) 223 | ) 224 | 225 | def set_column_id(self, val: int) -> int: 226 | self._column_id = val 227 | val = self.__key.set_column_id(val + 1) 228 | val = self.__value.set_column_id(val + 1) 229 | return val 230 | 231 | @property 232 | def key(self) -> TypeDescription: 233 | return self.__key 234 | 235 | @property 236 | def value(self) -> TypeDescription: 237 | return self.__value 238 | 239 | 240 | class Struct(TypeDescription): 241 | name = "struct" 242 | kind = TypeKind.STRUCT 243 | 244 | def __init__(self, **fields: TypeDescription) -> None: 245 | super().__init__() 246 | for fld in fields.values(): 247 | if not isinstance(fld, TypeDescription): 248 | raise TypeError( 249 | "Struct's field type must be a TypeDescription instance" 250 | ) 251 | self.__fields = fields 252 | self.set_column_id(0) 253 | 254 | def __str__(self) -> str: 255 | return "{name}<{fields}>".format( 256 | name=Struct.name, 257 | fields=",".join( 258 | "{field}:{type}".format(field=key, type=str(val)) 259 | for key, val in self.__fields.items() 260 | ), 261 | ) 262 | 263 | def __getitem__(self, key: str) -> TypeDescription: 264 | return self.__fields[key] 265 | 266 | def set_column_id(self, val: int) -> int: 267 | self._column_id = val 268 | for fld in self.__fields.values(): 269 | val = fld.set_column_id(val + 1) 270 | return val 271 | 272 | def find_column_id(self, dotted_key: str) -> int: 273 | this = self 274 | # Allow to use backtick for escaping column names with dot. 
275 | for key in re.findall(r"[^\.`]+|`[^`]*`", dotted_key): 276 | this = this[key.replace("`", "")] 277 | return this.column_id 278 | 279 | @property 280 | def fields(self) -> Mapping[str, TypeDescription]: 281 | return MappingProxyType(self.__fields) 282 | -------------------------------------------------------------------------------- /tests/test_column.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import io 4 | import math 5 | 6 | from datetime import date, datetime, timedelta, timezone 7 | from decimal import Decimal 8 | 9 | from pyorc import ( 10 | Reader, 11 | Writer, 12 | TypeKind, 13 | StructRepr, 14 | ParseError, 15 | Column, 16 | Stripe, 17 | ) 18 | 19 | 20 | @pytest.fixture 21 | def striped_orc_data(): 22 | def _init(schema, rows, bfc=tuple()): 23 | data = io.BytesIO() 24 | with Writer( 25 | data, 26 | schema, 27 | batch_size=65535, 28 | stripe_size=128, 29 | compression_block_size=128, 30 | bloom_filter_columns=bfc, 31 | memory_block_size=64, 32 | ) as writer: 33 | writer.writerows(rows) 34 | data.seek(0) 35 | return data 36 | 37 | return _init 38 | 39 | 40 | def test_init(striped_orc_data): 41 | data = striped_orc_data("struct", ((i, i * 5) for i in range(100000))) 42 | reader = Reader(data, column_indices=(1,)) 43 | stripe = Stripe(reader, 0) 44 | with pytest.raises(TypeError): 45 | _ = Column(stripe, "0") 46 | with pytest.raises(IndexError): 47 | _ = Column(stripe, 100) 48 | with pytest.raises(IndexError): 49 | _ = Column(reader, 100) 50 | with pytest.raises(IndexError): 51 | _ = Column(reader, 1) 52 | col = Column(stripe, 0) 53 | assert col is not None 54 | col = Column(reader, 0) 55 | assert col is not None 56 | 57 | 58 | def test_getitem(striped_orc_data): 59 | data = striped_orc_data("int", (i for i in range(100000))) 60 | reader = Reader(data) 61 | stripe = Stripe(reader, 0) 62 | col = reader[0] 63 | assert col is not None 64 | col = stripe[0] 65 | assert col is not None 66 | 67 | 
68 | def test_statistics_bool(striped_orc_data): 69 | data = striped_orc_data( 70 | "struct", (((True, False, None)[i % 3],) for i in range(100000)) 71 | ) 72 | reader = Reader(data) 73 | stripe = Stripe(reader, 0) 74 | stat = stripe[0].statistics 75 | assert stat["has_null"] is False 76 | assert stat["number_of_values"] == 65535 77 | assert stat["kind"] == TypeKind.STRUCT 78 | stat = stripe[1].statistics 79 | assert stat["has_null"] is True 80 | assert stat["kind"] == TypeKind.BOOLEAN 81 | assert stat["number_of_values"] == 43690 82 | assert stat["false_count"] == 21845 83 | assert stat["true_count"] == len([i for i, in stripe if i is True]) 84 | stat = reader[1].statistics 85 | assert stat["has_null"] is True 86 | assert stat["number_of_values"] == 66667 87 | assert stat["false_count"] == len([i for i, in reader if i is False]) 88 | assert stat["true_count"] == 33334 89 | assert reader[0].statistics["number_of_values"] == 100000 90 | 91 | 92 | def test_statistics_int(striped_orc_data): 93 | data = striped_orc_data("int", (i for i in range(100000))) 94 | reader = Reader(data) 95 | stripe = Stripe(reader, 0) 96 | stat = stripe[0].statistics 97 | assert stat["has_null"] is False 98 | assert stat["number_of_values"] == 65535 99 | assert stat["kind"] == TypeKind.INT 100 | assert stat["minimum"] == 0 101 | assert stat["maximum"] == 65534 102 | assert stat["sum"] == sum(i for i in range(len(stripe))) 103 | stat = reader[0].statistics 104 | assert stat["minimum"] == 0 105 | assert stat["maximum"] == 99999 106 | assert stat["sum"] == sum(i for i in range(100000)) 107 | assert reader.read_stripe(1)[0].statistics["minimum"] == 65535 108 | 109 | 110 | def test_statistics_double(striped_orc_data): 111 | data = striped_orc_data("double", (i * 0.1 for i in range(100000))) 112 | reader = Reader(data) 113 | stripe = Stripe(reader, 0) 114 | stat = stripe[0].statistics 115 | assert stat["has_null"] is False 116 | assert stat["number_of_values"] == 65535 117 | assert stat["kind"] == 
TypeKind.DOUBLE 118 | assert stat["minimum"] == 0 119 | assert math.isclose(stat["maximum"], 6553.4) 120 | assert stat["sum"] == sum(i * 0.1 for i in range(len(stripe))) 121 | stat = reader[0].statistics 122 | assert stat["minimum"] == 0 123 | assert math.isclose(stat["maximum"], 9999.9) 124 | assert stat["sum"] == sum(i * 0.1 for i in range(100000)) 125 | assert reader.read_stripe(1)[0].statistics["minimum"] == 6553.5 126 | 127 | 128 | def test_statistics_binary(striped_orc_data): 129 | data = striped_orc_data("binary", (b"\x4D\x45\x34\x01" for i in range(100000))) 130 | reader = Reader(data) 131 | stripe = Stripe(reader, 0) 132 | stat = stripe[0].statistics 133 | assert stat["has_null"] is False 134 | assert stat["kind"] == TypeKind.BINARY 135 | assert stat["number_of_values"] == 65535 136 | assert stat["total_length"] == sum(len(i) for i in stripe) 137 | stat = reader[0].statistics 138 | assert stat["total_length"] == sum(len(i) for i in reader) 139 | 140 | 141 | def test_statistics_string(striped_orc_data): 142 | data = striped_orc_data( 143 | "string", ("Test String {0}".format(i + 1) for i in range(100000)) 144 | ) 145 | reader = Reader(data) 146 | stripe = Stripe(reader, 0) 147 | stat = stripe[0].statistics 148 | assert stat["has_null"] is False 149 | assert stat["kind"] == TypeKind.STRING 150 | assert stat["number_of_values"] == 65535 151 | assert stat["total_length"] == sum(len(i) for i in stripe) 152 | assert stat["minimum"] == "Test String 1" 153 | assert stat["maximum"] == max(i for i in Stripe(reader, 0)) 154 | stat = reader[0].statistics 155 | assert stat["maximum"] == max(i for i in reader) 156 | assert reader.read_stripe(1)[0].statistics["minimum"] == "Test String 100000" 157 | 158 | 159 | def test_statistics_date(striped_orc_data): 160 | data = striped_orc_data( 161 | "date", (date(1900, 1, 1) + timedelta(days=i) for i in range(100000)) 162 | ) 163 | reader = Reader(data) 164 | stripe = Stripe(reader, 0) 165 | stat = stripe[0].statistics 166 | 
assert stat["kind"] == TypeKind.DATE 167 | assert stat["has_null"] is False 168 | assert stat["number_of_values"] == 65535 169 | assert stat["minimum"] == date(1900, 1, 1) 170 | assert stat["maximum"] == date(2079, 6, 5) 171 | stat = reader[0].statistics 172 | assert stat["maximum"] == max(i for i in reader) 173 | 174 | 175 | def test_statistics_timestamp(striped_orc_data): 176 | data = striped_orc_data( 177 | "timestamp", 178 | ( 179 | datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) + timedelta(minutes=i) 180 | for i in range(100000) 181 | ), 182 | ) 183 | reader = Reader(data) 184 | stripe = Stripe(reader, 0) 185 | stat = stripe[0].statistics 186 | assert stat["kind"] == TypeKind.TIMESTAMP 187 | assert stat["has_null"] is False 188 | assert stat["number_of_values"] == len(stripe) 189 | assert stat["minimum"] == datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) 190 | assert stat["maximum"] == max(i for i in stripe) 191 | assert stat["lower_bound"] == datetime(2000, 1, 1, 12, 0, tzinfo=timezone.utc) 192 | assert stat["upper_bound"] == datetime( 193 | 2000, 2, 16, 0, 14, 0, 1000, tzinfo=timezone.utc 194 | ) 195 | stat = reader[0].statistics 196 | assert stat["maximum"] == max(i for i in reader) 197 | assert stat["upper_bound"] == datetime( 198 | 2000, 3, 10, 22, 39, 0, 1000, tzinfo=timezone.utc 199 | ) 200 | 201 | 202 | def test_statistics_decimal(striped_orc_data): 203 | data = striped_orc_data( 204 | "decimal(10,3)", 205 | (Decimal("1000.1") + Decimal((i + 100) * 0.1) for i in range(100000)), 206 | ) 207 | reader = Reader(data) 208 | stripe = Stripe(reader, 0) 209 | stat = stripe[0].statistics 210 | assert stat["kind"] == TypeKind.DECIMAL 211 | assert stat["has_null"] is False 212 | assert stat["number_of_values"] == len(stripe) 213 | assert stat["minimum"] == Decimal("1010.100") 214 | assert stat["maximum"] == Decimal("7563.500") 215 | assert stat["sum"] == sum( 216 | Decimal("1000.1") + Decimal((i + 100) * 0.1) for i in range(len(stripe)) 217 | 
).quantize(Decimal("1.000")) 218 | stat = reader[0].statistics 219 | assert stat["sum"] == sum( 220 | Decimal("1000.1") + Decimal((i + 100) * 0.1) for i in range(100000) 221 | ).quantize(Decimal("1.000")) 222 | assert reader.read_stripe(1)[0].statistics["minimum"] == Decimal("7563.600") 223 | 224 | 225 | def test_statistics_array_int(striped_orc_data): 226 | data = striped_orc_data( 227 | "struct>", 228 | (([j + i for j in range(30)],) for i in range(100000)), 229 | ) 230 | reader = Reader(data) 231 | stripe = reader.read_stripe(0) 232 | stat = stripe[2].statistics 233 | assert stripe[1].statistics["kind"] == TypeKind.LIST 234 | assert stat["kind"] == TypeKind.INT 235 | assert sum(i for col in reader.read_stripe(0) for i in col[0]) == stat["sum"] 236 | assert min(i for col in reader.read_stripe(0) for i in col[0]) == stat["minimum"] 237 | assert max(i for col in reader.read_stripe(0) for i in col[0]) == stat["maximum"] 238 | stat = reader[2].statistics 239 | assert max(i for col in reader for i in col[0]) == stat["maximum"] 240 | -------------------------------------------------------------------------------- /src/_pyorc/SearchArgument.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "SearchArgument.h" 4 | 5 | std::tuple 6 | buildLiteral(py::object column, 7 | py::object value, 8 | py::dict convDict, 9 | py::object timezoneInfo) 10 | { 11 | int colType = py::cast(column.attr("type_kind")); 12 | switch (colType) { 13 | case orc::TypeKind::BOOLEAN: 14 | if (value.is_none()) { 15 | return std::make_tuple(orc::PredicateDataType::BOOLEAN, 16 | orc::Literal(orc::PredicateDataType::BOOLEAN)); 17 | } else { 18 | return std::make_tuple(orc::PredicateDataType::BOOLEAN, 19 | orc::Literal(py::cast(value))); 20 | } 21 | case orc::TypeKind::BYTE: 22 | case orc::TypeKind::SHORT: 23 | case orc::TypeKind::INT: 24 | case orc::TypeKind::LONG: 25 | if (value.is_none()) { 26 | return 
std::make_tuple(orc::PredicateDataType::LONG, 27 | orc::Literal(orc::PredicateDataType::LONG)); 28 | } else { 29 | return std::make_tuple(orc::PredicateDataType::LONG, 30 | orc::Literal(py::cast(value))); 31 | } 32 | case orc::TypeKind::FLOAT: 33 | case orc::TypeKind::DOUBLE: 34 | if (value.is_none()) { 35 | return std::make_tuple(orc::PredicateDataType::FLOAT, 36 | orc::Literal(orc::PredicateDataType::FLOAT)); 37 | } else { 38 | return std::make_tuple(orc::PredicateDataType::FLOAT, 39 | orc::Literal(py::cast(value))); 40 | } 41 | case orc::TypeKind::CHAR: 42 | case orc::TypeKind::VARCHAR: 43 | case orc::TypeKind::STRING: { 44 | if (value.is_none()) { 45 | return std::make_tuple(orc::PredicateDataType::STRING, 46 | orc::Literal(orc::PredicateDataType::STRING)); 47 | } else { 48 | std::string str = py::cast(value); 49 | return std::make_tuple(orc::PredicateDataType::STRING, 50 | orc::Literal(str.c_str(), str.size())); 51 | } 52 | } 53 | case orc::TypeKind::DATE: { 54 | if (value.is_none()) { 55 | return std::make_tuple(orc::PredicateDataType::DATE, 56 | orc::Literal(orc::PredicateDataType::DATE)); 57 | } else { 58 | py::object idx(py::int_(static_cast(orc::TypeKind::DATE))); 59 | py::object to_orc = convDict[idx].attr("to_orc"); 60 | return std::make_tuple(orc::PredicateDataType::DATE, 61 | orc::Literal(orc::PredicateDataType::DATE, 62 | py::cast(to_orc(value)))); 63 | } 64 | } 65 | case orc::TypeKind::TIMESTAMP: 66 | case orc::TypeKind::TIMESTAMP_INSTANT: { 67 | if (value.is_none()) { 68 | return std::make_tuple(orc::PredicateDataType::TIMESTAMP, 69 | orc::Literal(orc::PredicateDataType::TIMESTAMP)); 70 | } else { 71 | py::object idx(py::int_(static_cast(orc::TypeKind::TIMESTAMP))); 72 | py::object to_orc = convDict[idx].attr("to_orc"); 73 | py::tuple res = to_orc(value, timezoneInfo); 74 | return std::make_tuple( 75 | orc::PredicateDataType::TIMESTAMP, 76 | orc::Literal(py::cast(res[0]), py::cast(res[1]))); 77 | } 78 | } 79 | case orc::TypeKind::DECIMAL: { 80 | if 
(value.is_none()) { 81 | return std::make_tuple(orc::PredicateDataType::DECIMAL, 82 | orc::Literal(orc::PredicateDataType::DECIMAL)); 83 | } else { 84 | py::object idx(py::int_(static_cast(orc::TypeKind::DECIMAL))); 85 | uint64_t precision = py::cast(column.attr("precision")); 86 | uint64_t scale = py::cast(column.attr("scale")); 87 | py::object to_orc = convDict[idx].attr("to_orc"); 88 | py::object res = to_orc(precision, scale, value); 89 | std::string strRes = py::cast(py::str(res)); 90 | return std::make_tuple(orc::PredicateDataType::DECIMAL, 91 | orc::Literal(orc::Int128(strRes), 92 | static_cast(precision), 93 | static_cast(scale))); 94 | } 95 | } 96 | default: 97 | throw py::type_error("Unsupported type for ORC Literal in predicate"); 98 | } 99 | } 100 | 101 | orc::SearchArgumentBuilder& 102 | buildSearchArgument(orc::SearchArgumentBuilder& sarg, 103 | py::tuple predVals, 104 | py::dict convDict, 105 | py::object timezoneInfo) 106 | { 107 | int opCode = py::cast(predVals[0]); 108 | switch (opCode) { 109 | case 0: /* NOT */ 110 | return buildSearchArgument( 111 | sarg.startNot(), predVals[1], convDict, timezoneInfo) 112 | .end(); 113 | case 1: /* OR */ 114 | return buildSearchArgument( 115 | buildSearchArgument( 116 | sarg.startOr(), predVals[1], convDict, timezoneInfo), 117 | predVals[2], 118 | convDict, 119 | timezoneInfo) 120 | .end(); 121 | case 2: /* AND */ 122 | return buildSearchArgument( 123 | buildSearchArgument( 124 | sarg.startAnd(), predVals[1], convDict, timezoneInfo), 125 | predVals[2], 126 | convDict, 127 | timezoneInfo) 128 | .end(); 129 | case 3: { /* EQ */ 130 | py::object colName = predVals[1].attr("name"); 131 | py::object colIdx = predVals[1].attr("index"); 132 | std::tuple res = 133 | buildLiteral(predVals[1], predVals[2], convDict, timezoneInfo); 134 | if (!colName.is_none()) { 135 | return sarg.equals( 136 | py::cast(colName), std::get<0>(res), std::get<1>(res)); 137 | } else if (!colIdx.is_none()) { 138 | return sarg.equals( 139 | 
py::cast(colIdx), std::get<0>(res), std::get<1>(res)); 140 | } else { 141 | throw py::type_error("Either name or index parameter must be set"); 142 | } 143 | } 144 | case 4: { /* LT */ 145 | py::object colName = predVals[1].attr("name"); 146 | py::object colIdx = predVals[1].attr("index"); 147 | std::tuple res = 148 | buildLiteral(predVals[1], predVals[2], convDict, timezoneInfo); 149 | if (!colName.is_none()) { 150 | return sarg.lessThan( 151 | py::cast(colName), std::get<0>(res), std::get<1>(res)); 152 | } else if (!colIdx.is_none()) { 153 | return sarg.lessThan( 154 | py::cast(colIdx), std::get<0>(res), std::get<1>(res)); 155 | } else { 156 | throw py::type_error("Either name or index parameter must be set"); 157 | } 158 | } 159 | case 5: { /* LE */ 160 | py::object colName = predVals[1].attr("name"); 161 | py::object colIdx = predVals[1].attr("index"); 162 | std::tuple res = 163 | buildLiteral(predVals[1], predVals[2], convDict, timezoneInfo); 164 | if (!colName.is_none()) { 165 | return sarg.lessThanEquals( 166 | py::cast(colName), std::get<0>(res), std::get<1>(res)); 167 | } else if (!colIdx.is_none()) { 168 | return sarg.lessThanEquals( 169 | py::cast(colIdx), std::get<0>(res), std::get<1>(res)); 170 | } else { 171 | throw py::type_error("Either name or index parameter must be set"); 172 | } 173 | } 174 | default: 175 | throw py::type_error("Invalid operation on Literal in predicate"); 176 | } 177 | return sarg; 178 | } 179 | 180 | std::unique_ptr 181 | createSearchArgument(py::object predicate, py::dict convDict, py::object timezoneInfo) 182 | { 183 | std::unique_ptr builder = 184 | orc::SearchArgumentFactory::newBuilder(); 185 | try { 186 | py::tuple predVals = predicate.attr("values"); 187 | buildSearchArgument(*builder.get(), predVals, convDict, timezoneInfo); 188 | return builder->build(); 189 | } catch (py::error_already_set& err) { 190 | if (err.matches(PyExc_AttributeError)) { 191 | std::string strbuf("Invalid predicate: "); 192 | 
strbuf.append(py::cast(py::repr(predicate)).c_str()); 193 | throw py::type_error(strbuf.c_str()); 194 | } else { 195 | throw; 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /tests/compare/test_reader_cmp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import json 4 | import gzip 5 | import os 6 | import math 7 | import subprocess 8 | import sys 9 | import platform 10 | 11 | try: 12 | import zoneinfo 13 | except ImportError: 14 | from backports import zoneinfo 15 | 16 | from pyorc import TypeKind, StructRepr 17 | import pyorc._pyorc 18 | 19 | ORC_METADATA_PATH = "deps/bin/orc-metadata" 20 | 21 | 22 | def traverse_json_row(schema, value, parent=""): 23 | if schema.kind < 8: 24 | # Primitive types, no transformation. 25 | yield schema.kind, parent, value 26 | elif schema.kind == TypeKind.STRUCT: 27 | for key, val in value.items(): 28 | yield from traverse_json_row( 29 | schema[key], val, "{0}.{1}".format(parent, key) 30 | ) 31 | elif schema.kind == TypeKind.MAP: 32 | for keypair in value: 33 | yield from traverse_json_row( 34 | schema.value, 35 | keypair["value"], 36 | "{0}['{1}']".format(parent, keypair["key"]), 37 | ) 38 | elif schema.kind == TypeKind.LIST: 39 | for idx, item in enumerate(value): 40 | yield from traverse_json_row( 41 | schema.type, item, "{0}[{1}]".format(parent, idx) 42 | ) 43 | elif schema.kind == TypeKind.UNION: 44 | yield schema.kind, parent, value["value"] if value is not None else None 45 | 46 | 47 | def traverse_orc_row(schema, value, parent=""): 48 | if schema.kind < 8 or schema.kind == TypeKind.UNION: 49 | # Primitive types, no transformation. 
def get_full_path(path):
    """Resolve *path* against the project's ``deps/examples`` directory."""
    here = os.path.abspath(os.path.dirname(__file__))
    project_root = os.path.abspath(os.path.join(here, os.pardir, os.pardir))
    return os.path.join(project_root, "deps", "examples", path)
exp_res = gzip.open(get_full_path(expected), "rb") 99 | with open(get_full_path(example), "rb") as fileo: 100 | orc_res = pyorc._pyorc.reader( 101 | fileo, timezone=zoneinfo.ZoneInfo("UTC"), struct_repr=StructRepr.DICT 102 | ) 103 | length = 0 104 | for num, line in enumerate(exp_res): 105 | json_row = traverse_json_row(orc_res.schema, json.loads(line)) 106 | orc_row = traverse_orc_row(orc_res.schema, next(orc_res)) 107 | for _, exp_path, exp_val in json_row: 108 | otype, act_path, act_val = next(orc_row) 109 | assert exp_path == act_path 110 | if exp_val is None: 111 | assert act_val is None 112 | elif otype == TypeKind.BINARY: 113 | assert exp_val == [ 114 | int(i) for i in act_val 115 | ], "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 116 | elif otype == TypeKind.DOUBLE or otype == TypeKind.FLOAT: 117 | assert math.isclose( 118 | exp_val, 119 | act_val, 120 | abs_tol=0.005, # Extermely permissive float comparing. 121 | ), "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 122 | elif otype == TypeKind.TIMESTAMP: 123 | assert exp_val == act_val.strftime("%Y-%m-%d %H:%M:%S.%f").rstrip( 124 | "0" 125 | ), "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 126 | elif otype == TypeKind.DATE: 127 | assert ( 128 | exp_val == act_val.isoformat() 129 | ), "Row #{num}, Column: `{path}`".format(num=num + 1, path=act_path) 130 | elif otype == TypeKind.DECIMAL: 131 | assert exp_val == float( 132 | act_val 133 | ), "Row #{num}, Column: `{path}`".format( # Not the best comparing. 
def test_metadata_read():
    """User metadata: the empty file has none; metaData example carries two keys."""
    with open(get_full_path("TestOrcFile.emptyFile.orc"), "rb") as fobj:
        reader = pyorc._pyorc.reader(fobj, struct_repr=StructRepr.DICT)
        assert reader.user_metadata == {}
    with open(get_full_path("TestOrcFile.metaData.orc"), "rb") as fobj:
        reader = pyorc._pyorc.reader(fobj, struct_repr=StructRepr.DICT)
        metadata = reader.user_metadata
        assert metadata["clobber"] == b"\x05\x07\x0b\r\x11\x13"
        assert metadata["my.meta"] == b"\x01\x02\x03\x04\x05\x06\x07\xff\xfe\x7f\x80"


def test_format_version():
    """ORC file format version is exposed as a (major, minor) tuple."""
    for name, version in (("demo-11-zlib.orc", (0, 11)), ("demo-12-zlib.orc", (0, 12))):
        with open(get_full_path(name), "rb") as fobj:
            assert pyorc._pyorc.reader(fobj).format_version == version


def test_writer_id():
    """Files produced by the Java writer report ORC_JAVA_WRITER."""
    with open(get_full_path("demo-12-zlib.orc"), "rb") as fobj:
        assert pyorc.reader.Reader(fobj).writer_id == "ORC_JAVA_WRITER"


def test_writer_version():
    """The writer version number differs between example files."""
    for name, expected in (("demo-12-zlib.orc", 1), ("decimal.orc", 0)):
        with open(get_full_path(name), "rb") as fobj:
            assert pyorc.reader.Reader(fobj).writer_version == expected
stdout=subprocess.PIPE, 189 | ) as proc: 190 | expected_metadata = json.load(proc.stdout) 191 | with open(test_data, "rb") as fileo: 192 | res = pyorc.reader.Reader(fileo) 193 | assert str(res.schema) == expected_metadata["type"] 194 | assert len(res) == expected_metadata["rows"] 195 | assert res.num_of_stripes == expected_metadata["stripe count"] 196 | assert ( 197 | f"{res.format_version[0]}.{res.format_version[1]}" 198 | == expected_metadata["format"] 199 | ) 200 | assert res.software_version == expected_metadata["software version"] 201 | assert res.compression.name.lower() == expected_metadata["compression"] 202 | assert res.compression_block_size == expected_metadata["compression block"] 203 | assert res.row_index_stride == expected_metadata["row index stride"] 204 | assert res.user_metadata == expected_metadata["user metadata"] 205 | assert res.bytes_lengths["content_length"] == expected_metadata["content"] 206 | assert res.bytes_lengths["file_footer_length"] == expected_metadata["footer"] 207 | assert res.bytes_lengths["file_length"] == expected_metadata["file length"] 208 | assert ( 209 | res.bytes_lengths["file_postscript_length"] 210 | == expected_metadata["postscript"] 211 | ) 212 | assert ( 213 | res.bytes_lengths["stripe_statistics_length"] 214 | == expected_metadata["stripe stats"] 215 | ) 216 | assert ( 217 | res.read_stripe(0).bytes_length == expected_metadata["stripes"][0]["length"] 218 | ) 219 | assert ( 220 | res.read_stripe(0).bytes_offset == expected_metadata["stripes"][0]["offset"] 221 | ) 222 | for col, expected_attr in expected_metadata["attributes"].items(): 223 | col_type = res.schema 224 | for item in col.split("."): 225 | if item == "_elem": 226 | col_type = col_type.type 227 | elif item == "_key": 228 | col_type = col_type.key 229 | elif item == "_value": 230 | col_type = col_type.value 231 | else: 232 | col_type = col_type[item] 233 | assert col_type.attributes == expected_attr 234 | 
import io
import os
import pathlib
import platform
import sys
import shutil
import subprocess
import urllib.request
import tarfile
import logging

from setuptools import setup

from pybind11.setup_helpers import Pybind11Extension, build_ext


logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)


# C++ translation units of the native `pyorc._pyorc` extension module.
SOURCES = [
    "_pyorc.cpp",
    "Converter.cpp",
    "PyORCStream.cpp",
    "Reader.cpp",
    "SearchArgument.cpp",
    "Writer.cpp",
]

# Headers listed as dependencies so a header change triggers a rebuild.
HEADERS = [
    "Converter.h",
    "PyORCStream.h",
    "Reader.h",
    "SearchArgument.h",
    "Writer.h",
    "verguard.h",
]

# Static libraries to link against; names differ between MSVC and Unix builds.
if sys.platform.startswith("win32"):
    LIBS = [
        "orc",
        "libprotobuf",
        "libprotoc",
        "lz4",
        "zstd_static",
        "zlibstatic",
        "snappy",
    ]
else:
    LIBS = ["orc", "protobuf", "protoc", "lz4", "zstd", "z", "snappy", "pthread"]

# Allow overriding the link list with a comma-separated environment variable.
LIBS = os.getenv("PYORC_LIBRARIES", ",".join(LIBS)).split(",")

EXT_MODULES = [
    Pybind11Extension(
        "pyorc._pyorc",
        sources=[os.path.join("src", "_pyorc", src) for src in SOURCES],
        depends=[os.path.join("src", "_pyorc", hdr) for hdr in HEADERS],
        libraries=LIBS,
        include_dirs=[os.path.join("deps", "include")],
        library_dirs=[os.path.join("deps", "lib")],
    )
]


class BuildExt(build_ext):
    """
    A custom build extension for build ORC Core library and handling
    debug build on Windows.
    """

    user_options = build_ext.user_options + [
        ("orc-version=", None, "the version of the ORC C++ Core library"),
        ("output-dir=", None, "the output directory"),
        ("source-url=", None, "the HTTP url for downloading the ORC source"),
        ("download-only", None, "just download and extract the ORC source"),
        ("skip-orc-build", None, "skip building ORC C++ Core library"),
    ]

    boolean_options = build_ext.boolean_options + [
        "download-only",
        "skip-orc-build",
    ]

    def initialize_options(self) -> None:
        """Set default values for options."""
        super().initialize_options()
        self.orc_version = "2.1.0"
        self.output_dir = "deps"
        self.source_url = "https://archive.apache.org/dist/orc/"
        self.download_only = False
        self.skip_orc_build = False

    def finalize_options(self) -> None:
        # Workaround to set options with environment variables,
        # because pip fails to pass parameters to build_ext.
        if os.getenv("PYORC_DEBUG", 0):
            self.debug = True
        if os.getenv("PYORC_SKIP_ORC_BUILD", 0):
            self.skip_orc_build = True
        self.orc_version = os.getenv("PYORC_LIB_VERSION", self.orc_version)
        super().finalize_options()

    def _download_source(self) -> None:
        """Download the ORC release tarball and extract it into output_dir.

        The archive is buffered in memory before extraction; the tarball is
        always closed, even when extraction fails.
        """
        tmp_tar = io.BytesIO()
        url = "{url}orc-{ver}/orc-{ver}.tar.gz".format(
            url=self.source_url, ver=self.orc_version
        )
        with urllib.request.urlopen(url) as src:
            logging.info("Download ORC release from: %s", url)
            tmp_tar.write(src.read())
        tmp_tar.seek(0)
        logging.info("Extract archives in: %s", self.output_dir)
        # Use a context manager so the tarfile is closed on extraction errors,
        # and the PEP 706 "data" filter (Python 3.12+) to guard against
        # path-traversal entries in the downloaded archive.
        with tarfile.open(fileobj=tmp_tar, mode="r:gz") as tar_src:
            if hasattr(tarfile, "data_filter"):
                tar_src.extractall(self.output_dir, filter="data")
            else:
                tar_src.extractall(self.output_dir)

    @staticmethod
    def _get_build_envs() -> dict:
        """Return a copy of the environment with PIC flags on non-Windows."""
        env = os.environ.copy()

        if sys.platform != "win32":
            # Static ORC libs get linked into a shared extension module.
            env["CFLAGS"] = "-fPIC"
            env["CXXFLAGS"] = "-fPIC"

        return env
    def _build_with_cmake(self) -> str:
        """Configure and build the ORC C++ library with cmake.

        Returns the build directory path so _build_orc_lib can locate the
        CPack output.
        """
        build_type = "DEBUG" if self.debug else "RELEASE"

        cmake_args = [
            f"-DCMAKE_BUILD_TYPE={build_type}",
            "-DBUILD_JAVA=OFF",
            "-DBUILD_LIBHDFSPP=OFF",
            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
        ]
        if sys.platform == "win32":
            # Link the static MSVC runtime into the ORC libraries.
            cmake_args.append("-DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded")
        if not self.debug or sys.platform == "win32":
            # Skip building tools and tests.
            cmake_args.append("-DBUILD_TOOLS=OFF")
            cmake_args.append("-DBUILD_CPP_TESTS=OFF")
        env = self._get_build_envs()
        build_dir = os.path.join(
            self.output_dir, "orc-{ver}".format(ver=self.orc_version), "build"
        )
        if not os.path.exists(build_dir):
            os.makedirs(build_dir)
        logging.info("Build libraries with cmake")
        cmake_cmd = ["cmake", ".."] + cmake_args
        logging.info("Cmake command: %s" % cmake_cmd)
        subprocess.check_call(cmake_cmd, cwd=build_dir, env=env)
        if sys.platform == "win32":
            # MSVC builds go through cmake's build driver; PACKAGE runs CPack.
            subprocess.check_call(
                [
                    "cmake",
                    "--build",
                    ".",
                    "--config",
                    build_type,
                    "--target",
                    "PACKAGE",
                ],
                cwd=build_dir,
                env=env,
            )
        else:
            # Parallel make; os.cpu_count() may return None in containers.
            j_flag = f"-j{os.cpu_count() or 1}"
            subprocess.check_call(["make", j_flag, "package"], cwd=build_dir, env=env)
        return build_dir

    def _build_orc_lib(self):
        """Build the ORC library and move the packaged artifacts to output_dir."""
        logging.info("Build ORC C++ Core library")
        build_dir = self._build_with_cmake()
        plat = (
            sys.platform.title()
            if not sys.platform.startswith("win32")
            # Change platform title on Windows depending on arch (32/64bit)
            else sys.platform.title().replace("32", platform.architecture()[0][:2])
        )
        # CPack stages its TGZ payload under _CPack_Packages/<plat>/TGZ/.
        pack_dir = os.path.join(
            build_dir,
            "_CPack_Packages",
            plat,
            "TGZ",
            f"ORC-{self.orc_version}-{plat}",
        )
        logging.info(
            "Move artifacts from '%s' to the '%s' folder" % (pack_dir, self.output_dir)
        )
        try:
            shutil.move(os.path.join(pack_dir, "include"), self.output_dir)
            # Some distros package the static libs under lib64 instead of lib.
            lib_dir = (
                "lib64" if os.path.exists(os.path.join(pack_dir, "lib64")) else "lib"
            )
            shutil.move(
                os.path.join(pack_dir, lib_dir), os.path.join(self.output_dir, "lib")
            )
            if self.debug and not sys.platform.startswith("win32"):
                # Debug builds also keep the ORC tools and example files,
                # which the test suite reads from deps/.
                shutil.move(os.path.join(pack_dir, "bin"), self.output_dir)
                shutil.move(
                    os.path.join(
                        self.output_dir,
                        f"orc-{self.orc_version}",
                        "examples",
                    ),
                    self.output_dir,
                )
        except Exception as exc:
            # Best-effort: a partially-moved tree is logged, not fatal.
            logging.warning(exc)

    def get_version_macros(self):
        """Return (name, int) macro pairs for the ORC major/minor/patch version."""
        parts = self.orc_version.split(".")
        return (
            ("ORC_VERSION_MAJOR", int(parts[0])),
            ("ORC_VERSION_MINOR", int(parts[1])),
            ("ORC_VERSION_PATCH", int(parts[2])),
        )

    def build_extensions(self):
        """Download/build the ORC library as needed, then build the extension."""
        if not self.skip_orc_build:
            orc_lib = os.path.join(
                self.output_dir,
                "lib",
                "orc.lib" if sys.platform.startswith("win32") else "liborc.a",
            )
            # Only download when the extracted source tree is missing.
            if not os.path.isdir(
                os.path.join(self.output_dir, "orc-{ver}".format(ver=self.orc_version))
            ):
                self._download_source()

            if self.download_only:
                logging.info("Only downloaded the ORC library source. Skip build_ext")
                return

            # Only build when the static library is not already present.
            if not os.path.exists(orc_lib):
                self._build_orc_lib()

        if sys.platform.startswith("win32") and self.debug:
            # MSVC debug zlib carries a 'd' suffix.
            self.extensions[0].libraries = [
                lib if lib != "zlibstatic" else "zlibstaticd"
                for lib in self.extensions[0].libraries
            ]
        self.extensions[0].define_macros.extend(self.get_version_macros())
        super().build_extensions()


CURRDIR = pathlib.Path(__file__).resolve().parent
with open(CURRDIR / "README.rst") as file:
    LONG_DESC = file.read()

# Get version number from the module's __init__.py file.
# Take the value of the first `__version__ = "..."` assignment.
with open(CURRDIR / "src" / "pyorc" / "__init__.py") as src:
    VER = [
        line.split('"')[1] for line in src.readlines() if line.startswith("__version__")
    ][0]

setup(
    name="pyorc",
    version=VER,
    description="Python module for reading and writing Apache ORC file format.",
    author="noirello",
    author_email="noirello@gmail.com",
    url="https://github.com/noirello/pyorc",
    long_description=LONG_DESC,
    long_description_content_type="text/x-rst",
    license="Apache License, Version 2.0",
    ext_modules=EXT_MODULES,
    package_dir={"pyorc": "src/pyorc"},
    packages=["pyorc"],
    package_data={"pyorc": ["py.typed", "_pyorc.pyi"]},
    include_package_data=True,
    cmdclass={"build_ext": BuildExt},
    keywords=["python3", "orc", "apache-orc"],
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: System Administrators",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: C++",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
    ],
    # NOTE(review): python_requires says >=3.6 while the classifiers list only
    # 3.9-3.13; the backports.zoneinfo marker below implies <3.9 is still
    # supported - confirm which floor is intended.
    python_requires=">=3.6",
    install_requires=[
        'tzdata >= 2020.5 ; sys_platform == "win32"',
        'backports.zoneinfo >= 0.2.1 ; python_version < "3.9"',
    ],
)
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /tests/test_writer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import io 4 | import math 5 | import os 6 | from datetime import date, datetime, timezone 7 | from decimal import Decimal 8 | 9 | try: 10 | import zoneinfo as zi 11 | except ImportError: 12 | from backports import zoneinfo as zi 13 | 14 | from pyorc import ( 15 | Writer, 16 | Reader, 17 | TypeDescription, 18 | ParseError, 19 | TypeKind, 20 | StructRepr, 21 | CompressionKind, 22 | orc_version, 23 | orc_version_info, 24 | ) 25 | from pyorc.converters import ORCConverter 26 | 27 | from conftest import output_file, NullValue 28 | 29 | 30 | def test_open_file(output_file): 31 | output_file.close() 32 | with open(output_file.name, mode="wt") as fp: 33 | with pytest.raises(ParseError): 34 | _ = Writer(fp, "int") 35 | with open(output_file.name, "rb") as fp: 36 | with pytest.raises(io.UnsupportedOperation): 37 | _ = Writer(fp, "int") 38 | with open(output_file.name, mode="wb") as fp: 39 | writer = Writer(fp, "int") 40 | 
assert isinstance(writer, Writer) 41 | with pytest.raises(TypeError): 42 | _ = Writer(0, "int") 43 | 44 | 45 | def test_init(): 46 | data = io.BytesIO() 47 | with pytest.raises(TypeError): 48 | _ = Writer(data, 0) 49 | with pytest.raises(TypeError): 50 | _ = Writer(data, "int", batch_size=-1) 51 | with pytest.raises(TypeError): 52 | _ = Writer(data, "int", batch_size="fail") 53 | with pytest.raises(TypeError): 54 | _ = Writer(data, "int", batch_size=1000, stripe_size=-1) 55 | with pytest.raises(TypeError): 56 | _ = Writer(data, "int", batch_size=1000, stripe_size="fail") 57 | with pytest.raises(ValueError): 58 | _ = Writer(data, "int", batch_size=1000, stripe_size=5000, compression=-1) 59 | with pytest.raises(ValueError): 60 | _ = Writer(data, "int", batch_size=1000, stripe_size=5000, compression="wrong") 61 | with pytest.raises(ValueError): 62 | _ = Writer( 63 | data, 64 | "int", 65 | batch_size=1000, 66 | stripe_size=5000, 67 | compression=0, 68 | compression_strategy=-1, 69 | ) 70 | with pytest.raises(ValueError): 71 | _ = Writer( 72 | data, 73 | "int", 74 | batch_size=1000, 75 | stripe_size=5000, 76 | compression=0, 77 | compression_strategy="fail", 78 | ) 79 | with pytest.raises(TypeError): 80 | _ = Writer( 81 | data, 82 | "int", 83 | batch_size=1000, 84 | stripe_size=5000, 85 | compression=0, 86 | compression_strategy=0, 87 | compression_block_size=-1, 88 | ) 89 | with pytest.raises(ValueError): 90 | _ = Writer( 91 | data, 92 | "int", 93 | batch_size=1000, 94 | stripe_size=5000, 95 | compression=0, 96 | compression_strategy=0, 97 | compression_block_size=1, 98 | bloom_filter_columns=["0", 1, 3.4], 99 | ) 100 | with pytest.raises(KeyError): 101 | _ = Writer( 102 | data, 103 | "int", 104 | batch_size=1000, 105 | stripe_size=5000, 106 | compression=0, 107 | compression_strategy=0, 108 | compression_block_size=1, 109 | bloom_filter_columns=["0"], 110 | ) 111 | with pytest.raises(TypeError): 112 | _ = Writer( 113 | data, 114 | "int", 115 | batch_size=1000, 116 | 
def test_write():
    """Round-trip a few struct rows through Writer and read them back."""
    data = io.BytesIO()
    # NOTE(review): this schema string looks truncated in this copy ("struct"
    # without a field list); the records imply something like
    # struct<col0:int,col1:string,col2:double> - confirm against the original.
    writer = Writer(data, "struct")
    records = [(1, "Test A", 2.13), (2, "Test B", 0.123213), (3, "Test C", 123.011234)]
    for rec in records:
        writer.write(rec)
    writer.close()
    data.seek(0)
    reader = Reader(data)
    assert reader.read() == records


# (orc_type, value) pairs where `value` does not match the column type.
TESTDATA = [
    ("string", 0),
    ("string", b"\x10\x13"),
    ("int", "str example"),
    ("bigint", 3.14),
    ("binary", "str example"),
    ("binary", 12),
    ("float", "str example"),
    ("double", b"\x42\x32"),
    ("boolean", "str example"),
    ("timestamp", "str example"),
    ("timestamp", 102112),
    ("date", "str example"),
    ("date", 123),
    ("decimal(10,5)", "str example"),
    ("decimal(36,8)", 1024),
]


@pytest.mark.parametrize("orc_type,value", TESTDATA)
def test_write_wrong_primitive_type(orc_type, value):
    """Writing a Python value of the wrong type must raise TypeError."""
    data = io.BytesIO()
    writer = Writer(data, orc_type)
    with pytest.raises(TypeError):
        writer.write(value)
text", "Another text", None, "Onemore"]), 193 | ("binary", [b"\x10\x13\x45\x95\xa4", b"\x34\x56\x45", None, b"\44\x23\x34\xa2"]), 194 | ("int", [100, None, 1231, 1234]), 195 | ("bigint", [3123213123, 12321344, 1231238384, None]), 196 | ("float", [3.14, 2.1, None, 5.5]), 197 | ("double", [3.14159265359, None, 4.12345678, 4.863723423]), 198 | ("boolean", [None, False, True, False]), 199 | ( 200 | "timestamp", 201 | [ 202 | datetime(2019, 4, 19, 12, 58, 59, tzinfo=timezone.utc), 203 | datetime(1914, 6, 28, 10, 45, 0, tzinfo=timezone.utc), 204 | None, 205 | datetime(2001, 3, 12, 10, 45, 21, 12, tzinfo=timezone.utc), 206 | ], 207 | ), 208 | ("date", [date(1909, 12, 8), None, date(2038, 10, 11), date(2019, 11, 11)]), 209 | ( 210 | "decimal(10,7)", 211 | [None, Decimal("0.999999"), Decimal("123.4567890"), Decimal("99.1780000")], 212 | ), 213 | ( 214 | "decimal(38,6)", 215 | [Decimal("999989898.1234"), Decimal("1.245678e24"), None, Decimal("1.2145e28")], 216 | ), 217 | ] 218 | 219 | 220 | @pytest.mark.parametrize("orc_type,values", TESTDATA) 221 | def test_write_primitive_type(orc_type, values): 222 | data = io.BytesIO() 223 | writer = Writer(data, orc_type) 224 | for rec in values: 225 | writer.write(rec) 226 | writer.close() 227 | 228 | data.seek(0) 229 | reader = Reader(data) 230 | if orc_type == "float": 231 | result = reader.read() 232 | assert len(result) == len(values) 233 | for res, exp in zip(result, values): 234 | if exp is None: 235 | assert res is None 236 | else: 237 | assert math.isclose(res, exp, rel_tol=1e-07, abs_tol=0.0) 238 | else: 239 | assert reader.read() == values 240 | 241 | 242 | TESTDATA = [ 243 | ("map", "string"), 244 | ("map", False), 245 | ("map", ["a", "b", "c"]), 246 | ("map", {"0": 0, "1": 1}), 247 | ("array", 0), 248 | ("array", [False, True, False]), 249 | ("array", "false"), 250 | ("uniontype", "string"), 251 | ("uniontype", 2.4), 252 | ("uniontype", [0, 2]), 253 | ("struct", "string"), 254 | ("struct", 0), 255 | ("struct", [0, 1, 2]), 
256 | ("struct", (0,)), 257 | ("struct", {"col0": "a", "col1": 0}), 258 | ] 259 | 260 | 261 | @pytest.mark.parametrize("orc_type,value", TESTDATA) 262 | def test_write_wrong_complex_type(orc_type, value): 263 | data = io.BytesIO() 264 | writer = Writer(data, orc_type) 265 | with pytest.raises( 266 | (TypeError, ValueError) 267 | ): # Dict construction might raise ValueError as well. 268 | writer.write(value) 269 | 270 | 271 | TESTDATA = [ 272 | ( 273 | "map", 274 | [{"a": "b", "c": "d"}, {"e": "f", "g": "h", "i": "j"}, None, {"k": "l"}], 275 | ), 276 | ( 277 | "map", 278 | [ 279 | {"zero": 0, "one": 1}, 280 | None, 281 | {"two": 2, "tree": 3}, 282 | {"one": 1, "two": 2, "nill": None}, 283 | ], 284 | ), 285 | ("array", [[0, 1, 2, 3], [4, 5, 6, 7, 8], None, [9, 10, 11, 12]]), 286 | ( 287 | "array", 288 | [ 289 | ["First text", "Second text", "Third text", None], 290 | None, 291 | ["Fourth text", "Fifth text", "Sixth text"], 292 | ["Seventh text", "Last text"], 293 | ], 294 | ), 295 | ("uniontype", ["string", 1, "text", 2, None]), 296 | ( 297 | "struct", 298 | [ 299 | {"col0": 0, "col1": "String"}, 300 | {"col0": 1, "col1": "String 2"}, 301 | None, 302 | {"col0": 2, "col1": None}, 303 | ], 304 | ), 305 | ] 306 | 307 | 308 | @pytest.mark.parametrize("orc_type,values", TESTDATA) 309 | def test_write_complex_type(orc_type, values): 310 | data = io.BytesIO() 311 | writer = Writer(data, orc_type, struct_repr=StructRepr.DICT) 312 | for rec in values: 313 | writer.write(rec) 314 | writer.close() 315 | 316 | data.seek(0) 317 | reader = Reader(data, struct_repr=StructRepr.DICT) 318 | assert reader.read() == values 319 | 320 | 321 | TESTDATA = [ 322 | ("int", 42), 323 | ("bigint", 560000000000001), 324 | ("float", 3.14), 325 | ("double", math.e), 326 | ("string", "test"), 327 | ("binary", b"\x23\x45\x45"), 328 | ("varchar(4)", "four"), 329 | ("timestamp", datetime(2019, 11, 10, 12, 59, 59, 100, tzinfo=timezone.utc)), 330 | ("date", date(2010, 9, 1)), 331 | ("decimal(10,0)", 
Decimal("1000000000")), 332 | ("array", [0, 1, 2, 3]), 333 | ("map", {"test": "example"}), 334 | ("struct", (0, "test")), 335 | ] 336 | 337 | 338 | @pytest.mark.parametrize("orc_type,value", TESTDATA) 339 | def test_write_nones(orc_type, value): 340 | data = io.BytesIO() 341 | writer = Writer(data, orc_type, batch_size=20) 342 | for _ in range(100): 343 | writer.write(value) 344 | for _ in range(100): 345 | writer.write(None) 346 | writer.close() 347 | 348 | data.seek(0) 349 | reader = Reader(data, batch_size=30) 350 | non_nones = reader.read(100) 351 | nones = reader.read(100) 352 | assert len(reader) == 200 353 | if orc_type in ("float", "double"): 354 | assert math.isclose(non_nones[0], value, rel_tol=1e-07, abs_tol=0.0) 355 | assert math.isclose(non_nones[-1], value, rel_tol=1e-07, abs_tol=0.0) 356 | else: 357 | assert non_nones[0] == value 358 | assert non_nones[-1] == value 359 | assert all(row is not None for row in non_nones) 360 | assert all(row is None for row in nones) 361 | 362 | 363 | def test_context_manager(): 364 | data = io.BytesIO() 365 | records = [ 366 | {"col0": 1, "col1": "Test A", "col2": 2.13}, 367 | {"col0": 2, "col1": "Test B", "col2": 0.123213}, 368 | {"col0": 3, "col1": "Test C", "col2": 123.011234}, 369 | ] 370 | with Writer( 371 | data, "struct", struct_repr=StructRepr.DICT 372 | ) as writer: 373 | for rec in records: 374 | writer.write(rec) 375 | data.seek(0) 376 | reader = Reader(data, struct_repr=StructRepr.DICT) 377 | assert reader.read() == records 378 | 379 | 380 | def test_current_row(): 381 | data = io.BytesIO() 382 | writer = Writer(data, "struct") 383 | assert writer.current_row == 0 384 | writer.write((0, "Test A", 0.0001)) 385 | assert writer.current_row == 1 386 | for i in range(10): 387 | writer.write((i, "Test A", 0.0001)) 388 | assert writer.current_row == 11 389 | writer.close() 390 | data.seek(0) 391 | reader = Reader(data) 392 | assert writer.current_row == len(reader) 393 | 394 | 395 | def test_schema(): 396 | 
schema_str = "struct" 397 | data = io.BytesIO() 398 | writer = Writer(data, schema_str) 399 | 400 | assert str(writer.schema) == schema_str 401 | with pytest.raises(AttributeError): 402 | writer.schema = "fail" 403 | with pytest.raises(AttributeError): 404 | del writer.schema 405 | 406 | schema = writer.schema 407 | del writer 408 | assert isinstance(schema, TypeDescription) 409 | assert schema.kind == TypeKind.STRUCT 410 | 411 | 412 | def test_writerows(): 413 | data = io.BytesIO() 414 | writer = Writer(data, "int") 415 | res = writer.writerows([]) 416 | assert res == 0 417 | rows = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) 418 | res = writer.writerows(rows) 419 | writer.close() 420 | assert res == len(rows) 421 | 422 | data.seek(0) 423 | reader = Reader(data) 424 | assert list(rows) == reader.read() 425 | 426 | 427 | def test_struct_repr(): 428 | data = io.BytesIO() 429 | writer = Writer(data, "struct") 430 | with pytest.raises(TypeError): 431 | writer.write({"a": 1}) 432 | writer = Writer(data, "struct", struct_repr=StructRepr.DICT) 433 | with pytest.raises(TypeError): 434 | writer.write((1,)) 435 | with pytest.raises(TypeError): 436 | writer.write({"a": "b"}) 437 | 438 | 439 | class TestConverter(ORCConverter): 440 | @staticmethod 441 | def to_orc(obj, timezone): 442 | seconds, nanoseconds = obj 443 | return (seconds, nanoseconds) 444 | 445 | @staticmethod 446 | def from_orc(seconds, nanoseconds, timezone): 447 | pass 448 | 449 | 450 | def test_converter(): 451 | data = io.BytesIO() 452 | seconds = 1500000 453 | nanoseconds = 101000 454 | exp_date = date(2000, 1, 1) 455 | record = ((seconds, nanoseconds), exp_date) 456 | with Writer( 457 | data, 458 | "struct", 459 | converters={TypeKind.TIMESTAMP: TestConverter}, 460 | ) as writer: 461 | writer.write(record) 462 | 463 | data.seek(0) 464 | reader = Reader(data) 465 | assert next(reader) == ( 466 | datetime.fromtimestamp(seconds, timezone.utc).replace( 467 | microsecond=nanoseconds // 1000 468 | ), 469 | exp_date, 470 | ) 
471 | 472 | 473 | def test_user_metadata(): 474 | random_val = os.urandom(64) 475 | data = io.BytesIO() 476 | with Writer(data, "int") as writer: 477 | writer.set_user_metadata( 478 | test="test1".encode("UTF-8"), meta=b"\x30\x40\x50\x60", val=random_val 479 | ) 480 | writer.set_user_metadata(test="test2".encode("UTF-8")) 481 | with pytest.raises(TypeError): 482 | writer.set_user_metadata(meta="string") 483 | reader = Reader(data) 484 | assert len(reader) == 0 485 | assert reader.user_metadata == { 486 | "test": "test2".encode("UTF-8"), 487 | "meta": b"\x30\x40\x50\x60", 488 | "val": random_val, 489 | } 490 | 491 | 492 | @pytest.mark.parametrize( 493 | "kind", (CompressionKind.NONE, CompressionKind.ZLIB, CompressionKind.ZSTD) 494 | ) 495 | def test_compression(kind): 496 | data = io.BytesIO() 497 | with Writer(data, "struct", compression=kind) as writer: 498 | writer.writerows((num, "ABCDEFG", 0.12) for num in range(50000)) 499 | data.seek(0) 500 | reader = Reader(data) 501 | assert reader.compression == kind 502 | for idx, row in enumerate(reader): 503 | assert row == (idx, "ABCDEFG", 0.12) 504 | 505 | 506 | @pytest.mark.parametrize( 507 | "schema,attrs", 508 | ( 509 | (TypeDescription.from_string("int"), {"a": "1", "b": "2"}), 510 | (TypeDescription.from_string("struct"), {"test": "attribute"}), 511 | ), 512 | ) 513 | def test_attributes(schema, attrs): 514 | data = io.BytesIO() 515 | schema.set_attributes(attrs) 516 | writer = Writer(data, schema) 517 | writer.close() 518 | reader = Reader(data) 519 | assert len(reader) == 0 520 | assert reader.schema.attributes == attrs 521 | 522 | 523 | @pytest.mark.parametrize( 524 | "schema,writer_tz,reader_tz,input,expected", 525 | [ 526 | ( 527 | "struct", 528 | zi.ZoneInfo("UTC"), 529 | zi.ZoneInfo("UTC"), 530 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 531 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 532 | ), 533 | ( 534 | "struct", 535 | zi.ZoneInfo("Asia/Tokyo"), 536 | 
zi.ZoneInfo("UTC"), 537 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("Asia/Tokyo")), 538 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 539 | ), 540 | ( 541 | "struct", 542 | zi.ZoneInfo("America/Los_Angeles"), 543 | zi.ZoneInfo("America/New_York"), 544 | datetime(2014, 12, 12, 6, 0, 0, tzinfo=zi.ZoneInfo("America/Los_Angeles")), 545 | datetime(2014, 12, 12, 6, 0, 0, tzinfo=zi.ZoneInfo("America/New_York")), 546 | ), 547 | ( 548 | "struct", 549 | zi.ZoneInfo("America/Los_Angeles"), 550 | zi.ZoneInfo("America/New_York"), 551 | datetime(2014, 12, 12, 6, 0, 0, tzinfo=zi.ZoneInfo("America/Los_Angeles")), 552 | datetime(2014, 12, 12, 9, 0, 0, tzinfo=zi.ZoneInfo("America/New_York")), 553 | ), 554 | ( 555 | "struct", 556 | zi.ZoneInfo("UTC"), 557 | zi.ZoneInfo("UTC"), 558 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 559 | datetime(2021, 10, 10, 12, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 560 | ), 561 | ( 562 | "struct", 563 | zi.ZoneInfo("Asia/Tokyo"), 564 | zi.ZoneInfo("UTC"), 565 | datetime(2021, 10, 10, 3, 0, 0, tzinfo=zi.ZoneInfo("Asia/Tokyo")), 566 | datetime(2021, 10, 9, 18, 0, 0, tzinfo=zi.ZoneInfo("UTC")), 567 | ), 568 | ( 569 | "struct", 570 | zi.ZoneInfo("Europe/Berlin"), 571 | zi.ZoneInfo("Europe/London"), 572 | datetime(2021, 10, 31, 3, 0, 0, tzinfo=zi.ZoneInfo("Europe/Berlin")), 573 | datetime(2021, 10, 31, 2, 0, 0, tzinfo=zi.ZoneInfo("Europe/London")), 574 | ), 575 | ], 576 | ) 577 | def test_timestamp_with_timezones(schema, writer_tz, reader_tz, input, expected): 578 | data = io.BytesIO() 579 | with Writer(data, schema, timezone=writer_tz) as writer: 580 | writer.write((input,)) 581 | reader = Reader(data, timezone=reader_tz) 582 | output = next(reader)[0] 583 | assert output == expected 584 | 585 | 586 | TESTDATA = [ 587 | ("int", 42), 588 | ("bigint", 560000000000001), 589 | ("float", 3.14), 590 | ("double", math.e), 591 | ("string", "test"), 592 | ("binary", b"\x23\x45\x45"), 593 | ("varchar(4)", "four"), 594 | 
("timestamp", datetime(2019, 11, 10, 12, 59, 59, 100, tzinfo=timezone.utc)), 595 | ("date", date(2010, 9, 1)), 596 | ("decimal(10,0)", Decimal("1000000000")), 597 | ("array", [0, 1, 2, 3]), 598 | ("map", {"test": "example"}), 599 | ("struct", (0, "test")), 600 | ] 601 | 602 | 603 | @pytest.mark.parametrize("orc_type,value", TESTDATA) 604 | def test_write_custom_null_value(orc_type, value): 605 | data = io.BytesIO() 606 | with Writer(data, orc_type, null_value=NullValue()) as writer: 607 | writer.write(value) 608 | writer.write(NullValue()) 609 | reader = Reader(data) 610 | if orc_type in ("float", "double"): 611 | assert math.isclose(next(reader), value, rel_tol=1e-07, abs_tol=0.0) 612 | else: 613 | assert next(reader) == value 614 | assert next(reader) is None 615 | 616 | 617 | @pytest.mark.skipif( 618 | orc_version_info.major <= 1 and orc_version_info.minor < 9, 619 | reason=f"write_intermediate_footer is unsupported for {orc_version}", 620 | ) 621 | def test_write_intermediate_footer(): 622 | data = io.BytesIO() 623 | writer = Writer( 624 | data, 625 | "int", 626 | stripe_size=1024, 627 | compression_block_size=1024, 628 | memory_block_size=512, 629 | ) 630 | writer.writerows(range(65536)) 631 | with pytest.raises(ParseError): 632 | _ = Reader(data) 633 | offset = writer.write_intermediate_footer() 634 | assert isinstance(offset, int) 635 | assert offset > 0 636 | reader = Reader(data) 637 | assert reader.bytes_lengths["file_length"] == offset 638 | assert len(reader) == 65536 639 | assert reader.read()[-1] == 65535 640 | data.seek(offset) 641 | writer.close() 642 | reader = Reader(data) 643 | assert len(reader) == 65536 644 | assert reader.bytes_lengths["file_length"] >= offset 645 | -------------------------------------------------------------------------------- /src/_pyorc/Reader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "PyORCStream.h" 4 | #include "Reader.h" 5 | #include 
"SearchArgument.h" 6 | 7 | using namespace py::literals; 8 | 9 | py::dict 10 | createAttributeDict(const orc::Type& orcType) 11 | { 12 | py::dict result; 13 | for (std::string key : orcType.getAttributeKeys()) { 14 | result[key.c_str()] = py::str(orcType.getAttributeValue(key).c_str()); 15 | } 16 | return result; 17 | } 18 | 19 | py::object 20 | createTypeDescription(const orc::Type& orcType) 21 | { 22 | py::object typeModule = py::module::import("pyorc.typedescription"); 23 | int kind = static_cast(orcType.getKind()); 24 | py::object attrDict = createAttributeDict(orcType); 25 | switch (kind) { 26 | case orc::BOOLEAN: { 27 | py::object typeDesc = typeModule.attr("Boolean")(); 28 | typeDesc.attr("set_attributes")(attrDict); 29 | return typeDesc; 30 | } 31 | case orc::BYTE: { 32 | py::object typeDesc = typeModule.attr("TinyInt")(); 33 | typeDesc.attr("set_attributes")(attrDict); 34 | return typeDesc; 35 | } 36 | case orc::SHORT: { 37 | py::object typeDesc = typeModule.attr("SmallInt")(); 38 | typeDesc.attr("set_attributes")(attrDict); 39 | return typeDesc; 40 | } 41 | case orc::INT: { 42 | py::object typeDesc = typeModule.attr("Int")(); 43 | typeDesc.attr("set_attributes")(attrDict); 44 | return typeDesc; 45 | } 46 | case orc::LONG: { 47 | py::object typeDesc = typeModule.attr("BigInt")(); 48 | typeDesc.attr("set_attributes")(attrDict); 49 | return typeDesc; 50 | } 51 | case orc::FLOAT: { 52 | py::object typeDesc = typeModule.attr("Float")(); 53 | typeDesc.attr("set_attributes")(attrDict); 54 | return typeDesc; 55 | } 56 | case orc::DOUBLE: { 57 | py::object typeDesc = typeModule.attr("Double")(); 58 | typeDesc.attr("set_attributes")(attrDict); 59 | return typeDesc; 60 | } 61 | case orc::STRING: { 62 | py::object typeDesc = typeModule.attr("String")(); 63 | typeDesc.attr("set_attributes")(attrDict); 64 | return typeDesc; 65 | } 66 | case orc::BINARY: { 67 | py::object typeDesc = typeModule.attr("Binary")(); 68 | typeDesc.attr("set_attributes")(attrDict); 69 | return 
typeDesc; 70 | } 71 | case orc::TIMESTAMP: { 72 | py::object typeDesc = typeModule.attr("Timestamp")(); 73 | typeDesc.attr("set_attributes")(attrDict); 74 | return typeDesc; 75 | } 76 | case orc::TIMESTAMP_INSTANT: { 77 | py::object typeDesc = typeModule.attr("TimestampInstant")(); 78 | typeDesc.attr("set_attributes")(attrDict); 79 | return typeDesc; 80 | } 81 | case orc::DATE: { 82 | py::object typeDesc = typeModule.attr("Date")(); 83 | typeDesc.attr("set_attributes")(attrDict); 84 | return typeDesc; 85 | } 86 | case orc::CHAR: { 87 | py::object typeDesc = 88 | typeModule.attr("Char")(py::cast(orcType.getMaximumLength())); 89 | typeDesc.attr("set_attributes")(attrDict); 90 | return typeDesc; 91 | } 92 | case orc::VARCHAR: { 93 | py::object typeDesc = 94 | typeModule.attr("VarChar")(py::cast(orcType.getMaximumLength())); 95 | typeDesc.attr("set_attributes")(attrDict); 96 | return typeDesc; 97 | } 98 | case orc::DECIMAL: { 99 | py::object typeDesc = typeModule.attr("Decimal")( 100 | "precision"_a = py::cast(orcType.getPrecision()), 101 | "scale"_a = py::cast(orcType.getScale())); 102 | typeDesc.attr("set_attributes")(attrDict); 103 | return typeDesc; 104 | } 105 | case orc::LIST: { 106 | py::object typeDesc = 107 | typeModule.attr("Array")(createTypeDescription(*orcType.getSubtype(0))); 108 | typeDesc.attr("set_attributes")(attrDict); 109 | return typeDesc; 110 | } 111 | case orc::MAP: { 112 | py::object typeDesc = typeModule.attr("Map")( 113 | "key"_a = createTypeDescription(*orcType.getSubtype(0)), 114 | "value"_a = createTypeDescription(*orcType.getSubtype(1))); 115 | typeDesc.attr("set_attributes")(attrDict); 116 | return typeDesc; 117 | } 118 | case orc::UNION: { 119 | py::tuple args(orcType.getSubtypeCount()); 120 | for (size_t i = 0; i < orcType.getSubtypeCount(); ++i) { 121 | args[i] = createTypeDescription(*orcType.getSubtype(i)); 122 | } 123 | py::object typeDesc = typeModule.attr("Union")(*args); 124 | typeDesc.attr("set_attributes")(attrDict); 125 | 
return typeDesc; 126 | } 127 | case orc::STRUCT: { 128 | py::dict fields; 129 | for (size_t i = 0; i < orcType.getSubtypeCount(); ++i) { 130 | auto key = orcType.getFieldName(i); 131 | fields[key.c_str()] = createTypeDescription(*orcType.getSubtype(i)); 132 | } 133 | py::object typeDesc = typeModule.attr("Struct")(**fields); 134 | typeDesc.attr("set_attributes")(attrDict); 135 | return typeDesc; 136 | } 137 | default: 138 | throw py::type_error("Invalid TypeKind"); 139 | } 140 | } 141 | 142 | py::object 143 | ORCFileLikeObject::next() 144 | { 145 | while (true) { 146 | if (batchItem == 0) { 147 | if (!rowReader->next(*batch)) { 148 | throw py::stop_iteration(); 149 | } 150 | converter->reset(*batch); 151 | } 152 | if (batchItem < batch->numElements) { 153 | py::object val = converter->toPython(batchItem); 154 | ++batchItem; 155 | ++currentRow; 156 | return val; 157 | } else { 158 | batchItem = 0; 159 | } 160 | } 161 | } 162 | 163 | py::list 164 | ORCFileLikeObject::read(int64_t num) 165 | { 166 | int64_t i = 0; 167 | py::list res; 168 | if (num < -1) { 169 | throw py::value_error("Read length must be positive or -1"); 170 | } 171 | try { 172 | while (true) { 173 | if (num != -1 && i == num) { 174 | return res; 175 | } 176 | res.append(this->next()); 177 | ++i; 178 | } 179 | } catch (py::stop_iteration&) { 180 | return res; 181 | } 182 | } 183 | 184 | uint64_t 185 | ORCFileLikeObject::seek(int64_t row, uint16_t whence) 186 | { 187 | uint64_t start = 0; 188 | switch (whence) { 189 | case 0: 190 | start = firstRowOfStripe; 191 | if (row < 0) { 192 | throw py::value_error("Invalid value for row"); 193 | } 194 | break; 195 | case 1: 196 | start = currentRow + firstRowOfStripe; 197 | break; 198 | case 2: 199 | start = this->len() + firstRowOfStripe; 200 | break; 201 | default: 202 | throw py::value_error("Invalid value for whence"); 203 | break; 204 | } 205 | rowReader->seekToRow(start + row); 206 | batchItem = 0; 207 | currentRow = rowReader->getRowNumber() - 
firstRowOfStripe; 208 | return currentRow; 209 | } 210 | 211 | const orc::Type* 212 | ORCFileLikeObject::findColumnType(const orc::Type* type, uint64_t columnIndex) const 213 | { 214 | if (type->getColumnId() == columnIndex) { 215 | return type; 216 | } else { 217 | for (size_t i = 0; i < type->getSubtypeCount(); ++i) { 218 | auto* subtype = type->getSubtype(i); 219 | if (subtype->getColumnId() <= columnIndex && 220 | subtype->getMaximumColumnId() >= columnIndex) { 221 | return ORCFileLikeObject::findColumnType(subtype, columnIndex); 222 | } 223 | } 224 | throw py::index_error("column not found"); 225 | } 226 | } 227 | 228 | py::object 229 | ORCFileLikeObject::convertTimestampMillis(int64_t millisec) const 230 | { 231 | py::object idx(py::int_(static_cast(orc::TIMESTAMP))); 232 | py::object from_orc = convDict[idx].attr("from_orc"); 233 | int64_t seconds = millisec / 1000; 234 | int64_t nanosecs = std::abs(millisec % 1000) * 1000 * 1000; 235 | return from_orc(seconds, nanosecs, timezoneInfo); 236 | } 237 | 238 | py::dict 239 | ORCFileLikeObject::buildStatistics(const orc::Type* type, 240 | const orc::ColumnStatistics* stats) const 241 | { 242 | py::dict result; 243 | int64_t typeKind = static_cast(type->getKind()); 244 | result["kind"] = typeKind; 245 | result["has_null"] = py::cast(stats->hasNull()); 246 | result["number_of_values"] = py::cast(stats->getNumberOfValues()); 247 | switch (typeKind) { 248 | case orc::BOOLEAN: { 249 | auto* boolStat = dynamic_cast(stats); 250 | if (boolStat->hasCount()) { 251 | result["false_count"] = py::cast(boolStat->getFalseCount()); 252 | result["true_count"] = py::cast(boolStat->getTrueCount()); 253 | } 254 | return result; 255 | } 256 | case orc::BYTE: 257 | case orc::INT: 258 | case orc::LONG: 259 | case orc::SHORT: { 260 | auto* intStat = dynamic_cast(stats); 261 | if (intStat->hasMinimum()) { 262 | result["minimum"] = py::cast(intStat->getMinimum()); 263 | } 264 | if (intStat->hasMaximum()) { 265 | result["maximum"] = 
py::cast(intStat->getMaximum()); 266 | } 267 | if (intStat->hasSum()) { 268 | result["sum"] = py::cast(intStat->getSum()); 269 | } 270 | return result; 271 | } 272 | case orc::STRUCT: 273 | case orc::MAP: 274 | case orc::LIST: 275 | case orc::UNION: 276 | return result; 277 | case orc::FLOAT: 278 | case orc::DOUBLE: { 279 | auto* doubleStat = dynamic_cast(stats); 280 | if (doubleStat->hasMinimum()) { 281 | result["minimum"] = py::cast(doubleStat->getMinimum()); 282 | } 283 | if (doubleStat->hasMaximum()) { 284 | result["maximum"] = py::cast(doubleStat->getMaximum()); 285 | } 286 | if (doubleStat->hasSum()) { 287 | result["sum"] = py::cast(doubleStat->getSum()); 288 | } 289 | return result; 290 | } 291 | case orc::BINARY: { 292 | auto* binaryStat = dynamic_cast(stats); 293 | if (binaryStat->hasTotalLength()) { 294 | result["total_length"] = py::cast(binaryStat->getTotalLength()); 295 | } 296 | return result; 297 | } 298 | case orc::STRING: 299 | case orc::CHAR: 300 | case orc::VARCHAR: { 301 | auto* strStat = dynamic_cast(stats); 302 | if (strStat->hasMinimum()) { 303 | result["minimum"] = py::cast(strStat->getMinimum()); 304 | } 305 | if (strStat->hasMaximum()) { 306 | result["maximum"] = py::cast(strStat->getMaximum()); 307 | } 308 | if (strStat->hasTotalLength()) { 309 | result["total_length"] = py::cast(strStat->getTotalLength()); 310 | } 311 | return result; 312 | } 313 | case orc::DATE: { 314 | auto* dateStat = dynamic_cast(stats); 315 | py::object idx(py::int_(static_cast(orc::DATE))); 316 | py::object from_orc = convDict[idx].attr("from_orc"); 317 | if (dateStat->hasMinimum()) { 318 | result["minimum"] = from_orc(dateStat->getMinimum()); 319 | } 320 | if (dateStat->hasMaximum()) { 321 | result["maximum"] = from_orc(dateStat->getMaximum()); 322 | } 323 | return result; 324 | } 325 | case orc::TIMESTAMP: 326 | case orc::TIMESTAMP_INSTANT: { 327 | auto* timeStat = dynamic_cast(stats); 328 | if (timeStat->hasMinimum()) { 329 | result["minimum"] = 
convertTimestampMillis(timeStat->getMinimum()); 330 | } 331 | if (timeStat->hasMaximum()) { 332 | result["maximum"] = convertTimestampMillis(timeStat->getMaximum()); 333 | } 334 | if (timeStat->hasLowerBound()) { 335 | result["lower_bound"] = 336 | convertTimestampMillis(timeStat->getLowerBound()); 337 | } 338 | if (timeStat->hasUpperBound()) { 339 | result["upper_bound"] = 340 | convertTimestampMillis(timeStat->getUpperBound()); 341 | } 342 | return result; 343 | } 344 | case orc::DECIMAL: { 345 | auto* decStat = dynamic_cast(stats); 346 | py::object idx(py::int_(static_cast(orc::DECIMAL))); 347 | py::object from_orc = convDict[idx].attr("from_orc"); 348 | if (decStat->hasMinimum()) { 349 | result["minimum"] = from_orc(decStat->getMinimum().toString()); 350 | } 351 | if (decStat->hasMaximum()) { 352 | result["maximum"] = from_orc(decStat->getMaximum().toString()); 353 | } 354 | if (decStat->hasSum()) { 355 | result["sum"] = from_orc(decStat->getSum().toString()); 356 | } 357 | return result; 358 | } 359 | default: 360 | return result; 361 | } 362 | } 363 | 364 | Reader::Reader(py::object fileo, 365 | uint64_t batch_size, 366 | std::list col_indices, 367 | std::list col_names, 368 | py::object tzone, 369 | unsigned int struct_repr, 370 | py::object conv, 371 | py::object predicate, 372 | py::object null_value) 373 | { 374 | orc::ReaderOptions readerOpts; 375 | batchItem = 0; 376 | currentRow = 0; 377 | firstRowOfStripe = 0; 378 | structKind = struct_repr; 379 | nullValue = null_value; 380 | if (!col_indices.empty() && !col_names.empty()) { 381 | throw py::value_error( 382 | "Either col_indices or col_names can be set to select columns"); 383 | } 384 | if (!col_indices.empty()) { 385 | rowReaderOpts = rowReaderOpts.include(col_indices); 386 | } 387 | if (!col_names.empty()) { 388 | rowReaderOpts = rowReaderOpts.include(col_names); 389 | } 390 | if (!tzone.is_none()) { 391 | std::string tzKey = py::cast(tzone.attr("key")); 392 | rowReaderOpts = 
rowReaderOpts.setTimezoneName(tzKey); 393 | } 394 | timezoneInfo = tzone; 395 | if (conv.is_none()) { 396 | py::dict defaultConv = 397 | py::module::import("pyorc.converters").attr("DEFAULT_CONVERTERS"); 398 | convDict = py::dict(defaultConv); 399 | } else { 400 | convDict = conv; 401 | } 402 | if (!predicate.is_none()) { 403 | rowReaderOpts = rowReaderOpts.searchArgument( 404 | std::move(createSearchArgument(predicate, convDict, timezoneInfo))); 405 | } 406 | reader = orc::createReader( 407 | std::unique_ptr(new PyORCInputStream(fileo)), readerOpts); 408 | try { 409 | batchSize = batch_size; 410 | rowReader = reader->createRowReader(rowReaderOpts); 411 | batch = rowReader->createRowBatch(batchSize); 412 | converter = createConverter( 413 | &rowReader->getSelectedType(), structKind, convDict, timezoneInfo, nullValue); 414 | } catch (orc::ParseError& err) { 415 | throw py::value_error(err.what()); 416 | } 417 | } 418 | 419 | py::dict 420 | Reader::bytesLengths() const 421 | { 422 | py::dict res; 423 | res["content_length"] = reader->getContentLength(); 424 | res["file_footer_length"] = reader->getFileFooterLength(); 425 | res["file_postscript_length"] = reader->getFilePostscriptLength(); 426 | res["file_length"] = reader->getFileLength(); 427 | res["stripe_statistics_length"] = reader->getStripeStatisticsLength(); 428 | return res; 429 | } 430 | 431 | uint64_t 432 | Reader::compression() const 433 | { 434 | return static_cast(reader->getCompression()); 435 | } 436 | 437 | uint64_t 438 | Reader::compressionBlockSize() const 439 | { 440 | return reader->getCompressionSize(); 441 | } 442 | 443 | uint64_t 444 | Reader::rowIndexStride() const 445 | { 446 | return reader->getRowIndexStride(); 447 | } 448 | 449 | py::tuple 450 | Reader::formatVersion() const 451 | { 452 | py::tuple res(2); 453 | orc::FileVersion ver = reader->getFormatVersion(); 454 | res[0] = py::cast(ver.getMajor()); 455 | res[1] = py::cast(ver.getMinor()); 456 | return res; 457 | } 458 | 459 | uint64_t 
460 | Reader::len() const 461 | { 462 | return reader->getNumberOfRows(); 463 | } 464 | 465 | uint64_t 466 | Reader::numberOfStripes() const 467 | { 468 | return reader->getNumberOfStripes(); 469 | } 470 | 471 | uint32_t 472 | Reader::writerId() const 473 | { 474 | return reader->getWriterIdValue(); 475 | } 476 | 477 | uint32_t 478 | Reader::writerVersion() const 479 | { 480 | return reader->getWriterVersion(); 481 | } 482 | 483 | std::string 484 | Reader::softwareVersion() const 485 | { 486 | return reader->getSoftwareVersion(); 487 | } 488 | 489 | std::unique_ptr 490 | Reader::readStripe(uint64_t idx) 491 | { 492 | if (idx >= reader->getNumberOfStripes()) { 493 | throw py::index_error("stripe index out of range"); 494 | } 495 | return std::unique_ptr(new Stripe(*this, idx, reader->getStripe(idx))); 496 | } 497 | 498 | py::object 499 | Reader::schema() 500 | { 501 | return createTypeDescription(reader->getType()); 502 | } 503 | 504 | py::object 505 | Reader::selectedSchema() 506 | { 507 | return createTypeDescription(rowReader->getSelectedType()); 508 | } 509 | 510 | py::tuple 511 | Reader::statistics(uint64_t columnIndex) 512 | { 513 | try { 514 | py::tuple result = py::tuple(1); 515 | std::unique_ptr stats = 516 | reader->getColumnStatistics(columnIndex); 517 | result[0] = this->buildStatistics( 518 | this->findColumnType(&rowReader->getSelectedType(), columnIndex), 519 | stats.get()); 520 | return result; 521 | } catch (std::logic_error& err) { 522 | throw py::index_error(err.what()); 523 | } 524 | } 525 | 526 | py::dict 527 | Reader::userMetadata() 528 | { 529 | py::dict result; 530 | for (std::string key : reader->getMetadataKeys()) { 531 | result[key.c_str()] = py::bytes(reader->getMetadataValue(key)); 532 | } 533 | return result; 534 | } 535 | 536 | Stripe::Stripe(const Reader& reader_, 537 | uint64_t idx, 538 | std::unique_ptr stripe) 539 | : reader(reader_) 540 | { 541 | batchItem = 0; 542 | currentRow = 0; 543 | stripeIndex = idx; 544 | stripeInfo = 
std::move(stripe); 545 | convDict = reader.getConverterDict(); 546 | timezoneInfo = reader.getTimeZoneInfo(); 547 | rowReaderOpts = reader.getRowReaderOptions(); 548 | rowReaderOpts = 549 | rowReaderOpts.range(stripeInfo->getOffset(), stripeInfo->getLength()); 550 | rowReader = reader.getORCReader().createRowReader(rowReaderOpts); 551 | batch = rowReader->createRowBatch(reader.getBatchSize()); 552 | converter = createConverter(&rowReader->getSelectedType(), 553 | reader.getStructKind(), 554 | convDict, 555 | timezoneInfo, 556 | reader.getNullValue()); 557 | firstRowOfStripe = rowReader->getRowNumber() + 1; 558 | } 559 | 560 | py::tuple 561 | Stripe::bloomFilterColumns() 562 | { 563 | int64_t idx = 0; 564 | std::set empty = {}; 565 | std::map bfCols = 566 | reader.getORCReader().getBloomFilters(stripeIndex, empty); 567 | py::tuple result(bfCols.size()); 568 | for (auto const& col : bfCols) { 569 | result[idx] = py::cast(col.first); 570 | ++idx; 571 | } 572 | return result; 573 | } 574 | 575 | uint64_t 576 | Stripe::len() const 577 | { 578 | return stripeInfo->getNumberOfRows(); 579 | } 580 | 581 | uint64_t 582 | Stripe::length() const 583 | { 584 | return stripeInfo->getLength(); 585 | } 586 | 587 | uint64_t 588 | Stripe::offset() const 589 | { 590 | return stripeInfo->getOffset(); 591 | } 592 | 593 | py::tuple 594 | Stripe::statistics(uint64_t columnIndex) 595 | { 596 | if (columnIndex < 0 || 597 | columnIndex > rowReader->getSelectedType().getMaximumColumnId()) { 598 | throw py::index_error("column index out of range"); 599 | } 600 | std::unique_ptr stripeStats = 601 | reader.getORCReader().getStripeStatistics(stripeIndex); 602 | uint32_t num = stripeStats->getNumberOfRowIndexStats(columnIndex); 603 | py::tuple result = py::tuple(num); 604 | for (uint32_t i = 0; i < num; ++i) { 605 | const orc::ColumnStatistics* stats = 606 | stripeStats->getRowIndexStatistics(columnIndex, i); 607 | result[i] = this->buildStatistics( 608 | 
this->findColumnType(&rowReader->getSelectedType(), columnIndex), stats); 609 | } 610 | return result; 611 | } 612 | 613 | std::string 614 | Stripe::writerTimezone() 615 | { 616 | return stripeInfo->getWriterTimezone(); 617 | } 618 | --------------------------------------------------------------------------------