├── .clang-format ├── .devcontainer ├── devcontainer.json ├── docker-compose.yml └── docker.env ├── .editorconfig ├── .github └── workflows │ └── build.yml ├── .gitignore ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── TODO.md ├── benchmarks ├── create_tables.sh ├── minute_bars.csv ├── minute_bars.svg ├── run.py └── sql │ ├── array.sql │ ├── composite.sql │ ├── enum.sql │ ├── json.sql │ ├── minute_bars.sql │ └── numbers.sql ├── ci ├── linux │ └── repair-wheel.py ├── macos │ └── repair-wheel.py ├── setup-db.sh └── windows │ ├── repair-wheel.py │ └── repair-wheel.sh ├── cleanup.sh ├── environment.yml ├── include └── pgeon.h ├── pyproject.toml ├── python ├── __init__.py └── _pgeon.pyx ├── setup.py ├── src ├── CMakeLists.txt ├── cli.cc └── pgeon │ ├── api.cc │ ├── builder.cc │ ├── builder.h │ ├── builder │ ├── base.h │ ├── common.h │ ├── datetime.cc │ ├── datetime.h │ ├── geometric.cc │ ├── geometric.h │ ├── misc.cc │ ├── misc.h │ ├── nested.cc │ ├── nested.h │ ├── network.cc │ ├── network.h │ ├── numeric.cc │ ├── numeric.h │ ├── stringlike.cc │ ├── stringlike.h │ ├── text_search.cc │ └── text_search.h │ ├── pg_interface.cc │ ├── pg_interface.h │ ├── table_builder.cc │ ├── table_builder.h │ └── util │ └── streambuffer.h └── tests ├── _todo.py ├── conftest.py ├── test_basic.py ├── test_exceptions.py └── test_options.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 90 3 | DerivePointerAlignment: false 4 | SortIncludes: CaseSensitive 5 | SortUsingDeclarations: true 6 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pgeon-dev", 3 | "dockerComposeFile": "docker-compose.yml", 4 | "service": "app", 5 | "workspaceFolder": "/workspace", 6 | "shutdownAction": "stopCompose", 7 | "customizations": { 8 | 
"vscode": { 9 | "settings": {}, 10 | "extensions": [] 11 | } 12 | } 13 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 14 | // "forwardPorts": [5432], 15 | // Use 'postCreateCommand' to run commands after the container is created. 16 | // "postCreateCommand": "python --version", 17 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 18 | // "remoteUser": "vscode" 19 | } 20 | -------------------------------------------------------------------------------- /.devcontainer/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | app: 5 | build: 6 | context: .. 7 | dockerfile: Dockerfile 8 | 9 | env_file: 10 | - ./docker.env 11 | 12 | volumes: 13 | - ..:/workspace:cached 14 | 15 | # Overrides default command so things don't shut down after the process ends. 16 | command: sleep infinity 17 | 18 | # Runs app on the same network as the database container, allows "forwardPorts" in devcontainer.json function. 19 | network_mode: service:db 20 | 21 | # Uncomment the next line to use a non-root user for all processes. 22 | # user: vscode 23 | 24 | # Required for ptrace-based debuggers like C++, Go, and Rust 25 | cap_add: 26 | - SYS_PTRACE 27 | security_opt: 28 | - seccomp:unconfined 29 | # Use "forwardPorts" in **devcontainer.json** to forward an app port locally. 30 | # (Adding the "ports" property to this file will not forward from a Codespace.) 31 | 32 | db: 33 | image: postgres:latest 34 | restart: unless-stopped 35 | volumes: 36 | - postgres-data:/var/lib/postgresql/data 37 | env_file: 38 | - ./docker.env 39 | # Add "forwardPorts": ["5432"] to **devcontainer.json** to forward PostgreSQL locally. 40 | # (Adding the "ports" property to this file will not forward from a Codespace.) 
41 | 42 | volumes: 43 | postgres-data: 44 | -------------------------------------------------------------------------------- /.devcontainer/docker.env: -------------------------------------------------------------------------------- 1 | PGEON_TEST_DB=postgresql://postgres:postgres@localhost/postgres 2 | POSTGRES_HOST_AUTH_METHOD=trust 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.{py,pyx,pxd}] 12 | indent_size = 4 13 | 14 | [*.sql] 15 | indent_size = 4 16 | 17 | [Makefile] 18 | indent_style = tab 19 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | build_wheels: 12 | name: Build wheels on ${{ matrix.os }} 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-latest, windows-latest] 17 | 18 | steps: 19 | - name: Add pg_config to path on Windows 20 | run: | 21 | if [ "$RUNNER_OS" == "Windows" ]; then 22 | echo "$PGBIN" >> $GITHUB_PATH 23 | echo "$PGROOT\lib" >> $GITHUB_PATH 24 | echo "PQ_LIB_DIR=$PGROOT\lib" >> $GITHUB_ENV 25 | fi 26 | shell: bash 27 | 28 | - uses: actions/checkout@v3 29 | with: 30 | fetch-depth: 0 31 | 32 | - name: Build wheels 33 | uses: pypa/cibuildwheel@v2.11.2 34 | 35 | - uses: actions/upload-artifact@v3 36 | with: 37 | path: ./wheelhouse/*.whl 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 
.ccls-cache/ 2 | .vimspector.session 3 | .vscode/ 4 | build/ 5 | 6 | python/_pgeon.cpp 7 | 8 | ### Generated by gibo (https://github.com/simonwhitaker/gibo) 9 | ### https://raw.github.com/github/gitignore/cdd9e946da421758c6f42c427c7bc65c8326155d/C++.gitignore 10 | 11 | # Prerequisites 12 | *.d 13 | 14 | # Compiled Object files 15 | *.slo 16 | *.lo 17 | *.o 18 | *.obj 19 | 20 | # Precompiled Headers 21 | *.gch 22 | *.pch 23 | 24 | # Compiled Dynamic libraries 25 | *.so 26 | *.dylib 27 | *.dll 28 | 29 | # Fortran module files 30 | *.mod 31 | *.smod 32 | 33 | # Compiled Static libraries 34 | *.lai 35 | *.la 36 | *.a 37 | *.lib 38 | 39 | # Executables 40 | *.exe 41 | *.out 42 | *.app 43 | 44 | 45 | ### Generated by gibo (https://github.com/simonwhitaker/gibo) 46 | ### https://raw.github.com/github/gitignore/cdd9e946da421758c6f42c427c7bc65c8326155d/CMake.gitignore 47 | 48 | CMakeLists.txt.user 49 | CMakeCache.txt 50 | CMakeFiles 51 | CMakeScripts 52 | Testing 53 | # Makefile 54 | cmake_install.cmake 55 | install_manifest.txt 56 | compile_commands.json 57 | CTestTestfile.cmake 58 | _deps 59 | 60 | 61 | ### Generated by gibo (https://github.com/simonwhitaker/gibo) 62 | ### https://raw.github.com/github/gitignore/cdd9e946da421758c6f42c427c7bc65c8326155d/Global/macOS.gitignore 63 | 64 | # General 65 | .DS_Store 66 | .AppleDouble 67 | .LSOverride 68 | 69 | # Icon must end with two \r 70 | Icon 71 | 72 | 73 | # Thumbnails 74 | ._* 75 | 76 | # Files that might appear in the root of a volume 77 | .DocumentRevisions-V100 78 | .fseventsd 79 | .Spotlight-V100 80 | .TemporaryItems 81 | .Trashes 82 | .VolumeIcon.icns 83 | .com.apple.timemachine.donotpresent 84 | 85 | # Directories potentially created on remote AFP share 86 | .AppleDB 87 | .AppleDesktop 88 | Network Trash Folder 89 | Temporary Items 90 | .apdisk 91 | 92 | # Byte-compiled / optimized / DLL files 93 | __pycache__/ 94 | *.py[cod] 95 | *$py.class 96 | 97 | # C extensions 98 | *.so 99 | 100 | # Distribution / packaging 
101 | .Python 102 | build/ 103 | develop-eggs/ 104 | dist/ 105 | downloads/ 106 | eggs/ 107 | .eggs/ 108 | lib/ 109 | lib64/ 110 | parts/ 111 | sdist/ 112 | var/ 113 | wheels/ 114 | share/python-wheels/ 115 | *.egg-info/ 116 | *.dist-info/ 117 | .installed.cfg 118 | *.egg 119 | MANIFEST 120 | 121 | # PyInstaller 122 | # Usually these files are written by a python script from a template 123 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 124 | *.manifest 125 | *.spec 126 | 127 | # Installer logs 128 | pip-log.txt 129 | pip-delete-this-directory.txt 130 | 131 | # Unit test / coverage reports 132 | htmlcov/ 133 | .tox/ 134 | .nox/ 135 | .coverage 136 | .coverage.* 137 | .cache 138 | nosetests.xml 139 | coverage.xml 140 | *.cover 141 | *.py,cover 142 | .hypothesis/ 143 | .pytest_cache/ 144 | cover/ 145 | 146 | # Translations 147 | *.mo 148 | *.pot 149 | 150 | # Django stuff: 151 | *.log 152 | local_settings.py 153 | db.sqlite3 154 | db.sqlite3-journal 155 | 156 | # Flask stuff: 157 | instance/ 158 | .webassets-cache 159 | 160 | # Scrapy stuff: 161 | .scrapy 162 | 163 | # Sphinx documentation 164 | docs/_build/ 165 | 166 | # PyBuilder 167 | .pybuilder/ 168 | target/ 169 | 170 | # Jupyter Notebook 171 | .ipynb_checkpoints 172 | 173 | # IPython 174 | profile_default/ 175 | ipython_config.py 176 | 177 | # pyenv 178 | # For a library or package, you might want to ignore these files since the code is 179 | # intended to run in multiple environments; otherwise, check them in: 180 | # .python-version 181 | 182 | # pipenv 183 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 184 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 185 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 186 | # install all needed dependencies. 187 | #Pipfile.lock 188 | 189 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 190 | __pypackages__/ 191 | 192 | # Celery stuff 193 | celerybeat-schedule 194 | celerybeat.pid 195 | 196 | # SageMath parsed files 197 | *.sage.py 198 | 199 | # Environments 200 | .env 201 | .venv 202 | env/ 203 | venv/ 204 | ENV/ 205 | env.bak/ 206 | venv.bak/ 207 | 208 | # Spyder project settings 209 | .spyderproject 210 | .spyproject 211 | 212 | # Rope project settings 213 | .ropeproject 214 | 215 | # mkdocs documentation 216 | /site 217 | 218 | # mypy 219 | .mypy_cache/ 220 | .dmypy.json 221 | dmypy.json 222 | 223 | # Pyre type checker 224 | .pyre/ 225 | 226 | # pytype static type analyzer 227 | .pytype/ 228 | 229 | # Cython debug symbols 230 | cython_debug/ 231 | # Generated by skbuild 232 | _skbuild/ 233 | dist/ 234 | *.egg-info/ 235 | MANIFEST.in 236 | 237 | wheelhouse/ 238 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | message(STATUS "Using CMake ${CMAKE_VERSION}") 3 | 4 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 5 | 6 | # Main project configuration 7 | project( 8 | pgeon 9 | VERSION 0.1 10 | DESCRIPTION "Apache Arrow PostgreSQL connector" 11 | HOMEPAGE_URL "https://github.com/0x0L/pgeon" 12 | LANGUAGES CXX 13 | ) 14 | 15 | cmake_policy(SET CMP0054 NEW) 16 | 17 | # Compiler settings 18 | set(CMAKE_CXX_STANDARD 20) 19 | set(CMAKE_CXX_STANDARD_REQUIRED True) 20 | 21 | find_package(PostgreSQL REQUIRED) 22 | message(STATUS "Using PostgreSQL ${PostgreSQL_VERSION_STRING} ${PostgreSQL_LIBRARIES}") 23 | 24 | find_package(Arrow REQUIRED) 25 | message(STATUS "Using Arrow ${ARROW_FULL_SO_VERSION}") 26 | 27 | add_subdirectory(src) 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM 
mambaorg/micromamba:1.3.1 2 | # COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/environment.yml 3 | # RUN micromamba install -y -n base -f /tmp/environment.yml && \ 4 | # micromamba clean --all --yes 5 | 6 | FROM python:3.11 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 nullptr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pgeon 🐦 2 | 3 | [![Build](https://github.com/0x0L/pgeon/actions/workflows/build.yml/badge.svg)](https://github.com/0x0L/pgeon/actions/workflows/build.yml) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/0x0L/pgeon/blob/main/LICENSE) 5 | 6 | [Apache Arrow](https://arrow.apache.org/) [PostgreSQL](https://www.postgresql.org/) connector 7 | 8 | The goal of `pgeon` is to provide fast bulk data download from a PostgreSQL database into Apache Arrow tables. `pgeon` provides a C++ library and simple python bindings. Almost all PostgreSQL native types are supported (see [below](#notes)). 9 | 10 | If you're looking to upload data, you might want to have a look at [Arrow ADBC](https://github.com/apache/arrow-adbc). 11 | 12 | This project is similar to [pg2arrow](https://github.com/heterodb/pg2arrow) and is heavily inspired by it. The main differences are the use of `COPY` instead of `FETCH` and that our implementation uses the Arrow C++ API. 13 | 14 | ## Usage 15 | 16 | ```python 17 | from pgeon import copy_query 18 | db = "postgresql://postgres@localhost:5432/postgres" 19 | query = "SELECT * FROM some_table" 20 | tbl = copy_query(db, query) 21 | ``` 22 | 23 | The actual query performed is `COPY ({query}) TO STDOUT (FORMAT binary)`, see [this page](https://www.postgresql.org/docs/current/sql-copy.html) for more information. 24 | 25 | ## Installation 26 | 27 | ### Pre-built binary wheels 28 | 29 | We provide pre-built binary wheels in the [Release](https://github.com/0x0L/pgeon/releases) section. No dependencies are required. Conda users, please read below. 30 | 31 | ### Install from sources 32 | 33 | Building `pgeon` requires [libpq](https://www.postgresql.org/docs/current/libpq.html) to be available on your system. 
34 | 35 | ```shell 36 | git clone https://github.com/0x0L/pgeon.git 37 | cd pgeon 38 | pip install . 39 | ``` 40 | 41 | The pre-built binary wheels are built using the old C++ ABI as used by the `pyarrow` package available from [PyPI](https://pypi.org/project/pyarrow/). Unfortunately the conda-forge `pyarrow` package uses the new C++ ABI. If you are using `pyarrow` from conda at runtime, you can install `pgeon` using: 42 | 43 | ```shell 44 | CONDA_BUILD=1 pip install . 45 | ``` 46 | 47 | ### [optional] C++ library and tools 48 | 49 | This requires [cmake](https://cmake.org/) and [ninja](https://ninja-build.org/). In addition you'll need to install `libpq` and the Arrow C++ libraries (e.g. `arrow-cpp` in conda). 50 | 51 | ```shell 52 | mkdir build 53 | cd build 54 | cmake -GNinja .. 55 | ninja 56 | ``` 57 | 58 | ## Performance 59 | 60 | Elapsed time distributions of a query fetching 7 columns (1 timestamp, 2 ints, 4 reals) and around 4.5 million rows. The result is returned as a `pandas.DataFrame` in all cases. 61 | 62 | ![](benchmarks/minute_bars.svg) 63 | 64 | ## Notes 65 | 66 | - Queries using `ROW` (e.g. `SELECT ROW('a', 1)`) do not work (anonymous structs are not supported) 67 | 68 | - SQL arrays are mapped to `pyarrow.list_(...)`. Due to the PostgreSQL wire format, only 1D arrays are fully supported. Higher dimensional arrays will be flattened. 69 | 70 | - The output format for BitString types is not really helpful 71 | 72 | - tsvector types with letter weights are not supported 73 | 74 | - PostgreSQL range and domain types are not supported 75 | 76 | - Dynamic record types are not supported 77 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | - More UserOptions (e.g. treat null floats as NaN, ...) 
4 | 5 | - More tests & benchmarks 6 | 7 | - Batchbuilder simple `void (*callback)(std::shared_ptr)` interface 8 | 9 | - Refactor Builders 10 | 11 | - Standalone Flight server 12 | 13 | - Is there any issue with `COPY` ? If so, explore use of `FETCH` again 14 | 15 | - Review of output format for some struct types 16 | -------------------------------------------------------------------------------- /benchmarks/create_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SCRIPT=$(readlink -f "${0}") 3 | SCRIPTPATH=$(dirname "${SCRIPT}") 4 | 5 | for f in $SCRIPTPATH/sql/*.sql; do 6 | echo Running ${f}; 7 | psql -d $PGEON_TEST_DB -q -f ${f}; 8 | done 9 | -------------------------------------------------------------------------------- /benchmarks/minute_bars.csv: -------------------------------------------------------------------------------- 1 | asyncpg_fetch,asyncpg_copy_csv,psycopg_fetchall,psycopg_copy_csv,pgeon_copy 2 | 10.095713138580322,3.4536800384521484,8.678394794464111,6.086690902709961,2.4030420780181885 3 | 10.560228109359741,3.422717809677124,8.38644003868103,5.897386789321899,2.3597848415374756 4 | 10.177809953689575,3.42207670211792,8.902992010116577,5.924168825149536,2.295711040496826 5 | 10.099334955215454,3.4022300243377686,8.74126386642456,6.103153228759766,2.389941930770874 6 | 10.565501928329468,3.4130542278289795,8.513407945632935,6.047927141189575,2.3649909496307373 7 | 9.905993938446045,3.411057949066162,8.639901876449585,6.0126190185546875,2.314399003982544 8 | 10.670677185058594,3.4151041507720947,8.392177104949951,5.9890711307525635,2.39845609664917 9 | 10.306501865386963,3.4398081302642822,8.822201013565063,6.110926866531372,2.2930400371551514 10 | 10.051358938217163,3.4272308349609375,8.839097738265991,6.003473997116089,2.27715802192688 11 | 10.573030233383179,3.4210267066955566,8.549156665802002,6.03466796875,2.3648080825805664 12 | 
10.262261152267456,3.4349639415740967,8.509052991867065,6.088923931121826,2.3728771209716797 13 | 10.383577823638916,3.4317641258239746,8.592118978500366,6.034713983535767,2.39363694190979 14 | 9.920910835266113,3.4290151596069336,8.841479063034058,6.026019096374512,2.3601129055023193 15 | 10.407181024551392,3.4288759231567383,8.709710836410522,6.041751146316528,2.3246006965637207 16 | 10.114190101623535,3.4385170936584473,8.560504674911499,6.007788181304932,2.301820993423462 17 | 10.471460103988647,3.420072317123413,8.661936044692993,6.10942006111145,2.354604959487915 18 | 10.183914184570312,3.4538238048553467,8.41605019569397,6.076411962509155,2.38792085647583 19 | 10.391186952590942,3.426831007003784,8.828518867492676,6.06396484375,2.3048861026763916 20 | 10.390706062316895,3.435349941253662,8.83112382888794,6.079980134963989,2.3510677814483643 21 | 10.635796070098877,3.4399518966674805,8.555356979370117,6.028672933578491,2.3596980571746826 22 | 10.440201997756958,3.4243340492248535,8.488816261291504,6.023843050003052,2.359287977218628 23 | 10.163487195968628,3.435991048812866,8.554356098175049,6.091469049453735,2.2827987670898438 24 | 10.437966108322144,3.440228223800659,8.845073938369751,6.043931007385254,2.394116163253784 25 | 10.54007887840271,3.434037923812866,8.728957891464233,6.019822120666504,2.2803728580474854 26 | 10.749107122421265,3.444566011428833,8.521671056747437,6.067982196807861,2.3578219413757324 27 | 9.947474002838135,3.4577107429504395,8.66011095046997,5.998857021331787,2.342482089996338 28 | 10.73874807357788,3.44120192527771,8.420476913452148,5.941181182861328,2.2920150756835938 29 | 10.51664113998413,3.460557222366333,8.810289859771729,6.001388072967529,2.3033227920532227 30 | 10.142718076705933,3.435565948486328,8.806547164916992,5.997693061828613,2.291018009185791 31 | 10.778900146484375,3.4343202114105225,8.573671102523804,5.997773885726929,2.3699870109558105 32 | 
10.475090265274048,3.4310150146484375,8.50592589378357,5.989703893661499,2.281140089035034 33 | 10.872930765151978,3.4464383125305176,8.579365730285645,5.95805811882019,2.3755061626434326 34 | 10.245704889297485,3.4738049507141113,8.859824895858765,5.98901891708374,2.273061752319336 35 | 10.664002180099487,3.428598165512085,8.73629093170166,6.069189071655273,2.295767068862915 36 | 10.356072902679443,3.4334747791290283,8.508718967437744,6.0036890506744385,2.3080310821533203 37 | 10.560563087463379,3.4319992065429688,8.631680011749268,6.044823884963989,2.3893392086029053 38 | 10.23069429397583,3.4536218643188477,8.422562837600708,6.012848138809204,2.3506078720092773 39 | 10.382498741149902,3.442765951156616,8.79504108428955,6.0293049812316895,2.3969268798828125 40 | 10.238735914230347,3.465494155883789,8.83447003364563,6.069157123565674,2.3415586948394775 41 | 10.541892766952515,3.4549989700317383,8.598083019256592,6.0219151973724365,2.3654489517211914 42 | 10.29967212677002,3.4509379863739014,8.464033126831055,6.001490831375122,2.3910627365112305 43 | 9.863965034484863,3.4555346965789795,8.554350852966309,6.041918992996216,2.346611261367798 44 | 10.283767938613892,3.474482297897339,8.854292869567871,6.041573762893677,2.3542709350585938 45 | 10.243877172470093,3.4618611335754395,8.727796077728271,6.024172067642212,2.3041751384735107 46 | 10.476894855499268,3.4507787227630615,8.523108959197998,6.005989074707031,2.288313865661621 47 | 9.66651201248169,3.470623016357422,8.616523027420044,6.073851823806763,2.3591418266296387 48 | 10.561643123626709,3.465301990509033,8.381051301956177,6.0842320919036865,2.2924561500549316 49 | 10.136812210083008,3.4743590354919434,8.805369138717651,6.034684181213379,2.328608989715576 50 | 9.876017093658447,3.464846134185791,8.776822805404663,6.003312110900879,2.361663818359375 51 | 10.43434190750122,3.449636936187744,8.558181047439575,6.049262046813965,2.3151538372039795 52 | 
10.115739822387695,3.4674408435821533,8.488451957702637,6.078228950500488,2.410093069076538 53 | 10.241002798080444,3.4877970218658447,8.551777839660645,6.0928802490234375,2.305414915084839 54 | 9.73887324333191,3.4978060722351074,8.814873933792114,6.074985980987549,2.3662378787994385 55 | 10.180108070373535,3.4995713233947754,8.702198028564453,6.091523170471191,2.3788790702819824 56 | 10.008484840393066,3.5207931995391846,8.528488874435425,6.16368293762207,2.4020931720733643 57 | 10.207884073257446,3.461606025695801,8.609689950942993,6.064941167831421,2.3892059326171875 58 | 10.09879207611084,3.473416328430176,8.392743825912476,6.096439599990845,2.3911638259887695 59 | 10.124965906143188,3.460336923599243,8.7671537399292,6.053673028945923,2.306981086730957 60 | 10.08671259880066,3.4769551753997803,8.791364908218384,6.081356048583984,2.3908181190490723 61 | 10.622474908828735,3.47306489944458,8.531331062316895,6.030739068984985,2.3648221492767334 62 | 10.156677007675171,3.5110950469970703,8.461261749267578,6.069777011871338,2.306187152862549 63 | 9.76278305053711,3.483621835708618,8.531108856201172,6.113928318023682,2.360764980316162 64 | 10.146149158477783,3.4528887271881104,8.85041618347168,6.093689680099487,2.326408863067627 65 | 10.307142972946167,3.5127179622650146,8.70587706565857,6.00843071937561,2.3933000564575195 66 | 10.534659147262573,3.4699158668518066,8.52911901473999,6.02446722984314,2.309987783432007 67 | 9.627742052078247,3.478774070739746,8.5724618434906,6.0781331062316895,2.355074882507324 68 | 10.40033221244812,3.5225930213928223,8.380789995193481,6.017845869064331,2.304987668991089 69 | 10.08962106704712,3.4965100288391113,8.791450023651123,6.016102313995361,2.3765649795532227 70 | 9.738504886627197,3.5028889179229736,8.781290054321289,6.064890146255493,2.3351471424102783 71 | 10.461288928985596,3.5006940364837646,8.550048828125,6.083592176437378,2.3985390663146973 72 | 
10.14470100402832,3.4681410789489746,8.494976997375488,6.078736066818237,2.3354461193084717 73 | 10.207927942276001,3.498332977294922,8.512158393859863,6.043616056442261,2.3808000087738037 74 | 9.675312757492065,3.5493481159210205,8.789582967758179,6.043591022491455,2.371655225753784 75 | 10.257430076599121,3.4811558723449707,8.694859027862549,6.048892021179199,2.3663790225982666 76 | 9.990855932235718,3.4608349800109863,8.549638986587524,6.0162718296051025,2.2751588821411133 77 | 10.260905981063843,3.4984359741210938,8.593547821044922,6.0313780307769775,2.2981717586517334 78 | 10.010271072387695,3.495699882507324,8.396156787872314,6.0639190673828125,2.300577163696289 79 | 10.19277310371399,3.480020046234131,8.776932001113892,6.05824613571167,2.274275064468384 80 | 10.145508766174316,3.485482931137085,8.773476839065552,5.98048996925354,2.3942267894744873 81 | 10.36385726928711,3.482633113861084,8.524598121643066,6.14532995223999,2.3947808742523193 82 | 10.160391330718994,3.460085153579712,8.480232954025269,6.043949604034424,2.3744640350341797 83 | 9.81281304359436,3.5030269622802734,8.526901006698608,6.063862085342407,2.2795369625091553 84 | 10.186419010162354,3.5230588912963867,8.84254503250122,6.010951995849609,2.3191659450531006 85 | 10.238094806671143,3.5138182640075684,8.72116208076477,6.06764817237854,2.28700590133667 86 | 10.580572843551636,3.4931483268737793,8.520257949829102,5.992443084716797,2.299736738204956 87 | 9.700630187988281,3.496506929397583,8.6043701171875,6.106410026550293,2.3922293186187744 88 | 10.39240312576294,3.4788429737091064,8.377356052398682,6.096276044845581,2.382521152496338 89 | 10.182204008102417,3.5018959045410156,8.794171810150146,6.06470799446106,2.328428030014038 90 | 9.779273986816406,3.4965038299560547,8.765751123428345,6.0634989738464355,2.3727309703826904 91 | 10.36894178390503,3.5221140384674072,8.532798290252686,6.050763130187988,2.315681219100952 92 | 
10.154377937316895,3.4898622035980225,8.48030686378479,6.127264022827148,2.271536111831665 93 | 10.173689126968384,3.506088972091675,8.511484861373901,6.066236257553101,2.306636095046997 94 | 9.693994760513306,3.4950809478759766,8.813354969024658,6.048033952713013,2.3062961101531982 95 | 10.199285984039307,3.494696855545044,8.71052598953247,6.0643470287323,2.263500928878784 96 | 9.989582777023315,3.501615047454834,8.509217023849487,6.125725984573364,2.3444199562072754 97 | 10.700802087783813,3.5028398036956787,8.595217943191528,6.074347019195557,2.4064478874206543 98 | 10.032166004180908,3.5114400386810303,8.368451118469238,6.056747913360596,2.3144190311431885 99 | 10.196884870529175,3.4856069087982178,8.789282083511353,6.111891269683838,2.3867950439453125 100 | 10.18030595779419,3.4756081104278564,8.780909061431885,6.178780794143677,2.3664958477020264 101 | 10.424238681793213,3.4874560832977295,8.519793033599854,6.046826124191284,2.3111391067504883 102 | -------------------------------------------------------------------------------- /benchmarks/minute_bars.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 2023-02-19T13:54:02.625828 10 | image/svg+xml 11 | 12 | 13 | Matplotlib v3.7.0, https://matplotlib.org/ 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 30 | 31 | 32 | 33 | 39 | 40 | 41 | 42 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 1268 | 1269 | 1270 | 1271 | 1272 | 1273 | 1274 | 1275 | 1679 | 1680 | 1681 | 1682 | 1683 | 1684 | 1685 | 1686 | 2090 | 2091 | 2092 | 2093 | 2094 | 2095 | 2096 | 2097 | 2098 | 2099 | 2102 | 2103 | 2104 | 2105 | 2106 | 2107 | 2108 | 2109 | 2110 | 2111 | 2135 | 2136 | 2137 | 2138 | 2139 | 2140 | 2141 | 2142 | 2143 | 2144 | 2145 | 2146 | 2147 | 2148 | 2149 | 2150 | 2169 | 2170 | 2171 | 2172 | 2173 | 2174 | 2175 | 2176 | 2177 | 2178 | 2179 | 2180 | 2181 | 2182 | 2183 | 2184 | 2214 | 2215 | 2216 | 2217 | 2218 | 
2219 | 2220 | 2221 | 2222 | 2223 | 2224 | 2225 | 2226 | 2227 | 2228 | 2229 | 2268 | 2269 | 2270 | 2271 | 2272 | 2273 | 2274 | 2275 | 2276 | 2277 | 2278 | 2279 | 2280 | 2281 | 2282 | 2283 | 2297 | 2318 | 2319 | 2320 | 2321 | 2322 | 2323 | 2324 | 2325 | 2326 | 2327 | 2328 | 2359 | 2384 | 2405 | 2426 | 2445 | 2471 | 2472 | 2473 | 2474 | 2475 | 2476 | 2477 | 2478 | 2479 | 2480 | 2481 | 2482 | 2483 | 2486 | 2487 | 2488 | 2489 | 2495 | 2496 | 2497 | 2498 | 2499 | 2500 | 2533 | 2550 | 2576 | 2610 | 2617 | 2638 | 2659 | 2678 | 2679 | 2680 | 2681 | 2682 | 2683 | 2684 | 2685 | 2686 | 2687 | 2688 | 2689 | 2690 | 2691 | 2692 | 2693 | 2694 | 2695 | 2701 | 2702 | 2703 | 2704 | 2705 | 2706 | 2716 | 2717 | 2718 | 2719 | 2720 | 2721 | 2722 | 2723 | 2724 | 2725 | 2726 | 2727 | 2728 | 2729 | 2730 | 2731 | 2732 | 2733 | 2734 | 2735 | 2736 | 2742 | 2743 | 2744 | 2745 | 2746 | 2747 | 2754 | 2755 | 2756 | 2757 | 2758 | 2759 | 2760 | 2761 | 2762 | 2763 | 2764 | 2765 | 2766 | 2767 | 2768 | 2769 | 2770 | 2771 | 2772 | 2773 | 2774 | 2780 | 2781 | 2782 | 2783 | 2784 | 2785 | 2786 | 2787 | 2788 | 2789 | 2790 | 2791 | 2792 | 2793 | 2794 | 2795 | 2796 | 2797 | 2798 | 2799 | 2800 | 2801 | 2802 | 2803 | 2809 | 2810 | 2811 | 2812 | 2813 | 2814 | 2815 | 2816 | 2817 | 2818 | 2819 | 2820 | 2821 | 2822 | 2823 | 2824 | 2825 | 2826 | 2827 | 2828 | 2829 | 2830 | 2831 | 2832 | 2833 | 2834 | -------------------------------------------------------------------------------- /benchmarks/run.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | from io import BytesIO 4 | 5 | import asyncpg 6 | import pgeon 7 | import psycopg 8 | 9 | import pandas as pd 10 | import pyarrow.csv as csv 11 | import seaborn as sns 12 | 13 | 14 | def _df_from_buffer(buffer): 15 | buffer.seek(0) 16 | read_options = csv.ReadOptions(autogenerate_column_names=True) 17 | df = csv.read_csv(buffer, read_options=read_options).to_pandas() 18 | return df 19 | 20 | 21 | def 
asyncpg_fetch(db, query): 22 | async def fn(): 23 | conn = await asyncpg.connect(dsn=db) 24 | return pd.DataFrame(await conn.fetch(query)) 25 | 26 | return fn 27 | 28 | 29 | def asyncpg_copy_csv(db, query): 30 | async def fn(): 31 | with BytesIO() as buf: 32 | conn = await asyncpg.connect(dsn=db) 33 | await conn.copy_from_query(query, output=buf, format="csv") 34 | return _df_from_buffer(buf) 35 | 36 | return fn 37 | 38 | 39 | def psycopg_fetchall(db, query): 40 | def fn(): 41 | with psycopg.connect(db) as conn: 42 | with conn.cursor(binary=True) as cur: 43 | cur.execute(query) 44 | return pd.DataFrame(cur.fetchall()) 45 | 46 | return fn 47 | 48 | 49 | def psycopg_copy_csv(db, query): 50 | def fn(): 51 | with BytesIO() as buf, psycopg.connect(db) as conn: 52 | with conn.cursor(binary=True) as cur: 53 | with cur.copy( 54 | f"COPY ({query}) TO STDOUT (FORMAT csv)", 55 | ) as copy: 56 | for data in copy: 57 | buf.write(data) 58 | return _df_from_buffer(buf) 59 | 60 | return fn 61 | 62 | 63 | def pgeon_copy(db, query): 64 | def fn(): 65 | return pgeon.copy_query(db, query).to_pandas() 66 | 67 | return fn 68 | 69 | 70 | def benchmark(fn, n=1): 71 | elapsed = [] 72 | for _ in range(n): 73 | start = time.time() 74 | _ = fn() 75 | elapsed.append(time.time() - start) 76 | return elapsed 77 | 78 | 79 | def async_benchmark(fn, n=1): 80 | async def wrap(): 81 | elapsed = [] 82 | for _ in range(n): 83 | start = time.time() 84 | _ = await fn() 85 | elapsed.append(time.time() - start) 86 | return elapsed 87 | 88 | return asyncio.run(wrap()) 89 | 90 | 91 | def bench_minute_bars(db, n=1): 92 | print("Running minute_bars benchmark...") 93 | 94 | query = "select * from minute_bars" 95 | df = { 96 | "asyncpg_fetch": async_benchmark(asyncpg_fetch(db, query), n=n), 97 | "asyncpg_copy_csv": async_benchmark(asyncpg_copy_csv(db, query), n=n), 98 | "psycopg_fetchall": benchmark(psycopg_fetchall(db, query), n=n), 99 | "psycopg_copy_csv": benchmark(psycopg_copy_csv(db, query), n=n), 100 | 
"pgeon_copy": benchmark(pgeon_copy(db, query), n=n), 101 | } 102 | 103 | df = pd.DataFrame(df) 104 | df.to_csv("minute_bars.csv", index=False) 105 | 106 | ax = sns.kdeplot(data=df, fill=True) 107 | ax.get_legend().set_frame_on(False) 108 | ax.figure.set_size_inches(12, 3) 109 | ax.xaxis.set_label_text("seconds") 110 | ax.yaxis.set_visible(False) 111 | sns.despine(left=True) 112 | 113 | ax.figure.tight_layout() 114 | ax.figure.savefig("minute_bars.svg") # , transparent=True) 115 | 116 | 117 | if __name__ == "__main__": 118 | import os 119 | db = os.environ.get( 120 | "PGEON_TEST_DB", "postgresql://localhost:5432/postgres" 121 | ) 122 | bench_minute_bars(db, n=100) 123 | -------------------------------------------------------------------------------- /benchmarks/sql/array.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS sal_emp; 2 | 3 | CREATE TABLE sal_emp ( 4 | name text, 5 | pay_by_quarter integer [], 6 | schedule text [] [] 7 | ); 8 | 9 | INSERT INTO 10 | sal_emp 11 | VALUES 12 | ( 13 | 'Bill', 14 | '{10000, 10000, 10000, 10000}', 15 | '{{"meeting", "lunch"}, {"training", "presentation"}}' 16 | ), 17 | ( 18 | 'Carol', 19 | '{20000, 25000, 25000, 25000}', 20 | '{{"breakfast", "consulting"}, {"meeting", "lunch"}}' 21 | ); 22 | 23 | -- SELECT 24 | -- * 25 | -- FROM 26 | -- ( 27 | -- VALUES 28 | -- ( 29 | -- 'Bill', 30 | -- '{10000, 10000, 10000, 10000}', 31 | -- '{{"meeting", "lunch"}, {"training", "presentation"}}' 32 | -- ), 33 | -- ( 34 | -- 'Carol', 35 | -- '{20000, 25000, 25000, 25000}', 36 | -- '{{"breakfast", "consulting"}, {"meeting", "lunch"}}' 37 | -- ) 38 | -- ) AS foo; 39 | -------------------------------------------------------------------------------- /benchmarks/sql/composite.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS on_hand; 2 | 3 | DROP TYPE IF EXISTS inventory_item CASCADE; 4 | 5 | CREATE TYPE inventory_item AS ( 6 | 
name text, 7 | supplier_id integer, 8 | price numeric 9 | ); 10 | 11 | CREATE TABLE on_hand (item inventory_item, count integer); 12 | 13 | INSERT INTO 14 | on_hand 15 | VALUES 16 | (ROW('fuzzy dice', 42, 1.99), 1000); 17 | 18 | 19 | -- SELECT * FROM (VALUES (ROW('a', 1, 0.99)::inventory_item)) as foo; 20 | -------------------------------------------------------------------------------- /benchmarks/sql/enum.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS person; 2 | 3 | DROP TYPE IF EXISTS mood CASCADE; 4 | 5 | CREATE TYPE mood AS ENUM ('sad', 'ok', 'happy'); 6 | 7 | CREATE TABLE person (name text, current_mood mood); 8 | 9 | INSERT INTO 10 | person 11 | VALUES 12 | ('Moe', 'happy'), 13 | ('Larry', 'sad'), 14 | ('Curly', 'ok'); 15 | 16 | SELECT 17 | * 18 | FROM 19 | ( 20 | VALUES 21 | ('Moe', 'happy'), 22 | ('Larry', 'sad'), 23 | ('Curly', 'ok') 24 | ) AS foo; 25 | -------------------------------------------------------------------------------- /benchmarks/sql/json.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS json_table; 2 | 3 | CREATE TABLE json_table (a_json json, a_jsonb jsonb); 4 | 5 | INSERT INTO 6 | json_table 7 | VALUES 8 | ('{"a": 1}' :: json, '{"a": 1}' :: jsonb), 9 | ('{"b": 1}' :: json, '{"b": 1}' :: jsonb), 10 | ('{"a": 2}' :: json, '{"a": 2}' :: jsonb); 11 | -------------------------------------------------------------------------------- /benchmarks/sql/minute_bars.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS minute_bars; 2 | 3 | CREATE TABLE minute_bars ( 4 | timestamp timestamp, 5 | symbol integer, 6 | open real, 7 | high real, 8 | low real, 9 | close real, 10 | volume integer 11 | ); 12 | 13 | INSERT INTO 14 | minute_bars 15 | SELECT 16 | ts, 17 | symbol, 18 | random(), 19 | random(), 20 | random(), 21 | random(), 22 | floor(random() * 10000) 23 | FROM 24 | 
generate_series( 25 | '2020-01-01' :: timestamp, 26 | '2020-02-01' :: timestamp, 27 | '1 minute' :: interval 28 | ) AS ts, 29 | generate_series(0, 100) AS symbol; 30 | -------------------------------------------------------------------------------- /benchmarks/sql/numbers.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS numeric_table; 2 | 3 | CREATE TABLE numeric_table ( 4 | a_double double precision, 5 | a_real real, 6 | a_smallint smallint, 7 | a_integer integer, 8 | a_bigint bigint, 9 | a_decimal_22_9 decimal(22, 9), 10 | a_decimal_17_3 decimal(17, 3), 11 | a_smallserial smallserial, 12 | a_serial serial, 13 | a_bigserial bigserial 14 | ); 15 | 16 | INSERT INTO 17 | numeric_table 18 | VALUES 19 | (1.23, 4.56, 0, 0, 0, '1.2345', '1.23'), 20 | (7.89, 10.1112, 1, 2, 3, '12345.123456789', '123424.1345'); 21 | -------------------------------------------------------------------------------- /ci/linux/repair-wheel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Ignore libarrow libraries in the wheel""" 3 | 4 | import logging 5 | import sys 6 | from fnmatch import fnmatch 7 | 8 | from auditwheel.patcher import Patchelf 9 | from auditwheel.policy import ( 10 | POLICY_PRIORITY_HIGHEST, 11 | POLICY_PRIORITY_LOWEST, 12 | get_policy_by_name, 13 | get_policy_name, 14 | ) 15 | from auditwheel.repair import repair_wheel 16 | from auditwheel.wheel_abi import analyze_wheel_abi 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | EXCLUDE_PATTERN = "libarrow*" 22 | 23 | if __name__ == "__main__": 24 | wheel, dest_dir = sys.argv[1:] 25 | 26 | policy_name = get_policy_name(POLICY_PRIORITY_HIGHEST) 27 | if policy_name is None: 28 | raise ValueError("Invalid policy") 29 | 30 | policy = get_policy_by_name(policy_name) 31 | if policy is None: 32 | raise ValueError("Invalid policy") 33 | 34 | winfo = analyze_wheel_abi(wheel) 35 | libs = 
# Patching otool search to exclude libraries we don't want parsed
def parse_otool_install_names(stdout: str) -> dict[str, list[tuple[str, str, str]]]:
    """Parse otool output, dropping install names that match EXCLUDE_PATTERN.

    Delegates the parsing to delocate's original parser (``_parse_names``)
    and then filters each library's entries on their first element.
    """
    return {
        library: [
            entry for entry in entries if not fnmatch(entry[0], EXCLUDE_PATTERN)
        ]
        for library, entries in _parse_names(stdout).items()
    }
#!/usr/bin/env bash
# Install (Linux only) and start a throwaway PostgreSQL server for the test
# suite. Expects RUNNER_OS and, outside Linux, RUNNER_TEMP to be set by the
# CI environment.
set -e

if [[ "$RUNNER_OS" == "Linux" ]]; then
    echo "Install postgres"
    yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm
    yum install -y postgresql15-server postgresql15-devel

    # Make pg_config discoverable by the build (setup.py shells out to it).
    ln -sf /usr/pgsql-15/bin/pg_config /usr/bin/

    export PATH="/usr/pgsql-15/bin:$PATH"
    export RUNNER_TEMP="/tmp"
fi

echo "Start database"
PGDATA="$RUNNER_TEMP/pgdata"

if [[ "$RUNNER_OS" == "Linux" ]]; then
    # On Linux the server is initialised and started as the postgres user.
    # The data directory is single-quoted inside the command string so a
    # path containing spaces survives the extra shell started by su.
    su postgres -c "initdb --locale=C -E UTF-8 '$PGDATA'"
    su postgres -c "pg_ctl -D '$PGDATA' start"
else
    initdb --username=postgres --locale=C -E UTF-8 "$PGDATA"
    pg_ctl -D "$PGDATA" start
fi
/usr/bin/env bash 2 | 3 | wheel="$1" 4 | dest_dir="$2" 5 | 6 | # Install delvewheel in separate environment 7 | pipx install delvewheel 8 | pipx environment 9 | PIPX_HOME="$(pipx environment | grep -e 'PIPX_HOME=' | sed -re 's/PIPX_HOME=(.*)/\1/g')" 10 | PIPX_HOME=$(cygpath "$PIPX_HOME") 11 | 12 | "$PIPX_HOME/venvs/delvewheel/Scripts/python" ci/windows/repair-wheel.py "${wheel}" "${dest_dir}" 13 | -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -fr build python/_pgeon.cpp .pytest_cache/ pgeon.egg* *.whl 3 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pgeon-dev 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - cxx-compiler 6 | - git 7 | - libpq 8 | - python=3.11 9 | -------------------------------------------------------------------------------- /include/pgeon.h: -------------------------------------------------------------------------------- 1 | // MIT License 2 | 3 | // Copyright (c) 2022 nullptr 4 | 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 
// User-tunable settings for CopyQuery. Default values are the in-class
// initializers below; Validate() reports out-of-range values as an
// arrow::Status.
struct UserOptions {
  // Whether to treat string columns as dictionaries.
  bool string_as_dictionaries = false;
  // TODO(xav) max precision of 128 decimal ?
  // Default precision for the numeric type. Validate() requires >= 1.
  int default_numeric_precision = 22;
  // Default scale for the numeric type. Validate() requires >= 1 and
  // strictly smaller than default_numeric_precision.
  int default_numeric_scale = 6;
  // TODO(xav) lc_monetary
  // Default monetary precision. Validate() requires >= 1.
  int monetary_fractional_precision = 2;

  // Returns a default-constructed UserOptions.
  static UserOptions Defaults();
  // Returns Status::OK() when every field is within its accepted range,
  // Status::Invalid otherwise.
  arrow::Status Validate() const;
};
"https://github.com/0x0L/pgeon/issues" 23 | 24 | [project.optional-dependencies] 25 | tests = ["pytest"] 26 | benchmarks = ["asyncpg", "psycopg[binary]", "pandas", "seaborn"] 27 | 28 | [tool.setuptools_scm] 29 | 30 | [tool.cibuildwheel] 31 | build = "cp39-* cp310-* cp311-*" 32 | skip = ["*_i686", "*-musllinux_*", "*-win32", "pp*"] 33 | build-verbosity = 2 34 | before-all = "bash ci/setup-db.sh" 35 | environment = { MACOSX_DEPLOYMENT_TARGET = '10.14', PGEON_TEST_DB = 'postgresql://postgres@localhost/postgres' } 36 | test-command = "pytest {project}/tests" 37 | test-extras = ["tests"] 38 | 39 | [tool.cibuildwheel.macos] 40 | repair-wheel-command = "python ci/macos/repair-wheel.py {wheel} {dest_dir} {delocate_archs}" 41 | 42 | [tool.cibuildwheel.linux] 43 | environment-pass = ["RUNNER_OS"] 44 | repair-wheel-command = "/opt/_internal/pipx/venvs/auditwheel/bin/python ci/linux/repair-wheel.py {wheel} {dest_dir}" 45 | 46 | [tool.cibuildwheel.windows] 47 | repair-wheel-command = "bash ci/windows/repair-wheel.sh {wheel} {dest_dir}" 48 | -------------------------------------------------------------------------------- /python/__init__.py: -------------------------------------------------------------------------------- 1 | """Apache Arrow PostgreSQL connector""" 2 | 3 | # We need the following line before importing anything from _pgeon 4 | # in order to preload libarrow.so.* 5 | import pyarrow as _pa # noqa 6 | 7 | from ._pgeon import UserOptions, copy_query 8 | 9 | __all__ = ["UserOptions", "copy_query"] 10 | -------------------------------------------------------------------------------- /python/_pgeon.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | # cython: language_level=3 3 | # cython: binding=True 4 | 5 | from cython.operator cimport dereference as deref 6 | from libcpp.memory cimport shared_ptr, unique_ptr 7 | 8 | from pyarrow.lib cimport ( 9 | _Weakrefable, CResult, CStatus, CTable, 
cdef class UserOptions(_Weakrefable):
    """Options controlling how query results are converted.

    All parameters are keyword-only; any parameter left as ``None`` keeps the
    default supplied by the C++ ``UserOptions::Defaults()``.

    Parameters
    ----------
    string_as_dictionaries : bool, optional (default False)
        Whether to treat string columns as dictionaries

    default_numeric_precision : int, optional (default 22)
        Default precision for numeric type

    default_numeric_scale : int, optional (default 6)
        Default scale for numeric type

    monetary_fractional_precision : int, optional (default 2)
        Default monetary precision
    """
    cdef:
        unique_ptr[CUserOptions] options

    __slots__ = ()

    def __cinit__(self, *args, **kwargs):
        # Always start from the C++ defaults; __init__ then overrides the
        # fields the caller supplied.
        self.options.reset(
            new CUserOptions(CUserOptions.Defaults()))

    def __init__(self, *, string_as_dictionaries=None, default_numeric_precision=None,
                 default_numeric_scale=None, monetary_fractional_precision=None):
        if string_as_dictionaries is not None:
            self.string_as_dictionaries = string_as_dictionaries
        if default_numeric_precision is not None:
            self.default_numeric_precision = default_numeric_precision
        if default_numeric_scale is not None:
            self.default_numeric_scale = default_numeric_scale
        if monetary_fractional_precision is not None:
            self.monetary_fractional_precision = monetary_fractional_precision

    def __repr__(self) -> str:
        # NOTE(review): the format string previously had no placeholders, so
        # none of the option values were rendered; confirm the exact layout
        # wanted here.
        return (
            "pgeon.UserOptions<string_as_dictionaries={}, "
            "default_numeric_precision={}, default_numeric_scale={}, "
            "monetary_fractional_precision={}>".format(
                self.string_as_dictionaries, self.default_numeric_precision,
                self.default_numeric_scale, self.monetary_fractional_precision
            )
        )

    @property
    def string_as_dictionaries(self):
        """
        Whether to treat string columns as dictionaries
        """
        return deref(self.options).string_as_dictionaries

    @string_as_dictionaries.setter
    def string_as_dictionaries(self, value):
        deref(self.options).string_as_dictionaries = value

    @property
    def default_numeric_precision(self):
        """
        Default precision for numeric type
        """
        return deref(self.options).default_numeric_precision

    @default_numeric_precision.setter
    def default_numeric_precision(self, value):
        deref(self.options).default_numeric_precision = value

    @property
    def default_numeric_scale(self):
        """
        Default scale for numeric type
        """
        return deref(self.options).default_numeric_scale

    @default_numeric_scale.setter
    def default_numeric_scale(self, value):
        deref(self.options).default_numeric_scale = value

    @property
    def monetary_fractional_precision(self):
        """
        Default monetary precision
        """
        return deref(self.options).monetary_fractional_precision

    @monetary_fractional_precision.setter
    def monetary_fractional_precision(self, value):
        deref(self.options).monetary_fractional_precision = value

    @staticmethod
    cdef UserOptions wrap(CUserOptions options):
        # Build a Python wrapper that takes over a C++ options value.
        out = UserOptions()
        out.options.reset(new CUserOptions(move(options)))
        return out

    def validate(self):
        """
        Check the options with the C++ ``Validate()``; a non-OK status is
        turned into an exception by ``check_status``.
        """
        check_status(deref(self.options).Validate())

    def equals(self, UserOptions other):
        """
        Return True when all four option values match those of *other*.
        """
        return (
            self.string_as_dictionaries == other.string_as_dictionaries and
            self.default_numeric_precision == other.default_numeric_precision and
            self.default_numeric_scale == other.default_numeric_scale and
            self.monetary_fractional_precision == other.monetary_fractional_precision
        )

    def __getstate__(self):
        return (self.string_as_dictionaries, self.default_numeric_precision,
                self.default_numeric_scale, self.monetary_fractional_precision)

    def __setstate__(self, state):
        (self.string_as_dictionaries, self.default_numeric_precision,
         self.default_numeric_scale, self.monetary_fractional_precision) = state

    def __eq__(self, other):
        # equals() is typed on UserOptions; any other operand raises
        # TypeError, which simply means "not equal".
        try:
            return self.equals(other)
        except TypeError:
            return False
def get_pg_info():
    """Return PostgreSQL's version, include dir and lib dir from ``pg_config``.

    Raises
    ------
    OSError
        If the ``pg_config`` executable cannot be started.
    subprocess.CalledProcessError
        If ``pg_config`` exits with a non-zero status.
    """

    def run(option):
        return subprocess.check_output(["pg_config", option]).decode().strip()

    return {key: run("--" + key) for key in ("version", "includedir", "libdir")}


pa.create_library_symlinks()

include_dirs = ["include", "src", pa.get_include(), np.get_include()]
library_dirs = pa.get_library_dirs()
try:
    pg_config = get_pg_info()
except (OSError, subprocess.CalledProcessError):
    # Best effort: the build may still succeed when libpq lives on the
    # compiler's default search paths.
    print("pg_config not found in PATH")
else:
    include_dirs.append(pg_config["includedir"])
    library_dirs.append(pg_config["libdir"])

# conda builds must not force the pre-C++11 libstdc++ ABI.
macros = None if os.environ.get("CONDA_BUILD") else [("_GLIBCXX_USE_CXX11_ABI", "0")]

# Any platform not listed falls back to the generic C++17 flag instead of
# aborting the build with a KeyError.
cflags = {
    "win32": ["/std:c++latest"],
    "darwin": ["-std=c++17", "-mmacosx-version-min=10.14"],
    "linux": ["-std=c++17"],
}.get(sys.platform, ["-std=c++17"])

# Sorted so the compile order does not depend on directory iteration order.
sources = sorted(str(p) for p in Path("src/pgeon").glob("**/*.cc"))
extra_objects = [
    [
        "pgeon_cpp",
        {
            "sources": sources,
            "include_dirs": include_dirs,
            "language": "c++",
            "macros": macros,
            "cflags": cflags,
        },
    ]
]

libraries = ["pgeon_cpp"] + pa.get_libraries()
if sys.platform == "win32":
    libraries.extend(["libpq", "Ws2_32"])
else:
    libraries.append("pq")

extensions = [
    Extension(
        name="pgeon._pgeon",
        sources=["python/_pgeon.pyx"],
        language="c++",
        define_macros=macros,
        include_dirs=include_dirs,
        libraries=libraries,
        library_dirs=library_dirs,
        extra_compile_args=cflags,
    )
]

setup(
    name="pgeon",
    ext_modules=cythonize(extensions),
    libraries=extra_objects,
    packages=["pgeon"],
    package_dir={"pgeon": "python"},
    include_package_data=False,
)
-------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(pgeon SHARED) 2 | 3 | target_sources(pgeon PRIVATE 4 | pgeon/builder/datetime.cc 5 | pgeon/builder/geometric.cc 6 | pgeon/builder/misc.cc 7 | pgeon/builder/nested.cc 8 | pgeon/builder/network.cc 9 | pgeon/builder/numeric.cc 10 | pgeon/builder/stringlike.cc 11 | pgeon/builder/text_search.cc 12 | pgeon/builder.cc 13 | pgeon/table_builder.cc 14 | pgeon/pg_interface.cc 15 | pgeon/api.cc 16 | ) 17 | 18 | target_include_directories(pgeon PUBLIC ../include) 19 | target_include_directories(pgeon PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 20 | 21 | target_link_libraries(pgeon PRIVATE arrow_shared PostgreSQL::PostgreSQL) 22 | 23 | include(GNUInstallDirs) 24 | install( 25 | TARGETS pgeon 26 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 27 | ) 28 | 29 | find_package(Parquet REQUIRED) 30 | add_executable(pgeon-cli cli.cc) 31 | target_link_libraries(pgeon-cli PRIVATE pgeon arrow_shared parquet_shared PostgreSQL::PostgreSQL) 32 | -------------------------------------------------------------------------------- /src/cli.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | int main(int argc, char const* argv[]) { 10 | if ((argc != 3) && (argc != 4)) { 11 | std::cerr << "Usage: " << argv[0] << " DB QUERY [OUTPUT]" << std::endl; 12 | return EXIT_FAILURE; 13 | } 14 | 15 | auto table = pgeon::CopyQuery(argv[1], argv[2]); 16 | if (!table.ok()) { 17 | std::cerr << table.status() << std::endl; 18 | return EXIT_FAILURE; 19 | } 20 | 21 | std::cout << (*table)->schema()->ToString() << std::endl; 22 | std::cout << "Fetched " << (*table)->num_rows() << " rows" << std::endl; 23 | 24 | if (argc == 3) return EXIT_SUCCESS; 25 | 26 | auto output = 
namespace {

// Runs a statement that returns no rows (e.g. BEGIN / END) and converts a
// failure into an arrow::Status. The PGresult is released on every path.
arrow::Status ExecCommand(PGconn* conn, const char* command) {
  std::unique_ptr<PGresult, decltype(&PQclear)> res(PQexec(conn, command), &PQclear);
  if (PQresultStatus(res.get()) != PGRES_COMMAND_OK) {
    return arrow::Status::IOError("[libpq] ", PQresultErrorMessage(res.get()));
  }
  return arrow::Status::OK();
}

}  // namespace

// Connects with `conninfo`, runs `query` inside a read-only transaction and
// returns the result as an Arrow table. Any libpq failure is reported as an
// IOError status; failures from the table builder are propagated unchanged.
arrow::Result<std::shared_ptr<arrow::Table>> CopyQuery(const char* conninfo,
                                                        const char* query,
                                                        const UserOptions& options) {
  // Owns the connection: PQfinish runs on every exit path. Error messages
  // are copied into the returned Status before the connection is destroyed.
  std::unique_ptr<PGconn, decltype(&PQfinish)> conn(PQconnectdb(conninfo), &PQfinish);
  if (PQstatus(conn.get()) != CONNECTION_OK) {
    return arrow::Status::IOError("[libpq] ", PQerrorMessage(conn.get()));
  }

  auto status = ExecCommand(conn.get(), "BEGIN READ ONLY");
  if (!status.ok()) {
    return status;
  }

  auto builder = MakeTableBuilder(conn.get(), query, options);
  if (!builder.ok()) {
    return builder.status();
  }

  status = CopyQuery(conn.get(), query, *builder);
  if (!status.ok()) {
    return status;
  }

  auto table = (*builder)->Flush();
  if (!table.ok()) {
    return table.status();
  }

  status = ExecCommand(conn.get(), "END");
  if (!status.ok()) {
    return status;
  }
  return table;
}
{"cstring_recv", &make}, 58 | {"date_recv", &make>}, 59 | // TODO(xav) this probably needs to get done in MakeBuilder 60 | // {"domain_recv", &make}, 61 | {"enum_recv", &make}, 62 | {"float4recv", &make}, 63 | {"float8recv", &make}, 64 | {"hstore_recv", &make}, 65 | {"inet_recv", &make}, 66 | {"int2recv", &make>}, 67 | // {"int2vectorrecv", &make<>}, // should need no implem 68 | {"int4recv", &make}, 69 | {"int8recv", &make}, 70 | {"interval_recv", &make}, 71 | {"json_recv", &make}, 72 | {"jsonb_recv", &make}, 73 | {"jsonpath_recv", &make}, 74 | {"line_recv", &make}, 75 | {"lseg_recv", &make}, 76 | {"macaddr_recv", &make}, 77 | {"macaddr8_recv", &make}, 78 | // {"multirange_recv", &make}, // require more type info 79 | {"namerecv", &make}, 80 | {"numeric_recv", &make}, 81 | {"oidrecv", &make}, 82 | // {"oidvectorrecv", &make<>}, // should need no implem 83 | {"path_recv", &make}, 84 | {"pg_dependencies_recv", &make}, 85 | {"pg_lsn_recv", &make}, 86 | {"pg_mcv_list_recv", &make}, 87 | {"pg_ndistinct_recv", &make}, 88 | {"pg_snapshot_recv", &make}, 89 | {"point_recv", &make}, 90 | {"poly_recv", &make}, 91 | // {"range_recv", &make}, // require more type info 92 | {"record_recv", &make}, 93 | {"regclassrecv", &make}, 94 | {"regcollationrecv", &make}, 95 | {"regconfigrecv", &make}, 96 | {"regdictionaryrecv", &make}, 97 | {"regnamespacerecv", &make}, 98 | {"regoperatorrecv", &make}, 99 | {"regoperrecv", &make}, 100 | {"regprocedurerecv", &make}, 101 | {"regprocrecv", &make}, 102 | {"regrolerecv", &make}, 103 | {"regtyperecv", &make}, 104 | {"textrecv", &make}, 105 | {"tidrecv", &make}, 106 | {"time_recv", &make}, 107 | {"timetz_recv", &make}, 108 | {"timestamp_recv", &make}, 109 | {"timestamptz_recv", &make}, 110 | {"tsqueryrecv", &make}, 111 | {"tsvectorrecv", &make}, 112 | {"unknownrecv", &make}, 113 | {"uuid_recv", &make}, 114 | {"varbit_recv", &make}, 115 | {"varcharrecv", &make}, 116 | {"void_recv", &make}, 117 | {"xid8recv", &make}, 118 | {"xidrecv", &make}, 119 | 
{"xml_recv", &make}, 120 | }; 121 | 122 | std::shared_ptr MakeBuilder(const SqlTypeInfo& info, 123 | const UserOptions& options) { 124 | if (options.string_as_dictionaries) { 125 | if ((info.typreceive == "bpcharrecv") || (info.typreceive == "varcharrecv") || 126 | (info.typreceive == "textrecv")) { 127 | return make(info, options); 128 | } 129 | } 130 | 131 | bool found = gTypeMap.count(info.typreceive) > 0; 132 | return found ? gTypeMap[info.typreceive](info, options) 133 | : make(info, options); 134 | } 135 | 136 | } // namespace pgeon 137 | -------------------------------------------------------------------------------- /src/pgeon/builder.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include "pgeon/builder/base.h" 8 | 9 | namespace pgeon { 10 | 11 | std::shared_ptr MakeBuilder(const SqlTypeInfo&, const UserOptions&); 12 | 13 | } // namespace pgeon 14 | -------------------------------------------------------------------------------- /src/pgeon/builder/base.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "pgeon/util/streambuffer.h" 14 | 15 | namespace pgeon { 16 | 17 | class ArrayBuilder { 18 | public: 19 | std::unique_ptr arrow_builder_; 20 | virtual ~ArrayBuilder() = default; 21 | virtual arrow::Status Append(StreamBuffer* sb) = 0; 22 | 23 | inline std::shared_ptr type() { return arrow_builder_->type(); } 24 | 25 | inline arrow::Result> Flush() { 26 | std::shared_ptr array; 27 | ARROW_RETURN_NOT_OK(arrow_builder_->Finish(&array)); 28 | return array; 29 | } 30 | }; 31 | 32 | using Field = std::pair>; 33 | using FieldVector = std::vector; 34 | 35 | struct SqlTypeInfo { 36 | std::string typreceive; 37 | int typmod = -1; 38 | int typlen = -1; 39 | 40 | // for 
ListBuilder 41 | std::shared_ptr value_builder; 42 | 43 | // for StructBuilder 44 | FieldVector field_builders; 45 | }; 46 | 47 | } // namespace pgeon 48 | -------------------------------------------------------------------------------- /src/pgeon/builder/common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include "pgeon/builder/base.h" 8 | 9 | #define APPEND_AND_RETURN_IF_EMPTY(sb, ptr) \ 10 | do { \ 11 | int32_t len = sb->ReadInt32(); \ 12 | if (len == -1) { \ 13 | return ptr->AppendNull(); \ 14 | } \ 15 | } while (0) 16 | 17 | namespace pgeon { 18 | 19 | struct BinaryRecv { 20 | static const char* recv(StreamBuffer* sb, size_t len) { return sb->ReadBinary(len); } 21 | }; 22 | 23 | struct DateRecv { 24 | static const int32_t kEpoch = 10957; // 2000-01-01 - 1970-01-01 (days) 25 | static inline int32_t recv(StreamBuffer* sb) { return sb->ReadInt32() + kEpoch; } 26 | }; 27 | 28 | struct BoolRecv { 29 | static inline bool recv(StreamBuffer* sb) { return (sb->ReadUInt8() != 0); } 30 | }; 31 | 32 | struct UInt8Recv { 33 | static inline uint8_t recv(StreamBuffer* sb) { return sb->ReadUInt8(); } 34 | }; 35 | 36 | struct Int16Recv { 37 | static inline int16_t recv(StreamBuffer* sb) { return sb->ReadInt16(); } 38 | }; 39 | 40 | struct Int32Recv { 41 | static inline int32_t recv(StreamBuffer* sb) { return sb->ReadInt32(); } 42 | }; 43 | 44 | struct Int64Recv { 45 | static inline int64_t recv(StreamBuffer* sb) { return sb->ReadInt64(); } 46 | }; 47 | 48 | struct Float32Recv { 49 | static inline float recv(StreamBuffer* sb) { return sb->ReadFloat32(); } 50 | }; 51 | 52 | struct Float64Recv { 53 | static inline double recv(StreamBuffer* sb) { return sb->ReadFloat64(); } 54 | }; 55 | 56 | template 57 | class GenericBuilder : public ArrayBuilder { 58 | private: 59 | BuilderT* ptr_; 60 | 61 | public: 62 | GenericBuilder(const SqlTypeInfo& info, const UserOptions&) { 
63 | arrow_builder_ = std::make_unique(); 64 | ptr_ = reinterpret_cast(arrow_builder_.get()); 65 | } 66 | 67 | arrow::Status Append(StreamBuffer* sb) { 68 | int32_t len = sb->ReadInt32(); 69 | if (len == -1) { 70 | // TODO(xav): as an option ? 71 | // if constexpr (std::is_base_of::value || 72 | // std::is_base_of::value) 73 | // return ptr_->Append(NAN); 74 | // else 75 | return ptr_->AppendNull(); 76 | } 77 | 78 | if constexpr (std::is_base_of::value || 79 | std::is_base_of::value || 80 | std::is_base_of::value || 81 | std::is_base_of::value) { 82 | return ptr_->Append(RecvT::recv(sb, len), len); 83 | } else { 84 | return ptr_->Append(RecvT::recv(sb)); 85 | } 86 | } 87 | }; 88 | 89 | } // namespace pgeon 90 | -------------------------------------------------------------------------------- /src/pgeon/builder/datetime.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/datetime.h" 4 | 5 | #include 6 | 7 | #include "pgeon/builder/common.h" 8 | 9 | namespace pgeon { 10 | 11 | TimeBuilder::TimeBuilder(const SqlTypeInfo&, const UserOptions&) { 12 | arrow_builder_ = std::make_unique( 13 | arrow::time64(arrow::TimeUnit::MICRO), arrow::default_memory_pool()); 14 | ptr_ = reinterpret_cast(arrow_builder_.get()); 15 | } 16 | 17 | arrow::Status TimeBuilder::Append(StreamBuffer* sb) { 18 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 19 | return ptr_->Append(sb->ReadInt64()); 20 | } 21 | 22 | TimeTzBuilder::TimeTzBuilder(const SqlTypeInfo&, const UserOptions&) { 23 | arrow_builder_ = std::make_unique( 24 | arrow::time64(arrow::TimeUnit::MICRO), arrow::default_memory_pool()); 25 | ptr_ = reinterpret_cast(arrow_builder_.get()); 26 | } 27 | 28 | arrow::Status TimeTzBuilder::Append(StreamBuffer* sb) { 29 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 30 | auto value = sb->ReadInt64(); 31 | auto tz = sb->ReadInt32(); 32 | return ptr_->Append(value + tz * 1000000LL); 33 | } 34 | 35 | 
TimestampBuilder::TimestampBuilder(const SqlTypeInfo& info, const UserOptions&) { 36 | auto type = arrow::timestamp(arrow::TimeUnit::MICRO); 37 | if (info.typreceive == "timestamptz_recv") 38 | type = arrow::timestamp(arrow::TimeUnit::MICRO, "UTC"); 39 | 40 | arrow_builder_ = 41 | std::make_unique(type, arrow::default_memory_pool()); 42 | ptr_ = reinterpret_cast(arrow_builder_.get()); 43 | } 44 | 45 | arrow::Status TimestampBuilder::Append(StreamBuffer* sb) { 46 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 47 | static const int64_t kEpoch = 946684800000000LL; // 2000-01-01 - 1970-01-01 (us) 48 | return ptr_->Append(sb->ReadInt64() + kEpoch); 49 | } 50 | 51 | IntervalBuilder::IntervalBuilder(const SqlTypeInfo&, const UserOptions&) { 52 | arrow_builder_ = 53 | std::make_unique(arrow::default_memory_pool()); 54 | ptr_ = reinterpret_cast(arrow_builder_.get()); 55 | } 56 | 57 | arrow::Status IntervalBuilder::Append(StreamBuffer* sb) { 58 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 59 | // static const int64_t kMicrosecondsPerDay = 24 * 3600 * 1000000LL; 60 | int64_t nano = sb->ReadInt64() * 1000LL; 61 | int32_t days = sb->ReadInt32(); 62 | int32_t months = sb->ReadInt32(); 63 | return ptr_->Append({.months = months, .days = days, .nanoseconds = nano}); 64 | } 65 | 66 | } // namespace pgeon 67 | -------------------------------------------------------------------------------- /src/pgeon/builder/datetime.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include "pgeon/builder/base.h" 6 | 7 | namespace pgeon { 8 | 9 | class TimeBuilder : public ArrayBuilder { 10 | private: 11 | arrow::Time64Builder* ptr_; 12 | 13 | public: 14 | TimeBuilder(const SqlTypeInfo&, const UserOptions&); 15 | arrow::Status Append(StreamBuffer*); 16 | }; 17 | 18 | class TimeTzBuilder : public ArrayBuilder { 19 | private: 20 | arrow::Time64Builder* ptr_; 21 | 22 | public: 23 | TimeTzBuilder(const SqlTypeInfo&, const 
UserOptions&); 24 | arrow::Status Append(StreamBuffer*); 25 | }; 26 | 27 | class TimestampBuilder : public ArrayBuilder { 28 | private: 29 | arrow::TimestampBuilder* ptr_; 30 | 31 | public: 32 | TimestampBuilder(const SqlTypeInfo&, const UserOptions&); 33 | arrow::Status Append(StreamBuffer*); 34 | }; 35 | 36 | class IntervalBuilder : public ArrayBuilder { 37 | private: 38 | arrow::MonthDayNanoIntervalBuilder* ptr_; 39 | 40 | public: 41 | IntervalBuilder(const SqlTypeInfo&, const UserOptions&); 42 | arrow::Status Append(StreamBuffer*); 43 | }; 44 | 45 | } // namespace pgeon 46 | -------------------------------------------------------------------------------- /src/pgeon/builder/geometric.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/geometric.h" 4 | 5 | #include "pgeon/builder/common.h" 6 | 7 | namespace pgeon { 8 | 9 | inline arrow::Status AppendFlatDoubleHelper(StreamBuffer* sb, arrow::StructBuilder* ptr) { 10 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr); 11 | ARROW_RETURN_NOT_OK(ptr->Append()); 12 | for (int i = 0; i < ptr->num_children(); i++) { 13 | ARROW_RETURN_NOT_OK(reinterpret_cast(ptr->child(i)) 14 | ->Append(Float64Recv::recv(sb))); 15 | } 16 | return arrow::Status::OK(); 17 | } 18 | 19 | PointBuilder::PointBuilder(const SqlTypeInfo&, const UserOptions&) { 20 | static const auto& type = arrow::struct_({ 21 | arrow::field("x", arrow::float64()), 22 | arrow::field("y", arrow::float64()), 23 | }); 24 | 25 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 26 | ptr_ = reinterpret_cast(arrow_builder_.get()); 27 | } 28 | 29 | arrow::Status PointBuilder::Append(StreamBuffer* sb) { 30 | return AppendFlatDoubleHelper(sb, ptr_); 31 | } 32 | 33 | LineBuilder::LineBuilder(const SqlTypeInfo&, const UserOptions&) { 34 | static const auto& type = arrow::struct_({ 35 | arrow::field("A", arrow::float64()), 36 | arrow::field("B", 
arrow::float64()), 37 | arrow::field("C", arrow::float64()), 38 | }); 39 | 40 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 41 | ptr_ = reinterpret_cast(arrow_builder_.get()); 42 | } 43 | 44 | arrow::Status LineBuilder::Append(StreamBuffer* sb) { 45 | return AppendFlatDoubleHelper(sb, ptr_); 46 | } 47 | 48 | BoxBuilder::BoxBuilder(const SqlTypeInfo&, const UserOptions&) { 49 | static const auto& type = arrow::struct_({ 50 | arrow::field("x1", arrow::float64()), 51 | arrow::field("y1", arrow::float64()), 52 | arrow::field("x2", arrow::float64()), 53 | arrow::field("y2", arrow::float64()), 54 | }); 55 | 56 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 57 | ptr_ = reinterpret_cast(arrow_builder_.get()); 58 | } 59 | 60 | arrow::Status BoxBuilder::Append(StreamBuffer* sb) { 61 | return AppendFlatDoubleHelper(sb, ptr_); 62 | } 63 | 64 | CircleBuilder::CircleBuilder(const SqlTypeInfo&, const UserOptions&) { 65 | static const auto& type = arrow::struct_({ 66 | arrow::field("x", arrow::float64()), 67 | arrow::field("y", arrow::float64()), 68 | arrow::field("r", arrow::float64()), 69 | }); 70 | 71 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 72 | ptr_ = reinterpret_cast(arrow_builder_.get()); 73 | } 74 | 75 | arrow::Status CircleBuilder::Append(StreamBuffer* sb) { 76 | return AppendFlatDoubleHelper(sb, ptr_); 77 | } 78 | 79 | PathBuilder::PathBuilder(const SqlTypeInfo&, const UserOptions&) { 80 | static const auto& type = 81 | arrow::struct_({arrow::field("closed", arrow::boolean()), 82 | arrow::field("points", arrow::list(arrow::struct_({ 83 | arrow::field("x", arrow::float64()), 84 | arrow::field("y", arrow::float64()), 85 | })))}); 86 | 87 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 88 | ptr_ = reinterpret_cast(arrow_builder_.get()); 89 | is_closed_builder_ = reinterpret_cast(ptr_->child(0)); 90 | 
point_list_builder_ = reinterpret_cast(ptr_->child(1)); 91 | point_builder_ = 92 | reinterpret_cast(point_list_builder_->value_builder()); 93 | x_builder_ = reinterpret_cast(point_builder_->child(0)); 94 | y_builder_ = reinterpret_cast(point_builder_->child(1)); 95 | } 96 | 97 | arrow::Status PathBuilder::Append(StreamBuffer* sb) { 98 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 99 | ARROW_RETURN_NOT_OK(ptr_->Append()); 100 | ARROW_RETURN_NOT_OK(is_closed_builder_->Append(BoolRecv::recv(sb))); 101 | 102 | int32_t npts = sb->ReadInt32(); 103 | ARROW_RETURN_NOT_OK(point_list_builder_->Append()); 104 | for (int32_t i = 0; i < npts; i++) { 105 | ARROW_RETURN_NOT_OK(point_builder_->Append()); 106 | ARROW_RETURN_NOT_OK(x_builder_->Append(Float64Recv::recv(sb))); 107 | ARROW_RETURN_NOT_OK(y_builder_->Append(Float64Recv::recv(sb))); 108 | } 109 | return arrow::Status::OK(); 110 | } 111 | 112 | PolygonBuilder::PolygonBuilder(const SqlTypeInfo&, const UserOptions&) { 113 | static const auto& type = arrow::list(arrow::struct_({ 114 | arrow::field("x", arrow::float64()), 115 | arrow::field("y", arrow::float64()), 116 | })); 117 | 118 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 119 | ptr_ = reinterpret_cast(arrow_builder_.get()); 120 | point_builder_ = reinterpret_cast(ptr_->value_builder()); 121 | x_builder_ = reinterpret_cast(point_builder_->child(0)); 122 | y_builder_ = reinterpret_cast(point_builder_->child(1)); 123 | } 124 | 125 | arrow::Status PolygonBuilder::Append(StreamBuffer* sb) { 126 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 127 | ARROW_RETURN_NOT_OK(ptr_->Append()); 128 | int32_t npts = sb->ReadInt32(); 129 | for (int32_t i = 0; i < npts; i++) { 130 | ARROW_RETURN_NOT_OK(point_builder_->Append()); 131 | ARROW_RETURN_NOT_OK(x_builder_->Append(Float64Recv::recv(sb))); 132 | ARROW_RETURN_NOT_OK(y_builder_->Append(Float64Recv::recv(sb))); 133 | } 134 | return arrow::Status::OK(); 135 | } 136 | 137 | } // namespace pgeon 138 | 
-------------------------------------------------------------------------------- /src/pgeon/builder/geometric.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include "pgeon/builder/base.h" 6 | 7 | namespace pgeon { 8 | 9 | class PointBuilder : public ArrayBuilder { 10 | private: 11 | arrow::StructBuilder* ptr_; 12 | 13 | public: 14 | PointBuilder(const SqlTypeInfo&, const UserOptions&); 15 | arrow::Status Append(StreamBuffer*); 16 | }; 17 | 18 | class LineBuilder : public ArrayBuilder { 19 | private: 20 | arrow::StructBuilder* ptr_; 21 | 22 | public: 23 | LineBuilder(const SqlTypeInfo&, const UserOptions&); 24 | arrow::Status Append(StreamBuffer*); 25 | }; 26 | 27 | class BoxBuilder : public ArrayBuilder { 28 | private: 29 | arrow::StructBuilder* ptr_; 30 | 31 | public: 32 | BoxBuilder(const SqlTypeInfo&, const UserOptions&); 33 | arrow::Status Append(StreamBuffer*); 34 | }; 35 | 36 | class CircleBuilder : public ArrayBuilder { 37 | private: 38 | arrow::StructBuilder* ptr_; 39 | 40 | public: 41 | CircleBuilder(const SqlTypeInfo&, const UserOptions&); 42 | arrow::Status Append(StreamBuffer*); 43 | }; 44 | 45 | class PathBuilder : public ArrayBuilder { 46 | private: 47 | arrow::StructBuilder* ptr_; 48 | arrow::BooleanBuilder* is_closed_builder_; 49 | arrow::ListBuilder* point_list_builder_; 50 | arrow::StructBuilder* point_builder_; 51 | arrow::DoubleBuilder* x_builder_; 52 | arrow::DoubleBuilder* y_builder_; 53 | 54 | public: 55 | PathBuilder(const SqlTypeInfo&, const UserOptions&); 56 | arrow::Status Append(StreamBuffer*); 57 | }; 58 | 59 | class PolygonBuilder : public ArrayBuilder { 60 | private: 61 | arrow::ListBuilder* ptr_; 62 | arrow::StructBuilder* point_builder_; 63 | arrow::DoubleBuilder* x_builder_; 64 | arrow::DoubleBuilder* y_builder_; 65 | 66 | public: 67 | PolygonBuilder(const SqlTypeInfo&, const UserOptions&); 68 | arrow::Status Append(StreamBuffer*); 69 | 
}; 70 | 71 | } // namespace pgeon 72 | -------------------------------------------------------------------------------- /src/pgeon/builder/misc.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/misc.h" 4 | 5 | #include 6 | 7 | #include "pgeon/builder/common.h" 8 | 9 | namespace pgeon { 10 | 11 | NullBuilder::NullBuilder(const SqlTypeInfo&, const UserOptions&) { 12 | arrow_builder_ = std::make_unique(); 13 | ptr_ = reinterpret_cast(arrow_builder_.get()); 14 | } 15 | 16 | arrow::Status NullBuilder::Append(StreamBuffer* sb) { 17 | int32_t len = sb->ReadInt32(); 18 | return ptr_->AppendNull(); 19 | } 20 | 21 | TidBuilder::TidBuilder(const SqlTypeInfo&, const UserOptions&) { 22 | static const auto& type = arrow::struct_({ 23 | arrow::field("block", arrow::int32()), 24 | arrow::field("offset", arrow::int16()), 25 | }); 26 | 27 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 28 | ptr_ = reinterpret_cast(arrow_builder_.get()); 29 | block_builder_ = reinterpret_cast(ptr_->child(0)); 30 | offset_builder_ = reinterpret_cast(ptr_->child(1)); 31 | } 32 | 33 | arrow::Status TidBuilder::Append(StreamBuffer* sb) { 34 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 35 | ARROW_RETURN_NOT_OK(ptr_->Append()); 36 | ARROW_RETURN_NOT_OK(block_builder_->Append(sb->ReadInt32())); 37 | return offset_builder_->Append(sb->ReadInt16()); 38 | } 39 | 40 | PgSnapshotBuilder::PgSnapshotBuilder(const SqlTypeInfo&, const UserOptions&) { 41 | static const auto& type = arrow::struct_( 42 | {arrow::field("xmin", arrow::int64()), arrow::field("xmax", arrow::int64()), 43 | arrow::field("xip", arrow::list(arrow::int64()))}); 44 | 45 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 46 | ptr_ = reinterpret_cast(arrow_builder_.get()); 47 | xmin_builder_ = reinterpret_cast(ptr_->child(0)); 48 | xmax_builder_ = 
reinterpret_cast(ptr_->child(1)); 49 | xip_builder_ = reinterpret_cast(ptr_->child(2)); 50 | value_builder_ = reinterpret_cast(xip_builder_->value_builder()); 51 | } 52 | 53 | arrow::Status PgSnapshotBuilder::Append(StreamBuffer* sb) { 54 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 55 | ARROW_RETURN_NOT_OK(ptr_->Append()); 56 | ARROW_RETURN_NOT_OK(xip_builder_->Append()); 57 | 58 | int32_t nxip = sb->ReadInt32(); 59 | ARROW_RETURN_NOT_OK(xmin_builder_->Append(sb->ReadInt64())); 60 | ARROW_RETURN_NOT_OK(xmax_builder_->Append(sb->ReadInt64())); 61 | for (int32_t i = 0; i < nxip; i++) { 62 | ARROW_RETURN_NOT_OK(value_builder_->Append(sb->ReadInt64())); 63 | } 64 | return arrow::Status::OK(); 65 | } 66 | 67 | } // namespace pgeon 68 | -------------------------------------------------------------------------------- /src/pgeon/builder/misc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include "pgeon/builder/base.h" 6 | 7 | namespace pgeon { 8 | 9 | class NullBuilder : public ArrayBuilder { 10 | private: 11 | arrow::NullBuilder* ptr_; 12 | 13 | public: 14 | NullBuilder(const SqlTypeInfo&, const UserOptions&); 15 | arrow::Status Append(StreamBuffer*); 16 | }; 17 | 18 | class TidBuilder : public ArrayBuilder { 19 | private: 20 | arrow::StructBuilder* ptr_; 21 | arrow::Int32Builder* block_builder_; 22 | arrow::Int16Builder* offset_builder_; 23 | 24 | public: 25 | TidBuilder(const SqlTypeInfo&, const UserOptions&); 26 | arrow::Status Append(StreamBuffer*); 27 | }; 28 | 29 | class PgSnapshotBuilder : public ArrayBuilder { 30 | private: 31 | arrow::StructBuilder* ptr_; 32 | arrow::Int64Builder* xmin_builder_; 33 | arrow::Int64Builder* xmax_builder_; 34 | arrow::ListBuilder* xip_builder_; 35 | arrow::Int64Builder* value_builder_; 36 | 37 | public: 38 | PgSnapshotBuilder(const SqlTypeInfo&, const UserOptions&); 39 | arrow::Status Append(StreamBuffer*); 40 | }; 41 | 42 | } // namespace 
pgeon 43 | -------------------------------------------------------------------------------- /src/pgeon/builder/nested.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | #include "pgeon/builder/nested.h" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "pgeon/builder/common.h" 9 | 10 | namespace pgeon { 11 | 12 | ListBuilder::ListBuilder(const SqlTypeInfo& info, const UserOptions&) 13 | : value_builder_(info.value_builder) { 14 | arrow_builder_ = std::make_unique( 15 | arrow::default_memory_pool(), std::move(value_builder_->arrow_builder_)); 16 | ptr_ = reinterpret_cast(arrow_builder_.get()); 17 | } 18 | 19 | arrow::Status ListBuilder::Append(StreamBuffer* sb) { 20 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 21 | int32_t ndim = sb->ReadInt32(); 22 | int32_t hasnull = sb->ReadInt32(); 23 | int32_t element_type = sb->ReadInt32(); 24 | 25 | // Element will be flattened 26 | int32_t nitems = 1; 27 | for (int32_t i = 0; i < ndim; i++) { 28 | int32_t dim = sb->ReadInt32(); 29 | int32_t lb = sb->ReadInt32(); 30 | nitems *= dim; 31 | } 32 | 33 | ARROW_RETURN_NOT_OK(ptr_->Append()); 34 | for (int32_t i = 0; i < nitems; i++) { 35 | ARROW_RETURN_NOT_OK(value_builder_->Append(sb)); 36 | } 37 | return arrow::Status::OK(); 38 | } 39 | 40 | StructBuilder::StructBuilder(const SqlTypeInfo& info, const UserOptions&) { 41 | FieldVector fields = info.field_builders; 42 | ncolumns_ = fields.size(); 43 | 44 | std::vector> builders(ncolumns_); 45 | arrow::FieldVector fv(ncolumns_); 46 | for (size_t i = 0; i < ncolumns_; i++) { 47 | builders_.push_back(fields[i].second); 48 | builders[i] = std::move(fields[i].second->arrow_builder_); 49 | fv[i] = arrow::field(fields[i].first, builders[i]->type()); 50 | } 51 | 52 | arrow_builder_ = std::make_unique( 53 | arrow::struct_(fv), arrow::default_memory_pool(), builders); 54 | ptr_ = reinterpret_cast(arrow_builder_.get()); 55 | } 56 | 57 | arrow::Status 
StructBuilder::Append(StreamBuffer* sb) { 58 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 59 | ARROW_RETURN_NOT_OK(ptr_->Append()); 60 | int32_t validcols = sb->ReadInt32(); 61 | assert(validcols == ncolumns_); 62 | for (size_t i = 0; i < ncolumns_; i++) { 63 | int32_t column_type = sb->ReadInt32(); 64 | ARROW_RETURN_NOT_OK(builders_[i]->Append(sb)); 65 | } 66 | return arrow::Status::OK(); 67 | } 68 | 69 | } // namespace pgeon 70 | -------------------------------------------------------------------------------- /src/pgeon/builder/nested.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | 8 | #include "pgeon/builder/base.h" 9 | 10 | namespace pgeon { 11 | 12 | class ListBuilder : public ArrayBuilder { 13 | private: 14 | std::shared_ptr value_builder_; 15 | arrow::ListBuilder* ptr_; 16 | 17 | public: 18 | ListBuilder(const SqlTypeInfo&, const UserOptions&); 19 | arrow::Status Append(StreamBuffer*); 20 | }; 21 | 22 | class StructBuilder : public ArrayBuilder { 23 | private: 24 | std::vector> builders_; 25 | arrow::StructBuilder* ptr_; 26 | size_t ncolumns_; 27 | 28 | public: 29 | StructBuilder(const SqlTypeInfo&, const UserOptions&); 30 | arrow::Status Append(StreamBuffer*); 31 | }; 32 | 33 | } // namespace pgeon 34 | -------------------------------------------------------------------------------- /src/pgeon/builder/network.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/network.h" 4 | 5 | #include "pgeon/builder/common.h" 6 | 7 | namespace pgeon { 8 | 9 | InetBuilder::InetBuilder(const SqlTypeInfo&, const UserOptions&) { 10 | static const auto& type = arrow::struct_({ 11 | arrow::field("family", arrow::uint8()), 12 | arrow::field("bits", arrow::uint8()), 13 | arrow::field("is_cidr", arrow::boolean()), 14 | arrow::field("ipaddr", arrow::binary()), 15 | }); 16 | 
17 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 18 | ptr_ = reinterpret_cast(arrow_builder_.get()); 19 | family_builder_ = reinterpret_cast(ptr_->child(0)); 20 | bits_builder_ = reinterpret_cast(ptr_->child(1)); 21 | is_cidr_builder_ = reinterpret_cast(ptr_->child(2)); 22 | ipaddr_builder_ = reinterpret_cast(ptr_->child(3)); 23 | } 24 | 25 | arrow::Status InetBuilder::Append(StreamBuffer* sb) { 26 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 27 | ARROW_RETURN_NOT_OK(ptr_->Append()); 28 | ARROW_RETURN_NOT_OK(family_builder_->Append(sb->ReadUInt8())); 29 | ARROW_RETURN_NOT_OK(bits_builder_->Append(sb->ReadUInt8())); 30 | ARROW_RETURN_NOT_OK(is_cidr_builder_->Append(sb->ReadUInt8() != 0)); 31 | int8_t size = sb->ReadUInt8(); 32 | if (size > -1) { 33 | ARROW_RETURN_NOT_OK(ipaddr_builder_->Append(sb->ReadBinary(size), size)); 34 | } 35 | return arrow::Status::OK(); 36 | } 37 | 38 | } // namespace pgeon 39 | -------------------------------------------------------------------------------- /src/pgeon/builder/network.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include "pgeon/builder/base.h" 6 | 7 | namespace pgeon { 8 | 9 | class InetBuilder : public ArrayBuilder { 10 | private: 11 | arrow::StructBuilder* ptr_; 12 | arrow::UInt8Builder* family_builder_; 13 | arrow::UInt8Builder* bits_builder_; 14 | arrow::BooleanBuilder* is_cidr_builder_; 15 | arrow::BinaryBuilder* ipaddr_builder_; 16 | 17 | public: 18 | InetBuilder(const SqlTypeInfo&, const UserOptions&); 19 | arrow::Status Append(StreamBuffer*); 20 | }; 21 | 22 | } // namespace pgeon 23 | -------------------------------------------------------------------------------- /src/pgeon/builder/numeric.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/numeric.h" 4 | 5 | #include 6 | 7 | #include 
"pgeon/builder/common.h" 8 | 9 | namespace pgeon { 10 | 11 | #define NUMERIC_SIGN_MASK 0xC000 12 | #define NUMERIC_POS 0x0000 13 | #define NUMERIC_NEG 0x4000 14 | #define NUMERIC_NAN 0xC000 15 | 16 | #define NBASE 10000 17 | #define HALF_NBASE 5000 18 | #define DEC_DIGITS 4 /* decimal digits per NBASE digit */ 19 | #define MUL_GUARD_DIGITS 2 /* these are measured in NBASE digits */ 20 | #define DIV_GUARD_DIGITS 4 21 | 22 | struct _NumericHelper { 23 | int16_t ndigits; 24 | int16_t weight; /* weight of first digit */ 25 | int16_t sign; /* NUMERIC_(POS|NEG|NAN) */ 26 | int16_t dscale; /* display scale */ 27 | int16_t digits[]; 28 | }; 29 | 30 | #define VARHDRSZ ((int32_t)sizeof(int32_t)) 31 | 32 | NumericBuilder::NumericBuilder(const SqlTypeInfo& info, const UserOptions& options) { 33 | precision_ = ((info.typmod - VARHDRSZ) >> 16) & 0xffff; 34 | scale_ = (((info.typmod - VARHDRSZ) & 0x7ff) ^ 1024) - 1024; 35 | 36 | // undefined precision decimals 37 | if (precision_ == 0xffff) { 38 | precision_ = options.default_numeric_precision; 39 | scale_ = options.default_numeric_scale; 40 | } 41 | 42 | arrow_builder_ = 43 | std::make_unique(arrow::decimal128(precision_, scale_)); 44 | ptr_ = reinterpret_cast(arrow_builder_.get()); 45 | } 46 | 47 | arrow::Status NumericBuilder::Append(StreamBuffer* sb) { 48 | int32_t len = sb->ReadInt32(); 49 | if (len == -1) return ptr_->AppendNull(); 50 | 51 | const char* buf = sb->ReadBinary(len); 52 | auto rawdata = reinterpret_cast(buf); 53 | 54 | int16_t ndigits = ntohs(rawdata->ndigits); 55 | int16_t weight = ntohs(rawdata->weight); 56 | int16_t sign = ntohs(rawdata->sign); 57 | int16_t scale = scale_; // ntoh16(rawdata->dscale); 58 | 59 | arrow::Decimal128 value = 0; 60 | int16_t d, dig; 61 | 62 | if ((sign & NUMERIC_SIGN_MASK) == NUMERIC_NAN) { 63 | return ptr_->AppendNull(); 64 | } 65 | 66 | /* makes integer portion first */ 67 | for (d = 0; d <= weight; d++) { 68 | dig = (d < ndigits) ? 
ntohs(rawdata->digits[d]) : 0; 69 | if (dig < 0 || dig >= NBASE) 70 | return arrow::Status::IOError("[numeric] digit is out of range"); 71 | value = NBASE * value + dig; 72 | } 73 | 74 | /* makes floating point portion if any */ 75 | while (scale > 0) { 76 | dig = (d >= 0 && d < ndigits) ? ntohs(rawdata->digits[d]) : 0; 77 | if (dig < 0 || dig >= NBASE) 78 | return arrow::Status::IOError("[numeric] digit is out of range"); 79 | 80 | if (scale >= DEC_DIGITS) 81 | value = NBASE * value + dig; 82 | else if (scale == 3) 83 | value = 1000L * value + dig / 10L; 84 | else if (scale == 2) 85 | value = 100L * value + dig / 100L; 86 | else if (scale == 1) 87 | value = 10L * value + dig / 1000L; 88 | else 89 | return arrow::Status::IOError("[numeric] Unexpected error while parsing"); 90 | scale -= DEC_DIGITS; 91 | d++; 92 | } 93 | /* is it a negative value? */ 94 | if ((sign & NUMERIC_NEG) != 0) value = -value; 95 | 96 | return ptr_->Append(value); 97 | } 98 | 99 | MonetaryBuilder::MonetaryBuilder(const SqlTypeInfo&, const UserOptions& options) { 100 | precision_ = options.default_numeric_precision; 101 | scale_ = options.monetary_fractional_precision; 102 | 103 | arrow_builder_ = 104 | std::make_unique(arrow::decimal128(precision_, scale_)); 105 | ptr_ = reinterpret_cast(arrow_builder_.get()); 106 | } 107 | 108 | arrow::Status MonetaryBuilder::Append(StreamBuffer* sb) { 109 | int32_t len = sb->ReadInt32(); 110 | if (len == -1) { 111 | return ptr_->AppendNull(); 112 | } 113 | 114 | arrow::Decimal128 value = sb->ReadInt64(); 115 | return ptr_->Append(value); 116 | } 117 | 118 | } // namespace pgeon 119 | -------------------------------------------------------------------------------- /src/pgeon/builder/numeric.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include "pgeon/builder/base.h" 6 | 7 | namespace pgeon { 8 | 9 | class NumericBuilder : public ArrayBuilder { 10 | private: 11 
| arrow::Decimal128Builder* ptr_; 12 | int precision_; 13 | int scale_; 14 | 15 | public: 16 | NumericBuilder(const SqlTypeInfo&, const UserOptions&); 17 | arrow::Status Append(StreamBuffer*); 18 | }; 19 | 20 | class MonetaryBuilder : public ArrayBuilder { 21 | private: 22 | arrow::Decimal128Builder* ptr_; 23 | int precision_; 24 | int scale_; 25 | 26 | public: 27 | MonetaryBuilder(const SqlTypeInfo&, const UserOptions&); 28 | arrow::Status Append(StreamBuffer*); 29 | }; 30 | 31 | } // namespace pgeon 32 | -------------------------------------------------------------------------------- /src/pgeon/builder/stringlike.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/stringlike.h" 4 | 5 | #include 6 | 7 | #include "pgeon/builder/common.h" 8 | 9 | namespace pgeon { 10 | 11 | BinaryBuilder::BinaryBuilder(const SqlTypeInfo& info, const UserOptions&) { 12 | auto type = arrow::binary(); 13 | if (info.typlen > -1) type = arrow::fixed_size_binary(info.typlen); 14 | 15 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 16 | 17 | ptr_ = reinterpret_cast(arrow_builder_.get()); 18 | binary_ptr_ = nullptr; 19 | fixed_size_binary_ptr_ = nullptr; 20 | 21 | if (info.typlen > -1) { 22 | fixed_size_binary_ptr_ = 23 | reinterpret_cast(arrow_builder_.get()); 24 | } else { 25 | binary_ptr_ = reinterpret_cast(arrow_builder_.get()); 26 | } 27 | } 28 | 29 | arrow::Status BinaryBuilder::Append(StreamBuffer* sb) { 30 | int32_t len = sb->ReadInt32(); 31 | if (len == -1) return ptr_->AppendNull(); 32 | 33 | auto value = sb->ReadBinary(len); 34 | return binary_ptr_ != nullptr ? 
binary_ptr_->Append(value, len) 35 | : fixed_size_binary_ptr_->Append(value); 36 | } 37 | 38 | JsonbBuilder::JsonbBuilder(const SqlTypeInfo&, const UserOptions&) { 39 | arrow_builder_ = std::make_unique(); 40 | ptr_ = reinterpret_cast(arrow_builder_.get()); 41 | } 42 | 43 | arrow::Status JsonbBuilder::Append(StreamBuffer* sb) { 44 | int32_t len = sb->ReadInt32(); 45 | if (len == -1) return ptr_->AppendNull(); 46 | 47 | const char* buf = sb->ReadBinary(len); 48 | // First byte is format number 49 | return ptr_->Append(buf + 1, len - 1); 50 | } 51 | 52 | HstoreBuilder::HstoreBuilder(const SqlTypeInfo&, const UserOptions&) { 53 | auto status = 54 | arrow::MakeBuilder(arrow::default_memory_pool(), 55 | arrow::map(arrow::utf8(), arrow::utf8()), &arrow_builder_); 56 | 57 | ptr_ = reinterpret_cast(arrow_builder_.get()); 58 | key_builder_ = reinterpret_cast(ptr_->key_builder()); 59 | item_builder_ = reinterpret_cast(ptr_->item_builder()); 60 | } 61 | 62 | arrow::Status HstoreBuilder::Append(StreamBuffer* sb) { 63 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 64 | ARROW_RETURN_NOT_OK(ptr_->Append()); 65 | 66 | int32_t pcount = sb->ReadInt32(); 67 | int32_t flen; 68 | for (int32_t i = 0; i < pcount; i++) { 69 | flen = sb->ReadInt32(); 70 | ARROW_RETURN_NOT_OK(key_builder_->Append(sb->ReadBinary(flen), flen)); 71 | 72 | flen = sb->ReadInt32(); 73 | if (flen > -1) { 74 | ARROW_RETURN_NOT_OK(item_builder_->Append(sb->ReadBinary(flen), flen)); 75 | } else { 76 | ARROW_RETURN_NOT_OK(item_builder_->AppendNull()); 77 | } 78 | } 79 | return arrow::Status::OK(); 80 | } 81 | 82 | } // namespace pgeon 83 | -------------------------------------------------------------------------------- /src/pgeon/builder/stringlike.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include "pgeon/builder/base.h" 6 | 7 | namespace pgeon { 8 | 9 | class BinaryBuilder : public ArrayBuilder { 10 | private: 11 | 
arrow::ArrayBuilder* ptr_; 12 | arrow::BinaryBuilder* binary_ptr_; 13 | arrow::FixedSizeBinaryBuilder* fixed_size_binary_ptr_; 14 | 15 | public: 16 | BinaryBuilder(const SqlTypeInfo&, const UserOptions&); 17 | arrow::Status Append(StreamBuffer*); 18 | }; 19 | 20 | class JsonbBuilder : public ArrayBuilder { 21 | private: 22 | arrow::StringBuilder* ptr_; 23 | 24 | public: 25 | JsonbBuilder(const SqlTypeInfo&, const UserOptions&); 26 | arrow::Status Append(StreamBuffer*); 27 | }; 28 | 29 | class HstoreBuilder : public ArrayBuilder { 30 | private: 31 | arrow::MapBuilder* ptr_; 32 | arrow::StringBuilder* key_builder_; 33 | arrow::StringBuilder* item_builder_; 34 | 35 | public: 36 | HstoreBuilder(const SqlTypeInfo&, const UserOptions&); 37 | arrow::Status Append(StreamBuffer*); 38 | }; 39 | 40 | } // namespace pgeon 41 | -------------------------------------------------------------------------------- /src/pgeon/builder/text_search.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/builder/text_search.h" 4 | 5 | #include "pgeon/builder/common.h" 6 | 7 | namespace pgeon { 8 | 9 | TsVectorBuilder::TsVectorBuilder(const SqlTypeInfo&, const UserOptions&) { 10 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), 11 | arrow::map(arrow::utf8(), arrow::list(arrow::int32())), 12 | &arrow_builder_); 13 | 14 | ptr_ = reinterpret_cast(arrow_builder_.get()); 15 | key_builder_ = reinterpret_cast(ptr_->key_builder()); 16 | item_builder_ = reinterpret_cast(ptr_->item_builder()); 17 | value_builder_ = reinterpret_cast(item_builder_->value_builder()); 18 | } 19 | 20 | arrow::Status TsVectorBuilder::Append(StreamBuffer* sb) { 21 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 22 | ARROW_RETURN_NOT_OK(ptr_->Append()); 23 | 24 | int32_t size = sb->ReadInt32(); 25 | int16_t npos; 26 | for (int32_t i = 0; i < size; i++) { 27 | const char* buf = sb->ReadBinary(1); 28 | const char* start_buf = buf; 29 | 
int16_t flen = 0; 30 | while (*buf != '\0') { 31 | flen += 1; 32 | buf = sb->ReadBinary(1); 33 | } 34 | 35 | ARROW_RETURN_NOT_OK(key_builder_->Append(start_buf, flen)); 36 | 37 | ARROW_RETURN_NOT_OK(item_builder_->Append()); 38 | npos = sb->ReadInt16(); 39 | for (int16_t j = 0; j < npos; j++) { 40 | ARROW_RETURN_NOT_OK(value_builder_->Append(sb->ReadInt16())); 41 | } 42 | } 43 | return arrow::Status::OK(); 44 | } 45 | 46 | #define QI_VAL 1 47 | #define QI_OPR 2 48 | #define QI_VALSTOP 3 49 | 50 | #define OP_NOT 1 51 | #define OP_AND 2 52 | #define OP_OR 3 53 | #define OP_PHRASE 4 /* highest code, tsquery_cleanup.c */ 54 | #define OP_COUNT 55 | 56 | TsQueryBuilder::TsQueryBuilder(const SqlTypeInfo&, const UserOptions&) { 57 | static const auto& type = arrow::list(arrow::struct_({ 58 | arrow::field("type", arrow::int8()), 59 | arrow::field("weight", arrow::int8()), 60 | arrow::field("prefix", arrow::int8()), 61 | arrow::field("operand", arrow::utf8()), 62 | arrow::field("oper", arrow::int8()), 63 | arrow::field("distance", arrow::int16()), 64 | })); 65 | 66 | auto status = arrow::MakeBuilder(arrow::default_memory_pool(), type, &arrow_builder_); 67 | 68 | ptr_ = reinterpret_cast(arrow_builder_.get()); 69 | value_builder_ = reinterpret_cast(ptr_->value_builder()); 70 | type_builder_ = reinterpret_cast(value_builder_->child(0)); 71 | weight_builder_ = reinterpret_cast(value_builder_->child(1)); 72 | prefix_builder_ = reinterpret_cast(value_builder_->child(2)); 73 | operand_builder_ = reinterpret_cast(value_builder_->child(3)); 74 | oper_builder_ = reinterpret_cast(value_builder_->child(4)); 75 | distance_builder_ = reinterpret_cast(value_builder_->child(5)); 76 | } 77 | 78 | arrow::Status TsQueryBuilder::Append(StreamBuffer* sb) { 79 | APPEND_AND_RETURN_IF_EMPTY(sb, ptr_); 80 | ARROW_RETURN_NOT_OK(ptr_->Append()); 81 | 82 | int32_t size = sb->ReadInt32(); 83 | int16_t npos; 84 | for (int32_t i = 0; i < size; i++) { 85 | ARROW_RETURN_NOT_OK(value_builder_->Append()); 86 
| int8_t type = sb->ReadUInt8(); 87 | switch (type) { 88 | case QI_VAL: { 89 | int8_t weight = sb->ReadUInt8(); 90 | int8_t prefix = sb->ReadUInt8(); 91 | 92 | const char* buf = sb->ReadBinary(1); 93 | const char* start_buf = buf; 94 | int16_t flen = 0; 95 | while (*buf != '\0') { 96 | flen += 1; 97 | buf = sb->ReadBinary(1); 98 | } 99 | 100 | ARROW_RETURN_NOT_OK(type_builder_->Append(type)); 101 | ARROW_RETURN_NOT_OK(weight_builder_->Append(weight)); 102 | ARROW_RETURN_NOT_OK(prefix_builder_->Append(prefix)); 103 | ARROW_RETURN_NOT_OK(operand_builder_->Append(start_buf, flen)); // TODO(xav) 104 | ARROW_RETURN_NOT_OK(oper_builder_->AppendNull()); 105 | ARROW_RETURN_NOT_OK(distance_builder_->AppendNull()); 106 | 107 | buf += flen + 1; 108 | } break; 109 | 110 | case QI_OPR: { 111 | int8_t oper = sb->ReadUInt8(); 112 | 113 | ARROW_RETURN_NOT_OK(type_builder_->Append(type)); 114 | ARROW_RETURN_NOT_OK(weight_builder_->AppendNull()); 115 | ARROW_RETURN_NOT_OK(prefix_builder_->AppendNull()); 116 | ARROW_RETURN_NOT_OK(operand_builder_->AppendNull()); 117 | ARROW_RETURN_NOT_OK(oper_builder_->Append(oper)); 118 | 119 | if (oper == OP_PHRASE) { 120 | int16_t distance = sb->ReadInt16(); 121 | ARROW_RETURN_NOT_OK(distance_builder_->Append(distance)); 122 | } else { 123 | ARROW_RETURN_NOT_OK(distance_builder_->AppendNull()); 124 | } 125 | } break; 126 | 127 | default: { 128 | ARROW_RETURN_NOT_OK(type_builder_->Append(type)); 129 | ARROW_RETURN_NOT_OK(weight_builder_->AppendNull()); 130 | ARROW_RETURN_NOT_OK(prefix_builder_->AppendNull()); 131 | ARROW_RETURN_NOT_OK(operand_builder_->AppendNull()); 132 | ARROW_RETURN_NOT_OK(oper_builder_->AppendNull()); 133 | ARROW_RETURN_NOT_OK(distance_builder_->AppendNull()); 134 | } break; 135 | } 136 | } 137 | return arrow::Status::OK(); 138 | } 139 | 140 | } // namespace pgeon 141 | -------------------------------------------------------------------------------- /src/pgeon/builder/text_search.h: 
// Builds an Arrow list-of-struct column from tsquery values; each struct in
// the list describes one item of the query.
class TsQueryBuilder : public ArrayBuilder {
 private:
  // Outer list builder (one list per tsquery value).
  arrow::ListBuilder* ptr_;
  // Struct builder for the list items; the builders below are its children,
  // in field order: type, weight, prefix, operand, oper, distance.
  arrow::StructBuilder* value_builder_;
  // Item type tag as read from the stream.
  arrow::Int8Builder* type_builder_;
  // Operand weight; null for non-operand items.
  arrow::Int8Builder* weight_builder_;
  // Operand prefix byte; null for non-operand items.
  arrow::Int8Builder* prefix_builder_;
  // Operand text; null for non-operand items.
  arrow::StringBuilder* operand_builder_;
  // Operator code; null for non-operator items.
  arrow::Int8Builder* oper_builder_;
  // Distance, only set for the phrase operator; null otherwise.
  arrow::Int16Builder* distance_builder_;

 public:
  // Creates the list<struct> builder and caches pointers to its children.
  TsQueryBuilder(const SqlTypeInfo&, const UserOptions&);
  // Appends one tsquery value read from the binary stream.
  arrow::Status Append(StreamBuffer*);
};
29 | ColumnVector fields(n); 30 | 31 | for (int i = 0; i < n; i++) { 32 | const char* name = PQfname(res, i); 33 | Oid oid = PQftype(res, i); 34 | int mod = PQfmod(res, i); 35 | fields[i] = {name, oid, mod}; 36 | } 37 | 38 | PQclear(res); 39 | return std::make_shared(fields); 40 | } 41 | 42 | arrow::Result> RecordTypeInfo(PGconn* conn, Oid oid) { 43 | auto query = arrow::util::StringBuilder( 44 | "SELECT attnum, attname, atttypid, atttypmod ", 45 | "FROM pg_catalog.pg_attribute a, pg_catalog.pg_type t, pg_catalog.pg_namespace n ", 46 | "WHERE t.typnamespace = n.oid AND a.atttypid = t.oid AND a.attrelid = ", 47 | std::to_string(oid), ";"); 48 | 49 | auto res = PQexec(conn, query.c_str()); 50 | if (PQresultStatus(res) != PGRES_TUPLES_OK) { 51 | auto status = arrow::Status::IOError("[libpq] ", PQresultErrorMessage(res)); 52 | PQclear(res); 53 | return status; 54 | } 55 | 56 | int nfields = PQntuples(res); 57 | std::vector> fields(nfields); 58 | 59 | for (int i = 0; i < nfields; i++) { 60 | int attnum = atoi(PQgetvalue(res, i, 0)); 61 | const char* attname = PQgetvalue(res, i, 1); 62 | Oid atttypid = atooid(PQgetvalue(res, i, 2)); 63 | int atttypmod = atoi(PQgetvalue(res, i, 3)); 64 | 65 | fields[attnum - 1] = {attname, atttypid, atttypmod}; 66 | } 67 | 68 | PQclear(res); 69 | return std::make_shared(fields); 70 | } 71 | 72 | arrow::Result> MakeColumnBuilder( 73 | PGconn* conn, Oid oid, int mod, const UserOptions& options) { 74 | auto query = arrow::util::StringBuilder( 75 | "SELECT typreceive, typelem, typrelid, typlen ", 76 | "FROM pg_catalog.pg_type t, pg_catalog.pg_namespace n ", 77 | "WHERE t.typnamespace = n.oid AND t.oid = ", std::to_string(oid), ";"); 78 | 79 | auto res = PQexec(conn, query.c_str()); 80 | if (PQresultStatus(res) != PGRES_TUPLES_OK) { 81 | auto status = arrow::Status::IOError("[libpq] ", PQresultErrorMessage(res)); 82 | PQclear(res); 83 | return status; 84 | } 85 | 86 | if (PQntuples(res) == 0) { 87 | // this happens with attmissingval 
(anyarrayrecv) in pg_attribute 88 | PQclear(res); 89 | return MakeBuilder({.typreceive = "void_recv"}, options); 90 | } 91 | 92 | std::string typreceive = PQgetvalue(res, 0, 0); 93 | Oid typelem = atooid(PQgetvalue(res, 0, 1)); 94 | Oid typrelid = atooid(PQgetvalue(res, 0, 2)); 95 | int typlen = atoi(PQgetvalue(res, 0, 3)); 96 | PQclear(res); 97 | 98 | std::shared_ptr builder; 99 | SqlTypeInfo sql_info{.typreceive = typreceive, .typmod = mod, .typlen = typlen}; 100 | if (typreceive == "anyarray_recv" || typreceive == "anycompatiblearray_recv" || 101 | typreceive == "array_recv") { 102 | ARROW_ASSIGN_OR_RAISE(builder, MakeColumnBuilder(conn, typelem, mod, options)); 103 | sql_info.value_builder = builder; 104 | } else if (typreceive == "record_recv") { 105 | std::shared_ptr fields_info; 106 | ARROW_ASSIGN_OR_RAISE(fields_info, RecordTypeInfo(conn, typrelid)); 107 | 108 | FieldVector fields; 109 | for (size_t i = 0; i < fields_info->size(); i++) { 110 | auto [name, oid, mod] = (*fields_info)[i]; 111 | ARROW_ASSIGN_OR_RAISE(builder, MakeColumnBuilder(conn, oid, mod, options)); 112 | fields.push_back({name, builder}); 113 | } 114 | sql_info.field_builders = fields; 115 | } 116 | 117 | return MakeBuilder(sql_info, options); 118 | } 119 | 120 | arrow::Result> MakeTableBuilder( 121 | PGconn* conn, const char* query, const UserOptions& options) { 122 | std::shared_ptr columns; 123 | ARROW_ASSIGN_OR_RAISE(columns, ColumnTypesForQuery(conn, query)); 124 | 125 | FieldVector fields; 126 | std::shared_ptr builder; 127 | for (auto& [name, oid, mod] : *columns) { 128 | ARROW_ASSIGN_OR_RAISE(builder, MakeColumnBuilder(conn, oid, mod, options)); 129 | fields.push_back({name, builder}); 130 | } 131 | return std::make_shared(fields); 132 | } 133 | 134 | arrow::Status CopyQuery(PGconn* conn, const char* query, 135 | std::shared_ptr builder) { 136 | arrow::Status status = arrow::Status::OK(); 137 | auto copy_query = 138 | arrow::util::StringBuilder("COPY (", query, ") TO STDOUT (FORMAT 
binary);"); 139 | 140 | PGresult* res = PQexec(conn, copy_query.c_str()); 141 | if (PQresultStatus(res) != PGRES_COPY_OUT) { 142 | status = arrow::Status::IOError("[libpq] ", PQresultErrorMessage(res)); 143 | } 144 | PQclear(res); 145 | ARROW_RETURN_NOT_OK(status); 146 | 147 | // Attempts to obtain another row of data from the server during a COPY. 148 | // Data is always returned one data row at a time; if only a partial row 149 | // is available, it is not returned. Successful return of a data row involves 150 | // allocating a chunk of memory to hold the data. The buffer parameter must 151 | // be non-NULL. *buffer is set to point to the allocated memory, or to NULL 152 | // in cases where no buffer is returned. A non-NULL result buffer should be 153 | // freed using PQfreemem when no longer needed. 154 | 155 | // When a row is successfully returned, the return value is the number of 156 | // data bytes in the row (this will always be greater than zero). The returned 157 | // string is always null-terminated, though this is probably only useful for 158 | // textual COPY. A result of zero indicates that the COPY is still in progress, 159 | // but no row is yet available (this is only possible when async is true). 160 | // A result of -1 indicates that the COPY is done. A result of -2 indicates 161 | // that an error occurred (consult PQerrorMessage for the reason). 162 | 163 | // After PQgetCopyData returns -1, call PQgetResult to obtain the final result 164 | // status of the COPY command. One can wait for this result to be available 165 | // in the usual way. Then return to normal operation. 
166 | char* tuple = nullptr; 167 | auto tuple_size = PQgetCopyData(conn, &tuple, 0); 168 | StreamBuffer sb = StreamBuffer(tuple); 169 | 170 | if (tuple_size > 0) { 171 | const int kBinaryHeaderSize = 19; 172 | const char* header = sb.ReadBinary(kBinaryHeaderSize); 173 | } 174 | 175 | TableBuilder* builder_ = builder.get(); 176 | while (tuple_size > 0) { 177 | status = builder_->Append(&sb); 178 | if (tuple != nullptr) PQfreemem(tuple); 179 | ARROW_RETURN_NOT_OK(status); 180 | 181 | tuple_size = PQgetCopyData(conn, &tuple, 0); 182 | sb = StreamBuffer(tuple); 183 | } 184 | 185 | if (tuple != nullptr) PQfreemem(tuple); 186 | 187 | res = PQgetResult(conn); 188 | if (PQresultStatus(res) != PGRES_COMMAND_OK) { 189 | // not really an issue... 190 | // pg_attribute gives "ERROR: no binary output function available for type aclitem" 191 | status = arrow::Status::IOError("[libpq] ", PQresultErrorMessage(res)); 192 | } 193 | PQclear(res); 194 | return status; 195 | } 196 | 197 | } // namespace pgeon 198 | -------------------------------------------------------------------------------- /src/pgeon/pg_interface.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | #include 8 | 9 | #include "pgeon/table_builder.h" 10 | 11 | namespace pgeon { 12 | 13 | arrow::Result> MakeTableBuilder(PGconn*, const char*, 14 | const UserOptions&); 15 | 16 | arrow::Status CopyQuery(PGconn*, const char*, std::shared_ptr); 17 | 18 | } // namespace pgeon 19 | -------------------------------------------------------------------------------- /src/pgeon/table_builder.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #include "pgeon/table_builder.h" 4 | 5 | namespace pgeon { 6 | 7 | TableBuilder::TableBuilder(const FieldVector& fields) : fields_(fields) { 8 | arrow::FieldVector arrow_fields; 9 | for (auto& f : fields) { 10 | auto& 
[name, builder] = f; 11 | builders_.push_back(builder.get()); 12 | arrow_fields.push_back(arrow::field(name, builder->type())); 13 | } 14 | schema_ = arrow::schema(arrow_fields); 15 | } 16 | 17 | arrow::Status TableBuilder::Append(StreamBuffer* sb) { 18 | int16_t nfields = sb->ReadInt16(); 19 | if (nfields == -1) return arrow::Status::OK(); 20 | 21 | for (int16_t i = 0; i < nfields; i++) { 22 | ARROW_RETURN_NOT_OK(builders_[i]->Append(sb)); 23 | } 24 | return arrow::Status::OK(); 25 | } 26 | 27 | arrow::Result> TableBuilder::Flush() { 28 | std::vector> arrays(fields_.size()); 29 | std::shared_ptr array; 30 | for (size_t i = 0; i < fields_.size(); i++) { 31 | ARROW_ASSIGN_OR_RAISE(array, builders_[i]->Flush()); 32 | arrays[i] = array; 33 | } 34 | return arrow::Table::Make(schema_, arrays); 35 | } 36 | 37 | } // namespace pgeon 38 | -------------------------------------------------------------------------------- /src/pgeon/table_builder.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | 8 | #include "pgeon/builder/base.h" 9 | 10 | namespace pgeon { 11 | 12 | class TableBuilder { 13 | private: 14 | FieldVector fields_; 15 | std::vector builders_; 16 | std::shared_ptr schema_; 17 | 18 | public: 19 | explicit TableBuilder(const FieldVector&); 20 | 21 | arrow::Status Append(StreamBuffer*); 22 | 23 | arrow::Result> Flush(); 24 | }; 25 | 26 | } // namespace pgeon 27 | -------------------------------------------------------------------------------- /src/pgeon/util/streambuffer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022 nullptr 2 | 3 | #pragma once 4 | 5 | #ifdef _WIN32 6 | #include 7 | #else 8 | #include 9 | #endif 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #if defined(__linux__) 16 | #include 17 | #define ntohll(x) be64toh(x) 18 | #elif defined(__APPLE__) 19 | #include 20 
// Reads a 64-bit unsigned integer stored in network (big-endian) byte order
// from the first 8 bytes of `buf`.
//
// The value is assembled byte by byte, so the result does not depend on the
// host byte order or on a platform-specific `ntohll`. The return type is
// unsigned, matching the function name and the 16/32-bit siblings; callers
// that want a signed value convert explicitly.
static inline uint64_t LoadNetworkUInt64(const char* buf) {
  uint64_t v = 0;
  for (size_t i = 0; i < sizeof(uint64_t); i++) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    v = (v << 8) | static_cast<uint64_t>(static_cast<unsigned char>(buf[i]));
  }
  return v;
}
= ReadBinary(4); 94 | return LoadNetworkInt32(buf); 95 | } 96 | 97 | inline int64_t ReadInt64() { 98 | const char* buf = ReadBinary(8); 99 | return LoadNetworkInt64(buf); 100 | } 101 | 102 | inline float ReadFloat32() { 103 | const char* buf = ReadBinary(4); 104 | return LoadNetworkFloat32(buf); 105 | } 106 | 107 | inline double ReadFloat64() { 108 | const char* buf = ReadBinary(8); 109 | return LoadNetworkFloat64(buf); 110 | } 111 | }; 112 | 113 | } // namespace pgeon 114 | -------------------------------------------------------------------------------- /tests/_todo.py: -------------------------------------------------------------------------------- 1 | # Bit string types 2 | # TODO which interface should it be ? 3 | query = "SELECT B'101'::bit(3)" 4 | query = "SELECT B'10'::bit varying(5)" 5 | 6 | # Text search types 7 | # TODO check 8 | query = "SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector" 9 | query = "SELECT $$the lexeme ' ' contains spaces$$::tsvector" 10 | query = "SELECT $$the lexeme 'Joe''s' contains a quote$$::tsvector" 11 | query = "SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector" 12 | query = "SELECT 'a:1A fat:2B,4C cat:5D'::tsvector" # not supported 13 | query = "SELECT 'The Fat Rats'::tsvector" 14 | query = "SELECT to_tsvector('english', 'The Fat Rats')" 15 | query = "SELECT 'fat & rat'::tsquery" 16 | query = "SELECT 'fat & (rat | cat)'::tsquery" 17 | query = "SELECT 'fat & rat & ! 
cat'::tsquery" 18 | query = "SELECT 'fat:ab & cat'::tsquery" 19 | query = "SELECT 'super:*'::tsquery" 20 | query = "SELECT to_tsquery('Fat:ab & Cats')" 21 | query = "SELECT to_tsvector( 'postgraduate' ) @@ to_tsquery( 'postgres:*' )" 22 | query = "SELECT to_tsvector( 'postgraduate' ), to_tsquery( 'postgres:*' )" 23 | 24 | # XML type 25 | query = """SELECT * FROM XMLPARSE (DOCUMENT 'Manual...')""" 26 | query = "SELECT * FROM XMLPARSE (CONTENT 'abcbarfoo')" 27 | 28 | # JSON types 29 | query = "SELECT * from json_table" 30 | query = "SELECT (a_jsonb->>'a')::int from json_table" 31 | query = "SELECT '$.key'::jsonpath" 32 | 33 | # Arrays 34 | query = "SELECT * from sal_emp" 35 | 36 | # Composite types 37 | # arrays are flattened 38 | query = "SELECT * from on_hand" 39 | 40 | # TODO Range types 41 | query = "SELECT int4range(10, 20)" 42 | query = "SELECT '{[3,7), [8,9)}'::int4multirange" 43 | query = "SELECT int8range(10, 20)" 44 | query = "SELECT numrange(11.1, 22.2)" 45 | query = "SELECT nummultirange(numrange(1.0, 14.0), numrange(20.0, 25.0))" 46 | # tsrange, tstzrange, daterange 47 | 48 | # TODO domain types 49 | 50 | # Object identifier types 51 | query = "SELECT * FROM pg_attribute WHERE attrelid = 'on_hand'::regclass" 52 | 53 | # TODO `pg_lsn` type 54 | query = "SELECT pg_current_snapshot()" 55 | 56 | # pseudo types 57 | query = "SELECT 'NULL'::void" 58 | 59 | # Hstore extension 60 | query = "SELECT 'a=>1,b=>2'::hstore" 61 | query = "SELECT 'a=>1,a=>2'::hstore" 62 | query = """SELECT 'a=>1,b=>""'::hstore""" 63 | query = """SELECT 'a=>1,b=>NULL'::hstore""" 64 | 65 | # does not work, apparently no schema 66 | query = "SELECT ROW('a', 1)" 67 | 68 | # dictionary encoded control ? 69 | # fixed size handling => templates ? 
@pytest.fixture(scope="session")
def dsn():
    """Yield the connection string of the test database.

    The value of the ``PGEON_TEST_DB`` environment variable is used when it
    is set; otherwise a local default is used.
    """
    default_dsn = "postgresql://localhost:5432/postgres"
    yield os.getenv("PGEON_TEST_DB", default_dsn)
pa.table({"text": pa.array([r"\xDEADBEEF"])})), 49 | ( 50 | r"SELECT 'abc \153\154\155 \052\251\124'::bytea", 51 | pa.table({"bytea": pa.array([b"abc \153\154\155 \052\251\124"])}), 52 | ), 53 | ( 54 | "SELECT 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'::uuid", 55 | pa.table( 56 | { 57 | "uuid": pa.array( 58 | [ 59 | b"\xA0\xEE\xBC\x99\x9C\x0B\x4E\xF8\xBB\x6D\x6B\xB9\xBD\x38\x0A\x11" 60 | ], 61 | pa.binary(16), 62 | ) 63 | } 64 | ), 65 | ), 66 | ( 67 | "SELECT TIMESTAMP '2001-01-01 14:00:00'", 68 | pa.table( 69 | {"timestamp": pa.array(["2001-01-01 14:00:00"]).cast(pa.timestamp("us"))} 70 | ), 71 | ), 72 | ( 73 | "SELECT TIMESTAMP WITH TIME ZONE '2001-01-01 14:00:00+02:00'", 74 | pa.table( 75 | { 76 | "timestamptz": pa.array(["2001-01-01 14:00:00+02:00"]).cast( 77 | pa.timestamp("us", "UTC") 78 | ) 79 | } 80 | ), 81 | ), 82 | ( 83 | "SELECT '1999-01-08'::date", 84 | pa.table( 85 | {"date": pa.array(["1999-01-08"]).cast(pa.timestamp("s")).cast(pa.date32())} 86 | ), 87 | ), 88 | ( 89 | "SELECT TIME '14:00:00'", 90 | pa.table({"time": pa.array([50400000000]).cast(pa.time64("us"))}), 91 | ), 92 | ( 93 | "SELECT TIME WITH TIME ZONE '14:00:00+02:00'", 94 | pa.table({"timetz": pa.array([43200000000]).cast(pa.time64("us"))}), 95 | ), 96 | ( 97 | "SELECT '1 year 2 months 3 days 4 hours 5 minutes 6 seconds'::interval", 98 | pa.table({"interval": pa.array([pa.MonthDayNano([14, 3, 14706000000000])])}), 99 | ), 100 | ( 101 | "SELECT '(1.2, 4.3)'::point", 102 | pa.table({"point": pa.array([{"x": 1.2, "y": 4.3}])}), 103 | ), 104 | ( 105 | "SELECT '{1.2, 4.3, 0.0}'::line", 106 | pa.table({"line": pa.array([{"A": 1.2, "B": 4.3, "C": 0.0}])}), 107 | ), 108 | ( 109 | "SELECT '((1.2, 4.3), (5.6, 7.8))'::lseg", 110 | pa.table( 111 | { 112 | "lseg": pa.array( 113 | [{"x1": 1.2, "y1": 4.3, "x2": 5.6, "y2": 7.8}], 114 | pa.struct( 115 | [(k, pa.float64()) for k in ("x1", "y1", "x2", "y2")] 116 | ), # needed to maintain field order 117 | ) 118 | } 119 | ), 120 | ), 121 | ( 122 | "SELECT '((1.2, 
4.3), (5.6, 7.8))'::box", 123 | pa.table( 124 | { 125 | "box": pa.array( 126 | [{"x1": 5.6, "y1": 7.8, "x2": 1.2, "y2": 4.3}], 127 | pa.struct([(k, pa.float64()) for k in ("x1", "y1", "x2", "y2")]), 128 | ) 129 | } 130 | ), 131 | ), 132 | ( 133 | "SELECT '((1,2),(3,4))'::path", 134 | pa.table( 135 | { 136 | "path": pa.array( 137 | [ 138 | { 139 | "closed": True, 140 | "points": [{"x": 1.0, "y": 2.0}, {"x": 3.0, "y": 4.0}], 141 | } 142 | ] 143 | ) 144 | } 145 | ), 146 | ), 147 | ( 148 | "SELECT '[(1,2),(3,4)]'::path", 149 | pa.table( 150 | { 151 | "path": pa.array( 152 | [ 153 | { 154 | "closed": False, 155 | "points": [{"x": 1.0, "y": 2.0}, {"x": 3.0, "y": 4.0}], 156 | } 157 | ] 158 | ) 159 | } 160 | ), 161 | ), 162 | ( 163 | "SELECT '((1,2),(3,4))'::polygon", 164 | pa.table({"polygon": pa.array([[{"x": 1.0, "y": 2.0}, {"x": 3.0, "y": 4.0}]])}), 165 | ), 166 | ( 167 | "SELECT '<(1.3, 3.4), 6>'::circle", 168 | pa.table( 169 | { 170 | "circle": pa.array( 171 | [{"x": 1.3, "y": 3.4, "r": 6}], 172 | pa.struct([(k, pa.float64()) for k in ("x", "y", "r")]), 173 | ) 174 | } 175 | ), 176 | ), 177 | ( 178 | "SELECT '::ffff:1.2.3.0/120'::cidr", 179 | pa.table( 180 | { 181 | "cidr": pa.array( 182 | [ 183 | { 184 | "family": 3, 185 | "bits": 120, 186 | "is_cidr": True, 187 | "ipaddr": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x01\x02\x03\x00", 188 | } 189 | ], 190 | pa.struct( 191 | [ 192 | ("family", pa.uint8()), 193 | ("bits", pa.uint8()), 194 | ("is_cidr", pa.bool_()), 195 | ("ipaddr", pa.binary()), 196 | ] 197 | ), 198 | ) 199 | } 200 | ), 201 | ), 202 | ( 203 | "SELECT '192.168.100.128/25'::inet", 204 | pa.table( 205 | { 206 | "inet": pa.array( 207 | [ 208 | { 209 | "family": 2, 210 | "bits": 25, 211 | "is_cidr": False, 212 | "ipaddr": b"\xC0\xA8\x64\x80", 213 | } 214 | ], 215 | pa.struct( 216 | [ 217 | ("family", pa.uint8()), 218 | ("bits", pa.uint8()), 219 | ("is_cidr", pa.bool_()), 220 | ("ipaddr", pa.binary()), 221 | ] 222 | ), 223 | ) 224 | } 225 | ), 226 | 
def test_bad_options():
    """Check that each invalid option set is rejected by ``validate()``."""
    invalid_options = (
        {"default_numeric_precision": 0},
        {"default_numeric_scale": 0},
        {"default_numeric_precision": 4, "default_numeric_scale": 6},
        {"monetary_fractional_precision": 0},
    )
    for kwargs in invalid_options:
        with pytest.raises(pa.ArrowInvalid):
            UserOptions(**kwargs).validate()
"SELECT 'a' AS c" 8 | options = UserOptions(string_as_dictionaries=True) 9 | expected = pa.table({"c": pa.array(["a"]).dictionary_encode()}) 10 | 11 | tbl = copy_query(dsn, query, options) 12 | assert tbl.equals(expected) 13 | --------------------------------------------------------------------------------