├── .circleci └── config.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.ci ├── LICENSE ├── Makefile ├── README.md ├── __python__ └── foo.py ├── buffer.go ├── chunked.go ├── cmd └── example │ └── example.go ├── column.go ├── datatype.go ├── field.go ├── go.mod ├── go.sum ├── schema.go ├── table.go ├── table_test.go └── utils.go /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Golang CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-go/ for more details 4 | version: 2 5 | jobs: 6 | build: 7 | machine: true 8 | steps: 9 | - checkout 10 | - run: make ci 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/ 2 | *.test 3 | bin/ 4 | vendor/ 5 | __pycache__/ 6 | .vscode/ 7 | cpu.svg 8 | cpu.prof -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | # Tools 4 | RUN apt-get update && apt-get install -y \ 5 | g++ \ 6 | gdb \ 7 | git \ 8 | make \ 9 | vim \ 10 | wget \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | # Go installation 14 | RUN cd /tmp && \ 15 | wget https://dl.google.com/go/go1.12.5.linux-amd64.tar.gz && \ 16 | tar -C /usr/local -xzf go1.12.5.linux-amd64.tar.gz && \ 17 | rm go1.12.5.linux-amd64.tar.gz 18 | ENV PATH="/usr/local/go/bin:${PATH}" 19 | 20 | # Python bindings 21 | RUN cd /tmp && \ 22 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 23 | bash Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda && \ 24 | rm Miniconda3-latest-Linux-x86_64.sh 25 | ENV PATH="/miniconda/bin:${PATH}" 26 | RUN conda install -c conda-forge -y \ 27 | Cython \ 28 | ipython \ 29 | numpy \ 30 | pkg-config \ 31 | pyarrow=0.13.0 32 | 33 | ENV LD_LIBRARY_PATH=/miniconda/lib 34 | ENV CONDA_PREFIX=/miniconda 35 | WORKDIR /src/go-py-arrow-bridge -------------------------------------------------------------------------------- /Dockerfile.ci: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | # Tools 4 | RUN apt-get update && apt-get install -y -V \ 5 | g++ \ 6 | git \ 7 | make \ 8 | wget \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | # Go installation 12 | RUN cd /tmp && \ 13 | wget https://dl.google.com/go/go1.12.5.linux-amd64.tar.gz && \ 14 | tar -C /usr/local -xzf go1.12.5.linux-amd64.tar.gz 15 | ENV PATH="/usr/local/go/bin:${PATH}" 16 | 17 | # Python bindings 18 | RUN cd /tmp && \ 19 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 20 | bash Miniconda3-latest-Linux-x86_64.sh -b -p /miniconda && \ 21 | rm Miniconda3-latest-Linux-x86_64.sh 22 | ENV PATH="/miniconda/bin:${PATH}" 23 | RUN conda install -c conda-forge -y \ 24 | Cython \ 25 | ipython \ 26 | numpy \ 27 | pkg-config \ 28 | pyarrow=0.13.0 29 | 30 | ENV LD_LIBRARY_PATH=/miniconda/lib 31 | ENV CONDA_PREFIX=/miniconda 32 | 33 | WORKDIR /src/go-py-arrow-bridge 34 | COPY . . 35 | RUN make test -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Nick Poorman 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # You may need to update this to reflect your PYTHONPATH. 2 | PKG_CONFIG_PATH=${CONDA_PREFIX}/lib/pkgconfig 3 | LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/python3.7:${CONDA_PREFIX}/lib 4 | PYTHONPATH=${CONDA_PREFIX}/lib/python3.7/site-packages:${PWD}/__python__ 5 | GO_PREFIX=PKG_CONFIG_PATH=${PKG_CONFIG_PATH} LD_LIBRARY_PATH=${LD_LIBRARY_PATH} PYTHONPATH=${PYTHONPATH} 6 | GO_CMD=${GO_PREFIX} go 7 | 8 | GO_BUILD=$(GO_CMD) build 9 | GO_TEST?=$(GO_CMD) test 10 | GO_RUN=${GO_CMD} run 11 | 12 | DIST_DIR=bin 13 | 14 | GO_SOURCES := $(shell find . -path -prune -o -name '*.go' -not -name '*_test.go') 15 | 16 | .PHONY: default clean clean-cache test test-no-cache bench build run prof ci ci-test docker 17 | 18 | # 19 | # Our default target, clean up, do our install, test, and build locally. 20 | # 21 | default: clean build 22 | 23 | # Clean up after our install and build processes. Should get us back to as 24 | # clean as possible. 25 | # 26 | clean: 27 | @for d in ./bin/*; do \ 28 | if [ -f $$d ] ; then rm $$d ; fi \ 29 | done 30 | rm -rf ./__python__/**/*.pyc 31 | 32 | clean-cache: clean 33 | go clean -cache -testcache -modcache 34 | 35 | # 36 | # Do what we need to do to run our tests. 37 | # 38 | test: clean $(GO_SOURCES) 39 | $(GO_TEST) -v $(GO_TEST_ARGS) ./... 40 | 41 | test-no-cache: clean $(GO_SOURCES) 42 | $(GO_TEST) -count=1 -v $(GO_TEST_ARGS) ./... 43 | 44 | # 45 | # Run the benchmarks for the tools. 46 | # 47 | bench: $(GO_SOURCES) 48 | $(GO_TEST) $(GO_TEST_ARGS) -bench=. -run=- ./... 49 | 50 | # 51 | # Build/compile our application. 52 | # 53 | build: 54 | @for d in ./cmd/*; do \ 55 | echo "Building ${DIST_DIR}/`basename $$d`"; \ 56 | ${GO_BUILD} -o ${DIST_DIR}/`basename $$d` $$d; \ 57 | done 58 | 59 | # 60 | # Most of this is setup with telling python c-api where the python modules are. 61 | # 62 | run: clean build 63 | ${GO_PREFIX} ./bin/example 64 | 65 | # 66 | # Generate prof reports. 67 | # 68 | prof: 69 | $(GO_TEST) -bench=. -run=- -cpuprofile cpu.prof $(GO_TEST_ARGS) 70 | go tool pprof -svg cpu.prof > cpu.svg 71 | 72 | ci: 73 | docker build -f Dockerfile.ci . 74 | 75 | docker: 76 | docker build . -t go-py-arrow-bridge:builder 77 | docker run \ 78 | -v $(PWD):/src/go-py-arrow-bridge \ 79 | -it --workdir=/src/go-py-arrow-bridge/ \ 80 | go-py-arrow-bridge:builder 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-py-arrow-bridge 2 | 3 | [![GoDoc](https://godoc.org/github.com/nickpoorman/go-py-arrow-bridge?status.svg)](https://godoc.org/github.com/nickpoorman/go-py-arrow-bridge) 4 | [![CircleCI](https://circleci.com/gh/nickpoorman/go-py-arrow-bridge.svg?style=svg)](https://circleci.com/gh/nickpoorman/go-py-arrow-bridge) 5 | 6 | A rudimentary bridge for [Apache Arrow](https://github.com/apache/arrow) between Go and Python to facilitate zero-copy. 7 | 8 | This Go module demonstrates in the [tests](table_test.go) how easy it is to create an Arrow Table in Python and use the same Arrow Table in Go without copying the underlying buffers. 9 | 10 | 11 | 12 | ## Installation 13 | 14 | Add the package to your `go.mod` file: 15 | 16 | require github.com/nickpoorman/go-py-arrow-bridge master 17 | 18 | Or, clone the repository: 19 | 20 | git clone --branch master https://github.com/nickpoorman/go-py-arrow-bridge.git $GOPATH/src/github.com/nickpoorman/go-py-arrow-bridge 21 | 22 | 23 | 24 | ## Example 25 | 26 | See the [example](cmd/example/example.go) or clone down the repo an run it via `make run`. 27 | 28 | ## Benchmarks 29 | 30 | As you can see below, the amount of time to move data across the Python/Go language boundary stays constant as the number of elements increases. 31 | 32 | However, as the number of chunks increase, the amount of time also increases. I believe this is due to the large number of CGO calls happening in loops. A future version might try to reduce the number of CGO calls by implementing the schema data gathering in C. In the meantime, a workaround could compress the table down to a single chunk before crossing the language boundary. 33 | 34 | These results are from my Mid 2012 MacBook Air (1.8GHz i5 / 8 GB 1600 MHz DDR3). 35 | 36 | ``` 37 | (bullseye) ➜ go-py-arrow-bridge git:(master) ✗ PKG_CONFIG_PATH=/Users/nick/anaconda3/envs/bullseye/lib/pkgconfig LD_LIBRARY_PATH=/Users/nick/anaconda3/envs/bullseye/lib/python3.7:/Users/nick/anaconda3/envs/bullseye/lib PYTHONPATH=/Users/nick/anaconda3/envs/bullseye/lib/python3.7/site-packages:/Users/nick/projects/go-py-arrow-bridge/__python__ go test -bench=. -run=- -cpuprofile cpu.prof 38 | goos: darwin 39 | goarch: amd64 40 | pkg: github.com/nickpoorman/go-py-arrow-bridge 41 | BenchmarkAll/BenchmarkZeroCopyChunks_5-4 3000 402994 ns/op 42 | BenchmarkAll/BenchmarkZeroCopyChunks_7-4 3000 463610 ns/op 43 | BenchmarkAll/BenchmarkZeroCopyChunks_9-4 2000 553570 ns/op 44 | BenchmarkAll/BenchmarkZeroCopyChunks_1000-4 30 67841714 ns/op 45 | BenchmarkAll/BenchmarkZeroCopyChunks_1500-4 20 72966442 ns/op 46 | BenchmarkAll/BenchmarkZeroCopyChunks_2000-4 20 91238313 ns/op 47 | BenchmarkAll/BenchmarkZeroCopyChunks_2500-4 10 120581479 ns/op 48 | BenchmarkAll/BenchmarkZeroCopyChunks_3000-4 10 149069387 ns/op 49 | BenchmarkAll/BenchmarkZeroCopyChunks_3500-4 10 168897623 ns/op 50 | BenchmarkAll/BenchmarkZeroCopyChunks_4000-4 10 187915637 ns/op 51 | BenchmarkAll/BenchmarkZeroCopyChunks_4500-4 5 209468675 ns/op 52 | BenchmarkAll/BenchmarkZeroCopyChunks_5000-4 5 232361156 ns/op 53 | BenchmarkAll/BenchmarkZeroCopyChunks_5500-4 5 249023617 ns/op 54 | BenchmarkAll/BenchmarkZeroCopyChunks_6000-4 5 274385207 ns/op 55 | BenchmarkAll/BenchmarkZeroCopyChunks_6500-4 5 305522949 ns/op 56 | BenchmarkAll/BenchmarkZeroCopyChunks_7000-4 5 324781757 ns/op 57 | BenchmarkAll/BenchmarkZeroCopyChunks_7500-4 3 349889266 ns/op 58 | BenchmarkAll/BenchmarkZeroCopyChunks_8000-4 3 372640132 ns/op 59 | BenchmarkAll/BenchmarkZeroCopyChunks_8500-4 3 394905472 ns/op 60 | BenchmarkAll/BenchmarkZeroCopyChunks_9000-4 3 413965959 ns/op 61 | BenchmarkAll/BenchmarkZeroCopyChunks_9500-4 3 440292768 ns/op 62 | BenchmarkAll/BenchmarkZeroCopyChunks_10000-4 3 461282623 ns/op 63 | BenchmarkAll/BenchmarkZeroCopyElements_5-4 10000 166585 ns/op 64 | BenchmarkAll/BenchmarkZeroCopyElements_7-4 10000 165732 ns/op 65 | BenchmarkAll/BenchmarkZeroCopyElements_9-4 10000 177193 ns/op 66 | BenchmarkAll/BenchmarkZeroCopyElements_1000-4 10000 166462 ns/op 67 | BenchmarkAll/BenchmarkZeroCopyElements_1500-4 10000 166774 ns/op 68 | BenchmarkAll/BenchmarkZeroCopyElements_2000-4 10000 169948 ns/op 69 | BenchmarkAll/BenchmarkZeroCopyElements_2500-4 10000 171018 ns/op 70 | BenchmarkAll/BenchmarkZeroCopyElements_3000-4 10000 168100 ns/op 71 | BenchmarkAll/BenchmarkZeroCopyElements_3500-4 10000 171136 ns/op 72 | BenchmarkAll/BenchmarkZeroCopyElements_4000-4 10000 166941 ns/op 73 | BenchmarkAll/BenchmarkZeroCopyElements_4500-4 10000 171599 ns/op 74 | BenchmarkAll/BenchmarkZeroCopyElements_5000-4 10000 169485 ns/op 75 | BenchmarkAll/BenchmarkZeroCopyElements_5500-4 10000 169657 ns/op 76 | BenchmarkAll/BenchmarkZeroCopyElements_6000-4 10000 168274 ns/op 77 | BenchmarkAll/BenchmarkZeroCopyElements_6500-4 10000 171372 ns/op 78 | BenchmarkAll/BenchmarkZeroCopyElements_7000-4 10000 168484 ns/op 79 | BenchmarkAll/BenchmarkZeroCopyElements_7500-4 10000 169056 ns/op 80 | BenchmarkAll/BenchmarkZeroCopyElements_8000-4 10000 166486 ns/op 81 | BenchmarkAll/BenchmarkZeroCopyElements_8500-4 10000 167760 ns/op 82 | BenchmarkAll/BenchmarkZeroCopyElements_9000-4 10000 173118 ns/op 83 | BenchmarkAll/BenchmarkZeroCopyElements_9500-4 10000 166797 ns/op 84 | BenchmarkAll/BenchmarkZeroCopyElements_10000-4 10000 169281 ns/op 85 | PASS 86 | ok github.com/nickpoorman/go-py-arrow-bridge 86.975s 87 | ``` 88 | 89 | ## License 90 | 91 | (c) 2019 Nick Poorman. Licensed under the Apache License, Version 2.0. 92 | -------------------------------------------------------------------------------- /__python__/foo.py: -------------------------------------------------------------------------------- 1 | # foo.py 2 | import random 3 | import pandas as pd 4 | import pyarrow as pa 5 | 6 | random.seed(3) 7 | 8 | 9 | def zero_copy_chunks(num_chunks=5): 10 | a1 = pa.array([1, 2, 3, 4]) 11 | a2 = pa.array(['foo', 'bar', 'baz', None]) 12 | a3 = pa.array([True, None, False, True]) 13 | data = [a1, a2, a3] 14 | batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2']) 15 | batches = [batch] * num_chunks 16 | table = pa.Table.from_batches(batches) 17 | return table 18 | 19 | 20 | def zero_copy_elements(num_elements=5): 21 | a1 = pa.array([random.uniform(1000, 2000) for x in range(num_elements)]) 22 | a2 = pa.array(['foo'] * num_elements) 23 | a3 = pa.array([True] * num_elements) 24 | data = [a1, a2, a3] 25 | batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2']) 26 | batches = [batch] 27 | table = pa.Table.from_batches(batches) 28 | return table 29 | -------------------------------------------------------------------------------- /buffer.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow/memory" 8 | ) 9 | 10 | func PyBuffersToBuffers(pyBuffers *python3.PyObject) ([]*memory.Buffer, error) { 11 | // First buffer is the null mask buffer, second is the values. 12 | // [, ] 13 | if !python3.PyList_Check(pyBuffers) { 14 | return nil, errors.New("pyBuffers is not a list") 15 | } 16 | 17 | length := python3.PyList_Size(pyBuffers) 18 | buffers := make([]*memory.Buffer, 0, length) 19 | for i := 0; i < length; i++ { 20 | buffer, err := PyBuffersGetBuffer(pyBuffers, i) 21 | if err != nil { 22 | return nil, err 23 | } 24 | // buffers[i] = buffer 25 | buffers = append(buffers, buffer) 26 | } 27 | 28 | return buffers, nil 29 | } 30 | 31 | func PyBuffersGetBuffer(pyBuffers *python3.PyObject, i int) (*memory.Buffer, error) { 32 | // Get the buffer at index i 33 | pyBuffer := python3.PyList_GetItem(pyBuffers, i) 34 | if pyBuffer == nil { 35 | return nil, errors.New("could not get pyBuffer") 36 | } 37 | defer pyBuffer.DecRef() 38 | 39 | goBytes, err := PyBufferToBytes(pyBuffer) 40 | if err != nil { 41 | return nil, err 42 | } 43 | 44 | buffer := memory.NewBufferBytes(goBytes) 45 | return buffer, nil 46 | } 47 | 48 | func PyBufferToBytes(pyBuffer *python3.PyObject) ([]byte, error) { 49 | // 50 | // Convert the buffer to our Py_buffer struct type 51 | pyBuf, err := python3.PyObject_GetBuffer(pyBuffer, python3.PyBUF_SIMPLE) 52 | if err { 53 | return nil, errors.New("could not get pyBuf") 54 | } 55 | 56 | goBytes := python3.PyObject_GetBufferBytes(pyBuf) 57 | if goBytes == nil { 58 | return nil, errors.New("could not get goBytes") 59 | } 60 | 61 | return goBytes, nil 62 | } 63 | -------------------------------------------------------------------------------- /chunked.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow" 8 | "github.com/apache/arrow/go/arrow/array" 9 | "github.com/apache/arrow/go/arrow/memory" 10 | ) 11 | 12 | func PyChunkedToChunked(pyChunked *python3.PyObject, dtype arrow.DataType) (*array.Chunked, error) { 13 | // Convert pyChunks to []Interface 14 | chunks, err := PyChunkedToChunks(pyChunked, dtype) 15 | if err != nil { 16 | return nil, err 17 | } 18 | 19 | chunked := array.NewChunked(dtype, chunks) 20 | return chunked, nil 21 | } 22 | 23 | func PyChunkedToChunks(pyChunked *python3.PyObject, dtype arrow.DataType) ([]array.Interface, error) { 24 | pyChunks, err := PyChunkedGetPyChunks(pyChunked) 25 | if err != nil { 26 | return nil, err 27 | } 28 | defer pyChunks.DecRef() 29 | 30 | if !python3.PyList_Check(pyChunks) { 31 | return nil, errors.New("pyChunks is not a list") 32 | } 33 | 34 | length := python3.PyList_Size(pyChunks) 35 | chunks := make([]array.Interface, 0, length) 36 | for i := 0; i < length; i++ { 37 | chunk, err := PyChunksGetChunk(pyChunks, i, dtype) 38 | if err != nil { 39 | // TODO: Should release any chunks we've already got. 40 | return nil, err 41 | } 42 | chunks = append(chunks, chunk) 43 | } 44 | 45 | return chunks, nil 46 | } 47 | 48 | func PyChunkedGetPyChunks(pyChunked *python3.PyObject) (*python3.PyObject, error) { 49 | pyChunks := pyChunked.GetAttrString("chunks") 50 | if pyChunks == nil { 51 | return nil, errors.New("could not get pyChunks") 52 | } 53 | return pyChunks, nil 54 | } 55 | 56 | func PyChunksGetChunk(pyChunks *python3.PyObject, i int, dtype arrow.DataType) (array.Interface, error) { 57 | pyChunk, err := PyChunksGetPyChunk(pyChunks, i) 58 | if err != nil { 59 | return nil, err 60 | } 61 | defer pyChunk.DecRef() 62 | 63 | chunk, err := PyChunkToChunk(pyChunk, dtype) 64 | if err != nil { 65 | return nil, err 66 | } 67 | return chunk, nil 68 | } 69 | 70 | func PyChunksGetPyChunk(pyChunks *python3.PyObject, i int) (*python3.PyObject, error) { 71 | pyChunk := python3.PyList_GetItem(pyChunks, i) 72 | if pyChunk == nil { 73 | return nil, errors.New("could not get pyChunk from list") 74 | } 75 | return pyChunk, nil 76 | } 77 | 78 | func PyChunkToChunk(pyChunk *python3.PyObject, dtype arrow.DataType) (array.Interface, error) { 79 | data, err := PyChunkToData(pyChunk, dtype) 80 | if err != nil { 81 | return nil, err 82 | } 83 | defer data.Release() 84 | chunk := array.MakeFromData(data) 85 | return chunk, nil 86 | } 87 | 88 | func PyChunkToData(pyChunk *python3.PyObject, dtype arrow.DataType) (*array.Data, error) { 89 | buffers, err := PyChunkGetBuffers(pyChunk) 90 | if err != nil { 91 | return nil, err 92 | } 93 | 94 | nullCount, err := PyChunkGetNullCount(pyChunk) 95 | if err != nil { 96 | return nil, err 97 | } 98 | 99 | offset, err := PyChunkGetOffset(pyChunk) 100 | if err != nil { 101 | return nil, err 102 | } 103 | 104 | chunkLen, err := PyChunkGetLength(pyChunk) 105 | if err != nil { 106 | return nil, err 107 | } 108 | 109 | var childData []*array.Data // TODO: Implement 110 | data := array.NewData(dtype, chunkLen, buffers, childData, nullCount, offset) 111 | return data, nil 112 | } 113 | 114 | func PyChunkGetBuffers(pyChunk *python3.PyObject) ([]*memory.Buffer, error) { 115 | pyBuffers, err := PyChunkGetPyBuffers(pyChunk) 116 | if err != nil { 117 | return nil, err 118 | } 119 | defer pyBuffers.DecRef() 120 | 121 | return PyBuffersToBuffers(pyBuffers) 122 | } 123 | 124 | func PyChunkGetPyBuffers(pyChunk *python3.PyObject) (*python3.PyObject, error) { 125 | pyBuffersFunc := pyChunk.GetAttrString("buffers") 126 | if pyBuffersFunc == nil { 127 | return nil, errors.New("could not get pyBuffersFunc") 128 | } 129 | defer pyBuffersFunc.DecRef() 130 | 131 | pyBuffers := pyBuffersFunc.CallFunctionObjArgs() 132 | if pyBuffers == nil { 133 | return nil, errors.New("could not get pyBuffers") 134 | } 135 | 136 | return pyBuffers, nil 137 | } 138 | 139 | func PyChunkGetNullCount(pyChunk *python3.PyObject) (int, error) { 140 | v, ok := GetIntAttr(pyChunk, "null_count") 141 | if !ok { 142 | return 0, errors.New("could not get null_count") 143 | } 144 | return v, nil 145 | } 146 | 147 | func PyChunkGetOffset(pyChunk *python3.PyObject) (int, error) { 148 | v, ok := GetIntAttr(pyChunk, "offset") 149 | if !ok { 150 | return 0, errors.New("could not get offset") 151 | } 152 | return v, nil 153 | } 154 | 155 | func PyChunkGetLength(pyChunk *python3.PyObject) (int, error) { 156 | pyLength := CallPyFunc(pyChunk, "__len__") 157 | if pyLength == nil { 158 | return 0, errors.New("could not get pyChunk.__len__()") 159 | } 160 | defer pyLength.DecRef() 161 | length := python3.PyLong_AsLong(pyLength) 162 | return length, nil 163 | } 164 | -------------------------------------------------------------------------------- /cmd/example/example.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow/array" 8 | "github.com/apache/arrow/go/arrow/memory" 9 | "github.com/go-bullseye/bullseye/dataframe" 10 | bridge "github.com/nickpoorman/go-py-arrow-bridge" 11 | "github.com/nickpoorman/pytasks" 12 | ) 13 | 14 | func main() { 15 | py := pytasks.GetPythonSingleton() 16 | 17 | fooModule, err := py.ImportModule("foo") 18 | if err != nil { 19 | panic(err) 20 | } 21 | defer func() { 22 | err := pytasks.GetPythonSingleton().NewTaskSync(func() { 23 | fooModule.DecRef() 24 | }) 25 | if err != nil { 26 | panic(err) 27 | } 28 | }() 29 | 30 | var table array.Table 31 | taskErr := py.NewTaskSync(func() { 32 | pyTable := genPyTable(fooModule) 33 | table, err = bridge.PyTableToTable(pyTable) 34 | pyTable.DecRef() 35 | }) 36 | if taskErr != nil { 37 | panic(taskErr) 38 | } 39 | if err != nil { 40 | panic(err) 41 | } 42 | 43 | // Wrapping it in a bullseye dataframe allows us to print it easily 44 | pool := memory.NewGoAllocator() 45 | df, err := dataframe.NewDataFrameFromTable(pool, table) 46 | if err != nil { 47 | panic(err) 48 | } 49 | 50 | fmt.Println("\nArrow Table from Python now in Go:") 51 | fmt.Println(df.Display(0)) 52 | 53 | // Arrow Table from Python now in Go: 54 | // rec[0]["f0"]: [1 2 3 4] 55 | // rec[0]["f1"]: ["foo" "bar" "baz" (null)] 56 | // rec[0]["f2"]: [true (null) false true] 57 | // rec[1]["f0"]: [1 2 3 4] 58 | // rec[1]["f1"]: ["foo" "bar" "baz" (null)] 59 | // rec[1]["f2"]: [true (null) false true] 60 | // rec[2]["f0"]: [1 2 3 4] 61 | // rec[2]["f1"]: ["foo" "bar" "baz" (null)] 62 | // rec[2]["f2"]: [true (null) false true] 63 | // rec[3]["f0"]: [1 2 3 4] 64 | // rec[3]["f1"]: ["foo" "bar" "baz" (null)] 65 | // rec[3]["f2"]: [true (null) false true] 66 | // rec[4]["f0"]: [1 2 3 4] 67 | // rec[4]["f1"]: ["foo" "bar" "baz" (null)] 68 | // rec[4]["f2"]: [true (null) false true] 69 | } 70 | 71 | func genPyTable(module *python3.PyObject) *python3.PyObject { 72 | pyTable := bridge.CallPyFunc(module, "zero_copy_chunks") 73 | if pyTable == nil { 74 | panic("pyTable is nil") 75 | } 76 | return pyTable 77 | } 78 | -------------------------------------------------------------------------------- /column.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow" 8 | "github.com/apache/arrow/go/arrow/array" 9 | ) 10 | 11 | // PyColumnToColumnWithField turns a PyColumn into a GoColumn 12 | func PyColumnToColumnWithField(pyColumn *python3.PyObject, field arrow.Field) (*array.Column, error) { 13 | chunks, err := PyColumnToChunkedWithField(pyColumn, field) 14 | if err != nil { 15 | return nil, err 16 | } 17 | 18 | col := array.NewColumn(field, chunks) 19 | return col, nil 20 | } 21 | 22 | func PyColumnToChunkedWithField(pyColumn *python3.PyObject, field arrow.Field) (*array.Chunked, error) { 23 | pyChunked, err := PyColumnGetPyChunked(pyColumn) 24 | if err != nil { 25 | return nil, err 26 | } 27 | defer pyChunked.DecRef() 28 | 29 | return PyChunkedToChunked(pyChunked, field.Type) 30 | } 31 | 32 | func PyColumnGetPyChunked(pyColumn *python3.PyObject) (*python3.PyObject, error) { 33 | pyChunked := pyColumn.GetAttrString("data") 34 | if pyChunked == nil { 35 | return nil, errors.New("could not get pyChunked") 36 | } 37 | return pyChunked, nil 38 | } 39 | -------------------------------------------------------------------------------- /datatype.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | "github.com/DataDog/go-python3" 8 | "github.com/apache/arrow/go/arrow" 9 | ) 10 | 11 | // PyDataTypeToDataType returns the Go arrow DataType given the Python type. 12 | func PyDataTypeToDataType(pyDtype *python3.PyObject) (arrow.DataType, error) { 13 | // get the id 14 | id, err := PyDataTypeGetID(pyDtype) 15 | if err != nil { 16 | return nil, err 17 | } 18 | 19 | t := arrow.Type(id) 20 | return GetFromType(t) 21 | } 22 | 23 | func PyDataTypeGetID(pyDtype *python3.PyObject) (int, error) { 24 | v, ok := GetIntAttr(pyDtype, "id") 25 | if !ok { 26 | return 0, errors.New("could not get pyDtype.id") 27 | } 28 | return v, nil 29 | } 30 | 31 | var ( 32 | dataTypeForType [32]arrow.DataType 33 | ) 34 | 35 | // GetFromType returns a arrow.DataType for a given arrow.Type 36 | func GetFromType(t arrow.Type) (arrow.DataType, error) { 37 | dtype := dataTypeForType[byte(t&0x1f)] 38 | if dtype == nil { 39 | return nil, fmt.Errorf("DataType for id=%v is not yet implemented", t) 40 | } 41 | return dtype, nil 42 | } 43 | 44 | // DataTypeFromType returns an arrow.DataType given the Type 45 | func init() { 46 | dataTypeForType = [...]arrow.DataType{ 47 | arrow.NULL: arrow.Null, 48 | arrow.BOOL: arrow.FixedWidthTypes.Boolean, 49 | arrow.UINT8: arrow.PrimitiveTypes.Uint8, 50 | arrow.INT8: arrow.PrimitiveTypes.Int8, 51 | arrow.UINT16: arrow.PrimitiveTypes.Uint16, 52 | arrow.INT16: arrow.PrimitiveTypes.Int16, 53 | arrow.UINT32: arrow.PrimitiveTypes.Uint32, 54 | arrow.INT32: arrow.PrimitiveTypes.Int32, 55 | arrow.UINT64: arrow.PrimitiveTypes.Uint64, 56 | arrow.INT64: arrow.PrimitiveTypes.Int64, 57 | arrow.FLOAT16: arrow.FixedWidthTypes.Float16, 58 | arrow.FLOAT32: arrow.PrimitiveTypes.Float32, 59 | arrow.FLOAT64: arrow.PrimitiveTypes.Float64, 60 | arrow.STRING: arrow.BinaryTypes.String, 61 | arrow.BINARY: arrow.BinaryTypes.Binary, 62 | arrow.FIXED_SIZE_BINARY: nil, // arrow.FixedSizeBinaryType, 63 | arrow.DATE32: arrow.PrimitiveTypes.Date32, 64 | arrow.DATE64: arrow.PrimitiveTypes.Date64, 65 | arrow.TIMESTAMP: nil, // arrow.FixedWidthTypes.Timestamp_s, // TODO 66 | arrow.TIME32: nil, // arrow.FixedWidthTypes.Time32s, // TODO 67 | arrow.TIME64: nil, // arrow.FixedWidthTypes.Time64us, // TODO 68 | arrow.INTERVAL: nil, // arrow.FixedWidthTypes.MonthInterval, // TODO 69 | arrow.DECIMAL: nil, 70 | arrow.LIST: nil, 71 | arrow.STRUCT: nil, 72 | arrow.UNION: nil, 73 | arrow.DICTIONARY: nil, 74 | arrow.MAP: nil, 75 | arrow.EXTENSION: nil, 76 | arrow.FIXED_SIZE_LIST: nil, 77 | arrow.DURATION: nil, // arrow.FixedWidthTypes.Duration_s, // TODO 78 | 79 | // invalid data types to fill out array size 2⁵-1 80 | 31: nil, 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /field.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow" 8 | ) 9 | 10 | // PyFieldToField given a Python field gets the Go Arrow field. 11 | func PyFieldToField(pyField *python3.PyObject) (*arrow.Field, error) { 12 | pyName := pyField.GetAttrString("name") 13 | if pyName == nil { 14 | return nil, errors.New("could not get pyName") 15 | } 16 | defer pyName.DecRef() 17 | 18 | pyDtype := pyField.GetAttrString("type") 19 | if pyDtype == nil { 20 | return nil, errors.New("could not get pyDtype") 21 | } 22 | defer pyDtype.DecRef() 23 | 24 | pyNullable := pyField.GetAttrString("nullable") 25 | if pyNullable == nil { 26 | return nil, errors.New("could not get pyNullable") 27 | } 28 | defer pyNullable.DecRef() 29 | 30 | // TODO: Implement 31 | // pyMetadata := CallPyFunc(pyField, "metadata") 32 | // if pyMetadata == nil { 33 | // return nil, errors.New("could not get pyMetadata") 34 | // } 35 | 36 | name := python3.PyUnicode_AsUTF8(pyName) 37 | dtype, err := PyDataTypeToDataType(pyDtype) 38 | if err != nil { 39 | return nil, err 40 | } 41 | nullable := python3.PyBool_Check(pyNullable) 42 | 43 | field := &arrow.Field{ 44 | Name: name, 45 | Type: dtype, 46 | Nullable: nullable, 47 | // TODO: Implement 48 | // Metadata: metadata, 49 | } 50 | 51 | return field, nil 52 | } 53 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/nickpoorman/go-py-arrow-bridge 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/DataDog/go-python3 v0.0.0-20190130222855-0b25cc550560 7 | github.com/apache/arrow/go/arrow v0.0.0-20190714060934-486b97bd49c9 8 | github.com/go-bullseye/bullseye v0.0.0-20190714184620-91edcbc8ba1c 9 | github.com/nickpoorman/pytasks v0.0.0-20190706034506-5f0c8f7bc6b6 10 | ) 11 | 12 | replace github.com/DataDog/go-python3 => github.com/nickpoorman/go-python3 v0.0.0-20190713164746-bfc9d2df89d4 13 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/apache/arrow/go/arrow v0.0.0-20190615061817-720be32a0bb5/go.mod h1:NG5SvIQXIxzJR5lGmoXTX9R/EmkArKbPPFu0DUFSz10= 2 | github.com/apache/arrow/go/arrow v0.0.0-20190714060934-486b97bd49c9 h1:RgKlReA/ZycIH1VYKSC2Y+3RftTDKDtbiQKcXjBGPLc= 3 | github.com/apache/arrow/go/arrow v0.0.0-20190714060934-486b97bd49c9/go.mod h1:VTxUBvSJ3s3eHAg65PNgrsn5BtqCRPdmyXh6rAfdxN0= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 6 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/go-bullseye/bullseye v0.0.0-20190714184620-91edcbc8ba1c h1:oDv3LHsaSMg4NYpolglbv9MslwQi3T6JfZHwH1q3WMM= 8 | github.com/go-bullseye/bullseye v0.0.0-20190714184620-91edcbc8ba1c/go.mod h1:5OhpIKAVfTB2B63U16GHfwlnIEe3sukOIn9mAUcmLis= 9 | github.com/google/flatbuffers v1.11.0 h1:O7CEyB8Cb3/DmtxODGtLHcEvpr81Jm5qLg/hsHnxA2A= 10 | github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 11 | github.com/nickpoorman/go-python3 v0.0.0-20190713164746-bfc9d2df89d4 h1:CLTzK8yhrf5jN0z1ozzUsKixsRVBd04xYB5uF1lWGKQ= 12 | github.com/nickpoorman/go-python3 v0.0.0-20190713164746-bfc9d2df89d4/go.mod h1:FX9q+avghnDFTEGOkfHFO4aGmgenMR9Ntxbz/NIoMSY= 13 | github.com/nickpoorman/pytasks v0.0.0-20190706034506-5f0c8f7bc6b6 h1:43HWO7WMGIA5sJ7pXx7W8gcpofLBv9oejmt9hW5eYbY= 14 | github.com/nickpoorman/pytasks v0.0.0-20190706034506-5f0c8f7bc6b6/go.mod h1:VDRxietscyuUIJCywrIRxoSgMX+c0bQA3wFfFiKH8do= 15 | github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= 16 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 17 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 18 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 19 | github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 20 | github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= 21 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 22 | -------------------------------------------------------------------------------- /schema.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow" 8 | ) 9 | 10 | // PySchemaFromPyTable returns a pyarrow schema from a pyarrow Table. 11 | func PySchemaFromPyTable(pyTable *python3.PyObject) (*python3.PyObject, error) { 12 | pySchema := pyTable.GetAttrString("schema") 13 | if pySchema == nil { 14 | return nil, errors.New("could not get pySchema") 15 | } 16 | return pySchema, nil 17 | } 18 | 19 | // PySchemaToSchema given a Python schema gets the Go Arrow schema. 20 | func PySchemaToSchema(pySchema *python3.PyObject) (*arrow.Schema, error) { 21 | // start with the field names 22 | pyFieldNames, err := getPyFieldNames(pySchema) 23 | if err != nil { 24 | return nil, err 25 | } 26 | defer func() { 27 | for i := range pyFieldNames { 28 | pyFieldNames[i].DecRef() 29 | } 30 | }() 31 | 32 | // Get the fields 33 | fields, err := getFields(pySchema, pyFieldNames) 34 | if err != nil { 35 | return nil, err 36 | } 37 | 38 | return arrow.NewSchema(fields, nil), nil 39 | } 40 | 41 | func getPyFieldNames(pySchema *python3.PyObject) ([]*python3.PyObject, error) { 42 | pyFieldNames := pySchema.GetAttrString("names") 43 | if pyFieldNames == nil { 44 | return nil, errors.New("could not get pyFieldNames") 45 | } 46 | defer pyFieldNames.DecRef() 47 | 48 | // verify the result is a list 49 | if !python3.PyList_Check(pyFieldNames) { 50 | return nil, errors.New("not a list of field names") 51 | } 52 | 53 | length := python3.PyList_Size(pyFieldNames) 54 | pyNames := make([]*python3.PyObject, 0, length) 55 | for i := 0; i < length; i++ { 56 | pyName := python3.PyList_GetItem(pyFieldNames, i) 57 | if pyName == nil { 58 | return nil, errors.New("could not get name") 59 | } 60 | pyName.IncRef() 61 | // pyNames[i] = pyName 62 | pyNames = append(pyNames, pyName) 63 | } 64 | 65 | return pyNames, nil 66 | } 67 | 68 | func getFields(pySchema *python3.PyObject, pyFieldNames []*python3.PyObject) ([]arrow.Field, error) { 69 | fields := make([]arrow.Field, 0, len(pyFieldNames)) 70 | for _, pyFieldName := range pyFieldNames { 71 | field, err := getField(pySchema, pyFieldName) 72 | if err != nil { 73 | return nil, err 74 | } 75 | // fields[i] = *field 76 | fields = append(fields, *field) 77 | } 78 | return fields, nil 79 | } 80 | 81 | func getField(schema *python3.PyObject, fieldName *python3.PyObject) (*arrow.Field, error) { 82 | pyField := CallPyFunc(schema, "field_by_name", fieldName) 83 | if pyField == nil { 84 | return nil, errors.New("could not get pyField") 85 | } 86 | defer pyField.DecRef() 87 | 88 | field, err := PyFieldToField(pyField) 89 | if err != nil { 90 | return nil, err 91 | } 92 | 93 | return field, nil 94 | } 95 | -------------------------------------------------------------------------------- /table.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/DataDog/go-python3" 7 | "github.com/apache/arrow/go/arrow" 8 | "github.com/apache/arrow/go/arrow/array" 9 | ) 10 | 11 | func PyTableToTable(pyTable *python3.PyObject) (array.Table, error) { 12 | schema, cols, err := PyTableToColumns(pyTable) 13 | if err != nil { 14 | return nil, err 15 | } 16 | 17 | // Build the table 18 | table := array.NewTable(schema, cols, -1) // -1 tells it to determine the numRows from the first column 19 | 20 | return table, nil 21 | } 22 | 23 | // PyTableToColumns returns the records in the pyarrow table. 24 | func PyTableToColumns(pyTable *python3.PyObject) (*arrow.Schema, []array.Column, error) { 25 | // Get the PySchema from the PyTable 26 | pySchema, err := PySchemaFromPyTable(pyTable) 27 | if err != nil { 28 | return nil, nil, err 29 | } 30 | defer pySchema.DecRef() 31 | 32 | // Get the GoSchema 33 | schema, err := PySchemaToSchema(pySchema) 34 | if err != nil { 35 | return nil, nil, err 36 | } 37 | 38 | columns, err := PyTableToColumnsWithSchema(pyTable, schema) 39 | if err != nil { 40 | return nil, nil, err 41 | } 42 | 43 | return schema, columns, nil 44 | } 45 | 46 | // PyTableToColumns returns the columns in the pyarrow table. 47 | func PyTableToColumnsWithSchema(pyTable *python3.PyObject, schema *arrow.Schema) ([]array.Column, error) { 48 | fields := schema.Fields() 49 | columns := make([]array.Column, 0, len(fields)) 50 | 51 | for i := range fields { 52 | pyColumn, err := PyTableGetPyColumn(pyTable, fields[i].Name) 53 | if err != nil { 54 | return nil, err 55 | } 56 | defer pyColumn.DecRef() 57 | 58 | col, err := PyColumnToColumnWithField(pyColumn, fields[i]) 59 | if err != nil { 60 | return nil, err 61 | } 62 | // columns[i] = *col 63 | columns = append(columns, *col) 64 | } 65 | 66 | return columns, nil 67 | } 68 | 69 | // PyTableGetPyColumn returns the PyColumn given the name from the PyTable 70 | func PyTableGetPyColumn(pyTable *python3.PyObject, name string) (*python3.PyObject, error) { 71 | pyName := python3.PyUnicode_FromString(name) 72 | defer pyName.DecRef() 73 | 74 | pyColumn := CallPyFunc(pyTable, "column", pyName) 75 | if pyColumn == nil { 76 | return nil, errors.New("could not get pyColumn") 77 | } 78 | 79 | return pyColumn, nil 80 | } 81 | -------------------------------------------------------------------------------- /table_test.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/DataDog/go-python3" 8 | "github.com/apache/arrow/go/arrow/array" 9 | "github.com/apache/arrow/go/arrow/memory" 10 | "github.com/go-bullseye/bullseye/dataframe" 11 | "github.com/nickpoorman/pytasks" 12 | ) 13 | 14 | func BenchmarkAll(b *testing.B) { 15 | for i := 5; i <= 10; i += 2 { 16 | b.Run(fmt.Sprintf("BenchmarkZeroCopyChunks_%d", i), zeroCopyBenchmarkN(i, "zero_copy_chunks")) 17 | } 18 | for i := 1000; i <= 10000; i += 500 { 19 | b.Run(fmt.Sprintf("BenchmarkZeroCopyChunks_%d", i), zeroCopyBenchmarkN(i, "zero_copy_chunks")) 20 | } 21 | for i := 5; i <= 10; i += 2 { 22 | b.Run(fmt.Sprintf("BenchmarkZeroCopyElements_%d", i), zeroCopyBenchmarkN(i, "zero_copy_elements")) 23 | } 24 | for i := 1000; i <= 10000; i += 500 { 25 | b.Run(fmt.Sprintf("BenchmarkZeroCopyElements_%d", i), zeroCopyBenchmarkN(i, "zero_copy_elements")) 26 | } 27 | 28 | // At this point we know we won't need Python anymore in this 29 | // program, we can restore the state and lock the GIL to perform 30 | // the final operations before exiting. 31 | err := pytasks.GetPythonSingleton().Finalize() 32 | if err != nil { 33 | panic(err) 34 | } 35 | } 36 | 37 | // So the benchmarks don't get compiled out during optimization. 38 | var benchTable array.Table 39 | 40 | func zeroCopyBenchmarkN(numChunks int, pyMethod string) func(b *testing.B) { 41 | return func(b *testing.B) { 42 | if numChunks <= 0 { 43 | b.Fatal("numChunks must be greater than zero") 44 | } 45 | 46 | py := pytasks.GetPythonSingleton() 47 | 48 | fooModule, err := py.ImportModule("foo") 49 | if err != nil { 50 | b.Fatal(err) 51 | } 52 | defer func() { 53 | err := pytasks.GetPythonSingleton().NewTaskSync(func() { 54 | fooModule.DecRef() 55 | }) 56 | if err != nil { 57 | panic(err) 58 | } 59 | }() 60 | 61 | var pyTable *python3.PyObject 62 | taskErr := py.NewTaskSync(func() { 63 | pyNumChunks := python3.PyLong_FromLong(numChunks) 64 | defer pyNumChunks.DecRef() 65 | pyTable = CallPyFunc(fooModule, pyMethod, pyNumChunks) 66 | if pyTable == nil { 67 | b.Fatal("pyTable is nil") 68 | } 69 | }) 70 | if taskErr != nil { 71 | b.Fatal(taskErr) 72 | } 73 | defer func() { 74 | err := pytasks.GetPythonSingleton().NewTaskSync(func() { 75 | pyTable.DecRef() 76 | }) 77 | if err != nil { 78 | panic(err) 79 | } 80 | }() 81 | 82 | var table array.Table 83 | b.ResetTimer() 84 | 85 | taskErr = py.NewTaskSync(func() { 86 | // Run the loop inside of the task so we don't benchmark 87 | // grabbing the GIL over and over. When the loop 88 | // is outside the task, the results are still consistent. 89 | for i := 0; i < b.N; i++ { 90 | table, err = PyTableToTable(pyTable) 91 | if err != nil { 92 | b.Fatal(err) 93 | } 94 | } 95 | }) 96 | if taskErr != nil { 97 | b.Fatal(taskErr) 98 | } 99 | 100 | benchTable = table 101 | } 102 | } 103 | 104 | func TestTable(t *testing.T) { 105 | // Init Python 106 | _ = pytasks.GetPythonSingleton() 107 | 108 | t.Run("PyTableToTable", testPyTableToTable) 109 | 110 | // At this point we know we won't need Python anymore in this 111 | // program, we can restore the state and lock the GIL to perform 112 | // the final operations before exiting. 113 | err := pytasks.GetPythonSingleton().Finalize() 114 | if err != nil { 115 | t.Fatal(err) 116 | } 117 | } 118 | 119 | func testPyTableToTable(t *testing.T) { 120 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 121 | defer pool.AssertSize(t, 0) 122 | 123 | py := pytasks.GetPythonSingleton() 124 | fooModule, err := py.ImportModule("foo") 125 | if err != nil { 126 | t.Fatal(err) 127 | } 128 | defer func() { 129 | err := pytasks.GetPythonSingleton().NewTaskSync(func() { 130 | fooModule.DecRef() 131 | }) 132 | if err != nil { 133 | panic(err) 134 | } 135 | }() 136 | 137 | var table array.Table 138 | taskErr := py.NewTaskSync(func() { 139 | pyTable := genPyTable(fooModule) 140 | table, err = PyTableToTable(pyTable) 141 | pyTable.DecRef() 142 | }) 143 | if taskErr != nil { 144 | t.Fatal(taskErr) 145 | } 146 | if err != nil { 147 | t.Fatal(err) 148 | } 149 | 150 | // Wrapping it in a bullseye dataframe allows us to print it easily 151 | df, err := dataframe.NewDataFrameFromTable(pool, table) 152 | if err != nil { 153 | t.Fatal(err) 154 | } 155 | 156 | got := df.Display(0) 157 | want := `rec[0]["f0"]: [1 2 3 4] 158 | rec[0]["f1"]: ["foo" "bar" "baz" (null)] 159 | rec[0]["f2"]: [true (null) false true] 160 | rec[1]["f0"]: [1 2 3 4] 161 | rec[1]["f1"]: ["foo" "bar" "baz" (null)] 162 | rec[1]["f2"]: [true (null) false true] 163 | rec[2]["f0"]: [1 2 3 4] 164 | rec[2]["f1"]: ["foo" "bar" "baz" (null)] 165 | rec[2]["f2"]: [true (null) false true] 166 | rec[3]["f0"]: [1 2 3 4] 167 | rec[3]["f1"]: ["foo" "bar" "baz" (null)] 168 | rec[3]["f2"]: [true (null) false true] 169 | rec[4]["f0"]: [1 2 3 4] 170 | rec[4]["f1"]: ["foo" "bar" "baz" (null)] 171 | rec[4]["f2"]: [true (null) false true] 172 | ` 173 | if got != want { 174 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 175 | } 176 | } 177 | 178 | func genPyTable(module *python3.PyObject) *python3.PyObject { 179 | pyTable := CallPyFunc(module, "zero_copy_chunks") 180 | if pyTable == nil { 181 | panic("pyTable is nil") 182 | } 183 | return pyTable 184 | } 185 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package bridge 2 | 3 | import "github.com/DataDog/go-python3" 4 | 5 | // A helper for first fetching the function and then calling it 6 | func CallPyFunc(obj *python3.PyObject, name string, args ...*python3.PyObject) *python3.PyObject { 7 | fn := obj.GetAttrString(name) 8 | defer fn.DecRef() 9 | 10 | return fn.CallFunctionObjArgs(args...) 11 | } 12 | 13 | func GetIntAttr(obj *python3.PyObject, attr string) (int, bool) { 14 | v := obj.GetAttrString(attr) 15 | if v == nil { 16 | return 0, false 17 | } 18 | defer v.DecRef() 19 | return python3.PyLong_AsLong(v), true 20 | } 21 | --------------------------------------------------------------------------------