├── .asf.yaml ├── .clang-format ├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── data ├── README.md ├── arrow-commits │ ├── README.md │ ├── arrow-commits.R │ ├── arrow-commits.arrows │ └── arrow-commits.jsonl ├── rand-many-types │ ├── Makefile │ ├── README.md │ ├── arrows-to-parquet.py │ ├── generate.py │ ├── parquet-to-duckdb.sql │ ├── random.arrows │ ├── random.duckdb │ ├── random.parquet │ └── requirements.txt └── taxi-data │ ├── README.md │ └── train.parquet ├── dissociated-ipc ├── CMakeLists.txt ├── README.md ├── cudf-flight-client.cc ├── cudf-flight-server.cc ├── cudf-flight-ucx.cc ├── cudf-flight-ucx.h ├── ucx_client.cc ├── ucx_client.h ├── ucx_conn.cc ├── ucx_conn.h ├── ucx_server.cc ├── ucx_server.h ├── ucx_utils.cc └── ucx_utils.h └── http ├── README.md ├── get_compressed ├── README.md ├── curl │ └── client │ │ ├── README.md │ │ └── client.sh └── python │ ├── client │ ├── README.md │ └── client.py │ └── server │ ├── README.md │ └── server.py ├── get_indirect ├── README.md ├── curl │ ├── .gitignore │ └── client │ │ ├── README.md │ │ └── client.sh └── python │ ├── .gitignore │ ├── client │ ├── README.md │ └── client.py │ └── server │ ├── README.md │ └── server.py ├── get_multipart ├── README.md └── python │ ├── client │ ├── README.md │ └── simple_client.py │ └── server │ ├── README.md │ └── server.py ├── get_range ├── README.md ├── curl │ ├── .gitignore │ └── client │ │ ├── README.md │ │ └── client.sh └── js │ ├── .gitignore │ └── server │ ├── README.md │ └── serve.json ├── get_simple ├── README.md ├── c_glib │ └── client │ │ ├── .gitignore │ │ ├── README.md │ │ └── client.c ├── cpp │ ├── .gitignore │ └── client │ │ ├── README.md │ │ └── client.cpp ├── csharp │ ├── .gitignore │ ├── client │ │ ├── ArrowHttpClient.cs │ │ ├── Client.csproj │ │ └── README.md │ └── server │ │ ├── ArrowHttpServer.cs │ │ ├── README.md │ │ └── Server.csproj ├── curl │ └── client │ │ ├── README.md │ │ └── client.sh ├── go │ ├── .gitignore │ ├── 
client │ │ ├── README.md │ │ ├── client.go │ │ └── go.mod │ └── server │ │ ├── README.md │ │ ├── go.mod │ │ └── server.go ├── java │ ├── .gitignore │ ├── client │ │ ├── README.md │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── example │ │ │ └── ArrowHttpClient.java │ └── server │ │ ├── README.md │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── example │ │ └── ArrowHttpServer.java ├── js │ ├── .gitignore │ └── client │ │ ├── README.md │ │ ├── client.js │ │ └── package.json ├── julia │ ├── Project.toml │ ├── client │ │ ├── README.md │ │ └── client.jl │ └── server │ │ ├── README.md │ │ └── server.jl ├── matlab │ └── client │ │ ├── README.md │ │ └── client.m ├── python │ ├── client │ │ ├── README.md │ │ └── urllib.request │ │ │ ├── README.md │ │ │ └── client.py │ └── server │ │ ├── README.md │ │ ├── fastapi_uvicorn │ │ ├── README.md │ │ └── server.py │ │ └── http.server │ │ ├── README.md │ │ └── server.py ├── r │ └── client │ │ ├── README.md │ │ └── client.R ├── rs │ ├── .gitignore │ ├── Cargo.toml │ ├── client │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── src │ │ │ └── main.rs │ └── server │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── src │ │ └── main.rs └── ruby │ ├── client │ ├── .gitignore │ ├── Gemfile │ ├── README.md │ └── client.rb │ └── server │ ├── .gitignore │ ├── Gemfile │ ├── README.md │ └── config.ru ├── post_multipart └── README.md └── post_simple └── README.md /.asf.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | github: 19 | description: "Apache Arrow Development Experiments" 20 | homepage: https://arrow.apache.org/ 21 | enabled_merge_buttons: 22 | merge: false 23 | rebase: false 24 | squash: true 25 | features: 26 | issues: false 27 | 28 | notifications: 29 | commits: commits@arrow.apache.org 30 | pullrequests: github@arrow.apache.org 31 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | --- 18 | BasedOnStyle: Google 19 | ColumnLimit: 90 20 | DerivePointerAlignment: false 21 | IncludeBlocks: Preserve 22 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | data/**/*.arrows filter=lfs diff=lfs merge=lfs -text 19 | data/**/*.arrow filter=lfs diff=lfs merge=lfs -text 20 | data/**/*.jsonl filter=lfs diff=lfs merge=lfs -text 21 | data/**/*.parquet filter=lfs diff=lfs merge=lfs -text 22 | data/**/*.db filter=lfs diff=lfs merge=lfs -text 23 | data/**/*.duckdb filter=lfs diff=lfs merge=lfs -text 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | vendored 19 | build 20 | .vscode 21 | cufile.log 22 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Arrow Experiments 2 | Copyright 2024 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Apache Arrow Experiments 21 | 22 | This repository is for collaborative prototyping and research in the Apache Arrow project. 
23 | 24 | | Directory | Contents | 25 | | --------- | -------- | 26 | | **[data](./data)** | Various datasets that are used by the experiments in this repository or intended to be used in future Arrow experiments | 27 | | **[dissociated-ipc](./dissociated-ipc)** | Reference example implementation of the experimental [Arrow Dissociated IPC Protocol](https://arrow.apache.org/docs/dev/format/DissociatedIPC.html) | 28 | | **[http](./http)** | Examples demonstrating ways of sending and receiving data in Arrow IPC stream format (IANA media type `application/vnd.apache.arrow.stream`) over HTTP APIs | 29 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Apache Arrow Data Experiments 21 | 22 | This directory contains various datasets that are used by the experiments 23 | in this repository or intended to be used in future Arrow experiments. 24 | This currently includes data used to generate compelling examples that is 25 | more realistic than generated data or the testing data found in 26 | [apache/arrow-testing](http://github.com/apache/arrow-testing). This 27 | directory is intended as a semi-temporary staging area; eventually, much 28 | of the data here should find a permanent home elsewhere. 29 | 30 | > [!IMPORTANT] 31 | > Please install and use [Git LFS](https://git-lfs.com) when contributing to this subdirectory. Add any new large file extensions to [`.gitattributes`](https://github.com/apache/arrow-experiments/blob/main/.gitattributes). 
32 | -------------------------------------------------------------------------------- /data/arrow-commits/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # arrow-commits 21 | 22 | Commits to the apache/arrow repository as of ~2024-03-06 as generated by 23 | `git log`, interpreted by the [gert package for R](https://docs.ropensci.org/gert/), 24 | and written by the [arrow package for R](https://arrow.apache.org/docs/r) as an uncompressed 25 | [Arrow IPC Stream](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc). 26 | For comparison and testing purposes, a line-delimited JSON version is also included. 27 | The data contain 15,487 rows, 5 columns, and are approximately 2 MB in size. 28 | 29 | Read in R: 30 | 31 | ```r 32 | library(arrow, warn.conflicts = FALSE) 33 | read_ipc_stream("data/arrow-commits/arrow-commits.arrows") 34 | # # A tibble: 15,487 × 5 35 | # commit time files merge message 36 | # 37 | # 1 49cdb0fe4e98fda19031c864a18e6156c6ed… 2024-03-07 02:00:52 2 FALSE GH-403… 38 | # 2 1d966e98e41ce817d1f8c5159c0b9caa4de7… 2024-03-06 21:51:34 1 FALSE GH-403… 39 | # 3 96f26a89bd73997f7532643cdb27d04b7097… 2024-03-06 20:29:15 1 FALSE GH-402… 40 | # 4 ee1a8c39a55f3543a82fed900dadca791f6e… 2024-03-06 07:46:45 1 FALSE GH-403… 41 | # 5 3d467ac7bfae03cf2db09807054c5672e195… 2024-03-05 16:13:32 1 FALSE GH-201… 42 | # 6 ef6ea6beed071ed070daf03508f4c14b4072… 2024-03-05 14:53:13 20 FALSE GH-403… 43 | # 7 53e0c745ad491af98a5bf18b67541b12d779… 2024-03-05 12:31:38 2 FALSE GH-401… 44 | # 8 3ba6d286caad328b8572a3b9228045da8c8d… 2024-03-05 08:15:42 6 FALSE GH-400… 45 | # 9 4ce9a5edd2710fb8bf0c642fd0e3863b01c2… 2024-03-05 07:56:25 2 FALSE GH-401… 46 | # 10 2445975162905bd8d9a42ffc9cd0daa0e19d… 2024-03-05 01:04:20 1 FALSE GH-403… 47 | # # ℹ 15,477 more rows 48 | # # ℹ Use `print(n = ...)` to see more rows 49 | ``` 50 | 51 | Read in Python: 52 | 53 | ```python 54 | from pyarrow
import ipc 55 | 56 | with ipc.open_stream("data/arrow-commits/arrow-commits.arrows") as stream: 57 | stream.read_all() 58 | 59 | # pyarrow.Table 60 | # commit: string 61 | # time: timestamp[us, tz=UTC] 62 | # files: int32 63 | # merge: bool 64 | # message: string 65 | # ---- 66 | # commit: [["49cdb0fe4e98fda19031c864a18e6156c6edbf3c","1d966e98e41ce817d1f8c5159c 67 | # time: [[2024-03-07 02:00:52.000000Z,2024-03-06 21:51:34.000000Z,2024-03-06 20:29 68 | # files: [[2,1,1,1,1,...,1,8,2,2,4],[19,3,8,33,1,...,1,1,2,3,7],...,[7,21,2,3,6,.. 69 | # merge: [[false,false,false,false,false,...,false,false,false,false,false],[false 70 | # message: [["GH-40370: [C++] Define ARROW_FORCE_INLINE for non-MSVC builds (#4037 71 | ``` 72 | -------------------------------------------------------------------------------- /data/arrow-commits/arrow-commits.R: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
# Regenerates the arrow-commits example data set: the apache/arrow commit log
# written as an uncompressed Arrow IPC stream (.arrows) and as line-delimited
# JSON (.jsonl).

library(gert)
library(arrow, warn.conflicts = FALSE)

# Assumes the working directory is the root arrow-experiments
out_stream <- file.path(getwd(), "data/arrow-commits/arrow-commits.arrows")
out_jsonl <- file.path(getwd(), "data/arrow-commits/arrow-commits.jsonl")

# ...and an apache/arrow checkout at ../arrow
commits <- withr::with_dir("../arrow", {
  gert::git_log(max = .Machine$integer.max)
})

# Best if names and email addresses are not included in example data
commits$author <- NULL

# Ensure times are UTC
commits$time <- lubridate::with_tz(commits$time, "UTC")

# Just take the first line of each commit message
commits$message <- vapply(
  commits$message,
  function(x) strsplit(x, "\n+")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)

# Don't include any R metadata in the example Arrow file
commits_table <- arrow_table(commits)
commits_table$metadata <- NULL

# R bindings don't expose non-default batch size, so do the chunking manually
batch_size <- 1024
# ceiling() instead of `%/% batch_size + 1`: the latter produces one batch too
# many when the row count is an exact multiple of batch_size (or zero), and
# that extra batch would be indexed with a descending `begin:end` range.
num_batches <- ceiling(nrow(commits_table) / batch_size)

# Returns the i-th (1-based) slice of at most batch_size rows.
batch <- function(i) {
  begin <- (i - 1) * batch_size + 1
  end <- min(begin + batch_size - 1, nrow(commits_table))
  commits_table[begin:end, ]
}

fs <- LocalFileSystem$create()
out <- fs$OpenOutputStream(out_stream)
writer <- RecordBatchStreamWriter$create(out, commits_table$schema)
for (i in seq_len(num_batches)) {
  writer$write_table(batch(i))
}
# Closing the writer only terminates the IPC stream...
writer$close()
# ...the underlying output stream must be closed separately so that everything
# is on disk before the file is read back below.
out$close()

# Check a simple read of the Arrow stream
stopifnot(identical(read_ipc_stream(out_stream), commits))

# Also write a .jsonl version
withr::with_connection(list(con = file(out_jsonl)), {
  open(con, "w")
  for (i in seq_len(nrow(commits))) {
    # One JSON object per commit, one commit per line
    item <- as.list(commits[i, , drop = FALSE])
    line <- jsonlite::toJSON(item, POSIXt = "ISO8601", auto_unbox = TRUE)
    writeLines(line, con)
  }
})
# Regenerates the rand-many-types data files:
#   generate.py -> Arrow IPC stream -> Parquet -> DuckDB database
#
# The target names below are pipeline stages, not files that the recipes
# create. Declaring them phony keeps a stray file or directory called `arrow`,
# `parquet` or `duckdb` from making a stage look up to date.
.PHONY: all arrow parquet duckdb

all: arrow parquet duckdb

# Runs the generator script.
arrow: generate.py
	python ./generate.py

# Converts the Arrow stream to Parquet.
parquet: arrow
	python ./arrows-to-parquet.py

# Loads the Parquet file into a DuckDB database (needs `duckdb` on PATH).
duckdb: parquet
	duckdb -f ./parquet-to-duckdb.sql
import pyarrow as pa
import pyarrow.parquet as pq

# Convert the Arrow IPC stream file in the current working directory into a
# Parquet file holding the same table.

# Read every record batch of the stream into a single in-memory table.
with open("random.arrows", "rb") as arrows_file:
    table = pa.ipc.open_stream(arrows_file).read_all()

# Write the table out with pyarrow's default Parquet settings.
pq.write_table(table, "random.parquet")
-- Rebuilds the `random` table inside random.duckdb from random.parquet.
-- Meant to be run through the DuckDB CLI (`duckdb -f parquet-to-duckdb.sql`);
-- the dot-commands below are CLI commands, not SQL.

.open random.duckdb

-- Start from a clean slate so re-running the script replaces the old data.
DROP TABLE IF EXISTS random;

CREATE TABLE random AS
  SELECT *
  FROM read_parquet('./random.parquet');

.exit
cmake_minimum_required(VERSION 3.20)
message(STATUS "Building using CMake version: ${CMAKE_VERSION}")
project(arrow-cudf-flight CXX CUDA)

include(CMakeParseArguments)

# https://www.cmake.org/cmake/help/latest/policy/CMP0025.html
#
# Compiler id for Apple Clang is now AppleClang.
cmake_policy(SET CMP0025 NEW)

# https://cmake.org/cmake/help/latest/policy/CMP0042.html
#
# Enable MACOSX_RPATH by default. @rpath in a target's install name is a more
# flexible and powerful mechanism than @executable_path or @loader_path for
# locating shared libraries.
cmake_policy(SET CMP0042 NEW)

# https://www.cmake.org/cmake/help/latest/policy/CMP0054.html
#
# Only interpret if() arguments as variables or keywords when unquoted.
cmake_policy(SET CMP0054 NEW)

# https://www.cmake.org/cmake/help/latest/policy/CMP0057.html
#
# Support new if() IN_LIST operator.
cmake_policy(SET CMP0057 NEW)

# https://www.cmake.org/cmake/help/latest/policy/CMP0063.html
#
# Adapted from Apache Kudu:
# https://github.com/apache/kudu/commit/bd549e13743a51013585
#
# Honor visibility properties for all target types.
cmake_policy(SET CMP0063 NEW)

# https://cmake.org/cmake/help/latest/policy/CMP0068.html
#
# RPATH settings on macOS do not affect install_name.
cmake_policy(SET CMP0068 NEW)

# https://cmake.org/cmake/help/latest/policy/CMP0074.html
#
# find_package() uses <PackageName>_ROOT variables.
cmake_policy(SET CMP0074 NEW)

# https://cmake.org/cmake/help/latest/policy/CMP0091.html
#
# MSVC runtime library flags are selected by an abstraction.
cmake_policy(SET CMP0091 NEW)

# https://cmake.org/cmake/help/latest/policy/CMP0135.html
#
# CMP0135 is for solving re-building and re-downloading. We don't have a real
# problem with the OLD behavior for now but we use the NEW behavior explicitly
# to suppress CMP0135 warnings.
if(POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
endif()

# Arrow: Flight and CUDA components (each pulls in core Arrow as a dependency).
find_package(ArrowFlight REQUIRED)
find_package(ArrowCUDA REQUIRED)
message(STATUS "Found Arrow: ${ARROW_VERSION}")

# The CUDA language is already enabled by project(); this only verifies that a
# toolkit installation is present. FindCUDAToolkit is used instead of the
# deprecated FindCUDA module, which CMake no longer provides once policy
# CMP0146 is NEW.
find_package(CUDAToolkit REQUIRED)
find_package(gflags REQUIRED)
find_package(cudf REQUIRED)
find_package(ucx REQUIRED)

# Convenience umbrella target bundling the UCX components this example uses.
add_library(ucx::ucx INTERFACE IMPORTED)
target_link_libraries(ucx::ucx INTERFACE ucx::ucp ucx::uct ucx::ucs)

add_executable(
  arrow-cudf-flight
  cudf-flight-ucx.cc
  cudf-flight-client.cc
  cudf-flight-server.cc
  ucx_utils.cc
  ucx_server.cc
  ucx_client.cc
  ucx_conn.cc)

# Scoped to the target rather than set directory-wide via add_definitions().
target_compile_definitions(arrow-cudf-flight
                           PRIVATE FMT_USE_NONTYPE_TEMPLATE_ARGS=0)

# An executable has no consumers, so every dependency is PRIVATE.
target_link_libraries(
  arrow-cudf-flight
  PRIVATE arrow_shared
          arrow_cuda_shared
          arrow_flight_shared
          gflags
          cudf::cudf
          ucx::ucx)

set_target_properties(
  arrow-cudf-flight
  PROPERTIES BUILD_RPATH "\$ORIGIN"
             INSTALL_RPATH "\$ORIGIN"
             CXX_STANDARD 20
             CXX_STANDARD_REQUIRED ON
             CXX_EXTENSIONS ON
             CUDA_STANDARD 20
             CUDA_STANDARD_REQUIRED ON)
32 | 33 | ## Building 34 | 35 | You must have libcudf, libarrow, libarrow_flight, libarrow_cuda, and ucx 36 | accessible on your `CMAKE_MODULE_PATH`/`CMAKE_PREFIX_PATH` so that `cmake` can find them. 37 | 38 | After that you can simply do the following: 39 | 40 | ```console 41 | $ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release 42 | $ cmake --build build 43 | ``` 44 | 45 | to build the `arrow-cudf-flight` mainprog. 46 | 47 | ## Running 48 | 49 | You can start the server by just running `arrow-cudf-flight` which will 50 | default to using `31337` as the Flight port and `127.0.0.1` for the host. 51 | Both of these can be changed via the `-port` and `-address` gflags 52 | respectively. 53 | 54 | You can run the client by adding the `-client` option when running the 55 | command. 56 | -------------------------------------------------------------------------------- /dissociated-ipc/cudf-flight-ucx.cc: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "cudf-flight-ucx.h" 24 | 25 | DEFINE_int32(port, 31337, "port to listen or connect"); 26 | DEFINE_string(address, "127.0.0.1", "address to connect to"); 27 | DEFINE_bool(client, false, "run the client"); 28 | 29 | int main(int argc, char** argv) { 30 | arrow::util::ArrowLog::StartArrowLog("cudf-flight-poc", 31 | arrow::util::ArrowLogLevel::ARROW_DEBUG); 32 | 33 | gflags::ParseCommandLineFlags(&argc, &argv, true); 34 | if (FLAGS_client) { 35 | ARROW_CHECK_OK(run_client(FLAGS_address, FLAGS_port)); 36 | } else { 37 | ARROW_CHECK_OK(run_server(FLAGS_address, FLAGS_port)); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /dissociated-ipc/cudf-flight-ucx.h: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #pragma once 19 | 20 | #include 21 | 22 | // Define some constants for the `want_data` tags 23 | static constexpr ucp_tag_t kWantDataTag = 0x00000DEADBA0BAB0; 24 | static constexpr ucp_tag_t kWantCtrlTag = 0xFFFFFDEADBA0BAB0; 25 | // define a mask to check the tag 26 | static constexpr ucp_tag_t kWantCtrlMask = 0xFFFFF00000000000; 27 | 28 | // constant for the bit shift to make the data body type the most 29 | // significant byte. NOTE(review): a shift of 55 makes the value start at bit 55, one bit below the top byte (bits 56-63) of a 64-bit tag -- confirm the intended tag layout before relying on or changing it 30 | static constexpr int kShiftBodyType = 55; 31 | 32 | enum class MetadataMsgType : uint8_t { 33 | EOS = 0, 34 | METADATA = 1, 35 | }; 36 | 37 | arrow::Status run_server(const std::string& addr, const int port); 38 | arrow::Status run_client(const std::string& addr, const int port); -------------------------------------------------------------------------------- /dissociated-ipc/ucx_client.cc: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #include "ucx_client.h" 19 | #include "ucx_utils.h" 20 | 21 | #include 22 | #include 23 | 24 | arrow::Status UcxClient::Init(const std::string& host, const int32_t port) { 25 | ucp_config_t* ucp_config; 26 | ucp_params_t ucp_params; 27 | ucs_status_t status; 28 | 29 | status = ucp_config_read(nullptr, nullptr, &ucp_config); 30 | ARROW_RETURN_NOT_OK(utils::FromUcsStatus("ucp_config_read", status)); 31 | 32 | // if location is IPv6 must adjust UCX config 33 | // we assume locations always resolve to IPv6 or IPv4 34 | // but that's not necessarily true 35 | ARROW_ASSIGN_OR_RAISE(addrlen_, utils::to_sockaddr(host, port, &connect_addr_)); 36 | if (connect_addr_.ss_family == AF_INET6) { 37 | ARROW_RETURN_NOT_OK(utils::FromUcsStatus( 38 | "ucp_config_modify", ucp_config_modify(ucp_config, "AF_PRIO", "inet6"))); 39 | } 40 | 41 | std::memset(&ucp_params, 0, sizeof(ucp_params)); 42 | ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES; 43 | ucp_params.features = UCP_FEATURE_WAKEUP | UCP_FEATURE_AM | UCP_FEATURE_RMA | 44 | UCP_FEATURE_STREAM | UCP_FEATURE_TAG; 45 | 46 | ucp_context_h ucp_context; 47 | status = ucp_init(&ucp_params, ucp_config, &ucp_context); 48 | ucp_config_release(ucp_config); 49 | 50 | ARROW_RETURN_NOT_OK(utils::FromUcsStatus("ucp_init", status)); 51 | ucp_context_.reset(new utils::UcpContext(ucp_context)); 52 | return arrow::Status::OK(); 53 | } 54 | 55 | arrow::Result> UcxClient::CreateConn() { 56 | ucp_worker_params_t worker_params; 57 | std::memset(&worker_params, 0, sizeof(worker_params)); 58 | worker_params.field_mask = 59 | UCP_WORKER_PARAM_FIELD_THREAD_MODE | UCP_WORKER_PARAM_FIELD_FLAGS; 60 | worker_params.thread_mode = UCS_THREAD_MODE_MULTI; 61 | worker_params.flags = UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK; 62 | 63 | ucp_worker_h ucp_worker; 64 | ucs_status_t status = 65 | ucp_worker_create(ucp_context_->get(), &worker_params, &ucp_worker); 66 | ARROW_RETURN_NOT_OK(utils::FromUcsStatus("ucp_worker_create", status)); 67 | 68 | auto cnxn = 
std::make_unique( 69 | std::make_shared(ucp_context_, ucp_worker)); 70 | ARROW_RETURN_NOT_OK(cnxn->CreateEndpoint(connect_addr_, addrlen_)); 71 | 72 | return cnxn; 73 | } 74 | -------------------------------------------------------------------------------- /dissociated-ipc/ucx_client.h: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #pragma once 19 | 20 | #include "ucx_conn.h" 21 | #include "ucx_utils.h" 22 | 23 | #include 24 | #include 25 | 26 | #include 27 | // Client side of the UCX transport: Init() stores the target address and the UCP context, CreateConn() returns a new connection. 28 | class UcxClient { 29 | public: 30 | UcxClient() = default; 31 | ~UcxClient() = default; 32 | 33 | arrow::Status Init(const std::string& host, const int32_t port); 34 | arrow::Result> CreateConn(); 35 | 36 | private: 37 | std::shared_ptr ucp_context_; 38 | struct sockaddr_storage connect_addr_{}; 39 | size_t addrlen_{0}; 40 | }; 41 | -------------------------------------------------------------------------------- /dissociated-ipc/ucx_conn.h: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #pragma once 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "arrow/util/logging.h" 26 | #include "ucx_utils.h" 27 | 28 | namespace utils { 29 | class Connection { 30 | public: 31 | explicit Connection(std::shared_ptr worker); 32 | Connection(std::shared_ptr worker, ucp_ep_h endpoint); 33 | 34 | ARROW_DISALLOW_COPY_AND_ASSIGN(Connection); 35 | ARROW_DEFAULT_MOVE_AND_ASSIGN(Connection); 36 | ~Connection() { DCHECK(!ucp_worker_) << "Connection was not closed!"; } 37 | 38 | arrow::Status CreateEndpoint(ucp_conn_request_h request); 39 | arrow::Status CreateEndpoint(const sockaddr_storage& addr, const size_t addrlen); 40 | arrow::Status Flush(); 41 | arrow::Status Close(); 42 | inline bool is_closed() const { return closed_; } 43 | inline unsigned int Progress() { return ucp_worker_progress(ucp_worker_->get()); } 44 | inline ucs_status_t WorkerWait() { return ucp_worker_wait(ucp_worker_->get()); } 45 | 46 | arrow::Status SetAMHandler(unsigned int id, void* user_data, ucp_am_recv_callback_t cb); 47 | 48 | arrow::Result> ProbeForTag( 49 | ucp_tag_t tag, ucp_tag_t mask, int remove); 50 | arrow::Result> ProbeForTagSync( 51 | ucp_tag_t tag, ucp_tag_t mask, int remove); 52 | arrow::Status RecvTagData(ucp_tag_message_h msg, void* buffer, const size_t count, 53 | void* user_data, ucp_tag_recv_nbx_callback_t cb, 54 | const ucs_memory_type_t memory_type); 55 | ucs_status_t RecvAM(std::promise> p, const void* header, 56 | const size_t header_length, void* data, const size_t data_length, 57 | const ucp_am_recv_param_t* param); 58 | 59 | arrow::Status SendAM(unsigned int id, const void* data, const int64_t size); 60 | arrow::Status SendAMIov(unsigned int id, const ucp_dt_iov_t* iov, const size_t iov_cnt, 61 | void* user_data, ucp_send_nbx_callback_t cb, 62 | const ucs_memory_type_t memory_type); 63 | arrow::Status SendTagIov(ucp_tag_t tag, const ucp_dt_iov_t* iov, const size_t iov_cnt, 64 | void* user_data, ucp_send_nbx_callback_t cb, 65 | 
const ucs_memory_type_t memory_type); 66 | arrow::Status SendTagSync(ucp_tag_t tag, const void* buffer, const size_t count); 67 | 68 | protected: 69 | static void err_cb(void* arg, ucp_ep_h ep, ucs_status_t status) { 70 | if (!is_ignorable_disconnect_error(status)) { 71 | ARROW_LOG(DEBUG) << FromUcsStatus("error handling callback", status).ToString(); 72 | } 73 | Connection* cnxn = reinterpret_cast(arg); 74 | cnxn->closed_ = true; 75 | } 76 | 77 | inline arrow::Status CheckClosed() { 78 | if (!remote_endpoint_) { 79 | return arrow::Status::Invalid("connection is closed"); 80 | } 81 | return arrow::Status::OK(); 82 | } 83 | 84 | private: 85 | std::shared_ptr ucp_worker_; 86 | ucp_ep_h remote_endpoint_{nullptr}; 87 | // set by err_cb when UCX reports an endpoint error 88 | bool closed_{false}; 89 | }; 90 | } // namespace utils 91 | -------------------------------------------------------------------------------- /dissociated-ipc/ucx_server.h: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #pragma once 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "ucx_conn.h" 28 | #include "ucx_utils.h" 29 | 30 | #include "arrow/flight/types.h" 31 | #include "arrow/gpu/cuda_context.h" 32 | #include "arrow/status.h" 33 | 34 | class UcxServer { 35 | public: 36 | virtual ~UcxServer() = default; 37 | arrow::Status Init(const std::string& host, const int32_t port); 38 | 39 | arrow::Status Wait(); 40 | virtual arrow::Status Shutdown(); 41 | 42 | inline void set_cuda_context(std::shared_ptr ctx) { 43 | cuda_context_ = std::move(ctx); 44 | } 45 | 46 | protected: 47 | inline arrow::flight::Location location() const { return location_; } 48 | 49 | struct ClientWorker { 50 | std::shared_ptr worker_; 51 | std::unique_ptr conn_; 52 | }; 53 | 54 | virtual arrow::Status setup_handlers(ClientWorker* worker) = 0; 55 | virtual arrow::Status do_work(ClientWorker* worker) = 0; 56 | 57 | private: 58 | static void HandleIncomingConnection(ucp_conn_request_h connection_request, 59 | void* data) { 60 | UcxServer* server = reinterpret_cast(data); 61 | server->EnqueueClient(connection_request); 62 | } 63 | 64 | void DriveConnections(); 65 | void EnqueueClient(ucp_conn_request_h connection_request) { 66 | std::lock_guard guard(pending_connections_mutex_); 67 | pending_connections_.push(connection_request); 68 | } 69 | 70 | void HandleConnection(ucp_conn_request_h request); 71 | arrow::Result> CreateWorker(); 72 | 73 | protected: 74 | std::atomic counter_{0}; 75 | arrow::flight::Location location_; 76 | std::shared_ptr ucp_context_; 77 | std::shared_ptr worker_conn_; 78 | ucp_listener_h listener_{nullptr}; 79 | // explicit initial value: a default-constructed std::atomic is uninitialized before C++20 80 | std::atomic listening_{false}; 81 | std::thread listener_thread_; 82 | // std::thread::join cannot be called concurrently 83 | std::mutex join_mutex_; 84 | std::mutex pending_connections_mutex_; 85 | std::queue pending_connections_; 86 | 87 | std::shared_ptr cuda_context_; 88 | }; 89 | 
-------------------------------------------------------------------------------- /dissociated-ipc/ucx_utils.h: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #pragma once 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace utils { 33 | static inline void Uint32ToBytesLE(const uint32_t in, uint8_t* out) { 34 | arrow::util::SafeStore(out, arrow::bit_util::ToLittleEndian(in)); 35 | } 36 | 37 | static inline uint32_t BytesToUint32LE(const uint8_t* in) { 38 | return arrow::bit_util::FromLittleEndian(arrow::util::SafeLoadAs(in)); 39 | } 40 | 41 | class UcpContext final { 42 | public: 43 | UcpContext() = default; 44 | explicit UcpContext(ucp_context_h context) : ucp_context_(context) {} 45 | ~UcpContext() { 46 | if (ucp_context_) ucp_cleanup(ucp_context_); 47 | ucp_context_ = nullptr; 48 | } 49 | 50 | ucp_context_h get() const { 51 | DCHECK(ucp_context_); 52 | return ucp_context_; 53 | } 54 | 55 | private: 56 | ucp_context_h ucp_context_{nullptr}; 57 | }; 58 | // Owns a UCP worker handle and holds a reference to its context; the destructor body destroys the worker before the context reference is dropped. 59 | class UcpWorker final { 60 | public: 61 | UcpWorker() = default; 62 | UcpWorker(std::shared_ptr context, ucp_worker_h worker) 63 | : ucp_worker_(worker), ucp_context_(std::move(context)) {} 64 | ~UcpWorker() { 65 | if (ucp_worker_) ucp_worker_destroy(ucp_worker_); 66 | ucp_worker_ = nullptr; 67 | } 68 | 69 | ucp_worker_h get() const { return ucp_worker_; } 70 | const UcpContext& context() const { return *ucp_context_; } 71 | 72 | private: 73 | ucp_worker_h ucp_worker_{nullptr}; 74 | std::shared_ptr ucp_context_; 75 | }; 76 | 77 | class UcxStatusDetail : public arrow::StatusDetail { 78 | public: 79 | explicit UcxStatusDetail(ucs_status_t status) : status_(status) {} 80 | static constexpr char const kTypeId[] = "ucx::UcxStatusDetail"; 81 | 82 | const char* type_id() const override { return kTypeId; } 83 | std::string ToString() const override; 84 | static ucs_status_t Unwrap(const arrow::Status& status); 85 | 86 | private: 87 | ucs_status_t status_; 88 | }; 89 | 90 | arrow::Status FromUcsStatus(const std::string& context, ucs_status_t ucs_status); 91
| 92 | class UcxDataBuffer : public arrow::Buffer { 93 | public: 94 | UcxDataBuffer(std::shared_ptr worker, void* data, const size_t size) 95 | : arrow::Buffer(reinterpret_cast(data), static_cast(size)), 96 | worker_(std::move(worker)) {} 97 | ~UcxDataBuffer() override { 98 | ucp_am_data_release(worker_->get(), 99 | const_cast(reinterpret_cast(data()))); 100 | } 101 | 102 | private: 103 | std::shared_ptr worker_; 104 | }; 105 | 106 | arrow::Result to_sockaddr(const std::string& host, const int32_t port, 107 | struct sockaddr_storage* addr); 108 | arrow::Result SockaddrToString(const struct sockaddr_storage& address); 109 | 110 | static inline bool is_ignorable_disconnect_error(ucs_status_t ucs_status) { 111 | // not connected, connection reset: we're already disconnected 112 | // timeout: most likely disconnected, but we can't tell from our end 113 | switch (ucs_status) { 114 | case UCS_OK: 115 | case UCS_ERR_ENDPOINT_TIMEOUT: 116 | case UCS_ERR_NOT_CONNECTED: 117 | case UCS_ERR_CONNECTION_RESET: 118 | return true; 119 | } 120 | return false; 121 | } 122 | } // namespace utils 123 | -------------------------------------------------------------------------------- /http/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Apache Arrow HTTP Data Transport 21 | 22 | This area of the Apache Arrow Experiments repository is for collaborative prototyping and research on the subject of sending and receiving data in Arrow IPC stream format (IANA media type `application/vnd.apache.arrow.stream`) over HTTP APIs. 23 | 24 | The subdirectories beginning with **get** demonstrate clients receiving data from servers (HTTP GET request). Those beginning with **post** demonstrate clients sending data to servers (HTTP POST request). 
25 | 26 | | Subdirectory | Purpose | 27 | | ------------ | ------- | 28 | | **[get_compressed](get_compressed)** | Demonstrates various ways of using compression when sending and receiving Arrow IPC stream data over HTTP | 29 | | **[get_indirect](get_indirect)** | Demonstrates a two-step sequence for fetching Arrow data from a server, in which a JSON document provides the URIs for the Arrow data | 30 | | **[get_multipart](get_multipart)** | Demonstrates how to send and receive a multipart HTTP response body (`multipart/mixed`) containing Arrow IPC stream data and other data | 31 | | **[get_range](get_range)** | Demonstrates how to use HTTP range requests to download Arrow IPC stream data of known length in multiple requests | 32 | | **[get_simple](get_simple)** | Contains a large set of examples demonstrating the basics of fetching an Arrow IPC stream from a server to a client in 12+ languages | 33 | | **[post_multipart](post_multipart)** | Demonstrates how to send and receive a multipart HTTP request body (`multipart/form-data`) containing Arrow IPC stream data and other data | 34 | | **[post_simple](post_simple)** | Demonstrates the basics of sending Arrow IPC stream data from a client to a server | 35 | 36 | 37 | The intent of this work is to: 38 | - Ensure excellent interoperability across languages. 39 | - Allow implementation within existing HTTP APIs. 40 | - Maximize performance. 41 | - Minimize implementation complexity. 42 | 43 | The end goal of this work is to inform and guide the creation of a set of conventions to be published in the Arrow documentation. 44 | 45 | > [!IMPORTANT] 46 | > Before contributing to this area of the repository, please see the [related discussion on the Arrow developer mailing list](https://lists.apache.org/thread/vfz74gv1knnhjdkro47shzd1z5g5ggnf) and the [Arrow GitHub issue listing the tasks that are a part of this effort](https://github.com/apache/arrow/issues/40465). 
47 | -------------------------------------------------------------------------------- /http/get_compressed/curl/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Compressed Arrow Data Examples 21 | 22 | This directory contains a simple `curl` script that issues multiple HTTP GET 23 | requests to the server implemented in the parent directory, negotiating 24 | different compression algorithms for the Arrow IPC stream data, piping the output 25 | to different files with extensions that indicate the compression algorithm used. 26 | 27 | To run this example, first start one of the server examples in the parent 28 | directory, then run the `client.sh` script. 29 | 30 | You can check all the sizes with a simple command: 31 | 32 | ```bash 33 | $ du -sh out* | sort -gr 34 | 816M out.arrows 35 | 804M out_from_chunked.arrows 36 | 418M out_from_chunked.arrows+lz4 37 | 405M out.arrows+lz4 38 | 257M out.arrows.gz 39 | 256M out_from_chunked.arrows.gz 40 | 229M out_from_chunked.arrows+zstd 41 | 229M out.arrows+zstd 42 | 220M out.arrows.zstd 43 | 219M out_from_chunked.arrows.zstd 44 | 39M out_from_chunked.arrows.br 45 | 38M out.arrows.br 46 | ``` 47 | 48 | > [!WARNING] 49 | > Better compression is not the only relevant metric as it might come with a 50 | > trade-off in terms of CPU usage. The best compression algorithm for your use 51 | > case will depend on your specific requirements. 52 | 53 | ## Meaning of the file extensions 54 | 55 | Files produced by HTTP/1.0 requests are not chunked; they get buffered in memory 56 | at the server before being sent to the client. If compressed, they end up 57 | slightly smaller than the results of chunked responses, but the extra delay before the 58 | first byte is not worth it in most cases. 
59 | 60 | - `out.arrows` (Uncompressed) 61 | - `out.arrows.gz` (Gzip HTTP compression) 62 | - `out.arrows.zstd` (Zstandard HTTP compression) 63 | - `out.arrows.br` (Brotli HTTP compression) 64 | 65 | - `out.arrows+zstd` (Zstandard IPC compression) 66 | - `out.arrows+lz4` (LZ4 IPC compression) 67 | 68 | HTTP/1.1 requests are returned by the server with `Transfer-Encoding: chunked` 69 | to send the data in smaller chunks that are sent to the socket as soon as they 70 | are ready. This is useful for large responses that take a long time to generate 71 | at the cost of a small overhead caused by the independent compression of each 72 | chunk. 73 | 74 | - `out_from_chunked.arrows` (Uncompressed) 75 | - `out_from_chunked.arrows.gz` (Gzip HTTP compression) 76 | - `out_from_chunked.arrows.zstd` (Zstandard HTTP compression) 77 | - `out_from_chunked.arrows.br` (Brotli HTTP compression) 78 | 79 | - `out_from_chunked.arrows+lz4` (LZ4 IPC compression) 80 | - `out_from_chunked.arrows+zstd` (Zstandard IPC compression) 81 | -------------------------------------------------------------------------------- /http/get_compressed/curl/client/client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | CURL="curl --verbose" 21 | URI="http://localhost:8008" 22 | OUT_HTTP1=out.arrows 23 | OUT_CHUNKED=out_from_chunked.arrows 24 | 25 | # HTTP/1.0 means that the response is not chunked and not compressed... 26 | $CURL --http1.0 -o $OUT_HTTP1 $URI 27 | # ...but it may be compressed with an explicitly set Accept-Encoding 28 | # header 29 | $CURL --http1.0 -H "Accept-Encoding: gzip, *;q=0" -o $OUT_HTTP1.gz $URI 30 | $CURL --http1.0 -H "Accept-Encoding: zstd, *;q=0" -o $OUT_HTTP1.zstd $URI 31 | $CURL --http1.0 -H "Accept-Encoding: br, *;q=0" -o $OUT_HTTP1.br $URI 32 | # ...or with IPC buffer compression if the Accept header specifies codecs. 33 | $CURL --http1.0 -H "Accept: application/vnd.apache.arrow.stream; codecs=\"zstd, lz4\"" -o $OUT_HTTP1+zstd $URI 34 | $CURL --http1.0 -H "Accept: application/vnd.apache.arrow.stream; codecs=lz4" -o $OUT_HTTP1+lz4 $URI 35 | 36 | # HTTP/1.1 means compression is on by default... 37 | # ...but it can be refused with the Accept-Encoding: identity header. 38 | $CURL -H "Accept-Encoding: identity" -o $OUT_CHUNKED $URI 39 | # ...the response is compressed with gzip if no Accept-Encoding header is set. 40 | $CURL -o $OUT_CHUNKED.gz $URI 41 | # ...or with the compression algorithm specified in the Accept-Encoding header. 42 | $CURL -H "Accept-Encoding: zstd, *;q=0" -o $OUT_CHUNKED.zstd $URI 43 | $CURL -H "Accept-Encoding: br, *;q=0" -o $OUT_CHUNKED.br $URI 44 | # ...or with IPC buffer compression if the Accept header specifies codecs. 
45 | $CURL -H "Accept: application/vnd.apache.arrow.stream; codecs=\"zstd, lz4\"" -o $OUT_CHUNKED+zstd $URI 46 | $CURL -H "Accept: application/vnd.apache.arrow.stream; codecs=lz4" -o $OUT_CHUNKED+lz4 $URI 47 | -------------------------------------------------------------------------------- /http/get_compressed/python/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Compressed Arrow Data Examples 21 | 22 | This directory contains an HTTP client implemented in Python that issues multiple 23 | requests to one of the server examples implemented in the parent directory, 24 | negotiating different compression algorithms for the Arrow IPC stream data. 25 | 26 | To run this example, first start one of the compressed server examples in the 27 | parent directory, then: 28 | 29 | ```sh 30 | pip install pyarrow 31 | python client.py 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_compressed/python/client/client.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | import urllib.request 19 | import pyarrow as pa 20 | import time 21 | 22 | URI = "http://localhost:8008" 23 | ARROW_STREAM_FORMAT = "application/vnd.apache.arrow.stream" 24 | 25 | 26 | def make_request(uri, compression): 27 | coding = "identity" if compression.startswith("identity") else compression 28 | # urllib.request.urlopen() always sends an HTTP/1.1 request 29 | # with Accept-Encoding: identity, so we need to setup a request 30 | # object with custom headers to request a specific compression 31 | headers = { 32 | "Accept-Encoding": f"{coding}, *;q=0", 33 | } 34 | if compression.startswith("identity+"): 35 | # request IPC buffer compression instead of HTTP compression 36 | ipc_codec = compression.split("+")[1] 37 | headers["Accept"] = f'{ARROW_STREAM_FORMAT};codecs="{ipc_codec}"' 38 | request = urllib.request.Request(uri, headers=headers) 39 | 40 | response = urllib.request.urlopen(request) 41 | content_type = response.headers["Content-Type"] 42 | if not content_type.startswith(ARROW_STREAM_FORMAT): 43 | raise ValueError(f"Expected {ARROW_STREAM_FORMAT}, got {content_type}") 44 | if compression.startswith("identity"): 45 | return response 46 | # IANA nomenclature for Brotli is "br" and not "brotli" 47 | compression = "brotli" if compression == "br" else compression 48 | return pa.CompressedInputStream(response, compression) 49 | 50 | 51 | def request_and_process(uri, compression): 52 | batches = [] 53 | log_prefix = f"{'[' + compression + ']':>10}:" 54 | print( 55 | f"{log_prefix} Requesting data from {uri} with `{compression}` compression strategy." 
56 | ) 57 | start_time = time.time() 58 | response = make_request(uri, compression) 59 | with pa.ipc.open_stream(response) as reader: 60 | schema = reader.schema 61 | time_to_schema = time.time() - start_time 62 | try: 63 | batch = reader.read_next_batch() 64 | time_to_first_batch = time.time() - start_time 65 | batches.append(batch) 66 | while True: 67 | batch = reader.read_next_batch() 68 | batches.append(batch) 69 | except StopIteration: 70 | pass 71 | processing_time = time.time() - start_time 72 | reader_stats = reader.stats 73 | print( 74 | f"{log_prefix} Schema received in {time_to_schema:.3f} seconds." 75 | f" schema=({', '.join(schema.names)})." 76 | ) 77 | print( 78 | f"{log_prefix} First batch received and processed in" 79 | f" {time_to_first_batch:.3f} seconds" 80 | ) 81 | print( 82 | f"{log_prefix} Processing of all batches completed in" 83 | f" {processing_time:.3f} seconds." 84 | ) 85 | print(f"{log_prefix}", reader_stats) 86 | return batches 87 | 88 | 89 | # HTTP compression 90 | request_and_process(URI, "identity") 91 | request_and_process(URI, "zstd") 92 | request_and_process(URI, "br") 93 | request_and_process(URI, "gzip") 94 | # using IPC buffer compression instead of HTTP compression 95 | request_and_process(URI, "identity+zstd") 96 | request_and_process(URI, "identity+lz4") 97 | -------------------------------------------------------------------------------- /http/get_compressed/python/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | 21 | # HTTP GET Arrow Data: Compressed Arrow Data Examples 22 | 23 | This directory contains an example of an HTTP server implemented in Python 24 | able to serve Arrow IPC streams compressed with different algorithms negotiated 25 | with the client via different standard HTTP headers. 
26 | 27 | To run this example: 28 | 29 | ```sh 30 | pip install pyarrow 31 | python server.py 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_indirect/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Indirect Examples 21 | 22 | This directory contains examples of HTTP clients and servers that use a two-step sequence to retrieve Arrow data: 23 | 1. The client sends a GET request to a server and receives a JSON response from the server containing one or more server URIs. 24 | 2. The client sends GET requests to each of those URIs and receives a response from each server containing an Arrow IPC stream of record batches (exactly as in the [simple GET examples](https://github.com/apache/arrow-experiments/tree/main/http/get_simple)). 25 | 26 | > [!IMPORTANT] 27 | > The structure of the JSON document in these examples is an illustration, not a recommendation. Developers should use JSON document structures appropriate to the needs of their applications. 28 | -------------------------------------------------------------------------------- /http/get_indirect/curl/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | *.arrows 19 | -------------------------------------------------------------------------------- /http/get_indirect/curl/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Indirect curl Client Example 21 | 22 | This directory contains an example of a series of shell commands that use `curl` and `jq` to: 23 | 1. Send a GET request to the server to get a JSON listing of the URIs of a set of `.arrows` files. 24 | 2. Send GET requests to download each of the `.arrows` files from the server to files in the current directory. 25 | -------------------------------------------------------------------------------- /http/get_indirect/curl/client/client.sh: -------------------------------------------------------------------------------- 1 | !/bin/sh 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. 
# Step 1: use curl to get a JSON document containing URIs to
# Arrow stream files, then use jq to extract the URIs (one per line).
uris=$(curl -s -S http://localhost:8008/ | jq -r '.arrow_stream_files[].uri')

# Step 2: use curl to download the files from the URIs in parallel.
# Nothing is downloaded when the listing is empty or the request failed.
if [ -n "$uris" ]; then
  # $uris is intentionally left unquoted so the shell splits it into one
  # argument per URI. `print` is a ksh/zsh builtin and is not available in
  # POSIX sh, so plain word splitting is used instead. Pathname expansion is
  # switched off while splitting so that characters such as `?` or `*` in a
  # URI are passed to curl unchanged.
  set -f
  curl --parallel --remote-name-all $uris
  set +f
fi
17 | 18 | *.arrows 19 | -------------------------------------------------------------------------------- /http/get_indirect/python/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Indirect Python Client Example with Requests 21 | 22 | This directory contains an example of an HTTP client implemented in Python using the [Requests](https://requests.readthedocs.io/) library. The client: 23 | 1. Sends a GET request to the server to get a JSON listing of the URIs of available `.arrows` files. 24 | 2. Sends GET requests to download each of the `.arrows` files from the server. 25 | 3. Loads the contents of each file into an in-memory PyArrow Table. 26 | 27 | To run this example, first start one of the indirect server examples in the parent directory, then: 28 | 29 | ```sh 30 | pip install requests pyarrow 31 | python client.py 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_indirect/python/client/client.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
import requests
import json
import os
import urllib.parse
import pyarrow as pa


HOST = "http://localhost:8008/"

JSON_FORMAT = "application/json"
ARROW_STREAM_FORMAT = "application/vnd.apache.arrow.stream"


def uri_filename(uri):
    """Return the file name that a URI points to.

    Only the path component of the URI is considered, so a query string or
    fragment does not end up in the name, and percent-encoded characters
    (for example ``%20``) are decoded.
    """
    path = urllib.parse.urlsplit(uri).path
    return os.path.basename(urllib.parse.unquote(path))


# Step 1: request the JSON document that lists the Arrow stream files.
json_response = requests.get(HOST)

response_status = json_response.status_code
if response_status != 200:
    raise ValueError(f"Expected response status 200, got {response_status}")

# The header may carry parameters (e.g. "; charset=utf-8"), hence startswith.
content_type = json_response.headers.get("Content-Type", "")
if not content_type.startswith(JSON_FORMAT):
    raise ValueError(f"Expected content type {JSON_FORMAT}, got {content_type}")

print("Downloaded JSON file listing.")

parsed_data = json_response.json()
uris = [file["uri"] for file in parsed_data["arrow_stream_files"]]

if not all(uri_filename(uri).endswith(".arrows") for uri in uris):
    raise ValueError("Some listed files do not have extension '.arrows'")

print(f"Parsed JSON and found {len(uris)} Arrow stream files.")

# Maps table name (file name without extension) -> pyarrow.Table
tables = {}

# Step 2: download every listed file and load it into memory.
for uri in uris:
    arrow_response = requests.get(uri)

    response_status = arrow_response.status_code
    if response_status != 200:
        raise ValueError(f"Expected response status 200, got {response_status}")

    content_type = arrow_response.headers.get("Content-Type", "")
    if not content_type.startswith(ARROW_STREAM_FORMAT):
        raise ValueError(f"Expected content type {ARROW_STREAM_FORMAT}, got {content_type}")

    filename = uri_filename(uri)

    print(f"Downloaded file '{filename}'.")

    tablename = os.path.splitext(filename)[0]
    with pa.ipc.open_stream(arrow_response.content) as reader:
        tables[tablename] = reader.read_all()

    print(f"Loaded into in-memory Arrow table '{tablename}'.")
-------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Indirect Python Server Example 21 | 22 | This directory contains an example of an HTTP server implemented in Python using the built-in [`http.server`](https://docs.python.org/3/library/http.server.html) module. The server: 23 | 1. Listens for HTTP GET requests from clients. 24 | 2. Upon receiving a GET request for the document root, serve a JSON document that lists the URIs of all the `.arrows` files in the current directory. 25 | 3. Upon receiving a GET request for a specific `.arrows` file, serve that file. 26 | 27 | To run this example, first copy two `.arrows` files from the `data` section of this repository into the current directory: 28 | 29 | ```sh 30 | cp ../../../../data/arrow-commits/arrow-commits.arrows . 31 | cp ../../../../data/rand-many-types/random.arrows . 32 | ``` 33 | 34 | Then start the HTTP server: 35 | 36 | ```sh 37 | python server.py 38 | ``` 39 | 40 | In this example, the JSON document listing the URIs of the `.arrows` files is structured as shown below. **This JSON structure is provided for example purposes only. It is not a recommendation.** Developers should use JSON document structures appropriate to the needs of their applications. 41 | 42 | ```json 43 | { 44 | "arrow_stream_files": [ 45 | { 46 | "uri": "http://127.0.0.1:8008/random.arrows" 47 | }, 48 | { 49 | "uri": "http://127.0.0.1:8008/arrow-commits.arrows" 50 | } 51 | ] 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /http/get_indirect/python/server/server.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
from http.server import SimpleHTTPRequestHandler, HTTPServer
import json
import os
import mimetypes
import urllib.parse

mimetypes.add_type("application/vnd.apache.arrow.stream", ".arrows")


class MyServer(SimpleHTTPRequestHandler):
    """Static file handler that answers directory requests with a JSON listing.

    Requests for a file are handled by SimpleHTTPRequestHandler. Requests for
    a directory return a JSON document listing the URIs of the `.arrows`
    files in that directory instead of the default HTML index.
    """

    # Register the media type on the handler itself as well: older Python
    # versions copy the mimetypes table into `extensions_map` when
    # http.server is imported, i.e. before the add_type() call above runs.
    extensions_map = {
        **SimpleHTTPRequestHandler.extensions_map,
        ".arrows": "application/vnd.apache.arrow.stream",
    }

    def list_directory(self, path):
        """Send a JSON document listing the `.arrows` files found in `path`.

        Parameters
        ----------
        path : str
            File system path of the requested directory.

        Returns
        -------
        None
            The response is written here in full, so there is no file object
            for the caller to copy to the client.
        """
        host, port = self.server.server_address

        try:
            file_names = sorted(
                f for f in os.listdir(path)
                if f.endswith(".arrows") and os.path.isfile(os.path.join(path, f))
            )
        except OSError:
            self.send_error(404, "No permission to list directory")
            return None

        # Build the URIs from the request path only: drop any query string or
        # fragment and make sure there is exactly one separator before the
        # file name.
        base_path = self.path.split("?", 1)[0].split("#", 1)[0]
        if not base_path.endswith("/"):
            base_path += "/"

        # Percent-encode file names so the URIs stay valid when a name
        # contains spaces or other reserved characters.
        file_uris = [
            f"http://{host}:{port}{base_path}{urllib.parse.quote(f)}"
            for f in file_names
        ]
        uris_doc = {"arrow_stream_files": [{"uri": f} for f in file_uris]}
        body = json.dumps(uris_doc, indent=4).encode("utf-8")

        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
        return None


server_address = ("localhost", 8008)


def main():
    """Serve the current directory until interrupted with Ctrl+C."""
    httpd = HTTPServer(server_address, MyServer)
    try:
        print(f"Serving on {server_address[0]}:{server_address[1]}...")
        httpd.serve_forever()
    except KeyboardInterrupt:
        print("Shutting down server")
    finally:
        # server_close() is the supported way to release the listening socket.
        httpd.server_close()


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Multipart Examples 21 | 22 | This directory contains examples of HTTP servers/clients that send/receive a multipart response (`Content-Type: multipart/mixed`) containing JSON data (`Content-Type: application/json`), Arrow IPC stream data (`Content-Type: application/vnd.apache.arrow.stream`), and (optionally) plain text data (`Content-Type: text/plain`). 23 | 24 | ## Picking a Boundary 25 | 26 | The `multipart/mixed` response format uses a boundary string to separate the 27 | parts. This string **must not appear in the content of any part** according 28 | to RFC 1341.[^1] 29 | 30 | We **do not recommend** checking for the boundary string in the content of the 31 | parts, as that would prevent streaming them, which in turn would increase the memory 32 | usage of the server and waste CPU time. 33 | 34 | ### Recommended Algorithm 35 | 36 | For every `multipart/mixed` response produced by the server: 37 | 1. Using a CSPRNG,[^2] generate a byte string of enough entropy to make the 38 | probability of collision[^3] negligible (at least 160 bits = 20 bytes).[^4] 39 | 2. Encode the byte string in a way that is safe to use in HTTP headers. We 40 | recommend using `base64url` encoding described in RFC 4648.[^5] 41 | 42 | `base64url` encoding is a variant of `base64` encoding that uses `-` and `_` 43 | instead of `+` and `/` respectively. It also omits padding characters (`=`). 44 | 45 | This algorithm can be implemented in Python using the `secrets.token_urlsafe()` 46 | function. 47 | 48 | If you generate a boundary string with a generous 224 bits of entropy 49 | (i.e. 28 bytes), the base64url encoding will produce a 38-character 50 | string which is well below the limit defined by RFC 1341 (70 characters).
51 | 52 | >>> import secrets 53 | >>> boundary = secrets.token_urlsafe(28) 54 | >>> len(boundary) 55 | 38 56 | 57 | 58 | [^1]: [RFC 1341 - Section 7.2 The Multipart Content-Type](https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html) 59 | [^2]: [Cryptographically Secure Pseudo-Random Number Generator](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) 60 | [^3]: [Birthday Problem](https://en.wikipedia.org/wiki/Birthday_problem) 61 | [^4]: [Hash Collision Probabilities](https://preshing.com/20110504/hash-collision-probabilities/) 62 | [^5]: [RFC 4648 - Section 5 Base 64 Encoding with URL and Filename Safe Alphabet](https://tools.ietf.org/html/rfc4648#section-5) 63 | -------------------------------------------------------------------------------- /http/get_multipart/python/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data in multipart/mixed: Python Client Example 21 | 22 | This directory contains an example of a Python HTTP client that receives a 23 | `multipart/mixed` response from the server. The client: 24 | 1. Sends an HTTP GET request to a server. 25 | 2. Receives an HTTP 200 response from the server, with the response body 26 | containing a `multipart/mixed` response. 27 | 3. Parses the `multipart/mixed` response using the `email` module.[^1] 28 | 4. Extracts the JSON part, parses it and prints a preview of the JSON data. 29 | 5. Extracts the Arrow stream part, reads the Arrow stream, and sums the 30 | total number of records in the entire Arrow stream. 31 | 6. Extracts the plain text part and prints it as it is. 
32 | 33 | To run this example, first start one of the server examples in the parent 34 | directory, then: 35 | 36 | ```sh 37 | pip install pyarrow 38 | python simple_client.py 39 | ``` 40 | 41 | > [!WARNING] 42 | > This `simple_client.py` parses the multipart response using the multipart 43 | > message parser from the Python `email` module. This module puts the entire 44 | > message in memory and seems to spend a lot of time looking for part delimiter 45 | > and encoding/decoding the parts. 46 | > 47 | > The overhead of `multipart/mixed` parsing is 85% on my machine and after the 48 | > ~1GB Arrow Stream message is fully in memory, it takes only 0.06% of the total 49 | > execution time to parse it. 50 | 51 | [^1]: The `multipart/mixed` standard, used by HTTP, is derived from the MIME 52 | standard used in email. 53 | -------------------------------------------------------------------------------- /http/get_multipart/python/client/simple_client.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
from email import policy
import email
import email.parser
import json
import pyarrow as pa
import sys
import time
import urllib.request

JSON_FORMAT = "application/json"
TEXT_FORMAT = "text/plain"
ARROW_STREAM_FORMAT = "application/vnd.apache.arrow.stream"

start_time = time.time()
response_parsing_time = 0  # time to parse the multipart message
arrow_stream_parsing_time = 0  # time to parse the Arrow stream


def parse_multipart_message(response, boundary, buffer_size=8192):
    """
    Parse a multipart/mixed HTTP response into a list of Message objects.

    The time spent inside the email parser is added to the module-level
    `response_parsing_time`.

    Parameters
    ----------
    response
        Object with a `readinto(buffer)` method yielding the response body.
    boundary : str
        Multipart boundary taken from the Content-Type header.
    buffer_size : int
        Size of the read buffer in bytes. Values below 8192 are raised
        to 8192.

    Returns
    -------
    list of email.message.Message containing the parts of the multipart message.
    """
    global response_parsing_time
    buffer_size = max(buffer_size, 8192)
    buffer = bytearray(buffer_size)

    # The email parser expects MIME headers in front of the body, so a
    # minimal header carrying the boundary is fed in first.
    header = f'MIME-Version: 1.0\r\nContent-Type: multipart/mixed; boundary="{boundary}"\r\n\r\n'
    feedparser = email.parser.BytesFeedParser(policy=policy.default)
    feedparser.feed(header.encode("utf-8"))
    while bytes_read := response.readinto(buffer):
        feed_start = time.time()
        feedparser.feed(buffer[0:bytes_read])
        response_parsing_time += time.time() - feed_start
    close_start = time.time()
    message = feedparser.close()
    response_parsing_time += time.time() - close_start
    assert message.is_multipart()
    return message.get_payload()


def process_json_part(message):
    """
    Parse the JSON part and print a preview of its first entries.

    Returns
    -------
    The decoded JSON data, or None when the payload is not valid JSON (the
    decoding error is reported on stderr).
    """
    assert message.get_content_type() == JSON_FORMAT
    payload = message.get_payload()
    print(f"-- {len(payload)} bytes of JSON data:")
    data = None  # stays None when decoding fails
    try:
        PREVIEW_SIZE = 5
        data = json.loads(payload)
        print("[")
        for i in range(min(PREVIEW_SIZE, len(data))):
            print(f"  {data[i]}")
        if len(data) > PREVIEW_SIZE:
            print(f"  ...+{len(data) - PREVIEW_SIZE} entries...")
        print("]")
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON data: {e}\n", file=sys.stderr)
    return data


def process_arrow_stream_message(message):
    """
    Read the Arrow IPC stream part and print its schema and record count.

    The time spent reading the stream is added to the module-level
    `arrow_stream_parsing_time`.

    Returns
    -------
    list of the record batches read from the stream.
    """
    global arrow_stream_parsing_time
    assert message.get_content_type() == ARROW_STREAM_FORMAT
    payload = message.get_payload(decode=True)
    print(f"-- {len(payload)} bytes of Arrow data:")
    batches = []
    num_records = 0
    read_start = time.time()
    with pa.ipc.open_stream(payload) as reader:
        schema = reader.schema
        print(f"Schema: \n{schema}\n")
        for batch in reader:
            batches.append(batch)
            num_records += batch.num_rows
    arrow_stream_parsing_time += time.time() - read_start
    print(f"Parsed {num_records} records in {len(batches)} batch(es)")
    return batches


def process_text_part(message):
    """Print the plain text part as it is."""
    assert message.get_content_type() == TEXT_FORMAT
    payload = message.get_payload()
    print("-- Text Message:")
    print(payload, end="")
    print("-- End of Text Message --")


response = urllib.request.urlopen("http://localhost:8008?include_footnotes")

content_type = response.headers.get_content_type()
if content_type != "multipart/mixed":
    raise ValueError(f"Expected multipart/mixed Content-Type, got {content_type}")
boundary = response.headers.get_boundary()
if boundary is None or len(boundary) == 0:
    raise ValueError("No multipart boundary found in Content-Type header")

parts = parse_multipart_message(response, boundary, buffer_size=64 * 1024)
batches = None
# Dispatch every part on its content type; unknown types are ignored.
for part in parts:
    content_type = part.get_content_type()
    if content_type == JSON_FORMAT:
        process_json_part(part)
    elif content_type == ARROW_STREAM_FORMAT:
        batches = process_arrow_stream_message(part)
    elif content_type == TEXT_FORMAT:
        process_text_part(part)

end_time = time.time()
execution_time = end_time - start_time

rel_response_parsing_time = response_parsing_time / execution_time
rel_arrow_stream_parsing_time = arrow_stream_parsing_time / execution_time
print(f"{execution_time:.3f} seconds elapsed")
print(
    f"{response_parsing_time:.3f} seconds "
    f"({rel_response_parsing_time * 100:.2f}%) "
    "parsing multipart/mixed response"
)
print(
    f"{arrow_stream_parsing_time:.3f} seconds "
    f"({rel_arrow_stream_parsing_time * 100:.2f}%) "
    "parsing Arrow stream"
)
Arrow IPC streams already offer 47 | > a natural way of chunking large amounts of tabular data. It's not a general 48 | > requirement, but in this example each chunk corresponds to one Arrow record 49 | > batch. 50 | -------------------------------------------------------------------------------- /http/get_range/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Range Request Examples 21 | 22 | This directory contains examples of HTTP servers/clients that send/receive data of known size (`Content-Length`) in the Arrow IPC streaming format and support range requests (`Accept-Ranges: bytes`). 23 | -------------------------------------------------------------------------------- /http/get_range/curl/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | *.arrows 19 | *.arrows.part* 20 | -------------------------------------------------------------------------------- /http/get_range/curl/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Range Request curl Client Example 21 | 22 | This directory contains examples of `curl` commands that send HTTP GET requests with the `Range` request header. 23 | 24 | To run this example, first start one of the range request server examples in the parent directory, then run the shell commands in `client.sh`. 25 | -------------------------------------------------------------------------------- /http/get_range/curl/client/client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
### Use range requests to download an Arrow IPC stream file in two parts

# Get the length of the file `random.arrows` in bytes
curl -I localhost:8008/random.arrows
# Content-Length: 13550776

# Byte ranges are zero-based and inclusive, so a file of 13550776 bytes
# spans bytes 0-13550775 and each half holds 6775388 bytes.

# Download the first half of the file to `random.arrows.part1`
curl -r 0-6775387 localhost:8008/random.arrows -o random.arrows.part1

# Download the second half of the file to `random.arrows.part2`
# (an open-ended range means "from this byte to the end of the file")
curl -r 6775388- localhost:8008/random.arrows -o random.arrows.part2

# Combine the two separate files into one file `random.arrows` then delete them
cat random.arrows.part1 random.arrows.part2 > random.arrows
rm random.arrows.part1 random.arrows.part2

# Clean up
rm random.arrows


### Simulate an interrupted download over a slow connection

# Begin downloading the file at 1M/s but interrupt after five seconds
timeout 5s curl --limit-rate 1M localhost:8008/random.arrows -o random.arrows

# Resume the download at 1M/s (`-C -` continues from the size of the
# partially downloaded file)
curl -C - --limit-rate 1M localhost:8008/random.arrows -o random.arrows

# Clean up
rm random.arrows
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | /**/node_modules 19 | package-lock.json 20 | package.json 21 | *.arrows 22 | -------------------------------------------------------------------------------- /http/get_range/js/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Range Request Node.js Server Example 21 | 22 | The example in this directory shows how to use the Node.js package [`serve`](https://www.npmjs.com/package/serve) (which supports range requests) to serve a static Arrow IPC stream file over HTTP. 23 | 24 | To run this example, copy the file `random.arrows` from the directory `data/rand-many-types/` into the current directory: 25 | 26 | ```sh 27 | cp ../../../../data/rand-many-types/random.arrows . 28 | ``` 29 | 30 | Then start the HTTP server to serve this file: 31 | 32 | ```sh 33 | npx serve -l 8008 34 | ``` 35 | 36 | > [!NOTE] 37 | > The npm package `serve` _should_ automatically set the `Content-Type` header to `application/vnd.apache.arrow.stream` when serving a file with extension `.arrows`, because [the Arrow IPC stream format is officially registered with IANA](https://www.iana.org/assignments/media-types/application/vnd.apache.arrow.stream) and most web servers including `serve` use registration data from IANA to determine the media type of a file based on its file extension and set the `Content-Type` header to that media type when serving a file with that extension. 
However, this is not working with `.arrows` files in the `serve` package, seemingly because of a problem with the npm package [`mimedb`](https://github.com/jshttp/mime-db) which `serve` depends on. So the file `serve.json` is used to set the `Content-Type` header correctly when serving `.arrows` files. 38 | -------------------------------------------------------------------------------- /http/get_range/js/server/serve.json: -------------------------------------------------------------------------------- 1 | { 2 | "headers": [ 3 | { 4 | "source" : "**/*.arrows", 5 | "headers" : [{ 6 | "key" : "Content-Type", 7 | "value" : "application/vnd.apache.arrow.stream" 8 | }] 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /http/get_simple/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Examples 21 | 22 | This directory contains a set of minimal examples of HTTP clients and servers implemented in several languages. These examples demonstrate: 23 | - How a client can send a GET request to a server and receive a response from the server containing an Arrow IPC stream of record batches. 24 | - How a server can respond to a GET request from a client and send the client a response containing an Arrow IPC stream of record batches. 25 | 26 | The examples here assume that the server cannot determine the exact length in bytes of the full Arrow IPC stream before sending it, so they cannot set the `Content-Length` header or serve Range requests. 27 | 28 | Most of the client examples here assume that the client needs to hold the full received data in memory in an Arrow data structure for further in-memory processing. The case in which the client simply writes the result directly to a file is much simpler and is demonstrated by the [curl client example](curl/client). 
29 | 30 | To enable performance comparisons to Arrow Flight RPC, the server examples generate the data in exactly the same way as in [`flight_benchmark.cc`](https://github.com/apache/arrow/blob/7346bdffbdca36492089f6160534bfa2b81bad90/cpp/src/arrow/flight/flight_benchmark.cc#L194-L245) as cited in the [original blog post introducing Flight RPC](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/). But note that Flight example sends four concurrent streams. 31 | 32 | If you are collaborating on the set of examples in this directory, please follow these guidelines: 33 | - Each new example must be implemented as minimally as possible. For example, error handling should be minimized or omitted. 34 | - Each new client example must be tested to ensure that it works with each existing server example. 35 | - Each new server example must be tested to ensure that it works with each existing client example. 36 | - To the greatest extent possible, each new server example should be functionally equivalent to each existing server example (generating equivalent data with the same schema, size, shape, and distribution of values; sending the same HTTP headers; and so on). 37 | - Each new client example must print timing and size information before exiting. If possible this must include the number of seconds elapsed (rounded to the second decimal place) and the number of record batches received. 38 | -------------------------------------------------------------------------------- /http/get_simple/c_glib/client/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | /client 19 | -------------------------------------------------------------------------------- /http/get_simple/c_glib/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple C GLib Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in C with GLib. The client: 23 | 1. Sends an HTTP GET request to a server. 24 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Collects the record batches as they are received. 26 | 27 | To run this example, first start one of the server examples in the parent directory. Then install the `arrow-glib` and `libsoup` C libraries, compile `client.c`, and run the executable. For example, using `clang`: 28 | 29 | ```sh 30 | clang client.c $(pkg-config --cflags --libs arrow-glib libsoup-3.0) -o client 31 | ./client 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/c_glib/client/client.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | int 26 | main(int argc, char **argv) 27 | { 28 | int exit_code = EXIT_FAILURE; 29 | 30 | SoupSession *session = soup_session_new(); 31 | SoupMessage *message = soup_message_new(SOUP_METHOD_GET, 32 | "http://localhost:8008"); 33 | /* Disable keep-alive explicitly. (libsoup uses keep-alive by 34 | * default.) 35 | * 36 | * In general, keep-alive will improve performance when we send 37 | * many GET requests to the same server. But in this case, we send 38 | * only one GET request. So we don't need keep-alive here. 
39 | */ 40 | SoupMessageHeaders *headers = soup_message_get_request_headers(message); 41 | soup_message_headers_append(headers, "connection", "close"); 42 | 43 | GTimer *timer = g_timer_new(); 44 | 45 | GError *error = NULL; 46 | GInputStream *input = soup_session_send(session, message, NULL, &error); 47 | if (error) { 48 | g_printerr("Failed to download: %s\n", error->message); 49 | g_error_free(error); 50 | goto exit; 51 | } 52 | 53 | GArrowGIOInputStream *arrow_input = garrow_gio_input_stream_new(input); 54 | GArrowRecordBatchStreamReader *reader = 55 | garrow_record_batch_stream_reader_new(GARROW_INPUT_STREAM(arrow_input), 56 | &error); 57 | if (error) { 58 | g_printerr("Failed to create reader: %s\n", error->message); 59 | g_error_free(error); 60 | g_object_unref(arrow_input); 61 | goto exit; 62 | } 63 | 64 | GArrowTable *table = 65 | garrow_record_batch_reader_read_all(GARROW_RECORD_BATCH_READER(reader), 66 | &error); 67 | if (error) { 68 | g_printerr("Failed to read record batches: %s\n", error->message); 69 | g_error_free(error); 70 | g_object_unref(reader); 71 | g_object_unref(arrow_input); 72 | goto exit; 73 | } 74 | GArrowChunkedArray *chunked_array = garrow_table_get_column_data(table, 0); 75 | guint n_received_record_batches = 76 | garrow_chunked_array_get_n_chunks(chunked_array); 77 | g_object_unref(chunked_array); 78 | g_object_unref(table); 79 | g_object_unref(reader); 80 | g_object_unref(arrow_input); 81 | 82 | g_timer_stop(timer); 83 | 84 | g_print("%u record batches received\n", n_received_record_batches); 85 | 86 | g_print("%.2f seconds elapsed\n", g_timer_elapsed(timer, NULL)); 87 | 88 | exit_code = EXIT_SUCCESS; 89 | 90 | exit: 91 | g_object_unref(input); 92 | g_timer_destroy(timer); 93 | g_object_unref(message); 94 | g_object_unref(session); 95 | 96 | return exit_code; 97 | } 98 | -------------------------------------------------------------------------------- /http/get_simple/cpp/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | * 19 | !*.* 20 | !*/ 21 | .DS_Store 22 | 23 | -------------------------------------------------------------------------------- /http/get_simple/cpp/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple C++ Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in C++. The client: 23 | 1. Sends an HTTP GET request to a server. 24 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Collects the record batches as they are received. 26 | 27 | To run this example, first start one of the server examples in the parent directory. Then install the `arrow` and `libcurl` C++ libraries, compile `client.cpp`, and run the executable. 
For example, using `clang++`: 28 | 29 | ```sh 30 | clang++ client.cpp -std=c++17 $(pkg-config --cflags --libs arrow libcurl) -o client 31 | ./client 32 | ``` 33 | 34 | > [!NOTE] 35 | > The example here requires version 15.0.0 or higher of the Arrow C++ library because of a bug ([#39163](https://github.com/apache/arrow/issues/39163)) that existed in earlier versions. If you must use an earlier version of the Arrow C++ library, it is possible to implement an HTTP client by using `arrow::ipc::RecordBatchStreamReader` instead of `arrow::ipc::StreamDecoder`. See [this example](https://github.com/apache/arrow/pull/39081/commits/3b937b98295b5dd4f9e297a865a9303a317c9983) for reference. 36 | -------------------------------------------------------------------------------- /http/get_simple/cpp/client/client.cpp: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | static size_t 25 | WriteFunction(void *contents, size_t size, size_t nmemb, void *userp) 26 | { 27 | size_t real_size = size * nmemb; 28 | auto decoder = static_cast(userp); 29 | if (decoder->Consume(static_cast(contents), real_size).ok()) { 30 | return real_size; 31 | } else { 32 | return 0; 33 | } 34 | } 35 | 36 | int main(void) 37 | { 38 | std::string url = "http://localhost:8008"; 39 | 40 | CURL *curl_handle; 41 | CURLcode res; 42 | 43 | // We use arrow::ipc::CollectListener() here for simplicity, 44 | // but another option is to process decoded record batches 45 | // as a stream by overriding arrow::ipc::Listener(). 46 | auto collect_listener = std::make_shared(); 47 | arrow::ipc::StreamDecoder decoder(collect_listener); 48 | 49 | curl_global_init(CURL_GLOBAL_ALL); 50 | curl_handle = curl_easy_init(); 51 | 52 | curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); 53 | curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteFunction); 54 | curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, &decoder); 55 | 56 | auto start_time = std::chrono::steady_clock::now(); 57 | 58 | res = curl_easy_perform(curl_handle); 59 | 60 | printf("%lld record batches received\n", collect_listener->num_record_batches()); 61 | 62 | auto end_time = std::chrono::steady_clock::now(); 63 | 64 | auto time_duration = std::chrono::duration_cast>(end_time - start_time); 65 | printf("%.2f seconds elapsed\n", time_duration.count()); 66 | 67 | curl_easy_cleanup(curl_handle); 68 | curl_global_cleanup(); 69 | 70 | std::vector> record_batches; 71 | record_batches = collect_listener->record_batches(); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /http/get_simple/csharp/client/ArrowHttpClient.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 
or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | using System; 19 | using System.Collections.Generic; 20 | using System.Net.Http; 21 | using System.Threading.Tasks; 22 | using Apache.Arrow; 23 | using Apache.Arrow.Ipc; 24 | 25 | namespace ArrowHttpClient 26 | { 27 | public class Program 28 | { 29 | public static async Task Main(string[] args) 30 | { 31 | string serverUri = "http://localhost:8008/"; 32 | 33 | DateTime startTime = DateTime.UtcNow; 34 | 35 | HttpClient httpClient = new HttpClient 36 | { 37 | BaseAddress = new Uri(serverUri), 38 | }; 39 | 40 | using (var stream = await httpClient.GetStreamAsync(serverUri)) 41 | using (var reader = new ArrowStreamReader(stream)) 42 | { 43 | Console.WriteLine("Connected"); 44 | 45 | List batches = new List(); 46 | 47 | int numRows = 0; 48 | RecordBatch batch; 49 | while ((batch = await reader.ReadNextRecordBatchAsync()) != null) 50 | { 51 | numRows += batch.Length; 52 | batches.Add(batch); 53 | } 54 | Schema schema = reader.Schema; 55 | 56 | DateTime endTime = DateTime.UtcNow; 57 | 58 | Console.WriteLine($"{numRows} records received"); 59 | Console.WriteLine($"{batches.Count} record batches received"); 60 | Console.WriteLine($"{(endTime - startTime).TotalSeconds:F2} seconds elapsed"); // two decimals, like the other client examples 61 | } 62 | 
} 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /http/get_simple/csharp/client/Client.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Exe 5 | net8.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /http/get_simple/csharp/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple C# Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in C#. The client: 23 | 24 | 1. Sends an HTTP GET request to a server. 25 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 26 | 3. Adds the record batches to a list as they are received. 27 | 28 | To run this example, first start one of the server examples in the parent directory, then: 29 | 30 | ```sh 31 | dotnet run -c Release 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/csharp/server/ArrowHttpServer.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | using System; 19 | using System.Collections.Generic; 20 | using System.Linq; 21 | using System.Net; 22 | using System.Threading.Tasks; 23 | using Apache.Arrow; 24 | using Apache.Arrow.Ipc; 25 | using Apache.Arrow.Types; 26 | 27 | namespace ArrowHttpServer 28 | { 29 | public class Program 30 | { 31 | static readonly Schema schema = new Schema( 32 | new Field[] 33 | { 34 | new Field("a", Int64Type.Default, true), 35 | new Field("b", Int64Type.Default, true), 36 | new Field("c", Int64Type.Default, true), 37 | new Field("d", Int64Type.Default, true), 38 | }, null); 39 | 40 | static List GenerateBatches(int totalRecords = 100000000, int batchSize = 4096) 41 | { 42 | Random random = new Random(); 43 | IArrowArray[] columns = new IArrowArray[schema.FieldsList.Count]; 44 | for (int i = 0; i < columns.Length; i++) 45 | { 46 | byte[] dataBytes = new byte[batchSize * sizeof(long)]; 47 | random.NextBytes(dataBytes); 48 | ArrowBuffer data = new ArrowBuffer(dataBytes); 49 | byte[] validityBytes = new byte[batchSize / 8]; 50 | System.Array.Fill(validityBytes, (byte)0xff); 51 | ArrowBuffer validity = new ArrowBuffer(validityBytes); 52 | 53 | columns[i] = ArrowArrayFactory.BuildArray(new ArrayData(schema.FieldsList[i].DataType, batchSize, 0, 0, new [] { validity, data }, null)); 54 | } 55 | 56 | List batches = new List((totalRecords + batchSize - 1) / batchSize); // capacity is the number of batches, not the number of records 57 | using (RecordBatch batch = new RecordBatch(schema, columns, batchSize)) 58 | { 59 | int records = 0; 60 | while (records < totalRecords) 61 | { 62 | RecordBatch newBatch = batch.Clone(); 63 | if 
(records + batchSize > totalRecords) 64 | { 65 | int newLength = totalRecords - records; 66 | newBatch = new RecordBatch(schema, newBatch.Arrays.Select(a => ArrowArrayFactory.Slice(a, 0, newLength)), newLength); 67 | } 68 | 69 | batches.Add(newBatch); // add the clone (sliced for the final short batch), not the template that the using block disposes 70 | records += batchSize; 71 | } 72 | } 73 | 74 | return batches; 75 | } 76 | 77 | public static async Task Main(string[] args) 78 | { 79 | string serverUri = "http://*:8008/"; 80 | 81 | HttpListener listener = new HttpListener(); 82 | listener.Prefixes.Add(serverUri); 83 | listener.Start(); 84 | 85 | Console.Write("Generating data... "); 86 | var batches = GenerateBatches(); 87 | Console.WriteLine("done."); 88 | 89 | while (true) 90 | { 91 | Console.Write("Waiting... "); 92 | var context = await listener.GetContextAsync(); 93 | Console.WriteLine("client connected"); 94 | DateTime startTime = DateTime.UtcNow; 95 | 96 | context.Response.SendChunked = true; 97 | context.Response.StatusCode = (int)HttpStatusCode.OK; 98 | context.Response.ContentType = "application/vnd.apache.arrow.stream"; 99 | 100 | int numRows = 0; 101 | using (context.Response.OutputStream) 102 | using (var writer = new ArrowStreamWriter(context.Response.OutputStream, schema)) 103 | { 104 | foreach (RecordBatch batch in batches) 105 | { 106 | await writer.WriteRecordBatchAsync(batch); 107 | numRows += batch.Length; 108 | } 109 | } 110 | DateTime endTime = DateTime.UtcNow; 111 | 112 | Console.WriteLine("Done"); 113 | Console.WriteLine($"{numRows} records sent"); 114 | Console.WriteLine($"{batches.Count} record batches sent"); 115 | Console.WriteLine($"{(endTime - startTime).TotalSeconds} seconds elapsed"); 116 | } 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /http/get_simple/csharp/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple C# Server Example 21 | 22 | This directory contains a minimal 
example of an HTTP server implemented in C#. The server: 23 | 24 | 1. Creates a list of record batches and populates it with synthesized data. 25 | 2. Listens for HTTP requests from clients. 26 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 27 | 28 | To run this example: 29 | 30 | ```sh 31 | dotnet run -c Release 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/csharp/server/Server.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Exe 5 | net8.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /http/get_simple/curl/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple curl Client Example 21 | 22 | This directory contains a simple `curl` command that: 23 | 1. Sends an HTTP GET request to a server. 24 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Writes the stream of record batches to an Arrow IPC stream file with filename `output.arrows`. 26 | 27 | To run this example, first start one of the server examples in the parent directory, then run the `curl` command. 28 | 29 | ### Reading the Resulting Arrow IPC Stream File 30 | 31 | To read the resulting file `output.arrows` and retrieve the schema and record batches that it contains, you can use one of the code examples below, or use similar examples in other languages that have Arrow implementations. You can also read the file with any application that supports reading data in the Arrow IPC streaming format. 32 | 33 |
34 | Example: Read Arrow IPC stream file with Python 35 | 36 | ```py 37 | import pyarrow as pa 38 | 39 | with open("output.arrows", "rb") as f: 40 | reader = pa.ipc.open_stream(f) 41 | 42 | schema = reader.schema 43 | 44 | batch = reader.read_next_batch() 45 | # ... 46 | 47 | # or alternatively: 48 | batches = [b for b in reader] 49 | ``` 50 |
51 | 52 |
53 | Example: Read Arrow IPC stream file with R 54 | 55 | ```r 56 | library(arrow) 57 | 58 | reader <- RecordBatchStreamReader$create(ReadableFile$create("output.arrows")) 59 | 60 | schema <- reader$schema 61 | 62 | batch <- reader$read_next_batch() 63 | # ... 64 | 65 | # or alternatively: 66 | table <- reader$read_table() 67 | ``` 68 |
69 | -------------------------------------------------------------------------------- /http/get_simple/curl/client/client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | curl -o output.arrows http://localhost:8008 21 | -------------------------------------------------------------------------------- /http/get_simple/go/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | * 19 | !*.* 20 | !*/ 21 | .DS_Store 22 | go.sum 23 | 24 | -------------------------------------------------------------------------------- /http/get_simple/go/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Go Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in Go. The client: 23 | 1. Sends an HTTP GET request to a server. 24 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Adds the record batches to a slice as they are received. 26 | 27 | To run this example, first start one of the server examples in the parent directory, then: 28 | 29 | ```sh 30 | go mod init client 31 | go mod tidy 32 | go build client.go 33 | ./client 34 | ``` 35 | -------------------------------------------------------------------------------- /http/get_simple/go/client/client.go: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "net/http" 22 | "time" 23 | 24 | "github.com/apache/arrow/go/v15/arrow" 25 | "github.com/apache/arrow/go/v15/arrow/ipc" 26 | "github.com/apache/arrow/go/v15/arrow/memory" 27 | ) 28 | 29 | func main() { 30 | start := time.Now() 31 | resp, err := http.Get("http://localhost:8008") 32 | if err != nil { 33 | panic(err) 34 | } 35 | 36 | if resp.StatusCode != http.StatusOK { 37 | panic(fmt.Errorf("got non-200 status: %d", resp.StatusCode)) 38 | } 39 | defer resp.Body.Close() 40 | 41 | rdr, err := ipc.NewReader(resp.Body, ipc.WithAllocator(memory.DefaultAllocator)) 42 | if err != nil { 43 | panic(err) 44 | } 45 | defer rdr.Release() 46 | 47 | batches := make([]arrow.Record, 0) 48 | defer func() { 49 | for _, b := range batches { 50 | b.Release() 51 | } 52 | }() 53 | 54 | for rdr.Next() { 55 | rec := rdr.Record() 56 | rec.Retain() 57 | batches = append(batches, rec) 58 | } 59 | 60 | if rdr.Err() != nil { 61 | panic(rdr.Err()) 62 | } 63 | 64 | execTime := time.Since(start) 65 | 66 | fmt.Printf("%d record batches received\n", len(batches)) 67 | fmt.Printf("%.2f seconds elapsed\n", execTime.Seconds()) 68 | } 69 | -------------------------------------------------------------------------------- /http/get_simple/go/client/go.mod: -------------------------------------------------------------------------------- 1 | module client 2 | 3 | go 1.21.5 4 | 5 | require github.com/apache/arrow/go/v15 v15.0.1 6 | 7 | require ( 8 | github.com/goccy/go-json v0.10.2 // indirect 9 | 
github.com/google/flatbuffers v23.5.26+incompatible // indirect 10 | github.com/klauspost/compress v1.16.7 // indirect 11 | github.com/klauspost/cpuid/v2 v2.2.5 // indirect 12 | github.com/pierrec/lz4/v4 v4.1.18 // indirect 13 | github.com/zeebo/xxh3 v1.0.2 // indirect 14 | golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect 15 | golang.org/x/mod v0.13.0 // indirect 16 | golang.org/x/sys v0.13.0 // indirect 17 | golang.org/x/tools v0.14.0 // indirect 18 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /http/get_simple/go/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Go Server Example 21 | 22 | This directory contains a minimal example of an HTTP server implemented in Go. The server: 23 | 1. Creates a slice of record batches and populates it with synthesized data. 24 | 2. Listens for HTTP GET requests from clients. 25 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 
26 | 27 | To run this example: 28 | 29 | ```sh 30 | go mod init server 31 | go mod tidy 32 | go build server.go 33 | ./server 34 | ``` 35 | -------------------------------------------------------------------------------- /http/get_simple/go/server/go.mod: -------------------------------------------------------------------------------- 1 | module server 2 | 3 | go 1.21.5 4 | 5 | require github.com/apache/arrow/go/v15 v15.0.1 6 | 7 | require ( 8 | github.com/goccy/go-json v0.10.2 // indirect 9 | github.com/google/flatbuffers v23.5.26+incompatible // indirect 10 | github.com/klauspost/compress v1.16.7 // indirect 11 | github.com/klauspost/cpuid/v2 v2.2.5 // indirect 12 | github.com/pierrec/lz4/v4 v4.1.18 // indirect 13 | github.com/zeebo/xxh3 v1.0.2 // indirect 14 | golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect 15 | golang.org/x/mod v0.13.0 // indirect 16 | golang.org/x/sys v0.13.0 // indirect 17 | golang.org/x/tools v0.14.0 // indirect 18 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 19 | ) 20 | -------------------------------------------------------------------------------- /http/get_simple/go/server/server.go: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "log" 22 | "math/rand" 23 | "net/http" 24 | 25 | "github.com/apache/arrow/go/v15/arrow" 26 | "github.com/apache/arrow/go/v15/arrow/array" 27 | "github.com/apache/arrow/go/v15/arrow/ipc" 28 | "github.com/apache/arrow/go/v15/arrow/memory" 29 | ) 30 | 31 | var schema = arrow.NewSchema([]arrow.Field{ 32 | {Name: "a", Type: arrow.PrimitiveTypes.Int64}, 33 | {Name: "b", Type: arrow.PrimitiveTypes.Int64}, 34 | {Name: "c", Type: arrow.PrimitiveTypes.Int64}, 35 | {Name: "d", Type: arrow.PrimitiveTypes.Int64}, 36 | }, nil) 37 | 38 | func GetPutData() []arrow.Record { 39 | const ( 40 | totalRecords = 100000000 41 | length = 4096 42 | ncolumns = 4 43 | seed = 42 44 | ) 45 | 46 | var ( 47 | r = rand.New(rand.NewSource(seed)) 48 | mem = memory.DefaultAllocator 49 | arrs = make([]arrow.Array, 0, ncolumns) 50 | ) 51 | for i := 0; i < ncolumns; i++ { 52 | buf := memory.NewResizableBuffer(mem) 53 | buf.Resize(length * 8) 54 | _, err := r.Read(buf.Buf()) 55 | if err != nil { 56 | panic(err) 57 | } 58 | defer buf.Release() 59 | 60 | data := array.NewData(arrow.PrimitiveTypes.Int64, length, []*memory.Buffer{nil, buf}, nil, 0, 0) 61 | defer data.Release() 62 | a := array.NewInt64Data(data) 63 | defer a.Release() 64 | arrs = append(arrs, a) 65 | } 66 | 67 | batch := array.NewRecord(schema, arrs, length) 68 | defer batch.Release() 69 | 70 | batches := make([]arrow.Record, 0) 71 | records := int64(0) 72 | for records < totalRecords { 73 | if records+length > totalRecords { 74 | lastLen := totalRecords - records 75 | batches = append(batches, batch.NewSlice(0, lastLen)) 76 | records += lastLen 77 | } else { 78 | batch.Retain() 79 | batches = append(batches, batch) 80 | records += length 81 | } 82 | } 83 | 84 | return batches 85 | } 86 | 87 | func main() { 88 | batches := GetPutData() 89 | 90 | http.HandleFunc("/", func(w 
http.ResponseWriter, r *http.Request) { 91 | if r.Method != http.MethodGet { 92 | w.WriteHeader(http.StatusBadRequest) 93 | return 94 | } 95 | 96 | hdrs := w.Header() 97 | 98 | //// set this header to disable chunked transfer encoding: 99 | //hdrs.Add("Transfer-Encoding", "identity") 100 | 101 | //// set these headers if testing with a local browser-based client: 102 | //hdrs.Add("Access-Control-Allow-Origin", "http://localhost:8008") 103 | //hdrs.Add("Access-Control-Allow-Methods", "GET") 104 | //hdrs.Add("Access-Control-Allow-Headers", "content-type") 105 | 106 | hdrs.Add("Content-Type", "application/vnd.apache.arrow.stream") 107 | w.WriteHeader(http.StatusOK) 108 | 109 | wr := ipc.NewWriter(w, ipc.WithSchema(batches[0].Schema())) 110 | defer wr.Close() 111 | 112 | for _, b := range batches { 113 | if err := wr.Write(b); err != nil { 114 | panic(err) 115 | } 116 | } 117 | }) 118 | 119 | fmt.Println("Serving on localhost:8008...") 120 | log.Fatal(http.ListenAndServe(":8008", nil)) 121 | } 122 | -------------------------------------------------------------------------------- /http/get_simple/java/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | /**/target 19 | 20 | -------------------------------------------------------------------------------- /http/get_simple/java/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Java Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in Java. The client: 23 | 1. Sends an HTTP GET request to a server. 24 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Adds the record batches to a list as they are received. 26 | 27 | To run this example, first start one of the server examples in the parent directory, then: 28 | 29 | ```sh 30 | mvn package 31 | _JAVA_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED" mvn exec:java -Dexec.mainClass="ArrowHttpClient" 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/java/client/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 15 | 4.0.0 16 | 17 | com.example 18 | ArrowHttpClient 19 | 1.0-SNAPSHOT 20 | 21 | 22 | 14.0.1 23 | 21 24 | 21 25 | 26 | 27 | 28 | 29 | 30 | org.apache.arrow 31 | arrow-memory-core 32 | ${arrow.version} 33 | 34 | 35 | 36 | org.apache.arrow 37 | arrow-memory-netty 38 | ${arrow.version} 39 | 40 | 41 | 42 | org.apache.arrow 43 | arrow-vector 44 | ${arrow.version} 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /http/get_simple/java/client/src/main/java/com/example/ArrowHttpClient.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import org.apache.arrow.memory.BufferAllocator; 19 | import org.apache.arrow.memory.RootAllocator; 20 | import org.apache.arrow.vector.VectorSchemaRoot; 21 | import org.apache.arrow.vector.VectorUnloader; 22 | import org.apache.arrow.vector.types.pojo.Schema; 23 | import org.apache.arrow.vector.ipc.ArrowStreamReader; 24 | import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; 25 | 26 | import java.io.IOException; 27 | import java.io.InputStream; 28 | import java.net.HttpURLConnection; 29 | import java.net.URL; 30 | import java.util.List; 31 | import java.util.ArrayList; 32 | 33 | public class ArrowHttpClient { 34 | 35 | public static void main(String[] args) { 36 | String serverUrl = "http://localhost:8008"; 37 | 38 | try { 39 | long startTime = System.currentTimeMillis(); 40 | 41 | URL url = new URL(serverUrl); 42 | HttpURLConnection connection = (HttpURLConnection) url.openConnection(); 43 | connection.setRequestMethod("GET"); 44 | 45 | if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) { 46 | InputStream inputStream = connection.getInputStream(); 47 | 48 | BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); 49 | ArrowStreamReader reader = new ArrowStreamReader(inputStream, allocator); 50 | 
VectorSchemaRoot root = reader.getVectorSchemaRoot(); 51 | VectorUnloader unloader = new VectorUnloader(root); 52 | ArrowRecordBatch batch; 53 | 54 | Schema schema = root.getSchema(); 55 | List batches = new ArrayList<>(); 56 | 57 | int numRows = 0; 58 | while (reader.loadNextBatch()) { 59 | numRows += root.getRowCount(); 60 | batch = unloader.getRecordBatch(); 61 | batches.add(batch); 62 | } 63 | 64 | long endTime = System.currentTimeMillis(); 65 | float execTime = (endTime - startTime) / 1000F; 66 | 67 | System.out.println(reader.bytesRead() + " bytes received"); 68 | System.out.println(numRows + " records received"); 69 | System.out.println(batches.size() + " record batches received"); 70 | System.out.printf("%.2f seconds elapsed\n", execTime); 71 | 72 | reader.close(); 73 | } else { 74 | System.err.println("Failed with response code: " + connection.getResponseCode()); 75 | } 76 | } catch (IOException e) { 77 | e.printStackTrace(); 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /http/get_simple/java/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Java Server Example 21 | 22 | This directory contains a minimal example of an HTTP server implemented in Java using the Jetty web server. The server: 23 | 1. Creates a list of record batches and populates it with synthesized data. 24 | 2. Listens for HTTP GET requests from clients. 25 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 26 | 27 | To run this example: 28 | 29 | ```sh 30 | mvn package 31 | _JAVA_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED" mvn exec:java -Dexec.mainClass="ArrowHttpServer" 32 | ``` 33 | > [!NOTE] 34 | > For simplicity, the example here uses static class members and does not properly initialize and release resources or handle errors. 
35 | -------------------------------------------------------------------------------- /http/get_simple/java/server/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 15 | 4.0.0 16 | 17 | com.example 18 | ArrowHttpServer 19 | 1.0-SNAPSHOT 20 | 21 | 22 | 15.0.0 23 | 11.0.24 24 | 21 25 | 21 26 | 27 | 28 | 29 | 30 | 31 | org.apache.arrow 32 | arrow-bom 33 | ${arrow.version} 34 | pom 35 | import 36 | 37 | 38 | 39 | 40 | 41 | 42 | org.apache.arrow 43 | arrow-memory-core 44 | 45 | 46 | 47 | org.apache.arrow 48 | arrow-memory-netty 49 | 50 | 51 | 52 | org.apache.arrow 53 | arrow-vector 54 | 55 | 56 | 57 | org.eclipse.jetty 58 | jetty-server 59 | ${jetty.version} 60 | 61 | 62 | 63 | org.eclipse.jetty 64 | jetty-servlet 65 | ${jetty.version} 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /http/get_simple/js/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | /**/node_modules 19 | package-lock.json 20 | 21 | -------------------------------------------------------------------------------- /http/get_simple/js/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple JavaScript Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in JavaScript. The client: 23 | 1. Sends an HTTP GET request to a server. 24 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Creates an Arrow table from the record batches. 26 | 27 | To run this example, first start one of the server examples in the parent directory, then: 28 | 29 | ```sh 30 | npm install apache-arrow 31 | node client.js 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/js/client/client.js: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License.
const Arrow = require('apache-arrow');

const url = 'http://localhost:8008';

/**
 * Fetches an Arrow IPC stream from `url`, builds an Arrow table from it, and
 * prints the number of record batches received and the elapsed time.
 *
 * @param {string} url Address of the HTTP server to send the GET request to.
 * @returns {Promise<void>} Resolves once the statistics have been printed.
 * @throws {Error} If the server answers with a non-2xx status code.
 */
async function runExample(url) {
  const startTime = new Date();

  const response = await fetch(url);
  // Fail early with a clear message instead of trying to decode an error
  // page as an Arrow IPC stream.
  if (!response.ok) {
    throw new Error(`Failed with response code: ${response.status}`);
  }
  const table = await Arrow.tableFromIPC(response);

  const duration = (new Date() - startTime) / 1000;
  console.log(`${table.batches.length} record batches received`);
  console.log(`${duration.toFixed(2)} seconds elapsed`);
}

// Report failures (connection refused, bad status, malformed stream) and
// signal them through the exit code rather than leaving the rejection unhandled.
runExample(url).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
using Arrow, HTTP

"""
    get_batches()

Send an HTTP GET request to `http://localhost:8008`, wrap the response body in
an `Arrow.Stream`, and collect everything the stream yields into a vector.

Prints the size of the response body in bytes and the number of collected
items, then returns the collected vector.
"""
function get_batches()
    response = HTTP.get("http://localhost:8008")
    payload = response.body
    batches = collect(Arrow.Stream(payload))

    println("$(length(payload)) bytes received")
    println("$(length(batches)) record batches received")

    batches
end

# Time the whole request-and-decode round trip.
elapsed_seconds = @elapsed get_batches()
println("$(elapsed_seconds) seconds elapsed")
Listens for HTTP GET requests from clients. 25 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 26 | 27 | To run this example: 28 | 29 | ```sh 30 | julia --project=.. -e "using Pkg; Pkg.instantiate()" 31 | julia --project=.. server.jl 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/julia/server/server.jl: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
using Arrow, HTTP, Random, Tables

# Media type sent in the Content-Type header so that clients can recognise the
# response body as an Arrow IPC stream.
const ARROW_STREAM_FORMAT = "application/vnd.apache.arrow.stream"

"""
    randint_nullable(n)

Return a `Vector{Union{Missing, Int}}` of length `n` filled with random `Int`
values.
"""
function randint_nullable(n::Integer)
    v = Vector{Union{Missing, Int}}(undef, n)
    rand!(v, Int)
    return v
end

"""
    get_stream(::HTTP.Request)

HTTP request handler. Builds a partitioned table of four random integer
columns (`a`..`d`), `total_records` rows in partitions of at most `batch_len`
rows, writes it to an in-memory buffer with `Arrow.write`, and returns it as
the body of an HTTP 200 response labelled with the Arrow stream media type.
"""
function get_stream(::HTTP.Request)
    total_records = 100_000_000
    batch_len = 4096
    stream = Tables.partitioner(Iterators.partition(1:total_records, batch_len)) do indices
        nrows = length(indices)
        return (
            a = randint_nullable(nrows),
            b = randint_nullable(nrows),
            c = randint_nullable(nrows),
            d = randint_nullable(nrows)
        )
    end
    buffer = IOBuffer()
    Arrow.write(buffer, stream)
    # Declare the payload type; clients such as the Python example validate
    # this header before decoding the body.
    return HTTP.Response(200, ["Content-Type" => ARROW_STREAM_FORMAT], take!(buffer))
end

println("Serving on localhost:8008...")
server = HTTP.serve(get_stream, "127.0.0.1", 8008)
% Minimal HTTP client: downloads an Arrow IPC Stream from a local server,
% reads it into an Arrow table, and prints timing and size statistics.

% The address of the HTTP server that
% returns Arrow IPC Stream responses.
server = "http://localhost:8008";

% Diagnostic output.
disp("Reading Arrow IPC Stream from " + server + "...");

% Start timing.
tic;

% Make an HTTP GET request to the local server
% to fetch an Arrow IPC Stream and read all the
% data into memory as a byte (uint8) array.
options = weboptions(ContentType="binary");
bytes = webread(server, options);

% Construct an Arrow RecordBatchStreamReader from the in-memory bytes.
reader = arrow.io.ipc.RecordBatchStreamReader.fromBytes(bytes);

% Read an Arrow table from the in-memory bytes.
arrowTable = reader.readTable();

% Stop timing.
time = toc;
% Round elapsed time to two decimal places.
time = round(time, 2);

% Number of bytes received.
nbytes = numel(bytes);

% Diagnostic output. Every label ends with ": " so the values line up
% consistently.
disp("DONE ✔");
disp("---------------");
disp("Results");
disp("---------------");
disp("Time (s): " + sprintf("%.2f", time));
disp("Num Bytes: " + string(nbytes));
disp("Num Rows: " + string(arrowTable.NumRows));
disp("Num Columns: " + string(arrowTable.NumColumns));
26 | 27 | To run this example, first start one of the server examples in the parent directory, then: 28 | 29 | ```sh 30 | pip install pyarrow 31 | python client.py 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/python/client/urllib.request/client.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
import urllib.request
import pyarrow as pa
import time

# Media type the server is expected to declare for an Arrow IPC stream.
ARROW_STREAM_FORMAT = 'application/vnd.apache.arrow.stream'

start_time = time.time()

batches = []

# The context manager closes the HTTP response (and its socket) even if
# validation or decoding fails.
with urllib.request.urlopen('http://localhost:8008') as response:
    # Indexing the headers yields None when the header is absent, so check for
    # that explicitly rather than failing with AttributeError on .startswith.
    content_type = response.headers['Content-Type']
    if content_type is None or not content_type.startswith(ARROW_STREAM_FORMAT):
        raise ValueError(f"Expected {ARROW_STREAM_FORMAT}, got {content_type}")

    # Read record batches one at a time directly from the response body.
    with pa.ipc.open_stream(response) as reader:
        schema = reader.schema
        try:
            while True:
                batches.append(reader.read_next_batch())
        except StopIteration:
            # read_next_batch signals the end of the stream this way.
            pass

    # or:
    # with pa.ipc.open_stream(response) as reader:
    #     schema = reader.schema
    #     batches = [b for b in reader]

end_time = time.time()
execution_time = end_time - start_time

print(f"{len(batches)} record batches received")
print(f"{execution_time} seconds elapsed")
26 | -------------------------------------------------------------------------------- /http/get_simple/python/server/fastapi_uvicorn/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Python Server Example with FastAPI and Uvicorn 21 | 22 | This directory contains a minimal example of an HTTP server implemented in Python using the [FastAPI](https://fastapi.tiangolo.com) framework and the [Uvicorn](https://www.uvicorn.org) web server. This example: 23 | 1. Creates a list of record batches and populates it with synthesized data. 24 | 2. Listens for HTTP GET requests from clients. 25 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 26 | 27 | To run this example: 28 | 29 | ```sh 30 | pip install fastapi 31 | pip install "uvicorn[standard]" 32 | pip install pyarrow 33 | uvicorn server:app --port 8008 34 | ``` 35 | 36 | > [!NOTE] 37 | > This example requires Starlette 0.38.0 or newer, which added support for `memoryview` in `StreamingResponse`. If using an older version of Starlette, change both instances of: 38 | > ```py 39 | > with sink.getbuffer() as buffer: 40 | > yield buffer 41 | > ``` 42 | > to: 43 | > ```py 44 | > yield sink.getvalue() 45 | > ``` 46 | -------------------------------------------------------------------------------- /http/get_simple/python/server/fastapi_uvicorn/server.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
import pyarrow as pa
from random import randbytes
import io
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

# Four int64 columns named a..d.
schema = pa.schema([
    ("a", pa.int64()),
    ("b", pa.int64()),
    ("c", pa.int64()),
    ("d", pa.int64())
])


def GetPutData():
    """Build the list of record batches served by this example.

    One batch of `length` rows with `ncolumns` int64 columns is created from
    random bytes and then repeated (the same object, not copies) until
    `total_records` rows are covered. If `total_records` is not a multiple
    of `length`, the final entry is a slice of that batch holding the
    remaining rows.

    Returns:
        list: the record batches, in the order they should be sent.
    """
    total_records = 100000000
    length = 4096
    ncolumns = 4

    # Each column is backed directly by 8 * length random bytes, with no
    # validity buffer and a declared null count of zero.
    columns = [
        pa.Int64Array.from_buffers(
            pa.int64(), length,
            [None, pa.py_buffer(randbytes(length * 8))],
            null_count=0)
        for _ in range(ncolumns)
    ]
    full_batch = pa.record_batch(columns, schema)

    whole_batches, leftover_rows = divmod(total_records, length)
    result = [full_batch] * whole_batches
    if leftover_rows:
        result.append(full_batch.slice(0, leftover_rows))

    return result


def generate_bytes(schema, batches):
    """Yield the Arrow IPC stream for `batches` piece by piece.

    A single in-memory sink is reused for every batch: it is rewound before
    each write and truncated afterwards, so each yielded buffer contains only
    what was written since the previous yield. The buffer view is released
    when the generator resumes, before the sink is written to again.

    Args:
        schema: schema passed to both the batch reader and the stream writer.
        batches: record batches to serialize.

    Yields:
        memoryview: the bytes produced for one batch, then one final view
        with whatever the writer emits when it is closed (presumably the
        end-of-stream marker).
    """
    with pa.RecordBatchReader.from_batches(schema, batches) as source:
        with io.BytesIO() as sink:
            with pa.ipc.new_stream(sink, schema) as writer:
                for batch in source:
                    sink.seek(0)
                    writer.write_batch(batch)
                    sink.truncate()
                    with sink.getbuffer() as chunk:
                        yield chunk

                # Flush whatever closing the writer appends to the stream.
                sink.seek(0)
                writer.close()
                sink.truncate()
                with sink.getbuffer() as chunk:
                    yield chunk


# Generate the data once at import time; every request serves the same batches.
batches = GetPutData()

app = FastAPI()
@app.get("/")
def main():
    """Handle GET / by streaming the pre-built batches as an Arrow IPC stream."""
    body = generate_bytes(schema, batches)
    return StreamingResponse(
        body,
        media_type="application/vnd.apache.arrow.stream"
    )
Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 25 | 3. Creates an Arrow table from the record batches. 26 | 27 | To run this example, first start one of the server examples in the parent directory, then: 28 | 29 | ```sh 30 | R -e 'install.packages(c("httr", "arrow", "tictoc"), repos="https://cloud.r-project.org")' 31 | R -f client.R -s 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/r/client/client.R: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License.
# Minimal HTTP client: downloads an Arrow IPC stream from a local server,
# reads it into an Arrow table, and prints timing and size statistics.

library(httr)
library(tictoc)
suppressPackageStartupMessages(library(arrow))

url <- 'http://localhost:8008'

tic()

response <- GET(url)
# Stop with an informative HTTP error instead of trying to parse an error
# page as an Arrow IPC stream.
stop_for_status(response)
buffer <- content(response, "raw")
reader <- RecordBatchStreamReader$create(buffer)
table <- reader$read_table()

# or:
#batches <- reader$batches()
# but this is very slow (https://github.com/apache/arrow/issues/39090)


# or:
#result <- read_ipc_stream(buffer, as_data_frame = FALSE)

# or:
#result <- read_ipc_stream(url, as_data_frame = FALSE)

toc()

cat(format(table$nbytes(), scientific = FALSE), "bytes received\n")
cat(table$num_rows, "records received\n")
17 | 18 | /target 19 | Cargo.lock 20 | -------------------------------------------------------------------------------- /http/get_simple/rs/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [workspace] 19 | resolver = "2" 20 | members = ["client", "server"] 21 | 22 | [workspace.dependencies] 23 | arrow-array = "50.0.0" 24 | arrow-ipc = "50.0.0" 25 | arrow-schema = "50.0.0" 26 | tracing = "0.1.40" 27 | tracing-subscriber = "0.3.18" 28 | -------------------------------------------------------------------------------- /http/get_simple/rs/client/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "client" 20 | version = "0.1.0" 21 | edition = "2021" 22 | 23 | [dependencies] 24 | arrow-ipc.workspace = true 25 | tracing.workspace = true 26 | tracing-subscriber.workspace = true 27 | -------------------------------------------------------------------------------- /http/get_simple/rs/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Rust Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in Rust. The client: 23 | 24 | 1. Sends an HTTP GET request to a server. 25 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 26 | 3. Adds the record batches to a list as they are received. 27 | 28 | To run this example, first start one of the server examples in the parent directory, then: 29 | 30 | ```sh 31 | cargo r --release 32 | ``` 33 | > [!NOTE] 34 | > This client example implements low-level HTTP/1.1 details directly, instead of using an HTTP library. We intend to update the example to use [hyper](https://docs.rs/hyper/latest/hyper/) after [arrow-rs has an async Arrow IPC reader](https://github.com/apache/arrow-rs/issues/1207).
35 | -------------------------------------------------------------------------------- /http/get_simple/rs/client/src/main.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | use arrow_ipc::reader::StreamReader; 19 | use std::{ 20 | io::{BufRead, BufReader, Read, Write}, 21 | net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}, 22 | }; 23 | use tracing::{error, info, info_span}; 24 | use tracing_subscriber::fmt::format::FmtSpan; 25 | 26 | fn main() { 27 | // Configure tracing subscriber. 28 | tracing_subscriber::fmt() 29 | .with_span_events(FmtSpan::CLOSE) 30 | .init(); 31 | 32 | info_span!("get_simple").in_scope(|| { 33 | // Connect to server. 34 | let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 8008); 35 | match TcpStream::connect(addr) { 36 | Ok(mut stream) => { 37 | info_span!("Reading Arrow IPC stream", %addr).in_scope(|| { 38 | info!("Connected"); 39 | 40 | // Send request. 41 | stream 42 | .write_all(format!("GET / HTTP/1.1\r\nHost: {addr}\r\n\r\n").as_bytes()) 43 | .unwrap(); 44 | 45 | // Ignore response header. 
46 | let mut reader = BufReader::new(&mut stream); 47 | let mut chunked = false; 48 | loop { 49 | let mut line = String::default(); 50 | reader.read_line(&mut line).unwrap(); 51 | if let Some(("transfer-encoding", "chunked")) = line 52 | .to_lowercase() 53 | .split_once(':') 54 | .map(|(key, value)| (key.trim(), value.trim())) 55 | { 56 | chunked = true; 57 | } 58 | if line == "\r\n" { 59 | break; 60 | } 61 | } 62 | 63 | // Read Arrow IPC stream 64 | let batches: Vec<_> = if chunked { 65 | let mut buffer = Vec::default(); 66 | loop { 67 | // Chunk size 68 | let mut line = String::default(); 69 | reader.read_line(&mut line).unwrap(); 70 | let chunk_size = u64::from_str_radix(line.trim(), 16).unwrap(); 71 | 72 | if chunk_size == 0 { 73 | // Terminating chunk 74 | break; 75 | } else { 76 | // Append chunk to buffer 77 | let mut chunk_reader = reader.take(chunk_size); 78 | chunk_reader.read_to_end(&mut buffer).unwrap(); 79 | // Terminating CR-LF sequence 80 | reader = chunk_reader.into_inner(); 81 | reader.read_line(&mut String::default()).unwrap(); 82 | } 83 | } 84 | StreamReader::try_new_unbuffered(buffer.as_slice(), None) 85 | .unwrap() 86 | .flat_map(Result::ok) 87 | .collect() 88 | } else { 89 | StreamReader::try_new_unbuffered(reader, None) 90 | .unwrap() 91 | .flat_map(Result::ok) 92 | .collect() 93 | }; 94 | 95 | info!( 96 | batches = batches.len(), 97 | rows = batches.iter().map(|rb| rb.num_rows()).sum::() 98 | ); 99 | }); 100 | } 101 | Err(error) => { 102 | error!(%error, "Connection failed") 103 | } 104 | } 105 | }) 106 | } 107 | -------------------------------------------------------------------------------- /http/get_simple/rs/server/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | [package] 19 | name = "server" 20 | version = "0.1.0" 21 | edition = "2021" 22 | 23 | [dependencies] 24 | arrow-array.workspace = true 25 | arrow-ipc.workspace = true 26 | arrow-schema.workspace = true 27 | once_cell = "1.19.0" 28 | rand = "0.8.5" 29 | rayon = "1.9.0" 30 | tracing.workspace = true 31 | tracing-subscriber.workspace = true 32 | -------------------------------------------------------------------------------- /http/get_simple/rs/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Rust Server Example 21 | 22 | This directory contains a minimal example of an HTTP server implemented in Rust. The server: 23 | 24 | 1. Creates a list of record batches and populates it with synthesized data. 25 | 2. Listens for HTTP requests from clients. 26 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 27 | 28 | To run this example: 29 | 30 | ```sh 31 | cargo r --release 32 | ``` 33 | > [!NOTE] 34 | > This server example implements low-level HTTP/1.1 details directly, instead of using an HTTP library. We intend to update the example to use [hyper](https://docs.rs/hyper/latest/hyper/) after [arrow-rs has an async Arrow IPC writer](https://github.com/apache/arrow-rs/issues/1207).
35 | -------------------------------------------------------------------------------- /http/get_simple/rs/server/src/main.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 
17 | 18 | use std::{ 19 | io::{BufRead, BufReader, Result, Write}, 20 | net::{IpAddr, Ipv4Addr, SocketAddr, TcpListener}, 21 | sync::Arc, 22 | thread, 23 | }; 24 | 25 | use arrow_array::{Int64Array, RecordBatch}; 26 | use arrow_ipc::writer::StreamWriter; 27 | use arrow_schema::{DataType, Field, Fields, Schema}; 28 | use once_cell::sync::Lazy; 29 | use rand::{distributions::Standard, prelude::*}; 30 | use rayon::{iter, prelude::*}; 31 | use tracing::{error, info, info_span}; 32 | use tracing_subscriber::fmt::format::FmtSpan; 33 | 34 | const RECORDS_PER_BATCH: usize = 4096; 35 | const TOTAL_RECORDS: usize = if cfg!(debug_assertions) { 36 | 100_000 37 | } else { 38 | 100_000_000 39 | }; 40 | 41 | /// Schema for random data 42 | static SCHEMA: Lazy> = Lazy::new(|| { 43 | Arc::new(Schema::new( 44 | ('a'..='d') 45 | .map(|field_name| Field::new(field_name, DataType::Int64, true)) 46 | .collect::(), 47 | )) 48 | }); 49 | 50 | /// Random data 51 | static DATA: Lazy> = Lazy::new(|| { 52 | info_span!("data", TOTAL_RECORDS, RECORDS_PER_BATCH).in_scope(|| { 53 | info!("Generating random data"); 54 | // Generate recordbatches with random data 55 | iter::repeatn( 56 | RECORDS_PER_BATCH, 57 | TOTAL_RECORDS.div_euclid(RECORDS_PER_BATCH), 58 | ) 59 | .chain(iter::once(TOTAL_RECORDS.rem_euclid(RECORDS_PER_BATCH))) 60 | .map_init(rand::thread_rng, |rng, len| { 61 | RecordBatch::try_new( 62 | Arc::clone(&SCHEMA), 63 | (0..SCHEMA.all_fields().len()) 64 | .map(|_| { 65 | Arc::new( 66 | rng.sample_iter::(Standard) 67 | .take(len) 68 | .collect::(), 69 | ) as _ 70 | }) 71 | .collect(), 72 | ) 73 | }) 74 | .flatten() 75 | .collect() 76 | }) 77 | }); 78 | 79 | fn get_simple(mut stream: std::net::TcpStream) { 80 | info!("Incoming connection"); 81 | 82 | // Ignore incoming request. 83 | for _ in BufReader::new(&mut stream) 84 | .lines() 85 | .take_while(|line| line.as_ref().is_ok_and(|line| !line.is_empty())) 86 | {} 87 | 88 | // Write response header. 
89 | stream 90 | .write_all( 91 | "HTTP/1.1 200 OK\r\ncontent-type: application/vnd.apache.arrow.stream\r\n\r\n" 92 | .as_bytes(), 93 | ) 94 | .unwrap(); 95 | 96 | // Stream the body. 97 | let mut writer = StreamWriter::try_new(stream, &SCHEMA).unwrap(); 98 | for batch in DATA.iter() { 99 | writer.write(batch).unwrap(); 100 | } 101 | writer.finish().unwrap(); 102 | 103 | let stream = writer.into_inner().unwrap(); 104 | stream.shutdown(std::net::Shutdown::Both).unwrap(); 105 | } 106 | 107 | fn main() -> Result<()> { 108 | // Configure tracing subscriber. 109 | tracing_subscriber::fmt() 110 | .with_span_events(FmtSpan::CLOSE) 111 | .init(); 112 | 113 | // Generate random data. 114 | let _ = Lazy::force(&DATA); 115 | 116 | // Start listening. 117 | let bind_addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 8008); 118 | let listener = TcpListener::bind(bind_addr)?; 119 | info!(%bind_addr, "Listening"); 120 | 121 | // Handle incoming connections. 122 | loop { 123 | match listener.accept() { 124 | Ok((stream, remote_peer)) => { 125 | thread::spawn(move || { 126 | info_span!("Writing Arrow IPC stream", %remote_peer) 127 | .in_scope(|| get_simple(stream)) 128 | }); 129 | } 130 | Err(error) => { 131 | error!(%error, "Connection failed"); 132 | } 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /http/get_simple/ruby/client/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | /Gemfile.lock 19 | -------------------------------------------------------------------------------- /http/get_simple/ruby/client/Gemfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | source "https://rubygems.org" 19 | 20 | gem "red-arrow" 21 | -------------------------------------------------------------------------------- /http/get_simple/ruby/client/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Ruby Client Example 21 | 22 | This directory contains a minimal example of an HTTP client implemented in Ruby. 23 | 24 | The client: 25 | 26 | 1. 
Sends an HTTP GET request to a server. 27 | 2. Receives an HTTP 200 response from the server, with the response body containing an Arrow IPC stream of record batches. 28 | 3. Creates an Arrow table from the record batches. 29 | 30 | To run this example, first start one of the server examples in the parent directory, then: 31 | 32 | ```sh 33 | bundle install 34 | bundle exec ruby client.rb 35 | ``` 36 | -------------------------------------------------------------------------------- /http/get_simple/ruby/client/client.rb: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
# Minimal HTTP client: GET an Arrow IPC stream from a local server, load it
# as an Arrow table, and print the number of record batches and the time taken.

require "net/http"
require "arrow"

uri = URI("http://localhost:8008")

# Use the monotonic clock for the elapsed time so that wall-clock
# adjustments during the download cannot skew (or negate) the measurement.
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
response = Net::HTTP.get_response(uri)
# Raise on a non-2xx status instead of trying to parse an error page as
# Arrow data.
response.value
arrows_data = response.body.freeze
table = Arrow::Table.load(Arrow::Buffer.new(arrows_data), format: :arrows)
elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start

# The chunk count of the first column is reported as the batch count.
n_received_record_batches = table[0].data.n_chunks
puts("#{n_received_record_batches} record batches received")
puts("%.2f seconds elapsed" % elapsed_time)
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | source "https://rubygems.org/" 19 | 20 | gem "rackup" 21 | gem "red-arrow" 22 | -------------------------------------------------------------------------------- /http/get_simple/ruby/server/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP GET Arrow Data: Simple Ruby Server Example 21 | 22 | This directory contains a minimal example of an HTTP server implemented in Ruby. The server: 23 | 1. Creates a list of record batches and populates it with synthesized data. 24 | 2. Listens for HTTP GET requests from clients. 25 | 3. Upon receiving a request, sends an HTTP 200 response with the body containing an Arrow IPC stream of record batches. 26 | 27 | To run this example: 28 | 29 | ```sh 30 | bundle install 31 | bundle exec rackup --port=8008 32 | ``` 33 | -------------------------------------------------------------------------------- /http/get_simple/ruby/server/config.ru: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

require "arrow"

# Rack response body that writes an Arrow IPC stream of record batches.
#
# Rack calls #each with a block; every piece of the stream is passed to that
# block as it is written.
class ArrowStreamGenerator
  # env:: the Rack environment of the request; only "SERVER_SOFTWARE" is
  #       read, to decide whether chunk framing must be written by hand.
  def initialize(env)
    @env = env
    @schema = Arrow::Schema.new(a: :int64,
                                b: :int64,
                                c: :int64,
                                d: :int64)
    generate_record_batches
  end

  # Writes all record batches as one Arrow IPC stream, passing the output to
  # +block+ through RackBody.
  def each(&block)
    body = RackBody.new(block, need_manual_chunked?)
    Gio::RubyOutputStream.open(body) do |gio_output|
      Arrow::GIOOutputStream.open(gio_output) do |arrow_output|
        Arrow::RecordBatchStreamWriter.open(arrow_output, @schema) do |writer|
          @record_batches.each do |record_batch|
            writer.write_record_batch(record_batch)
          end
        end
      end
    end
  end

  private
  # True unless the server software name starts with "WEBrick"; in that case
  # RackBody emits the chunk-size lines and terminator itself.
  def need_manual_chunked?
    not (@env["SERVER_SOFTWARE"] || "").start_with?("WEBrick")
  end

  # Builds one random record batch of n_rows rows and reuses it for every
  # full batch, then adds a shorter slice for the remaining rows.
  def generate_record_batches
    n_total_records = 100000000
    n_columns = 4

    n_rows = 4096
    max_int64 = 2 ** 63 - 1
    arrays = n_columns.times.collect do
      Arrow::Int64Array.new(n_rows.times.collect {rand(max_int64)})
    end

    record_batch = Arrow::RecordBatch.new(@schema, n_rows, arrays)
    @record_batches = [record_batch] * (n_total_records / n_rows)
    n_remained_records = n_total_records % n_rows
    # 0 is truthy in Ruby, so the remainder must be compared explicitly;
    # otherwise an empty batch is appended when the total divides evenly.
    if n_remained_records > 0
      @record_batches << record_batch.slice(0, n_remained_records)
    end
  end

  # Output object handed to Gio::RubyOutputStream: forwards written data to
  # the Rack block, optionally wrapping it in chunked-encoding framing.
  class RackBody
    def initialize(block, need_manual_chunked)
      @block = block
      @need_manual_chunked = need_manual_chunked
    end

    # Forwards +buffer+ to the block and returns the number of bytes taken.
    # With manual chunking, the hexadecimal size line precedes the data and a
    # CR-LF follows it.
    def write(buffer)
      @block.call("#{buffer.bytesize.to_s(16)}\r\n") if @need_manual_chunked
      @block.call(buffer)
      @block.call("\r\n") if @need_manual_chunked
      buffer.bytesize
    end

    # Nothing is buffered here, so there is nothing to flush.
    def flush
    end

    # Emits the zero-sized terminating chunk when chunking manually.
    def close
      @block.call("0\r\n\r\n") if @need_manual_chunked
    end
  end
end

run do |env|
  headers = {
    "content-type" => "application/vnd.apache.arrow.stream",
    "transfer-encoding" => "chunked",
  }
  [200, headers, ArrowStreamGenerator.new(env)]
end
23 | -------------------------------------------------------------------------------- /http/post_simple/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # HTTP POST Arrow Data: Simple Examples 21 | 22 | This directory contains minimal examples of HTTP clients and servers demonstrating how a client can send a POST request to a server with a request body consisting of an Arrow IPC stream of record batches. 23 | --------------------------------------------------------------------------------