├── .github ├── dependabot.yml └── workflows │ └── rust.yml ├── .gitignore ├── .gitmodules ├── Cargo.toml ├── LICENSE ├── README.md └── src ├── error.rs ├── lib.rs └── object_store ├── mod.rs └── s3.rs /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | version: 2 19 | updates: 20 | - package-ecosystem: cargo 21 | directory: "/" 22 | schedule: 23 | interval: daily 24 | open-pull-requests-limit: 10 25 | target-branch: master 26 | labels: [auto-dependencies] 27 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | name: Tests 19 | on: [push, pull_request] 20 | 21 | jobs: 22 | test: 23 | name: Test Workspace on AMD64 Rust ${{ matrix.rust }} 24 | runs-on: ubuntu-latest 25 | strategy: 26 | matrix: 27 | arch: [amd64] 28 | rust: [stable] 29 | steps: 30 | - uses: actions/checkout@v2 31 | with: 32 | submodules: true 33 | - name: Cache Cargo 34 | uses: actions/cache@v2 35 | with: 36 | path: /home/runner/.cargo 37 | key: cargo-S3-cache- 38 | - name: Cache Rust dependencies 39 | uses: actions/cache@v2 40 | with: 41 | path: /home/runner/target 42 | key: target-S3-cache- 43 | - name: Setup Rust toolchain 44 | run: | 45 | rustup toolchain install ${{ matrix.rust }} 46 | rustup default ${{ matrix.rust }} 47 | rustup component add rustfmt 48 | - name: Setup minio 49 | run: | 50 | docker run \ 51 | --detach \ 52 | --rm \ 53 | --publish 9000:9000 \ 54 | --publish 9001:9001 \ 55 | --name minio \ 56 | --volume "$(pwd)/parquet-testing:/data" \ 57 | --env "MINIO_ROOT_USER=AKIAIOSFODNN7EXAMPLE" \ 58 | --env "MINIO_ROOT_PASSWORD=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" \ 59 | quay.io/minio/minio server /data \ 60 | --console-address ":9001" 61 | - name: Run tests 62 | run: | 63 | cargo test 64 | fmt: 65 | name: Rust formatting 66 | runs-on: ubuntu-latest 67 | steps: 68 | - uses: actions/checkout@v2 69 | - name: Setup toolchain 70 | run: | 71 | rustup toolchain install stable 72 | rustup default stable 73 | rustup component add rustfmt 74 | - name: Run 75 | run: cargo fmt --all -- --check 76 | clippy: 77 | name: Clippy 78 | runs-on: ubuntu-latest 79 | strategy: 80 | matrix: 81 | arch: [amd64] 82 | rust: [stable] 83 | container: 84 | image: ${{ matrix.arch }}/rust 85 | env: 86 | # Disable full debug symbol generation to speed up CI build and keep memory down 87 | # "1" means line tables only, which is useful for panic tracebacks. 
88 | RUSTFLAGS: "-C debuginfo=1" 89 | steps: 90 | - uses: actions/checkout@v2 91 | with: 92 | submodules: true 93 | - name: Cache Cargo 94 | uses: actions/cache@v2 95 | with: 96 | path: /home/runner/.cargo 97 | key: cargo-S3-cache- 98 | - name: Cache Rust dependencies 99 | uses: actions/cache@v2 100 | with: 101 | path: /home/runner/target 102 | key: target-S3-cache- 103 | - name: Setup Rust toolchain 104 | run: | 105 | rustup toolchain install ${{ matrix.rust }} 106 | rustup default ${{ matrix.rust }} 107 | rustup component add rustfmt clippy 108 | - name: Run clippy 109 | run: | 110 | cargo clippy --all-targets --workspace -- -D warnings 111 | env: 112 | CARGO_HOME: "/github/home/.cargo" 113 | CARGO_TARGET_DIR: "/github/home/target" 114 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | parquet-testing -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "parquet-testing"] 2 | path = parquet-testing 3 | url = https://github.com/apache/parquet-testing.git 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2021" 3 | name = "datafusion-objectstore-s3" 4 | description = "S3 as an ObjectStore for Datafusion" 5 | version = "0.2.1" 6 | homepage = "https://github.com/datafusion-contrib/datafusion-objectstore-s3" 7 | repository = "https://github.com/datafusion-contrib/datafusion-objectstore-s3" 8 | readme = "README.md" 9 | authors = ["Matthew Turner ", "Michael Seddon "] 10 | license = "Apache-2.0" 11 | keywords = [ "arrow", "query", "sql", "datafusion" ] 12 | 13 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 14 | 15 | [lib] 16 | name = "datafusion_objectstore_s3" 17 | path = "src/lib.rs" 18 | 19 | [dependencies] 20 | async-trait = "0.1.52" 21 | aws-config = "0.9.0" 22 | aws-sdk-s3 = "0.9.0" 23 | aws-smithy-async = "0.39.0" 24 | aws-smithy-types = "0.39.0" 25 | aws-smithy-types-convert = { version = "0.39.0", features = ["convert-chrono"] } 26 | aws-types = "0.9.0" 27 | bytes = "1.1.0" 28 | datafusion-data-access = { version = "8.0.0" } 29 | futures = "0.3.19" 30 | http = "0.2.6" 31 | num_cpus = "1.13.1" 32 | tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs"] } 33 | 34 | [dev-dependencies] 35 | arrow = { version = "13", features = ["prettyprint"] } 36 | datafusion = { version = "8.0.0" } 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataFusion-ObjectStore-S3
2 |
3 | S3 as an ObjectStore for [Datafusion](https://github.com/apache/arrow-datafusion).
4 |
5 | ## Querying files on S3 with DataFusion
6 |
7 | This crate implements the DataFusion `ObjectStore` trait on AWS S3 and implementors of the S3 standard. We leverage the official [AWS Rust SDK](https://github.com/awslabs/aws-sdk-rust) for interacting with S3. While it is our understanding that the AWS APIs we are using are relatively stable, we can make no assurances on API stability, either on AWS' part or within this crate. This crate's API is tightly coupled to DataFusion, a fast-moving project, and as such we will make changes in line with those upstream changes.
8 |
9 | ## Examples
10 |
11 | Examples for querying AWS and other implementors, such as MinIO, are shown below.
12 |
13 | Load credentials from the default AWS credential provider (such as the environment or ~/.aws/credentials):
14 |
15 | ```rust
16 | let s3_file_system = Arc::new(S3FileSystem::default().await);
17 | ```
18 |
19 | `S3FileSystem::default()` is a convenience wrapper for `S3FileSystem::new(None, None, None, None, None, None)`.
20 |
21 | Connect to an implementor of the S3 API (MinIO, in this case) using an access key and secret:
22 |
23 | ```rust
24 | // Example credentials provided by MinIO
25 | const MINIO_ACCESS_KEY_ID: &str = "AKIAIOSFODNN7EXAMPLE";
26 | const MINIO_SECRET_ACCESS_KEY: &str = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY";
27 | const PROVIDER_NAME: &str = "Static";
28 | const MINIO_ENDPOINT: &str = "http://localhost:9000";
29 |
30 | let s3_file_system = S3FileSystem::new(
31 |     Some(SharedCredentialsProvider::new(Credentials::new(
32 |         MINIO_ACCESS_KEY_ID,
33 |         MINIO_SECRET_ACCESS_KEY,
34 |         None,
35 |         None,
36 |         PROVIDER_NAME,
37 |     ))), // Credentials provider
38 |     None, // Region
39 |     Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), // Endpoint
40 |     None, // RetryConfig
41 |     None, // AsyncSleep
42 |     None, // TimeoutConfig
43 | )
44 | .await;
45 | ```
46 |
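To pin a specific AWS region while keeping every other option at its default, pass only the `Region` (a sketch; the region name is just an illustration, and the argument order follows `S3FileSystem::new` as shown above):

```rust
use aws_sdk_s3::Region;

let s3_file_system = S3FileSystem::new(
    None,                           // Credentials provider (falls back to the default chain)
    Some(Region::new("us-east-2")), // Region (example value)
    None,                           // Endpoint
    None,                           // RetryConfig
    None,                           // AsyncSleep
    None,                           // TimeoutConfig
)
.await;
```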
47 | Using DataFusion's `ListingTableConfig` we register a table into a DataFusion `SessionContext` so that it can be queried.
48 |
49 | ```rust
50 | let filename = "s3://data/alltypes_plain.snappy.parquet";
51 |
52 | let config = ListingTableConfig::new(s3_file_system, filename).infer().await?;
53 |
54 | let table = ListingTable::try_new(config)?;
55 |
56 | let ctx = SessionContext::new();
57 |
58 | ctx.register_table("tbl", Arc::new(table))?;
59 |
60 | let df = ctx.sql("SELECT * FROM tbl").await?;
61 | df.show().await?;
62 | ```
63 |
64 | We can also register the `S3FileSystem` directly as an `ObjectStore` on a `SessionContext`'s `RuntimeEnv`. This provides an idiomatic way of creating `TableProvider`s that can be queried.
65 |
66 | ```rust
67 | ctx.runtime_env().register_object_store(
68 |     "s3",
69 |     Arc::new(S3FileSystem::default().await),
70 | );
71 |
72 | let input_uri = "s3://parquet-testing/data/alltypes_plain.snappy.parquet";
73 |
74 | let (object_store, _) = ctx.runtime_env().object_store(input_uri)?;
75 |
76 | let config = ListingTableConfig::new(object_store, input_uri).infer().await?;
77 |
78 | let table_provider: Arc<dyn TableProvider> = Arc::new(ListingTable::try_new(config)?);
79 | ```
80 |
81 | ## Testing
82 |
83 | Tests are run with [MinIO](https://min.io/), which provides a containerized implementation of the Amazon S3 API.
84 |
85 | First, check out the test data submodule:
86 |
87 | ```bash
88 | git submodule update --init --recursive
89 | ```
90 |
91 | Then start the MinIO container:
92 |
93 | ```bash
94 | docker run \
95 |   --detach \
96 |   --rm \
97 |   --publish 9000:9000 \
98 |   --publish 9001:9001 \
99 |   --name minio \
100 |   --volume "$(pwd)/parquet-testing:/data" \
101 |   --env "MINIO_ROOT_USER=AKIAIOSFODNN7EXAMPLE" \
102 |   --env "MINIO_ROOT_PASSWORD=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" \
103 |   quay.io/minio/minio server /data \
104 |   --console-address ":9001"
105 | ```
106 |
107 | Once started, run the tests as usual:
108 |
109 | ```bash
110 | cargo test
111 | ```
112 |
--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
1 | //! Custom error type for `DataFusion-ObjectStore-S3`
2 |
3 | use std::error::Error;
4 | use std::fmt::{Display, Formatter};
5 |
6 | /// Enum with all errors in this crate.
7 | /// PartialEq is to enable testing for specific error types.
8 | #[derive(Debug, PartialEq)]
9 | pub enum S3Error {
10 |     /// Returned when functionality is not yet available.
11 |     NotImplemented(String),
12 |     /// Wrapper for AWS errors
13 |     AWS(String),
14 | }
15 |
16 | impl Display for S3Error {
17 |     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
18 |         match self {
19 |             S3Error::NotImplemented(desc) => write!(f, "Not yet implemented: {}", desc),
20 |             S3Error::AWS(desc) => write!(f, "AWS error: {}", desc),
21 |         }
22 |     }
23 | }
24 |
25 | impl Error for S3Error {}
26 |
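AWS failures surface to callers as `std::io::Error`s wrapping the `S3Error` type above, which is how `s3.rs` reports them below. A minimal sketch of that wrapping (the `wrap_aws` helper is hypothetical and not part of the crate):

```rust
use std::io::ErrorKind;

use datafusion_objectstore_s3::error::S3Error;

// Hypothetical helper mirroring how s3.rs converts SDK errors into io::Error.
fn wrap_aws(msg: &str) -> std::io::Error {
    std::io::Error::new(ErrorKind::Other, S3Error::AWS(msg.to_string()))
}

fn main() {
    let err = wrap_aws("NoSuchBucket");
    // Prints "AWS error: NoSuchBucket" via S3Error's Display impl.
    println!("{}", err);
}
```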
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 |
18 | #![warn(missing_docs)]
19 |
20 | //! [DataFusion-ObjectStore-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3)
21 | //! provides a `TableProvider` interface for using `Datafusion` to query data in S3. This includes AWS S3
22 | //! and services such as MinIO that implement the S3 API.
23 | //!
24 | //! ## Examples
25 | //! Examples for querying AWS and other implementors, such as MinIO, are shown below.
26 | //!
27 | //! Load credentials from the default AWS credential provider (such as the environment or ~/.aws/credentials):
28 | //!
29 | //! ```rust
30 | //! # use std::sync::Arc;
31 | //! # use datafusion::error::Result;
32 | //! # use datafusion_objectstore_s3::object_store::s3::S3FileSystem;
33 | //! # #[tokio::main]
34 | //! # async fn main() -> Result<()> {
35 | //! let s3_file_system = Arc::new(S3FileSystem::default().await);
36 | //! # Ok(())
37 | //! # }
38 | //! ```
39 | //!
40 | //! `S3FileSystem::default()` is a convenience wrapper for `S3FileSystem::new(None, None, None, None, None, None)`.
41 | //!
42 | //! Connect to an implementor of the S3 API (MinIO, in this case) using an access key and secret:
43 | //!
44 | //! ```rust
45 | //! use datafusion_objectstore_s3::object_store::s3::S3FileSystem;
46 | //!
47 | //! use aws_types::credentials::SharedCredentialsProvider;
48 | //! use aws_types::credentials::Credentials;
49 | //! use aws_sdk_s3::Endpoint;
50 | //! use http::Uri;
51 | //!
52 | //! # #[tokio::main]
53 | //! # async fn main() {
54 | //! // Example credentials provided by MinIO
55 | //! const MINIO_ACCESS_KEY_ID: &str = "AKIAIOSFODNN7EXAMPLE";
56 | //! const MINIO_SECRET_ACCESS_KEY: &str = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY";
57 | //! const PROVIDER_NAME: &str = "Static";
58 | //! const MINIO_ENDPOINT: &str = "http://localhost:9000";
59 | //!
60 | //! let s3_file_system = S3FileSystem::new(
61 | //!     Some(SharedCredentialsProvider::new(Credentials::new(
62 | //!         MINIO_ACCESS_KEY_ID,
63 | //!         MINIO_SECRET_ACCESS_KEY,
64 | //!         None,
65 | //!         None,
66 | //!         PROVIDER_NAME,
67 | //!     ))), // SharedCredentialsProvider
68 | //!     None, // Region
69 | //!     Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), // Endpoint
70 | //!     None, // RetryConfig
71 | //!     None, // AsyncSleep
72 | //!     None, // TimeoutConfig
73 | //! )
74 | //! .await;
75 | //! # }
76 | //! ```
77 | //!
78 | //! Using DataFusion's `ListingTableConfig` and `ListingTable` we register a table into a DataFusion `SessionContext` so that it can be queried.
79 | //!
80 | //! ```rust
81 | //! use std::sync::Arc;
82 | //!
83 | //! use datafusion::datasource::listing::*;
84 | //! use datafusion::datasource::TableProvider;
85 | //! use datafusion::prelude::SessionContext;
86 | //! use datafusion::datasource::file_format::parquet::ParquetFormat;
87 | //! use datafusion::error::Result;
88 | //!
89 | //! use datafusion_objectstore_s3::object_store::s3::S3FileSystem;
90 | //!
91 | //! use aws_types::credentials::SharedCredentialsProvider;
92 | //! use aws_types::credentials::Credentials;
93 | //! use aws_sdk_s3::Endpoint;
94 | //! use http::Uri;
95 | //!
96 | //! # const MINIO_ACCESS_KEY_ID: &str = "AKIAIOSFODNN7EXAMPLE";
97 | //! # const MINIO_SECRET_ACCESS_KEY: &str = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY";
98 | //! # const PROVIDER_NAME: &str = "Static";
99 | //! # const MINIO_ENDPOINT: &str = "http://localhost:9000";
100 | //!
101 | //! # #[tokio::main]
102 | //! # async fn main() -> Result<()> {
103 | //! let filename = "s3://data/alltypes_plain.snappy.parquet";
104 | //!
105 | //! # let s3_file_system = Arc::new(S3FileSystem::new(
106 | //! #     Some(SharedCredentialsProvider::new(Credentials::new(
107 | //! #         MINIO_ACCESS_KEY_ID,
108 | //! #         MINIO_SECRET_ACCESS_KEY,
109 | //! #         None,
110 | //! #         None,
111 | //! #         PROVIDER_NAME,
112 | //! #     ))),
113 | //! #     None,
114 | //! #     Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))),
115 | //! #     None,
116 | //! #     None,
117 | //! #     None,
118 | //! # )
119 | //! # .await);
120 | //!
121 | //! let config = ListingTableConfig::new(s3_file_system, filename).infer().await?;
122 | //!
123 | //! let table = ListingTable::try_new(config)?;
124 | //!
125 | //! let ctx = SessionContext::new();
126 | //!
127 | //! ctx.register_table("tbl", Arc::new(table))?;
128 | //!
129 | //! let df = ctx.sql("SELECT * FROM tbl").await?;
130 | //! df.show().await?;
131 | //! # Ok(())
132 | //! # }
133 | //! ```
134 | //!
135 | //! We can also register the `S3FileSystem` directly as an `ObjectStore` on a `SessionContext`. This provides an idiomatic way of creating `TableProvider`s that can be queried.
136 | //!
137 | //! ```rust
138 | //! use std::sync::Arc;
139 | //!
140 | //! use datafusion::datasource::listing::*;
141 | //! use datafusion::error::Result;
142 | //!
143 | //! use datafusion_objectstore_s3::object_store::s3::S3FileSystem;
144 | //!
145 | //! use aws_sdk_s3::Endpoint;
146 | //! use aws_types::credentials::Credentials;
147 | //! use aws_types::credentials::SharedCredentialsProvider;
148 | //! use datafusion::prelude::SessionContext;
149 | //! use http::Uri;
150 | //!
151 | //! const MINIO_ACCESS_KEY_ID: &str = "AKIAIOSFODNN7EXAMPLE";
152 | //! const MINIO_SECRET_ACCESS_KEY: &str = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY";
153 | //! const PROVIDER_NAME: &str = "Static";
154 | //! const MINIO_ENDPOINT: &str = "http://localhost:9000";
155 | //!
156 | //! #[tokio::main]
157 | //! async fn main() -> Result<()> {
158 | //!     let s3_file_system = Arc::new(
159 | //!         S3FileSystem::new(
160 | //!             Some(SharedCredentialsProvider::new(Credentials::new(
161 | //!                 MINIO_ACCESS_KEY_ID,
162 | //!                 MINIO_SECRET_ACCESS_KEY,
163 | //!                 None,
164 | //!                 None,
165 | //!                 PROVIDER_NAME,
166 | //!             ))),
167 | //!             None,
168 | //!             Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))),
169 | //!             None,
170 | //!             None,
171 | //!             None,
172 | //!         )
173 | //!         .await,
174 | //!     );
175 | //!
176 | //!     let ctx = SessionContext::new();
177 | //!
178 | //!     let uri = "s3://data/alltypes_plain.snappy.parquet";
179 | //!
180 | //!     let config = ListingTableConfig::new(s3_file_system, uri)
181 | //!         .infer()
182 | //!         .await?;
183 | //!
184 | //!     let table = ListingTable::try_new(config)?;
185 | //!
186 | //!     ctx.register_table("tbl", Arc::new(table))?;
187 | //!
188 | //!     let df = ctx.sql("SELECT * FROM tbl").await?;
189 | //!     df.show().await?;
190 | //!     Ok(())
191 | //! }
192 | //!
``` 193 | 194 | pub mod error; 195 | pub mod object_store; 196 | -------------------------------------------------------------------------------- /src/object_store/mod.rs: -------------------------------------------------------------------------------- 1 | //! `ObjectStore` implementation for the Amazon S3 API 2 | 3 | pub mod s3; 4 | -------------------------------------------------------------------------------- /src/object_store/s3.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | //! ObjectStore implementation for the Amazon S3 API 19 | 20 | use std::io::{ErrorKind, Read}; 21 | use std::sync::{mpsc, Arc}; 22 | use std::time::Duration; 23 | 24 | use async_trait::async_trait; 25 | use futures::{stream, AsyncRead}; 26 | 27 | use datafusion_data_access::object_store::{ 28 | FileMetaStream, ListEntryStream, ObjectReader, ObjectStore, 29 | }; 30 | use datafusion_data_access::{FileMeta, Result, SizedFile}; 31 | 32 | use aws_config::meta::region::RegionProviderChain; 33 | use aws_sdk_s3::{config::Builder, Client, Endpoint, Region, RetryConfig}; 34 | use aws_smithy_async::rt::sleep::AsyncSleep; 35 | use aws_smithy_types::timeout::Config; 36 | use aws_smithy_types_convert::date_time::DateTimeExt; 37 | use aws_types::credentials::SharedCredentialsProvider; 38 | use bytes::Buf; 39 | 40 | use crate::error::S3Error; 41 | 42 | /// new_client creates a new aws_sdk_s3::Client 43 | /// this uses aws_config::load_from_env() as a base config then allows users to override specific settings if required 44 | /// 45 | /// an example use case for overriding is to specify an endpoint which is not Amazon S3 such as MinIO or Ceph. 
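/// # Example
///
/// A minimal sketch (marked `ignore`, so it is not compiled as a doctest) of
/// overriding the endpoint to point at a local MinIO instance; the credential
/// values are the MinIO examples used in the tests below, and every other
/// option is left at its default:
///
/// ```ignore
/// let client = new_client(
///     Some(SharedCredentialsProvider::new(Credentials::new(
///         "AKIAIOSFODNN7EXAMPLE",
///         "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
///         None,
///         None,
///         "Static",
///     ))),
///     None,
///     Some(Endpoint::immutable(Uri::from_static("http://localhost:9000"))),
///     None,
///     None,
///     None,
/// )
/// .await;
/// ```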
46 | async fn new_client(
47 |     credentials_provider: Option<SharedCredentialsProvider>,
48 |     region: Option<Region>,
49 |     endpoint: Option<Endpoint>,
50 |     retry_config: Option<RetryConfig>,
51 |     sleep: Option<Arc<dyn AsyncSleep>>,
52 |     timeout_config: Option<Config>,
53 | ) -> Client {
54 |     let config = aws_config::load_from_env().await;
55 |
56 |     let region_provider = RegionProviderChain::first_try(region)
57 |         .or_default_provider()
58 |         .or_else(Region::new("us-west-2"));
59 |
60 |     let mut config_builder = Builder::from(&config).region(region_provider.region().await);
61 |
62 |     if let Some(credentials_provider) = credentials_provider {
63 |         config_builder = config_builder.credentials_provider(credentials_provider);
64 |     }
65 |
66 |     if let Some(endpoint) = endpoint {
67 |         config_builder = config_builder.endpoint_resolver(endpoint);
68 |     }
69 |
70 |     if let Some(retry_config) = retry_config {
71 |         config_builder = config_builder.retry_config(retry_config);
72 |     }
73 |
74 |     if let Some(sleep) = sleep {
75 |         config_builder = config_builder.sleep_impl(sleep);
76 |     }
77 |
78 |     if let Some(timeout_config) = timeout_config {
79 |         config_builder = config_builder.timeout_config(timeout_config);
80 |     };
81 |
82 |     let config = config_builder.build();
83 |     Client::from_conf(config)
84 | }
85 |
86 | /// `ObjectStore` implementation for the Amazon S3 API
87 | #[derive(Debug)]
88 | pub struct S3FileSystem {
89 |     credentials_provider: Option<SharedCredentialsProvider>,
90 |     region: Option<Region>,
91 |     endpoint: Option<Endpoint>,
92 |     retry_config: Option<RetryConfig>,
93 |     sleep: Option<Arc<dyn AsyncSleep>>,
94 |     timeout_config: Option<Config>,
95 |     client: Client,
96 | }
97 |
98 | impl S3FileSystem {
99 |     /// Create new `ObjectStore`
100 |     pub async fn new(
101 |         credentials_provider: Option<SharedCredentialsProvider>,
102 |         region: Option<Region>,
103 |         endpoint: Option<Endpoint>,
104 |         retry_config: Option<RetryConfig>,
105 |         sleep: Option<Arc<dyn AsyncSleep>>,
106 |         timeout_config: Option<Config>,
107 |     ) -> Self {
108 |         Self {
109 |             credentials_provider: credentials_provider.clone(),
110 |             region: region.clone(),
111 |             endpoint: endpoint.clone(),
112 |             retry_config: retry_config.clone(),
113 |             sleep: sleep.clone(),
114 |             timeout_config: timeout_config.clone(),
115 |             client: new_client(credentials_provider, region, endpoint, retry_config, sleep, timeout_config).await,
116 |         }
117 |     }
118 | }
119 |
120 | #[async_trait]
121 | impl ObjectStore for S3FileSystem {
122 |     async fn list_file(&self, uri: &str) -> Result<FileMetaStream> {
123 |         let (_, prefix) = uri.split_once("s3://").ok_or_else(|| {
124 |             std::io::Error::new(ErrorKind::Other, S3Error::AWS("No s3 scheme found".into()))
125 |         })?;
126 |         let (bucket, prefix) = match prefix.split_once('/') {
127 |             Some((bucket, prefix)) => (bucket.to_owned(), prefix),
128 |             None => (prefix.to_owned(), ""),
129 |         };
130 |
131 |         let objects = self
132 |             .client
133 |             .list_objects_v2()
134 |             .bucket(&bucket)
135 |             .prefix(prefix)
136 |             .send()
137 |             .await
138 |             .map_err(|err| {
139 |                 std::io::Error::new(ErrorKind::Other, S3Error::AWS(format!("{:?}", err)))
140 |             })?
141 |             .contents()
142 |             .unwrap_or_default()
143 |             .to_vec();
144 |
145 |         let result = stream::iter(objects.into_iter().map(move |object| {
146 |             Ok(FileMeta {
147 |                 sized_file: SizedFile {
148 |                     path: format!("{}/{}", &bucket, object.key().unwrap_or("")),
149 |                     size: object.size() as u64,
150 |                 },
151 |                 last_modified: object
152 |                     .last_modified()
153 |                     .map(|last_modified| last_modified.to_chrono_utc()),
154 |             })
155 |         }));
156 |
157 |         Ok(Box::pin(result))
158 |     }
159 |
160 |     async fn list_dir(&self, _prefix: &str, _delimiter: Option<String>) -> Result<ListEntryStream> {
161 |         todo!()
162 |     }
163 |
164 |     fn file_reader(&self, file: SizedFile) -> Result<Arc<dyn ObjectReader>> {
165 |         Ok(Arc::new(AmazonS3FileReader::new(
166 |             self.credentials_provider.clone(),
167 |             self.region.clone(),
168 |             self.endpoint.clone(),
169 |             self.retry_config.clone(),
170 |             self.sleep.clone(),
171 |             self.timeout_config.clone(),
172 |             file,
173 |         )?))
174 |     }
175 | }
176 |
177 | #[allow(dead_code)]
178 | impl S3FileSystem {
179 |     /// Convenience wrapper for creating a new `S3FileSystem` using default configuration options. Only works with AWS.
180 |     pub async fn default() -> Self {
181 |         S3FileSystem::new(None, None, None, None, None, None).await
182 |     }
183 | }
184 |
185 | struct AmazonS3FileReader {
186 |     credentials_provider: Option<SharedCredentialsProvider>,
187 |     region: Option<Region>,
188 |     endpoint: Option<Endpoint>,
189 |     retry_config: Option<RetryConfig>,
190 |     sleep: Option<Arc<dyn AsyncSleep>>,
191 |     timeout_config: Option<Config>,
192 |     file: SizedFile,
193 | }
194 |
195 | impl AmazonS3FileReader {
196 |     #[allow(clippy::too_many_arguments)]
197 |     fn new(
198 |         credentials_provider: Option<SharedCredentialsProvider>,
199 |         region: Option<Region>,
200 |         endpoint: Option<Endpoint>,
201 |         retry_config: Option<RetryConfig>,
202 |         sleep: Option<Arc<dyn AsyncSleep>>,
203 |         timeout_config: Option<Config>,
204 |         file: SizedFile,
205 |     ) -> Result<Self> {
206 |         Ok(Self {
207 |             credentials_provider,
208 |             region,
209 |             endpoint,
210 |             retry_config,
211 |             sleep,
212 |             timeout_config,
213 |             file,
214 |         })
215 |     }
216 | }
217 |
218 | #[async_trait]
219 | impl ObjectReader for AmazonS3FileReader {
220 |     async fn chunk_reader(&self, _start: u64, _length: usize) -> Result<Box<dyn AsyncRead>> {
221 |         todo!("implement once async file readers are available (arrow-rs#78, arrow-rs#111)")
222 |     }
223 |
224 |     fn sync_chunk_reader(&self, start: u64, length: usize) -> Result<Box<dyn Read + Send + Sync>> {
225 |         let credentials_provider = self.credentials_provider.clone();
226 |         let region = self.region.clone();
227 |         let endpoint = self.endpoint.clone();
228 |         let retry_config = self.retry_config.clone();
229 |         let sleep = self.sleep.clone();
230 |         let timeout_config = self.timeout_config.clone();
231 |         let file_path = self.file.path.clone();
232 |
233 |         // once the async chunk file readers have been implemented this complexity can be removed
234 |         let (tx, rx) = mpsc::channel();
235 |         std::thread::spawn(move || {
236 |             let rt = tokio::runtime::Builder::new_current_thread()
237 |                 .enable_all()
238 |                 .build()
239 |                 .unwrap();
240 |
241 |             rt.block_on(async move {
242 |                 // aws_sdk_s3::Client appears bound to the runtime and will deadlock if cloned from the main runtime
243 |                 let client = new_client(
244 |                     credentials_provider,
245 |                     region,
246 |                     endpoint,
247 |                     retry_config,
248 |                     sleep,
249 |                     timeout_config,
250 |                 )
251 |                 .await;
252 |
253 |                 let (bucket, key) = match file_path.split_once('/') {
254 |                     Some((bucket, prefix)) => (bucket, prefix),
255 |                     None => (file_path.as_str(), ""),
256 |                 };
257 |
258 |                 let get_object = client.get_object().bucket(bucket).key(key);
259 |                 let resp = if length > 0 {
260 |                     // range bytes requests are inclusive
261 |                     get_object
262 |
.range(format!("bytes={}-{}", start, start + (length - 1) as u64)) 263 | .send() 264 | .await 265 | } else { 266 | get_object.send().await 267 | }; 268 | 269 | let bytes = match resp { 270 | Ok(res) => { 271 | let data = res.body.collect().await; 272 | match data { 273 | Ok(data) => Ok(data.into_bytes()), 274 | Err(err) => Err(std::io::Error::new( 275 | ErrorKind::Other, 276 | S3Error::AWS(format!("{:?}", err)), 277 | )), 278 | } 279 | } 280 | Err(err) => Err(std::io::Error::new( 281 | ErrorKind::Other, 282 | S3Error::AWS(format!("{:?}", err)), 283 | )), 284 | }; 285 | 286 | tx.send(bytes).unwrap(); 287 | }) 288 | }); 289 | 290 | let bytes = rx.recv_timeout(Duration::from_secs(10)).map_err(|err| { 291 | std::io::Error::new(ErrorKind::TimedOut, S3Error::AWS(format!("{:?}", err))) 292 | })??; 293 | 294 | Ok(Box::new(bytes.reader())) 295 | } 296 | 297 | fn length(&self) -> u64 { 298 | self.file.size 299 | } 300 | } 301 | #[cfg(test)] 302 | mod tests { 303 | use crate::object_store::s3::*; 304 | use aws_types::credentials::Credentials; 305 | use datafusion::assert_batches_eq; 306 | use datafusion::datasource::listing::*; 307 | use datafusion::datasource::TableProvider; 308 | use datafusion::error::DataFusionError; 309 | use datafusion::prelude::*; 310 | use futures::StreamExt; 311 | use http::Uri; 312 | 313 | const ACCESS_KEY_ID: &str = "AKIAIOSFODNN7EXAMPLE"; 314 | const SECRET_ACCESS_KEY: &str = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"; 315 | const PROVIDER_NAME: &str = "Static"; 316 | const MINIO_ENDPOINT: &str = "http://localhost:9000"; 317 | 318 | // Test that `S3FileSystem` can read files 319 | #[tokio::test] 320 | async fn test_read_files() -> Result<()> { 321 | let s3_file_system = S3FileSystem::new( 322 | Some(SharedCredentialsProvider::new(Credentials::new( 323 | ACCESS_KEY_ID, 324 | SECRET_ACCESS_KEY, 325 | None, 326 | None, 327 | PROVIDER_NAME, 328 | ))), 329 | None, 330 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 331 | None, 332 | None, 333 | None, 334 | ) 335 | .await; 336 | 337 | let mut files = s3_file_system.list_file("s3://data").await?; 338 | 339 | let mut files_handled = 0; 340 | while let Some(file) = files.next().await { 341 | files_handled += 1; 342 | let sized_file = file.unwrap().sized_file; 343 | let mut reader = s3_file_system 344 | .file_reader(sized_file.clone()) 345 | .unwrap() 346 | .sync_chunk_reader(0, sized_file.size as usize) 347 | .unwrap(); 348 | 349 | let mut bytes = Vec::new(); 350 | let size = reader.read_to_end(&mut bytes)?; 351 | 352 | assert_eq!(size as u64, sized_file.size); 353 | } 354 | assert!(files_handled > 0); 355 | 356 | Ok(()) 357 | } 358 | 359 | // Test that reading files with `S3FileSystem` produces the expected results 360 | #[tokio::test] 361 | async fn test_read_range() -> Result<()> { 362 | let start = 10; 363 | let length = 128; 364 | 365 | let mut file = std::fs::File::open("parquet-testing/data/alltypes_plain.snappy.parquet")?; 366 | let mut raw_bytes = Vec::new(); 367 | file.read_to_end(&mut raw_bytes)?; 368 | let raw_slice = &raw_bytes[start..start + length]; 369 | assert_eq!(raw_slice.len(), length); 370 | 371 | let s3_file_system = S3FileSystem::new( 372 | Some(SharedCredentialsProvider::new(Credentials::new( 373 | ACCESS_KEY_ID, 374 | SECRET_ACCESS_KEY, 375 | None, 376 | None, 377 | PROVIDER_NAME, 378 | ))), 379 | None, 380 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 381 | None, 382 | None, 383 | None, 384 | ) 385 | .await; 386 | let mut files = s3_file_system 387 | 
.list_file("s3://data/alltypes_plain.snappy.parquet") 388 | .await?; 389 | 390 | let mut files_handled = 0; 391 | if let Some(file) = files.next().await { 392 | files_handled += 1; 393 | let sized_file = file.unwrap().sized_file; 394 | let mut reader = s3_file_system 395 | .file_reader(sized_file) 396 | .unwrap() 397 | .sync_chunk_reader(start as u64, length) 398 | .unwrap(); 399 | 400 | let mut reader_bytes = Vec::new(); 401 | let size = reader.read_to_end(&mut reader_bytes)?; 402 | 403 | assert_eq!(size, length); 404 | assert_eq!(&reader_bytes, raw_slice); 405 | } 406 | assert!(files_handled > 0); 407 | 408 | Ok(()) 409 | } 410 | 411 | fn map_datafusion_error_to_io_error(err: DataFusionError) -> std::io::Error { 412 | std::io::Error::new(ErrorKind::Other, S3Error::AWS(format!("{:?}", err))) 413 | } 414 | 415 | // Test that reading Parquet file with `S3FileSystem` can create a `ListingTable` 416 | #[tokio::test] 417 | async fn test_read_parquet() -> Result<()> { 418 | let s3_file_system = Arc::new( 419 | S3FileSystem::new( 420 | Some(SharedCredentialsProvider::new(Credentials::new( 421 | ACCESS_KEY_ID, 422 | SECRET_ACCESS_KEY, 423 | None, 424 | None, 425 | PROVIDER_NAME, 426 | ))), 427 | None, 428 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 429 | None, 430 | None, 431 | None, 432 | ) 433 | .await, 434 | ); 435 | 436 | let filename = "s3://data/alltypes_plain.snappy.parquet"; 437 | 438 | let config = ListingTableConfig::new(s3_file_system, filename) 439 | .infer() 440 | .await 441 | .map_err(map_datafusion_error_to_io_error)?; 442 | 443 | let table = ListingTable::try_new(config).map_err(map_datafusion_error_to_io_error)?; 444 | 445 | let exec = table 446 | .scan(&None, &[], Some(1024)) 447 | .await 448 | .map_err(map_datafusion_error_to_io_error)?; 449 | assert_eq!(exec.statistics().num_rows, Some(2)); 450 | 451 | Ok(()) 452 | } 453 | 454 | // Test that a SQL query can be executed on a Parquet file that was read from `S3FileSystem` 455 | #[tokio::test] 456 | async fn test_sql_query() -> Result<()> { 457 | let s3_file_system = Arc::new( 458 | S3FileSystem::new( 459 | Some(SharedCredentialsProvider::new(Credentials::new( 460 | ACCESS_KEY_ID, 461 | SECRET_ACCESS_KEY, 462 | None, 463 | None, 464 | PROVIDER_NAME, 465 | ))), 466 | None, 467 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 468 | None, 469 | None, 470 | None, 471 | ) 472 | .await, 473 | ); 474 | 475 | let filename = "s3://data/alltypes_plain.snappy.parquet"; 476 | 477 | let config = ListingTableConfig::new(s3_file_system, filename) 478 | .infer() 479 | .await 480 | .map_err(map_datafusion_error_to_io_error)?; 481 | 482 | let table = ListingTable::try_new(config).map_err(map_datafusion_error_to_io_error)?; 483 | 484 | let ctx = SessionContext::new(); 485 | 486 | ctx.register_table("tbl", Arc::new(table)).unwrap(); 487 | 488 | let batches = ctx 489 | .sql("SELECT * FROM tbl") 490 | .await 491 | .map_err(map_datafusion_error_to_io_error)? 
492 | .collect() 493 | .await 494 | .map_err(map_datafusion_error_to_io_error)?; 495 | let expected = vec![ 496 | "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", 497 | "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", 498 | "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", 499 | "| 6 | true | 0 | 0 | 0 | 0 | 0 | 0 | 30342f30312f3039 | 30 | 2009-04-01 00:00:00 |", 500 | "| 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01 00:01:00 |", 501 | "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+" 502 | ]; 503 | assert_batches_eq!(expected, &batches); 504 | Ok(()) 505 | } 506 | 507 | // Test that a SQL query can be executed on a Parquet file that was read from `S3FileSystem` 508 | #[tokio::test] 509 | async fn test_create_external_table_sql_query() -> Result<()> { 510 | let s3_file_system = Arc::new( 511 | S3FileSystem::new( 512 | Some(SharedCredentialsProvider::new(Credentials::new( 513 | ACCESS_KEY_ID, 514 | SECRET_ACCESS_KEY, 515 | None, 516 | None, 517 | PROVIDER_NAME, 518 | ))), 519 | None, 520 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 521 | None, 522 | None, 523 | None, 524 | ) 525 | .await, 526 | ); 527 | 528 | let ctx = SessionContext::new(); 529 | 530 | ctx.runtime_env() 531 | .register_object_store("s3", s3_file_system); 532 | 533 | let sql = "CREATE EXTERNAL TABLE abc STORED AS PARQUET LOCATION 's3://data/alltypes_plain.snappy.parquet'"; 534 | 535 | ctx.sql(sql).await.unwrap().collect().await.unwrap(); 536 | 537 | ctx.table("abc").unwrap(); 538 | Ok(()) 539 | } 540 | 541 | // Test that the S3FileSystem allows reading from different buckets 542 | #[tokio::test] 543 | #[should_panic(expected = "Could not parse metadata: bad data")] 544 | async fn test_read_alternative_bucket() { 545 | let s3_file_system = Arc::new( 546 | S3FileSystem::new( 547 | Some(SharedCredentialsProvider::new(Credentials::new( 548 | ACCESS_KEY_ID, 549 | SECRET_ACCESS_KEY, 550 | None, 551 | None, 552 | PROVIDER_NAME, 553 | ))), 554 | None, 555 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 556 | None, 557 | None, 558 | None, 559 | ) 560 | .await, 561 | ); 562 | 563 | let filename = "s3://bad_data/PARQUET-1481.parquet"; 564 | 565 | let config = ListingTableConfig::new(s3_file_system, filename) 566 | .infer() 567 | .await 568 | .unwrap(); 569 | 570 | let table = ListingTable::try_new(config).unwrap(); 571 | 572 | table.scan(&None, &[], Some(1024)).await.unwrap(); 573 | } 574 | 575 | // Test that `S3FileSystem` can be registered as object store on a DataFusion `ExecutionContext` 576 | #[tokio::test] 577 | async fn test_ctx_register_object_store() -> Result<()> { 578 | let s3_file_system = Arc::new( 579 | S3FileSystem::new( 580 | Some(SharedCredentialsProvider::new(Credentials::new( 581 | ACCESS_KEY_ID, 582 | SECRET_ACCESS_KEY, 583 | None, 584 | None, 585 | PROVIDER_NAME, 586 | ))), 587 | None, 588 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 589 | None, 590 | None, 591 | None, 592 | ) 593 | .await, 594 | ); 595 | 596 | let ctx = SessionContext::new(); 597 | ctx.runtime_env() 598 | .register_object_store("s3", s3_file_system); 599 | let (_, name) = 
ctx.runtime_env().object_store("s3").unwrap(); 600 | assert_eq!(name, "s3"); 601 | 602 | Ok(()) 603 | } 604 | 605 | // Test that an appropriate error message is produced for a non existent bucket 606 | #[tokio::test] 607 | #[should_panic(expected = "NoSuchBucket")] 608 | async fn test_read_nonexistent_bucket() { 609 | let s3_file_system = S3FileSystem::new( 610 | Some(SharedCredentialsProvider::new(Credentials::new( 611 | ACCESS_KEY_ID, 612 | SECRET_ACCESS_KEY, 613 | None, 614 | None, 615 | PROVIDER_NAME, 616 | ))), 617 | None, 618 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 619 | None, 620 | None, 621 | None, 622 | ) 623 | .await; 624 | 625 | let mut files = s3_file_system 626 | .list_file("s3://nonexistent_data") 627 | .await 628 | .unwrap(); 629 | 630 | while let Some(file) = files.next().await { 631 | let sized_file = file.unwrap().sized_file; 632 | let mut reader = s3_file_system 633 | .file_reader(sized_file.clone()) 634 | .unwrap() 635 | .sync_chunk_reader(0, sized_file.size as usize) 636 | .unwrap(); 637 | 638 | let mut bytes = Vec::new(); 639 | let size = reader.read_to_end(&mut bytes).unwrap(); 640 | 641 | assert_eq!(size as u64, sized_file.size); 642 | } 643 | } 644 | 645 | // Test that no files are returned if a non existent file URI is provided 646 | #[tokio::test] 647 | async fn test_read_nonexistent_file() { 648 | let s3_file_system = S3FileSystem::new( 649 | Some(SharedCredentialsProvider::new(Credentials::new( 650 | ACCESS_KEY_ID, 651 | SECRET_ACCESS_KEY, 652 | None, 653 | None, 654 | PROVIDER_NAME, 655 | ))), 656 | None, 657 | Some(Endpoint::immutable(Uri::from_static(MINIO_ENDPOINT))), 658 | None, 659 | None, 660 | None, 661 | ) 662 | .await; 663 | let mut files = s3_file_system 664 | .list_file("s3://data/nonexistent_file.txt") 665 | .await 666 | .unwrap(); 667 | 668 | assert!(files.next().await.is_none()) 669 | } 670 | } 671 | --------------------------------------------------------------------------------