├── src ├── vcpkg.json ├── README.md ├── include │ ├── httpfs_extension.hpp │ ├── hash_functions.hpp │ ├── httpfs_curl_client.hpp │ ├── httpfs_client.hpp │ ├── crypto.hpp │ ├── http_metadata_cache.hpp │ ├── create_secret_functions.hpp │ ├── hffs.hpp │ └── http_state.hpp ├── httpfs_client_wasm.cpp ├── CMakeLists.txt ├── httpfs_config.py ├── hash_functions.cpp └── http_state.cpp ├── vcpkg.json ├── data └── secrets │ ├── httpfs │ ├── s3_config_secret_v1_1_2.duckdb_secret │ ├── s3_config_secret_v1_1_3.duckdb_secret │ ├── s3_config_secret_v_1_0_0.duckdb_secret │ ├── s3_secret_chain_v_1_0_0.duckdb_secret │ ├── s3_secret_chain_v_1_1_2.duckdb_secret │ └── s3_secret_chain_v_1_1_3.duckdb_secret │ └── README.md ├── .gitmodules ├── Makefile ├── test ├── sql │ ├── copy │ │ ├── csv │ │ │ ├── test_url_with_plus.test │ │ │ ├── test_sniff_httpfs.test │ │ │ ├── parallel │ │ │ │ ├── test_parallel_csv.test │ │ │ │ └── csv_parallel_httpfs.test │ │ │ ├── test_12314.test_slow │ │ │ ├── test_csv_remote.test_slow │ │ │ ├── test_csv_httpfs.test_slow │ │ │ ├── glob │ │ │ │ └── copy_csv_glob_s3.test │ │ │ ├── test_csv_httpfs_prepared.test │ │ │ └── test_csv_remote.test │ │ ├── test_remote_head_forbidden.test │ │ ├── parquet │ │ │ ├── delta_byte_array_length_mismatch.test │ │ │ ├── snowflake_lineitem.test │ │ │ ├── parquet_5968.test │ │ │ ├── delta_byte_array_multiple_pages.test │ │ │ ├── parquet_boolean_page.test_slow │ │ │ ├── parquet_http_prefetch.test │ │ │ ├── parquet_encryption_mbedtls_openssl.test │ │ │ ├── parquet_2102.test_slow │ │ │ ├── parquet_encryption_httpfs.test │ │ │ ├── test_yellow_cab.test_slow │ │ │ ├── test_parquet_remote.test │ │ │ └── test_parquet_remote_foreign_files.test │ │ ├── s3 │ │ │ ├── s3_presigned_read.test │ │ │ ├── http_log.test │ │ │ ├── http_secret.test │ │ │ ├── s3_presigned_read.test_slow │ │ │ ├── csv_s3_file_size_bytes.test │ │ │ ├── hive_partitioned_write_s3.test_slow │ │ │ ├── upload_large_file.test_slow │ │ │ ├── glob_s3_paging.test_slow │ │ │ ├── upload_large_json_file.test_slow │ │ │ ├── parquet_s3_tpcds.test_slow │ │ │ ├── parquet_s3_tpch.test_slow │ │ │ ├── upload_file_parallel.test_slow │ │ │ ├── metadata_cache.test │ │ │ ├── download_config.test │ │ │ ├── http_proxy.test │ │ │ ├── s3_hive_partition.test │ │ │ └── upload_small_file.test │ │ ├── encryption │ │ │ └── different_aes_engines.test │ │ └── no_head_on_write.test │ ├── httpfs │ │ ├── internal_issue_2490.test │ │ └── hffs.test │ ├── crypto │ │ └── test_openssl_crypto.test │ ├── json │ │ └── table │ │ │ ├── read_json.test │ │ │ ├── read_json_objects.test │ │ │ └── read_json_auto.test_slow │ ├── attach │ │ ├── attach_remote.test │ │ ├── attach_httpfs.test │ │ ├── attach_s3.test │ │ └── attach_s3_tpch.test_slow │ ├── secret │ │ ├── test_secret_type.test │ │ ├── secret_s3_requester_pays.test │ │ ├── secret_aws.test │ │ ├── secret_refresh_attach.test │ │ ├── gcs_oauth.test │ │ └── secret_refresh.test │ ├── curl_client │ │ └── test_load_other_extensions.test │ ├── httpfs_client │ │ └── httpfs_client_implementation.test │ ├── storage │ │ ├── invalid_unicode_scrambled.test_slow │ │ ├── external_file_cache │ │ │ ├── external_file_cache_read_blob.test_slow │ │ │ └── external_file_cache_httpfs.test │ │ └── encryption │ │ │ └── temp_files │ │ │ └── encrypted_out_of_core.test_slow │ ├── extensions │ │ └── version_is_valid_httpfs.test │ ├── metadata_stats.test │ ├── secrets │ │ ├── secret_types_function.test │ │ ├── create_secret_invalid_map.test │ │ ├── create_secret_hffs.test │ │ ├── persistent_key_value_secret.test │ │ ├── 
create_secret_gcs.test_slow │ │ ├── create_secret_persistence_error_handling.test │ │ ├── create_secret_scope_matching.test │ │ ├── create_secret_non_writable_persistent_dir.test │ │ ├── create_secret_overwriting.test │ │ ├── create_secret_cascading.test_slow │ │ ├── create_secret_defaults.test │ │ ├── create_secret_settings.test │ │ ├── create_secret_r2.test │ │ ├── secret_compatibility_httpfs.test │ │ ├── create_secret_r2_serialization.test │ │ ├── create_secret_minio.test │ │ ├── create_secret.test_slow │ │ ├── create_secret_name_conflicts.test │ │ ├── create_secret_binding.test │ │ ├── create_secret_s3_serialization.test │ │ ├── create_secret_transactional.test │ │ └── create_secret_storage_backends.test │ ├── settings │ │ └── test_disabled_file_system_httpfs.test │ ├── full_file_download_fallback.test │ ├── test_headers_parsed.test │ ├── delete │ │ └── test_issue_1834.test_slow │ └── logging │ │ ├── http_logging.test │ │ └── file_system_logging.test ├── extension │ ├── duckdb_extension_settings.test │ ├── autoloading_load_only.test │ ├── autoloading_current_setting.test │ ├── autoloading_reset_setting.test │ ├── autoloading_filesystems.test │ └── autoloading_base.test └── README.md ├── extension_config.cmake ├── scripts ├── install_s3_test_server.sh ├── set_s3_test_server_variables.sh ├── generate_presigned_url.sh ├── run_s3_test_server.sh ├── run_squid.sh └── minio_s3.yml ├── .clang-format ├── LICENSE ├── .github └── workflows │ ├── MainDistributionPipeline.yml │ └── IntegrationTests.yml └── CMakeLists.txt /src/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "openssl" 4 | ] 5 | } -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "openssl", 4 | "curl" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | Documentation on S3 tests setup can be found [in the duckdb/duckdb repository](https://github.com/duckdb/duckdb/blob/main/test/sql/copy/s3/README.md) 2 | -------------------------------------------------------------------------------- /data/secrets/httpfs/s3_config_secret_v1_1_2.duckdb_secret: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-httpfs/HEAD/data/secrets/httpfs/s3_config_secret_v1_1_2.duckdb_secret -------------------------------------------------------------------------------- /data/secrets/httpfs/s3_config_secret_v1_1_3.duckdb_secret: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-httpfs/HEAD/data/secrets/httpfs/s3_config_secret_v1_1_3.duckdb_secret -------------------------------------------------------------------------------- /data/secrets/httpfs/s3_config_secret_v_1_0_0.duckdb_secret: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-httpfs/HEAD/data/secrets/httpfs/s3_config_secret_v_1_0_0.duckdb_secret -------------------------------------------------------------------------------- /data/secrets/httpfs/s3_secret_chain_v_1_0_0.duckdb_secret: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/duckdb/duckdb-httpfs/HEAD/data/secrets/httpfs/s3_secret_chain_v_1_0_0.duckdb_secret -------------------------------------------------------------------------------- /data/secrets/httpfs/s3_secret_chain_v_1_1_2.duckdb_secret: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-httpfs/HEAD/data/secrets/httpfs/s3_secret_chain_v_1_1_2.duckdb_secret -------------------------------------------------------------------------------- /data/secrets/httpfs/s3_secret_chain_v_1_1_3.duckdb_secret: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-httpfs/HEAD/data/secrets/httpfs/s3_secret_chain_v_1_1_3.duckdb_secret -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb.git 4 | [submodule "extension-ci-tools"] 5 | path = extension-ci-tools 6 | url = https://github.com/duckdb/extension-ci-tools.git 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 2 | 3 | # Configuration of extension 4 | EXT_NAME=httpfs 5 | EXT_CONFIG=${PROJ_DIR}extension_config.cmake 6 | 7 | # Include the Makefile from extension-ci-tools 8 | include extension-ci-tools/makefiles/duckdb_extension.Makefile 9 | -------------------------------------------------------------------------------- /src/include/httpfs_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.hpp" 4 | 5 | namespace duckdb { 6 | 7 | class HttpfsExtension : public Extension { 8 | public: 9 | void Load(ExtensionLoader &loader) override; 10 | std::string Name() override; 11 | std::string Version() const override; 12 | }; 13 | 14 | } // namespace duckdb 15 | -------------------------------------------------------------------------------- /test/sql/copy/csv/test_url_with_plus.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_url_with_plus.test 2 | # description: Tests url with plus 3 | # group: [csv] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | statement ok 11 | FROM read_csv('https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'); 12 | -------------------------------------------------------------------------------- /extension_config.cmake: -------------------------------------------------------------------------------- 1 | # This file is included by DuckDB's build system. 
It specifies which extensions to load 2 | 3 | ################# HTTPFS 4 | duckdb_extension_load(json) 5 | duckdb_extension_load(parquet) 6 | 7 | duckdb_extension_load(httpfs 8 | SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} 9 | INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/src/include 10 | ) 11 | -------------------------------------------------------------------------------- /test/sql/copy/test_remote_head_forbidden.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/test_remote_head_forbidden.test 2 | # description: Test Force download with server that doesn't want to give us the head 3 | # group: [copy] 4 | 5 | require httpfs 6 | 7 | require json 8 | 9 | statement ok 10 | FROM read_json('https://api.spring.io/projects/spring-boot/generations') 11 | -------------------------------------------------------------------------------- /test/sql/httpfs/internal_issue_2490.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/httpfs/internal_issue_2490.test 2 | # description: Internal issue 2490 - Wrong URL encoding leads to 404 for redirects with httplib v0.14.3 3 | # group: [httpfs] 4 | 5 | require httpfs 6 | 7 | require parquet 8 | 9 | statement ok 10 | FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/us+er+da+ta.parquet' LIMIT 1; 11 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/delta_byte_array_length_mismatch.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/delta_byte_array_length_mismatch.test 2 | # description: Test reading a Parquet file with a delta byte array length mismatch 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | SELECT * FROM parquet_scan('https://github.com/duckdb/duckdb-data/releases/download/v1.0/delta_byte_array_length_mismatch.parquet') 11 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/snowflake_lineitem.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/snowflake_lineitem.test 2 | # description: Test parquet file exported from snowflake 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | CREATE TABLE snowflake_lineitem AS FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/snowflake_lineitem_export.parquet' 11 | -------------------------------------------------------------------------------- /test/sql/crypto/test_openssl_crypto.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/crypto/test_openssl_crypto.test 2 | # description: Test the openssl based crypto util 3 | # group: [crypto] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | ATTACH '__TEST_DIR__/test_write_only.db' as enc (ENCRYPTION_KEY 'abcde', ENCRYPTION_CIPHER 'GCM'); 9 | 10 | statement ok 11 | CREATE TABLE enc.test AS SELECT 1 as a; 12 | 13 | query I 14 | FROM enc.test 15 | ---- 16 | 1 -------------------------------------------------------------------------------- /scripts/install_s3_test_server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Note: needs sudo 3 | 4 | unamestr=$(uname) 5 | if [[ "$unamestr" == 'Linux' ]]; then 6 | apt-get install -y docker.io 7 | fi 8 | 9 | docker --version 10 | echo '127.0.0.1 duckdb-minio.com' >> /etc/hosts
11 | echo '127.0.0.1 test-bucket.duckdb-minio.com' >> /etc/hosts 12 | echo '127.0.0.1 test-bucket-2.duckdb-minio.com' >> /etc/hosts 13 | echo '127.0.0.1 test-bucket-public.duckdb-minio.com' >> /etc/hosts -------------------------------------------------------------------------------- /test/sql/json/table/read_json.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/json/table/read_json.test 2 | # description: Read json files straight to columnar data 3 | # group: [table] 4 | 5 | require json 6 | 7 | require httpfs 8 | 9 | query II 10 | select * from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/example_rn.ndjson'); 11 | ---- 12 | 1 O Brother, Where Art Thou? 13 | 2 Home for the Holidays 14 | 3 The Firm 15 | 4 Broadcast News 16 | 5 Raising Arizona 17 | -------------------------------------------------------------------------------- /test/sql/attach/attach_remote.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/attach/attach_remote.test 2 | # description: Test attaching of remote database 3 | # group: [attach] 4 | 5 | require httpfs 6 | 7 | statement error 8 | ATTACH 'https://duckdb.org/non_existing.db' AS db2 (READ_ONLY) 9 | ---- 10 | 11 | statement error 12 | ATTACH 'https://duckdb.org/non_existing.db' AS db2 13 | ---- 14 | 15 | statement error 16 | ATTACH 'https://duckdb.org/non_existing.db' AS db2 (READ_WRITE) 17 | ---- 18 | -------------------------------------------------------------------------------- /test/sql/copy/csv/test_sniff_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_sniff_httpfs.test 2 | # description: Test sniff_csv functions over httpfs with auto-detection on compression 3 | # group: [csv] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | statement ok 11 | from sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/who.csv.gz'); 12 | 13 | statement ok 14 | from sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/who.csv.gz?v=1'); 15 | -------------------------------------------------------------------------------- /scripts/set_s3_test_server_variables.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run this script with 'source' or the shorthand: '.': 4 | # i.e: source scripts/set_s3_test_server_variables.sh 5 | 6 | # Enable the S3 tests to run 7 | export S3_TEST_SERVER_AVAILABLE=1 8 | 9 | export AWS_DEFAULT_REGION=eu-west-1 10 | export AWS_ACCESS_KEY_ID=minio_duckdb_user 11 | export AWS_SECRET_ACCESS_KEY=minio_duckdb_user_password 12 | export DUCKDB_S3_ENDPOINT=duckdb-minio.com:9000 13 | export DUCKDB_S3_USE_SSL=false 14 | -------------------------------------------------------------------------------- /test/sql/secret/test_secret_type.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secret/test_secret_type.test 2 | # description: Test the secret types added by this extension 3 | # group: [secret] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | LOAD httpfs 9 | 10 | statement ok 11 | PRAGMA enable_verification 12 | 13 | query II 14 | SELECT type, default_provider from duckdb_secret_types() where extension='httpfs' order by type; 15 | ---- 16 | aws config 17 | gcs config 18 | huggingface config 19 | r2 config 20 | s3 config 21 | 
-------------------------------------------------------------------------------- /data/secrets/README.md: -------------------------------------------------------------------------------- 1 | # Test secrets 2 | DuckDB only allows persistent secrets with the x00 permission (e.g. 600 or 700). Therefore, to use these 3 | secrets, the permissions need to be set before running any tests that use them. 4 | 5 | The recommended way to add tests that touch these persistent secret files is to put them behind a 6 | ```shell 7 | require-env TEST_PERSISTENT_SECRETS_AVAILABLE 8 | ``` 9 | statement, which ensures the tests only run in CI jobs where the permissions are set correctly. 10 | 11 | -------------------------------------------------------------------------------- /test/sql/curl_client/test_load_other_extensions.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/curl_client/test_load_other_extensions.test 2 | # description: When using the curl client, test loading other extensions 3 | # group: [curl_client] 4 | 5 | require httpfs 6 | 7 | # Do not ignore 'HTTP' error messages! 8 | set ignore_error_messages 9 | 10 | statement ok 11 | SET httpfs_client_implementation='curl'; 12 | 13 | statement error 14 | INSTALL non_existent_extension; 15 | ---- 16 | :.*HTTP Error: Failed to download extension.* 17 | -------------------------------------------------------------------------------- /test/sql/httpfs_client/httpfs_client_implementation.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/httpfs_client/httpfs_client_implementation.test 2 | # description: Tests basic values for httpfs_client_implementation 3 | # group: [httpfs_client] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | set httpfs_client_implementation = 'default'; 9 | 10 | statement ok 11 | set httpfs_client_implementation = 'httplib'; 12 | 13 | statement error 14 | set httpfs_client_implementation = 'something else'; 15 | ---- 16 | Unsupported option for httpfs_client_implementation 17 | -------------------------------------------------------------------------------- /src/include/hash_functions.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb/common/helper.hpp" 4 | 5 | namespace duckdb { 6 | 7 | typedef unsigned char hash_bytes[32]; 8 | typedef unsigned char hash_str[64]; 9 | 10 | void sha256(const char *in, size_t in_len, hash_bytes &out); 11 | 12 | void hmac256(const std::string &message, const char *secret, size_t secret_len, hash_bytes &out); 13 | 14 | void hmac256(std::string message, hash_bytes secret, hash_bytes &out); 15 | 16 | void hex256(hash_bytes &in, hash_str &out); 17 | 18 | } // namespace duckdb 19 | -------------------------------------------------------------------------------- /test/sql/storage/invalid_unicode_scrambled.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/storage/invalid_unicode_scrambled.test_slow 2 | # description: Issue #1650 - "invalid unicode detected in segment statistics" when inserting structs with strings and NULL values 3 | # group: [storage] 4 | 5 | require httpfs 6 | 7 | require parquet 8 | 9 | statement ok 10 | create or replace table blah as (with 11 | us as (select distinct * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/invalid_unicode_scrambled.parquet') select Address from 12 | us); 13 | 14 | 15 |
-------------------------------------------------------------------------------- /test/sql/copy/parquet/parquet_5968.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/parquet_5968.test 2 | # description: Issue #5968: Segmentation fault on reading parquet file 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | CREATE TABLE issue_5968 AS FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/issue_5968.parquet'; 11 | 12 | query I 13 | SELECT COUNT(*) FROM issue_5968 14 | ---- 15 | 2028587 16 | 17 | query I 18 | SELECT * FROM issue_5968 LIMIT 5 19 | ---- 20 | B00001 21 | B00001 22 | B00009 23 | B00009 24 | B00009 25 | -------------------------------------------------------------------------------- /test/sql/extensions/version_is_valid_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/extensions/version_is_valid_httpfs.test 2 | # description: Test version metadata on load 3 | # group: [extensions] 4 | 5 | require-env LOCAL_EXTENSION_REPO 6 | 7 | require httpfs 8 | 9 | statement ok 10 | SET autoinstall_known_extensions=true; 11 | 12 | statement ok 13 | SET autoload_known_extensions=true; 14 | 15 | statement ok 16 | SET enable_server_cert_verification = true; 17 | 18 | query I 19 | SELECT count(*) FROM duckdb_extensions() WHERE extension_version != '' AND extension_name == 'httpfs'; 20 | ---- 21 | 1 22 | -------------------------------------------------------------------------------- /src/httpfs_client_wasm.cpp: -------------------------------------------------------------------------------- 1 | #include "httpfs_client.hpp" 2 | #include "http_state.hpp" 3 | 4 | namespace duckdb { 5 | 6 | unique_ptr<HTTPClient> HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { 7 | throw InternalException("HTTPFSUtil::InitializeClient is not expected to be called"); 8 | } 9 | 10 | unordered_map<string, string> HTTPFSUtil::ParseGetParameters(const string &text) { 11 | unordered_map<string, string> result; 12 | // TODO: HTTPFSUtil::ParseGetParameters is currently not implemented 13 | return result; 14 | } 15 | 16 | } // namespace duckdb 17 | -------------------------------------------------------------------------------- /test/sql/copy/csv/parallel/test_parallel_csv.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/parallel/test_parallel_csv.test 2 | # description: Test parallel read CSV function on GitHub bugs 3 | # group: [parallel] 4 | 5 | # TODO: figure out where that bucket went 6 | mode skip 7 | 8 | require httpfs 9 | 10 | query II 11 | select * from read_csv_auto("https://duckdb-public-gzip-test.s3.us-east-2.amazonaws.com/test.csv", header = 0); 12 | ---- 13 | foo bar 14 | foo bar 15 | 16 | 17 | query II 18 | from read_csv_auto("https://duckdb-public-gzip-test.s3.us-east-2.amazonaws.com/test.csv.gz", header = 0); 19 | ---- 20 | foo bar 21 | foo bar 22 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HTTPFS_SOURCES 2 | hffs.cpp 3 | s3fs.cpp 4 | httpfs.cpp 5 | http_state.cpp 6 | crypto.cpp 7 | hash_functions.cpp 8 | create_secret_functions.cpp 9 | httpfs_extension.cpp) 10 | if(NOT EMSCRIPTEN) 11 | set(HTTPFS_SOURCES ${HTTPFS_SOURCES} crypto.cpp httpfs_httplib_client.cpp 12 | httpfs_curl_client.cpp) 13 | else() 14 |
set(HTTPFS_SOURCES ${HTTPFS_SOURCES} httpfs_client_wasm.cpp) 15 | endif() 16 | 17 | add_library(httpfs_library OBJECT ${HTTPFS_SOURCES}) 18 | set(ALL_OBJECT_FILES 19 | ${ALL_OBJECT_FILES} $<TARGET_OBJECTS:httpfs_library> 20 | PARENT_SCOPE) 21 | -------------------------------------------------------------------------------- /test/sql/metadata_stats.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/metadata_stats.test 2 | # description: Test getting metadata stats 3 | # group: [sql] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require json 10 | 11 | # Test Force download with server that doesn't want to give us the head 12 | statement ok 13 | FROM read_json('https://api.spring.io/projects/spring-boot/generations') 14 | 15 | statement ok 16 | SET force_download=false; 17 | 18 | query II 19 | explain analyze SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') 20 | ---- 21 | analyzed_plan :.*GET: 2.* 22 | -------------------------------------------------------------------------------- /src/httpfs_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # list all include directories 4 | include_directories = [ 5 | os.path.sep.join(x.split('/')) for x in ['src/include', 'third_party/httplib', 'extension/parquet/include'] 6 | ] 7 | # source files 8 | source_files = [ 9 | os.path.sep.join(x.split('/')) 10 | for x in [ 11 | 'src/' + s 12 | for s in [ 13 | 'create_secret_functions.cpp', 14 | 'crypto.cpp', 15 | 'hffs.cpp', 16 | 'http_state.cpp', 17 | 'httpfs.cpp', 18 | 'httpfs_extension.cpp', 19 | 'httpfs_client.cpp', 20 | 's3fs.cpp', 21 | ] 22 | ] 23 | ] 24 | -------------------------------------------------------------------------------- /test/sql/copy/csv/test_12314.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_12314.test_slow 2 | # description: Test CSV reading for issue 12314 3 | # group: [csv] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | statement error 11 | from read_csv('https://github.com/duckdb/duckdb-data/releases/download/v1.0/sample_data_12314.csv.gz',HEADER = 1, PARALLEL=false); 12 | ---- 13 | Change the maximum length size, e.g., max_line_size=2097408 14 | 15 | query I 16 | select count(*) from read_csv('https://github.com/duckdb/duckdb-data/releases/download/v1.0/sample_data_12314.csv.gz',HEADER = 1, PARALLEL=false , max_line_size=2097408); 17 | ---- 18 | 26238 19 | -------------------------------------------------------------------------------- /test/sql/copy/csv/test_csv_remote.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_csv_remote.test_slow 2 | # description: Test reading csv files over http, slow queries 3 | # group: [csv] 4 | 5 | statement ok 6 | pragma enable_verification; 7 | 8 | require httpfs 9 | 10 | # Read a compressed file (~44MB compressed, ~700MB uncompressed) over HTTP 11 | query IIIIII 12 | select count(*), min(strain), max(strain), min(strlen(sequence)), max(strlen(sequence)), avg(strlen(sequence)) 13 | from read_csv_auto('https://raw.githubusercontent.com/duckdb/duckdb/main/data/csv/sequences.csv.gz', delim=','); 14 | ---- 15 | 100000 ARG/Cordoba-1006-155/2020 tiger/NY/040420/2020 17340 30643 29821.264410 16 |
-------------------------------------------------------------------------------- /test/sql/copy/parquet/delta_byte_array_multiple_pages.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/delta_byte_array_multiple_pages.test 2 | # description: Test delta byte array parquet file with multiple pages 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | CREATE TABLE delta_byte_array AS SELECT * FROM parquet_scan('https://github.com/duckdb/duckdb-data/releases/download/v1.0/delta_byte_array_multiple_pages.parquet') 11 | 12 | query I 13 | SELECT COUNT(*) FROM delta_byte_array 14 | ---- 15 | 100000 16 | 17 | query II 18 | SELECT min(strlen(json_column)), max(strlen(json_column)) FROM delta_byte_array 19 | ---- 20 | 54 54 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /test/sql/secrets/secret_types_function.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/secret_types_function.test 2 | # description: Test duckdb_secret_types function 3 | # group: [secrets] 4 | 5 | mode skip 6 | 7 | query III 8 | FROM duckdb_secret_types() WHERE type IN ['s3', 'r2', 'gcs', 'http'] ORDER BY type 9 | ---- 10 | http config (empty) 11 | 12 | require httpfs 13 | 14 | require no_extension_autoloading "EXPECTED: The duckdb_secret_types() function does not trigger autoloading httpfs" 15 | 16 | query III 17 | FROM duckdb_secret_types() WHERE type IN ['s3', 'r2', 'gcs', 'http'] ORDER BY type 18 | ---- 19 | gcs config httpfs 20 | http config (empty) 21 | r2 config httpfs 22 | s3 config httpfs 23 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_invalid_map.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_invalid_map.test 2 | # description: Test throwing input errors on multi map input. 
3 | # group: [secrets] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification; 9 | 10 | statement error 11 | CREATE PERSISTENT SECRET http_multimap ( 12 | TYPE HTTP, 13 | EXTRA_HTTP_HEADERS MAP{123: 'quack1', 123 : 'quack2'} 14 | ); 15 | ---- 16 | :Invalid Input Error.*Map keys must be unique.* 17 | 18 | statement error 19 | CREATE PERSISTENT SECRET http_multimap ( 20 | TYPE HTTP, 21 | EXTRA_HTTP_HEADERS MAP{NULL: 'quack1', 123 : 'quack2'} 22 | ); 23 | ---- 24 | :Invalid Input Error.*Map keys can not be NULL.* -------------------------------------------------------------------------------- /test/sql/copy/parquet/parquet_boolean_page.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/parquet_boolean_page.test_slow 2 | # description: Test that boolean values that cross column pages are correctly read 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | PRAGMA enable_verification 11 | 12 | query IIIII 13 | SELECT 14 | SUM(CASE WHEN is_successful THEN 1 ELSE 0 END), 15 | SUM(CASE WHEN advanced_on_error_flag THEN 1 ELSE 0 END), 16 | SUM(CASE WHEN safe_on_error_flag THEN 1 ELSE 0 END), 17 | SUM(CASE WHEN rbi_flag THEN 1 ELSE 0 END), 18 | SUM(CASE WHEN team_unearned_flag THEN 1 ELSE 0 END) 19 | FROM read_parquet('https://github.com/duckdb/duckdb-data/releases/download/v1.0/event_baserunning_advance_attempt.parquet'); 20 | ---- 21 | 9252616 111041 7120 1609612 1860 22 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_hffs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_hffs.test 2 | # description: Test huggingface secrets 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | statement ok 11 | set allow_persistent_secrets=false; 12 | 13 | # Manually setting token is simplest 14 | statement ok 15 | CREATE SECRET hf1 ( 16 | TYPE HUGGINGFACE, 17 | TOKEN 'bla' 18 | ) 19 | 20 | # Cache provider will automatically try to fetch the token from the cache 21 | statement ok 22 | CREATE SECRET hf2 ( 23 | TYPE HUGGINGFACE, 24 | PROVIDER 'credential_chain' 25 | ) 26 | 27 | query IIII 28 | SELECT name, type, provider, scope FROM duckdb_secrets() order by name; 29 | ---- 30 | hf1 huggingface config ['hf://'] 31 | hf2 huggingface credential_chain ['hf://'] 32 | -------------------------------------------------------------------------------- /test/sql/secrets/persistent_key_value_secret.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/persistent_key_value_secret.test 2 | # group: [secrets] 3 | 4 | load __TEST_DIR__/persistent_extra_headers 5 | 6 | require httpfs 7 | 8 | require json 9 | 10 | statement ok 11 | CREATE PERSISTENT SECRET http ( 12 | TYPE HTTP, 13 | EXTRA_HTTP_HEADERS MAP { 14 | 'Authorization': 'Bearer sk_test_not_valid_key' 15 | } 16 | ); 17 | 18 | restart 19 | 20 | # Because this is an https host, the 'EXTRA_HTTP_HEADERS' will be used, as long as this doesn't crash anything 21 | # we are happy with this test throwing an IO error. 
22 | statement error 23 | select 24 | unnest(data) as customers 25 | from 26 | read_json('https://non.existant/endpoint'); 27 | ---- 28 | IO Error: Could not establish connection error for HTTP HEAD to 'https://non.existant/endpoint' 29 | -------------------------------------------------------------------------------- /test/extension/duckdb_extension_settings.test: -------------------------------------------------------------------------------- 1 | # name: test/extension/duckdb_extension_settings.test 2 | # description: settings for extensions 3 | # group: [extension] 4 | 5 | # TODO: move back to duckdb/duckdb 6 | mode skip 7 | 8 | require httpfs 9 | 10 | statement ok 11 | SET autoinstall_known_extensions = true; 12 | 13 | statement ok 14 | SET autoload_known_extensions = true; 15 | 16 | statement ok 17 | SET extension_directory = '__TEST_DIR__/custom_extension_directory'; 18 | 19 | statement ok 20 | SET custom_extension_repository = '__TEST_DIR__/not_existing_folder' 21 | 22 | statement error 23 | FROM read_csv('https://some.org/file.csv'); 24 | ---- 25 | not_existing_folder 26 | 27 | statement ok 28 | SET autoinstall_extension_repository = '__TEST_DIR__/other_folder'; 29 | 30 | statement error 31 | FROM read_csv('https://some.org/file.csv'); 32 | ---- 33 | other_folder 34 | -------------------------------------------------------------------------------- /test/sql/settings/test_disabled_file_system_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/settings/test_disabled_file_system_httpfs.test 2 | # description: Test disabled file systems with HTTPFS 3 | # group: [settings] 4 | 5 | require skip_reload 6 | 7 | require no_extension_autoloading "EXPECTED: Test disable loading from local file system" 8 | 9 | statement ok 10 | PRAGMA enable_verification 11 | 12 | require httpfs 13 | 14 | statement ok 15 | SET disabled_filesystems='LocalFileSystem'; 16 | 17 | # httpfs works 18 | statement ok 19 | from read_csv_auto('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv'); 20 | 21 | statement ok 22 | SET disabled_filesystems='LocalFileSystem,HTTPFileSystem'; 23 | 24 | # not if we disable it 25 | statement error 26 | from read_csv_auto('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv'); 27 | ---- 28 | File system HTTPFileSystem has been disabled by configuration 29 | -------------------------------------------------------------------------------- /test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow 2 | # description: Test the external file cache for read_blob HTTPFS reads 3 | # group: [external_file_cache] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | # first read_blob should do 1 GET 10 | query II 11 | explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet'); 12 | ---- 13 | analyzed_plan :.*GET: 1.* 14 | 15 | # second one should do 0 16 | query II 17 | explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet'); 18 | ---- 19 | analyzed_plan :.*GET: 0.* 20 | 21 | # although the read was cached using read_blob, the parquet reader can read from cache 22 | query II 23 | explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet'; 24 | ---- 25 | analyzed_plan :.*GET: 0.* 26 | 
-------------------------------------------------------------------------------- /scripts/generate_presigned_url.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #Note: DONT run as root 3 | 4 | DUCKDB_PATH=duckdb 5 | if command -v duckdb; then 6 | DUCKDB_PATH=duckdb 7 | elif test -f build/release/duckdb; then 8 | DUCKDB_PATH=build/release/duckdb 9 | elif test -f build/reldebug/duckdb; then 10 | DUCKDB_PATH=build/reldebug/duckdb 11 | elif test -f build/debug/duckdb; then 12 | DUCKDB_PATH=build/debug/duckdb 13 | fi 14 | 15 | rm -rf test/test_data 16 | mkdir -p test/test_data 17 | 18 | generate_large_parquet_query=$(cat <> 4) & 0xF]; 24 | pout[1] = hex[*pin & 0xF]; 25 | } 26 | } 27 | 28 | } // namespace duckdb 29 | -------------------------------------------------------------------------------- /test/sql/attach/attach_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/attach/attach_httpfs.test 2 | # description: Test attach using httpfs 3 | # group: [attach] 4 | 5 | require httpfs 6 | 7 | require-env S3_TEST_SERVER_AVAILABLE 1 8 | 9 | require-env AWS_DEFAULT_REGION 10 | 11 | require-env AWS_ACCESS_KEY_ID 12 | 13 | require-env AWS_SECRET_ACCESS_KEY 14 | 15 | require-env DUCKDB_S3_ENDPOINT 16 | 17 | require-env DUCKDB_S3_USE_SSL 18 | 19 | require-env S3_ATTACH_DB_PRESIGNED_URL 20 | 21 | # ATTACH a DuckDB database over HTTPFS 22 | statement ok 23 | ATTACH '${S3_ATTACH_DB_PRESIGNED_URL}' AS db (READONLY 1); 24 | 25 | query IIIII 26 | SELECT * FROM db.integral_values 27 | ---- 28 | 1 2 3 4 5 29 | NULL NULL NULL NULL NULL 30 | 31 | statement error 32 | CREATE TABLE db.integers(i INTEGER); 33 | ---- 34 | read-only 35 | 36 | statement ok 37 | SELECT * FROM db.all_types 38 | 39 | statement error 40 | SELECT * FROM db.all_typez 41 | ---- 42 | all_types 43 | 44 | statement ok 45 | DETACH db 46 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_gcs.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_gcs.test_slow 2 | # description: Test secret creation using the default gcs secret provider 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | # Ensure any currently stored secrets don't interfere with the test 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | statement ok 15 | reset s3_use_ssl; 16 | 17 | # GCS Secrets automatically default to the correct endpoint for Google Cloud Storage 18 | statement ok 19 | CREATE SECRET ( 20 | TYPE GCS, 21 | KEY_ID 'my_key', 22 | SECRET 'my_secret' 23 | ) 24 | 25 | # The secret will be created for the default scope 26 | query IIII 27 | SELECT name, type, provider, scope FROM duckdb_secrets(); 28 | ---- 29 | __default_gcs gcs config ['gcs://', 'gs://'] 30 | 31 | statement error 32 | FROM 'gcs://test-bucket/test.csv' 33 | ---- 34 | https://storage.googleapis.com/test-bucket/test.csv 35 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | SortIncludes: false 4 | TabWidth: 4 5 | IndentWidth: 4 6 | ColumnLimit: 120 7 | AllowShortFunctionsOnASingleLine: false 8 | --- 9 | UseTab: ForIndentation 10 | DerivePointerAlignment: false 11 | PointerAlignment: Right 12 | 
AlignConsecutiveMacros: true 13 | AlignTrailingComments: true 14 | AllowAllArgumentsOnNextLine: true 15 | AllowAllConstructorInitializersOnNextLine: true 16 | AllowAllParametersOfDeclarationOnNextLine: true 17 | AlignAfterOpenBracket: Align 18 | SpaceBeforeCpp11BracedList: true 19 | SpaceBeforeCtorInitializerColon: true 20 | SpaceBeforeInheritanceColon: true 21 | SpacesInAngles: false 22 | SpacesInCStyleCastParentheses: false 23 | SpacesInConditionalStatement: false 24 | AllowShortLambdasOnASingleLine: Inline 25 | AllowShortLoopsOnASingleLine: false 26 | AlwaysBreakTemplateDeclarations: Yes 27 | IncludeBlocks: Regroup 28 | Language: Cpp 29 | AccessModifierOffset: -4 30 | --- 31 | Language: Java 32 | SpaceAfterCStyleCast: true 33 | --- 34 | -------------------------------------------------------------------------------- /test/sql/storage/external_file_cache/external_file_cache_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/storage/external_file_cache/external_file_cache_httpfs.test 2 | # description: Test the external file cache for HTTPFS reads 3 | # group: [external_file_cache] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | # first query caches the data 10 | statement ok 11 | from 'https://blobs.duckdb.org/data/shakespeare.parquet'; 12 | 13 | # second query should only have a head request, no gets 14 | query II 15 | explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet'; 16 | ---- 17 | analyzed_plan :.*GET: 0.* 18 | 19 | statement ok 20 | SET enable_http_metadata_cache = true; 21 | 22 | # first query saves the metadata (and data, but that was already there) 23 | statement ok 24 | from 'https://blobs.duckdb.org/data/shakespeare.parquet'; 25 | 26 | # second query should do no HEAD and no GET 27 | query II 28 | explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet'; 29 | ---- 30 | analyzed_plan :.*HEAD: 0.* 31 | -------------------------------------------------------------------------------- /test/sql/copy/s3/s3_presigned_read.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/s3_presigned_read.test 2 | # description: Read small csv/parquet files from S3 Presigned URL. 
3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | # Require that these environment variables are also set 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | require-env S3_SMALL_CSV_PRESIGNED_URL 24 | 25 | require-env S3_SMALL_PARQUET_PRESIGNED_URL 26 | 27 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 28 | set ignore_error_messages 29 | 30 | query I 31 | SELECT phone FROM read_csv_auto('${S3_SMALL_CSV_PRESIGNED_URL}'); 32 | ---- 33 | +318855443322 34 | +552244331122 35 | +12233445567 36 | 37 | query I 38 | SELECT i FROM '${S3_SMALL_PARQUET_PRESIGNED_URL}'; 39 | ---- 40 | 1 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018-2025 Stichting DuckDB Foundation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /test/sql/copy/s3/http_log.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/http_log.test 2 | # description: Test http logger 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | require-env AWS_DEFAULT_REGION 12 | 13 | require-env AWS_ACCESS_KEY_ID 14 | 15 | require-env AWS_SECRET_ACCESS_KEY 16 | 17 | require-env DUCKDB_S3_ENDPOINT 18 | 19 | require-env DUCKDB_S3_USE_SSL 20 | 21 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 22 | set ignore_error_messages 23 | 24 | # Create some test data 25 | statement ok 26 | COPY (SELECT 'value-1' as value) TO 's3://test-bucket/http_log/test.parquet'; 27 | 28 | statement ok 29 | CALL enable_logging('HTTP') 30 | 31 | statement ok 32 | set logging_level='debug' 33 | 34 | query I 35 | FROM 's3://test-bucket/http_log/test.parquet' 36 | ---- 37 | value-1 38 | 39 | query II rowsort 40 | SELECT request.type, parse_filename(request.url) FROM duckdb_logs_parsed('HTTP'); 41 | ---- 42 | GET test.parquet 43 | HEAD test.parquet 44 | -------------------------------------------------------------------------------- /test/sql/copy/s3/http_secret.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/http_secret.test 2 | # description: Test http secret 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | require-env AWS_DEFAULT_REGION 12 | 13 | require-env AWS_ACCESS_KEY_ID 14 | 15 | require-env AWS_SECRET_ACCESS_KEY 16 | 17 | require-env DUCKDB_S3_ENDPOINT 18 | 19 | require-env DUCKDB_S3_USE_SSL 20 | 21 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 22 | set ignore_error_messages 23 | 24 | # Create some test data 25 | statement ok 26 | COPY (SELECT 'value-1' as value) TO 's3://test-bucket/http-secret-test/test.parquet'; 27 | 28 | statement ok 29 | PRAGMA enable_verification 30 | 31 | # Create some wonky headers 32 | statement ok 33 | CREATE SECRET http3 ( 34 | TYPE HTTP, 35 | EXTRA_HTTP_HEADERS MAP{ 36 | 'Authorization': 'Im very important', 37 | 'CustomHeader': 'fliepflap' 38 | } 39 | ); 40 | 41 | query I 42 | FROM 's3://test-bucket/http-secret-test/test.parquet' 43 | ---- 44 | value-1 45 | -------------------------------------------------------------------------------- /test/sql/full_file_download_fallback.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/full_file_download_fallback.test 2 | # group: [sql] 3 | 4 | require parquet 5 | 6 | require httpfs 7 | 8 | require tpch 9 | 10 | require-env PYTHON_HTTP_SERVER_URL 11 | 12 | require-env PYTHON_HTTP_SERVER_DIR 13 | 14 | statement ok 15 | CALL enable_logging(); 16 | 17 | statement ok 18 | call dbgen(sf=1); 19 | 20 | statement ok 21 | copy lineitem to '${PYTHON_HTTP_SERVER_DIR}/lineitem.csv' 22 | 23 | statement ok 24 | drop table lineitem; 25 | 26 | statement ok 27 | CREATE view lineitem AS FROM '${PYTHON_HTTP_SERVER_URL}/lineitem.csv'; 28 | 29 | query I 30 | pragma tpch(6); 31 | ---- 32 | 123141078.22829981 33 | 34 | query I 35 | select count(*) from duckdb_logs where log_level='WARN' and message like '%Falling back to full%' 36 | ---- 37 | 2 38 | 39 | statement ok 40 | 
set auto_fallback_to_full_download=false 41 | 42 | statement error 43 | pragma tpch(6); 44 | ---- 45 | HTTP Error: Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true` 46 | 47 | -------------------------------------------------------------------------------- /test/sql/secret/secret_s3_requester_pays.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secret/secret_s3_requester_pays.test 2 | # description: Tests secret refreshing with AWS requester pays mode 3 | # group: [secret] 4 | 5 | require-env S3_TEST_SERVER_AVAILABLE 1 6 | 7 | require-env AWS_DEFAULT_REGION 8 | 9 | require-env AWS_ACCESS_KEY_ID 10 | 11 | require-env AWS_SECRET_ACCESS_KEY 12 | 13 | require-env DUCKDB_S3_ENDPOINT 14 | 15 | require-env DUCKDB_S3_USE_SSL 16 | 17 | require httpfs 18 | 19 | require parquet 20 | 21 | statement ok 22 | SET enable_logging=true 23 | 24 | statement ok 25 | set s3_use_ssl='${DUCKDB_S3_USE_SSL}' 26 | 27 | statement ok 28 | set s3_endpoint='${DUCKDB_S3_ENDPOINT}' 29 | 30 | statement ok 31 | set s3_region='${AWS_DEFAULT_REGION}' 32 | 33 | # Create some test data 34 | statement ok 35 | CREATE SECRET s1 ( 36 | TYPE S3, 37 | KEY_ID '${AWS_ACCESS_KEY_ID}', 38 | SECRET '${AWS_SECRET_ACCESS_KEY}', 39 | REQUESTER_PAYS true 40 | ) 41 | 42 | statement ok 43 | copy (select 1 as a) to 's3://test-bucket/test-file.parquet' 44 | 45 | query I 46 | FROM "s3://test-bucket/test-file.parquet" 47 | ---- 48 | 1 -------------------------------------------------------------------------------- /test/sql/copy/csv/test_csv_httpfs.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_csv_httpfs.test_slow 2 | # description: This test triggers the http prefetch mechanism. 3 | # group: [csv] 4 | 5 | statement ok 6 | pragma enable_verification; 7 | 8 | require httpfs 9 | 10 | require parquet 11 | 12 | #FIXME: remote changed? 13 | mode skip 14 | 15 | # Add test for 3731 16 | query I 17 | SELECT count(*) FROM read_csv_auto('https://datasets.imdbws.com/name.basics.tsv.gz', delim='\t', quote='') 18 | ---- 19 | 12783090 20 | 21 | query I 22 | copy ( 23 | SELECT * 24 | REPLACE ( 25 | str_split(primaryProfession,',') as primaryProfession, 26 | str_split(knownForTitles,',') as knownForTitles, 27 | case WHEN regexp_matches(deathYear,'[0-9]+') THEN CAST(deathYear as integer) END as deathYear, 28 | case WHEN regexp_matches(birthYear,'[0-9]+') THEN CAST(birthYear as integer) END as birthYear 29 | ) 30 | FROM read_csv_auto('https://datasets.imdbws.com/name.basics.tsv.gz', delim='\t', quote='') 31 | ) to '__TEST_DIR__/name_basics.parquet' (FORMAT 'parquet', CODEC 'ZSTD') 32 | ---- 33 | 12783090 34 | -------------------------------------------------------------------------------- /test/sql/test_headers_parsed.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/test_headers_parsed.test 2 | # description: Test that HTTP response headers and status lines are parsed correctly.
3 | # group: [sql] 4 | 5 | require httpfs 6 | 7 | require parquet 8 | 9 | statement ok 10 | SET httpfs_client_implementation='curl'; 11 | 12 | statement ok 13 | CALL enable_logging('HTTP'); 14 | 15 | query II 16 | select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; 17 | ---- 18 | 1 actor 19 | 2 actress 20 | 3 producer 21 | 4 writer 22 | 5 cinematographer 23 | 6 composer 24 | 7 costume designer 25 | 8 director 26 | 9 editor 27 | 10 miscellaneous crew 28 | 11 production designer 29 | 12 guest 30 | 31 | query I 32 | select response.status from duckdb_logs_parsed('HTTP') order by all; 33 | ---- 34 | OK_200 35 | PartialContent_206 36 | 37 | # response status is either 38 | # HTTP/2 200 39 | # HTTP/2 206 40 | # OR 41 | # HTTP/1.1 200 OK 42 | # HTTP/1.1 206 Partial Content 43 | # depending on OS and CA (I think) 44 | query I 45 | select response.headers['__RESPONSE_STATUS__'] LIKE 'HTTP%20%' from duckdb_logs_parsed('HTTP') order by all; 46 | ---- 47 | true 48 | true 49 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/parquet_http_prefetch.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/parquet_http_prefetch.test 2 | # description: This test triggers the http prefetch mechanism. 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | # Require that these environment variables are also set 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 24 | set ignore_error_messages 25 | 26 | statement ok 27 | CREATE TABLE test_fetch_delay (a INT, b INT); 28 | 29 | statement ok 30 | INSERT INTO test_fetch_delay (SELECT (i%2) * 2, (i%2) * 2 from range(0,2500000) as tbl(i)); 31 | 32 | statement ok 33 | COPY test_fetch_delay to 's3://test-bucket/skip_delay.parquet'; 34 | 35 | statement ok 36 | CREATE TABLE test as SELECT * from 's3://test-bucket/skip_delay.parquet' where a = 1; 37 | 38 | query I 39 | SELECT COUNT(*) FROM test; 40 | ---- 41 | 0 42 | -------------------------------------------------------------------------------- /src/include/httpfs_curl_client.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <curl/curl.h> 4 | 5 | #include "duckdb/common/http_util.hpp" 6 | 7 | namespace duckdb { 8 | class HTTPLogger; 9 | class FileOpener; 10 | struct FileOpenerInfo; 11 | class HTTPState; 12 | 13 | class CURLHandle { 14 | public: 15 | CURLHandle(const string &token, const string &cert_path); 16 | ~CURLHandle(); 17 | 18 | public: 19 | operator CURL *() { 20 | return curl; 21 | } 22 | CURLcode Execute() { 23 | return curl_easy_perform(curl); 24 | } 25 | 26 | private: 27 | CURL *curl = NULL; 28 | }; 29 | 30 | class CURLRequestHeaders { 31 | public: 32 | CURLRequestHeaders(vector<string> &input) { 33 | for (auto &header : input) { 34 | Add(header); 35 | } 36 | } 37 | CURLRequestHeaders() { 38 | } 39 | 40 | ~CURLRequestHeaders() { 41 | if (headers) { 42 | curl_slist_free_all(headers); 43 | } 44 | headers = NULL; 45 | } 46 | operator bool() const { 47 | return headers != NULL; 48 | } 49 | 50 | public: 51 | void Add(const string &header) {
52 | headers = curl_slist_append(headers, header.c_str()); 53 | } 54 | 55 | public: 56 | curl_slist *headers = NULL; 57 | }; 58 | 59 | } // namespace duckdb 60 | -------------------------------------------------------------------------------- /test/sql/attach/attach_s3.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/attach/attach_s3.test 2 | # description: Test attach using httpfs 3 | # group: [attach] 4 | 5 | require httpfs 6 | 7 | require-env S3_TEST_SERVER_AVAILABLE 1 8 | 9 | require-env AWS_DEFAULT_REGION 10 | 11 | require-env AWS_ACCESS_KEY_ID 12 | 13 | require-env AWS_SECRET_ACCESS_KEY 14 | 15 | require-env DUCKDB_S3_ENDPOINT 16 | 17 | require-env DUCKDB_S3_USE_SSL 18 | 19 | require-env S3_ATTACH_DB 20 | 21 | statement ok 22 | CREATE SECRET ( 23 | TYPE S3, 24 | PROVIDER config, 25 | KEY_ID '${AWS_ACCESS_KEY_ID}', 26 | SECRET '${AWS_SECRET_ACCESS_KEY}', 27 | REGION '${AWS_DEFAULT_REGION}', 28 | ENDPOINT '${DUCKDB_S3_ENDPOINT}', 29 | USE_SSL '${DUCKDB_S3_USE_SSL}' 30 | ) 31 | 32 | # ATTACH a DuckDB database over HTTPFS 33 | statement ok 34 | ATTACH '${S3_ATTACH_DB}' AS db (READONLY 1); 35 | 36 | query IIIII 37 | SELECT * FROM db.integral_values 38 | ---- 39 | 1 2 3 4 5 40 | NULL NULL NULL NULL NULL 41 | 42 | statement error 43 | CREATE TABLE db.integers(i INTEGER); 44 | ---- 45 | read-only 46 | 47 | statement ok 48 | SELECT * FROM db.all_types 49 | 50 | statement error 51 | SELECT * FROM db.all_typez 52 | ---- 53 | all_types 54 | 55 | statement ok 56 | DETACH db 57 | -------------------------------------------------------------------------------- /test/sql/secret/secret_aws.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secret/secret_aws.test 2 | # description: Tests secret refreshing 3 | # group: [secret] 4 | 5 | require-env S3_TEST_SERVER_AVAILABLE 1 6 | 7 | require-env AWS_DEFAULT_REGION 8 | 9 | require-env AWS_ACCESS_KEY_ID 10 | 11 | require-env AWS_SECRET_ACCESS_KEY 12 | 13 | require-env DUCKDB_S3_ENDPOINT 14 | 15 | require-env DUCKDB_S3_USE_SSL 16 | 17 | set ignore_error_messages 18 | 19 | require httpfs 20 | 21 | require parquet 22 | 23 | foreach httpfs_implementation curl httplib 24 | 25 | statement ok 26 | SET httpfs_client_implementation='${httpfs_implementation}'; 27 | 28 | statement ok 29 | SET enable_logging=true 30 | 31 | statement ok 32 | set s3_use_ssl='${DUCKDB_S3_USE_SSL}' 33 | 34 | statement ok 35 | set s3_endpoint='${DUCKDB_S3_ENDPOINT}' 36 | 37 | statement ok 38 | set s3_region='${AWS_DEFAULT_REGION}' 39 | 40 | # Create some test data 41 | statement ok 42 | CREATE or replace SECRET s1 ( 43 | TYPE AWS, 44 | KEY_ID '${AWS_ACCESS_KEY_ID}', 45 | SECRET '${AWS_SECRET_ACCESS_KEY}' 46 | ) 47 | 48 | statement ok 49 | copy (select 1 as a) to 's3://test-bucket/test-file.parquet' 50 | 51 | query I 52 | FROM "s3://test-bucket/test-file.parquet" 53 | ---- 54 | 1 55 | 56 | endloop -------------------------------------------------------------------------------- /test/sql/copy/s3/s3_presigned_read.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/s3_presigned_read.test_slow 2 | # description: Read large csv/parquet files from S3 Presigned URL. 
3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | # Require that these environment variables are also set 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | 24 | require-env S3_LARGE_PARQUET_PRESIGNED_URL 25 | 26 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 27 | set ignore_error_messages 28 | 29 | statement ok 30 | set http_timeout=120000; 31 | 32 | # More retries (longest wait will be 25600ms) 33 | statement ok 34 | set http_retries=6; 35 | 36 | query I 37 | SELECT 38 | sum(l_extendedprice * l_discount) AS revenue 39 | FROM 40 | '${S3_LARGE_PARQUET_PRESIGNED_URL}' 41 | WHERE 42 | l_shipdate >= CAST('1994-01-01' AS date) 43 | AND l_shipdate < CAST('1995-01-01' AS date) 44 | AND l_discount BETWEEN 0.05 45 | AND 0.07 46 | AND l_quantity < 24; 47 | ---- 48 | 123141078.2283 49 | -------------------------------------------------------------------------------- /.github/workflows/MainDistributionPipeline.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension 3 | # 4 | name: Main Extension Distribution Pipeline 5 | on: 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | duckdb-stable-build: 16 | name: Build extension binaries 17 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main 18 | with: 19 | extension_name: httpfs 20 | duckdb_version: v1.4.2 21 | ci_tools_version: main 22 | 23 | 24 | duckdb-stable-deploy: 25 | name: Deploy extension binaries 26 | needs: duckdb-stable-build 27 | uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main 28 | secrets: inherit 29 | with: 30 | extension_name: httpfs 31 | duckdb_version: v1.4.2 32 | ci_tools_version: main 33 | deploy_latest: ${{ startsWith(github.ref, 'refs/heads/v') }} 34 | deploy_versioned: ${{ startsWith(github.ref, 'refs/heads/v') || github.ref == 'refs/heads/main' }} 35 | -------------------------------------------------------------------------------- /test/sql/delete/test_issue_1834.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/delete/test_issue_1834.test_slow 2 | # description: Deleting with DELETE USING causes a segmentation fault 3 | # group: [delete] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | CREATE TABLE Person_likes_Comment (creationDate timestamp without time zone not null, id bigint not null, likes_Comment bigint not null); 9 | 10 | statement ok 11 | CREATE TABLE Person_Delete_candidates (deletionDate timestamp without time zone not null, id bigint); 12 | 13 | statement ok 14 | COPY Person_likes_Comment FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/Person_likes_Comment.csv' (DELIMITER '|', TIMESTAMPFORMAT '%Y-%m-%dT%H:%M:%S.%g+00:00'); 15 | 16 | statement ok 17 | COPY Person_Delete_candidates FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/Person_Delete_candidates.csv' (DELIMITER 
'|', HEADER, TIMESTAMPFORMAT '%Y-%m-%dT%H:%M:%S.%g+00:00'); 18 | 19 | statement ok 20 | DELETE FROM Person_likes_Comment USING Person_Delete_candidates WHERE Person_Delete_candidates.id = Person_likes_Comment.id; 21 | 22 | # all tuples fulfilling this predicate should have been deleted 23 | query I 24 | SELECT COUNT(*) FROM Person_likes_Comment, Person_Delete_candidates WHERE Person_Delete_candidates.id = Person_likes_Comment.id; 25 | ---- 26 | 0 27 | -------------------------------------------------------------------------------- /test/sql/logging/http_logging.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/logging/http_logging.test 2 | # group: [logging] 3 | 4 | require parquet 5 | 6 | require httpfs 7 | 8 | statement ok 9 | CALL enable_logging('HTTP'); 10 | 11 | statement ok 12 | FROM 'https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv' 13 | 14 | query IIII 15 | SELECT 16 | request.type, 17 | request.url, 18 | response.status, 19 | response.reason, 20 | FROM duckdb_logs_parsed('HTTP') WHERE response.status != 'ServiceUnavailable_503' 21 | ---- 22 | HEAD https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv OK_200 OK 23 | GET https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv PartialContent_206 Partial Content 24 | 25 | query II 26 | SELECT request.headers['Range'], response.headers['Content-Range'] 27 | FROM duckdb_logs_parsed('HTTP') 28 | WHERE request.type='GET' 29 | ---- 30 | bytes=0-1275 bytes 0-1275/1276 31 | 32 | statement ok 33 | CALL truncate_duckdb_logs() 34 | 35 | # This old option still exists, however it now logs to the duckdb log instead of printing straight to stdout 36 | statement ok 37 | set enable_http_logging=false; 38 | 39 | statement ok 40 | FROM 'https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv' 41 | 42 | query I 43 | select count(*) FROM duckdb_logs_parsed('HTTP'); 44 | ---- 45 | 0 46 | -------------------------------------------------------------------------------- /test/sql/copy/s3/csv_s3_file_size_bytes.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/csv_s3_file_size_bytes.test 2 | # description: Test FILE_SIZE_BYTES parameter for csv copy over s3 3 | # group: [s3] 4 | 5 | require httpfs 6 | 7 | require-env S3_TEST_SERVER_AVAILABLE 1 8 | 9 | # Require that these environment variables are also set 10 | 11 | require-env AWS_DEFAULT_REGION 12 | 13 | require-env AWS_ACCESS_KEY_ID 14 | 15 | require-env AWS_SECRET_ACCESS_KEY 16 | 17 | require-env DUCKDB_S3_ENDPOINT 18 | 19 | require-env DUCKDB_S3_USE_SSL 20 | 21 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 22 | set ignore_error_messages 23 | 24 | # different vector sizes result in different number of files 25 | require no_vector_verification 26 | 27 | statement ok 28 | CREATE TABLE bigdata AS SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i); 29 | 30 | statement ok 31 | set threads=1 32 | 33 | # parameter in bytes 34 | statement ok 35 | COPY (FROM bigdata) TO 's3://test-bucket/file_size_bytes_csv1' (FORMAT CSV, FILE_SIZE_BYTES 1000); 36 | 37 | query I 38 | SELECT COUNT(*) FROM read_csv_auto('s3://test-bucket/file_size_bytes_csv1/*.csv') 39 | ---- 40 | 10000 41 | 42 | # should lead to 3 files 43 | query I 44 | SELECT count(*) FROM glob('s3://test-bucket/file_size_bytes_csv1/*.csv') 45 | ---- 46 | 3 47 | 
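# --- Editor's note: illustrative sketch, not part of the original csv_s3_file_size_bytes.test above ---
# FILE_SIZE_BYTES should also accept a human-readable size string instead of a raw byte count.
# Assuming the same MinIO test bucket and the bigdata table created above, a hypothetical follow-up
# check could look like this (the target prefix is illustrative; only the total row count is asserted):
statement ok
COPY (FROM bigdata) TO 's3://test-bucket/file_size_bytes_csv_human' (FORMAT CSV, FILE_SIZE_BYTES '1kb');

query I
SELECT COUNT(*) FROM read_csv_auto('s3://test-bucket/file_size_bytes_csv_human/*.csv')
----
10000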
-------------------------------------------------------------------------------- /test/sql/secret/secret_refresh_attach.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secret/secret_refresh_attach.test 2 | # description: Tests secret refreshing 3 | # group: [secret] 4 | 5 | require-env S3_TEST_SERVER_AVAILABLE 1 6 | 7 | require-env AWS_DEFAULT_REGION 8 | 9 | require-env AWS_ACCESS_KEY_ID 10 | 11 | require-env AWS_SECRET_ACCESS_KEY 12 | 13 | require-env DUCKDB_S3_ENDPOINT 14 | 15 | require-env DUCKDB_S3_USE_SSL 16 | 17 | require-env S3_ATTACH_DB 18 | 19 | set ignore_error_messages 20 | 21 | require httpfs 22 | 23 | require parquet 24 | 25 | statement ok 26 | SET enable_logging=true 27 | 28 | statement ok 29 | set s3_use_ssl='${DUCKDB_S3_USE_SSL}' 30 | 31 | # Create secret with incorrect credentials to trigger secret refreshing 32 | statement ok 33 | CREATE SECRET uhuh_this_mah_sh ( 34 | TYPE S3, 35 | PROVIDER config, 36 | KEY_ID 'all the girls', 37 | SECRET 'stomp yo feet like dis', 38 | REGION '${AWS_DEFAULT_REGION}', 39 | ENDPOINT '${DUCKDB_S3_ENDPOINT}', 40 | USE_SSL '${DUCKDB_S3_USE_SSL}', 41 | REFRESH 'auto' 42 | ) 43 | 44 | statement error 45 | ATTACH 's3://test-bucket/presigned/attach.db' AS db (READONLY 1); 46 | ---- 47 | 48 | # Secret refresh has been triggered 49 | query II 50 | SELECT log_level, message FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' 51 | ---- 52 | INFO Successfully refreshed secret: uhuh_this_mah_sh, new key_id: all the girls -------------------------------------------------------------------------------- /test/sql/copy/encryption/different_aes_engines.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/encryption/different_aes_engines.test 2 | # group: [encryption] 3 | 4 | foreach cipher GCM CTR 5 | 6 | statement ok 7 | ATTACH '__TEST_DIR__/enc_test_${cipher}.db' as enc (ENCRYPTION_KEY 'asdf', ENCRYPTION_CIPHER '${cipher}'); 8 | 9 | statement ok 10 | CREATE TABLE enc.test (a INTEGER, b INTEGER); 11 | 12 | statement ok 13 | INSERT INTO enc.test VALUES (11, 22), (13, 22), (12, 21) 14 | 15 | statement ok 16 | DETACH enc 17 | 18 | restart 19 | 20 | require httpfs 21 | 22 | statement ok 23 | ATTACH '__TEST_DIR__/enc_test_${cipher}.db' as enc (ENCRYPTION_KEY 'asdf'); 24 | 25 | 26 | query II 27 | FROM enc.test 28 | ---- 29 | 11 22 30 | 13 22 31 | 12 21 32 | 33 | 34 | restart 35 | 36 | endloop 37 | 38 | 39 | foreach cipher GCM CTR 40 | 41 | require httpfs 42 | 43 | statement ok 44 | ATTACH '__TEST_DIR__/enc_test_${cipher}.db' as enc (ENCRYPTION_KEY 'asdf', ENCRYPTION_CIPHER '${cipher}'); 45 | 46 | statement ok 47 | CREATE TABLE enc.test (a INTEGER, b INTEGER); 48 | 49 | statement ok 50 | INSERT INTO enc.test VALUES (11, 22), (13, 22), (12, 21) 51 | 52 | statement ok 53 | DETACH enc 54 | 55 | restart 56 | 57 | statement ok 58 | ATTACH '__TEST_DIR__/enc_test_${cipher}.db' as enc (ENCRYPTION_KEY 'asdf'); 59 | 60 | query II 61 | FROM enc.test 62 | ---- 63 | 11 22 64 | 13 22 65 | 12 21 66 | 67 | 68 | restart 69 | 70 | 71 | endloop 72 | -------------------------------------------------------------------------------- /test/sql/json/table/read_json_auto.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/json/table/read_json_auto.test_slow 2 | # description: Read json files - schema detection 3 | # group: [table] 4 | 5 | require json 6 | 7 | require httpfs 8 | 9 | # this is 
one big object - yyjson uses it as a benchmark 10 | query II 11 | select typeof("type"), typeof(features) from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/canada.json', maximum_depth=3); 12 | ---- 13 | VARCHAR STRUCT("type" JSON, properties JSON, geometry JSON)[] 14 | 15 | # let's crank up maximum_depth and see if we can fully unnest this big object 16 | query II 17 | select typeof("type"), typeof(features) from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/canada.json', maximum_depth=8); 18 | ---- 19 | VARCHAR STRUCT("type" VARCHAR, properties STRUCT("name" VARCHAR), geometry STRUCT("type" VARCHAR, coordinates DOUBLE[][][]))[] 20 | 21 | # ^ fully unnested, no more JSON type in there 22 | 23 | # the "coordinates" array in "features.geometry" is huge, let's just check the length - not all the values 24 | query IIIII 25 | select type, features[1].type, features[1].properties.name, features[1].geometry.type, length(features[1].geometry.coordinates) 26 | from read_json_auto('https://github.com/duckdb/duckdb-data/releases/download/v1.0/canada.json', maximum_depth=8); 27 | ---- 28 | FeatureCollection Feature Canada Polygon 480 29 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/parquet_encryption_mbedtls_openssl.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/parquet_encryption_mbedtls_openssl.test 2 | # description: Test Parquet encryption with OpenSSL 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | # parquet keys are not persisted across restarts 10 | statement ok 11 | PRAGMA enable_verification 12 | 13 | # add keys of 3 different lengths 14 | statement ok 15 | PRAGMA add_parquet_key('key128', '0123456789112345') 16 | 17 | statement ok 18 | PRAGMA add_parquet_key('key192', '012345678911234501234567') 19 | 20 | statement ok 21 | PRAGMA add_parquet_key('key256', '01234567891123450123456789112345') 22 | 23 | # test all valid AES key lengths 24 | foreach key_len 128 192 256 25 | 26 | # write files with OpenSSL enabled 27 | statement error 28 | COPY (SELECT 42 i) to '__TEST_DIR__/encrypted${key_len}_openssl.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'}, DEBUG_USE_OPENSSL randomval) 29 | ---- 30 | BOOL 31 | 32 | # write files with OpenSSL enabled 33 | statement ok 34 | COPY (SELECT 42 i) to '__TEST_DIR__/encrypted${key_len}_openssl.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'}, DEBUG_USE_OPENSSL true) 35 | 36 | # read OpenSSL encrypted files by using mbedtls 37 | query I 38 | SELECT * FROM read_parquet('__TEST_DIR__/encrypted${key_len}_openssl.parquet', encryption_config={footer_key: 'key${key_len}'}, debug_use_openssl=false) 39 | ---- 40 | 42 41 | 42 | endloop 43 | -------------------------------------------------------------------------------- /test/sql/copy/no_head_on_write.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/no_head_on_write.test 2 | # description: Confirm that we don't send head requests for writes 3 | # group: [copy] 4 | 5 | require-env S3_TEST_SERVER_AVAILABLE 1 6 | 7 | require-env AWS_DEFAULT_REGION 8 | 9 | require-env AWS_ACCESS_KEY_ID 10 | 11 | require-env AWS_SECRET_ACCESS_KEY 12 | 13 | require-env DUCKDB_S3_ENDPOINT 14 | 15 | require-env DUCKDB_S3_USE_SSL 16 | 17 | require httpfs 18 | 19 | require parquet 20 | 21 | statement ok 22 | SET enable_logging=true 23 | 24 | 
statement ok 25 | set s3_use_ssl='${DUCKDB_S3_USE_SSL}' 26 | 27 | statement ok 28 | set s3_endpoint='${DUCKDB_S3_ENDPOINT}' 29 | 30 | statement ok 31 | set s3_region='${AWS_DEFAULT_REGION}' 32 | 33 | # Create some test data 34 | statement ok 35 | CREATE SECRET s1 ( 36 | TYPE S3, 37 | KEY_ID '${AWS_ACCESS_KEY_ID}', 38 | SECRET '${AWS_SECRET_ACCESS_KEY}', 39 | REQUESTER_PAYS true 40 | ) 41 | 42 | statement ok 43 | CALL enable_logging('HTTP'); 44 | 45 | statement ok 46 | copy (select 1 as a) to 's3://test-bucket/test-file.parquet' 47 | 48 | query I 49 | select request.type FROM duckdb_logs_parsed('HTTP') 50 | ---- 51 | PUT 52 | 53 | statement ok 54 | CALL truncate_duckdb_logs(); 55 | 56 | statement ok 57 | copy (select random() as a FROM range(8000000)) to 's3://test-bucket/test-file2.csv' 58 | 59 | query I 60 | select request.type FROM duckdb_logs_parsed('HTTP') 61 | ---- 62 | POST 63 | PUT 64 | PUT 65 | POST 66 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_persistence_error_handling.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_persistence_error_handling.test 2 | # description: Test secret persistence with buggy secrets 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | load __TEST_DIR__/create_secret_persistence_error_handling.db 9 | 10 | require httpfs 11 | 12 | statement ok 13 | set secret_directory='__TEST_DIR__/create_secret_persistence_error_handling' 14 | 15 | # Hacky way to make duckdb create the create_secret_persistence_error_handling dir 16 | statement ok 17 | COPY (select 1 as a, 2 as b ) to '__TEST_DIR__/create_secret_persistence_error_handling/' (FORMAT csv, PARTITION_BY a) 18 | 19 | # Now write a corrupt secret file 20 | statement ok 21 | COPY (select 1 as a ) to '__TEST_DIR__/create_secret_persistence_error_handling/s1.duckdb_secret' (FORMAT csv) 22 | 23 | statement error 24 | FROM duckdb_secrets(); 25 | ---- 26 | 27 | restart 28 | 29 | statement ok 30 | set secret_directory='__TEST_DIR__/create_secret_persistence_error_handling2' 31 | 32 | statement ok 33 | CREATE PERSISTENT SECRET s1 (TYPE S3); 34 | 35 | restart no_extension_load 36 | 37 | statement ok 38 | set secret_directory='__TEST_DIR__/create_secret_persistence_error_handling2' 39 | 40 | # Disable autoloading 41 | statement ok 42 | SET autoload_known_extensions=false; 43 | 44 | # Force persistent deserialization; we can deserialize generic key/value secrets 45 | statement ok 46 | from duckdb_secrets(); 47 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_scope_matching.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_scope_matching.test 2 | # description: Test scope matching behaviour is correct 3 | # group: [secrets] 4 | 5 | load __TEST_DIR__/create_secret_scope_matching.db 6 | 7 | statement ok 8 | PRAGMA enable_verification; 9 | 10 | require httpfs 11 | 12 | statement ok 13 | set secret_directory='__TEST_DIR__/create_secret_scope_matching' 14 | 15 | # No match 16 | query I 17 | SELECT name FROM which_secret('s3://', 's3') 18 | ---- 19 | 20 | statement ok 21 | CREATE TEMPORARY SECRET t1 ( TYPE S3 ) 22 | 23 | statement ok 24 | CREATE TEMPORARY SECRET t2 ( TYPE S3 ) 25 | 26 | statement ok 27 | CREATE SECRET p1 IN LOCAL_FILE ( TYPE S3 ) 28 | 29 | # This ties within the same storage: the two 
temporary secrets t1 and t2 both score identically. We solve this by 30 | # tie-breaking on secret name alphabetical ordering 31 | query I 32 | SELECT name FROM which_secret('s3://', 's3') 33 | ---- 34 | t1 35 | 36 | query III 37 | FROM which_secret('s3://', 's3') 38 | ---- 39 | t1 TEMPORARY memory 40 | 41 | statement ok 42 | DROP SECRET t1 43 | 44 | # Temporary secrets take preference over persistent ones 45 | query I 46 | SELECT name FROM which_secret('s3://', 's3') 47 | ---- 48 | t2 49 | 50 | statement ok 51 | DROP SECRET t2 52 | 53 | query I 54 | SELECT name FROM which_secret('s3://', 's3') 55 | ---- 56 | p1 57 | 58 | statement maybe 59 | DROP SECRET p1 60 | ---- 61 | Invalid Input Error: Failed to remove non-existent secret 62 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_non_writable_persistent_dir.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_non_writable_persistent_dir.test 2 | # description: Test persistent secrets when the secret dir is non-writable 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | load __TEST_DIR__/create_secret_non_writable_persistent_dir.db 9 | 10 | require httpfs 11 | 12 | # First we create an arbitrary file 13 | statement ok 14 | COPY (SELECT 1 as a) to '__TEST_DIR__/file_to_prevent_the_secret_dir_from_being_created.csv' 15 | 16 | # Then we set the secret dir to this. 17 | statement ok 18 | set secret_directory='__TEST_DIR__/file_to_prevent_the_secret_dir_from_being_created.csv' 19 | 20 | # Now on creation of a tmp secret, the secret manager is initialized, but the persistent secret directory creation is impossible 21 | statement ok 22 | CREATE SECRET my_tmp_secret ( 23 | TYPE S3, 24 | SCOPE 's3://bucket1' 25 | ) 26 | 27 | # This now fails with the message that we could not create the persistent secret directory 28 | statement error 29 | CREATE PERSISTENT SECRET my_tmp_secret ( 30 | TYPE S3, 31 | SCOPE 's3://bucket2' 32 | ) 33 | ---- 34 | 35 | restart 36 | 37 | # Try with a correct, deeply nested path: AOK?
38 | statement ok 39 | set secret_directory='__TEST_DIR__/create_secret_non_writable_persistent_dir/a/deeply/nested/folder/will/be/created' 40 | 41 | statement maybe 42 | CREATE PERSISTENT SECRET my_tmp_secret ( 43 | TYPE S3, 44 | SCOPE 's3://bucket2' 45 | ) 46 | ---- 47 | -------------------------------------------------------------------------------- /test/sql/attach/attach_s3_tpch.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/attach/attach_s3_tpch.test_slow 2 | # description: Test running TPC-H over a database attached over S3 3 | # group: [attach] 4 | 5 | require httpfs 6 | 7 | require tpch 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | require-env AWS_DEFAULT_REGION 12 | 13 | require-env AWS_ACCESS_KEY_ID 14 | 15 | require-env AWS_SECRET_ACCESS_KEY 16 | 17 | require-env DUCKDB_S3_ENDPOINT 18 | 19 | require-env DUCKDB_S3_USE_SSL 20 | 21 | statement ok 22 | CREATE SECRET ( 23 | TYPE S3, 24 | PROVIDER config, 25 | KEY_ID '${AWS_ACCESS_KEY_ID}', 26 | SECRET '${AWS_SECRET_ACCESS_KEY}', 27 | REGION '${AWS_DEFAULT_REGION}', 28 | ENDPOINT '${DUCKDB_S3_ENDPOINT}', 29 | USE_SSL '${DUCKDB_S3_USE_SSL}' 30 | ) 31 | 32 | # ATTACH a DuckDB database over HTTPFS 33 | statement ok 34 | ATTACH 's3://test-bucket/presigned/lineitem_sf1.db' AS db (READONLY 1); 35 | 36 | statement ok 37 | USE db 38 | 39 | loop i 1 9 40 | 41 | query I 42 | PRAGMA tpch(${i}) 43 | ---- 44 | :duckdb/extension/tpch/dbgen/answers/sf1/q0${i}.csv 45 | 46 | endloop 47 | 48 | loop i 10 23 49 | 50 | query I 51 | PRAGMA tpch(${i}) 52 | ---- 53 | :duckdb/extension/tpch/dbgen/answers/sf1/q${i}.csv 54 | 55 | endloop 56 | 57 | statement ok 58 | USE memory 59 | 60 | statement ok 61 | DETACH db 62 | 63 | statement ok 64 | ATTACH 's3://test-bucket/presigned/lineitem_sf1.db' AS db (READONLY 1); 65 | 66 | statement ok 67 | USE db 68 | 69 | query IIIIIIIIIIIIIIII 70 | select count(distinct columns(*)) from lineitem; 71 | ---- 72 | 1500000 200000 10000 7 50 933900 11 9 3 2 2526 2466 2554 4 7 3610733 73 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/parquet_2102.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/parquet_2102.test_slow 2 | # description: Missing Column Data After Adding Left Join To Query in DuckDB Version 0.2.8 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | CREATE TABLE view_one AS SELECT * FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/issue2102_one.parquet'; 11 | 12 | statement ok 13 | CREATE TABLE view_two AS SELECT * FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/issue2102_two.parquet'; 14 | 15 | query I 16 | SELECT COUNT(*) FROM view_one WHERE date IS NULL 17 | ---- 18 | 6219 19 | 20 | statement ok 21 | CREATE TABLE tbl1 AS SELECT one.id id, one.date date 22 | FROM 23 | view_one AS one 24 | JOIN 25 | view_two two ON two.id = one.id AND two.line = 1; 26 | 27 | query I 28 | SELECT COUNT(*) FROM tbl1 29 | ---- 30 | 691951 31 | 32 | query I 33 | SELECT COUNT(*) FROM tbl1 WHERE date IS NULL 34 | ---- 35 | 4742 36 | 37 | statement ok 38 | CREATE TABLE tbl2 AS SELECT one.id id, one.date date 39 | FROM 40 | view_one AS one 41 | LEFT JOIN 42 | view_two two ON two.id = one.id AND two.line = 1; 43 | 44 | query I 45 | SELECT COUNT(*) FROM tbl2 46 | ---- 47 | 695434 48 | 49 | query I 50 | SELECT COUNT(*) FROM tbl2 WHERE date IS NULL 51 | ---- 
52 | 6219 53 | 54 | statement ok 55 | CREATE TABLE tbl3 AS SELECT one.id id, one.date date 56 | FROM 57 | view_one AS one 58 | LEFT JOIN 59 | view_two two ON two.id = one.id; 60 | 61 | query I 62 | SELECT COUNT(*) FROM tbl3 63 | ---- 64 | 768666 65 | 66 | query I 67 | SELECT COUNT(*) FROM tbl3 WHERE date IS NULL 68 | ---- 69 | 7124 70 | -------------------------------------------------------------------------------- /test/extension/autoloading_load_only.test: -------------------------------------------------------------------------------- 1 | # name: test/extension/autoloading_load_only.test 2 | # description: Tests for autoloading with no autoinstall 3 | # group: [extension] 4 | 5 | require httpfs 6 | 7 | # This test assumes icu and json to be available in the LOCAL_EXTENSION_REPO and NOT linked into duckdb statically 8 | # -> this should be the case for our autoloading tests where we have the local_extension_repo variable set 9 | require-env LOCAL_EXTENSION_REPO 10 | 11 | # Ensure we have a clean extension directory without any preinstalled extensions 12 | statement ok 13 | set extension_directory='__TEST_DIR__/autoloading_load_only' 14 | 15 | ### No autoloading nor installing: throw error with installation hint 16 | statement ok 17 | set autoload_known_extensions=false 18 | 19 | statement ok 20 | set autoinstall_known_extensions=false 21 | 22 | statement error 23 | SET s3_region='eu-west-1'; 24 | ---- 25 | :.*Catalog Error.*Setting with name "s3_region" is not in the catalog.* 26 | 27 | ### Autoloading but not autoinstall, while the extension is not installed: still not working 28 | statement ok 29 | set autoload_known_extensions=true 30 | 31 | statement ok 32 | set autoinstall_extension_repository='/tmp/non-existent-repo'; 33 | 34 | statement error 35 | SET s3_region='eu-west-1'; 36 | ---- 37 | :.*Extension Autoloading Error.*An error occurred while trying to automatically install the required extension 'httpfs'.* 38 | 39 | ### Manually install the extension from the local repo 40 | statement ok 41 | INSTALL httpfs FROM '${LOCAL_EXTENSION_REPO}' 42 | 43 | # now autoloading works! 44 | statement ok 45 | SET s3_region='eu-west-1'; 46 | -------------------------------------------------------------------------------- /test/extension/autoloading_current_setting.test: -------------------------------------------------------------------------------- 1 | # name: test/extension/autoloading_current_setting.test 2 | # description: More tests for extension autoloading. 
3 | # group: [extension] 4 | 5 | # This test assumes icu and json to be available in the LOCAL_EXTENSION_REPO and NOT linked into duckdb statically 6 | # -> this should be the case for our autoloading tests where we have the local_extension_repo variable set 7 | require-env LOCAL_EXTENSION_REPO 8 | 9 | require httpfs 10 | 11 | statement ok 12 | set extension_directory='__TEST_DIR__/autoloading_current_setting' 13 | 14 | ### No autoloading: throw error with installation hint 15 | statement ok 16 | set autoload_known_extensions=false 17 | 18 | statement ok 19 | set autoinstall_known_extensions=false 20 | 21 | statement error 22 | select current_setting('s3_region'); 23 | ---- 24 | :.*Catalog Error.*Setting with name "s3_region" is not in the catalog.* 25 | 26 | ### Autoloading, but not autoinstall 27 | statement ok 28 | set autoload_known_extensions=true 29 | 30 | statement ok 31 | set autoinstall_extension_repository='/tmp/non-existent-repo'; 32 | 33 | # Error should inform the user on what's happening 34 | statement error 35 | select current_setting('s3_region'); 36 | ---- 37 | :.*Extension Autoloading Error.*An error occurred while trying to automatically install the required extension 'httpfs'.* 38 | 39 | ### Autoloading with autoinstall and correct extension repo 40 | statement ok 41 | set autoinstall_extension_repository='${LOCAL_EXTENSION_REPO}'; 42 | 43 | statement ok 44 | set autoinstall_known_extensions=true 45 | 46 | statement ok 47 | select current_setting('s3_region'); 48 | -------------------------------------------------------------------------------- /test/extension/autoloading_reset_setting.test: -------------------------------------------------------------------------------- 1 | # name: test/extension/autoloading_reset_setting.test 2 | # description: Testing resetting a setting that lives in an extension that can be autoloaded 3 | # group: [extension] 4 | 5 | require httpfs 6 | 7 | # This test assumes httpfs and json to be available in the LOCAL_EXTENSION_REPO and NOT linked into duckdb statically 8 | # -> this should be the case for our autoloading tests where we have the local_extension_repo variable set 9 | require-env LOCAL_EXTENSION_REPO 10 | 11 | statement ok 12 | set extension_directory='__TEST_DIR__/autoloading_reset_setting' 13 | 14 | ### No autoloading: throw error with installation hint 15 | statement ok 16 | set autoload_known_extensions=false 17 | 18 | statement ok 19 | set autoinstall_known_extensions=false 20 | 21 | # Testing reset setting 22 | statement error 23 | RESET s3_region; 24 | ---- 25 | Catalog Error: Setting with name "s3_region" is not in the catalog, but it exists in the httpfs extension.
26 | 27 | ### Autoloading, but no auto install 28 | statement ok 29 | set autoload_known_extensions=true 30 | 31 | statement ok 32 | set autoinstall_extension_repository='/tmp/non-existent-repo'; 33 | 34 | # Error should inform the user on whats happening 35 | statement error 36 | RESET s3_region; 37 | ---- 38 | Extension Autoloading Error: An error occurred while trying to automatically install the required extension 'httpfs': 39 | Extension 40 | 41 | ### Autoloading with correct tmp repo and autoinstall 42 | statement ok 43 | set autoinstall_extension_repository='${LOCAL_EXTENSION_REPO}'; 44 | 45 | statement ok 46 | set autoinstall_known_extensions=true 47 | 48 | statement ok 49 | RESET s3_region; 50 | -------------------------------------------------------------------------------- /test/extension/autoloading_filesystems.test: -------------------------------------------------------------------------------- 1 | # name: test/extension/autoloading_filesystems.test 2 | # description: Tests for autoloading with filesystems 3 | # group: [extension] 4 | 5 | require httpfs 6 | 7 | # This test assumes icu and json to be available in the LOCAL_EXTENSION_REPO and NOT linked into duckdb statically 8 | # -> this should be the case for our autoloading tests where we have the local_extension_repo variable set 9 | require-env LOCAL_EXTENSION_REPO 10 | 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | # Ensure we have a clean extension directory without any preinstalled extensions 15 | statement ok 16 | set extension_directory='__TEST_DIR__/autoloading_filesystems' 17 | 18 | ### No autoloading nor installing: throw error with installation hint 19 | statement ok 20 | set autoload_known_extensions=false 21 | 22 | statement ok 23 | set autoinstall_known_extensions=false 24 | 25 | statement error 26 | SELECT * FROM 's3://some-bucket/a-file.csv' 27 | ---- 28 | Missing Extension Error: File s3://some-bucket/a-file.csv requires the extension httpfs to be loaded 29 | 30 | ### With autoloading, install and correct repo 31 | statement ok 32 | set autoload_known_extensions=true 33 | 34 | statement ok 35 | set autoinstall_known_extensions=true 36 | 37 | statement ok 38 | set autoinstall_extension_repository='${LOCAL_EXTENSION_REPO}'; 39 | 40 | # Set an invalid endpoint to ensure we fail in the httpfs extension when trying to connect 41 | statement ok 42 | SET s3_endpoint='false_endpoint'; 43 | 44 | statement error 45 | SELECT * FROM 's3://some-bucket/a-file.csv' 46 | ---- 47 | Could not establish connection error for HTTP HEAD to 'https://some-bucket.false_endpoint/a-file.csv' 48 | -------------------------------------------------------------------------------- /scripts/run_s3_test_server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #Note: DONT run as root 3 | 4 | if [ ! -f test/test_data/attach.db ]; then 5 | echo "File test/test_data/attach.db not found, run ./scripts/generate_presigned_url.sh to generate" 6 | else 7 | rm -rf /tmp/minio_test_data 8 | rm -rf /tmp/minio_root_data 9 | mkdir -p /tmp/minio_test_data 10 | mkdir -p /tmp/minio_root_data 11 | docker compose -f scripts/minio_s3.yml -p duckdb-minio up -d 12 | 13 | # for testing presigned url 14 | container_name=$(docker ps -a --format '{{.Names}}' | grep -m 1 "duckdb-minio") 15 | echo $container_name 16 | 17 | for i in $(seq 1 360); 18 | do 19 | docker_finish_logs=$(docker logs $container_name 2>/dev/null | grep -m 1 'FINISHED SETTING UP MINIO' || echo '') 20 | if [ ! 
-z "${docker_finish_logs}" ]; then 21 | break 22 | fi 23 | sleep 1 24 | done 25 | 26 | 27 | export S3_SMALL_CSV_PRESIGNED_URL=$(docker logs $container_name 2>/dev/null | grep -m 1 'Share:.*phonenumbers\.csv' | grep -o 'http[s]\?://[^ ]\+') 28 | echo $S3_SMALL_CSV_PRESIGNED_URL 29 | 30 | export S3_SMALL_PARQUET_PRESIGNED_URL=$(docker logs $container_name 2>/dev/null | grep -m 1 'Share:.*t1\.parquet' | grep -o 'http[s]\?://[^ ]\+') 31 | echo $S3_SMALL_PARQUET_PRESIGNED_URL 32 | 33 | export S3_LARGE_PARQUET_PRESIGNED_URL=$(docker logs $container_name 2>/dev/null | grep -m 1 'Share:.*lineitem_large\.parquet' | grep -o 'http[s]\?://[^ ]\+') 34 | echo $S3_LARGE_PARQUET_PRESIGNED_URL 35 | 36 | export S3_ATTACH_DB_PRESIGNED_URL=$(docker logs $container_name 2>/dev/null | grep -m 1 'Share:.*attach\.db' | grep -o 'http[s]\?://[^ ]\+') 37 | echo $S3_ATTACH_DB_PRESIGNED_URL 38 | 39 | export S3_ATTACH_DB="s3://test-bucket/presigned/attach.db" 40 | fi -------------------------------------------------------------------------------- /test/sql/httpfs/hffs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/httpfs/hffs.test 2 | # description: Ensure the HuggingFace filesystem works as expected 3 | # group: [httpfs] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement error 10 | FROM parquet_scan('hf://') 11 | ---- 12 | IO Error: Failed to parse 'hf://'. Please format url like: 'hf://datasets/my-username/my-dataset/path/to/file.parquet' 13 | 14 | statement error 15 | FROM 'hf://file.parquet' 16 | ---- 17 | IO Error: Failed to parse 'hf://file.parquet'. Please format url like: 'hf://datasets/my-username/my-dataset/path/to/file.parquet' 18 | 19 | statement error 20 | FROM 'hf://yepthisdoesntwork/file.parquet' 21 | ---- 22 | IO Error: Failed to parse: 'hf://yepthisdoesntwork/file.parquet'. Currently DuckDB only supports querying datasets or spaces, so the url should start with 'hf://datasets' or 'hf://spaces' 23 | 24 | statement error 25 | FROM 'hf://stil/not/file.parquet' 26 | ---- 27 | IO Error: Failed to parse: 'hf://stil/not/file.parquet'. Currently DuckDB only supports querying datasets or spaces, so the url should start with 'hf://datasets' or 'hf://spaces' 28 | 29 | statement error 30 | FROM 'hf://datasets/file.parquet' 31 | ---- 32 | IO Error: Failed to parse 'hf://datasets/file.parquet'. Please format url like: 'hf://datasets/my-username/my-dataset/path/to/file.parquet' 33 | 34 | statement error 35 | FROM 'hf://datasets/myname/file.parquet' 36 | ---- 37 | IO Error: Failed to parse 'hf://datasets/myname/file.parquet'. Please format url like: 'hf://datasets/my-username/my-dataset/path/to/file.parquet' 38 | 39 | statement error 40 | FROM 'hf://datasets/**/file.parquet' 41 | ---- 42 | IO Error: Failed to parse 'hf://datasets/**/file.parquet'. 
Please format url like: 'hf://datasets/my-username/my-dataset/path/to/file.parquet' 43 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_overwriting.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_overwriting.test 2 | # description: Test secret overwriting and deleting 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | # Ensure any currently stored secrets don't interfere with the test 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | # Create some s3 secret 15 | statement ok 16 | CREATE SECRET my_secret ( 17 | TYPE S3, 18 | SCOPE 's3://bucket1' 19 | ) 20 | 21 | query II 22 | SELECT name, scope FROM duckdb_secrets(); 23 | ---- 24 | my_secret ['s3://bucket1'] 25 | 26 | statement error 27 | CREATE SECRET my_secret ( 28 | TYPE S3, 29 | KEY_ID 'my_key', 30 | SECRET 'my_secret', 31 | SCOPE 's3://bucket1' 32 | ) 33 | ---- 34 | Invalid Input Error: Temporary secret with name 'my_secret' already exists! 35 | 36 | # We should be able to replace the secret though 37 | statement ok 38 | CREATE OR REPLACE SECRET my_secret ( 39 | TYPE S3, 40 | SCOPE 's3://bucket2' 41 | ) 42 | 43 | query II 44 | SELECT name, scope FROM duckdb_secrets(); 45 | ---- 46 | my_secret ['s3://bucket2'] 47 | 48 | # We can also ignore if we want to 49 | statement ok 50 | CREATE SECRET IF NOT EXISTS my_secret ( 51 | TYPE S3, 52 | SCOPE 's3://bucket5' 53 | ) 54 | 55 | query II 56 | SELECT name, scope FROM duckdb_secrets(); 57 | ---- 58 | my_secret ['s3://bucket2'] 59 | 60 | # Now try dropping a secret that does not exist 61 | statement error 62 | DROP SECRET my_secret_does_not_exist; 63 | ---- 64 | Failed to remove non-existent secret with name 'my_secret_does_not_exist' 65 | 66 | # Drop one that does exist 67 | statement ok 68 | DROP SECRET my_secret; 69 | 70 | # Secret be gone! 
71 | query II 72 | SELECT name, scope FROM duckdb_secrets(); 73 | ---- 74 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12...3.29) 2 | 3 | project(HTTPFsExtension) 4 | 5 | add_extension_definitions() 6 | 7 | if (NOT EMSCRIPTEN) 8 | add_definitions(-DOVERRIDE_ENCRYPTION_UTILS=1) 9 | else() 10 | set(DUCKDB_EXTENSION_HTTPFS_LINKED_LIBS "../../third_party/mbedtls/libduckdb_mbedtls.a") 11 | endif() 12 | 13 | if(MINGW) 14 | set(OPENSSL_USE_STATIC_LIBS TRUE) 15 | endif() 16 | 17 | find_package(OpenSSL REQUIRED) 18 | find_package(CURL REQUIRED) 19 | include_directories(${OPENSSL_INCLUDE_DIR}) 20 | include_directories(${CURL_INCLUDE_DIRS}) 21 | 22 | include_directories(src/include 23 | ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) 24 | add_subdirectory(src) 25 | set(EXTENSION_SOURCES ${ALL_OBJECT_FILES}) 26 | 27 | build_static_extension(httpfs ${EXTENSION_SOURCES}) 28 | 29 | set(PARAMETERS "-warnings") 30 | build_loadable_extension(httpfs ${PARAMETERS} ${EXTENSION_SOURCES}) 31 | 32 | if(EMSCRIPTEN) 33 | target_link_libraries(httpfs_loadable_extension duckdb_mbedtls) 34 | else() 35 | target_link_libraries(httpfs_loadable_extension duckdb_mbedtls 36 | ${OPENSSL_LIBRARIES}) 37 | target_link_libraries(httpfs_extension duckdb_mbedtls ${OPENSSL_LIBRARIES}) 38 | 39 | # Link dependencies into extension 40 | target_link_libraries(httpfs_loadable_extension ${CURL_LIBRARIES}) 41 | target_link_libraries(httpfs_extension ${CURL_LIBRARIES}) 42 | 43 | 44 | if(MINGW) 45 | find_package(ZLIB) 46 | target_link_libraries(httpfs_loadable_extension ZLIB::ZLIB -lcrypt32) 47 | target_link_libraries(httpfs_extension ZLIB::ZLIB -lcrypt32) 48 | endif() 49 | endif() 50 | 51 | 52 | install( 53 | TARGETS httpfs_extension 54 | EXPORT "${DUCKDB_EXPORT_SET}" 55 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 56 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 57 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_cascading.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_cascading.test_slow 2 | # description: Test the cascading mechanism of secret settings 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | # Ensure any currently stored secrets don't interfere with the test 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | statement ok 15 | set s3_endpoint = 'invalid-on-purpose-setting' 16 | 17 | statement ok 18 | set s3_url_style = 'path' 19 | 20 | statement ok 21 | set s3_use_ssl = false 22 | 23 | # This secret overrides only the url style, not the endpoint 24 | statement ok 25 | CREATE SECRET s1 ( 26 | TYPE S3, 27 | REGION 'my_region', 28 | URL_STYLE 'vhost', 29 | SCOPE 's3://url-style-only' 30 | ) 31 | 32 | # This secret overrides both the url style and the endpoint 33 | statement ok 34 | CREATE SECRET s2 ( 35 | TYPE S3, 36 | REGION 'my_region', 37 | URL_STYLE 'vhost', 38 | ENDPOINT 'invalid-on-purpose-secret', 39 | SCOPE 's3://url-style-and-endpoint' 40 | ) 41 | 42 | # Only the url style from the secret is used 43 | statement error 44 | FROM 's3://url-style-only/test.csv' 45 | ---- 46 | Could not establish connection error for HTTP HEAD to 'http://url-style-only.invalid-on-purpose-setting/test.csv' 47 | 48 | # Both Url style and endpoint are 
used now 49 | statement error 50 | FROM 's3://url-style-and-endpoint/test.csv' 51 | ---- 52 | Could not establish connection error for HTTP HEAD to 'http://url-style-and-endpoint.invalid-on-purpose-secret/test.csv' 53 | 54 | # This request matches none of the secrets, we use the settings 55 | statement error 56 | FROM 's3://test-bucket/test.csv' 57 | ---- 58 | Could not establish connection error for HTTP HEAD to 'http://invalid-on-purpose-setting/test-bucket/test.csv' 59 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_defaults.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_defaults.test 2 | # description: Test default values during secret creation 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | # Ensure any currently stored secrets don't interfere with the test 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | statement ok 15 | DROP SECRET IF EXISTS s1; 16 | 17 | # Without a name we use the __default_ name. The default provider for the S3 type is config 18 | statement ok 19 | CREATE SECRET ( 20 | TYPE S3, 21 | KEY_ID 'my_key', 22 | SECRET 'my_secret' 23 | ) 24 | 25 | query IIII 26 | SELECT name, provider, type, scope FROM duckdb_secrets(); 27 | ---- 28 | __default_s3 config s3 ['s3://', 's3n://', 's3a://'] 29 | 30 | # Without a name we use the __default_ name. The default provider for the R2 type is config 31 | statement ok 32 | CREATE SECRET ( 33 | TYPE R2, 34 | KEY_ID 'my_key', 35 | SECRET 'my_secret', 36 | ACCOUNT_ID 'my_account_id' 37 | ) 38 | 39 | query IIII 40 | SELECT name, provider, type, scope FROM duckdb_secrets() ORDER BY name; 41 | ---- 42 | __default_r2 config r2 ['r2://'] 43 | __default_s3 config s3 ['s3://', 's3n://', 's3a://'] 44 | 45 | 46 | # Without a name we use the __default_ name.
The default provider for the GCS type is config 47 | statement ok 48 | CREATE SECRET ( 49 | TYPE GCS, 50 | KEY_ID 'my_key', 51 | SECRET 'my_secret' 52 | ) 53 | 54 | # duckdb_secrets with all defaults looks like this now 55 | query IIIIII 56 | SELECT name, persistent, storage, provider, type, scope FROM duckdb_secrets() ORDER BY name; 57 | ---- 58 | __default_gcs 0 memory config gcs ['gcs://', 'gs://'] 59 | __default_r2 0 memory config r2 ['r2://'] 60 | __default_s3 0 memory config s3 ['s3://', 's3n://', 's3a://'] -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_settings.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_settings.test 2 | # description: Test setting secret settings 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | load __TEST_DIR__/secrets_settings.db 9 | 10 | require httpfs 11 | 12 | statement ok 13 | set secret_directory='__TEST_DIR__/create_secret_settings1' 14 | 15 | statement ok 16 | set allow_persistent_secrets=true; 17 | 18 | # Create an S3 secret; normally the default is TEMPORARY 19 | statement ok 20 | CREATE PERSISTENT SECRET my_perm_secret ( 21 | TYPE S3, 22 | SCOPE 's3://bucket1' 23 | ) 24 | 25 | query II 26 | SELECT name, scope from duckdb_secrets(); 27 | ---- 28 | my_perm_secret ['s3://bucket1'] 29 | 30 | statement error 31 | set secret_directory='__TEST_DIR__/create_secret_settings2' 32 | ---- 33 | Invalid Input Error: Changing Secret Manager settings after the secret manager is used is not allowed! 34 | 35 | statement error 36 | set allow_persistent_secrets=false; 37 | ---- 38 | Invalid Input Error: Changing Secret Manager settings after the secret manager is used is not allowed! 39 | 40 | # This setting CAN be modified after init 41 | statement ok 42 | set default_secret_storage = 'local_file' 43 | 44 | statement ok 45 | reset default_secret_storage; 46 | 47 | restart 48 | 49 | # When disabling secrets, we won't read the one that we wrote earlier 50 | statement ok 51 | set allow_persistent_secrets=false 52 | 53 | query I 54 | select count(*) from duckdb_secrets(); 55 | ---- 56 | 0 57 | 58 | restart 59 | 60 | # Switch settings back and it works again 61 | statement ok 62 | set allow_persistent_secrets=true 63 | 64 | # setting the path correctly, it will work 65 | statement ok 66 | set secret_directory='__TEST_DIR__/create_secret_settings1' 67 | 68 | query II 69 | SELECT name, scope from duckdb_secrets(); 70 | ---- 71 | my_perm_secret ['s3://bucket1'] 72 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_r2.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_r2.test 2 | # description: Test secret creation using the default r2 secret provider 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | statement ok 11 | set secret_directory='__TEST_DIR__/create_secret_r2' 12 | 13 | # R2 secrets will, instead of requiring the endpoint of .r2.cloudflarestorage.com to be constructed manually, 14 | # use the account_id to configure it. The region is not required at all.
Also the scope defaults to r2:// 15 | statement ok 16 | CREATE SECRET ( 17 | TYPE R2, 18 | ACCOUNT_ID 'some_bogus_account', 19 | KEY_ID 'my_key', 20 | SECRET 'my_secret' 21 | ) 22 | 23 | # The secret will be created for the default scope 24 | query IIII 25 | SELECT name, type, provider, scope FROM duckdb_secrets(); 26 | ---- 27 | __default_r2 r2 config ['r2://'] 28 | 29 | # 30 | statement error 31 | FROM 's3://test-bucket/test.csv' 32 | ---- 33 | :.*HTTP Error.* 34 | 35 | # Account ID is only for R2, trying to set this for S3 will fail 36 | statement error 37 | CREATE SECRET ( 38 | TYPE S3, 39 | ACCOUNT_ID 'some_bogus_account', 40 | KEY_ID 'my_key', 41 | SECRET 'my_secret' 42 | ) 43 | ---- 44 | Binder Error: Unknown parameter 'account_id' for secret type 's3' with default provider 'config' 45 | 46 | # Account ID is only for R2, trying to set this for GCS will fail 47 | statement error 48 | CREATE SECRET ( 49 | TYPE GCS, 50 | PROVIDER config, 51 | ACCOUNT_ID 'some_bogus_account', 52 | KEY_ID 'my_key', 53 | SECRET 'my_secret' 54 | ) 55 | ---- 56 | Binder Error: Unknown parameter 'account_id' for secret type 'gcs' with provider 'config' 57 | 58 | # Ensure secret lookup works correctly; 59 | statement ok 60 | CREATE SECRET test( 61 | TYPE R2, 62 | ACCOUNT_ID 'some_bogus_account', 63 | KEY_ID 'my_key', 64 | SECRET 'my_secret' 65 | ) 66 | -------------------------------------------------------------------------------- /test/sql/storage/encryption/temp_files/encrypted_out_of_core.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/storage/encryption/temp_files/encrypted_out_of_core.test_slow 2 | # description: Encrypted large joins in persistent databases have a leftover temporary directory. 3 | # group: [temp_files] 4 | 5 | foreach cipher GCM CTR 6 | 7 | 8 | require httpfs 9 | 10 | require tpch 11 | 12 | load __TEST_DIR__/leftover_temp_files.db 13 | 14 | statement ok 15 | ATTACH '__TEST_DIR__/encrypted_temp_files_${cipher}.db' AS enc_${cipher} (ENCRYPTION_KEY 'asdf', ENCRYPTION_CIPHER '${cipher}'); 16 | 17 | statement ok 18 | SET temp_file_encryption=true; 19 | 20 | statement ok 21 | USE enc_${cipher}; 22 | 23 | statement ok 24 | SET threads=8 25 | 26 | statement ok 27 | SET memory_limit='1GB'; 28 | 29 | statement ok 30 | CALL dbgen(sf=1); 31 | 32 | statement ok 33 | ALTER TABLE lineitem RENAME TO lineitem1 34 | 35 | statement ok 36 | CREATE TABLE lineitem2 AS FROM lineitem1 37 | 38 | # creating and dropping a table with an ORDER BY 39 | statement ok 40 | CREATE OR REPLACE TEMPORARY TABLE ans as select l1.*, l1.* from lineitem1 l1 ORDER BY l_orderkey, l_returnflag 41 | 42 | statement ok 43 | DROP TABLE ans; 44 | 45 | # performing a small hash join 46 | statement ok 47 | CREATE OR REPLACE TEMPORARY TABLE ans as select l1.*, l2.* from lineitem1 l1 JOIN (FROM lineitem2 l2 WHERE l_orderkey<10000) AS l2 USING (l_orderkey, l_linenumber) 48 | 49 | statement ok 50 | DROP TABLE ans; 51 | 52 | # performing a large window function 53 | statement ok 54 | CREATE OR REPLACE TEMPORARY TABLE ans as select l1.*, row_number() OVER (PARTITION BY l_orderkey, l_linenumber ORDER BY l_orderkey) from lineitem1 l1 55 | 56 | statement ok 57 | DROP TABLE ans; 58 | 59 | # performing a large hash join 60 | statement ok 61 | CREATE OR REPLACE TEMPORARY TABLE ans as select l1.*, l2.* from lineitem1 l1 JOIN lineitem2 l2 USING (l_orderkey, l_linenumber) 62 | 63 | statement ok 64 | DROP TABLE ans; 65 | 66 | restart 67 | 68 | endloop 
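# --- Editor's note: minimal illustrative sketch, not part of the original encrypted_out_of_core.test_slow above ---
# The pattern the test exercises is: attach an encrypted database, opt temporary files into encryption,
# cap memory so operators may spill, and run the workload. Database/table names below are hypothetical.
statement ok
ATTACH '__TEST_DIR__/enc_sketch.db' AS enc_sketch (ENCRYPTION_KEY 'asdf', ENCRYPTION_CIPHER 'GCM');

statement ok
SET temp_file_encryption=true;

statement ok
SET memory_limit='1GB';

statement ok
CREATE TABLE enc_sketch.t AS SELECT range AS i FROM range(1000000);

statement ok
CREATE OR REPLACE TEMPORARY TABLE ans AS SELECT * FROM enc_sketch.t ORDER BY i;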
-------------------------------------------------------------------------------- /test/sql/secrets/secret_compatibility_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/secret_compatibility_httpfs.test 2 | # description: Test secret compatibility across versions 3 | # group: [secrets] 4 | 5 | require httpfs 6 | 7 | require-env TEST_PERSISTENT_SECRETS_AVAILABLE 8 | 9 | # Ensure any currently stored secrets don't interfere with the test 10 | statement ok 11 | set secret_directory='./data/secrets/httpfs' 12 | 13 | query IIIIIII 14 | from duckdb_secrets() order by name; 15 | ---- 16 | s3_config_secret_v1_1_2 s3 config true local_file ['s3://', 's3n://', 's3a://'] name=s3_config_secret_v1_1_2;type=s3;provider=config;serializable=true;scope=s3://,s3n://,s3a://;region=us-east-2;use_ssl=false 17 | s3_config_secret_v1_1_3 s3 config true local_file ['s3://', 's3n://', 's3a://'] name=s3_config_secret_v1_1_3;type=s3;provider=config;serializable=true;scope=s3://,s3n://,s3a://;region=us-east-2;use_ssl=false 18 | s3_config_secret_v_1_0_0 s3 config true local_file ['s3://', 's3n://', 's3a://'] name=s3_config_secret_v_1_0_0;type=s3;provider=config;serializable=true;scope=s3://,s3n://,s3a://;endpoint=s3.amazonaws.com;key_id=;region=us-east-2;s3_url_compatibility_mode=0;secret=redacted;session_token=redacted;url_style=;use_ssl=0 19 | s3_secret_chain_v_1_0_0 s3 credential_chain true local_file ['s3://', 's3n://', 's3a://'] name=s3_secret_chain_v_1_0_0;type=s3;provider=credential_chain;serializable=true;scope=s3://,s3n://,s3a://;endpoint=s3.amazonaws.com;region=us-east-2;use_ssl=false 20 | s3_secret_chain_v_1_1_2 s3 credential_chain true local_file ['s3://', 's3n://', 's3a://'] name=s3_secret_chain_v_1_1_2;type=s3;provider=credential_chain;serializable=true;scope=s3://,s3n://,s3a://;endpoint=s3.amazonaws.com;region=us-east-2;use_ssl=false 21 | s3_secret_chain_v_1_1_3 s3 credential_chain true local_file ['s3://', 's3n://', 's3a://'] name=s3_secret_chain_v_1_1_3;type=s3;provider=credential_chain;serializable=true;scope=s3://,s3n://,s3a://;endpoint=s3.amazonaws.com;region=us-east-2;use_ssl=false 22 | -------------------------------------------------------------------------------- /test/sql/copy/s3/hive_partitioned_write_s3.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/hive_partitioned_write_s3.test_slow 2 | # description: slow test for the hive partitioned write to s3 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require tpch 10 | 11 | require-env S3_TEST_SERVER_AVAILABLE 1 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 24 | set ignore_error_messages 25 | 26 | statement ok 27 | pragma memory_limit='200MB' 28 | 29 | statement ok 30 | set http_timeout=120000; 31 | 32 | # More retries (longest wait will be 25600ms) 33 | statement ok 34 | set http_retries=6; 35 | 36 | # around 200MB worth of data, will require the PartitionedColumnData to spill to disk 37 | statement ok 38 | COPY (SELECT i%2::INT32 as part_col, i::INT32 FROM range(0,25000000) tbl(i)) TO 's3://test-bucket/partitioned_memory_spill' (FORMAT parquet, PARTITION_BY part_col, overwrite_or_ignore TRUE); 
39 | 40 | statement ok 41 | pragma memory_limit='-1' 42 | 43 | statement ok 44 | call dbgen(sf=1); 45 | 46 | # Partition by 2 columns 47 | statement ok 48 | COPY lineitem TO 's3://test-bucket/lineitem_sf1_partitioned' (FORMAT PARQUET, PARTITION_BY (l_returnflag, l_linestatus), overwrite_or_ignore TRUE); 49 | 50 | statement ok 51 | DROP TABLE lineitem; 52 | 53 | statement ok 54 | CREATE VIEW lineitem as SELECT * FROM parquet_scan('s3://test-bucket/lineitem_sf1_partitioned/*/*/*.parquet', HIVE_PARTITIONING=1); 55 | 56 | loop i 1 9 57 | 58 | query I 59 | PRAGMA tpch(${i}) 60 | ---- 61 | :duckdb/extension/tpch/dbgen/answers/sf1/q0${i}.csv 62 | 63 | endloop 64 | 65 | loop i 10 23 66 | 67 | query I 68 | PRAGMA tpch(${i}) 69 | ---- 70 | :duckdb/extension/tpch/dbgen/answers/sf1/q${i}.csv 71 | 72 | endloop 73 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_r2_serialization.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_r2_serialization.test 2 | # description: Demo of secret serialization 3 | # group: [secrets] 4 | 5 | # NOTE: this is a testing feature that will be removed / replaced with actual persistent secrets. 6 | 7 | require httpfs 8 | 9 | require parquet 10 | 11 | load __TEST_DIR__/test_serialize_secrets.db 12 | 13 | statement ok 14 | PRAGMA enable_verification; 15 | 16 | statement ok 17 | set secret_directory='__TEST_DIR__/create_secret_r2_serialization' 18 | 19 | statement ok 20 | CREATE OR REPLACE PERSISTENT SECRET s1 ( 21 | TYPE S3, 22 | PROVIDER config, 23 | SCOPE 's3://my_scope', 24 | KEY_ID 'mekey', 25 | SECRET 'mesecret', 26 | REGION 'meregion', 27 | SESSION_TOKEN 'mesesh', 28 | ENDPOINT 'meendpoint', 29 | URL_STYLE 'mahstyle', 30 | USE_SSL true, 31 | URL_COMPATIBILITY_MODE true 32 | ) 33 | 34 | query IIII 35 | select name, type, provider, scope FROM duckdb_secrets(); 36 | ---- 37 | s1 s3 config ['s3://my_scope'] 38 | 39 | query I nosort secret_to_string 40 | select * from duckdb_secrets(); 41 | ---- 42 | 43 | restart 44 | 45 | # Now setting the secret dir somewhere nonexistent will yield no persistent secrets 46 | statement ok 47 | set secret_directory='__TEST_DIR__/does_not_exist2' 48 | 49 | query I 50 | select count(*) FROM duckdb_secrets(); 51 | ---- 52 | 0 53 | 54 | restart 55 | 56 | # However, setting it to the dir that does exist, we can suddenly see our persisted secrets 57 | statement ok 58 | set secret_directory='__TEST_DIR__/create_secret_r2_serialization' 59 | 60 | # After restart secret is still there 61 | query IIII 62 | select name, type, provider, scope FROM duckdb_secrets(); 63 | ---- 64 | s1 s3 config ['s3://my_scope'] 65 | 66 | # Even more: it matches the exact string. Note that we don't disable redaction here to ensure we cover 67 | # redaction set serialization with this test 68 | query I nosort secret_to_string 69 | select * from duckdb_secrets(); 70 | ---- -------------------------------------------------------------------------------- /test/sql/copy/s3/upload_large_file.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/upload_large_file.test_slow 2 | # description: Copy large csv/parquet files from and to S3.
3 | # group: [s3] 4 | 5 | require tpch 6 | 7 | require parquet 8 | 9 | require httpfs 10 | 11 | require-env S3_TEST_SERVER_AVAILABLE 1 12 | 13 | # Require that these environment variables are also set 14 | 15 | require-env AWS_DEFAULT_REGION 16 | 17 | require-env AWS_ACCESS_KEY_ID 18 | 19 | require-env AWS_SECRET_ACCESS_KEY 20 | 21 | require-env DUCKDB_S3_ENDPOINT 22 | 23 | require-env DUCKDB_S3_USE_SSL 24 | 25 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 26 | set ignore_error_messages 27 | 28 | # confirm we use a reasonable amount of memory 29 | statement ok 30 | SET memory_limit='2.5GB'; 31 | 32 | statement ok 33 | set http_timeout=120000; 34 | 35 | # More retries (longest wait will be 25600ms) 36 | statement ok 37 | set http_retries=6; 38 | 39 | # disable tmp dir to force OOM if we exceed our set limit 40 | statement ok 41 | PRAGMA temp_directory='' 42 | 43 | statement ok 44 | SET s3_uploader_thread_limit = 5; 45 | 46 | statement ok 47 | CALL DBGEN(sf=1) 48 | 49 | query I 50 | SELECT 51 | sum(l_extendedprice * l_discount) AS revenue 52 | FROM 53 | lineitem 54 | WHERE 55 | l_shipdate >= CAST('1994-01-01' AS date) 56 | AND l_shipdate < CAST('1995-01-01' AS date) 57 | AND l_discount BETWEEN 0.05 58 | AND 0.07 59 | AND l_quantity < 24; 60 | ---- 61 | 123141078.2283 62 | 63 | # Parquet file ~300MB 64 | statement ok 65 | COPY lineitem TO 's3://test-bucket/multipart/export_large.parquet' (FORMAT 'parquet'); 66 | 67 | query I 68 | SELECT 69 | sum(l_extendedprice * l_discount) AS revenue 70 | FROM 71 | "s3://test-bucket/multipart/export_large.parquet" 72 | WHERE 73 | l_shipdate >= CAST('1994-01-01' AS date) 74 | AND l_shipdate < CAST('1995-01-01' AS date) 75 | AND l_discount BETWEEN 0.05 76 | AND 0.07 77 | AND l_quantity < 24; 78 | ---- 79 | 123141078.2283 80 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/parquet_encryption_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/parquet_encryption_httpfs.test 2 | # description: Test Parquet encryption with OpenSSL 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | # parquet keys are not persisted across restarts 10 | statement ok 11 | PRAGMA enable_verification 12 | 13 | # add keys of 3 different lengths 14 | statement ok 15 | PRAGMA add_parquet_key('key128', '0123456789112345') 16 | 17 | statement ok 18 | PRAGMA add_parquet_key('key192', '012345678911234501234567') 19 | 20 | statement ok 21 | PRAGMA add_parquet_key('key256', '01234567891123450123456789112345') 22 | 23 | # test all valid AES key lengths 24 | foreach key_len 128 192 256 25 | 26 | statement ok 27 | COPY (SELECT 42 i) to '__TEST_DIR__/encrypted${key_len}_openssl.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'}) 28 | 29 | query I 30 | SELECT * FROM read_parquet('__TEST_DIR__/encrypted${key_len}_openssl.parquet', encryption_config={footer_key: 'key${key_len}'}) 31 | ---- 32 | 42 33 | 34 | statement ok 35 | CREATE OR REPLACE TABLE test (i INTEGER) 36 | 37 | statement ok 38 | COPY test FROM '__TEST_DIR__/encrypted${key_len}_openssl.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'}) 39 | 40 | query I 41 | SELECT * FROM test 42 | ---- 43 | 42 44 | 45 | endloop 46 | 47 | # what happens if we don't try to decrypt even if the file is encrypted? 
48 | statement error 49 | SELECT * FROM read_parquet('__TEST_DIR__/encrypted128_openssl.parquet') 50 | ---- 51 | Invalid Input Error 52 | 53 | # what if we try to decrypt with the wrong key? 54 | statement error 55 | SELECT * FROM read_parquet('__TEST_DIR__/encrypted128_openssl.parquet', encryption_config={footer_key: 'key192'}) 56 | ---- 57 | Invalid Input Error: Computed AES tag differs from read AES tag, are you using the right key? 58 | 59 | # what if we don't encrypt, but try to decrypt? 60 | statement ok 61 | COPY (SELECT 42 i) to '__TEST_DIR__/unencrypted.parquet' 62 | 63 | statement error 64 | SELECT * FROM read_parquet('__TEST_DIR__/unencrypted.parquet', encryption_config={footer_key: 'key256'}) 65 | ---- 66 | Invalid Input Error 67 | -------------------------------------------------------------------------------- /test/sql/copy/csv/glob/copy_csv_glob_s3.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/glob/copy_csv_glob_s3.test 2 | # description: Test globbing CSVs on s3 3 | # group: [glob] 4 | 5 | statement ok 6 | PRAGMA enable_verification 7 | 8 | require httpfs 9 | 10 | require-env S3_TEST_SERVER_AVAILABLE 1 11 | 12 | # Require that these environment variables are also set 13 | 14 | require-env AWS_DEFAULT_REGION 15 | 16 | require-env AWS_ACCESS_KEY_ID 17 | 18 | require-env AWS_SECRET_ACCESS_KEY 19 | 20 | require-env DUCKDB_S3_ENDPOINT 21 | 22 | require-env DUCKDB_S3_USE_SSL 23 | 24 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 25 | set ignore_error_messages 26 | 27 | # copy files to S3 before beginning tests 28 | statement ok 29 | COPY (select * from 'duckdb/data/csv/glob/a1/a1.csv') to 's3://test-bucket/copy_csv_glob_s3/copy/a1/a1.csv'; 30 | 31 | statement ok 32 | COPY (select * from 'duckdb/data/csv/glob/a2/a2.csv') to 's3://test-bucket/copy_csv_glob_s3/copy/a2/a2.csv'; 33 | 34 | statement ok 35 | COPY (select * from 'duckdb/data/csv/glob/a3/b1.csv') to 's3://test-bucket/copy_csv_glob_s3/copy/a3/b1.csv'; 36 | 37 | statement ok 38 | COPY (select null) to 's3://test-bucket/glob/copy/empty/empty.csv'; 39 | 40 | statement ok 41 | COPY (select * from 'duckdb/data/csv/glob/i1/integer.csv') to 's3://test-bucket/copy_csv_glob_s3/copy/empty/integer.csv'; 42 | 43 | statement ok 44 | CREATE TABLE dates(d DATE); 45 | 46 | statement ok 47 | COPY dates FROM 's3://test-bucket/copy_csv_glob_s3/copy/a[123]/*.csv' (AUTO_DETECT 1); 48 | 49 | # simple globbing for both url styles 50 | foreach urlstyle path vhost 51 | 52 | statement ok 53 | SET s3_url_style='${urlstyle}' 54 | 55 | query I 56 | SELECT * FROM dates ORDER BY 1 57 | ---- 58 | 2019-06-05 59 | 2019-06-15 60 | 2019-06-25 61 | 2019-07-05 62 | 2019-07-15 63 | 2019-07-25 64 | 2019-08-05 65 | 2019-08-15 66 | 2019-08-25 67 | 68 | # nothing matches the glob 69 | statement error 70 | INSERT INTO dates FROM read_csv('s3://test-bucket/copy_csv_glob_s3/copy/*/a*a.csv', auto_detect=1) 71 | ---- 72 | No files found that match the pattern 73 | 74 | endloop 75 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_minio.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_minio.test 2 | # description: Test s3 secrets actually work using minio 3 | # group: [secrets] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | # 
Require that these environment variables are also set 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | set ignore_error_messages 24 | 25 | load __TEST_DIR__/persistent_secrets.db 26 | 27 | statement ok 28 | PRAGMA enable_verification; 29 | 30 | statement ok 31 | set secret_directory='__TEST_DIR__/create_secret_minio' 32 | 33 | # first need to unset the duckdb settings: currently the env variables are loaded automatically making all queries auth 34 | statement ok 35 | set s3_access_key_id=''; 36 | 37 | statement ok 38 | set s3_secret_access_key=''; 39 | 40 | statement error 41 | copy (select 1 as a) to 's3://test- /test-file.parquet' 42 | ---- 43 | 44 | # Now we create a scoped secret with correct credentials 45 | statement ok 46 | CREATE PERSISTENT SECRET ( 47 | TYPE S3, 48 | PROVIDER config, 49 | SCOPE 's3://test-bucket/only-this-file-gets-auth.parquet', 50 | KEY_ID '${AWS_ACCESS_KEY_ID}', 51 | SECRET '${AWS_SECRET_ACCESS_KEY}', 52 | REGION '${AWS_DEFAULT_REGION}', 53 | ENDPOINT '${DUCKDB_S3_ENDPOINT}', 54 | USE_SSL '${DUCKDB_S3_USE_SSL}' 55 | ) 56 | 57 | # scope doesn't match! query still fails 58 | statement error 59 | copy (select 1 as a) to 's3://test-bucket/test-file.parquet' 60 | ---- 61 | 62 | # scope matches, the secret is chosen and the query will succeed 63 | statement ok 64 | copy (select 1 as a) to 's3://test-bucket/only-this-file-gets-auth.parquet' 65 | 66 | restart 67 | 68 | statement ok 69 | set secret_directory='__TEST_DIR__/create_secret_minio' 70 | 71 | # persistent secrets survive restart 72 | statement ok 73 | copy (select 1 as a) to 's3://test-bucket/only-this-file-gets-auth.parquet' 74 | 75 | # It's still scoped 76 | statement error 77 | copy (select 1 as a) to 's3://test-bucket/no-auth-here.parquet' 78 | ---- -------------------------------------------------------------------------------- /src/include/httpfs_client.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb/common/http_util.hpp" 4 | 5 | namespace duckdb { 6 | class HTTPLogger; 7 | class FileOpener; 8 | struct FileOpenerInfo; 9 | class HTTPState; 10 | 11 | struct HTTPFSParams : public HTTPParams { 12 | HTTPFSParams(HTTPUtil &http_util) : HTTPParams(http_util) { 13 | } 14 | 15 | static constexpr bool DEFAULT_ENABLE_SERVER_CERT_VERIFICATION = false; 16 | static constexpr uint64_t DEFAULT_HF_MAX_PER_PAGE = 0; 17 | static constexpr bool DEFAULT_FORCE_DOWNLOAD = false; 18 | static constexpr bool AUTO_FALLBACK_TO_FULL_DOWNLOAD = true; 19 | 20 | bool force_download = DEFAULT_FORCE_DOWNLOAD; 21 | bool auto_fallback_to_full_download = AUTO_FALLBACK_TO_FULL_DOWNLOAD; 22 | bool enable_server_cert_verification = DEFAULT_ENABLE_SERVER_CERT_VERIFICATION; 23 | bool enable_curl_server_cert_verification = true; 24 | idx_t hf_max_per_page = DEFAULT_HF_MAX_PER_PAGE; 25 | string ca_cert_file; 26 | string bearer_token; 27 | bool unsafe_disable_etag_checks {false}; 28 | shared_ptr state; 29 | string user_agent = {""}; 30 | // Additional fields need to be appended at the end and need to be propagated to duckdb-wasm 31 | // TODO: make this unnecessary 32 | }; 33 | 34 | class HTTPFSUtil : public HTTPUtil { 35 | public: 36 | unique_ptr InitializeParameters(optional_ptr opener, 37 | optional_ptr info) override; 38 | unique_ptr InitializeClient(HTTPParams &http_params, const string
&proto_host_port) override; 39 | 40 | static unordered_map ParseGetParameters(const string &text); 41 | static shared_ptr GetHTTPUtil(optional_ptr opener); 42 | 43 | string GetName() const override; 44 | }; 45 | 46 | class HTTPFSCurlUtil : public HTTPFSUtil { 47 | public: 48 | unique_ptr InitializeClient(HTTPParams &http_params, const string &proto_host_port) override; 49 | 50 | static unordered_map ParseGetParameters(const string &text); 51 | 52 | string GetName() const override; 53 | }; 54 | 55 | struct HeaderCollector { 56 | std::vector header_collection; 57 | }; 58 | 59 | } // namespace duckdb 60 | -------------------------------------------------------------------------------- /test/sql/copy/csv/parallel/csv_parallel_httpfs.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/parallel/csv_parallel_httpfs.test 2 | # description: This test issue #7336 and #7337 3 | # group: [parallel] 4 | 5 | statement ok 6 | PRAGMA enable_verification 7 | 8 | require httpfs 9 | 10 | query IIII 11 | select column00, column01, column02, column03 from read_csv_auto('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv') 12 | ---- 13 | 1 AAAAAAAABAAAAAAA 980124 7135 14 | 2 AAAAAAAACAAAAAAA 819667 1461 15 | 3 AAAAAAAADAAAAAAA 1473522 6247 16 | 4 AAAAAAAAEAAAAAAA 1703214 3986 17 | 5 AAAAAAAAFAAAAAAA 953372 4470 18 | 6 AAAAAAAAGAAAAAAA 213219 6374 19 | 7 AAAAAAAAHAAAAAAA 68377 3219 20 | 8 AAAAAAAAIAAAAAAA 1215897 2471 21 | 9 AAAAAAAAJAAAAAAA 1168667 1404 22 | 10 AAAAAAAAKAAAAAAA 1207553 5143 23 | 24 | query IIIIIIIIIIIIIIIIII 25 | from read_csv_auto('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv'); 26 | ---- 27 | 1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208 Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 28 | 2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 29 | 3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 30 | 4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 31 | 5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 32 | 6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 33 | 7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 34 | 8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 35 | 9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 36 | 10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. 
Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 37 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/test_yellow_cab.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/test_yellow_cab.test_slow 2 | # description: Test yellow cab parquet file 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | statement ok 10 | CREATE TABLE yellow_cab AS SELECT * FROM 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/yellowcab.parquet' 11 | 12 | statement ok 13 | PRAGMA enable_verification 14 | 15 | query IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 16 | select min(VendorID::VARCHAR), max(VendorID::VARCHAR), min(tpep_pickup_datetime::VARCHAR), max(tpep_pickup_datetime::VARCHAR), min(tpep_dropoff_datetime::VARCHAR), max(tpep_dropoff_datetime::VARCHAR), min(passenger_count::VARCHAR), max(passenger_count::VARCHAR), min(trip_distance::VARCHAR), max(trip_distance::VARCHAR), min(pickup_longitude::VARCHAR), max(pickup_longitude::VARCHAR), min(pickup_latitude::VARCHAR), max(pickup_latitude::VARCHAR), min(RatecodeID::VARCHAR), max(RatecodeID::VARCHAR), min(store_and_fwd_flag::VARCHAR), max(store_and_fwd_flag::VARCHAR), min(dropoff_longitude::VARCHAR), max(dropoff_longitude::VARCHAR), min(dropoff_latitude::VARCHAR), max(dropoff_latitude::VARCHAR), min(payment_type::VARCHAR), max(payment_type::VARCHAR), min(fare_amount::VARCHAR), max(fare_amount::VARCHAR), min(extra::VARCHAR), max(extra::VARCHAR), min(mta_tax::VARCHAR), max(mta_tax::VARCHAR), min(tip_amount::VARCHAR), max(tip_amount::VARCHAR), min(tolls_amount::VARCHAR), max(tolls_amount::VARCHAR), min(improvement_surcharge::VARCHAR), max(improvement_surcharge::VARCHAR), min(total_amount::VARCHAR), max(total_amount::VARCHAR) from yellow_cab; 17 | ---- 18 | 1 2 2016-01-01 00:00:00 2016-01-29 12:08:57 2016-01-01 00:00:00 2016-01-30 12:05:11 0 8 .00 97.40 -0.13990700244903564 0 0 57.269275665283203 1 99 (empty) Y -73.210006713867188 0 0 41.317001342773437 1 4 -10 998 -0.5 2.0 -0.5 0.5 0 998.14 -10.5 9.75 -0.3 0.3 -10.8 998.3 19 | 20 | 21 | # writer round-trip 22 | statement ok 23 | COPY yellow_cab TO '__TEST_DIR__/yellowcab.parquet' (FORMAT PARQUET); 24 | 25 | query IIIIIIIIIIIIIIIIIII nosort yellowcab 26 | SELECT * FROM yellow_cab 27 | ---- 28 | 29 | query IIIIIIIIIIIIIIIIIII nosort yellowcab 30 | SELECT * FROM '__TEST_DIR__/yellowcab.parquet' 31 | ---- 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /test/sql/copy/s3/glob_s3_paging.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/glob_s3_paging.test_slow 2 | # description: Test globbing of a large number of parquet files to test the paging mechanism 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | # Require that these environment variables are also set 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 24 | set ignore_error_messages 25 | 26 | statement ok 27 | set http_timeout=120000; 28 | 29 | # More retries (longest wait will be 25600ms) 30 | statement ok 31 | 
set http_retries=6; 32 | 33 | # Test should be a bit faster using the metadata cache 34 | statement ok 35 | SET enable_http_metadata_cache=true; 36 | 37 | foreach urlstyle path vhost 38 | 39 | statement ok 40 | SET s3_url_style='${urlstyle}' 41 | 42 | ## For both formats we generate 2000 files which we will glob to test the paging mechanism of aws ListObjectV2 call is handled properly 43 | foreach format parquet csv 44 | 45 | foreach i 0 1 46 | 47 | foreach j 0 1 2 3 4 5 6 7 8 9 48 | 49 | foreach k 0 1 2 3 4 5 6 7 8 9 50 | 51 | foreach l 0 1 2 3 4 5 6 7 8 9 52 | 53 | statement ok 54 | COPY (select (${i}${j}${k}${l})::INT as column0) to 's3://test-bucket/parquet_glob_s3_paging/paging/t${i}${j}${k}${l}-${urlstyle}-urls.${format}'; 55 | 56 | endloop 57 | 58 | endloop 59 | 60 | endloop 61 | 62 | endloop 63 | 64 | # Begin tests 65 | query I 66 | select sum(column0) from 's3://test-bucket/parquet_glob_s3_paging/paging/t*-${urlstyle}-urls.${format}' 67 | ---- 68 | 1999000 69 | 70 | endloop 71 | 72 | endloop 73 | 74 | # test with parquet_metadata_cache = true 75 | statement ok 76 | SET parquet_metadata_cache=true; 77 | 78 | foreach urlstyle path vhost 79 | 80 | foreach format parquet 81 | 82 | loop i 0 2 83 | 84 | # Begin tests 85 | query I 86 | select sum(column0) from 's3://test-bucket/parquet_glob_s3_paging/paging/t*-${urlstyle}-urls.${format}' 87 | ---- 88 | 1999000 89 | 90 | endloop 91 | 92 | endloop 93 | 94 | endloop 95 | -------------------------------------------------------------------------------- /src/include/crypto.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb/common/encryption_state.hpp" 4 | #include "duckdb/common/helper.hpp" 5 | 6 | #include 7 | #include 8 | 9 | typedef struct evp_cipher_ctx_st EVP_CIPHER_CTX; 10 | typedef struct evp_cipher_st EVP_CIPHER; 11 | 12 | namespace duckdb { 13 | 14 | typedef unsigned char hash_bytes[32]; 15 | typedef unsigned char hash_str[64]; 16 | 17 | void sha256(const char *in, size_t in_len, hash_bytes &out); 18 | 19 | void hmac256(const std::string &message, const char *secret, size_t secret_len, hash_bytes &out); 20 | 21 | void hmac256(std::string message, hash_bytes secret, hash_bytes &out); 22 | 23 | void hex256(hash_bytes &in, hash_str &out); 24 | 25 | class DUCKDB_EXTENSION_API AESStateSSL : public EncryptionState { 26 | 27 | public: 28 | explicit AESStateSSL(EncryptionTypes::CipherType cipher_p, idx_t key_len_p); 29 | ~AESStateSSL() override; 30 | 31 | public: 32 | void InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, 33 | const_data_ptr_t aad, idx_t aad_len) override; 34 | void InitializeDecryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len, 35 | const_data_ptr_t aad, idx_t aad_len) override; 36 | size_t Process(const_data_ptr_t in, idx_t in_len, data_ptr_t out, idx_t out_len) override; 37 | size_t Finalize(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len) override; 38 | void GenerateRandomData(data_ptr_t data, idx_t len) override; 39 | 40 | const EVP_CIPHER *GetCipher(idx_t key_len); 41 | size_t FinalizeGCM(data_ptr_t out, idx_t out_len, data_ptr_t tag, idx_t tag_len); 42 | 43 | private: 44 | EVP_CIPHER_CTX *context; 45 | EncryptionTypes::Mode mode; 46 | }; 47 | 48 | } // namespace duckdb 49 | 50 | extern "C" { 51 | 52 | class DUCKDB_EXTENSION_API AESStateSSLFactory : public duckdb::EncryptionUtil { 53 | public: 54 | explicit AESStateSSLFactory() { 55 | } 56 | 57 | 
duckdb::shared_ptr CreateEncryptionState(duckdb::EncryptionTypes::CipherType cipher_p, 58 | duckdb::idx_t key_len_p) const override { 59 | return duckdb::make_shared_ptr(cipher_p, key_len_p); 60 | } 61 | 62 | ~AESStateSSLFactory() override { 63 | } 64 | }; 65 | } 66 | -------------------------------------------------------------------------------- /src/include/http_metadata_cache.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb/common/atomic.hpp" 4 | #include "duckdb/common/chrono.hpp" 5 | #include "duckdb/common/list.hpp" 6 | #include "duckdb/common/mutex.hpp" 7 | #include "duckdb/common/string.hpp" 8 | #include "duckdb/common/types.hpp" 9 | #include "duckdb/common/unordered_map.hpp" 10 | #include "duckdb/main/client_context.hpp" 11 | #include "duckdb/main/client_context_state.hpp" 12 | 13 | #include 14 | #include 15 | 16 | namespace duckdb { 17 | 18 | struct HTTPMetadataCacheEntry { 19 | idx_t length; 20 | timestamp_t last_modified; 21 | string etag; 22 | }; 23 | 24 | // Simple cache with a max age for an entry to be valid 25 | class HTTPMetadataCache : public ClientContextState { 26 | public: 27 | explicit HTTPMetadataCache(bool flush_on_query_end_p, bool shared_p) 28 | : flush_on_query_end(flush_on_query_end_p), shared(shared_p) {}; 29 | 30 | void Insert(const string &path, HTTPMetadataCacheEntry val) { 31 | if (shared) { 32 | lock_guard parallel_lock(lock); 33 | map[path] = val; 34 | } else { 35 | map[path] = val; 36 | } 37 | }; 38 | 39 | void Erase(string path) { 40 | if (shared) { 41 | lock_guard parallel_lock(lock); 42 | map.erase(path); 43 | } else { 44 | map.erase(path); 45 | } 46 | }; 47 | 48 | bool Find(string path, HTTPMetadataCacheEntry &ret_val) { 49 | if (shared) { 50 | lock_guard parallel_lock(lock); 51 | auto lookup = map.find(path); 52 | if (lookup != map.end()) { 53 | ret_val = lookup->second; 54 | return true; 55 | } else { 56 | return false; 57 | } 58 | } else { 59 | auto lookup = map.find(path); 60 | if (lookup != map.end()) { 61 | ret_val = lookup->second; 62 | return true; 63 | } else { 64 | return false; 65 | } 66 | } 67 | }; 68 | 69 | void Clear() { 70 | if (shared) { 71 | lock_guard parallel_lock(lock); 72 | map.clear(); 73 | } else { 74 | map.clear(); 75 | } 76 | } 77 | 78 | //! 
Called by the ClientContext when the current query ends 79 | void QueryEnd(ClientContext &context) override { 80 | if (flush_on_query_end) { 81 | Clear(); 82 | } 83 | } 84 | 85 | protected: 86 | mutex lock; 87 | unordered_map map; 88 | bool flush_on_query_end; 89 | bool shared; 90 | }; 91 | 92 | } // namespace duckdb 93 | -------------------------------------------------------------------------------- /test/sql/logging/file_system_logging.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/logging/file_system_logging.test 2 | # group: [logging] 3 | 4 | require parquet 5 | 6 | require noforcestorage 7 | 8 | statement ok 9 | set enable_logging = true; 10 | 11 | statement ok 12 | set logging_level='trace'; 13 | 14 | statement ok 15 | COPY (SELECT 1 as a) TO '__TEST_DIR__/test.csv' 16 | 17 | statement ok 18 | FROM '__TEST_DIR__/test.csv' 19 | 20 | statement ok 21 | pragma threads=1 22 | 23 | # Note: regex for test stability 24 | query IIII 25 | SELECT scope, type, log_level, regexp_replace(message, '\"path\":.*test.csv"', '"test.csv"') 26 | FROM duckdb_logs 27 | WHERE type = 'FileSystem' 28 | ORDER BY timestamp 29 | ---- 30 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"OPEN"} 31 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"WRITE","bytes":"4","pos":"0"} 32 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"CLOSE"} 33 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"OPEN"} 34 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"READ","bytes":"4","pos":"0"} 35 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"READ","bytes":"0","pos":"4"} 36 | CONNECTION FileSystem TRACE {"fs":"LocalFileSystem","test.csv","op":"CLOSE"} 37 | 38 | statement ok 39 | CALL truncate_duckdb_logs(); 40 | 41 | require httpfs 42 | 43 | statement ok 44 | FROM 'https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv' 45 | 46 | # FIXME: investigate why we call READ twice? 47 | query IIII 48 | SELECT scope, type, log_level, regexp_replace(message, '\"path\":.*test.csv"', '"test.csv"') 49 | FROM duckdb_logs 50 | WHERE type = 'FileSystem' AND message NOT LIKE '%duckdb_extension%' 51 | ORDER BY timestamp 52 | ---- 53 | CONNECTION FileSystem TRACE {"fs":"HTTPFileSystem","path":"https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv","op":"OPEN"} 54 | CONNECTION FileSystem TRACE {"fs":"HTTPFileSystem","path":"https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv","op":"READ","bytes":"1276","pos":"0"} 55 | CONNECTION FileSystem TRACE {"fs":"HTTPFileSystem","path":"https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv","op":"READ","bytes":"0","pos":"1276"} 56 | CONNECTION FileSystem TRACE {"fs":"HTTPFileSystem","path":"https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv","op":"CLOSE"} 57 | -------------------------------------------------------------------------------- /test/sql/copy/s3/upload_large_json_file.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/upload_large_json_file.test_slow 2 | # description: Copy large json files from and to S3. 
3 | # group: [s3] 4 | 5 | require tpch 6 | 7 | require json 8 | 9 | require parquet 10 | 11 | require httpfs 12 | 13 | require-env S3_TEST_SERVER_AVAILABLE 1 14 | 15 | # Require that these environment variables are also set 16 | 17 | require-env AWS_DEFAULT_REGION 18 | 19 | require-env AWS_ACCESS_KEY_ID 20 | 21 | require-env AWS_SECRET_ACCESS_KEY 22 | 23 | require-env DUCKDB_S3_ENDPOINT 24 | 25 | require-env DUCKDB_S3_USE_SSL 26 | 27 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 28 | set ignore_error_messages 29 | 30 | statement ok 31 | set http_timeout=120000; 32 | 33 | # More retries (longest wait will be 25600ms) 34 | statement ok 35 | set http_retries=6; 36 | 37 | statement ok 38 | CALL DBGEN(sf=0.1) 39 | 40 | query I 41 | SELECT 42 | sum(l_extendedprice * l_discount) AS revenue 43 | FROM 44 | lineitem 45 | WHERE 46 | l_shipdate >= CAST('1994-01-01' AS date) 47 | AND l_shipdate < CAST('1995-01-01' AS date) 48 | AND l_discount BETWEEN 0.05 49 | AND 0.07 50 | AND l_quantity < 24; 51 | ---- 52 | 11803420.2534 53 | 54 | statement ok 55 | COPY lineitem TO 's3://test-bucket/multipart/export_large.json' (FORMAT 'json'); 56 | 57 | query I 58 | SELECT 59 | sum(l_extendedprice * l_discount) AS revenue 60 | FROM 61 | "s3://test-bucket/multipart/export_large.json" 62 | WHERE 63 | l_shipdate >= CAST('1994-01-01' AS date) 64 | AND l_shipdate < CAST('1995-01-01' AS date) 65 | AND l_discount BETWEEN 0.05 66 | AND 0.07 67 | AND l_quantity < 24; 68 | ---- 69 | 11803420.2534 70 | 71 | # This query triggers an edge case where we apply an S3-specific optimization using multiple cached filehandles 72 | query I 73 | SELECT 74 | sum(l_extendedprice * l_discount)/3 AS revenue 75 | FROM 76 | read_json_auto([ 77 | 's3://test-bucket/multipart/export_large.json', 78 | 's3://test-bucket/multipart/export_large.json', 79 | 's3://test-bucket/multipart/export_large.json',]) 80 | WHERE 81 | l_shipdate >= CAST('1994-01-01' AS date) 82 | AND l_shipdate < CAST('1995-01-01' AS date) 83 | AND l_discount BETWEEN 0.05 84 | AND 0.07 85 | AND l_quantity < 24; 86 | ---- 87 | 11803420.2534 -------------------------------------------------------------------------------- /test/sql/secret/gcs_oauth.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secret/gcs_oauth.test 2 | # description: Test GCS OAuth2 bearer token support 3 | # group: [secret] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | # Test creating a GCS secret with OAuth2 bearer token 11 | statement ok 12 | CREATE SECRET gcs_oauth_test ( 13 | TYPE GCS, 14 | bearer_token 'test_oauth2_token_12345' 15 | ); 16 | 17 | # Verify the secret was created 18 | query I 19 | SELECT COUNT(*) FROM duckdb_secrets() WHERE name = 'gcs_oauth_test' AND type = 'gcs'; 20 | ---- 21 | 1 22 | 23 | # Verify bearer token is redacted 24 | query I 25 | SELECT COUNT(*) FROM duckdb_secrets() WHERE name = 'gcs_oauth_test' AND secret_string LIKE '%bearer_token=redacted%'; 26 | ---- 27 | 1 28 | 29 | # Test creating a GCS secret with HMAC keys (backward compatibility) 30 | statement ok 31 | CREATE SECRET gcs_hmac_test ( 32 | TYPE GCS, 33 | key_id 'test_key_id', 34 | secret 'test_secret' 35 | ); 36 | 37 | # Verify both secrets exist 38 | query II 39 | SELECT name, type FROM duckdb_secrets() WHERE name IN ('gcs_oauth_test', 'gcs_hmac_test') ORDER BY name; 40 | ---- 41 | gcs_hmac_test gcs 42 | gcs_oauth_test gcs 43 | 44 | # Test creating a GCS 
secret with both bearer token and HMAC (bearer token should take precedence) 45 | statement ok 46 | CREATE SECRET gcs_mixed_test ( 47 | TYPE GCS, 48 | bearer_token 'oauth_token', 49 | key_id 'hmac_key', 50 | secret 'hmac_secret' 51 | ); 52 | 53 | # Verify all three secrets exist 54 | query I 55 | SELECT COUNT(*) FROM duckdb_secrets() WHERE name LIKE 'gcs_%test'; 56 | ---- 57 | 3 58 | 59 | # Clean up 60 | statement ok 61 | DROP SECRET gcs_oauth_test; 62 | 63 | statement ok 64 | DROP SECRET gcs_hmac_test; 65 | 66 | statement ok 67 | DROP SECRET gcs_mixed_test; 68 | 69 | # Verify all secrets are removed 70 | query I 71 | SELECT COUNT(*) FROM duckdb_secrets() WHERE name LIKE 'gcs_%test'; 72 | ---- 73 | 0 74 | 75 | # Test that bearer_token parameter is not allowed for S3 secrets 76 | statement error Unknown named parameter 77 | CREATE SECRET s3_with_bearer ( 78 | TYPE S3, 79 | bearer_token 'should_not_work' 80 | ); 81 | ---- 82 | 83 | # Test that bearer_token parameter is not allowed for R2 secrets 84 | statement error Unknown named parameter 85 | CREATE SECRET r2_with_bearer ( 86 | TYPE R2, 87 | bearer_token 'should_not_work' 88 | ); 89 | ---- -------------------------------------------------------------------------------- /test/sql/copy/s3/parquet_s3_tpcds.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/parquet_s3_tpcds.test_slow 2 | # description: Test all tpcds queries on tpch sf0.01 over s3 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require tpcds 10 | 11 | require-env S3_TEST_SERVER_AVAILABLE 1 12 | 13 | # Require that these environment variables are also set 14 | 15 | require-env AWS_DEFAULT_REGION 16 | 17 | require-env AWS_ACCESS_KEY_ID 18 | 19 | require-env AWS_SECRET_ACCESS_KEY 20 | 21 | require-env DUCKDB_S3_ENDPOINT 22 | 23 | require-env DUCKDB_S3_USE_SSL 24 | 25 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 26 | set ignore_error_messages 27 | 28 | # answers are generated from postgres 29 | # hence check with NULLS LAST flag 30 | statement ok 31 | PRAGMA default_null_order='NULLS LAST' 32 | 33 | statement ok 34 | SET enable_http_metadata_cache=true; 35 | 36 | statement ok 37 | set http_timeout=120000; 38 | 39 | # More retries (longest wait will be 25600ms) 40 | statement ok 41 | set http_retries=6; 42 | 43 | statement ok 44 | CREATE SCHEMA tpcds; 45 | 46 | statement ok 47 | CALL dsdgen(sf=0.01, schema='tpcds'); 48 | 49 | foreach tbl call_center catalog_page catalog_returns catalog_sales customer customer_demographics customer_address date_dim household_demographics inventory income_band item promotion reason ship_mode store store_returns store_sales time_dim warehouse web_page web_returns web_sales web_site 50 | 51 | statement ok 52 | COPY tpcds.${tbl} TO 's3://test-bucket/tpcds-sf0_01/${tbl}.parquet' (FORMAT 'PARQUET', COMPRESSION 'ZSTD'); 53 | 54 | statement ok 55 | CREATE VIEW ${tbl} AS SELECT * FROM parquet_scan('s3://test-bucket/tpcds-sf0_01/${tbl}.parquet'); 56 | 57 | endloop 58 | 59 | # too slow queries: 60 | # 64, 85 61 | 62 | loop i 1 9 63 | 64 | query I 65 | PRAGMA tpcds(${i}) 66 | ---- 67 | :extension/tpcds/dsdgen/answers/sf0.01/0${i}.csv 68 | 69 | endloop 70 | 71 | loop i 10 64 72 | 73 | query I 74 | PRAGMA tpcds(${i}) 75 | ---- 76 | :extension/tpcds/dsdgen/answers/sf0.01/${i}.csv 77 | 78 | endloop 79 | 80 | loop i 65 85 81 | 82 | query I 83 | PRAGMA tpcds(${i}) 84 | ---- 85 | 
:extension/tpcds/dsdgen/answers/sf0.01/${i}.csv 86 | 87 | endloop 88 | 89 | loop i 86 99 90 | 91 | query I 92 | PRAGMA tpcds(${i}) 93 | ---- 94 | :extension/tpcds/dsdgen/answers/sf0.01/${i}.csv 95 | 96 | endloop 97 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret.test_slow 2 | # description: Test secret creation using the default s3 secret provider 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | # Ensure any currently stored secrets don't interfere with the test 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | statement ok 15 | reset s3_use_ssl 16 | 17 | # Create an S3 secret using the default provider (for s3, this will be the "config" provider, requiring the user to set all) 18 | statement ok 19 | CREATE SECRET default_provider_secret ( 20 | TYPE S3, 21 | KEY_ID 'my_key', 22 | SECRET 'my_secret', 23 | REGION 'my_region', 24 | ENDPOINT 'invalid-on-purpose' 25 | ) 26 | 27 | # The secret will be created for the default scope for this type 28 | query III 29 | SELECT name, type, scope FROM duckdb_secrets() WHERE name='default_provider_secret'; 30 | ---- 31 | default_provider_secret s3 ['s3://', 's3n://', 's3a://'] 32 | 33 | # Note the endpoint is now using the one in the default_provider_secret 34 | statement error 35 | FROM 's3://test-bucket/test.csv' 36 | ---- 37 | HTTP HEAD to 'https://test-bucket.invalid-on-purpose/test.csv' 38 | 39 | # Now create an S3 secret using the default (config) provider by explicitly passing it 40 | statement ok 41 | CREATE SECRET secret_scope_1 ( 42 | TYPE S3, 43 | PROVIDER config, 44 | SCOPE 's3://b1', 45 | ENDPOINT 'invalid-on-purpose-2' 46 | ) 47 | 48 | query III 49 | SELECT name, type, scope FROM duckdb_secrets() WHERE name='secret_scope_1'; 50 | ---- 51 | secret_scope_1 s3 ['s3://b1'] 52 | 53 | # Longest match of credential scope takes the win, so this will grab the secret_scope_1 secret 54 | statement error 55 | FROM 's3://b1/test.csv' 56 | ---- 57 | Could not establish connection error for HTTP HEAD to 'https://b1.invalid-on-purpose-2/test.csv' 58 | 59 | # Now confirm we can also set multiple scopes 60 | statement ok 61 | CREATE SECRET secret_scope_2 ( 62 | TYPE S3, 63 | PROVIDER config, 64 | SCOPE ['s3://b2', 's3://b3'], 65 | ENDPOINT 'invalid-on-purpose-3' 66 | ) 67 | 68 | query III 69 | SELECT name, type, scope FROM duckdb_secrets() WHERE name='secret_scope_2'; 70 | ---- 71 | secret_scope_2 s3 ['s3://b2', 's3://b3'] 72 | 73 | statement error 74 | FROM 's3://b2/test.csv' 75 | ---- 76 | Could not establish connection error for HTTP HEAD to 'https://b2.invalid-on-purpose-3/test.csv' 77 | -------------------------------------------------------------------------------- /test/sql/copy/csv/test_csv_httpfs_prepared.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_csv_httpfs_prepared.test 2 | # description: CSV Reading From HTTPFS in Prepared Statements 3 | # group: [csv] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | statement ok 11 | PREPARE boaz_bug AS from read_csv_auto('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv') order by 1 12 | 13 | query ITIIIIITTTTIIITTTI 14 | EXECUTE boaz_bug 15 | ---- 16 | 1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208
Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 17 | 2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 18 | 3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 19 | 4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 20 | 5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 21 | 6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 22 | 7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 23 | 8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 24 | 9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 25 | 10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 26 | 27 | statement ok 28 | DEALLOCATE boaz_bug 29 | 30 | statement error 31 | EXECUTE boaz_bug 32 | ---- 33 | Prepared statement "boaz_bug" does not exist 34 | 35 | # Recreate prepared statement with different file 36 | 37 | #FIXME: FILE changed? 38 | mode skip 39 | 40 | statement ok 41 | PREPARE boaz_bug AS SELECT * from read_csv_auto('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c') order by all limit 5; 42 | 43 | query ITTRRR 44 | EXECUTE boaz_bug 45 | ---- 46 | 2020 Allemagne Germany 26.1 53196.069 200601.2 47 | 2020 Autriche Austria 18.0 4723.5 26215.8 48 | 2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 49 | 2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 50 | 2020 Chypre Cyprus 0.0 0.0 1627.6 51 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | 2 | In order to test these locally, `minio` is used. This requires Docker to be installed. 3 | 4 | ### Installing Docker on MacOS 5 | 6 | Install `docker` using `homebrew`. 7 | 8 | 9 | ```bash 10 | brew install docker --cask 11 | ``` 12 | 13 | Then open `/Applications/Docker`. Note that the first time you open the application you need to go to the `Applications` folder, right-click `Docker` and select `open`. 14 | 15 | ### Setting Up Docker 16 | 17 | In order to finish setting up Docker, you need to open the Docker application, and login to your Docker account. Create a Docker account if you do not have one and finish setting up. 18 | 19 | ### Running Minio 20 | 21 | Run the `install_s3_test_server` script. This requires root. This makes a few changes to your system, specifically to `/etc/hosts` to set up a few redirect interfaces to localhost. This only needs to be run once. 22 | 23 | ```bash 24 | sudo ./scripts/install_s3_test_server.sh 25 | ``` 26 | 27 | Then, if this has not been done yet, we need to generate some data: 28 | 29 | ``` 30 | ./scripts/generate_presigned_url.sh 31 | ``` 32 | 33 | Then run the test server in the back-ground using Docker. Note that Docker must be opened for this to work. On MacOS you can open the docker gui (`/Applications/Docker`) and leave it open to accomplish this. 
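Before starting the test server it can help to confirm that the Docker daemon is actually reachable. This is just a generic sanity check using the standard Docker CLI, not a script that ships with this repository:

```bash
# Exits non-zero when the Docker daemon is not reachable
if docker info > /dev/null 2>&1; then
    echo "Docker daemon is running"
else
    echo "Docker daemon is not reachable - open/start Docker first"
fi
```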
34 | 35 | 36 | ```bash 37 | source ./scripts/run_s3_test_server.sh 38 | ``` 39 | 40 | Now set up the following environment variables to enable running of the tests. 41 | 42 | This can be done either manually: 43 | ```bash 44 | export S3_TEST_SERVER_AVAILABLE=1 45 | export AWS_DEFAULT_REGION=eu-west-1 46 | export AWS_ACCESS_KEY_ID=minio_duckdb_user 47 | export AWS_SECRET_ACCESS_KEY=minio_duckdb_user_password 48 | export DUCKDB_S3_ENDPOINT=duckdb-minio.com:9000 49 | export DUCKDB_S3_USE_SSL=false 50 | ``` 51 | 52 | Or using the `set_s3_test_server_variables.sh` script 53 | 54 | ```bash 55 | # use source so it sets the environment variables in your current environment 56 | source scripts/set_s3_test_server_variables.sh 57 | ``` 58 | 59 | Now you should be able to run the S3 tests using minio, e.g.: 60 | 61 | ```bash 62 | build/debug/test/unittest test/sql/copy/s3/s3_hive_partition.test 63 | ``` 64 | 65 | > minio uses port 9000. Clickhouse also uses port 9000. If the tests are not working and you have a running Clickhouse service - try killing it first, e.g. using `killall -9 clickhouse` 66 | 67 | #### Test Data 68 | 69 | The configuration for minio is stored in `scripts/minio_s3.yml`. Data is stored in `/tmp/minio_test_data`. -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_name_conflicts.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_name_conflicts.test 2 | # description: Test name conflict behaviour for secrets 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | load __TEST_DIR__/persistent_secrets.db 9 | 10 | require httpfs 11 | 12 | statement ok 13 | set secret_directory='__TEST_DIR__/create_secret_name_conflicts' 14 | 15 | statement ok 16 | CREATE TEMPORARY SECRET s1 ( TYPE S3 ) 17 | 18 | statement error 19 | CREATE TEMPORARY SECRET s1 ( TYPE S3 ) 20 | ---- 21 | Invalid Input Error: Temporary secret with name 's1' already exists! 22 | 23 | statement ok 24 | CREATE PERSISTENT SECRET s1 ( TYPE S3 ) 25 | 26 | statement error 27 | CREATE PERSISTENT SECRET s1 ( TYPE S3 ) 28 | ---- 29 | Persistent secret with name 's1' already exists in secret storage 'local_file'! 
30 | 31 | statement error 32 | DROP SECRET s1; 33 | ---- 34 | Invalid Input Error: Ambiguity found for secret name 's1', secret occurs in multiple storages 35 | 36 | statement error 37 | DROP SECRET s1 FROM bogus; 38 | ---- 39 | Invalid Input Error: Unknown storage type found for drop secret: 'bogus' 40 | 41 | statement ok 42 | DROP TEMPORARY SECRET s1; 43 | 44 | # Re-dropping the temp s1 is now erroneous 45 | statement error 46 | DROP TEMPORARY SECRET s1; 47 | ---- 48 | Invalid Input Error: Failed to remove non-existent secret with name 's1' 49 | 50 | query II 51 | SELECT name, storage FROM duckdb_secrets() 52 | ---- 53 | s1 local_file 54 | 55 | # Now we will do it again but while the permanent secret is still lazily loaded 56 | restart 57 | 58 | statement ok 59 | set secret_directory='__TEST_DIR__/create_secret_name_conflicts' 60 | 61 | statement ok 62 | CREATE TEMPORARY SECRET s1 ( TYPE S3 ) 63 | 64 | # Now the drop should be ambiguous again: but the persistent secret will be lazily loaded now 65 | statement error 66 | DROP SECRET s1; 67 | ---- 68 | Invalid Input Error: Ambiguity found for secret name 's1', secret occurs in multiple storages 69 | 70 | # Fully specified drop statement this time 71 | statement ok 72 | DROP PERSISTENT SECRET s1 FROM LOCAL_FILE; 73 | 74 | # Now a semi-weird case: this will create if not exists only within its own storage: therefore this does actually create 75 | # the secret 76 | statement ok 77 | CREATE PERSISTENT SECRET IF NOT EXISTS s1 ( TYPE S3 ) 78 | 79 | query II 80 | SELECT name, storage FROM duckdb_secrets() ORDER BY storage 81 | ---- 82 | s1 local_file 83 | s1 memory 84 | 85 | statement ok 86 | DROP PERSISTENT SECRET s1; 87 | 88 | statement ok 89 | DROP SECRET s1; -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_binding.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_binding.test 2 | # description: Test secret binding & types 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | # Ensure any currently stored secrets don't interfere with the test 11 | statement ok 12 | set allow_persistent_secrets=false; 13 | 14 | # Binder autocasts options, also both with single quotes and without is allowed 15 | statement ok 16 | CREATE SECRET s1 ( 17 | TYPE R2, 18 | PROVIDER config, 19 | SCOPE ['s3://my_r2_scope', 's3://my_r2_scope2'], 20 | ACCOUNT_ID 'some_bogus_account', 21 | KEY_ID '123', 22 | USE_SSL 1, 23 | URL_COMPATIBILITY_MODE false 24 | ) 25 | 26 | query I nosort s1 27 | FROM duckdb_secrets(); 28 | ---- 29 | 30 | statement ok 31 | DROP SECRET s1 32 | 33 | # Create the secret again but in a different way to demonstrate casting and case insensitivity of param names 34 | statement ok 35 | CREATE SECRET s1 ( 36 | TYPE R2, 37 | PROVIDER config, 38 | SCOPE ['s3://my_r2_scope', 's3://my_r2_scope2'], 39 | account_id 'some_bogus_account', 40 | key_id 123, 41 | USE_SSL 'true', 42 | URL_COMPATIBILITY_MODE '0' 43 | ) 44 | 45 | query I nosort s1 46 | FROM duckdb_secrets(); 47 | ---- 48 | 49 | ### Now let's try some incorrect inputs 50 | 51 | # Incorrect type 52 | statement error 53 | CREATE SECRET incorrect_type ( 54 | TYPE R2, 55 | PROVIDER config, 56 | USE_SSL 'fliepflap' 57 | ) 58 | ---- 59 | Binder Error: Failed to cast option 'use_ssl' to type 'BOOLEAN': 'Could not convert string 'fliepflap' to BOOL' 60 | 61 | # Incorrect param altogether 62 | statement error 63 | 
CREATE SECRET incorrect_type ( 64 | TYPE R2, 65 | PROVIDER config, 66 | FLIEPFLAP true 67 | ) 68 | ---- 69 | Binder Error: Unknown parameter 'fliepflap' for secret type 'r2' with provider 'config' 70 | 71 | # Incorrect param for this type, but correct for other 72 | statement error 73 | CREATE SECRET incorrect_type ( 74 | TYPE S3, 75 | PROVIDER config, 76 | ACCOUNT_ID 'my_acount' 77 | ) 78 | ---- 79 | Binder Error: Unknown parameter 'account_id' for secret type 's3' with provider 'config' 80 | 81 | # Params can only occur once 82 | statement error 83 | CREATE SECRET duplicate_param ( 84 | TYPE R2, 85 | PROVIDER config, 86 | account_id 'some_bogus_account', 87 | key_id 123, 88 | KEY_ID 12098, 89 | account_id blablabla 90 | ) 91 | ---- 92 | Binder Error: Duplicate query param found while parsing create secret: 'key_id' 93 | -------------------------------------------------------------------------------- /test/sql/copy/csv/test_csv_remote.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/csv/test_csv_remote.test 2 | # description: Test reading csv files over http 3 | # group: [csv] 4 | 5 | require httpfs 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | # Test load from url with query string 11 | query IIIIIIIIIIII 12 | FROM sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv?v=1') 13 | ---- 14 | , " (empty) \n (empty) 0 0 [{'name': column00, 'type': BIGINT}, {'name': column01, 'type': VARCHAR}, {'name': column02, 'type': BIGINT}, {'name': column03, 'type': BIGINT}, {'name': column04, 'type': BIGINT}, {'name': column05, 'type': BIGINT}, {'name': column06, 'type': BIGINT}, {'name': column07, 'type': VARCHAR}, {'name': column08, 'type': VARCHAR}, {'name': column09, 'type': VARCHAR}, {'name': column10, 'type': VARCHAR}, {'name': column11, 'type': BIGINT}, {'name': column12, 'type': BIGINT}, {'name': column13, 'type': BIGINT}, {'name': column14, 'type': VARCHAR}, {'name': column15, 'type': VARCHAR}, {'name': column16, 'type': VARCHAR}, {'name': column17, 'type': BIGINT}] NULL NULL NULL FROM read_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv?v=1', auto_detect=false, delim=',', quote='"', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column00': 'BIGINT', 'column01': 'VARCHAR', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'BIGINT', 'column06': 'BIGINT', 'column07': 'VARCHAR', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'VARCHAR', 'column11': 'BIGINT', 'column12': 'BIGINT', 'column13': 'BIGINT', 'column14': 'VARCHAR', 'column15': 'VARCHAR', 'column16': 'VARCHAR', 'column17': 'BIGINT'}); 15 | 16 | 17 | # This test abuses the LOCAL_EXTENSION_REPO env to make sure tests are only run when running extension tests 18 | # in duckdb/duckdb. 
Otherwise you need to pass a data dir when exex 19 | 20 | require-env LOCAL_EXTENSION_REPO 21 | 22 | # regular csv file 23 | query ITTTIITITTIIII nosort webpagecsv 24 | SELECT * FROM read_csv_auto('data/csv/real/web_page.csv') ORDER BY 1; 25 | ---- 26 | 27 | # file with gzip 28 | query IIIIIIIIIIIIIII nosort lineitemcsv 29 | SELECT * FROM read_csv_auto('data/csv/lineitem1k.tbl.gz') ORDER BY ALL; 30 | ---- 31 | 32 | query ITTTIITITTIIII nosort webpagecsv 33 | SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/duckdb/duckdb/main/data/csv/real/web_page.csv') ORDER BY 1; 34 | ---- 35 | 36 | query IIIIIIIIIIIIIII nosort lineitemcsv 37 | select * from read_csv_auto('https://raw.githubusercontent.com/duckdb/duckdb/main/data/csv/lineitem1k.tbl.gz') ORDER BY ALL; 38 | ---- 39 | -------------------------------------------------------------------------------- /src/include/create_secret_functions.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.hpp" 4 | 5 | namespace duckdb { 6 | struct CreateSecretInput; 7 | struct S3AuthParams; 8 | class CreateSecretFunction; 9 | class BaseSecret; 10 | struct SecretEntry; 11 | class ExtensionLoader; 12 | 13 | struct CreateS3SecretFunctions { 14 | public: 15 | //! Register all CreateSecretFunctions 16 | static void Register(ExtensionLoader &loader); 17 | 18 | //! Secret refreshing mechanisms 19 | static CreateSecretInput GenerateRefreshSecretInfo(const SecretEntry &secret_entry, Value &refresh_info); 20 | static bool TryRefreshS3Secret(ClientContext &context, const SecretEntry &secret_to_refresh); 21 | 22 | protected: 23 | //! Internal function to create BaseSecret from S3AuthParams 24 | static unique_ptr CreateSecretFunctionInternal(ClientContext &context, CreateSecretInput &input); 25 | 26 | //! Function for the "settings" provider: creates secret from current duckdb settings 27 | static unique_ptr CreateS3SecretFromSettings(ClientContext &context, CreateSecretInput &input); 28 | //! Function for the "config" provider: creates secret from parameters passed by user 29 | static unique_ptr CreateS3SecretFromConfig(ClientContext &context, CreateSecretInput &input); 30 | 31 | //! Helper function to set named params of secret function 32 | static void SetBaseNamedParams(CreateSecretFunction &function, string &type); 33 | //! Helper function to create secret types s3/r2/gcs 34 | static void RegisterCreateSecretFunction(ExtensionLoader &loader, string type); 35 | }; 36 | 37 | struct CreateBearerTokenFunctions { 38 | public: 39 | static constexpr const char *HUGGINGFACE_TYPE = "huggingface"; 40 | 41 | //! Register all CreateSecretFunctions 42 | static void Register(ExtensionLoader &loader); 43 | 44 | protected: 45 | //! Internal function to create bearer token 46 | static unique_ptr CreateSecretFunctionInternal(ClientContext &context, CreateSecretInput &input, 47 | const string &token); 48 | //! Function for the "config" provider: creates secret from parameters passed by user 49 | static unique_ptr CreateBearerSecretFromConfig(ClientContext &context, CreateSecretInput &input); 50 | //! 
Function for the "config" provider: creates secret from parameters passed by user 51 | static unique_ptr CreateHuggingFaceSecretFromCredentialChain(ClientContext &context, 52 | CreateSecretInput &input); 53 | }; 54 | 55 | } // namespace duckdb 56 | -------------------------------------------------------------------------------- /src/include/hffs.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "httpfs.hpp" 4 | 5 | namespace duckdb { 6 | 7 | struct ParsedHFUrl { 8 | //! Path within the 9 | string path; 10 | //! Name of the repo (i presume) 11 | string repository; 12 | 13 | //! Endpoint, defaults to HF 14 | string endpoint = "https://huggingface.co"; 15 | //! Which revision/branch/tag to use 16 | string revision = "main"; 17 | //! For DuckDB this may be a sensible default? 18 | string repo_type = "datasets"; 19 | }; 20 | 21 | class HuggingFaceFileSystem : public HTTPFileSystem { 22 | public: 23 | ~HuggingFaceFileSystem() override; 24 | 25 | vector Glob(const string &path, FileOpener *opener = nullptr) override; 26 | 27 | duckdb::unique_ptr HeadRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map) override; 28 | duckdb::unique_ptr GetRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map) override; 29 | duckdb::unique_ptr GetRangeRequest(FileHandle &handle, string hf_url, HTTPHeaders header_map, 30 | idx_t file_offset, char *buffer_out, 31 | idx_t buffer_out_len) override; 32 | 33 | bool CanHandleFile(const string &fpath) override { 34 | return fpath.rfind("hf://", 0) == 0; 35 | }; 36 | 37 | string GetName() const override { 38 | return "HuggingFaceFileSystem"; 39 | } 40 | static ParsedHFUrl HFUrlParse(const string &url); 41 | string GetHFUrl(const ParsedHFUrl &url); 42 | string GetTreeUrl(const ParsedHFUrl &url, idx_t limit); 43 | string GetFileUrl(const ParsedHFUrl &url); 44 | 45 | static void SetParams(HTTPFSParams ¶ms, const string &path, optional_ptr opener); 46 | 47 | protected: 48 | duckdb::unique_ptr CreateHandle(const OpenFileInfo &file, FileOpenFlags flags, 49 | optional_ptr opener) override; 50 | 51 | string ListHFRequest(ParsedHFUrl &url, HTTPFSParams &http_params, string &next_page_url, 52 | optional_ptr state); 53 | }; 54 | 55 | class HFFileHandle : public HTTPFileHandle { 56 | friend class HuggingFaceFileSystem; 57 | 58 | public: 59 | HFFileHandle(FileSystem &fs, ParsedHFUrl hf_url, const OpenFileInfo &file, FileOpenFlags flags, 60 | unique_ptr http_params) 61 | : HTTPFileHandle(fs, file, flags, std::move(http_params)), parsed_url(std::move(hf_url)) { 62 | } 63 | ~HFFileHandle() override; 64 | 65 | unique_ptr CreateClient() override; 66 | 67 | protected: 68 | ParsedHFUrl parsed_url; 69 | }; 70 | 71 | } // namespace duckdb 72 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_s3_serialization.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_s3_serialization.test 2 | # description: Test serialization of the S3/GCS/r2 secrets 3 | # group: [secrets] 4 | 5 | require httpfs 6 | 7 | require parquet 8 | 9 | load __TEST_DIR__/test_serialize_secrets.db 10 | 11 | statement ok 12 | PRAGMA enable_verification; 13 | 14 | statement ok 15 | set secret_directory='__TEST_DIR__/create_secret_s3_serialization' 16 | 17 | statement ok 18 | CREATE OR REPLACE PERSISTENT SECRET s1 ( 19 | TYPE S3, 20 | PROVIDER config, 21 | SCOPE 's3://my_s3_scope', 22 | KEY_ID 
'mekey', 23 | SECRET 'mesecret', 24 | REGION 'meregion', 25 | SESSION_TOKEN 'mesesh', 26 | ENDPOINT 'meendpoint', 27 | URL_STYLE 'mahstyle', 28 | USE_SSL true, 29 | URL_COMPATIBILITY_MODE true 30 | ) 31 | 32 | statement ok 33 | CREATE OR REPLACE PERSISTENT SECRET s2 ( 34 | TYPE R2, 35 | PROVIDER config, 36 | SCOPE 's3://my_r2_scope', 37 | ACCOUNT_ID 'some_bogus_account', 38 | KEY_ID 'mekey', 39 | SECRET 'mesecret', 40 | SESSION_TOKEN 'mesesh', 41 | URL_STYLE 'mahstyle', 42 | USE_SSL 1, 43 | URL_COMPATIBILITY_MODE 1 44 | ) 45 | 46 | statement ok 47 | CREATE OR REPLACE PERSISTENT SECRET s3 ( 48 | TYPE GCS, 49 | PROVIDER config, 50 | SCOPE 's3://my_gcs_scope', 51 | KEY_ID 'mekey', 52 | SECRET 'mesecret', 53 | SESSION_TOKEN 'mesesh', 54 | URL_STYLE 'mahstyle', 55 | USE_SSL true, 56 | URL_COMPATIBILITY_MODE true 57 | ) 58 | 59 | query IIII 60 | select name, type, provider, scope FROM duckdb_secrets() order by name; 61 | ---- 62 | s1 s3 config ['s3://my_s3_scope'] 63 | s2 r2 config ['s3://my_r2_scope'] 64 | s3 gcs config ['s3://my_gcs_scope'] 65 | 66 | # Note: this query prints the tokens as an unredacted string 67 | query I nosort secret_to_string 68 | select secret_string from duckdb_secrets(redact=false) order by type; 69 | ---- 70 | 71 | restart 72 | 73 | # Now setting the secret dir somewhere nonexistent will yield no persistent secrets 74 | statement ok 75 | set secret_directory='__TEST_DIR__/does_not_exist1' 76 | 77 | query I 78 | select count(*) FROM duckdb_secrets(redact=false); 79 | ---- 80 | 0 81 | 82 | restart 83 | 84 | # However, setting it to the dir that does exist, we can suddenly see our persisted secrets 85 | statement ok 86 | set secret_directory='__TEST_DIR__/create_secret_s3_serialization' 87 | 88 | # After restart secrets are still there 89 | query IIII 90 | select name, type, provider, scope FROM duckdb_secrets() order by name; 91 | ---- 92 | s1 s3 config ['s3://my_s3_scope'] 93 | s2 r2 config ['s3://my_r2_scope'] 94 | s3 gcs config ['s3://my_gcs_scope'] 95 | 96 | # Note: this query prints the tokens as an unredacted string 97 | query I nosort secret_to_string 98 | select secret_string from duckdb_secrets(redact=false) order by type; 99 | ---- -------------------------------------------------------------------------------- /test/sql/copy/s3/parquet_s3_tpch.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/parquet_s3_tpch.test_slow 2 | # description: Test all tpch queries on tpch sf0.01 over s3 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require tpch 10 | 11 | require-env S3_TEST_SERVER_AVAILABLE 1 12 | 13 | # Require that these environment variables are also set 14 | 15 | require-env AWS_DEFAULT_REGION 16 | 17 | require-env AWS_ACCESS_KEY_ID 18 | 19 | require-env AWS_SECRET_ACCESS_KEY 20 | 21 | require-env DUCKDB_S3_ENDPOINT 22 | 23 | require-env DUCKDB_S3_USE_SSL 24 | 25 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 26 | set ignore_error_messages 27 | 28 | statement ok 29 | SET enable_http_metadata_cache=true; 30 | 31 | statement ok 32 | set http_timeout=120000; 33 | 34 | # More retries (longest wait will be 25600ms) 35 | statement ok 36 | set http_retries=6; 37 | 38 | # Copy files to S3 before beginning tests 39 | statement ok 40 | CALL DBGEN(sf=0.01); 41 | 42 | # copy tpch files to S3 43 | statement ok 44 | COPY lineitem to 's3://test-bucket/tpch-sf0_01/lineitem.parquet'; 45 | COPY nation to
's3://test-bucket/tpch-sf0_01/nation.parquet'; 46 | COPY region to 's3://test-bucket/tpch-sf0_01/region.parquet'; 47 | COPY part to 's3://test-bucket/tpch-sf0_01/part.parquet'; 48 | COPY supplier to 's3://test-bucket/tpch-sf0_01/supplier.parquet'; 49 | COPY partsupp to 's3://test-bucket/tpch-sf0_01/partsupp.parquet'; 50 | COPY customer to 's3://test-bucket/tpch-sf0_01/customer.parquet'; 51 | COPY orders to 's3://test-bucket/tpch-sf0_01/orders.parquet'; 52 | 53 | # clears tables 54 | statement ok 55 | DROP TABLE lineitem; 56 | DROP TABLE nation; 57 | DROP TABLE region; 58 | DROP TABLE part; 59 | DROP TABLE supplier; 60 | DROP TABLE partsupp; 61 | DROP TABLE customer; 62 | DROP TABLE orders; 63 | 64 | statement ok 65 | CREATE VIEW lineitem as SELECT * FROM 's3://test-bucket/tpch-sf0_01/lineitem.parquet'; 66 | CREATE VIEW nation as SELECT * FROM 's3://test-bucket/tpch-sf0_01/nation.parquet'; 67 | CREATE VIEW region as SELECT * FROM 's3://test-bucket/tpch-sf0_01/region.parquet'; 68 | CREATE VIEW part as SELECT * FROM 's3://test-bucket/tpch-sf0_01/part.parquet'; 69 | CREATE VIEW supplier as SELECT * FROM 's3://test-bucket/tpch-sf0_01/supplier.parquet'; 70 | CREATE VIEW partsupp as SELECT * FROM 's3://test-bucket/tpch-sf0_01/partsupp.parquet'; 71 | CREATE VIEW customer as SELECT * FROM 's3://test-bucket/tpch-sf0_01/customer.parquet'; 72 | CREATE VIEW orders as SELECT * FROM 's3://test-bucket/tpch-sf0_01/orders.parquet'; 73 | 74 | 75 | # Run TPCH SF0.01 76 | loop i 1 9 77 | 78 | query I 79 | PRAGMA tpch(${i}) 80 | ---- 81 | :duckdb/extension/tpch/dbgen/answers/sf0.01/q0${i}.csv 82 | 83 | endloop 84 | 85 | loop i 10 23 86 | 87 | query I 88 | PRAGMA tpch(${i}) 89 | ---- 90 | :duckdb/extension/tpch/dbgen/answers/sf0.01/q${i}.csv 91 | 92 | endloop 93 | -------------------------------------------------------------------------------- /scripts/run_squid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | help() { 4 | echo "Usage: ${0} [port] [auth]" 5 | echo " port Port number for squid to listen on (by default 3128)" 6 | echo " auth Optional string ('auth') to force user basic authentication (otherwise no authentication is required)" 7 | exit 0 8 | } 9 | 10 | port='3128' 11 | auth='false' 12 | log_dir="squid_logs" 13 | conf_file="squid.conf" 14 | pid_file='${service_name}.pid' 15 | 16 | while [[ $# -gt 0 ]]; do 17 | case "${1}" in 18 | -h|--help) 19 | help 20 | ;; 21 | -p|--port) 22 | port="${2}" 23 | shift # past argument 24 | shift # past value 25 | ;; 26 | --auth) 27 | auth='true' 28 | conf_file="squid_auth.conf" 29 | pid_file='${service_name}_auth.pid' 30 | shift # past argument 31 | ;; 32 | --log_dir) 33 | log_dir="${2}" 34 | shift # past argument 35 | shift # past value 36 | ;; 37 | *) 38 | echo "Unknown option ${1}" 39 | exit 1 40 | ;; 41 | esac 42 | done 43 | 44 | mkdir "${log_dir}" 45 | touch "${log_dir}/daemon.log" 46 | chmod -R 777 "${log_dir}" 47 | 48 | echo "http_port 127.0.0.1:${port}" >"${conf_file}" 49 | echo "pid_filename ${pid_file}" >>"${conf_file}" 50 | 51 | echo 'logfile_rotate 0' >>"${conf_file}" 52 | echo "logfile_daemon ${log_dir}/daemon.log" >>"${conf_file}" 53 | echo "access_log ${log_dir}/access.log" >>"${conf_file}" 54 | echo "cache_log ${log_dir}/cache.log" >>"${conf_file}" 55 | echo "cache_store_log ${log_dir}/cache_store.log" >>"${conf_file}" 56 | 57 | 58 | if [[ "${auth}" == "true" ]]; then 59 | # User 'john' with password 'doe' 60 | echo 'john:$apr1$dalj9e7s$AhqY28Hvl3EcNblNJMiXa0' >squid_users 61 | 62 |
squid_version="$(squid -v | head -n1 | grep -o 'Version [^ ]*' | cut -d ' ' -f 2)" 63 | if [[ "$(uname)" == "Darwin" ]]; then 64 | auth_basic_program="/opt/homebrew/Cellar/squid/${squid_version}/libexec/basic_ncsa_auth" 65 | else 66 | if [[ -e '/usr/lib64/squid/basic_ncsa_auth' ]]; then 67 | auth_basic_program="/usr/lib64/squid/basic_ncsa_auth" 68 | else 69 | auth_basic_program="/usr/lib/squid/basic_ncsa_auth" 70 | fi 71 | fi 72 | 73 | echo '# Add authentification options' >>"${conf_file}" 74 | echo "auth_param basic program ${auth_basic_program} squid_users" >>"${conf_file}" 75 | echo 'auth_param basic children 3' >>"${conf_file}" 76 | echo 'auth_param basic realm Squid BA' >>"${conf_file}" 77 | echo 'acl auth_users proxy_auth REQUIRED' >>"${conf_file}" 78 | echo 'http_access allow auth_users' >>"${conf_file}" 79 | echo 'http_access deny all' >>"${conf_file}" 80 | else 81 | echo 'http_access allow localhost' >>"${conf_file}" 82 | fi 83 | 84 | exec squid -N -f "${conf_file}" 85 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/test_parquet_remote.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/test_parquet_remote.test 2 | # description: Parquet read from S3/HTTPS 3 | # group: [parquet] 4 | 5 | require httpfs 6 | 7 | require parquet 8 | 9 | # non existent host 10 | statement error 11 | SELECT * FROM PARQUET_SCAN('https://this-host-does-not-exist-for-sure/test.parquet'); 12 | ---- 13 | 14 | # non existent file 15 | statement error 16 | SELECT * FROM PARQUET_SCAN('https://duckdb.org/test.parquet'); 17 | ---- 18 | 19 | # missing path 20 | statement error 21 | SELECT * FROM PARQUET_SCAN('https://duckdb.org'); 22 | ---- 23 | 24 | # empty path 25 | statement error 26 | SELECT * FROM PARQUET_SCAN('https://duckdb.org/'); 27 | ---- 28 | 29 | # straightforward 30 | query IIII 31 | SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') LIMIT 10; 32 | ---- 33 | 1 Amanda Jordan ajordan0@com.com 34 | 2 Albert Freeman afreeman1@is.gd 35 | 3 Evelyn Morgan emorgan2@altervista.org 36 | 4 Denise Riley driley3@gmpg.org 37 | 5 Carlos Burns cburns4@miitbeian.gov.cn 38 | 6 Kathryn White kwhite5@google.com 39 | 7 Samuel Holmes sholmes6@foxnews.com 40 | 8 Harry Howell hhowell7@eepurl.com 41 | 9 Jose Foster jfoster8@yelp.com 42 | 10 Emily Stewart estewart9@opensource.org 43 | 44 | 45 | # with redirects 46 | query IIII 47 | SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://github.com/duckdb/duckdb/blob/main/data/parquet-testing/userdata1.parquet?raw=true') LIMIT 10; 48 | ---- 49 | 1 Amanda Jordan ajordan0@com.com 50 | 2 Albert Freeman afreeman1@is.gd 51 | 3 Evelyn Morgan emorgan2@altervista.org 52 | 4 Denise Riley driley3@gmpg.org 53 | 5 Carlos Burns cburns4@miitbeian.gov.cn 54 | 6 Kathryn White kwhite5@google.com 55 | 7 Samuel Holmes sholmes6@foxnews.com 56 | 8 Harry Howell hhowell7@eepurl.com 57 | 9 Jose Foster jfoster8@yelp.com 58 | 10 Emily Stewart estewart9@opensource.org 59 | 60 | # with explicit port nr 61 | query IIII 62 | SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://github.com:443/duckdb/duckdb/blob/main/data/parquet-testing/userdata1.parquet?raw=true') LIMIT 10; 63 | ---- 64 | 1 Amanda Jordan ajordan0@com.com 65 | 2 Albert Freeman afreeman1@is.gd 66 | 3 Evelyn Morgan emorgan2@altervista.org 67 | 4 Denise Riley driley3@gmpg.org 68 | 5 Carlos Burns 
cburns4@miitbeian.gov.cn 69 | 6 Kathryn White kwhite5@google.com 70 | 7 Samuel Holmes sholmes6@foxnews.com 71 | 8 Harry Howell hhowell7@eepurl.com 72 | 9 Jose Foster jfoster8@yelp.com 73 | 10 Emily Stewart estewart9@opensource.org 74 | 75 | query IIII 76 | SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://github.com/duckdb/duckdb-data/releases/download/v1.0/us+er+da+ta.parquet') LIMIT 1; 77 | ---- 78 | 1 Amanda Jordan ajordan0@com.com 79 | 80 | query IIII 81 | SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://github.com/duckdb/duckdb-data/releases/download/v1.0/us%2Ber%2Bda%2Bta.parquet') LIMIT 1; 82 | ---- 83 | 1 Amanda Jordan ajordan0@com.com 84 | -------------------------------------------------------------------------------- /.github/workflows/IntegrationTests.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests 2 | on: [push, pull_request,repository_dispatch] 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 5 | cancel-in-progress: true 6 | defaults: 7 | run: 8 | shell: bash 9 | 10 | jobs: 11 | linux-tests-httpfs: 12 | name: MinIO Tests 13 | runs-on: ubuntu-latest 14 | env: 15 | S3_TEST_SERVER_AVAILABLE: 1 16 | AWS_DEFAULT_REGION: eu-west-1 17 | AWS_ACCESS_KEY_ID: minio_duckdb_user 18 | AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password 19 | DUCKDB_S3_ENDPOINT: duckdb-minio.com:9000 20 | DUCKDB_S3_USE_SSL: false 21 | HTTP_PROXY_PUBLIC: localhost:3128 22 | TEST_PERSISTENT_SECRETS_AVAILABLE: true 23 | CORE_EXTENSIONS: "parquet;json;tpch" 24 | GEN: ninja 25 | VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake 26 | VCPKG_TARGET_TRIPLET: x64-linux 27 | PYTHON_HTTP_SERVER_URL: http://localhost:8008 28 | PYTHON_HTTP_SERVER_DIR: /tmp/python_test_server 29 | 30 | steps: 31 | - uses: actions/checkout@v4 32 | with: 33 | fetch-depth: 0 34 | submodules: 'true' 35 | 36 | - uses: actions/setup-python@v4 37 | with: 38 | python-version: '3.10' 39 | 40 | - name: Install Ninja 41 | shell: bash 42 | run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build 43 | 44 | - name: Setup Ccache 45 | uses: hendrikmuhs/ccache-action@main 46 | with: 47 | key: ${{ github.job }} 48 | save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb-httpfs' }} 49 | 50 | - name: Setup vcpkg 51 | uses: lukka/run-vcpkg@v11.1 52 | with: 53 | vcpkgGitCommitId: 5e5d0e1cd7785623065e77eff011afdeec1a3574 54 | 55 | - name: Fix permissions of test secrets 56 | shell: bash 57 | run: chmod -R 700 data/secrets 58 | 59 | # TODO: fix the authenticated proxy here 60 | - name: Install and run http proxy squid 61 | shell: bash 62 | run: | 63 | sudo apt-get install squid 64 | ./scripts/run_squid.sh --port 3128 --log_dir squid_logs & 65 | 66 | - name: Run & Populate Python test server 67 | shell: bash 68 | run: | 69 | mkdir -p $PYTHON_HTTP_SERVER_DIR 70 | cd $PYTHON_HTTP_SERVER_DIR 71 | python3 -m http.server 8008 & 72 | 73 | - name: Build 74 | shell: bash 75 | run: make 76 | 77 | - name: Install test server 78 | shell: bash 79 | run: | 80 | sudo ./scripts/install_s3_test_server.sh 81 | ./scripts/generate_presigned_url.sh 82 | 83 | - name: Start test server & run tests 84 | shell: bash 85 | run: | 86 | source ./scripts/run_s3_test_server.sh 87 | source ./scripts/set_s3_test_server_variables.sh 88 | ./build/release/test/unittest "*" --skip-error-messages 
"[]" 89 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_transactional.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_transactional.test 2 | # description: Test secret transactional safety 3 | # group: [secrets] 4 | 5 | statement ok 6 | PRAGMA enable_verification; 7 | 8 | require httpfs 9 | 10 | load __TEST_DIR__/create_secret_transactional.db 11 | 12 | statement ok 13 | set secret_directory='__TEST_DIR__/create_secret_transactional' 14 | 15 | statement ok 16 | PRAGMA threads=1 17 | 18 | foreach secret_type TEMPORARY PERSISTENT 19 | 20 | statement ok con1 21 | BEGIN TRANSACTION 22 | 23 | statement ok con1 24 | CREATE ${secret_type} SECRET s1 (TYPE S3) 25 | 26 | statement ok con2 27 | BEGIN TRANSACTION 28 | 29 | statement ok con2 30 | CREATE ${secret_type} SECRET s2 (TYPE S3) 31 | 32 | query I con1 33 | SELECT name FROM duckdb_secrets(); 34 | ---- 35 | s1 36 | 37 | query I con2 38 | SELECT name FROM duckdb_secrets(); 39 | ---- 40 | s2 41 | 42 | statement ok con1 43 | COMMIT 44 | 45 | # Transaction 2 still only sees own secret: it has not commited yet 46 | query I con2 47 | SELECT name FROM duckdb_secrets(); 48 | ---- 49 | s2 50 | 51 | # New transaction will see only committed secret 52 | query I con3 53 | SELECT name FROM duckdb_secrets(); 54 | ---- 55 | s1 56 | 57 | statement ok con2 58 | COMMIT 59 | 60 | # Now both are visible 61 | query I con3 62 | SELECT name FROM duckdb_secrets() ORDER BY name; 63 | ---- 64 | s1 65 | s2 66 | 67 | statement ok con1 68 | BEGIN TRANSACTION 69 | 70 | statement ok con1 71 | DROP SECRET s1; 72 | 73 | # Drop not yet commited: con3 will not see it yet 74 | query I con3 75 | SELECT name FROM duckdb_secrets() ORDER BY name; 76 | ---- 77 | s1 78 | s2 79 | 80 | # Commit the drop 81 | statement ok con1 82 | COMMIT 83 | 84 | # Drop now visible to con3 85 | query I con3 86 | SELECT name FROM duckdb_secrets(); 87 | ---- 88 | s2 89 | 90 | # Clean up for loop end 91 | statement ok 92 | DROP SECRET s2 93 | 94 | endloop 95 | 96 | # Now lets test transactional safety of lazily loaded persistent secrets 97 | 98 | statement ok 99 | CREATE PERSISTENT SECRET perm_s1 (TYPE S3) 100 | 101 | restart 102 | 103 | statement ok 104 | set secret_directory='__TEST_DIR__/create_secret_transactional' 105 | 106 | # After restart, we create 2 connections that each add their own tmp secret; the perm secret is now lazily loaded! 
107 | statement ok con1 108 | BEGIN TRANSACTION 109 | 110 | statement ok con1 111 | CREATE SECRET tmp_s1 (TYPE S3) 112 | 113 | statement ok con2 114 | BEGIN TRANSACTION 115 | 116 | statement ok con2 117 | CREATE SECRET tmp_s2 (TYPE S3) 118 | 119 | # Now con1 drops the lazily loaded perm secret 120 | statement ok con1 121 | DROP SECRET perm_s1; 122 | 123 | query I con1 124 | SELECT name FROM duckdb_secrets(); 125 | ---- 126 | tmp_s1 127 | 128 | # con2 still has both secrets 129 | query I con2 130 | SELECT name FROM duckdb_secrets() ORDER BY name; 131 | ---- 132 | perm_s1 133 | tmp_s2 134 | 135 | statement ok con1 136 | COMMIT 137 | 138 | statement ok con2 139 | COMMIT 140 | 141 | # Now the deletion is visible to con2 142 | query I con2 143 | SELECT name FROM duckdb_secrets() ORDER BY name; 144 | ---- 145 | tmp_s1 146 | tmp_s2 -------------------------------------------------------------------------------- /test/sql/copy/s3/upload_file_parallel.test_slow: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/upload_file_parallel.test_slow 2 | # description: Copy large parquet files from and to S3 in parallel. 3 | # group: [s3] 4 | 5 | require tpch 6 | 7 | require parquet 8 | 9 | require httpfs 10 | 11 | require-env S3_TEST_SERVER_AVAILABLE 1 12 | 13 | # Require that these environment variables are also set 14 | 15 | require-env AWS_DEFAULT_REGION 16 | 17 | require-env AWS_ACCESS_KEY_ID 18 | 19 | require-env AWS_SECRET_ACCESS_KEY 20 | 21 | require-env DUCKDB_S3_ENDPOINT 22 | 23 | require-env DUCKDB_S3_USE_SSL 24 | 25 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 26 | set ignore_error_messages 27 | 28 | statement ok 29 | CALL DBGEN(sf=1) 30 | 31 | statement ok 32 | set http_timeout=120000; 33 | 34 | # More retries (longest wait will be 25600ms) 35 | statement ok 36 | set http_retries=6; 37 | 38 | query I 39 | SELECT 40 | sum(l_extendedprice * l_discount) AS revenue 41 | FROM 42 | lineitem 43 | WHERE 44 | l_shipdate >= CAST('1994-01-01' AS date) 45 | AND l_shipdate < CAST('1995-01-01' AS date) 46 | AND l_discount BETWEEN 0.05 47 | AND 0.07 48 | AND l_quantity < 24; 49 | ---- 50 | 123141078.2283 51 | 52 | # We do this in parallel to also test synchronization of s3fs between 2 connections 53 | concurrentloop threadid 0 2 54 | 55 | statement ok 56 | SET s3_endpoint='${DUCKDB_S3_ENDPOINT}';SET s3_use_ssl=${DUCKDB_S3_USE_SSL}; 57 | 58 | # Parquet file 59 | statement ok 60 | COPY lineitem TO 's3://test-bucket/multipart/export_large_${threadid}.parquet' (FORMAT 'parquet'); 61 | 62 | query I 63 | SELECT 64 | sum(l_extendedprice * l_discount) AS revenue 65 | FROM 66 | "s3://test-bucket/multipart/export_large_${threadid}.parquet" 67 | WHERE 68 | l_shipdate >= CAST('1994-01-01' AS date) 69 | AND l_shipdate < CAST('1995-01-01' AS date) 70 | AND l_discount BETWEEN 0.05 71 | AND 0.07 72 | AND l_quantity < 24; 73 | ---- 74 | 123141078.2283 75 | 76 | endloop 77 | 78 | statement ok 79 | CALL dbgen(sf=0.01, suffix='_small'); 80 | 81 | query I 82 | SELECT 83 | sum(l_extendedprice * l_discount) AS revenue 84 | FROM 85 | lineitem_small 86 | WHERE 87 | l_shipdate >= CAST('1994-01-01' AS date) 88 | AND l_shipdate < CAST('1995-01-01' AS date) 89 | AND l_discount BETWEEN 0.05 90 | AND 0.07 91 | AND l_quantity < 24; 92 | ---- 93 | 1193053.2253 94 | 95 | # Upload and query 100 tiny files in parallel 96 | concurrentloop threadid 0 100 97 | 98 | statement ok 99 | SET 
s3_secret_access_key='${AWS_SECRET_ACCESS_KEY}';SET s3_access_key_id='${AWS_ACCESS_KEY_ID}';SET s3_region='${AWS_DEFAULT_REGION}'; SET s3_endpoint='${DUCKDB_S3_ENDPOINT}';SET s3_use_ssl=${DUCKDB_S3_USE_SSL}; 100 | 101 | statement ok 102 | SET s3_uploader_thread_limit=1 103 | 104 | # Parquet file 105 | statement ok 106 | COPY lineitem_small TO 's3://test-bucket/multipart/export_small_${threadid}.parquet' (FORMAT 'parquet'); 107 | 108 | query I 109 | SELECT 110 | sum(l_extendedprice * l_discount) AS revenue 111 | FROM 112 | "s3://test-bucket/multipart/export_small_${threadid}.parquet" 113 | WHERE 114 | l_shipdate >= CAST('1994-01-01' AS date) 115 | AND l_shipdate < CAST('1995-01-01' AS date) 116 | AND l_discount BETWEEN 0.05 117 | AND 0.07 118 | AND l_quantity < 24; 119 | ---- 120 | 1193053.2253 121 | 122 | endloop 123 | -------------------------------------------------------------------------------- /scripts/minio_s3.yml: -------------------------------------------------------------------------------- 1 | services: 2 | minio: 3 | image: minio/minio:RELEASE.2021-11-03T03-36-36Z 4 | hostname: duckdb-minio.com 5 | ports: 6 | - "9000:9000" 7 | - "9001:9001" 8 | volumes: 9 | - /tmp/minio_test_data:/data 10 | - /tmp/minio_root_data:/root/.minio 11 | environment: 12 | - MINIO_ROOT_USER=duckdb_minio_admin 13 | - MINIO_ROOT_PASSWORD=duckdb_minio_admin_password 14 | - MINIO_REGION_NAME=eu-west-1 15 | - MINIO_DOMAIN=duckdb-minio.com 16 | - MINIO_ACCESS_KEY=duckdb_minio_admin 17 | - MINIO_SECRET_KEY=duckdb_minio_admin_password 18 | command: server /data --console-address ":9001" 19 | 20 | minio_setup: 21 | image: minio/mc:RELEASE.2021-11-05T10-05-06Z 22 | depends_on: 23 | - minio 24 | links: 25 | - minio 26 | volumes: 27 | - ${PWD}/duckdb/data:/duckdb/data 28 | - ${PWD}/test/test_data:/duckdb/test_data 29 | 30 | entrypoint: > 31 | /bin/sh -c " 32 | until ( 33 | /usr/bin/mc config host add myminio http://duckdb-minio.com:9000 duckdb_minio_admin duckdb_minio_admin_password 34 | ) do 35 | echo '...waiting...' && sleep 1; 36 | done; 37 | 38 | /usr/bin/mc admin user add myminio minio_duckdb_user minio_duckdb_user_password 39 | /usr/bin/mc admin user list myminio 40 | /usr/bin/mc admin user info myminio minio_duckdb_user 41 | /usr/bin/mc admin policy set myminio readwrite user=minio_duckdb_user 42 | 43 | /usr/bin/mc admin user add myminio minio_duckdb_user_2 minio_duckdb_user_2_password 44 | /usr/bin/mc admin user list myminio 45 | /usr/bin/mc admin user info myminio minio_duckdb_user_2 46 | /usr/bin/mc admin policy set myminio readwrite user=minio_duckdb_user_2 47 | 48 | /usr/bin/mc rb --force myminio/test-bucket 49 | /usr/bin/mc mb myminio/test-bucket 50 | /usr/bin/mc policy get myminio/test-bucket 51 | 52 | /usr/bin/mc rb --force myminio/test-bucket-2 53 | /usr/bin/mc mb myminio/test-bucket-2 54 | /usr/bin/mc policy get myminio/test-bucket-2 55 | 56 | /usr/bin/mc rb --force myminio/test-bucket-public 57 | /usr/bin/mc mb myminio/test-bucket-public 58 | /usr/bin/mc policy set download myminio/test-bucket-public 59 | /usr/bin/mc policy get myminio/test-bucket-public 60 | 61 | # This is for the test of presigned URLs 62 | # !!! When missing, be sure that you have ran 'scripts/generate_presigned_url.sh' !!! 
63 | 64 | # small file upload 65 | /usr/bin/mc cp /duckdb/data/csv/phonenumbers.csv myminio/test-bucket/presigned/phonenumbers.csv 66 | /usr/bin/mc cp /duckdb/data/parquet-testing/glob/t1.parquet myminio/test-bucket/presigned/t1.parquet 67 | 68 | # large file upload 69 | /usr/bin/mc cp /duckdb/test_data/presigned-url-lineitem.parquet myminio/test-bucket/presigned/lineitem_large.parquet 70 | 71 | # Upload the db for the attach 72 | /usr/bin/mc cp /duckdb/test_data/attach.db myminio/test-bucket/presigned/attach.db 73 | /usr/bin/mc cp /duckdb/test_data/lineitem_sf1.db myminio/test-bucket/presigned/lineitem_sf1.db 74 | 75 | /usr/bin/mc share download myminio/test-bucket/presigned/phonenumbers.csv 76 | /usr/bin/mc share download myminio/test-bucket/presigned/t1.parquet 77 | /usr/bin/mc share download myminio/test-bucket/presigned/lineitem_large.parquet 78 | /usr/bin/mc share download myminio/test-bucket/presigned/attach.db 79 | 80 | echo 'FINISHED SETTING UP MINIO' 81 | exit 0; 82 | " -------------------------------------------------------------------------------- /test/sql/copy/s3/metadata_cache.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/metadata_cache.test 2 | # description: Test metadata cache that caches responses from the initial HEAD requests to open a file. 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | # Require that these environment variables are also set 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 24 | set ignore_error_messages 25 | 26 | # this test was written before we implemented the external file cache 27 | # when it is enabled, the request counts are different 28 | # we disable it so this test still makes sense 29 | statement ok 30 | set enable_external_file_cache=false; 31 | 32 | statement ok 33 | CREATE TABLE test as SELECT * FROM range(0,10) tbl(i); 34 | 35 | statement ok 36 | CREATE TABLE test1 as SELECT * FROM range(10,20) tbl(i); 37 | 38 | query II 39 | EXPLAIN ANALYZE COPY test TO 's3://test-bucket-public/root-dir/metadata_cache/test.parquet'; 40 | ---- 41 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 0.*GET\: 0.*PUT\: 1.*\#POST\: 0.* 42 | 43 | query II 44 | EXPLAIN ANALYZE COPY test TO 's3://test-bucket-public/root-dir/metadata_cache/test1.parquet'; 45 | ---- 46 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 0.*GET\: 0.*PUT\: 1.*\#POST\: 0.* 47 | 48 | # Now we query the file metadata without the global metadata cache: There should be 1 HEAD request for the file size, 49 | # then a GET for the pointer to the parquet metadata, then a GET for the metadata. 
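# The request counters below come from the "HTTPFS HTTP Stats" block that HTTPState::WriteProfilingInformation (src/http_state.cpp) adds to the EXPLAIN ANALYZE output; the regexes match against that block.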
50 | query II 51 | EXPLAIN ANALYZE SELECT COUNT(*) FROM 's3://test-bucket-public/root-dir/metadata_cache/test.parquet'; 52 | ---- 53 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 1.*GET\: 1.*PUT\: 0.*\#POST\: 0.* 54 | 55 | # Redoing query should still result in same request count 56 | query II 57 | EXPLAIN ANALYZE SELECT COUNT(*) FROM 's3://test-bucket-public/root-dir/metadata_cache/test.parquet'; 58 | ---- 59 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 1.*GET\: 1.*PUT\: 0.*\#POST\: 0.* 60 | 61 | # Now enable the global metadata cache to store the results of the head requests, saving 1 HEAD per file 62 | statement ok 63 | SET enable_http_metadata_cache=true; 64 | 65 | query II 66 | EXPLAIN ANALYZE SELECT COUNT(*) FROM 's3://test-bucket-public/root-dir/metadata_cache/test1.parquet'; 67 | ---- 68 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 1.*GET\: 1.*PUT\: 0.*\#POST\: 0.* 69 | 70 | # Now with the global metadata cache, we don't need to do the HEAD request again. Nice. 71 | query II 72 | EXPLAIN ANALYZE SELECT COUNT(*) FROM 's3://test-bucket-public/root-dir/metadata_cache/test1.parquet'; 73 | ---- 74 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 0.*GET\: 1.*PUT\: 0.*\#POST\: 0.* 75 | 76 | # Now when we write a file to a cached URL, this would break, so the cache entry should be invalidated 77 | statement ok 78 | COPY (SELECT * from range(0,100) tbl(i)) TO 's3://test-bucket-public/root-dir/metadata_cache/test1.parquet'; 79 | 80 | # We need to do a new head request here 81 | query II 82 | EXPLAIN ANALYZE SELECT COUNT(*) FROM 's3://test-bucket-public/root-dir/metadata_cache/test1.parquet'; 83 | ---- 84 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 1.*GET\: 1.*PUT\: 0.*\#POST\: 0.* 85 | 86 | # but now it's cached again 87 | query II 88 | EXPLAIN ANALYZE SELECT COUNT(*) FROM 's3://test-bucket-public/root-dir/metadata_cache/test1.parquet'; 89 | ---- 90 | analyzed_plan :.*HTTP Stats.*\#HEAD\: 0.*GET\: 1.*PUT\: 0.*\#POST\: 0.* 91 | -------------------------------------------------------------------------------- /src/include/http_state.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb/common/file_opener.hpp" 4 | #include "duckdb/main/client_context.hpp" 5 | #include "duckdb/main/client_data.hpp" 6 | #include "duckdb/common/atomic.hpp" 7 | #include "duckdb/common/optional_ptr.hpp" 8 | #include "duckdb/main/client_context_state.hpp" 9 | 10 | namespace duckdb { 11 | 12 | class CachedFileHandle; 13 | 14 | //! Represents a file that is intended to be fully downloaded, then used in parallel by multiple threads 15 | class CachedFile : public enable_shared_from_this<CachedFile> { 16 | friend class CachedFileHandle; 17 | 18 | public: 19 | unique_ptr<CachedFileHandle> GetHandle() { 20 | auto this_ptr = shared_from_this(); 21 | return make_uniq<CachedFileHandle>(this_ptr); 22 | } 23 | 24 | private: 25 | //! Cached Data 26 | shared_ptr<char> data; 27 | //! Data capacity 28 | uint64_t capacity = 0; 29 | //! Size of file 30 | idx_t size; 31 | //! Lock for initializing the file 32 | mutex lock; 33 | //! When initialized is set to true, the file is safe for parallel reading without holding the lock 34 | atomic<bool> initialized = {false}; 35 | }; 36 | 37 | //! Handle to a CachedFile 38 | class CachedFileHandle { 39 | public: 40 | explicit CachedFileHandle(shared_ptr<CachedFile> &file_p); 41 | 42 | //! allocate a buffer for the file 43 | void AllocateBuffer(idx_t size); 44 | //! Indicate the file is fully downloaded and safe for parallel reading without lock 45 | void SetInitialized(idx_t total_size); 46 | //! 
Grow buffer to new size, copying over `bytes_to_copy` to the new buffer 47 | void GrowBuffer(idx_t new_capacity, idx_t bytes_to_copy); 48 | //! Write to the buffer 49 | void Write(const char *buffer, idx_t length, idx_t offset = 0); 50 | 51 | bool Initialized() { 52 | return file->initialized; 53 | } 54 | const char *GetData() { 55 | return file->data.get(); 56 | } 57 | uint64_t GetCapacity() { 58 | return file->capacity; 59 | } 60 | //! Return the size of the initialized file 61 | idx_t GetSize() { 62 | D_ASSERT(file->initialized); 63 | return file->size; 64 | } 65 | 66 | private: 67 | unique_ptr<lock_guard<mutex>> lock; 68 | shared_ptr<CachedFile> file; 69 | }; 70 | 71 | class HTTPState : public ClientContextState { 72 | public: 73 | //! Reset all counters and cached files 74 | void Reset(); 75 | //! Get cache entry, create if not exists 76 | shared_ptr<CachedFile> &GetCachedFile(const string &path); 77 | //! Helper functions to get the HTTP state 78 | static shared_ptr<HTTPState> TryGetState(ClientContext &context); 79 | static shared_ptr<HTTPState> TryGetState(optional_ptr<FileOpener> opener); 80 | 81 | bool IsEmpty() { 82 | return head_count == 0 && get_count == 0 && put_count == 0 && post_count == 0 && delete_count == 0 && 83 | total_bytes_received == 0 && total_bytes_sent == 0; 84 | } 85 | 86 | atomic<idx_t> head_count {0}; 87 | atomic<idx_t> get_count {0}; 88 | atomic<idx_t> put_count {0}; 89 | atomic<idx_t> post_count {0}; 90 | atomic<idx_t> delete_count {0}; 91 | atomic<idx_t> total_bytes_received {0}; 92 | atomic<idx_t> total_bytes_sent {0}; 93 | 94 | //! Called by the ClientContext when the current query ends 95 | void QueryEnd(ClientContext &context) override { 96 | Reset(); 97 | } 98 | void WriteProfilingInformation(std::ostream &ss) override; 99 | 100 | private: 101 | //! Mutex to lock when getting the cached file (Parallel Only) 102 | mutex cached_files_mutex; 103 | //! 
In case of fully downloading the file, the cached files of this query 104 | unordered_map> cached_files; 105 | }; 106 | 107 | } // namespace duckdb 108 | -------------------------------------------------------------------------------- /test/sql/copy/s3/download_config.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/download_config.test 2 | # description: Test S3 configuration 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | ## Require that these environment variables are also set 12 | require-env AWS_DEFAULT_REGION 13 | 14 | require-env AWS_ACCESS_KEY_ID 15 | 16 | require-env AWS_SECRET_ACCESS_KEY 17 | 18 | require-env DUCKDB_S3_ENDPOINT 19 | 20 | require-env DUCKDB_S3_USE_SSL 21 | 22 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 23 | set ignore_error_messages 24 | 25 | statement ok 26 | CREATE TABLE test as SELECT * FROM range(0,10) tbl(i); 27 | 28 | foreach url_style path vhost 29 | # Have to set these because they get altered during the loop 30 | statement ok 31 | SET s3_secret_access_key='${AWS_SECRET_ACCESS_KEY}'; 32 | 33 | statement ok 34 | SET s3_access_key_id='${AWS_ACCESS_KEY_ID}'; 35 | 36 | statement ok 37 | SET s3_endpoint='${DUCKDB_S3_ENDPOINT}'; 38 | 39 | statement ok 40 | SET http_retries=2; 41 | 42 | statement ok 43 | SET http_retry_wait_ms=10; 44 | 45 | statement ok 46 | SET http_retry_backoff=1; 47 | 48 | statement ok 49 | SET http_timeout=50000; 50 | 51 | statement ok 52 | SET http_keep_alive=false; 53 | 54 | # Test the vhost style urls (this is the default) 55 | statement ok 56 | SET s3_url_style='${url_style}'; 57 | 58 | statement ok 59 | COPY test TO 's3://test-bucket-public/root-dir/test_${url_style}_url_style.parquet'; 60 | 61 | # vhost style access 62 | query I 63 | SELECT i FROM "http://test-bucket-public.${DUCKDB_S3_ENDPOINT}/root-dir/test_${url_style}_url_style.parquet" LIMIT 3 64 | ---- 65 | 0 66 | 1 67 | 2 68 | 69 | # path style access 70 | query I 71 | SELECT i FROM "http://${DUCKDB_S3_ENDPOINT}/test-bucket-public/root-dir/test_${url_style}_url_style.parquet" LIMIT 3 72 | ---- 73 | 0 74 | 1 75 | 2 76 | 77 | # Test public access through s3 url 78 | statement ok 79 | SET s3_secret_access_key='';SET s3_access_key_id=''; 80 | 81 | query I 82 | SELECT i FROM "s3://test-bucket-public/root-dir/test_${url_style}_url_style.parquet" LIMIT 3 83 | ---- 84 | 0 85 | 1 86 | 2 87 | 88 | endloop 89 | 90 | # empty url style is also allowed to select the default 91 | statement ok 92 | SET s3_secret_access_key='${AWS_SECRET_ACCESS_KEY}';SET s3_access_key_id='${AWS_ACCESS_KEY_ID}';SET s3_region='${AWS_DEFAULT_REGION}'; SET s3_endpoint='${DUCKDB_S3_ENDPOINT}'; SET s3_use_ssl=${DUCKDB_S3_USE_SSL}; 93 | 94 | statement ok 95 | COPY test TO 's3://test-bucket-public/root-dir/test_default_url_style.parquet'; 96 | 97 | query I 98 | SELECT i FROM "http://test-bucket-public.${DUCKDB_S3_ENDPOINT}/root-dir/test_default_url_style.parquet" LIMIT 3 99 | ---- 100 | 0 101 | 1 102 | 2 103 | 104 | # Incorrect path style throws error 105 | statement ok 106 | SET s3_url_style='handwritten'; 107 | 108 | statement error 109 | COPY test TO 's3://test-bucket-public/root-dir/test2.parquet'; 110 | ---- 111 | 112 | # 404 113 | statement error 114 | SELECT i FROM "http://test-bucket-public.${DUCKDB_S3_ENDPOINT}/root-dir/non-existent-file-ljaslkjdas.parquet" LIMIT 3 115 | ---- 116 | Unable to 
connect to URL "http://test-bucket-public. 117 | 118 | # Connection error 119 | statement error 120 | SELECT i FROM "http://test-bucket-public.duckdb-minio-non-existent-host.com:9000/root-dir/non-existent-file-ljaslkjdas.parquet" LIMIT 3 121 | ---- 122 | Could not establish connection error for HTTP HEAD to 'http://test-bucket-public. 123 | 124 | # S3 errors should throw on 125 | statement error 126 | SELECT * FROM parquet_scan('s3://this-aint-no-bucket/no-path/no-file'); 127 | ---- 128 | Unable to connect to URL "http:// 129 | -------------------------------------------------------------------------------- /test/sql/copy/s3/http_proxy.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/http_proxy.test 2 | # description: Test http proxy 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | require-env HTTP_PROXY_PUBLIC 12 | 13 | require-env AWS_DEFAULT_REGION 14 | 15 | require-env AWS_ACCESS_KEY_ID 16 | 17 | require-env AWS_SECRET_ACCESS_KEY 18 | 19 | require-env DUCKDB_S3_ENDPOINT 20 | 21 | require-env DUCKDB_S3_USE_SSL 22 | 23 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 24 | set ignore_error_messages 25 | 26 | statement ok 27 | PRAGMA enable_verification 28 | 29 | statement ok 30 | COPY (SELECT 'value-1' as value) TO 's3://test-bucket/proxy-test/test.parquet'; 31 | 32 | query I 33 | FROM 's3://test-bucket/proxy-test/test.parquet' 34 | ---- 35 | value-1 36 | 37 | # Lets try a faulty proxy first 38 | statement ok 39 | set http_proxy='blabla:1337' 40 | 41 | statement ok 42 | set http_proxy_username='xxx' 43 | 44 | statement ok 45 | set http_proxy_password='yyy' 46 | 47 | statement error 48 | FROM 's3://test-bucket/proxy-test/test.parquet' 49 | ---- 50 | Could not establish connection 51 | 52 | # Now a working one 53 | statement ok 54 | set http_proxy='${HTTP_PROXY_PUBLIC}' 55 | 56 | statement ok 57 | RESET http_proxy_username 58 | 59 | statement ok 60 | RESET http_proxy_password 61 | 62 | query I 63 | FROM 's3://test-bucket/proxy-test/test.parquet' 64 | ---- 65 | value-1 66 | 67 | # And try the working one with an 'http://' prefix. 
68 | statement ok 69 | set http_proxy='http://${HTTP_PROXY_PUBLIC}' 70 | 71 | query I 72 | FROM 's3://test-bucket/proxy-test/test.parquet' 73 | ---- 74 | value-1 75 | 76 | # Now we revert to the failing one 77 | statement ok 78 | set http_proxy='blabla:1337' 79 | 80 | # But we create a HTTP secret with the proxy 81 | statement ok 82 | CREATE SECRET http1 ( 83 | TYPE HTTP, 84 | http_proxy '${HTTP_PROXY_PUBLIC}' 85 | ); 86 | 87 | # This works now, because it uses the secret 88 | query I 89 | FROM 's3://test-bucket/proxy-test/test.parquet' 90 | ---- 91 | value-1 92 | 93 | statement ok 94 | DROP SECRET http1 95 | 96 | require-env HTTP_PROXY 97 | 98 | statement error 99 | FROM 's3://test-bucket/proxy-test/test.parquet' 100 | ---- 101 | Could not establish connection 102 | 103 | statement ok 104 | CREATE SECRET http1 ( 105 | TYPE HTTP, 106 | PROVIDER env 107 | ); 108 | 109 | # This works now, because it uses the secret 110 | query I 111 | FROM 's3://test-bucket/proxy-test/test.parquet' 112 | ---- 113 | value-1 114 | 115 | statement ok 116 | DROP SECRET http1 117 | 118 | require-env HTTP_PROXY_PRIVATE 119 | 120 | require-env HTTP_PROXY_PRIVATE_USERNAME 121 | 122 | require-env HTTP_PROXY_PRIVATE_PASSWORD 123 | 124 | # Let's try the private proxy 125 | statement ok 126 | CREATE SECRET http2 ( 127 | TYPE HTTP, 128 | http_proxy '${HTTP_PROXY_PRIVATE}', 129 | http_proxy_username '${HTTP_PROXY_PRIVATE_USERNAME}', 130 | http_proxy_password '${HTTP_PROXY_PRIVATE_PASSWORD}' 131 | ); 132 | 133 | # Correct auth means it works! 134 | query I 135 | FROM 's3://test-bucket/proxy-test/test.parquet' 136 | ---- 137 | value-1 138 | 139 | statement ok 140 | DROP SECRET http2 141 | 142 | # Now lets try incorrect auth 143 | statement ok 144 | CREATE SECRET http3 ( 145 | TYPE HTTP, 146 | http_proxy '${HTTP_PROXY_PRIVATE}', 147 | http_proxy_username 'malicious', 148 | http_proxy_password 'intruder' 149 | ); 150 | 151 | # We get a tasty HTTP 407 152 | statement error 153 | FROM 's3://test-bucket/proxy-test/test.parquet' 154 | ---- 155 | HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/proxy-test/test.parquet' (HTTP 407) 156 | -------------------------------------------------------------------------------- /test/sql/copy/parquet/test_parquet_remote_foreign_files.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/parquet/test_parquet_remote_foreign_files.test 2 | # description: Test queries on tricky parquet files over http. 
Note: on GH connection issues, these tests fail silently 3 | # group: [parquet] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | # /data/parquet-testing/bug1554.parquet 10 | query I 11 | SELECT COUNT(backlink_count) FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1554.parquet') WHERE http_status_code=200 12 | ---- 13 | 0 14 | 15 | query II 16 | SELECT http_status_code, COUNT(backlink_count) FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1554.parquet') GROUP BY http_status_code ORDER BY http_status_code 17 | ---- 18 | 200 0 19 | 301 0 20 | 21 | # /data/parquet-testing/bug1588.parquet 22 | 23 | query I 24 | SELECT has_image_link FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1588.parquet') where has_image_link = 1 25 | ---- 26 | 1 27 | 1 28 | 1 29 | 30 | # /data/parquet-testing/bug1589.parquet 31 | query I 32 | SELECT backlink_count FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1589.parquet') LIMIT 1 33 | ---- 34 | NULL 35 | 36 | statement ok 37 | SELECT * FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1589.parquet') 38 | 39 | 40 | query I 41 | SELECT "inner"['str_field'] FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1618_struct_strings.parquet') 42 | ---- 43 | hello 44 | NULL 45 | 46 | query I 47 | SELECT "inner"['f64_field'] FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1618_struct_strings.parquet') 48 | ---- 49 | NULL 50 | 1.23 51 | 52 | query I 53 | SELECT "inner" FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug1618_struct_strings.parquet') 54 | ---- 55 | {'str_field': hello, 'f64_field': NULL} 56 | {'str_field': NULL, 'f64_field': 1.23} 57 | 58 | # /data/parquet-testing/struct.parquet 59 | query I 60 | select "inner"['f64_field'] from parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/struct.parquet'); 61 | ---- 62 | NULL 63 | 1.23 64 | 65 | # /data/parquet-testing/bug2267.parquet 66 | query I 67 | SELECT * FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug2267.parquet') 68 | ---- 69 | [{'disabledPlans': [bea4c11e-220a-4e6d-8eb8-8ea15d019f90], 'skuId': c7df2760-2c81-4ef7-b578-5b5392b571df}, {'disabledPlans': [8a256a2b-b617-496d-b51b-e76466e88db0, 41781fb2-bc02-4b7c-bd55-b576c07bb09d, eec0eb4f-6444-4f95-aba0-50c24d67f998], 'skuId': 84a661c4-e949-4bd2-a560-ed7766fcaf2b}, {'disabledPlans': [], 'skuId': b05e124f-c7cc-45a0-a6aa-8cf78c946968}, {'disabledPlans': [], 'skuId': f30db892-07e9-47e9-837c-80727f46fd3d}] 70 | 71 | query I 72 | SELECT assignedLicenses[1] FROM parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/bug2267.parquet') 73 | ---- 74 | {'disabledPlans': [bea4c11e-220a-4e6d-8eb8-8ea15d019f90], 'skuId': c7df2760-2c81-4ef7-b578-5b5392b571df} 75 | 76 | # multiple files 77 | query II 78 | select * from parquet_scan(['https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/glob/t1.parquet', 'https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/glob/t2.parquet']) 79 | ---- 80 | 1 a 81 | 2 b 82 | 83 | # Malformed parquet to test fallback from prefetch 84 | query IIII 85 | select * from 
parquet_scan('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/arrow/nation.dict-malformed.parquet') limit 2; 86 | ---- 87 | 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai 88 | 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon 89 | -------------------------------------------------------------------------------- /test/extension/autoloading_base.test: -------------------------------------------------------------------------------- 1 | # name: test/extension/autoloading_base.test 2 | # description: Base tests for the autoloading mechanism for extensions 3 | # group: [extension] 4 | 5 | require httpfs 6 | 7 | # This test assumes icu and json to be available in the LOCAL_EXTENSION_REPO and NOT linked into duckdb statically 8 | # -> this should be the case for our autoloading tests where we have the local_extension_repo variable set 9 | require-env LOCAL_EXTENSION_REPO 10 | 11 | # Ensure we have a clean extension directory without any preinstalled extensions 12 | statement ok 13 | set extension_directory='__TEST_DIR__/autoloading_base' 14 | 15 | query I 16 | SELECT (count(*) > 0) FROM duckdb_extensions() WHERE install_path ILIKE '%duckdb_extension' 17 | ---- 18 | false 19 | 20 | # All extensions reported by duckdb are either statically linked or not installed 21 | query I 22 | SELECT count(*) FROM duckdb_extensions() WHERE install_mode != 'NOT_INSTALLED' AND install_mode != 'STATICALLY_LINKED' 23 | ---- 24 | 0 25 | 26 | ### No autoloading nor installing: throw error with installation hint 27 | statement ok 28 | set autoload_known_extensions=false 29 | 30 | statement ok 31 | set autoinstall_known_extensions=false 32 | 33 | statement error 34 | SET s3_region='eu-west-1'; 35 | ---- 36 | :.*Catalog Error.*Setting with name "s3_region" is not in the catalog.* 37 | 38 | statement error 39 | select * from read_json_auto('data/json/example_n.ndjson'); 40 | ---- 41 | :.*Catalog Error.*Table Function with name "read_json_auto" is not in the catalog.* 42 | 43 | statement error 44 | select * from thistablefunctionwillnotexistfosho(); 45 | ---- 46 | :.*Catalog Error.*Table Function with name thistablefunctionwillnotexistfosho does not exist.* 47 | 48 | ### Autoloading and installing, but the autoloading repository is set to non-existent location 49 | statement ok 50 | set autoload_known_extensions=true 51 | 52 | statement ok 53 | set autoinstall_known_extensions=true 54 | 55 | # Override the default repo with a non-existent local repo 56 | statement ok 57 | set autoinstall_extension_repository='/tmp/non-existent-repo'; 58 | 59 | # Error should inform the user on whats happening 60 | statement error 61 | SET s3_region='eu-west-1'; 62 | ---- 63 | :Extension Autoloading Error.*An error occurred while trying to automatically install the required extension 'httpfs'.* 64 | 65 | statement error 66 | select * from read_json_auto('data/json/example_n.ndjson'); 67 | ---- 68 | :Extension Autoloading Error.*An error occurred while trying to automatically install the required extension 'json'.* 69 | 70 | # Now override with non-existent remote repo 71 | statement ok 72 | set autoinstall_extension_repository='http://duckdb.org/what/are/the/odds/we/actually/make/this/path/and/break/this/tests'; 73 | 74 | # Error should inform the user on whats happening 75 | statement error 76 | SET s3_region='eu-west-1'; 77 | ---- 78 | :Extension Autoloading Error.*An error occurred while trying to automatically install the required extension 'httpfs'.* 79 | 
80 | statement error 81 | select * from read_json_auto('data/json/example_n.ndjson'); 82 | ---- 83 | :Extension Autoloading Error.*An error occurred while trying to automatically install the required extension 'json'.* 84 | 85 | statement error 86 | select * from thistablefunctionwillnotexistfosho(); 87 | ---- 88 | :Catalog Error.*Table Function with name thistablefunctionwillnotexistfosho does not exist.* 89 | 90 | ### Autoloading with correct tmp repo 91 | statement ok 92 | set autoinstall_extension_repository='${LOCAL_EXTENSION_REPO}'; 93 | 94 | statement ok 95 | SET s3_region='eu-west-1'; 96 | 97 | statement ok 98 | select * from read_json_auto('data/json/example_n.ndjson'); 99 | 100 | query I 101 | SELECT (count(*) > 0) FROM duckdb_extensions() WHERE install_path ILIKE '%duckdb_extension'; 102 | ---- 103 | true 104 | -------------------------------------------------------------------------------- /test/sql/secrets/create_secret_storage_backends.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secrets/create_secret_storage_backends.test 2 | # description: Test different storage backends 3 | # group: [secrets] 4 | 5 | load __TEST_DIR__/create_secret_storage_backends.db 6 | 7 | statement ok 8 | PRAGMA enable_verification; 9 | 10 | require httpfs 11 | 12 | # Ensure any currently stored secrets don't interfere with the test 13 | statement ok 14 | set allow_persistent_secrets=false; 15 | 16 | statement error 17 | CREATE TEMPORARY SECRET s1 IN LOCAL_FILE ( TYPE S3 ) 18 | ---- 19 | Invalid Input Error: Persistent secrets are disabled. Restart DuckDB and enable persistent secrets through 'SET allow_persistent_secrets=true' 20 | 21 | statement error 22 | CREATE PERSISTENT SECRET s1 IN NON_EXISTENT_SECRET_STORAGE ( TYPE S3 ) 23 | ---- 24 | Invalid Input Error: Persistent secrets are disabled. Restart DuckDB and enable persistent secrets through 'SET allow_persistent_secrets=true' 25 | 26 | # We have disabled the permanent secrets, so this should fail 27 | statement error 28 | CREATE PERSISTENT SECRET perm_s1 ( TYPE S3 ) 29 | ---- 30 | Invalid Input Error: Persistent secrets are disabled. Restart DuckDB and enable persistent secrets through 'SET allow_persistent_secrets=true' 31 | 32 | restart 33 | 34 | # Enable persistent secrets so we can set a 'secret_directory' 35 | statement ok 36 | set allow_persistent_secrets=true; 37 | 38 | statement ok 39 | set secret_directory='__TEST_DIR__/create_secret_storages' 40 | 41 | # Default for persistent secret is currently LOCAL_FILE (only native persistent storage method currently) 42 | statement ok 43 | CREATE PERSISTENT SECRET perm_s1 ( TYPE S3 ) 44 | 45 | # Specifying IN ... 
implies persistent, hence this is okay 46 | statement ok 47 | CREATE SECRET perm_s2 IN LOCAL_FILE ( TYPE S3 ) 48 | 49 | # Explicitly stating temporary is cool 50 | statement ok 51 | CREATE TEMPORARY SECRET temp_s1 ( TYPE s3 ); 52 | 53 | # Not specifying it will use the system default (which is temp) 54 | statement ok 55 | CREATE SECRET temp_s2 ( TYPE s3 ); 56 | 57 | query IIIIII 58 | SELECT * EXCLUDE (secret_string) FROM duckdb_secrets() ORDER BY name 59 | ---- 60 | perm_s1 s3 config true local_file ['s3://', 's3n://', 's3a://'] 61 | perm_s2 s3 config true local_file ['s3://', 's3n://', 's3a://'] 62 | temp_s1 s3 config false memory ['s3://', 's3n://', 's3a://'] 63 | temp_s2 s3 config false memory ['s3://', 's3n://', 's3a://'] 64 | 65 | restart 66 | 67 | # Since extensions can add secret storage backends, we allow switching the default backend 68 | statement ok 69 | set default_secret_storage='currently-non-existent' 70 | 71 | statement ok 72 | set secret_directory='__TEST_DIR__/create_secret_storages' 73 | 74 | statement error 75 | CREATE PERSISTENT SECRET s1 ( TYPE S3 ) 76 | ---- 77 | Secret storage 'currently-non-existent' not found! 78 | 79 | # We can still work around this broken default by specifying the storage explicitly 80 | statement ok 81 | CREATE PERSISTENT SECRET s1 IN LOCAL_FILE ( TYPE S3 ) 82 | 83 | restart 84 | 85 | statement ok 86 | set secret_directory='__TEST_DIR__/create_secret_storages' 87 | 88 | # Let's restore and now things work again 89 | statement ok 90 | reset default_secret_storage 91 | 92 | statement ok 93 | CREATE PERSISTENT SECRET s2 ( TYPE S3 ) 94 | 95 | query IIIIII 96 | SELECT * EXCLUDE (secret_string) FROM duckdb_secrets() ORDER BY name 97 | ---- 98 | perm_s1 s3 config true local_file ['s3://', 's3n://', 's3a://'] 99 | perm_s2 s3 config true local_file ['s3://', 's3n://', 's3a://'] 100 | s1 s3 config true local_file ['s3://', 's3n://', 's3a://'] 101 | s2 s3 config true local_file ['s3://', 's3n://', 's3a://'] 102 | 103 | statement maybe 104 | DROP SECRET perm_s1; 105 | ---- 106 | Invalid Input Error: Failed to remove non-existent secret 107 | 108 | statement maybe 109 | DROP SECRET perm_s2; 110 | ---- 111 | Invalid Input Error: Failed to remove non-existent secret 112 | -------------------------------------------------------------------------------- /test/sql/secret/secret_refresh.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/secret/secret_refresh.test 2 | # description: Tests secret refreshing 3 | # group: [secret] 4 | 5 | require-env S3_TEST_SERVER_AVAILABLE 1 6 | 7 | require-env AWS_DEFAULT_REGION 8 | 9 | require-env AWS_ACCESS_KEY_ID 10 | 11 | require-env AWS_SECRET_ACCESS_KEY 12 | 13 | require-env DUCKDB_S3_ENDPOINT 14 | 15 | require-env DUCKDB_S3_USE_SSL 16 | 17 | set ignore_error_messages 18 | 19 | require httpfs 20 | 21 | require parquet 22 | 23 | statement ok 24 | SET enable_logging=true 25 | 26 | statement ok 27 | set s3_use_ssl='${DUCKDB_S3_USE_SSL}' 28 | 29 | statement ok 30 | set s3_endpoint='${DUCKDB_S3_ENDPOINT}' 31 | 32 | statement ok 33 | set s3_region='${AWS_DEFAULT_REGION}' 34 | 35 | # Create some test data 36 | statement ok 37 | CREATE SECRET s1 ( 38 | TYPE S3, 39 | KEY_ID '${AWS_ACCESS_KEY_ID}', 40 | SECRET '${AWS_SECRET_ACCESS_KEY}' 41 | ) 42 | 43 | statement ok 44 | copy (select 1 as a) to 's3://test-bucket/test-file.parquet' 45 | 46 | statement ok 47 | DROP SECRET s1; 48 | 49 | # Firstly: a secret that is initially wrong, but correct after refresh 50 | statement ok 51 | 
CREATE SECRET s1 ( 52 | TYPE S3, 53 | KEY_ID 'BOGUS', 54 | SECRET 'ALSO BOGUS', 55 | REFRESH_INFO MAP { 56 | 'KEY_ID': '${AWS_ACCESS_KEY_ID}', 57 | 'SECRET': '${AWS_SECRET_ACCESS_KEY}' 58 | } 59 | ) 60 | 61 | # Make the request: the initial request will fail, but refresh will get triggered and the request succeeds on the second attempt 62 | statement ok 63 | FROM "s3://test-bucket/test-file.parquet" 64 | 65 | query I 66 | SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' 67 | ---- 68 | Successfully refreshed secret: s1, new key_id: 69 | 70 | # Cleanup: drop secret and logs 71 | statement ok 72 | DROP SECRET s1;set enable_logging=false;set logging_storage='stdout';set logging_storage='memory';set enable_logging=true; 73 | 74 | # Secondly: a secret that is initially wrong, and still incorrect afterwards (REFRESH will just use the original secret input to refresh) 75 | statement ok 76 | CREATE SECRET s1 ( 77 | TYPE S3, 78 | KEY_ID 'BOGUS', 79 | SECRET 'ALSO BOGUS', 80 | REFRESH 1 81 | ) 82 | 83 | # TODO: add FORBIDDEN back in once enum util for http status codes is merged into httpfs 84 | statement error 85 | FROM "s3://test-bucket/test-file.parquet" 86 | ---- 87 | HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) 88 | 89 | query I 90 | SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' 91 | ---- 92 | Successfully refreshed secret: s1, new key_id: 93 | 94 | # Cleanup: drop secret and logs 95 | statement ok 96 | DROP SECRET s1;set enable_logging=false;set logging_storage='stdout';set logging_storage='memory';set enable_logging=true; 97 | 98 | # Thirdly: a secret that is initially wrong, and contains incorrect REFRESH_INFO 99 | statement ok 100 | CREATE SECRET s1 ( 101 | TYPE S3, 102 | KEY_ID 'BOGUS', 103 | SECRET 'ALSO BOGUS', 104 | REFRESH_INFO MAP { 105 | 'THIS_KEY_DOES_NOT_EXIST': '${BOGUS}' 106 | } 107 | ) 108 | 109 | # For now, we throw the actual error that gets thrown during refresh. 
Since refresh is opt-in for now, this ensures the user can understand what's happening 110 | statement error 111 | FROM "s3://test-bucket/test-file.parquet" 112 | ---- 113 | Exception thrown while trying to refresh secret s1 114 | 115 | # Cleanup: drop secret 116 | statement ok 117 | DROP SECRET s1; 118 | 119 | # Set incorrect key id to force query to fail without secret 120 | statement ok 121 | set s3_access_key_id='bogus' 122 | 123 | # Without secret this query will fail, but since there are no suitable secrets, no refresh attempt will be made 124 | # TODO: add FORBIDDEN in once enum util for http status codes is merged into httpfs 125 | statement error 126 | FROM "s3://test-bucket/test-file.parquet" 127 | ---- 128 | HTTP Error: HTTP GET error on 'http://test-bucket.duckdb-minio.com:9000/test-file.parquet' (HTTP 403) 129 | 130 | # -> log empty 131 | query II 132 | SELECT log_level, message FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' 133 | ---- 134 | -------------------------------------------------------------------------------- /test/sql/copy/s3/s3_hive_partition.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/copy/s3/s3_hive_partition.test 2 | # description: Test the automatic parsing of the hive partitioning scheme 3 | # group: [s3] 4 | 5 | require parquet 6 | 7 | require httpfs 8 | 9 | require-env S3_TEST_SERVER_AVAILABLE 1 10 | 11 | ## Require that these environment variables are also set 12 | require-env AWS_DEFAULT_REGION 13 | 14 | require-env AWS_ACCESS_KEY_ID 15 | 16 | require-env AWS_SECRET_ACCESS_KEY 17 | 18 | require-env DUCKDB_S3_ENDPOINT 19 | 20 | require-env DUCKDB_S3_USE_SSL 21 | 22 | # override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues 23 | set ignore_error_messages 24 | 25 | # Parquet filename conflict 26 | statement ok 27 | CREATE TABLE test AS SELECT 1 as id, 'value1' as value; 28 | CREATE TABLE test2 AS SELECT 2 as id, 'value2' as value; 29 | 30 | statement ok 31 | COPY test TO 's3://test-bucket/hive-partitioning/simple/key_!-_.*()=zisiswurking1/test.parquet'; 32 | COPY test2 TO 's3://test-bucket/hive-partitioning/simple/key_!-_.*()=zisiswurking2/test.parquet'; 33 | 34 | # test parsing hive partitioning scheme, with some common special characters 35 | query III 36 | select id, value, "key_!-_.*()" from parquet_scan('s3://test-bucket/hive-partitioning/simple/*/test.parquet', HIVE_PARTITIONING=1) 37 | ---- 38 | 1 value1 zisiswurking1 39 | 2 value2 zisiswurking2 40 | 41 | # Test some medium sized files 42 | statement ok 43 | CREATE TABLE test3 as SELECT id FROM range(0,10000) tbl(id); 44 | CREATE TABLE test4 as SELECT id FROM range(10000,20000) tbl(id); 45 | 46 | statement ok 47 | COPY test3 TO 's3://test-bucket/hive-partitioning/medium/part=1/part2=1/test.parquet'; 48 | COPY test4 TO 's3://test-bucket/hive-partitioning/medium/part=1/part2=2/test.parquet'; 49 | COPY test3 TO 's3://test-bucket/hive-partitioning/medium/part=1/part2=1/test.csv'; 50 | COPY test4 TO 's3://test-bucket/hive-partitioning/medium/part=1/part2=2/test.csv'; 51 | 52 | query II 53 | select min(id), max(id) from parquet_scan('s3://test-bucket/hive-partitioning/medium/*/*/test.parquet', HIVE_PARTITIONING=1) where part2=2 54 | ---- 55 | 10000 19999 56 | 57 | query II 58 | select min(id), max(id) from parquet_scan('s3://test-bucket/hive-partitioning/medium/*/*/test.parquet', HIVE_PARTITIONING=1) where part2=1 59 | ---- 60 | 0 9999 61 | 62 | query II 63 | 
select min(id), max(id) from read_csv_auto('s3://test-bucket/hive-partitioning/medium/*/*/test.csv', HIVE_PARTITIONING=1) where part2=2 64 | ---- 65 | 10000 19999 66 | 67 | query II 68 | select min(id), max(id) from read_csv_auto('s3://test-bucket/hive-partitioning/medium/*/*/test.csv', HIVE_PARTITIONING=1) where part2=1 69 | ---- 70 | 0 9999 71 | 72 | # check cases where there are file filters AND table filters 73 | statement ok 74 | Create table t1 (a int, b int, c int); 75 | 76 | foreach i 0 1 2 3 4 5 6 7 8 9 77 | 78 | statement ok 79 | insert into t1 (select range, ${i}*10, ${i}*100 from range(0,10)); 80 | 81 | endloop 82 | 83 | statement ok 84 | COPY (SELECT * FROM t1) TO 's3://test-bucket/hive-partitioning/filter-test-parquet' (FORMAT PARQUET, PARTITION_BY c, OVERWRITE_OR_IGNORE); 85 | 86 | statement ok 87 | COPY (SELECT * FROM t1) TO 's3://test-bucket/hive-partitioning/filter-test-csv' (FORMAT CSV, PARTITION_BY c, OVERWRITE_OR_IGNORE); 88 | 89 | # There should be Table Filters (id < 50) and file filters (c = 500) 90 | query II 91 | EXPLAIN select a from parquet_scan('s3://test-bucket/hive-partitioning/filter-test-parquet/*/*.parquet', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where c=500 and a < 4; 92 | ---- 93 | physical_plan :.*PARQUET_SCAN.*Filters:.*a<4.*File Filters:.* \(CAST\(c AS.*INTEGER\) = 500\).* 94 | 95 | # There should be Table Filters (id < 50) and file filters (c = 500) 96 | query II 97 | EXPLAIN select a from read_csv_auto('s3://test-bucket/hive-partitioning/filter-test-csv/*/*.csv', HIVE_PARTITIONING=1, HIVE_TYPES_AUTOCAST=0) where c=500 and a < 4; 98 | ---- 99 | physical_plan :.*FILTER.*(a < 4).*READ_CSV_AUTO.*File Filters:.* \(CAST\(c AS.*INTEGER\) = 500\).* 100 | 101 | statement error 102 | COPY (SELECT * FROM t1) TO 's3://test-bucket/hive-partitioning/filter-test-parquet' (FORMAT PARQUET, PARTITION_BY c, OVERWRITE); 103 | ---- 104 | OVERWRITE is not supported for remote file systems 105 | -------------------------------------------------------------------------------- /src/http_state.cpp: -------------------------------------------------------------------------------- 1 | #include "http_state.hpp" 2 | #include "duckdb/main/query_profiler.hpp" 3 | 4 | namespace duckdb { 5 | 6 | CachedFileHandle::CachedFileHandle(shared_ptr &file_p) { 7 | // If the file was not yet initialized, we need to grab a lock. 
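// The lock member is a lock_guard on CachedFile::lock and stays held for the lifetime of this
// handle, until either SetInitialized() releases it once the download is complete or the handle
// is destroyed. Handles created after initialization skip the lock entirely and rely on the
// atomic `initialized` flag, so a fully downloaded file can be read in parallel.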
8 | if (!file_p->initialized) { 9 | lock = make_uniq<lock_guard<mutex>>(file_p->lock); 10 | } 11 | file = file_p; 12 | } 13 | 14 | void CachedFileHandle::SetInitialized(idx_t total_size) { 15 | if (file->initialized) { 16 | throw InternalException("Cannot set initialized on cached file that was already initialized"); 17 | } 18 | if (!lock) { 19 | throw InternalException("Cannot set initialized on cached file without lock"); 20 | } 21 | file->size = total_size; 22 | file->initialized = true; 23 | lock = nullptr; 24 | } 25 | 26 | void CachedFileHandle::AllocateBuffer(idx_t size) { 27 | if (file->initialized) { 28 | throw InternalException("Cannot allocate a buffer for a cached file that was already initialized"); 29 | } 30 | file->data = shared_ptr<char>(new char[size], std::default_delete<char[]>()); 31 | file->capacity = size; 32 | } 33 | 34 | void CachedFileHandle::GrowBuffer(idx_t new_capacity, idx_t bytes_to_copy) { 35 | // copy shared ptr to old data 36 | auto old_data = file->data; 37 | // allocate new buffer that can hold the new capacity 38 | AllocateBuffer(new_capacity); 39 | // copy the old data 40 | Write(old_data.get(), bytes_to_copy); 41 | } 42 | 43 | void CachedFileHandle::Write(const char *buffer, idx_t length, idx_t offset) { 44 | //! Only write to non-initialized files with a lock; 45 | D_ASSERT(!file->initialized && lock); 46 | memcpy(file->data.get() + offset, buffer, length); 47 | } 48 | 49 | void HTTPState::Reset() { 50 | // Reset Counters 51 | head_count = 0; 52 | get_count = 0; 53 | put_count = 0; 54 | post_count = 0; 55 | delete_count = 0; 56 | total_bytes_received = 0; 57 | total_bytes_sent = 0; 58 | 59 | // Reset cached files 60 | cached_files.clear(); 61 | } 62 | 63 | shared_ptr<HTTPState> HTTPState::TryGetState(ClientContext &context) { 64 | return context.registered_state->GetOrCreate<HTTPState>("http_state"); 65 | } 66 | 67 | shared_ptr<HTTPState> HTTPState::TryGetState(optional_ptr<FileOpener> opener) { 68 | auto client_context = FileOpener::TryGetClientContext(opener); 69 | if (client_context) { 70 | return TryGetState(*client_context); 71 | } 72 | return nullptr; 73 | } 74 | 75 | void HTTPState::WriteProfilingInformation(std::ostream &ss) { 76 | string read = "in: " + StringUtil::BytesToHumanReadableString(total_bytes_received); 77 | string written = "out: " + StringUtil::BytesToHumanReadableString(total_bytes_sent); 78 | string head = "#HEAD: " + to_string(head_count); 79 | string get = "#GET: " + to_string(get_count); 80 | string put = "#PUT: " + to_string(put_count); 81 | string post = "#POST: " + to_string(post_count); 82 | string del = "#DELETE: " + to_string(delete_count); 83 | 84 | constexpr idx_t TOTAL_BOX_WIDTH = 39; 85 | ss << "┌─────────────────────────────────────┐\n"; 86 | ss << "│┌───────────────────────────────────┐│\n"; 87 | ss << "││" + QueryProfiler::DrawPadded("HTTPFS HTTP Stats", TOTAL_BOX_WIDTH - 4) + "││\n"; 88 | ss << "││ ││\n"; 89 | ss << "││" + QueryProfiler::DrawPadded(read, TOTAL_BOX_WIDTH - 4) + "││\n"; 90 | ss << "││" + QueryProfiler::DrawPadded(written, TOTAL_BOX_WIDTH - 4) + "││\n"; 91 | ss << "││" + QueryProfiler::DrawPadded(head, TOTAL_BOX_WIDTH - 4) + "││\n"; 92 | ss << "││" + QueryProfiler::DrawPadded(get, TOTAL_BOX_WIDTH - 4) + "││\n"; 93 | ss << "││" + QueryProfiler::DrawPadded(put, TOTAL_BOX_WIDTH - 4) + "││\n"; 94 | ss << "││" + QueryProfiler::DrawPadded(post, TOTAL_BOX_WIDTH - 4) + "││\n"; 95 | ss << "││" + QueryProfiler::DrawPadded(del, TOTAL_BOX_WIDTH - 4) + "││\n"; 96 | ss << "│└───────────────────────────────────┘│\n"; 97 | ss << "└─────────────────────────────────────┘\n"; 98 | } 99 | 100 
//! Get cache entry, create if not exists
shared_ptr<CachedFile> &HTTPState::GetCachedFile(const string &path) {
	lock_guard<mutex> lock(cached_files_mutex);
	auto &cache_entry_ref = cached_files[path];
	if (!cache_entry_ref) {
		cache_entry_ref = make_shared_ptr<CachedFile>();
	}
	return cache_entry_ref;
}

} // namespace duckdb
--------------------------------------------------------------------------------
/test/sql/copy/s3/upload_small_file.test:
--------------------------------------------------------------------------------
# name: test/sql/copy/s3/upload_small_file.test
# description: Copy small csv/parquet files from and to S3.
# group: [s3]

require parquet

require httpfs

require-env S3_TEST_SERVER_AVAILABLE 1

# Require that these environment variables are also set

require-env AWS_DEFAULT_REGION

require-env AWS_ACCESS_KEY_ID

require-env AWS_SECRET_ACCESS_KEY

require-env DUCKDB_S3_ENDPOINT

require-env DUCKDB_S3_USE_SSL

# override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues
set ignore_error_messages

statement ok
CREATE TABLE web_page as (SELECT * FROM "duckdb/data/csv/real/web_page.csv");

query IIIIIIIIIIIIII
SELECT * FROM web_page LIMIT 10;
----
1	AAAAAAAABAAAAAAA	1997-09-03	NULL	2450810	2452620	Y	98539	http://www.foo.com	welcome	2531	8	3	4
2	AAAAAAAACAAAAAAA	1997-09-03	2000-09-02	2450814	2452580	N	NULL	http://www.foo.com	protected	1564	4	3	1
3	AAAAAAAACAAAAAAA	2000-09-03	NULL	2450814	2452611	N	NULL	http://www.foo.com	feedback	1564	4	3	4
4	AAAAAAAAEAAAAAAA	1997-09-03	1999-09-03	2450812	2452579	N	NULL	http://www.foo.com	general	3732	18	7	1
5	AAAAAAAAEAAAAAAA	1999-09-04	2001-09-02	2450812	2452597	N	NULL	http://www.foo.com	welcome	3732	18	3	1
6	AAAAAAAAEAAAAAAA	2001-09-03	NULL	2450814	2452597	N	NULL	http://www.foo.com	ad	3732	18	7	4
7	AAAAAAAAHAAAAAAA	1997-09-03	NULL	2450815	2452574	N	NULL	http://www.foo.com	feedback	3034	18	7	4
8	AAAAAAAAIAAAAAAA	1997-09-03	2000-09-02	2450815	2452646	Y	1898	http://www.foo.com	protected	3128	12	2	4
9	AAAAAAAAIAAAAAAA	2000-09-03	NULL	2450807	2452579	Y	84146	http://www.foo.com	welcome	3128	13	5	3
10	AAAAAAAAKAAAAAAA	1997-09-03	1999-09-03	NULL	2452623	N	NULL	http://www.foo.com	NULL	NULL	NULL	NULL	NULL

# Parquet file
statement ok
COPY web_page TO 's3://test-bucket/multipart/web_page.parquet' (FORMAT 'parquet');

query IIIIIIIIIIIIII
SELECT * FROM "s3://test-bucket/multipart/web_page.parquet" LIMIT 10;
----
1	AAAAAAAABAAAAAAA	1997-09-03	NULL	2450810	2452620	Y	98539	http://www.foo.com	welcome	2531	8	3	4
2	AAAAAAAACAAAAAAA	1997-09-03	2000-09-02	2450814	2452580	N	NULL	http://www.foo.com	protected	1564	4	3	1
3	AAAAAAAACAAAAAAA	2000-09-03	NULL	2450814	2452611	N	NULL	http://www.foo.com	feedback	1564	4	3	4
4	AAAAAAAAEAAAAAAA	1997-09-03	1999-09-03	2450812	2452579	N	NULL	http://www.foo.com	general	3732	18	7	1
5	AAAAAAAAEAAAAAAA	1999-09-04	2001-09-02	2450812	2452597	N	NULL	http://www.foo.com	welcome	3732	18	3	1
6	AAAAAAAAEAAAAAAA	2001-09-03	NULL	2450814	2452597	N	NULL	http://www.foo.com	ad	3732	18	7	4
7	AAAAAAAAHAAAAAAA	1997-09-03	NULL	2450815	2452574	N	NULL	http://www.foo.com	feedback	3034	18	7	4
8	AAAAAAAAIAAAAAAA	1997-09-03	2000-09-02	2450815	2452646	Y	1898	http://www.foo.com	protected	3128	12	2	4
9	AAAAAAAAIAAAAAAA	2000-09-03	NULL	2450807	2452579	Y	84146	http://www.foo.com	welcome	3128	13	5	3
10	AAAAAAAAKAAAAAAA	1997-09-03	1999-09-03	NULL	2452623	N	NULL	http://www.foo.com	NULL	NULL	NULL	NULL	NULL

# CSV file
statement ok
COPY web_page TO 's3://test-bucket/multipart/web_page.csv';

query IIIIIIIIIIIIII
SELECT * FROM "s3://test-bucket/multipart/web_page.csv" LIMIT 10;
----
1	AAAAAAAABAAAAAAA	1997-09-03	NULL	2450810	2452620	Y	98539	http://www.foo.com	welcome	2531	8	3	4
2	AAAAAAAACAAAAAAA	1997-09-03	2000-09-02	2450814	2452580	N	NULL	http://www.foo.com	protected	1564	4	3	1
3	AAAAAAAACAAAAAAA	2000-09-03	NULL	2450814	2452611	N	NULL	http://www.foo.com	feedback	1564	4	3	4
4	AAAAAAAAEAAAAAAA	1997-09-03	1999-09-03	2450812	2452579	N	NULL	http://www.foo.com	general	3732	18	7	1
5	AAAAAAAAEAAAAAAA	1999-09-04	2001-09-02	2450812	2452597	N	NULL	http://www.foo.com	welcome	3732	18	3	1
6	AAAAAAAAEAAAAAAA	2001-09-03	NULL	2450814	2452597	N	NULL	http://www.foo.com	ad	3732	18	7	4
7	AAAAAAAAHAAAAAAA	1997-09-03	NULL	2450815	2452574	N	NULL	http://www.foo.com	feedback	3034	18	7	4
8	AAAAAAAAIAAAAAAA	1997-09-03	2000-09-02	2450815	2452646	Y	1898	http://www.foo.com	protected	3128	12	2	4
9	AAAAAAAAIAAAAAAA	2000-09-03	NULL	2450807	2452579	Y	84146	http://www.foo.com	welcome	3128	13	5	3
10	AAAAAAAAKAAAAAAA	1997-09-03	1999-09-03	NULL	2452623	N	NULL	http://www.foo.com	NULL	NULL	NULL	NULL	NULL

--------------------------------------------------------------------------------
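The test above picks up its S3 credentials and endpoint from the AWS_* and DUCKDB_S3_* environment variables exported by the MinIO test-server scripts under scripts/. As a rough sketch of the equivalent manual setup (the secret name and every literal value below are placeholders, not the test server's actual configuration), the same connection could be configured explicitly with a DuckDB S3 secret before running the COPY statements:

CREATE OR REPLACE SECRET s3_test_secret (
    TYPE S3,
    KEY_ID 'minioadmin',       -- placeholder: use the value of AWS_ACCESS_KEY_ID
    SECRET 'minioadmin',       -- placeholder: use the value of AWS_SECRET_ACCESS_KEY
    REGION 'eu-west-1',        -- placeholder: use the value of AWS_DEFAULT_REGION
    ENDPOINT 'localhost:9000', -- placeholder: use the value of DUCKDB_S3_ENDPOINT
    URL_STYLE 'path',
    USE_SSL false              -- placeholder: match DUCKDB_S3_USE_SSL
);

COPY web_page TO 's3://test-bucket/multipart/web_page.parquet' (FORMAT 'parquet');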