├── .clang-format
├── .editorconfig
├── .github
│   └── workflows
│       └── MainDistributionPipeline.yml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── docs
│   └── UPDATING.md
├── extension_config.cmake
├── scripts
│   ├── extension-upload.sh
│   └── setup-custom-toolchain.sh
├── src
│   ├── avro_extension.cpp
│   ├── avro_multi_file_info.cpp
│   ├── avro_reader.cpp
│   └── include
│       ├── avro_extension.hpp
│       ├── avro_multi_file_info.hpp
│       ├── avro_reader.hpp
│       └── avro_type.hpp
├── test
│   ├── 4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro
│   ├── README.md
│   ├── all_nullable_list.avro
│   ├── avro.avro
│   ├── bigdata.avro
│   ├── broken_record.avro
│   ├── create_test_file.py
│   ├── empty_record.avro
│   ├── enum.avro
│   ├── fixed.avro
│   ├── iceberg
│   │   ├── 10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro
│   │   ├── 10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro
│   │   ├── 23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro
│   │   ├── cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro
│   │   ├── snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
│   │   ├── snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro
│   │   └── snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro
│   ├── logical_types.avro
│   ├── long_map.avro
│   ├── manifest.avro
│   ├── nested_nullable_lists.avro
│   ├── null_first.avro
│   ├── null_last.avro
│   ├── nullable_entry_string_array.avro
│   ├── nullable_string_array.avro
│   ├── part-r-00000.avro
│   ├── primitive_types.avro
│   ├── query_small.avro
│   ├── recursive.avro
│   ├── reuse-1.avro
│   ├── reuse-2.avro
│   ├── root-int.avro
│   ├── single-union.avro
│   ├── sql
│   │   ├── avro.test
│   │   ├── bigdata.test
│   │   ├── external_file_cache.test
│   │   ├── iceberg.test
│   │   └── test_missing_file.test
│   ├── string_array.avro
│   ├── union-name-1.avro
│   ├── union-name-2.avro
│   ├── union-name-3.avro
│   ├── union.avro
│   ├── userdata1.avro
│   ├── userdata2.avro
│   ├── userdata3.avro
│   ├── userdata4.avro
│   ├── userdata5.avro
│   └── users.avro
├── vcpkg.json
└── vcpkg_ports
    ├── liblzma
    │   ├── build-tools.patch
    │   ├── portfile.cmake
    │   ├── usage
    │   ├── vcpkg-cmake-wrapper.cmake
    │   └── vcpkg.json
    ├── snappy
    │   ├── fix_clang-cl_build.patch
    │   ├── no-werror.patch
    │   ├── pkgconfig.diff
    │   ├── portfile.cmake
    │   ├── snappy.pc.in
    │   ├── usage
    │   └── vcpkg.json
    └── zlib
        ├── 0001-Prevent-invalid-inclusions-when-HAVE_-is-set-to-0.patch
        ├── 0002-build-static-or-shared-not-both.patch
        ├── 0003-android-and-mingw-fixes.patch
        ├── portfile.cmake
        ├── usage
        ├── vcpkg-cmake-wrapper.cmake
        └── vcpkg.json

/.clang-format:
--------------------------------------------------------------------------------
---
BasedOnStyle: LLVM
SortIncludes: false
TabWidth: 4
IndentWidth: 4
ColumnLimit: 120
AllowShortFunctionsOnASingleLine: false
---
UseTab: ForIndentation
DerivePointerAlignment: false
PointerAlignment: Right
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AlignAfterOpenBracket: Align
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInConditionalStatement: false
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakTemplateDeclarations: Yes
IncludeBlocks: Regroup
Language: Cpp
AccessModifierOffset: -4
---
Language: Java
SpaceAfterCStyleCast: true
---
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
duckdb/.editorconfig
--------------------------------------------------------------------------------
/.github/workflows/MainDistributionPipeline.yml:
--------------------------------------------------------------------------------
#
# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension
#
name: Main Extension Distribution Pipeline
on:
  push:
  pull_request:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
  cancel-in-progress: true

jobs:

  duckdb-stable-build:
    name: Build extension binaries
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
    with:
      extension_name: avro
      duckdb_version: main
      ci_tools_version: main
      exclude_archs: 'windows_amd64_rtools;windows_amd64_mingw;linux_amd64_gcc4'

  duckdb-stable-deploy:
    name: Deploy extension binaries
    needs: duckdb-stable-build
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main
    secrets: inherit
    with:
      extension_name: avro
      duckdb_version: main
      ci_tools_version: main
      exclude_archs: 'windows_amd64_rtools;windows_amd64_mingw;linux_amd64_gcc4'
      deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build
.idea
cmake-build-debug
duckdb_unittest_tempdir/
.DS_Store
testext
test/python/__pycache__/
.Rhistory
vcpkg
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "duckdb"]
	path = duckdb
	url = https://github.com/duckdb/duckdb
	branch = main
[submodule "extension-ci-tools"]
	path = extension-ci-tools
	url = https://github.com/duckdb/extension-ci-tools
	branch = main
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)

# Set extension name here
set(TARGET_NAME avro)

find_path(
  AVRO_INCLUDE_DIR
  NAMES avro.h
  PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include"
  PATH_SUFFIXES avro REQUIRED)

if(MSVC) # endless screaming
  find_library(AVRO_LIBRARY avro.lib REQUIRED)
  find_library(JANSSON_LIBRARY jansson.lib REQUIRED)
  find_library(LZMA_LIBRARY lzma.lib REQUIRED)
  find_library(ZLIB_LIBRARY zlib.lib REQUIRED)
else()
  find_library(AVRO_LIBRARY libavro.a REQUIRED)
  find_library(JANSSON_LIBRARY libjansson.a REQUIRED)
  find_library(LZMA_LIBRARY liblzma.a REQUIRED)
  find_library(ZLIB_LIBRARY libz.a REQUIRED)
endif()

find_library(SNAPPY_LIBRARY snappy REQUIRED)
set(ALL_AVRO_LIBRARIES
    ${AVRO_LIBRARY}
    ${JEMALLOC_LIBRARY}
    ${JANSSON_LIBRARY}
    ${LZMA_LIBRARY}
    ${ZLIB_LIBRARY}
    ${SNAPPY_LIBRARY}
    ${GMP_LIBRARY}
    ${MATH_LIBRARY})

set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

project(${TARGET_NAME})
include_directories(src/include)

set(EXTENSION_SOURCES src/avro_extension.cpp src/avro_reader.cpp
                      src/avro_multi_file_info.cpp)

build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})

target_include_directories(${EXTENSION_NAME} PRIVATE ${AVRO_INCLUDE_DIR})
target_include_directories(${LOADABLE_EXTENSION_NAME}
                           PRIVATE ${AVRO_INCLUDE_DIR})
target_link_libraries(${EXTENSION_NAME} ${ALL_AVRO_LIBRARIES})
target_link_libraries(${LOADABLE_EXTENSION_NAME} ${ALL_AVRO_LIBRARIES})

install(
  TARGETS ${EXTENSION_NAME}
  EXPORT "${DUCKDB_EXPORT_SET}"
  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2018-2024 Stichting DuckDB Foundation

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Configuration of extension
EXT_NAME=avro
EXT_CONFIG=${PROJ_DIR}extension_config.cmake

# Include the Makefile from extension-ci-tools
include extension-ci-tools/makefiles/duckdb_extension.Makefile
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# The DuckDB Avro Extension
This repo contains a DuckDB community extension that enables DuckDB to *read* [Apache Avro (TM)](https://avro.apache.org) files. Avro is the (self-declared) "leading serialization format for record data". Avro is a self-describing *row-major* binary table format. This is in contrast to the (much more popular) Parquet format, which is *columnar*. Its row-major design enables Avro - for example - to handle appends of a few rows somewhat efficiently.

The extension does not contain Avro *write* functionality.
This is on purpose: by not providing a writer, we hope to decrease the number of Avro files in the world over time.

### Installation & Loading
Installation is simple through the DuckDB Community Extension repository; just type

```
INSTALL avro FROM community;
LOAD avro;
```
in a DuckDB instance near you. There is currently no build for WASM because of dependencies (sigh).

### The `read_avro` Function
The extension adds a single DuckDB function, `read_avro`. This function can be used like so:
```SQL
FROM read_avro('some_example_file.avro');
```
This function will expose the contents of the Avro file as a DuckDB table. You can then use any arbitrary SQL constructs to further transform this table.

### File IO
The `read_avro` function is integrated into DuckDB's file system abstraction, meaning you can read Avro files directly from e.g. HTTP or S3 sources. For example,

```SQL
FROM read_avro('http://blob.duckdb.org/data/userdata1.avro');
FROM read_avro('s3://my-example-bucket/some_example_file.avro');
```

should "just" work.

You can also *glob* multiple files in a single read call or pass a list of files to the function:

```SQL
FROM read_avro('some_example_file_*.avro');
FROM read_avro(['some_example_file_1.avro', 'some_example_file_2.avro']);
```

If the filenames somehow contain valuable information (as is unfortunately all-too-common), you can pass the `filename` argument to `read_avro`:

```SQL
FROM read_avro('some_example_file_*.avro', filename=true);
```
This will result in an additional column in the result set that contains the actual filename of the Avro file.

### Schema Conversion
This extension automatically translates the Avro schema to the DuckDB schema. *All* Avro types can be translated, except for *recursive type definitions*, which DuckDB does not support.

The type mapping is very straightforward except for Avro's "unique" way of handling `NULL`. Unlike other systems, Avro does not treat `NULL` as a possible value in a range of e.g. `INTEGER`, but instead represents `NULL` as a union of the actual type with a special `NULL` type. This is different from DuckDB, where any value can be `NULL`. Of course DuckDB also supports `UNION` types, but those would be quite cumbersome to work with here.

This extension *simplifies* the Avro schema where possible: an Avro union of any type and the special null type is simplified to just the non-null type. For example, an Avro record of the union type `["int","null"]` becomes a DuckDB `INTEGER`, which just happens to be `NULL` sometimes. Similarly, an Avro union that contains only a single type is converted to the type it contains. For example, an Avro record of the union type `["int"]` also becomes a DuckDB `INTEGER`.

The extension also "flattens" the Avro schema. Avro defines tables as root-level "record" fields, which are the same as DuckDB `STRUCT` fields. For more convenient handling, this extension turns the entries of a single top-level record into top-level columns.

### Implementation
Internally, this extension uses the "official" [Apache Avro C API](https://avro.apache.org/docs/++version++/api/c/), albeit with some minor patching to allow reading of Avro files from memory.
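To see the schema DuckDB derives for a given file, a plain `DESCRIBE` works (hypothetical file and field names, for illustration):

```SQL
DESCRIBE SELECT * FROM read_avro('some_example_file.avro');
-- per the rules above, a top-level Avro record with a field of union type
-- ["int","null"] would surface as a single nullable INTEGER column
```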

### Limitations & Next Steps
- This extension currently does not make use of **parallelism** when reading either a single (large) Avro file or when reading a list of files. Adding support for parallelism in the latter case is on the roadmap.

- There is currently no support for either projection or filter **pushdown**, but this is planned for a later stage.

- There is currently no support for the WASM or the Windows-MinGW builds of DuckDB due to issues with the Avro library dependency (sigh again). We plan to fix this eventually.

- As mentioned above, DuckDB cannot express the recursive type definitions that Avro has; this is unlikely to ever change.

- There is no support to allow users to provide a separate Avro schema file. This is unlikely to change, since all Avro files we have seen so far had their schema embedded.

- There is currently no support for the `union_by_name` flag that other readers in DuckDB support. This is planned for the future.
--------------------------------------------------------------------------------
/docs/UPDATING.md:
--------------------------------------------------------------------------------
# Extension updating
When cloning this template, the target version of DuckDB should be the latest stable release of DuckDB. However, there
will inevitably come a time when a new DuckDB version is released and the extension repository needs updating. This process goes
as follows:

- Bump submodules
  - `./duckdb` should be set to the latest tagged release
  - `./extension-ci-tools` should be set to the updated branch corresponding to the latest DuckDB release. So if you're building for DuckDB `v1.1.0`, there will be a branch in `extension-ci-tools` named `v1.1.0` which you should check out.
- Bump versions in `.github/workflows`
  - `duckdb_version` input in `duckdb-stable-build` job in `MainDistributionPipeline.yml` should be set to the latest tagged release
  - `duckdb_version` input in `duckdb-stable-deploy` job in `MainDistributionPipeline.yml` should be set to the latest tagged release
  - the reusable workflow `duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml` for the `duckdb-stable-build` job should be set to the latest tagged release

# API changes
DuckDB extensions built with this extension template are built against the internal C++ API of DuckDB. This API is not guaranteed to be stable.
What this means for extension development is that when updating your extension's DuckDB target version using the above steps, you may find that your extension no longer builds properly.

Currently, DuckDB does not (yet) provide a specific change log for these API changes, but it is generally not too hard to figure out what has changed.

For figuring out how and why the C++ API changed, we recommend using the following resources:
- DuckDB's [Release Notes](https://github.com/duckdb/duckdb/releases)
- DuckDB's history of [Core extension patches](https://github.com/duckdb/duckdb/commits/main/.github/patches/extensions)
- The git history of the relevant C++ header file of the API that has changed
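A minimal sketch of the submodule bump (assuming a hypothetical target release `v1.1.0`; substitute the actual release tag):

```bash
cd duckdb && git fetch --tags && git checkout v1.1.0 && cd ..
cd extension-ci-tools && git fetch origin && git checkout v1.1.0 && cd ..
git add duckdb extension-ci-tools && git commit -m "bump DuckDB to v1.1.0"
```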
--------------------------------------------------------------------------------
/extension_config.cmake:
--------------------------------------------------------------------------------
# This file is included by DuckDB's build system. It specifies which extensions to load

# Extension from this repo
duckdb_extension_load(avro
    SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
    LOAD_TESTS
    LINKED_LIBS "../../vcpkg_installed/wasm32-emscripten/lib/lib*.a"
)

# Any extra extensions that should be built
# e.g.: duckdb_extension_load(json)
--------------------------------------------------------------------------------
/scripts/extension-upload.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Extension upload script

# Usage: ./extension-upload.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned>
# <name>                : Name of the extension
# <extension_version>   : Version (commit / version tag) of the extension
# <duckdb_version>      : Version (commit / version tag) of DuckDB
# <architecture>        : Architecture target of the extension binary
# <s3_bucket>           : S3 bucket to upload to
# <copy_to_latest>      : Set this as the latest version ("true" / "false", default: "false")
# <copy_to_versioned>   : Set this as a versioned version that will prevent its deletion

set -e

if [[ $4 == wasm* ]]; then
  ext="/tmp/extension/$1.duckdb_extension.wasm"
else
  ext="/tmp/extension/$1.duckdb_extension"
fi

echo $ext

script_dir="$(dirname "$(readlink -f "$0")")"

# copy the extension binary; signature metadata is appended to this copy below
cat $ext > $ext.append

if [[ $4 == wasm* ]]; then
  # 0 for custom section
  # 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256)
  # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02]
  echo -n -e '\x00' >> $ext.append
  echo -n -e '\x93\x02' >> $ext.append
  # 10 in hex = 16 in decimal, length of name, 1 byte
  echo -n -e '\x10' >> $ext.append
  echo -n -e 'duckdb_signature' >> $ext.append
  # the name of the WebAssembly custom section, 16 bytes
  # 100 in hex, 256 in decimal
  # [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02],
  # for a grand total of 2 bytes
  echo -n -e '\x80\x02' >> $ext.append
fi

# (Optionally) Sign binary
if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then
  echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
  $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash
  openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign
  rm -f private.pem
fi

# Signature is always there, potentially defaulting to 256 zeros
truncate -s 256 $ext.sign

# append signature to extension binary
cat $ext.sign >> $ext.append

# compress extension binary
if [[ $4 == wasm_* ]]; then
  brotli < $ext.append > "$ext.compressed"
else
  gzip < $ext.append > "$ext.compressed"
fi

set -e

# Abort if AWS key is not set
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
  echo "No AWS key found, skipping.."
  exit 0
fi

# upload versioned version
if [[ $7 = 'true' ]]; then
  if [[ $4 == wasm* ]]; then
    aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
  else
    aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read
  fi
fi

# upload to latest version
if [[ $6 = 'true' ]]; then
  if [[ $4 == wasm* ]]; then
    aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
  else
    aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read
  fi
fi
--------------------------------------------------------------------------------
/scripts/setup-custom-toolchain.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# This is an example script that can be used to install additional toolchain dependencies. Feel free to remove this script
# if no additional toolchains are required

# To enable this script, set the `custom_toolchain_script` option to true when calling the reusable workflow
# `.github/workflows/_extension_distribution.yml` from `https://github.com/duckdb/extension-ci-tools`

# note that the $DUCKDB_PLATFORM environment variable can be used to discern between the platforms
echo "This is the sample custom toolchain script running for architecture '$DUCKDB_PLATFORM' for the avro extension."

--------------------------------------------------------------------------------
/src/avro_extension.cpp:
--------------------------------------------------------------------------------
#define DUCKDB_EXTENSION_MAIN

#include "avro_extension.hpp"

#include "duckdb.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/function/scalar_function.hpp"

#include "duckdb/main/extension_util.hpp"
#include "include/avro_reader.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "avro_multi_file_info.hpp"
#include "duckdb/common/multi_file/multi_file_function.hpp"

#include <avro.h>

namespace duckdb {

static void LoadInternal(DatabaseInstance &instance) {
	// Register the read_avro table function
	auto table_function = MultiFileFunction<AvroMultiFileInfo>("read_avro");
	table_function.projection_pushdown = true;
	ExtensionUtil::RegisterFunction(instance, MultiFileReader::CreateFunctionSet(table_function));
}

void AvroExtension::Load(DuckDB &db) {
	LoadInternal(*db.instance);
}
std::string AvroExtension::Name() {
	return "avro";
}

std::string AvroExtension::Version() const {
#ifdef EXT_VERSION_AVRO
	return EXT_VERSION_AVRO;
#else
	return "";
#endif
}

} // namespace duckdb

extern "C" {

DUCKDB_EXTENSION_API void avro_init(duckdb::DatabaseInstance &db) {
	duckdb::DuckDB db_wrapper(db);
	db_wrapper.LoadExtension<duckdb::AvroExtension>();
}

DUCKDB_EXTENSION_API const char *avro_version() {
	return duckdb::DuckDB::LibraryVersion();
}
}

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
#endif
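As a quick sanity check of the registration above, the extension should be visible from SQL once loaded (a sketch; the exact column set of `duckdb_extensions()` varies between DuckDB versions):

```SQL
SELECT extension_name, loaded
FROM duckdb_extensions()
WHERE extension_name = 'avro';
```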
--------------------------------------------------------------------------------
/src/avro_multi_file_info.cpp:
--------------------------------------------------------------------------------
#include "avro_multi_file_info.hpp"
#include "avro_reader.hpp"

namespace duckdb {

unique_ptr<MultiFileReaderInterface>
AvroMultiFileInfo::InitializeInterface(ClientContext &context, MultiFileReader &reader, MultiFileList &file_list) {
	return make_uniq<AvroMultiFileInfo>();
}

unique_ptr<BaseFileReaderOptions> AvroMultiFileInfo::InitializeOptions(ClientContext &context,
                                                                       optional_ptr<TableFunctionInfo> info) {
	return make_uniq<AvroFileReaderOptions>();
}

bool AvroMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector<string> &values,
                                        BaseFileReaderOptions &options_p, vector<string> &expected_names,
                                        vector<LogicalType> &expected_types) {
	// We currently do not have any options for the scanner, so we always return false
	return false;
}

bool AvroMultiFileInfo::ParseOption(ClientContext &context, const string &key, const Value &val,
                                    MultiFileOptions &file_options, BaseFileReaderOptions &options) {
	// We currently do not have any options for the scanner, so we always return false
	return false;
}

struct AvroMultiFileData final : public TableFunctionData {
public:
	AvroMultiFileData() = default;
};

unique_ptr<TableFunctionData> AvroMultiFileInfo::InitializeBindData(MultiFileBindData &multi_file_data,
                                                                    unique_ptr<BaseFileReaderOptions> options_p) {
	return make_uniq<AvroMultiFileData>();
}

void AvroMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
                                   MultiFileBindData &bind_data) {
	AvroFileReaderOptions options;
	if (bind_data.file_options.union_by_name) {
		throw NotImplementedException("'union_by_name' not implemented for Avro reader yet");
	}
	bind_data.reader_bind = bind_data.multi_file_reader->BindReader(context, return_types, names, *bind_data.file_list,
	                                                                bind_data, options, bind_data.file_options);
	D_ASSERT(names.size() == return_types.size());
}

optional_idx AvroMultiFileInfo::MaxThreads(const MultiFileBindData &bind_data_p,
                                           const MultiFileGlobalState &global_state, FileExpandResult expand_result) {
	if (expand_result == FileExpandResult::MULTIPLE_FILES) {
		// always launch max threads if we are reading multiple files
		return {};
	}
	// Otherwise, only one thread
	return 1;
}

struct AvroFileGlobalState : public GlobalTableFunctionState {
public:
	AvroFileGlobalState() = default;
	~AvroFileGlobalState() override = default;

public:
	//! TODO: this should contain the state of the current file being scanned
	//! so we can parallelize over a single file
	set<idx_t> files;
};

unique_ptr<GlobalTableFunctionState> AvroMultiFileInfo::InitializeGlobalState(ClientContext &context,
                                                                              MultiFileBindData &bind_data,
                                                                              MultiFileGlobalState &global_state) {
	return make_uniq<AvroFileGlobalState>();
}

//! The Avro Local File State, basically refers to the Scan of one Avro File
//! This is done by calling the Avro Scan directly on one file.
struct AvroFileLocalState : public LocalTableFunctionState {
public:
	explicit AvroFileLocalState(ExecutionContext &execution_context) : execution_context(execution_context) {};

public:
	shared_ptr<AvroReader> file_scan;
	ExecutionContext &execution_context;
};

unique_ptr<LocalTableFunctionState> AvroMultiFileInfo::InitializeLocalState(ExecutionContext &context,
                                                                            GlobalTableFunctionState &function_state) {
	return make_uniq<AvroFileLocalState>(context);
}

shared_ptr<BaseFileReader> AvroMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           BaseUnionData &union_data,
                                                           const MultiFileBindData &bind_data) {
	throw NotImplementedException("'union_by_name' is not implemented for the Avro reader yet");
}

shared_ptr<BaseFileReader> AvroMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           const OpenFileInfo &file, idx_t file_idx,
                                                           const MultiFileBindData &bind_data) {
	return make_shared_ptr<AvroReader>(context, file);
}

shared_ptr<BaseFileReader> AvroMultiFileInfo::CreateReader(ClientContext &context, const OpenFileInfo &file,
                                                           BaseFileReaderOptions &options,
                                                           const MultiFileOptions &file_options) {
	return make_shared_ptr<AvroReader>(context, file);
}

bool AvroReader::TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                   LocalTableFunctionState &lstate_p) {
	auto &gstate = gstate_p.Cast<AvroFileGlobalState>();
	auto &lstate = lstate_p.Cast<AvroFileLocalState>();
	if (gstate.files.count(file_list_idx.GetIndex())) {
		// Return false because we don't currently support more than one thread
		// scanning a file.
		return false;
	}
	gstate.files.insert(file_list_idx.GetIndex());
	lstate.file_scan = shared_ptr_cast<BaseFileReader, AvroReader>(shared_from_this());
	return true;
}

void AvroReader::Scan(ClientContext &context, GlobalTableFunctionState &global_state,
                      LocalTableFunctionState &local_state_p, DataChunk &chunk) {
	Read(chunk);
}

unique_ptr<NodeStatistics> AvroMultiFileInfo::GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) {
	//! FIXME: Here is where we might set statistics, for optimizations if we have them
	return make_uniq<NodeStatistics>();
}

} // namespace duckdb
--------------------------------------------------------------------------------
/src/avro_reader.cpp:
--------------------------------------------------------------------------------
#include "avro_reader.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/storage/caching_file_system.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/multi_file/multi_file_data.hpp"

namespace duckdb {

static AvroType TransformSchema(avro_schema_t &avro_schema, unordered_set<string> parent_schema_names) {
	switch (avro_typeof(avro_schema)) {
	case AVRO_NULL:
		return AvroType(AVRO_NULL, LogicalType::SQLNULL);
	case AVRO_BOOLEAN:
		return AvroType(AVRO_BOOLEAN, LogicalType::BOOLEAN);
	case AVRO_INT32:
		return AvroType(AVRO_INT32, LogicalType::INTEGER);
	case AVRO_INT64:
		return AvroType(AVRO_INT64, LogicalType::BIGINT);
	case AVRO_FLOAT:
		return AvroType(AVRO_FLOAT, LogicalType::FLOAT);
	case AVRO_DOUBLE:
		return AvroType(AVRO_DOUBLE, LogicalType::DOUBLE);
	case AVRO_BYTES:
		return AvroType(AVRO_BYTES, LogicalType::BLOB);
	case AVRO_STRING:
		return AvroType(AVRO_STRING, LogicalType::VARCHAR);
	case AVRO_UNION: {
		auto num_children = avro_schema_union_size(avro_schema);
		child_list_t<AvroType> union_children;
		idx_t non_null_child_idx = 0;
		unordered_map<idx_t, optional_idx> union_child_map;
		for (idx_t child_idx = 0; child_idx < num_children; child_idx++) {
			auto child_schema = avro_schema_union_branch(avro_schema, child_idx);
			auto child_type = TransformSchema(child_schema, parent_schema_names);
			// inspect the child type before it is moved into union_children
			if (child_type.duckdb_type.id() != LogicalTypeId::SQLNULL) {
				union_child_map[child_idx] = non_null_child_idx++;
			}
			union_children.push_back(
			    std::pair<string, AvroType>(StringUtil::Format("u%llu", child_idx), std::move(child_type)));
		}
		return AvroType(AVRO_UNION, LogicalTypeId::UNION, std::move(union_children), union_child_map);
	}
	case AVRO_RECORD: {
		auto schema_name = string(avro_schema_name(avro_schema));
		if (parent_schema_names.find(schema_name) != parent_schema_names.end()) {
			throw InvalidInputException("Recursive Avro types not supported: %s", schema_name);
		}
		parent_schema_names.insert(schema_name);

		auto num_children = avro_schema_record_size(avro_schema);
		if (num_children == 0) {
			// this we just ignore but we need a marker so we don't get our offsets
			// wrong
			return AvroType(AVRO_RECORD, LogicalTypeId::SQLNULL);
		}
		child_list_t<AvroType> struct_children;
		for (idx_t child_idx = 0; child_idx < num_children; child_idx++) {
			auto child_schema = avro_schema_record_field_get_by_index(avro_schema, child_idx);
			auto child_type = TransformSchema(child_schema, parent_schema_names);
			auto child_name = avro_schema_record_field_name(avro_schema, child_idx);
			if (!child_name || strlen(child_name) == 0) {
				throw InvalidInputException("Empty avro field name");
			}

			struct_children.push_back(std::pair<string, AvroType>(child_name, std::move(child_type)));
		}

		return AvroType(AVRO_RECORD, LogicalTypeId::STRUCT, std::move(struct_children));
	}
	case AVRO_ENUM: {
		auto size = avro_schema_enum_number_of_symbols(avro_schema);
		Vector levels(LogicalType::VARCHAR, size);
		auto levels_data = FlatVector::GetData<string_t>(levels);
		for (idx_t enum_idx = 0; enum_idx < size; enum_idx++) {
			levels_data[enum_idx] = StringVector::AddString(levels, avro_schema_enum_get(avro_schema, enum_idx));
		}
		levels.Verify(size);
		return AvroType(AVRO_ENUM, LogicalType::ENUM(levels, size));
	}
	case AVRO_FIXED: {
		return AvroType(AVRO_FIXED, LogicalType::BLOB);
	}
	case AVRO_ARRAY: {
		auto child_schema = avro_schema_array_items(avro_schema);
		auto child_type = TransformSchema(child_schema, parent_schema_names);
		child_list_t<AvroType> list_children;
		list_children.push_back(std::pair<string, AvroType>("list_entry", std::move(child_type)));
		return AvroType(AVRO_ARRAY, LogicalTypeId::LIST, std::move(list_children));
	}
	case AVRO_MAP: {
		auto child_schema = avro_schema_map_values(avro_schema);
		auto child_type = TransformSchema(child_schema, parent_schema_names);
		child_list_t<AvroType> map_children;
		map_children.push_back(std::pair<string, AvroType>("list_entry", std::move(child_type)));
		return AvroType(AVRO_MAP, LogicalTypeId::MAP, std::move(map_children));
	}
	case AVRO_LINK: {
		auto target = avro_schema_link_target(avro_schema);
		return TransformSchema(target, parent_schema_names);
	}
	default:
		throw NotImplementedException("Unknown Avro Type %s", avro_schema_type_name(avro_schema));
	}
}

AvroReader::AvroReader(ClientContext &context, OpenFileInfo file) : BaseFileReader(file) {
	auto caching_file_system = CachingFileSystem::Get(context);

	auto caching_file_handle = caching_file_system.OpenFile(this->file, FileOpenFlags::FILE_FLAGS_READ);
	allocated_data = Allocator::Get(context).Allocate(caching_file_handle->GetFileSize());
	auto total_size = allocated_data.GetSize();
	auto data = allocated_data.get();

	auto buf_handle = caching_file_handle->Read(data, total_size);
	auto buffer_data = buf_handle.Ptr();

	D_ASSERT(buf_handle.IsValid());
	D_ASSERT(buffer_data == data);
	auto avro_reader = avro_reader_memory(const_char_ptr_cast(buffer_data), total_size);

	if (avro_reader_reader(avro_reader, &reader)) {
		throw InvalidInputException(avro_strerror());
	}

	auto avro_schema = avro_file_reader_get_writer_schema(reader);
	avro_type = TransformSchema(avro_schema, {});
	duckdb_type = AvroType::TransformAvroType(avro_type);
	read_vec = make_uniq<Vector>(duckdb_type);

	auto interface = avro_generic_class_from_schema(avro_schema);
	avro_generic_value_new(interface, &value);
	avro_value_iface_decref(interface);

	vector<LogicalType> types;
	vector<string> names;
	// special handling for root structs, we pull up the entries
	if (duckdb_type.id() == LogicalTypeId::STRUCT) {
		for (idx_t child_idx = 0; child_idx < StructType::GetChildCount(duckdb_type); child_idx++) {
			names.push_back(StructType::GetChildName(duckdb_type, child_idx));
			types.push_back(StructType::GetChildType(duckdb_type, child_idx));
		}
	} else {
		auto schema_name = avro_schema_name(avro_schema);
		names.push_back(schema_name ? schema_name : "avro_schema");
		types.push_back(duckdb_type);
	}

	columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(names, types);
	avro_schema_decref(avro_schema);
}

static void TransformValue(avro_value *avro_val, const AvroType &avro_type, Vector &target, idx_t out_idx) {

	switch (avro_type.duckdb_type.id()) {
	case LogicalTypeId::SQLNULL: {
		FlatVector::SetNull(target, out_idx, true);
		break;
	}
	case LogicalTypeId::BOOLEAN: {
		int bool_val;
		if (avro_value_get_boolean(avro_val, &bool_val)) {
			throw InvalidInputException(avro_strerror());
		}
		FlatVector::GetData<bool>(target)[out_idx] = bool_val != 0;
		break;
	}
	case LogicalTypeId::INTEGER: {
		if (avro_value_get_int(avro_val, &FlatVector::GetData<int32_t>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::BIGINT: {
		if (avro_value_get_long(avro_val, &FlatVector::GetData<int64_t>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::FLOAT: {
		if (avro_value_get_float(avro_val, &FlatVector::GetData<float>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::DOUBLE: {
		if (avro_value_get_double(avro_val, &FlatVector::GetData<double>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::BLOB:
		switch (avro_type.avro_type) {
		case AVRO_FIXED: {
			size_t fixed_size;
			const void *fixed_data;
			if (avro_value_get_fixed(avro_val, &fixed_data, &fixed_size)) {
				throw InvalidInputException(avro_strerror());
			}
			FlatVector::GetData<string_t>(target)[out_idx] =
			    StringVector::AddStringOrBlob(target, const_char_ptr_cast(fixed_data), fixed_size);
			break;
		}
		case AVRO_BYTES: {
			avro_wrapped_buffer blob_buf = AVRO_WRAPPED_BUFFER_EMPTY;
			if (avro_value_grab_bytes(avro_val, &blob_buf)) {
				throw InvalidInputException(avro_strerror());
			}
			FlatVector::GetData<string_t>(target)[out_idx] =
			    StringVector::AddStringOrBlob(target, const_char_ptr_cast(blob_buf.buf), blob_buf.size);
			blob_buf.free(&blob_buf);
			break;
		}
		default:
			throw NotImplementedException("Unknown Avro blob type");
		}
		break;

	case LogicalTypeId::VARCHAR: {
		avro_wrapped_buffer str_buf = AVRO_WRAPPED_BUFFER_EMPTY;
		if (avro_value_grab_string(avro_val, &str_buf)) {
			throw InvalidInputException(avro_strerror());
		}
		// avro strings are null-terminated
		D_ASSERT(const_char_ptr_cast(str_buf.buf)[str_buf.size - 1] == '\0');
		if (Utf8Proc::Analyze(const_char_ptr_cast(str_buf.buf), str_buf.size - 1) == UnicodeType::INVALID) {
			throw InvalidInputException("Avro file contains invalid unicode string");
		}
		FlatVector::GetData<string_t>(target)[out_idx] =
		    StringVector::AddString(target, const_char_ptr_cast(str_buf.buf), str_buf.size - 1);
		str_buf.free(&str_buf);
		break;
	}
	case LogicalTypeId::STRUCT: {
		size_t child_count;
		if (avro_value_get_size(avro_val, &child_count)) {
			throw InvalidInputException(avro_strerror());
		}
		D_ASSERT(child_count == StructType::GetChildCount(target.GetType()));
		D_ASSERT(child_count == avro_type.children.size());

		for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
			avro_value child_value;
			if (avro_value_get_by_index(avro_val, child_idx, &child_value, nullptr)) {
				throw InvalidInputException(avro_strerror());
			}
			TransformValue(&child_value, avro_type.children[child_idx].second,
			               *StructVector::GetEntries(target)[child_idx], out_idx);
		}
		break;
	}

	case LogicalTypeId::MAP: {
		size_t entry_count;
		if (avro_value_get_size(avro_val, &entry_count)) {
			throw InvalidInputException(avro_strerror());
		}

		D_ASSERT(avro_type.children.size() == 1);
		auto child_offset = ListVector::GetListSize(target);
		ListVector::Reserve(target, child_offset + entry_count);

		auto &key_vector = MapVector::GetKeys(target);
		auto &value_vector = MapVector::GetValues(target);

		D_ASSERT(key_vector.GetType().id() == LogicalTypeId::VARCHAR);
		auto string_ptr = FlatVector::GetData<string_t>(key_vector);
		for (idx_t entry_idx = 0; entry_idx < entry_count; entry_idx++) {
			avro_value child_value;
			const char *map_key;
			if (avro_value_get_by_index(avro_val, entry_idx, &child_value, &map_key)) {
				throw InvalidInputException(avro_strerror());
			}
			D_ASSERT(map_key);
			string_ptr[child_offset + entry_idx] = StringVector::AddString(key_vector, map_key);
			TransformValue(&child_value, avro_type.children[0].second, value_vector, child_offset + entry_idx);
		}
		auto list_vector = ListVector::GetData(target);

		list_vector[out_idx].offset = child_offset;
		list_vector[out_idx].length = entry_count;
		ListVector::SetListSize(target, child_offset + entry_count);
		break;
	}

	case LogicalTypeId::UNION: {
		int discriminant;
		avro_value union_value;
		if (avro_value_get_discriminant(avro_val, &discriminant) ||
		    avro_value_get_current_branch(avro_val, &union_value)) {
			throw InvalidInputException(avro_strerror());
		}
		if (discriminant >= avro_type.children.size()) {
			throw InvalidInputException("Invalid union tag");
		}

		if (avro_type.children[discriminant].second.duckdb_type == LogicalTypeId::SQLNULL) {
			FlatVector::SetNull(target, out_idx, true);
			break;
		}

		if (target.GetType().id() == LogicalTypeId::UNION) {
			auto duckdb_child_index = avro_type.union_child_map.at(discriminant).GetIndex();
			auto &tags = UnionVector::GetTags(target);
			FlatVector::GetData<union_tag_t>(tags)[out_idx] = duckdb_child_index;
			auto &union_vector = UnionVector::GetMember(target, duckdb_child_index);

			// set all other union members to NULL for this row
			for (idx_t child_idx = 1; child_idx < StructVector::GetEntries(target).size(); child_idx++) {
				if (child_idx != duckdb_child_index + 1) { // duckdb child index is bigger because of the tag
					FlatVector::SetNull(*StructVector::GetEntries(target)[child_idx], out_idx, true);
				}
			}

			TransformValue(&union_value, avro_type.children[discriminant].second, union_vector, out_idx);
		} else { // directly recurse, we have dissolved the union
			TransformValue(&union_value, avro_type.children[discriminant].second, target, out_idx);
		}

		break;
	}
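	// Worked example for the union path above (illustrative; types assumed, not
	// taken from the original source): for the Avro union ["null","int","string"],
	// TransformSchema produces children u0 -> SQLNULL, u1 -> INTEGER,
	// u2 -> VARCHAR and a union_child_map of {1 -> 0, 2 -> 1}. A value with
	// discriminant 0 is emitted as NULL; discriminants 1 and 2 select members 0
	// and 1 of the DuckDB UNION(u1 INTEGER, u2 VARCHAR), with the non-selected
	// member vector set to NULL for that row.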
	case LogicalTypeId::ENUM: {
		auto enum_type = EnumType::GetPhysicalType(target.GetType());
		int enum_val;

		if (avro_value_get_enum(avro_val, &enum_val)) {
			throw InvalidInputException(avro_strerror());
		}
		if (enum_val < 0 || enum_val >= EnumType::GetSize(target.GetType())) {
			throw InvalidInputException("Enum value out of range");
		}

		switch (enum_type) {
		case PhysicalType::UINT8:
			FlatVector::GetData<uint8_t>(target)[out_idx] = enum_val;
			break;
		case PhysicalType::UINT16:
			FlatVector::GetData<uint16_t>(target)[out_idx] = enum_val;
			break;
		case PhysicalType::UINT32:
			FlatVector::GetData<uint32_t>(target)[out_idx] = enum_val;
			break;
		default:
			throw InternalException("Unsupported Enum Internal Type");
		}
		break;
	}

	case LogicalTypeId::LIST: {
		size_t list_len;

		if (avro_value_get_size(avro_val, &list_len)) {
			throw InvalidInputException(avro_strerror());
		}
		auto &child_vector = ListVector::GetEntry(target);
		auto child_offset = ListVector::GetListSize(target);
		ListVector::Reserve(target, child_offset + list_len);

		for (idx_t child_idx = 0; child_idx < list_len; child_idx++) {
			avro_value_t child_value;
			if (avro_value_get_by_index(avro_val, child_idx, &child_value, nullptr)) {
				throw InvalidInputException(avro_strerror());
			}
			TransformValue(&child_value, avro_type.children[0].second, child_vector, child_offset + child_idx);
		}
		auto list_vector_data = ListVector::GetData(target);
		list_vector_data[out_idx].length = list_len;
		list_vector_data[out_idx].offset = child_offset;
		ListVector::SetListSize(target, child_offset + list_len);

		break;
	}

	default:
		throw NotImplementedException(avro_type.duckdb_type.ToString());
	}
}

void AvroReader::Read(DataChunk &output) {
	idx_t out_idx = 0;

	while (avro_file_reader_read_value(reader, &value) == 0) {
		TransformValue(&value, avro_type, *read_vec, out_idx++);
		if (out_idx == STANDARD_VECTOR_SIZE) {
			break;
		}
	}
	// pull up root struct into output chunk
	if (duckdb_type.id() == LogicalTypeId::STRUCT) {
		for (idx_t col_idx = 0; col_idx < column_indexes.size(); col_idx++) {
			if (column_indexes[col_idx].GetPrimaryIndex() >= columns.size()) {
				continue; // to be filled in later
			}
			output.data[col_idx].Reference(
			    *StructVector::GetEntries(*read_vec)[column_indexes[col_idx].GetPrimaryIndex()]);
		}
	} else {
		output.data[column_indexes[0].GetPrimaryIndex()].Reference(*read_vec);
	}
	output.SetCardinality(out_idx);
}

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_extension.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb.hpp"

namespace duckdb {

class AvroExtension : public Extension {
public:
	void Load(DuckDB &db) override;
	std::string Name() override;
	std::string Version() const override;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_multi_file_info.hpp:
--------------------------------------------------------------------------------
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// avro_multi_file_info.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/multi_file/multi_file_function.hpp"

namespace duckdb {

//! We might have avro specific options one day
class AvroFileReaderOptions : public BaseFileReaderOptions {};

struct AvroMultiFileInfo : MultiFileReaderInterface {
	static unique_ptr<MultiFileReaderInterface> InitializeInterface(ClientContext &context, MultiFileReader &reader,
	                                                                MultiFileList &file_list);

	unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
	                                                    optional_ptr<TableFunctionInfo> info) override;
	bool ParseCopyOption(ClientContext &context, const string &key, const vector<string> &values,
	                     BaseFileReaderOptions &options, vector<string> &expected_names,
	                     vector<LogicalType> &expected_types) override;

	bool ParseOption(ClientContext &context, const string &key, const Value &val, MultiFileOptions &file_options,
	                 BaseFileReaderOptions &options) override;

	unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &multi_file_data,
	                                                 unique_ptr<BaseFileReaderOptions> options) override;

	//! This is where the actual binding must happen, so in this function we either:
	//! 1. union_by_name = False. We set the schema/name depending on the first file
	//! 2. union_by_name = True. Not implemented yet; BindReader throws.
	void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
	                MultiFileBindData &bind_data) override;

	optional_idx MaxThreads(const MultiFileBindData &bind_data_p, const MultiFileGlobalState &global_state,
	                        FileExpandResult expand_result) override;

	unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context, MultiFileBindData &bind_data,
	                                                           MultiFileGlobalState &global_state) override;

	unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &context,
	                                                         GlobalTableFunctionState &function_state) override;

	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        BaseUnionData &union_data, const MultiFileBindData &bind_data_p) override;

	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        const OpenFileInfo &file, idx_t file_idx,
	                                        const MultiFileBindData &bind_data) override;

	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, const OpenFileInfo &file,
	                                        BaseFileReaderOptions &options,
	                                        const MultiFileOptions &file_options) override;

	unique_ptr<NodeStatistics> GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) override;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_reader.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb/common/helper.hpp"
#include "avro_type.hpp"
#include "duckdb/common/multi_file/base_file_reader.hpp"

namespace duckdb {

class AvroReader : public BaseFileReader {
public:
	AvroReader(ClientContext &context, const OpenFileInfo file);

	~AvroReader() {
		avro_value_decref(&value);
		avro_file_reader_close(reader);
	}

public:
	void Read(DataChunk &output);

	string GetReaderType() const override {
		return "Avro";
	}

	bool TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate,
	                       LocalTableFunctionState &lstate) override;
	void Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state,
	          DataChunk &chunk) override;

public:
	avro_file_reader_t reader;
	avro_value_t value;
	unique_ptr<Vector> read_vec;

	AllocatedData allocated_data;
	AvroType avro_type;
	LogicalType duckdb_type;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_type.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb/common/types.hpp"
#include <avro.h>
#include "duckdb/common/optional_idx.hpp"

namespace duckdb {

struct AvroType {
public:
	AvroType() : duckdb_type(LogicalType::INVALID) {
	}
	AvroType(avro_type_t avro_type_p, LogicalType duckdb_type_p, child_list_t<AvroType> children_p = {},
	         unordered_map<idx_t, optional_idx> union_child_map_p = {})
	    : duckdb_type(duckdb_type_p), avro_type(avro_type_p), children(children_p),
	      union_child_map(union_child_map_p) {
	}

public:
	bool operator==(const AvroType &other) const {
		return duckdb_type == other.duckdb_type && avro_type == other.avro_type && children == other.children &&
		       union_child_map == other.union_child_map;
	}

public:
	// we use special transformation rules for unions with null:
	// 1) the null does not become a union entry and
	// 2) if there is only one entry the union disappears and is replaced by its
	// child
	static LogicalType TransformAvroType(const AvroType &avro_type) {
		child_list_t<LogicalType> children;

		switch (avro_type.duckdb_type.id()) {
		case LogicalTypeId::STRUCT: {
			for (auto &child : avro_type.children) {
				children.push_back(std::pair<string, LogicalType>(child.first, TransformAvroType(child.second)));
			}
			D_ASSERT(!children.empty());
			return LogicalType::STRUCT(std::move(children));
		}
		case LogicalTypeId::LIST:
			return LogicalType::LIST(TransformAvroType(avro_type.children[0].second));
		case LogicalTypeId::MAP: {
			child_list_t<LogicalType> children;
			children.push_back(std::pair<string, LogicalType>("key", LogicalType::VARCHAR));
			children.push_back(
			    std::pair<string, LogicalType>("value", TransformAvroType(avro_type.children[0].second)));
			return LogicalType::MAP(LogicalType::STRUCT(std::move(children)));
		}
		case LogicalTypeId::UNION: {
			for (auto &child : avro_type.children) {
				if (child.second.duckdb_type == LogicalTypeId::SQLNULL) {
					continue;
				}
				children.push_back(std::pair<string, LogicalType>(child.first, TransformAvroType(child.second)));
			}
			if (children.size() == 1) {
				return children[0].second;
			}
			if (children.empty()) {
				throw InvalidInputException("Empty union type");
			}
			return LogicalType::UNION(std::move(children));
		}
		default:
			return LogicalType(avro_type.duckdb_type);
		}
	}

public:
	LogicalType duckdb_type;
	avro_type_t avro_type;
	child_list_t<AvroType> children;
	unordered_map<idx_t, optional_idx> union_child_map;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/test/4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro
--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
# Testing this extension
This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the avro extension this should probably be the goal too.

The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests:
```bash
make test
```
or
```bash
make test_debug
```
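A minimal sketch of what such a test could look like (hypothetical file name and contents; `users.avro` and its two rows come from `create_test_file.py` below):

```
# name: test/sql/users_example.test
# group: [avro]

require avro

query TI
SELECT name, favorite_number FROM read_avro('test/users.avro') ORDER BY name;
----
Alyssa	256
Ben	7
```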
--------------------------------------------------------------------------------
/test/all_nullable_list.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/all_nullable_list.avro
--------------------------------------------------------------------------------
/test/avro.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/avro.avro
--------------------------------------------------------------------------------
/test/bigdata.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/bigdata.avro
--------------------------------------------------------------------------------
/test/broken_record.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/broken_record.avro
--------------------------------------------------------------------------------
/test/create_test_file.py:
--------------------------------------------------------------------------------
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter


json_schema = """
{"namespace": "example.avro",
 "type": "record",
 "name": "User",
 "fields": [
     {"name": "name", "type": "string"},
     {"name": "favorite_number", "type": ["int", "null"]},
     {"name": "favorite_color", "type": ["string", "null"]}
 ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), schema)
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

reader = DataFileReader(open("users.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{"namespace": "example2.avro",
 "type": "int",
 "name": "my_int"
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("root-int.avro", "wb"), DatumWriter(), schema)
writer.append(42)
writer.append(43)

writer.close()

reader = DataFileReader(open("root-int.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "single_union", "type": ["int"]}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("single-union.avro", "wb"), DatumWriter(), schema)
writer.append({ "single_union":42})

writer.close()

reader = DataFileReader(open("single-union.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "null_first", "type": ["null","int"]}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("null_first.avro", "wb"), DatumWriter(), schema)
writer.append({ "null_first":42})
writer.append({})


writer.close()

reader = DataFileReader(open("null_first.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "null_last", "type": ["int","null"]}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("null_last.avro", "wb"), DatumWriter(), schema)
writer.append({ "null_last":42})
writer.append({})


writer.close()

reader = DataFileReader(open("null_last.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "null", "type": "null"},
      {"name": "boolean", "type": "boolean"},
      {"name": "int", "type": "int"},
      {"name": "long", "type": "long"},
      {"name": "float", "type": "float"},
      {"name": "double", "type": "double"},
      {"name": "bytes", "type": "bytes"},
      {"name": "string", "type": "string"}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("primitive_types.avro", "wb"), DatumWriter(), schema)

writer.append({ 'null':None, 'boolean': False, 'int': -2147483648, 'long' : -9223372036854775808, 'float' : -3.4028235e+38, 'double' : -1.7976931348623157e+308, 'bytes' : 'thisisalongblob\x00withnullbytes'.encode(), 'string' : "🦆🦆🦆🦆🦆🦆"})
writer.append({ 'null':None, 'boolean': True, 'int': 2147483647, 'long' : 9223372036854775807, 'float' : 3.4028235e+38, 'double' : 1.7976931348623157e+308, 'bytes': '\x00\x00\x00a'.encode(), 'string' : 'goo'})


writer.close()

reader = DataFileReader(open("primitive_types.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{
    "type": "record",
    "name": "MySchema",
    "namespace": "com.company",
    "fields": [
        {
            "name": "color",
            "type": {
                "type": "enum",
                "name": "Color",
                "symbols": [
                    "UNKNOWN",
                    "GREEN",
                    "RED"
                ]
            },
            "default": "UNKNOWN"
        }
    ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("enum.avro", "wb"), DatumWriter(), schema)

writer.append({ 'color': 'GREEN'})
writer.append({ 'color': 'GREEN'})
writer.append({ 'color': 'RED'})
writer.append({ 'color': 'UNKNOWN'})
writer.append({ 'color': 'UNKNOWN'})

writer.close()

reader = DataFileReader(open("enum.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "md5",
         "type": {"type": "fixed", "size": 32, "name": "md5"}
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("fixed.avro", "wb"), DatumWriter(), schema)

writer.append({ 'md5' : '47336f3f2497b70ac046cf23298e20a7'.encode()})
writer.append({ 'md5' : 'a789a15a7ff7db4a0d1b186363ef0771'.encode()})
writer.append({ 'md5' : 'c9db7c67a6acb5a65c78b19e9e01d7b0'.encode()})
writer.append({ 'md5' : 'ac441296bcbd44442301204a8f061cf2'.encode()})

writer.close()

reader = DataFileReader(open("fixed.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": {
            "type": "array",
            "items" : "string",
            "default": []
         }
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("string_array.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : ['is', 'cool','array']})
writer.append({ 'string_arr' : ['data']})

writer.close()

reader = DataFileReader(open("string_array.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "long_map",
         "type": {
            "type": "map",
            "values" : "long",
            "default": {}
         }
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("long_map.avro", "wb"), DatumWriter(), schema)

writer.append({ 'long_map' : {'one': 42}})
writer.append({ 'long_map' : {'two': 43}})
writer.append({ 'long_map' : {'three': 44}})

writer.close()

reader = DataFileReader(open("long_map.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": ["null", {
            "type": "array",
            "items" : "string",
            "default": []
         }]
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("nullable_string_array.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : ['is', 'cool','array']})
writer.append({ 'string_arr' : ['data']})

writer.close()

reader = DataFileReader(open("nullable_string_array.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": {
            "type": "array",
            "items" : ["string", "null"],
            "default": []
         }
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("nullable_entry_string_array.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,None, 'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : [None]})
writer.append({ 'string_arr' : [None, None, None]})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : [None, 'is', 'cool',None, 'array',None]})
writer.append({ 'string_arr' : ['data',None]})

writer.close()

reader = DataFileReader(open("nullable_entry_string_array.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": ["null", {
            "type": "array",
            "items" : ["string", "null"],
            "default": []
         }]
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("all_nullable_list.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,None, 'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : [None]})
writer.append({ 'string_arr' : [None, None, None]})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : [None, 'is', 'cool',None, 'array',None]})
writer.append({ 'string_arr' : ['data',None]})

writer.close()

reader = DataFileReader(open("all_nullable_list.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "nested_ints",
         "type": ["null", {
            "type": "array",
            "items" : ["null", {
               "type": "array",
               "items" : ["int", "null"],
               "default": []
            }],
            "default": []
         }]
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("nested_nullable_lists.avro", "wb"), DatumWriter(), schema)

writer.append({ 'nested_ints' : None})
writer.append({ 'nested_ints' : [None]})
writer.append({ 'nested_ints' : [[None], [None]]})
writer.append({ 'nested_ints' : [None, None]})
writer.append({ 'nested_ints' : [[42]]})
writer.append({ 'nested_ints' : [[42], [43]]})
writer.append({ 'nested_ints' : [[42, 43]]})
writer.append({ 'nested_ints' : [[42, 43], None, [44, 45]]})
writer.append({ 'nested_ints' : [[42, None, 43, None], None, [44, None, 45, None], None, [46]]})

writer.close()

reader = DataFileReader(open("nested_nullable_lists.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{
  "type": "record",
  "name": "LongList",
  "fields" : [
    {"name":
"value", "type": "long"}, 513 | {"name": "next", "type": ["null", "LongList"]} 514 | ] 515 | } 516 | """ 517 | 518 | schema = avro.schema.parse(json_schema) 519 | 520 | writer = DataFileWriter(open("recursive.avro", "wb"), DatumWriter(), schema) 521 | 522 | 523 | writer.append({ 'value': 42}) 524 | writer.append({ 'value': 43, 'next' : {'value': 44}}) 525 | writer.append({ 'value': 43, 'next' : {'value': 44, 'next' : {'value': 45}}}) 526 | 527 | writer.close() 528 | 529 | reader = DataFileReader(open("recursive.avro", "rb"), DatumReader()) 530 | for user in reader: 531 | print(user) 532 | reader.close() 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | json_schema = """ 541 | { "type": "record", 542 | "name": "root", 543 | "fields": [ 544 | {"name": "n", "type": "null"} 545 | ] 546 | } 547 | """ 548 | 549 | schema = avro.schema.parse(json_schema) 550 | 551 | writer = DataFileWriter(open("broken_record.avro", "wb"), DatumWriter(), schema) 552 | 553 | writer.append({}) 554 | writer.append({}) 555 | 556 | # writer.append({ 'value': 42}) 557 | # writer.append({ 'value': 43, 'next' : {'value': 44}}) 558 | # writer.append({ 'value': 43, 'next' : {'value': 44, 'next' : {'value': 45}}}) 559 | 560 | writer.close() 561 | 562 | reader = DataFileReader(open("broken_record.avro", "rb"), DatumReader()) 563 | for user in reader: 564 | print(user) 565 | reader.close() 566 | 567 | 568 | 569 | 570 | 571 | 572 | # record 573 | # detect recursive types or what happens here? 574 | 575 | 576 | # union by name 577 | 578 | 579 | json_schema = """ 580 | { "type": "record", 581 | "name": "root", 582 | "fields": [ 583 | {"name": "one", "type": "int"}, 584 | {"name": "two", "type": "double"}, 585 | {"name": "three", "type": "string"} 586 | ] 587 | } 588 | """ 589 | 590 | schema = avro.schema.parse(json_schema) 591 | 592 | writer = DataFileWriter(open("union-name-1.avro", "wb"), DatumWriter(), schema) 593 | 594 | 595 | 596 | writer.append({ 'one' : 10, 'two' : 2.0, 'three': 's30'}) 597 | writer.append({ 'one' : 11, 'two' : 2.1, 'three': 's31'}) 598 | 599 | 600 | writer.close() 601 | 602 | reader = DataFileReader(open("union-name-1.avro", "rb"), DatumReader()) 603 | for user in reader: 604 | print(user) 605 | reader.close() 606 | 607 | 608 | 609 | 610 | json_schema = """ 611 | { "type": "record", 612 | "name": "root", 613 | "fields": [ 614 | {"name": "two", "type": "double"}, 615 | {"name": "one", "type": "int"}, 616 | {"name": "three", "type": "string"} 617 | ] 618 | } 619 | """ 620 | 621 | schema = avro.schema.parse(json_schema) 622 | 623 | writer = DataFileWriter(open("union-name-2.avro", "wb"), DatumWriter(), schema) 624 | 625 | 626 | 627 | writer.append({ 'one' : 12, 'two' : 2.2, 'three': 's32'}) 628 | writer.append({ 'one' : 13, 'two' : 2.3, 'three': 's33'}) 629 | 630 | 631 | writer.close() 632 | 633 | reader = DataFileReader(open("union-name-2.avro", "rb"), DatumReader()) 634 | for user in reader: 635 | print(user) 636 | reader.close() 637 | 638 | 639 | 640 | json_schema = """ 641 | { "type": "record", 642 | "name": "root", 643 | "fields": [ 644 | {"name": "three", "type": "string"}, 645 | {"name": "two", "type": "double"}, 646 | {"name": "one", "type": "int"} 647 | ] 648 | } 649 | """ 650 | 651 | schema = avro.schema.parse(json_schema) 652 | 653 | writer = DataFileWriter(open("union-name-3.avro", "wb"), DatumWriter(), schema) 654 | 655 | 656 | 657 | writer.append({ 'one' : 14, 'two' : 2.4, 'three': 's34'}) 658 | writer.append({ 'one' : 15, 'two' : 2.5, 'three': 's35'}) 659 | 660 | 661 | writer.close() 662 | 663 | 
reader = DataFileReader(open("union-name-3.avro", "rb"), DatumReader()) 664 | for user in reader: 665 | print(user) 666 | reader.close() 667 | 668 | 669 | 670 | 671 | json_schema = """ 672 | { 673 | "type": "record", 674 | "name": "Request", 675 | "namespace": "example.avro", 676 | "fields": [ 677 | { 678 | "name": "request_id", 679 | "type": "string" 680 | }, 681 | { 682 | "name": "client_version", 683 | "type": { 684 | "type": "record", 685 | "name": "Version", 686 | "fields": [ 687 | { 688 | "name": "major", 689 | "type": "int" 690 | }, 691 | { 692 | "name": "minor", 693 | "type": "int" 694 | } 695 | ] 696 | } 697 | }, 698 | { 699 | "name": "server_version", 700 | "type": "Version" 701 | } 702 | ] 703 | } 704 | """ 705 | 706 | 707 | 708 | schema = avro.schema.parse(json_schema) 709 | 710 | writer = DataFileWriter(open("reuse-1.avro", "wb"), DatumWriter(), schema) 711 | 712 | 713 | writer.append({ 'request_id' : 'hello', 'client_version' : {'major': 4, 'minor' : 2}, 'server_version': {'major': 8, 'minor' : 5}}) 714 | writer.append({ 'request_id' : 'world', 'client_version' : {'major': 5, 'minor' : 3}, 'server_version': {'major': 9, 'minor' : 6}}) 715 | 716 | 717 | writer.close() 718 | 719 | reader = DataFileReader(open("reuse-1.avro", "rb"), DatumReader()) 720 | for user in reader: 721 | print(user) 722 | reader.close() 723 | 724 | 725 | 726 | 727 | json_schema = """ 728 | { 729 | "type": "record", 730 | "name": "Request", 731 | "namespace": "example.avro", 732 | "fields": [ 733 | { 734 | "name": "version", 735 | "type": { 736 | "type": "record", 737 | "name": "Version", 738 | "fields": [ 739 | { "name": "major", "type": "int" }, 740 | { "name": "minor", "type": "int" } 741 | ] 742 | } 743 | }, 744 | { 745 | "name": "details", 746 | "type": { 747 | "type": "record", 748 | "name": "Details", 749 | "fields": [ 750 | { "name": "release_version", "type": "Version" } 751 | ] 752 | } 753 | } 754 | ] 755 | } 756 | """ 757 | 758 | 759 | 760 | schema = avro.schema.parse(json_schema) 761 | 762 | writer = DataFileWriter(open("reuse-2.avro", "wb"), DatumWriter(), schema) 763 | 764 | 765 | writer.append({ 'version' : {'major': 4, 'minor' : 2}, 'details': {'release_version': {'major': 8, 'minor' : 5}}}) 766 | writer.append({ 'version' : {'major': 5, 'minor' : 3}, 'details': {'release_version': {'major': 9, 'minor' : 6}}}) 767 | 768 | 769 | writer.close() 770 | 771 | reader = DataFileReader(open("reuse-2.avro", "rb"), DatumReader()) 772 | for user in reader: 773 | print(user) 774 | reader.close() 775 | 776 | 777 | 778 | 779 | json_schema = """ 780 | {"type": "record", 781 | "name": "root", 782 | "fields": [ 783 | {"name": "c0", "type": "long"}, 784 | {"name": "c1", "type": "long"}, 785 | {"name": "c2", "type": "long"}, 786 | {"name": "c3", "type": "long"}, 787 | {"name": "c4", "type": "long"}, 788 | {"name": "c5", "type": "long"}, 789 | {"name": "c6", "type": "long"}, 790 | {"name": "c7", "type": "long"}, 791 | {"name": "c8", "type": "long"}, 792 | {"name": "c9", "type": "long"} 793 | ] 794 | } 795 | """ 796 | 797 | n = 100000 798 | 799 | schema = avro.schema.parse(json_schema) 800 | 801 | writer = DataFileWriter(open("bigdata.avro", "wb"), DatumWriter(), schema, codec="deflate") 802 | for r in range(1000000): 803 | writer.append({f'c{i}': 10000000*i + r for i in range(10)}) 804 | 805 | writer.close() 806 | 807 | 808 | count = 0 809 | reader = DataFileReader(open("bigdata.avro", "rb"), DatumReader()) 810 | for user in reader: 811 | count = count + 1 812 | reader.close() 813 | print(count) 814 | 815 | 
816 | 817 | 818 | 819 | 820 | -------------------------------------------------------------------------------- /test/empty_record.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/empty_record.avro -------------------------------------------------------------------------------- /test/enum.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/enum.avro -------------------------------------------------------------------------------- /test/fixed.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/fixed.avro -------------------------------------------------------------------------------- /test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro -------------------------------------------------------------------------------- /test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro -------------------------------------------------------------------------------- /test/iceberg/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro -------------------------------------------------------------------------------- /test/iceberg/cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro -------------------------------------------------------------------------------- /test/iceberg/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro -------------------------------------------------------------------------------- /test/iceberg/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro -------------------------------------------------------------------------------- /test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro -------------------------------------------------------------------------------- /test/logical_types.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/logical_types.avro -------------------------------------------------------------------------------- /test/long_map.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/long_map.avro -------------------------------------------------------------------------------- /test/manifest.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/manifest.avro -------------------------------------------------------------------------------- /test/nested_nullable_lists.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/nested_nullable_lists.avro -------------------------------------------------------------------------------- /test/null_first.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/null_first.avro -------------------------------------------------------------------------------- /test/null_last.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/null_last.avro -------------------------------------------------------------------------------- /test/nullable_entry_string_array.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/nullable_entry_string_array.avro -------------------------------------------------------------------------------- /test/nullable_string_array.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/nullable_string_array.avro -------------------------------------------------------------------------------- /test/part-r-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/part-r-00000.avro -------------------------------------------------------------------------------- /test/primitive_types.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/primitive_types.avro -------------------------------------------------------------------------------- /test/query_small.avro: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/query_small.avro -------------------------------------------------------------------------------- /test/recursive.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/recursive.avro -------------------------------------------------------------------------------- /test/reuse-1.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/reuse-1.avro -------------------------------------------------------------------------------- /test/reuse-2.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/reuse-2.avro -------------------------------------------------------------------------------- /test/root-int.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/root-int.avro -------------------------------------------------------------------------------- /test/single-union.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/single-union.avro -------------------------------------------------------------------------------- /test/sql/avro.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/avro.test 2 | # description: test avro extension 3 | # group: [avro] 4 | 5 | require avro 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | # usual suspect, the userdata1 file 11 | query II 12 | select column_name, column_type from (DESCRIBE FROM read_avro('test/userdata1.avro')); 13 | ---- 14 | registration_dttm VARCHAR 15 | id BIGINT 16 | first_name VARCHAR 17 | last_name VARCHAR 18 | email VARCHAR 19 | gender VARCHAR 20 | ip_address VARCHAR 21 | cc BIGINT 22 | country VARCHAR 23 | birthdate VARCHAR 24 | salary DOUBLE 25 | title VARCHAR 26 | comments VARCHAR 27 | 28 | 29 | query I 30 | FROM read_avro('test/userdata1.avro') SELECT COUNT(*) 31 | ---- 32 | 1000 33 | 34 | query III 35 | FROM read_avro('test/userdata1.avro') SELECT first_name, cc, salary ORDER BY registration_dttm LIMIT 10; 36 | ---- 37 | Lillian 201713786459078 282503.77 38 | Chris 5602220700741429 NULL 39 | Nicholas 3575506969751259 192076.79 40 | Johnny 5602239825516409 169429.76 41 | Bruce NULL 118244.57 42 | Heather NULL 164117.18 43 | Larry 3531208154739438 139177.38 44 | Roy 3589146577885209 262816.87 45 | James 3589416270039051 211553.57 46 | Sean NULL NULL 47 | 48 | # usual suspect, the userdata1 file, this time with a filename arg 49 | query II 50 | select column_name, column_type from (DESCRIBE FROM read_avro('test/userdata1.avro', filename=true)); 51 | ---- 52 | registration_dttm VARCHAR 53 | id BIGINT 54 | first_name VARCHAR 55 | last_name VARCHAR 56 | email VARCHAR 57 | gender VARCHAR 58 | ip_address VARCHAR 59 | cc BIGINT 60 | country VARCHAR 61 | birthdate VARCHAR 62 | salary DOUBLE 63 | title VARCHAR 64 | comments VARCHAR 65 | filename VARCHAR 66 | 67 | 68 | query IIII 69 | FROM read_avro('test/userdata1.avro', filename=true) 
SELECT first_name, cc, salary, filename ORDER BY registration_dttm LIMIT 10; 70 | ---- 71 | Lillian 201713786459078 282503.77 test/userdata1.avro 72 | Chris 5602220700741429 NULL test/userdata1.avro 73 | Nicholas 3575506969751259 192076.79 test/userdata1.avro 74 | Johnny 5602239825516409 169429.76 test/userdata1.avro 75 | Bruce NULL 118244.57 test/userdata1.avro 76 | Heather NULL 164117.18 test/userdata1.avro 77 | Larry 3531208154739438 139177.38 test/userdata1.avro 78 | Roy 3589146577885209 262816.87 test/userdata1.avro 79 | James 3589416270039051 211553.57 test/userdata1.avro 80 | Sean NULL NULL test/userdata1.avro 81 | 82 | 83 | 84 | # with filename wildcard 85 | query III 86 | FROM read_avro('test/userdata*.avro', filename=true) SELECT filename[6:], count(*), max(salary) GROUP BY filename ORDER BY filename; 87 | ---- 88 | userdata1.avro 1000 286592.99 89 | userdata2.avro 998 286587.01 90 | userdata3.avro 1000 286735.82 91 | userdata4.avro 1000 286147.64 92 | userdata5.avro 1000 286384.03 93 | 94 | 95 | query II 96 | select column_name, column_type from (DESCRIBE FROM read_avro('test/users.avro')); 97 | ---- 98 | name VARCHAR 99 | favorite_number INTEGER 100 | favorite_color VARCHAR 101 | 102 | statement error 103 | from read_avro(['test/userdata1.avro', 'test/users.avro']) 104 | ---- 105 | schema mismatch in glob 106 | 107 | # example from readme 108 | query III 109 | FROM read_avro('test/users.avro') 110 | ---- 111 | Alyssa 256 NULL 112 | Ben 7 red 113 | 114 | query II 115 | select column_name, column_type from (DESCRIBE FROM read_avro('test/single-union.avro')); 116 | ---- 117 | single_union INTEGER 118 | 119 | # example from readme 120 | query I 121 | FROM read_avro('test/single-union.avro') 122 | ---- 123 | 42 124 | 125 | query II 126 | select column_name, column_type from (DESCRIBE FROM read_avro('test/null_first.avro')); 127 | ---- 128 | null_first INTEGER 129 | 130 | # example from readme 131 | query I 132 | FROM read_avro('test/null_first.avro') 133 | ---- 134 | 42 135 | NULL 136 | 137 | query II 138 | select column_name, column_type from (DESCRIBE FROM read_avro('test/null_last.avro')); 139 | ---- 140 | null_last INTEGER 141 | 142 | # example from readme 143 | query I 144 | FROM read_avro('test/null_last.avro') 145 | ---- 146 | 42 147 | NULL 148 | 149 | query II 150 | select column_name, column_type from (DESCRIBE FROM read_avro('test/primitive_types.avro')); 151 | ---- 152 | null INTEGER 153 | boolean BOOLEAN 154 | int INTEGER 155 | long BIGINT 156 | float FLOAT 157 | double DOUBLE 158 | bytes BLOB 159 | string VARCHAR 160 | 161 | # example from readme 162 | query IIIIIIII 163 | FROM read_avro('test/primitive_types.avro') 164 | ---- 165 | NULL 0 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157e+308 thisisalongblob\x00withnullbytes 🦆🦆🦆🦆🦆🦆 166 | NULL 1 2147483647 9223372036854775807 3.4028235e+38 1.7976931348623157e+308 \x00\x00\x00a goo 167 | 168 | 169 | 170 | query II 171 | select column_name, column_type from (DESCRIBE FROM read_avro('test/enum.avro')); 172 | ---- 173 | color ENUM('UNKNOWN', 'GREEN', 'RED') 174 | 175 | query I 176 | FROM read_avro('test/enum.avro') 177 | ---- 178 | GREEN 179 | GREEN 180 | RED 181 | UNKNOWN 182 | UNKNOWN 183 | 184 | query II 185 | select column_name, column_type from (DESCRIBE FROM read_avro('test/fixed.avro')); 186 | ---- 187 | md5 BLOB 188 | 189 | query I 190 | FROM read_avro('test/fixed.avro') 191 | ---- 192 | 47336f3f2497b70ac046cf23298e20a7 193 | a789a15a7ff7db4a0d1b186363ef0771 194 | c9db7c67a6acb5a65c78b19e9e01d7b0 
195 | ac441296bcbd44442301204a8f061cf2 196 | 197 | 198 | 199 | query II 200 | select column_name, column_type from (DESCRIBE FROM read_avro('test/string_array.avro')); 201 | ---- 202 | string_arr VARCHAR[] 203 | 204 | query I 205 | FROM read_avro('test/string_array.avro') 206 | ---- 207 | [Hello, World] 208 | [this] 209 | [] 210 | [is, cool, array] 211 | [data] 212 | 213 | 214 | query II 215 | select column_name, column_type from (DESCRIBE FROM read_avro('test/nullable_string_array.avro')); 216 | ---- 217 | string_arr VARCHAR[] 218 | 219 | query I 220 | FROM read_avro('test/nullable_string_array.avro') 221 | ---- 222 | [Hello, World] 223 | [this] 224 | [] 225 | NULL 226 | NULL 227 | [is, cool, array] 228 | [data] 229 | 230 | 231 | query II 232 | select column_name, column_type from (DESCRIBE FROM read_avro('test/nullable_entry_string_array.avro')); 233 | ---- 234 | string_arr VARCHAR[] 235 | 236 | query I 237 | FROM read_avro('test/nullable_entry_string_array.avro') 238 | ---- 239 | [Hello, NULL, World] 240 | [this] 241 | [NULL] 242 | [NULL, NULL, NULL] 243 | [] 244 | [NULL, is, cool, NULL, array, NULL] 245 | [data, NULL] 246 | 247 | 248 | query II 249 | select column_name, column_type from (DESCRIBE FROM read_avro('test/all_nullable_list.avro')); 250 | ---- 251 | string_arr VARCHAR[] 252 | 253 | query I 254 | FROM read_avro('test/all_nullable_list.avro') 255 | ---- 256 | [Hello, NULL, World] 257 | [this] 258 | [NULL] 259 | [NULL, NULL, NULL] 260 | [] 261 | NULL 262 | NULL 263 | [NULL, is, cool, NULL, array, NULL] 264 | [data, NULL] 265 | 266 | 267 | query II 268 | select column_name, column_type from (DESCRIBE FROM read_avro('test/nested_nullable_lists.avro')); 269 | ---- 270 | nested_ints INTEGER[][] 271 | 272 | query I 273 | FROM read_avro('test/nested_nullable_lists.avro') 274 | ---- 275 | NULL 276 | [NULL] 277 | [[NULL], [NULL]] 278 | [NULL, NULL] 279 | [[42]] 280 | [[42], [43]] 281 | [[42, 43]] 282 | [[42, 43], NULL, [44, 45]] 283 | [[42, NULL, 43, NULL], NULL, [44, NULL, 45, NULL], NULL, [46]] 284 | 285 | 286 | query II 287 | select column_name, column_type from (DESCRIBE FROM read_avro('test/long_map.avro')); 288 | ---- 289 | long_map MAP(VARCHAR, BIGINT) 290 | 291 | query I 292 | FROM read_avro('test/long_map.avro') 293 | ---- 294 | {one=42} 295 | {two=43} 296 | {three=44} 297 | 298 | 299 | statement error 300 | from read_avro('does-not-exist.avro') 301 | ---- 302 | No files found that match the pattern 303 | 304 | statement error 305 | from read_avro('CMakeLists.txt') 306 | ---- 307 | Incorrect Avro container file magic number 308 | 309 | 310 | statement error 311 | FROM read_avro('test/recursive.avro') 312 | ---- 313 | Recursive Avro types not supported: LongList 314 | 315 | query II 316 | select column_name, column_type from (DESCRIBE FROM read_avro('test/broken_record.avro')); 317 | ---- 318 | n INTEGER 319 | 320 | query I 321 | FROM read_avro('test/broken_record.avro') 322 | ---- 323 | NULL 324 | NULL 325 | 326 | 327 | 328 | 329 | query II 330 | select column_name, column_type from (DESCRIBE FROM read_avro('test/query_small.avro')); 331 | ---- 332 | avro_schema UNION(u0 STRUCT("data" BLOB), u1 STRUCT(fatal BOOLEAN, "name" VARCHAR, description VARCHAR, "position" BIGINT), u2 STRUCT(bytesScanned BIGINT, totalBytes BIGINT), u3 STRUCT(totalBytes BIGINT)) 333 | 334 | query I 335 | FROM read_avro('test/query_small.avro') 336 | ---- 337 | {'data': 
'100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A'} 338 | {'bytesScanned': 1024, 'totalBytes': 1024} 339 | {'totalBytes': 1024} 340 | 341 | 342 | 343 | 344 | query II 345 | select column_name, column_type from (DESCRIBE FROM read_avro('test/avro.avro')); 346 | ---- 347 | visitor STRUCT(cookie_id VARCHAR, segments STRUCT(id INTEGER, expiration BIGINT)[], edges MAP(VARCHAR, BIGINT), behaviors MAP(VARCHAR, MAP(VARCHAR, INTEGER)), birthdate BIGINT, association_ids MAP(VARCHAR, VARCHAR)) 348 | events STRUCT(cookie_id VARCHAR, tstamp BIGINT, edge VARCHAR, changes UNION(u0 STRUCT(daystamp VARCHAR, context VARCHAR, "type" ENUM('ADX', 'RETARGET'), count INTEGER), u1 STRUCT(operation ENUM('ADD', 'REPLACE', 'UPDATE', 'REMOVE'), association_id VARCHAR, network VARCHAR, segments INTEGER[])))[] 349 | 350 | 351 | query II 352 | FROM read_avro('test/avro.avro') 353 | ---- 354 | {'cookie_id': 133263e9e100000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 133263e9e100000, 'tstamp': 1403721385042, 'edge': batchimport, 'changes': {'operation': REMOVE, 'association_id': NULL, 'network': et, 'segments': [49118]}}, {'cookie_id': 133263e9e100000, 'tstamp': 1403721385042, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49117]}}] 355 | {'cookie_id': 134adb391b00000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 134adb391b00000, 'tstamp': 1403721376988, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 356 | {'cookie_id': 1317beb84b00000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 1317beb84b00000, 'tstamp': 1403721380452, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 357 | {'cookie_id': 12b811f59080000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 12b811f59080000, 'tstamp': 1403721375367, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 358 | {'cookie_id': 134338dcf180000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 
'association_ids': {}} [{'cookie_id': 134338dcf180000, 'tstamp': 1403721380483, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 359 | {'cookie_id': 12aa3637e280000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 12aa3637e280000, 'tstamp': 1403721383922, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 360 | {'cookie_id': 133bc9432a80000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 133bc9432a80000, 'tstamp': 1403721385810, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 361 | {'cookie_id': 134dbec12c80000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 134dbec12c80000, 'tstamp': 1403721376778, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49117]}}] 362 | {'cookie_id': 12bd0b8c6201000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 12bd0b8c6201000, 'tstamp': 1403721384549, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 363 | {'cookie_id': 13114ef2b401000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 13114ef2b401000, 'tstamp': 1403721375994, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49117]}}] 364 | 365 | 366 | 367 | 368 | query II 369 | select column_name, column_type from (DESCRIBE FROM read_avro('test/part-r-00000.avro')); 370 | ---- 371 | string VARCHAR 372 | simple_map MAP(VARCHAR, INTEGER) 373 | complex_map MAP(VARCHAR, MAP(VARCHAR, VARCHAR)) 374 | union_string_null VARCHAR 375 | union_int_long_null UNION(u0 INTEGER, u1 BIGINT) 376 | union_float_double UNION(u0 FLOAT, u1 DOUBLE) 377 | fixed3 BLOB 378 | fixed2 BLOB 379 | enum ENUM('SPADES', 'HEARTS', 'DIAMONDS', 'CLUBS') 380 | record STRUCT(value_field VARCHAR) 381 | array_of_boolean BOOLEAN[] 382 | bytes BLOB 383 | 384 | statement ok 385 | FROM read_avro('test/part-r-00000.avro') 386 | 387 | # iceberg yay 388 | query II 389 | select column_name, column_type from (DESCRIBE FROM read_avro('test/4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro')); 390 | ---- 391 | status INTEGER 392 | snapshot_id BIGINT 393 | data_file STRUCT(file_path VARCHAR, file_format VARCHAR, "partition" INTEGER, record_count BIGINT, file_size_in_bytes BIGINT, block_size_in_bytes BIGINT, file_ordinal INTEGER, sort_columns INTEGER[], column_sizes STRUCT("key" INTEGER, "value" BIGINT)[], value_counts STRUCT("key" INTEGER, "value" BIGINT)[], null_value_counts STRUCT("key" INTEGER, "value" BIGINT)[], lower_bounds STRUCT("key" INTEGER, "value" BLOB)[], upper_bounds STRUCT("key" INTEGER, "value" BLOB)[], key_metadata BLOB, split_offsets BIGINT[]) 394 | 395 | # usual suspect, the userdata1 file, this time with a wildcard 396 | query II 397 | select column_name, column_type from (DESCRIBE FROM read_avro('test/userdata1*.avro')); 398 | ---- 399 | registration_dttm VARCHAR 400 | id BIGINT 401 | first_name VARCHAR 402 | last_name VARCHAR 403 | email VARCHAR 404 | gender VARCHAR 405 | ip_address VARCHAR 406 | cc BIGINT 407 | country VARCHAR 408 | birthdate VARCHAR 409 | salary DOUBLE 410 | title VARCHAR 411 | comments VARCHAR 412 | 413 | 414 | # 
union by name
415 | statement error
416 | FROM read_avro('test/union-name-*.avro', filename=true, union_by_name=true) order by one;
417 | ----
418 | Not implemented Error: 'union_by_name' not implemented for Avro reader yet
419 | 
420 | 
421 | # TODO: add test where schemas can't be combined
422 | 
423 | # files with different schemas that can be safely combined are okay
424 | query III
425 | select * exclude filename FROM read_avro('test/union-name-*.avro', filename=true) order by all;
426 | ----
427 | 10 2.0 s30
428 | 11 2.1 s31
429 | 12 2.2 s32
430 | 13 2.3 s33
431 | 14 2.4 s34
432 | 15 2.5 s35
433 | 
434 | 
435 | 
436 | query II
437 | select column_name, column_type from (DESCRIBE FROM read_avro('test/reuse-1.avro'));
438 | ----
439 | request_id VARCHAR
440 | client_version STRUCT(major INTEGER, minor INTEGER)
441 | server_version STRUCT(major INTEGER, minor INTEGER)
442 | 
443 | query III
444 | FROM read_avro('test/reuse-1.avro')
445 | ----
446 | hello {'major': 4, 'minor': 2} {'major': 8, 'minor': 5}
447 | world {'major': 5, 'minor': 3} {'major': 9, 'minor': 6}
448 | 
449 | 
450 | 
451 | query II
452 | select column_name, column_type from (DESCRIBE FROM read_avro('test/reuse-2.avro'));
453 | ----
454 | version STRUCT(major INTEGER, minor INTEGER)
455 | details STRUCT(release_version STRUCT(major INTEGER, minor INTEGER))
456 | 
457 | query II
458 | FROM read_avro('test/reuse-2.avro')
459 | ----
460 | {'major': 4, 'minor': 2} {'release_version': {'major': 8, 'minor': 5}}
461 | {'major': 5, 'minor': 3} {'release_version': {'major': 9, 'minor': 6}}
462 | 
463 | 
464 | query II
465 | select column_name, column_type from (DESCRIBE FROM read_avro('test/union.avro'));
466 | ----
467 | event UNION(u0 STRUCT(id VARCHAR, "timestamp" BIGINT, "data" VARCHAR), u1 STRUCT(id VARCHAR, "timestamp" BIGINT, updatedData VARCHAR), u2 STRUCT(id VARCHAR, "timestamp" BIGINT))
468 | 
469 | query I
470 | FROM read_avro('test/union.avro')
471 | ----
472 | {'id': 1, 'timestamp': 1704367260, 'data': New record created}
473 | {'id': 1, 'timestamp': 1704367360, 'updatedData': Record updated}
474 | {'id': 1, 'timestamp': 1704367460}
475 | 
476 | 
--------------------------------------------------------------------------------
/test/sql/bigdata.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/bigdata.test
2 | # description: test avro extension on a larger file
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | 
8 | query I
9 | FROM read_avro('test/bigdata.avro') SELECT count(*)
10 | ----
11 | 1000000
12 | 
13 | 
14 | query IIIIIIIIII
15 | FROM read_avro('test/bigdata.avro') ORDER BY c1 LIMIT 10
16 | ----
17 | 0 10000000 20000000 30000000 40000000 50000000 60000000 70000000 80000000 90000000
18 | 1 10000001 20000001 30000001 40000001 50000001 60000001 70000001 80000001 90000001
19 | 2 10000002 20000002 30000002 40000002 50000002 60000002 70000002 80000002 90000002
20 | 3 10000003 20000003 30000003 40000003 50000003 60000003 70000003 80000003 90000003
21 | 4 10000004 20000004 30000004 40000004 50000004 60000004 70000004 80000004 90000004
22 | 5 10000005 20000005 30000005 40000005 50000005 60000005 70000005 80000005 90000005
23 | 6 10000006 20000006 30000006 40000006 50000006 60000006 70000006 80000006 90000006
24 | 7 10000007 20000007 30000007 40000007 50000007 60000007 70000007 80000007 90000007
25 | 8 10000008 20000008 30000008 40000008 50000008 60000008 70000008 80000008 90000008
26 | 9 10000009 20000009 30000009 40000009 50000009 60000009 70000009 80000009 90000009
27 | 
28 | 
29 | query IIIIIIIIII
30 | FROM read_avro('test/bigdata.avro') SELECT SUM(columns(*))
31 | ----
32 | 499999500000 10499999500000 20499999500000 30499999500000 40499999500000 50499999500000 60499999500000 70499999500000 80499999500000 90499999500000
33 | 
--------------------------------------------------------------------------------
/test/sql/external_file_cache.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/external_file_cache.test
2 | # description: test avro extension and external file cache
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | query IIII
8 | from duckdb_external_file_cache();
9 | ----
10 | 
11 | query I
12 | FROM read_avro('test/bigdata.avro') SELECT count(*)
13 | ----
14 | 1000000
15 | 
16 | query IIIIIIIIII
17 | FROM read_avro('test/bigdata.avro') ORDER BY c1 LIMIT 10
18 | ----
19 | 0 10000000 20000000 30000000 40000000 50000000 60000000 70000000 80000000 90000000
20 | 1 10000001 20000001 30000001 40000001 50000001 60000001 70000001 80000001 90000001
21 | 2 10000002 20000002 30000002 40000002 50000002 60000002 70000002 80000002 90000002
22 | 3 10000003 20000003 30000003 40000003 50000003 60000003 70000003 80000003 90000003
23 | 4 10000004 20000004 30000004 40000004 50000004 60000004 70000004 80000004 90000004
24 | 5 10000005 20000005 30000005 40000005 50000005 60000005 70000005 80000005 90000005
25 | 6 10000006 20000006 30000006 40000006 50000006 60000006 70000006 80000006 90000006
26 | 7 10000007 20000007 30000007 40000007 50000007 60000007 70000007 80000007 90000007
27 | 8 10000008 20000008 30000008 40000008 50000008 60000008 70000008 80000008 90000008
28 | 9 10000009 20000009 30000009 40000009 50000009 60000009 70000009 80000009 90000009
29 | 
30 | 
31 | statement ok
32 | from read_avro('test/userdata1.avro');
33 | 
34 | query IIII
35 | from duckdb_external_file_cache() order by path;
36 | ----
37 | test/bigdata.avro 17647257 0 true
38 | test/userdata1.avro 93561 0 true
--------------------------------------------------------------------------------
/test/sql/iceberg.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/iceberg.test
2 | # description: test avro extension on iceberg metadata files
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | statement ok
8 | PRAGMA enable_verification
9 | 
10 | query II
11 | select column_name, column_type from (DESCRIBE FROM read_avro('test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro'));
12 | ----
13 | status INTEGER
14 | snapshot_id BIGINT
15 | sequence_number BIGINT
16 | data_file STRUCT("content" INTEGER, file_path VARCHAR, file_format VARCHAR, "partition" INTEGER, record_count BIGINT, file_size_in_bytes BIGINT, column_sizes STRUCT("key" INTEGER, "value" BIGINT)[], value_counts STRUCT("key" INTEGER, "value" BIGINT)[], null_value_counts STRUCT("key" INTEGER, "value" BIGINT)[], nan_value_counts STRUCT("key" INTEGER, "value" BIGINT)[], lower_bounds STRUCT("key" INTEGER, "value" BLOB)[], upper_bounds STRUCT("key" INTEGER, "value" BLOB)[], key_metadata BLOB, split_offsets BIGINT[], equality_ids INTEGER[], sort_order_id INTEGER)
17 | 
18 | query IIIIIIIII
19 | FROM (FROM read_avro('test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro') SELECT status, snapshot_id, sequence_number, data_file.*) SELECT status, snapshot_id, sequence_number, content, file_path, file_format, partition, record_count, file_size_in_bytes
20 | ----
21 | 2 7635660646343998149 NULL 0 lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET NULL 60175 1390176
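
# editorial addition (hedged): the nested data_file fields can also be
# projected directly with dot notation; this only asserts the query runs,
# it does not check its output
statement ok
FROM read_avro('test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro')
SELECT data_file.file_path, data_file.record_count;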
22 | 
23 | 
24 | 
25 | query II
26 | select column_name, column_type from (DESCRIBE FROM read_avro('test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro'));
27 | ----
28 | manifest_path VARCHAR
29 | manifest_length BIGINT
30 | partition_spec_id INTEGER
31 | content INTEGER
32 | sequence_number BIGINT
33 | min_sequence_number BIGINT
34 | added_snapshot_id BIGINT
35 | added_data_files_count INTEGER
36 | existing_data_files_count INTEGER
37 | deleted_data_files_count INTEGER
38 | added_rows_count BIGINT
39 | existing_rows_count BIGINT
40 | deleted_rows_count BIGINT
41 | partitions STRUCT(contains_null BOOLEAN, contains_nan BOOLEAN, lower_bound BLOB, upper_bound BLOB)[]
42 | 
43 | 
44 | 
45 | query IIIIIIIIIIIIII
46 | FROM read_avro('test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro') ORDER BY manifest_path
47 | ----
48 | lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 7687 0 0 2 2 7635660646343998149 0 0 1 0 0 60175 []
49 | lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 7692 0 0 2 2 7635660646343998149 1 0 0 51793 0 0 []
50 | 
51 | 
52 | 
--------------------------------------------------------------------------------
/test/sql/test_missing_file.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/test_missing_file.test
2 | # description: test avro extension with a missing input file
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | statement error
8 | from read_avro('not_exists');
9 | ----
10 | IO Error: No files found that match the pattern "not_exists"
--------------------------------------------------------------------------------
/test/string_array.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/string_array.avro
--------------------------------------------------------------------------------
/test/union-name-1.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union-name-1.avro
--------------------------------------------------------------------------------
/test/union-name-2.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union-name-2.avro
--------------------------------------------------------------------------------
/test/union-name-3.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union-name-3.avro
--------------------------------------------------------------------------------
/test/union.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union.avro
--------------------------------------------------------------------------------
/test/userdata1.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata1.avro
--------------------------------------------------------------------------------
/test/userdata2.avro:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata2.avro -------------------------------------------------------------------------------- /test/userdata3.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata3.avro -------------------------------------------------------------------------------- /test/userdata4.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata4.avro -------------------------------------------------------------------------------- /test/userdata5.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata5.avro -------------------------------------------------------------------------------- /test/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/users.avro -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "avro-c" 4 | ], 5 | "vcpkg-configuration": { 6 | "overlay-ports": [ 7 | "./vcpkg_ports" 8 | ], 9 | "registries": [ 10 | { 11 | "kind": "git", 12 | "repository": "https://github.com/duckdb/vcpkg-duckdb-ports", 13 | "baseline": "0f9bf648ba1ee29291890a1ca9a49a80bba017eb", 14 | "packages": [ 15 | "vcpkg-cmake", 16 | "avro-c" 17 | ] 18 | } 19 | ] 20 | }, 21 | "builtin-baseline": "5e5d0e1cd7785623065e77eff011afdeec1a3574" 22 | } -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/build-tools.patch: -------------------------------------------------------------------------------- 1 | --- a/CMakeLists.txt 2 | +++ b/CMakeLists.txt 3 | @@ -1484,7 +1484,7 @@ function(my_install_man COMPONENT SRC_FILE LINK_NAMES) 4 | endif() 5 | endfunction() 6 | 7 | - 8 | +if(BUILD_TOOLS) 9 | ############################################################################# 10 | # libgnu (getopt_long) 11 | ############################################################################# 12 | @@ -1982,6 +1982,7 @@ if(UNIX) 13 | my_install_man(scripts_Documentation src/scripts/xzless.1 "${XZLESS_LINKS}") 14 | endif() 15 | 16 | +endif() 17 | 18 | ############################################################################# 19 | # Documentation 20 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO tukaani-project/xz 4 | REF "v${VERSION}" 5 | SHA512 0f814f4282c87cb74a8383199c1e55ec1bf49519daaf07f7b376cb644770b75cc9257c809b661405fcfd6cda28c54d799c67eb9e169665c35b1b87529468085e 6 | HEAD_REF master 7 | PATCHES 8 | build-tools.patch 9 | ) 10 | 11 | vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS 12 | FEATURES 13 | tools BUILD_TOOLS 14 | ) 15 | 16 | if(VCPKG_TARGET_ARCHITECTURE STREQUAL "wasm32") 17 | 
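# editorial note (assumption): when cross-compiling to wasm32 the usual
# byte-order probe cannot run, so little-endian is pinned explicitly, and
# position-independent code is forced since the static library is later
# linked into a dynamically loaded extension module: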
set(WASM_OPTIONS -DCMAKE_C_BYTE_ORDER=LITTLE_ENDIAN -DCMAKE_CXX_BYTE_ORDER=LITTLE_ENDIAN -DCMAKE_POSITION_INDEPENDENT_CODE=ON) 18 | endif() 19 | 20 | vcpkg_cmake_configure( 21 | SOURCE_PATH "${SOURCE_PATH}" 22 | OPTIONS 23 | ${FEATURE_OPTIONS} 24 | ${WASM_OPTIONS} 25 | -DBUILD_TESTING=OFF 26 | -DCREATE_XZ_SYMLINKS=OFF 27 | -DCREATE_LZMA_SYMLINKS=OFF 28 | -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT= # using flags from (vcpkg) toolchain 29 | -DENABLE_NLS=OFF # nls is not supported by this port, yet 30 | MAYBE_UNUSED_VARIABLES 31 | CMAKE_MSVC_DEBUG_INFORMATION_FORMAT 32 | CREATE_XZ_SYMLINKS 33 | CREATE_LZMA_SYMLINKS 34 | ENABLE_NLS 35 | ) 36 | vcpkg_cmake_install() 37 | vcpkg_copy_pdbs() 38 | 39 | set(exec_prefix "\${prefix}") 40 | set(libdir "\${prefix}/lib") 41 | set(includedir "\${prefix}/include") 42 | set(PACKAGE_URL https://tukaani.org/xz/) 43 | set(PACKAGE_VERSION "${VERSION}") 44 | if(NOT VCPKG_TARGET_IS_WINDOWS) 45 | set(PTHREAD_CFLAGS -pthread) 46 | endif() 47 | set(prefix "${CURRENT_INSTALLED_DIR}") 48 | configure_file("${SOURCE_PATH}/src/liblzma/liblzma.pc.in" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/liblzma.pc" @ONLY) 49 | if (NOT VCPKG_BUILD_TYPE) 50 | set(prefix "${CURRENT_INSTALLED_DIR}/debug") 51 | configure_file("${SOURCE_PATH}/src/liblzma/liblzma.pc.in" "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/liblzma.pc" @ONLY) 52 | endif() 53 | vcpkg_fixup_pkgconfig() 54 | 55 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/liblzma) 56 | 57 | if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") 58 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/lzma.h" "defined(LZMA_API_STATIC)" "1") 59 | else() 60 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/lzma.h" "defined(LZMA_API_STATIC)" "0") 61 | endif() 62 | 63 | file(REMOVE_RECURSE 64 | "${CURRENT_PACKAGES_DIR}/debug/include" 65 | "${CURRENT_PACKAGES_DIR}/debug/share" 66 | "${CURRENT_PACKAGES_DIR}/share/man" 67 | ) 68 | 69 | set(TOOLS xz xzdec lzmadec lzmainfo) 70 | foreach(_tool IN LISTS TOOLS) 71 | if(NOT EXISTS "${CURRENT_PACKAGES_DIR}/bin/${_tool}${VCPKG_TARGET_EXECUTABLE_SUFFIX}") 72 | list(REMOVE_ITEM TOOLS ${_tool}) 73 | endif() 74 | endforeach() 75 | if(TOOLS) 76 | vcpkg_copy_tools(TOOL_NAMES ${TOOLS} AUTO_CLEAN) 77 | endif() 78 | 79 | if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") 80 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/bin" "${CURRENT_PACKAGES_DIR}/debug/bin") 81 | endif() 82 | 83 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/vcpkg-cmake-wrapper.cmake" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 84 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 85 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/COPYING") 86 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/usage: -------------------------------------------------------------------------------- 1 | liblzma is compatible with built-in CMake targets: 2 | 3 | find_package(LibLZMA REQUIRED) 4 | target_link_libraries(main PRIVATE LibLZMA::LibLZMA) 5 | 6 | liblzma provides CMake targets: 7 | 8 | find_package(liblzma CONFIG REQUIRED) 9 | target_link_libraries(main PRIVATE liblzma::liblzma) 10 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/vcpkg-cmake-wrapper.cmake: -------------------------------------------------------------------------------- 1 | cmake_policy(PUSH) 2 | cmake_policy(SET CMP0012 NEW) 3 | cmake_policy(SET CMP0057 NEW) 4 | set(z_vcpkg_liblzma_fixup_needed 0) 5 | if(NOT "CONFIG" IN_LIST ARGS AND NOT 
"NO_MODULE" IN_LIST ARGS AND NOT CMAKE_DISABLE_FIND_PACKAGE_LibLZMA) 6 | get_filename_component(z_vcpkg_liblzma_prefix "${CMAKE_CURRENT_LIST_DIR}" DIRECTORY) 7 | get_filename_component(z_vcpkg_liblzma_prefix "${z_vcpkg_liblzma_prefix}" DIRECTORY) 8 | find_path(LIBLZMA_INCLUDE_DIR NAMES lzma.h PATHS "${z_vcpkg_liblzma_prefix}/include" NO_DEFAULT_PATH) 9 | # liblzma doesn't use a debug postfix, but FindLibLZMA.cmake expects it 10 | find_library(LIBLZMA_LIBRARY_RELEASE NAMES lzma PATHS "${z_vcpkg_liblzma_prefix}/lib" NO_DEFAULT_PATH) 11 | find_library(LIBLZMA_LIBRARY_DEBUG NAMES lzma PATHS "${z_vcpkg_liblzma_prefix}/debug/lib" NO_DEFAULT_PATH) 12 | unset(z_vcpkg_liblzma_prefix) 13 | if(CMAKE_VERSION VERSION_LESS 3.16) 14 | # Older versions of FindLibLZMA.cmake need a single lib in LIBLZMA_LIBRARY. 15 | set(z_vcpkg_liblzma_fixup_needed 1) 16 | set(LIBLZMA_LIBRARY "${LIBLZMA_LIBRARY_RELEASE}" CACHE INTERNAL "") 17 | elseif(NOT TARGET LibLZMA::LibLZMA) 18 | set(z_vcpkg_liblzma_fixup_needed 1) 19 | endif() 20 | # Known values, and required. Skip expensive tests. 21 | set(LIBLZMA_HAS_AUTO_DECODER 1 CACHE INTERNAL "") 22 | set(LIBLZMA_HAS_EASY_ENCODER 1 CACHE INTERNAL "") 23 | set(LIBLZMA_HAS_LZMA_PRESET 1 CACHE INTERNAL "") 24 | endif() 25 | 26 | _find_package(${ARGS}) 27 | 28 | if(z_vcpkg_liblzma_fixup_needed) 29 | include(SelectLibraryConfigurations) 30 | select_library_configurations(LIBLZMA) 31 | if(NOT TARGET LibLZMA::LibLZMA) 32 | # Backfill LibLZMA::LibLZMA to versions of cmake before 3.14 33 | add_library(LibLZMA::LibLZMA UNKNOWN IMPORTED) 34 | if(DEFINED LIBLZMA_INCLUDE_DIRS) 35 | set_target_properties(LibLZMA::LibLZMA PROPERTIES 36 | INTERFACE_INCLUDE_DIRECTORIES "${LIBLZMA_INCLUDE_DIRS}") 37 | endif() 38 | set_property(TARGET LibLZMA::LibLZMA APPEND PROPERTY 39 | IMPORTED_CONFIGURATIONS RELEASE) 40 | set_target_properties(LibLZMA::LibLZMA PROPERTIES 41 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C" 42 | IMPORTED_LOCATION_RELEASE "${LIBLZMA_LIBRARY_RELEASE}") 43 | if(EXISTS "${LIBLZMA_LIBRARY}") 44 | set_target_properties(LibLZMA::LibLZMA PROPERTIES 45 | IMPORTED_LINK_INTERFACE_LANGUAGES "C" 46 | IMPORTED_LOCATION "${LIBLZMA_LIBRARY}") 47 | endif() 48 | endif() 49 | if(LIBLZMA_LIBRARY_DEBUG) 50 | # Backfill debug variant to versions of cmake before 3.16 51 | set_property(TARGET LibLZMA::LibLZMA APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) 52 | set_target_properties(LibLZMA::LibLZMA PROPERTIES IMPORTED_LOCATION_DEBUG "${LIBLZMA_LIBRARY_DEBUG}") 53 | endif() 54 | endif() 55 | if(LIBLZMA_LIBRARIES AND NOT "Threads::Threads" IN_LIST LIBLZMA_LIBRARIES AND NOT EMSCRIPTEN) 56 | set(THREADS_PREFER_PTHREAD_FLAG TRUE) 57 | find_package(Threads) 58 | list(APPEND LIBLZMA_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) 59 | if(TARGET LibLZMA::LibLZMA) 60 | set_property(TARGET LibLZMA::LibLZMA APPEND PROPERTY INTERFACE_LINK_LIBRARIES Threads::Threads) 61 | endif() 62 | endif() 63 | unset(z_vcpkg_liblzma_fixup_needed) 64 | cmake_policy(POP) 65 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "liblzma", 3 | "version": "5.6.3", 4 | "description": "Compression library with an API similar to that of zlib.", 5 | "homepage": "https://tukaani.org/xz/", 6 | "license": null, 7 | "dependencies": [ 8 | { 9 | "name": "vcpkg-cmake" 10 | }, 11 | { 12 | "name": "vcpkg-cmake-config" 13 | } 14 | ], 15 | "features": { 16 | "tools": { 17 | "description": "Build 
tools" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/fix_clang-cl_build.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | index 672561e62..b6930b834 100644 3 | --- a/CMakeLists.txt 4 | +++ b/CMakeLists.txt 5 | @@ -38,7 +38,7 @@ if(NOT CMAKE_CXX_STANDARD) 6 | endif(NOT CMAKE_CXX_STANDARD) 7 | 8 | # https://github.com/izenecloud/cmake/blob/master/SetCompilerWarningAll.cmake 9 | -if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 10 | +if(MSVC) 11 | # Use the highest warning level for Visual Studio. 12 | set(CMAKE_CXX_WARNING_LEVEL 4) 13 | if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") 14 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/no-werror.patch: -------------------------------------------------------------------------------- 1 | --- a/CMakeLists.txt 2 | +++ b/CMakeLists.txt 3 | @@ -68,7 +68,7 @@ 4 | 5 | # Use -Werror for clang only. 6 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 7 | - if(NOT CMAKE_CXX_FLAGS MATCHES "-Werror") 8 | + if(0) 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") 10 | endif(NOT CMAKE_CXX_FLAGS MATCHES "-Werror") 11 | endif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 12 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/pkgconfig.diff: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | index c3062e2..05477e9 100644 3 | --- a/CMakeLists.txt 4 | +++ b/CMakeLists.txt 5 | @@ -417,4 +417,18 @@ if(SNAPPY_INSTALL) 6 | "${PROJECT_BINARY_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake" 7 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" 8 | ) 9 | + 10 | + cmake_policy(SET CMP0057 NEW) 11 | + set(LIBS_PRIVATE "") 12 | + foreach(lib IN LISTS CMAKE_CXX_IMPLICIT_LINK_LIBRARIES) 13 | + if(lib IN_LIST CMAKE_C_IMPLICIT_LINK_LIBRARIES) 14 | + continue() 15 | + elseif(EXISTS "${lib}") 16 | + string(APPEND LIBS_PRIVATE " ${CMAKE_LINK_LIBRARY_FILE_FLAG}${lib}") 17 | + else() 18 | + string(APPEND LIBS_PRIVATE " ${CMAKE_LINK_LIBRARY_FLAG}${lib}") 19 | + endif() 20 | + endforeach() 21 | + configure_file(snappy.pc.in "${CMAKE_CURRENT_BINARY_DIR}/snappy.pc" @ONLY) 22 | + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/snappy.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") 23 | endif(SNAPPY_INSTALL) 24 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO google/snappy 4 | REF ${VERSION} 5 | SHA512 e7290d79ddd45605aafd02cba9eaa32309c94af04f137552a97a915c391f185dccab9b7b21a01b28f3f446be420232c3c22d91c06e0be6e1e2e32d645174798c 6 | HEAD_REF master 7 | PATCHES 8 | fix_clang-cl_build.patch 9 | no-werror.patch 10 | pkgconfig.diff 11 | ) 12 | file(COPY "${CURRENT_PORT_DIR}/snappy.pc.in" DESTINATION "${SOURCE_PATH}") 13 | 14 | if(VCPKG_TARGET_ARCHITECTURE STREQUAL "wasm32") 15 | set(WASM_OPTIONS -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_FLAGS=-fPIC -DCMAKE_C_FLAGS=-fPIC) 16 | endif() 17 | 18 | vcpkg_cmake_configure( 19 | SOURCE_PATH "${SOURCE_PATH}" 20 | OPTIONS 21 | -DSNAPPY_BUILD_TESTS=OFF 22 | -DSNAPPY_BUILD_BENCHMARKS=OFF 23 | ${WASM_OPTIONS} 24 | 25 | # These variables can be overriden in a custom triplet, see usage file 26 | 
-DSNAPPY_HAVE_SSSE3=OFF 27 | -DSNAPPY_HAVE_X86_CRC32=OFF 28 | -DSNAPPY_HAVE_NEON_CRC32=OFF 29 | -DSNAPPY_HAVE_BMI2=OFF 30 | -DSNAPPY_HAVE_NEON=OFF 31 | ) 32 | 33 | vcpkg_cmake_install() 34 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/Snappy) 35 | vcpkg_copy_pdbs() 36 | vcpkg_fixup_pkgconfig() 37 | 38 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") 39 | 40 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 41 | 42 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/COPYING") 43 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/snappy.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${pcfiledir}/../.. 2 | exec_prefix=${prefix} 3 | libdir=${prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: snappy 7 | Description: A fast compressor/decompressor. 8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -lsnappy 10 | Cflags: -I${includedir} 11 | Libs.private: @LIBS_PRIVATE@ -------------------------------------------------------------------------------- /vcpkg_ports/snappy/usage: -------------------------------------------------------------------------------- 1 | snappy provides CMake targets: 2 | 3 | find_package(Snappy CONFIG REQUIRED) 4 | target_link_libraries(main PRIVATE Snappy::snappy) 5 | 6 | Optimizations based on hardware support are disabled by default. 7 | You can enable them by adding corresponding flags to VCPKG_CMAKE_CONFIGURE_OPTIONS inside a custom triplet file, for example: 8 | 9 | if("${PORT}" STREQUAL "snappy") 10 | list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DSNAPPY_HAVE_SSSE3=ON -DSNAPPY_HAVE_BMI2=ON) 11 | endif() 12 | 13 | For a full list of possible options, see project's root CMakeLists.txt. 
14 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "snappy", 3 | "version": "1.2.1", 4 | "description": "A fast compressor/decompressor.", 5 | "homepage": "https://github.com/google/snappy", 6 | "license": null, 7 | "dependencies": [ 8 | { 9 | "name": "vcpkg-cmake" 10 | }, 11 | { 12 | "name": "vcpkg-cmake-config" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/0001-Prevent-invalid-inclusions-when-HAVE_-is-set-to-0.patch: -------------------------------------------------------------------------------- 1 | diff --git a/zconf.h.cmakein b/zconf.h.cmakein 2 | index a7f24cc..a1b359b 100644 3 | --- a/zconf.h.cmakein 4 | +++ b/zconf.h.cmakein 5 | @@ -434,11 +434,19 @@ typedef uLong FAR uLongf; 6 | #endif 7 | 8 | #ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */ 9 | -# define Z_HAVE_UNISTD_H 10 | +# if ~(~HAVE_UNISTD_H + 0) == 0 && ~(~HAVE_UNISTD_H + 1) == 1 11 | +# define Z_HAVE_UNISTD_H 12 | +# elif HAVE_UNISTD_H != 0 13 | +# define Z_HAVE_UNISTD_H 14 | +# endif 15 | #endif 16 | 17 | #ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */ 18 | -# define Z_HAVE_STDARG_H 19 | +# if ~(~HAVE_STDARG_H + 0) == 0 && ~(~HAVE_STDARG_H + 1) == 1 20 | +# define Z_HAVE_STDARG_H 21 | +# elif HAVE_STDARG_H != 0 22 | +# define Z_HAVE_STDARG_H 23 | +# endif 24 | #endif 25 | 26 | #ifdef STDC 27 | diff --git a/zconf.h.in b/zconf.h.in 28 | index 5e1d68a..32f53c8 100644 29 | --- a/zconf.h.in 30 | +++ b/zconf.h.in 31 | @@ -432,11 +432,19 @@ typedef uLong FAR uLongf; 32 | #endif 33 | 34 | #ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */ 35 | -# define Z_HAVE_UNISTD_H 36 | +# if ~(~HAVE_UNISTD_H + 0) == 0 && ~(~HAVE_UNISTD_H + 1) == 1 37 | +# define Z_HAVE_UNISTD_H 38 | +# elif HAVE_UNISTD_H != 0 39 | +# define Z_HAVE_UNISTD_H 40 | +# endif 41 | #endif 42 | 43 | #ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */ 44 | -# define Z_HAVE_STDARG_H 45 | +# if ~(~HAVE_STDARG_H + 0) == 0 && ~(~HAVE_STDARG_H + 1) == 1 46 | +# define Z_HAVE_STDARG_H 47 | +# elif HAVE_STDARG_H != 0 48 | +# define Z_HAVE_STDARG_H 49 | +# endif 50 | #endif 51 | 52 | #ifdef STDC 53 | 54 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/0002-build-static-or-shared-not-both.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | --- a/CMakeLists.txt 3 | +++ b/CMakeLists.txt 4 | @@ -123,9 +123,11 @@ set(ZLIB_SRCS 5 | ) 6 | 7 | if(NOT MINGW) 8 | + if(BUILD_SHARED_LIBS) 9 | set(ZLIB_DLL_SRCS 10 | win32/zlib1.rc # If present will override custom build rule below. 
11 | ) 12 | + endif() 13 | endif() 14 | 15 | # parse the full version number from zlib.h and include in ZLIB_FULL_VERSION 16 | @@ -146,15 +148,17 @@ if(MINGW) 17 | -I ${CMAKE_CURRENT_BINARY_DIR} 18 | -o ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj 19 | -i ${CMAKE_CURRENT_SOURCE_DIR}/win32/zlib1.rc) 20 | + if(BUILD_SHARED_LIBS) 21 | set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) 22 | + endif() 23 | endif(MINGW) 24 | 25 | -add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 26 | +add_library(zlib ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 27 | target_include_directories(zlib PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) 28 | -add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 29 | -target_include_directories(zlibstatic PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) 30 | +if (BUILD_SHARED_LIBS) 31 | set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) 32 | set_target_properties(zlib PROPERTIES SOVERSION 1) 33 | +endif() 34 | 35 | if(NOT CYGWIN) 36 | # This property causes shared libraries on Linux to have the full version 37 | @@ -169,7 +173,7 @@ endif() 38 | 39 | if(UNIX) 40 | # On unix-like platforms the library is almost always called libz 41 | - set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) 42 | + set_target_properties(zlib PROPERTIES OUTPUT_NAME z) 43 | if(NOT APPLE AND NOT(CMAKE_SYSTEM_NAME STREQUAL AIX)) 44 | set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") 45 | endif() 46 | @@ -179,7 +183,7 @@ elseif(BUILD_SHARED_LIBS AND WIN32) 47 | endif() 48 | 49 | if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL ) 50 | - install(TARGETS zlib zlibstatic 51 | + install(TARGETS zlib 52 | RUNTIME DESTINATION "${INSTALL_BIN_DIR}" 53 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" 54 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ) 55 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/0003-android-and-mingw-fixes.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | --- a/CMakeLists.txt 3 | +++ b/CMakeLists.txt 4 | @@ -58,7 +58,7 @@ endif() 5 | # 6 | check_include_file(unistd.h Z_HAVE_UNISTD_H) 7 | 8 | -if(MSVC) 9 | +if(WIN32) 10 | set(CMAKE_DEBUG_POSTFIX "d") 11 | add_definitions(-D_CRT_SECURE_NO_DEPRECATE) 12 | add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) 13 | @@ -135,7 +135,7 @@ file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h _zlib_h_contents) 14 | string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" 15 | "\\1" ZLIB_FULL_VERSION ${_zlib_h_contents}) 16 | 17 | -if(MINGW) 18 | +if(MINGW AND NOT ANDROID) 19 | # This gets us DLL resource information when compiling on MinGW. 
20 | if(NOT CMAKE_RC_COMPILER) 21 | set(CMAKE_RC_COMPILER windres.exe) 22 | @@ -151,7 +151,7 @@ if(MINGW) 23 | if(BUILD_SHARED_LIBS) 24 | set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) 25 | endif() 26 | -endif(MINGW) 27 | +endif(MINGW AND NOT ANDROID) 28 | 29 | add_library(zlib ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 30 | target_include_directories(zlib PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) 31 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/portfile.cmake: -------------------------------------------------------------------------------- 1 | # When this port is updated, the minizip port should be updated at the same time 2 | vcpkg_from_github( 3 | OUT_SOURCE_PATH SOURCE_PATH 4 | REPO madler/zlib 5 | REF v${VERSION} 6 | SHA512 8c9642495bafd6fad4ab9fb67f09b268c69ff9af0f4f20cf15dfc18852ff1f312bd8ca41de761b3f8d8e90e77d79f2ccacd3d4c5b19e475ecf09d021fdfe9088 7 | HEAD_REF master 8 | PATCHES 9 | 0001-Prevent-invalid-inclusions-when-HAVE_-is-set-to-0.patch 10 | 0002-build-static-or-shared-not-both.patch 11 | 0003-android-and-mingw-fixes.patch 12 | ) 13 | 14 | # This is generated during the cmake build 15 | file(REMOVE "${SOURCE_PATH}/zconf.h") 16 | 17 | if(VCPKG_TARGET_ARCHITECTURE STREQUAL "wasm32") 18 | set(WASM_OPTIONS -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_FLAGS=-fPIC -DCMAKE_C_FLAGS=-fPIC) 19 | endif() 20 | 21 | vcpkg_cmake_configure( 22 | SOURCE_PATH "${SOURCE_PATH}" 23 | OPTIONS 24 | -DSKIP_INSTALL_FILES=ON 25 | -DZLIB_BUILD_EXAMPLES=OFF 26 | ${WASM_OPTIONS} 27 | OPTIONS_DEBUG 28 | -DSKIP_INSTALL_HEADERS=ON 29 | ) 30 | 31 | vcpkg_cmake_install() 32 | file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/vcpkg-cmake-wrapper.cmake" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 33 | 34 | # Install the pkgconfig file 35 | if(NOT DEFINED VCPKG_BUILD_TYPE OR VCPKG_BUILD_TYPE STREQUAL "release") 36 | if(VCPKG_TARGET_IS_WINDOWS) 37 | vcpkg_replace_string("${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-rel/zlib.pc" "-lz" "-lzlib") 38 | endif() 39 | file(COPY "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-rel/zlib.pc" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/pkgconfig") 40 | endif() 41 | if(NOT DEFINED VCPKG_BUILD_TYPE OR VCPKG_BUILD_TYPE STREQUAL "debug") 42 | if(VCPKG_TARGET_IS_WINDOWS) 43 | vcpkg_replace_string("${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-dbg/zlib.pc" "-lz" "-lzlibd") 44 | endif() 45 | file(COPY "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-dbg/zlib.pc" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig") 46 | endif() 47 | 48 | vcpkg_fixup_pkgconfig() 49 | vcpkg_copy_pdbs() 50 | 51 | if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") 52 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/zconf.h" "ifdef ZLIB_DLL" "if 0") 53 | else() 54 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/zconf.h" "ifdef ZLIB_DLL" "if 1") 55 | endif() 56 | 57 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 58 | file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) 59 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/usage: -------------------------------------------------------------------------------- 1 | The package zlib is compatible with built-in CMake targets: 2 | 3 | find_package(ZLIB REQUIRED) 4 | target_link_libraries(main PRIVATE ZLIB::ZLIB) 5 | 
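
Taken together, the usage files for these three ports document the CMake targets they export. A minimal consumer sketch follows, assuming the vcpkg toolchain file is in use; the project name avro_demo and the source file main.c are illustrative placeholders, not files from this repository:

cmake_minimum_required(VERSION 3.16)
project(avro_demo C)

# ZLIB and LibLZMA resolve through CMake's built-in find modules, which the
# vcpkg-cmake-wrapper.cmake files in these ports fix up; Snappy installs its
# own config package (see vcpkg_ports/snappy/usage).
find_package(ZLIB REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(Snappy CONFIG REQUIRED)

add_executable(avro_demo main.c)
target_link_libraries(avro_demo PRIVATE ZLIB::ZLIB LibLZMA::LibLZMA Snappy::snappy)
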
-------------------------------------------------------------------------------- /vcpkg_ports/zlib/vcpkg-cmake-wrapper.cmake: -------------------------------------------------------------------------------- 1 | find_path(ZLIB_INCLUDE_DIR NAMES zlib.h PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include" NO_DEFAULT_PATH) 2 | find_library(ZLIB_LIBRARY_RELEASE NAMES zlib z PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib" NO_DEFAULT_PATH) 3 | find_library(ZLIB_LIBRARY_DEBUG NAMES zlibd z PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/debug/lib" NO_DEFAULT_PATH) 4 | 5 | if(NOT ZLIB_INCLUDE_DIR OR NOT (ZLIB_LIBRARY_RELEASE OR ZLIB_LIBRARY_DEBUG)) 6 | message(FATAL_ERROR "Broken installation of vcpkg port zlib") 7 | endif() 8 | 9 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 10 | 11 | if(CMAKE_VERSION VERSION_LESS 3.4) 12 | include(SelectLibraryConfigurations) 13 | select_library_configurations(ZLIB) 14 | unset(ZLIB_FOUND) 15 | endif() 16 | _find_package(${ARGS}) 17 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "zlib", 3 | "version": "1.3.1", 4 | "description": "A compression library", 5 | "homepage": "https://www.zlib.net/", 6 | "license": "Zlib", 7 | "dependencies": [ 8 | "vcpkg-cmake" 9 | ] 10 | } 11 | --------------------------------------------------------------------------------
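
The snappy usage file above notes that hardware optimizations are disabled by default and are opted into through a custom triplet. A sketch of such a triplet, assuming a static x64 Linux build; the file name x64-linux-avx.cmake and the chosen flags are illustrative assumptions, not part of this repository:

# Overlay triplet, e.g. triplets/x64-linux-avx.cmake, selected with vcpkg's
# --overlay-triplets and --triplet options.
set(VCPKG_TARGET_ARCHITECTURE x64)
set(VCPKG_CRT_LINKAGE dynamic)
set(VCPKG_LIBRARY_LINKAGE static)
set(VCPKG_CMAKE_SYSTEM_NAME Linux)

# Per-port configure options, following the pattern in vcpkg_ports/snappy/usage.
if("${PORT}" STREQUAL "snappy")
    list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DSNAPPY_HAVE_SSSE3=ON -DSNAPPY_HAVE_BMI2=ON)
endif()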