├── .clang-format
├── .editorconfig
├── .github
│   └── workflows
│       └── MainDistributionPipeline.yml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── docs
│   └── UPDATING.md
├── extension_config.cmake
├── scripts
│   ├── extension-upload.sh
│   └── setup-custom-toolchain.sh
├── src
│   ├── avro_extension.cpp
│   ├── avro_multi_file_info.cpp
│   ├── avro_reader.cpp
│   └── include
│       ├── avro_extension.hpp
│       ├── avro_multi_file_info.hpp
│       ├── avro_reader.hpp
│       └── avro_type.hpp
├── test
│   ├── 4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro
│   ├── README.md
│   ├── all_nullable_list.avro
│   ├── avro.avro
│   ├── bigdata.avro
│   ├── broken_record.avro
│   ├── create_test_file.py
│   ├── empty_record.avro
│   ├── enum.avro
│   ├── fixed.avro
│   ├── iceberg
│   │   ├── 10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro
│   │   ├── 10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro
│   │   ├── 23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro
│   │   ├── cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro
│   │   ├── snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
│   │   ├── snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro
│   │   └── snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro
│   ├── logical_types.avro
│   ├── long_map.avro
│   ├── manifest.avro
│   ├── nested_nullable_lists.avro
│   ├── null_first.avro
│   ├── null_last.avro
│   ├── nullable_entry_string_array.avro
│   ├── nullable_string_array.avro
│   ├── part-r-00000.avro
│   ├── primitive_types.avro
│   ├── query_small.avro
│   ├── recursive.avro
│   ├── reuse-1.avro
│   ├── reuse-2.avro
│   ├── root-int.avro
│   ├── single-union.avro
│   ├── sql
│   │   ├── avro.test
│   │   ├── bigdata.test
│   │   ├── external_file_cache.test
│   │   ├── iceberg.test
│   │   └── test_missing_file.test
│   ├── string_array.avro
│   ├── union-name-1.avro
│   ├── union-name-2.avro
│   ├── union-name-3.avro
│   ├── union.avro
│   ├── userdata1.avro
│   ├── userdata2.avro
│   ├── userdata3.avro
│   ├── userdata4.avro
│   ├── userdata5.avro
│   └── users.avro
├── vcpkg.json
└── vcpkg_ports
    ├── liblzma
    │   ├── build-tools.patch
    │   ├── portfile.cmake
    │   ├── usage
    │   ├── vcpkg-cmake-wrapper.cmake
    │   └── vcpkg.json
    ├── snappy
    │   ├── fix_clang-cl_build.patch
    │   ├── no-werror.patch
    │   ├── pkgconfig.diff
    │   ├── portfile.cmake
    │   ├── snappy.pc.in
    │   ├── usage
    │   └── vcpkg.json
    └── zlib
        ├── 0001-Prevent-invalid-inclusions-when-HAVE_-is-set-to-0.patch
        ├── 0002-build-static-or-shared-not-both.patch
        ├── 0003-android-and-mingw-fixes.patch
        ├── portfile.cmake
        ├── usage
        ├── vcpkg-cmake-wrapper.cmake
        └── vcpkg.json

/.clang-format:
--------------------------------------------------------------------------------
---
BasedOnStyle: LLVM
SortIncludes: false
TabWidth: 4
IndentWidth: 4
ColumnLimit: 120
AllowShortFunctionsOnASingleLine: false
---
UseTab: ForIndentation
DerivePointerAlignment: false
PointerAlignment: Right
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AlignAfterOpenBracket: Align
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInConditionalStatement: false
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakTemplateDeclarations: Yes
IncludeBlocks: Regroup
Language: Cpp
AccessModifierOffset: -4
---
Language: Java
SpaceAfterCStyleCast: true
---
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
duckdb/.editorconfig
--------------------------------------------------------------------------------
/.github/workflows/MainDistributionPipeline.yml:
--------------------------------------------------------------------------------
#
# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension
#
name: Main Extension Distribution Pipeline
on:
  push:
  pull_request:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
  cancel-in-progress: true

jobs:

  duckdb-stable-build:
    name: Build extension binaries
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
    with:
      extension_name: avro
      duckdb_version: main
      ci_tools_version: main
      exclude_archs: 'windows_amd64_rtools;windows_amd64_mingw;linux_amd64_gcc4'

  duckdb-stable-deploy:
    name: Deploy extension binaries
    needs: duckdb-stable-build
    uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main
    secrets: inherit
    with:
      extension_name: avro
      duckdb_version: main
      ci_tools_version: main
      exclude_archs: 'windows_amd64_rtools;windows_amd64_mingw;linux_amd64_gcc4'
      deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build
.idea
cmake-build-debug
duckdb_unittest_tempdir/
.DS_Store
testext
test/python/__pycache__/
.Rhistory
vcpkg
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "duckdb"]
	path = duckdb
	url = https://github.com/duckdb/duckdb
	branch = main
[submodule "extension-ci-tools"]
	path = extension-ci-tools
	url = https://github.com/duckdb/extension-ci-tools
	branch = main
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)

# Set extension name here
set(TARGET_NAME avro)

find_path(
  AVRO_INCLUDE_DIR
  NAMES avro.h
  PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include"
  PATH_SUFFIXES avro REQUIRED)

if(MSVC) # endless screaming
  find_library(AVRO_LIBRARY avro.lib REQUIRED)
  find_library(JANSSON_LIBRARY jansson.lib REQUIRED)
  find_library(LZMA_LIBRARY lzma.lib REQUIRED)
  find_library(ZLIB_LIBRARY zlib.lib REQUIRED)
else()
  find_library(AVRO_LIBRARY libavro.a REQUIRED)
  find_library(JANSSON_LIBRARY libjansson.a REQUIRED)
  find_library(LZMA_LIBRARY liblzma.a REQUIRED)
  find_library(ZLIB_LIBRARY libz.a REQUIRED)
endif()

find_library(SNAPPY_LIBRARY snappy REQUIRED)
set(ALL_AVRO_LIBRARIES
    ${AVRO_LIBRARY}
    ${JEMALLOC_LIBRARY}
    ${JANSSON_LIBRARY}
    ${LZMA_LIBRARY}
    ${ZLIB_LIBRARY}
    ${SNAPPY_LIBRARY}
    ${GMP_LIBRARY}
    ${MATH_LIBRARY})

set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)

project(${TARGET_NAME})
include_directories(src/include)

set(EXTENSION_SOURCES src/avro_extension.cpp src/avro_reader.cpp
                      src/avro_multi_file_info.cpp)

build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})

target_include_directories(${EXTENSION_NAME} PRIVATE ${AVRO_INCLUDE_DIR})
target_include_directories(${LOADABLE_EXTENSION_NAME}
                           PRIVATE ${AVRO_INCLUDE_DIR})
target_link_libraries(${EXTENSION_NAME} ${ALL_AVRO_LIBRARIES})
target_link_libraries(${LOADABLE_EXTENSION_NAME} ${ALL_AVRO_LIBRARIES})

install(
  TARGETS ${EXTENSION_NAME}
  EXPORT "${DUCKDB_EXPORT_SET}"
  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2018-2024 Stichting DuckDB Foundation

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Configuration of extension
EXT_NAME=avro
EXT_CONFIG=${PROJ_DIR}extension_config.cmake

# Include the Makefile from extension-ci-tools
include extension-ci-tools/makefiles/duckdb_extension.Makefile
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# The DuckDB Avro Extension
This repo contains a DuckDB community extension that enables DuckDB to *read* [Apache Avro (TM)](https://avro.apache.org) files. Avro is the (self-declared) "leading serialization format for record data". Avro is a self-describing *row-major* binary table format. This is in contrast to the (much more popular) Parquet format, which is *columnar*. Its row-major design enables Avro - for example - to handle appends of a few rows somewhat efficiently.

The extension does not contain Avro *write* functionality.
This is on purpose: by not providing a writer, we hope to decrease the number of Avro files in the world over time.

### Installation & Loading
Installation is simple through the DuckDB Community Extension repository; just type

```
INSTALL avro FROM community;
LOAD avro;
```
in a DuckDB instance near you. There is currently no build for WASM because of dependencies (sigh).

### The `read_avro` Function
The extension adds a single DuckDB function, `read_avro`. This function can be used like so:
```SQL
FROM read_avro('some_example_file.avro');
```
This function will expose the contents of the Avro file as a DuckDB table. You can then use any arbitrary SQL constructs to further transform this table.

### File IO
The `read_avro` function is integrated into DuckDB's file system abstraction, meaning you can read Avro files directly from e.g. HTTP or S3 sources. For example,

```SQL
FROM read_avro('http://blob.duckdb.org/data/userdata1.avro');
FROM read_avro('s3://my-example-bucket/some_example_file.avro');
```

should "just" work.

You can also *glob* multiple files in a single read call or pass a list of files to the function:

```SQL
FROM read_avro('some_example_file_*.avro');
FROM read_avro(['some_example_file_1.avro', 'some_example_file_2.avro']);
```

If the filenames somehow contain valuable information (as is unfortunately all-too-common), you can pass the `filename` argument to `read_avro`:

```SQL
FROM read_avro('some_example_file_*.avro', filename=true);
```
This will result in an additional column in the result set that contains the actual filename of the Avro file.

### Schema Conversion
This extension automatically translates the Avro schema to the DuckDB schema. *All* Avro types can be translated, except for *recursive type definitions*, which DuckDB does not support.

The type mapping is very straightforward except for Avro's "unique" way of handling `NULL`. Unlike other systems, Avro does not treat `NULL` as a possible value in a range of e.g. `INTEGER`, but instead represents `NULL` as a union of the actual type with a special `NULL` type. This is different from DuckDB, where any value can be `NULL`. Of course DuckDB also supports `UNION` types, but those would be quite cumbersome to work with here.

This extension *simplifies* the Avro schema where possible: an Avro union of any type and the special null type is simplified to just the non-null type. For example, an Avro record of the union type `["int","null"]` becomes a DuckDB `INTEGER`, which just happens to be `NULL` sometimes. Similarly, an Avro union that contains only a single type is converted to the type it contains. For example, an Avro record of the union type `["int"]` also becomes a DuckDB `INTEGER`.

The extension also "flattens" the Avro schema. Avro defines tables as root-level "record" fields, which are the same as DuckDB `STRUCT` fields. For more convenient handling, this extension turns the entries of a single top-level record into top-level columns.

### Implementation
Internally, this extension uses the "official" [Apache Avro C API](https://avro.apache.org/docs/++version++/api/c/), albeit with some minor patching to allow reading of Avro files from memory.
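To see the schema DuckDB derives for a given file, a plain `DESCRIBE` works (hypothetical file and field names, for illustration):

```SQL
DESCRIBE SELECT * FROM read_avro('some_example_file.avro');
-- per the rules above, a top-level Avro record with a field of union type
-- ["int","null"] would surface as a single nullable INTEGER column
```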

### Limitations & Next Steps
- This extension currently does not make use of **parallelism** when reading either a single (large) Avro file or when reading a list of files. Adding support for parallelism in the latter case is on the roadmap.

- There is currently no support for either projection or filter **pushdown**, but this is planned for a later stage.

- There is currently no support for the WASM or the Windows-MinGW builds of DuckDB due to issues with the Avro library dependency (sigh again). We plan to fix this eventually.

- As mentioned above, DuckDB cannot express the recursive type definitions that Avro has; this is unlikely to ever change.

- There is no support to allow users to provide a separate Avro schema file. This is unlikely to change, since all Avro files we have seen so far had their schema embedded.

- There is currently no support for the `union_by_name` flag that other readers in DuckDB support. This is planned for the future.
--------------------------------------------------------------------------------
/docs/UPDATING.md:
--------------------------------------------------------------------------------
# Extension updating
When cloning this template, the target version of DuckDB should be the latest stable release of DuckDB. However, there
will inevitably come a time when a new DuckDB version is released and the extension repository needs updating. This process goes
as follows:

- Bump submodules
  - `./duckdb` should be set to the latest tagged release
  - `./extension-ci-tools` should be set to the updated branch corresponding to the latest DuckDB release. So if you're building for DuckDB `v1.1.0`, there will be a branch in `extension-ci-tools` named `v1.1.0` which you should check out.
- Bump versions in `.github/workflows`
  - `duckdb_version` input in `duckdb-stable-build` job in `MainDistributionPipeline.yml` should be set to the latest tagged release
  - `duckdb_version` input in `duckdb-stable-deploy` job in `MainDistributionPipeline.yml` should be set to the latest tagged release
  - the reusable workflow `duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml` for the `duckdb-stable-build` job should be set to the latest tagged release

# API changes
DuckDB extensions built with this extension template are built against the internal C++ API of DuckDB. This API is not guaranteed to be stable.
What this means for extension development is that when updating your extension's DuckDB target version using the above steps, you may find that your extension no longer builds properly.

Currently, DuckDB does not (yet) provide a specific change log for these API changes, but it is generally not too hard to figure out what has changed.

For figuring out how and why the C++ API changed, we recommend using the following resources:
- DuckDB's [Release Notes](https://github.com/duckdb/duckdb/releases)
- DuckDB's history of [Core extension patches](https://github.com/duckdb/duckdb/commits/main/.github/patches/extensions)
- The git history of the relevant C++ header file of the API that has changed
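A minimal sketch of the submodule bump (assuming a hypothetical target release `v1.1.0`; substitute the actual release tag):

```bash
cd duckdb && git fetch --tags && git checkout v1.1.0 && cd ..
cd extension-ci-tools && git fetch origin && git checkout v1.1.0 && cd ..
git add duckdb extension-ci-tools && git commit -m "bump DuckDB to v1.1.0"
```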
--------------------------------------------------------------------------------
/extension_config.cmake:
--------------------------------------------------------------------------------
# This file is included by DuckDB's build system. It specifies which extensions to load

# Extension from this repo
duckdb_extension_load(avro
    SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
    LOAD_TESTS
    LINKED_LIBS "../../vcpkg_installed/wasm32-emscripten/lib/lib*.a"
)

# Any extra extensions that should be built
# e.g.: duckdb_extension_load(json)
--------------------------------------------------------------------------------
/scripts/extension-upload.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Extension upload script

# Usage: ./extension-upload.sh <name> <extension_version> <duckdb_version> <architecture> <s3_bucket> <copy_to_latest> <copy_to_versioned>
# <name>                : Name of the extension
# <extension_version>   : Version (commit / version tag) of the extension
# <duckdb_version>      : Version (commit / version tag) of DuckDB
# <architecture>        : Architecture target of the extension binary
# <s3_bucket>           : S3 bucket to upload to
# <copy_to_latest>      : Set this as the latest version ("true" / "false", default: "false")
# <copy_to_versioned>   : Set this as a versioned version that will prevent its deletion

set -e

if [[ $4 == wasm* ]]; then
  ext="/tmp/extension/$1.duckdb_extension.wasm"
else
  ext="/tmp/extension/$1.duckdb_extension"
fi

echo $ext

script_dir="$(dirname "$(readlink -f "$0")")"

# copy the extension binary; signature metadata is appended to this copy below
cat $ext > $ext.append

if [[ $4 == wasm* ]]; then
  # 0 for custom section
  # 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256)
  # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02]
  echo -n -e '\x00' >> $ext.append
  echo -n -e '\x93\x02' >> $ext.append
  # 10 in hex = 16 in decimal, length of name, 1 byte
  echo -n -e '\x10' >> $ext.append
  echo -n -e 'duckdb_signature' >> $ext.append
  # the name of the WebAssembly custom section, 16 bytes
  # 100 in hex, 256 in decimal
  # [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02],
  # for a grand total of 2 bytes
  echo -n -e '\x80\x02' >> $ext.append
fi

# (Optionally) Sign binary
if [ "$DUCKDB_EXTENSION_SIGNING_PK" != "" ]; then
  echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem
  $script_dir/../duckdb/scripts/compute-extension-hash.sh $ext.append > $ext.hash
  openssl pkeyutl -sign -in $ext.hash -inkey private.pem -pkeyopt digest:sha256 -out $ext.sign
  rm -f private.pem
fi

# Signature is always there, potentially defaulting to 256 zeros
truncate -s 256 $ext.sign

# append signature to extension binary
cat $ext.sign >> $ext.append

# compress extension binary
if [[ $4 == wasm_* ]]; then
  brotli < $ext.append > "$ext.compressed"
else
  gzip < $ext.append > "$ext.compressed"
fi

set -e

# Abort if AWS key is not set
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
  echo "No AWS key found, skipping.."
  exit 0
fi

# upload versioned version
if [[ $7 = 'true' ]]; then
  if [[ $4 == wasm* ]]; then
    aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
  else
    aws s3 cp $ext.compressed s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read
  fi
fi

# upload to latest version
if [[ $6 = 'true' ]]; then
  if [[ $4 == wasm* ]]; then
    aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
  else
    aws s3 cp $ext.compressed s3://$5/$3/$4/$1.duckdb_extension.gz --acl public-read
  fi
fi
--------------------------------------------------------------------------------
/scripts/setup-custom-toolchain.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# This is an example script that can be used to install additional toolchain dependencies. Feel free to remove this script
# if no additional toolchains are required

# To enable this script, set the `custom_toolchain_script` option to true when calling the reusable workflow
# `.github/workflows/_extension_distribution.yml` from `https://github.com/duckdb/extension-ci-tools`

# note that the $DUCKDB_PLATFORM environment variable can be used to discern between the platforms
echo "This is the sample custom toolchain script running for architecture '$DUCKDB_PLATFORM' for the avro extension."

--------------------------------------------------------------------------------
/src/avro_extension.cpp:
--------------------------------------------------------------------------------
#define DUCKDB_EXTENSION_MAIN

#include "avro_extension.hpp"

#include "duckdb.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/function/scalar_function.hpp"

#include "duckdb/main/extension_util.hpp"
#include "include/avro_reader.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "avro_multi_file_info.hpp"
#include "duckdb/common/multi_file/multi_file_function.hpp"

#include <avro.h>

namespace duckdb {

static void LoadInternal(DatabaseInstance &instance) {
	// Register the read_avro table function
	auto table_function = MultiFileFunction<AvroMultiFileInfo>("read_avro");
	table_function.projection_pushdown = true;
	ExtensionUtil::RegisterFunction(instance, MultiFileReader::CreateFunctionSet(table_function));
}

void AvroExtension::Load(DuckDB &db) {
	LoadInternal(*db.instance);
}
std::string AvroExtension::Name() {
	return "avro";
}

std::string AvroExtension::Version() const {
#ifdef EXT_VERSION_AVRO
	return EXT_VERSION_AVRO;
#else
	return "";
#endif
}

} // namespace duckdb

extern "C" {

DUCKDB_EXTENSION_API void avro_init(duckdb::DatabaseInstance &db) {
	duckdb::DuckDB db_wrapper(db);
	db_wrapper.LoadExtension<duckdb::AvroExtension>();
}

DUCKDB_EXTENSION_API const char *avro_version() {
	return duckdb::DuckDB::LibraryVersion();
}
}

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
#endif
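As a quick sanity check of the registration above, the extension should be visible from SQL once loaded (a sketch; the exact column set of `duckdb_extensions()` varies between DuckDB versions):

```SQL
SELECT extension_name, loaded
FROM duckdb_extensions()
WHERE extension_name = 'avro';
```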
--------------------------------------------------------------------------------
/src/avro_multi_file_info.cpp:
--------------------------------------------------------------------------------
#include "avro_multi_file_info.hpp"
#include "avro_reader.hpp"

namespace duckdb {

unique_ptr<MultiFileReaderInterface>
AvroMultiFileInfo::InitializeInterface(ClientContext &context, MultiFileReader &reader, MultiFileList &file_list) {
	return make_uniq<AvroMultiFileInfo>();
}

unique_ptr<BaseFileReaderOptions> AvroMultiFileInfo::InitializeOptions(ClientContext &context,
                                                                       optional_ptr<TableFunctionInfo> info) {
	return make_uniq<AvroFileReaderOptions>();
}

bool AvroMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, const vector<string> &values,
                                        BaseFileReaderOptions &options_p, vector<string> &expected_names,
                                        vector<LogicalType> &expected_types) {
	// We currently do not have any options for the scanner, so we always return false
	return false;
}

bool AvroMultiFileInfo::ParseOption(ClientContext &context, const string &key, const Value &val,
                                    MultiFileOptions &file_options, BaseFileReaderOptions &options) {
	// We currently do not have any options for the scanner, so we always return false
	return false;
}

struct AvroMultiFileData final : public TableFunctionData {
public:
	AvroMultiFileData() = default;
};

unique_ptr<TableFunctionData> AvroMultiFileInfo::InitializeBindData(MultiFileBindData &multi_file_data,
                                                                    unique_ptr<BaseFileReaderOptions> options_p) {
	return make_uniq<AvroMultiFileData>();
}

void AvroMultiFileInfo::BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
                                   MultiFileBindData &bind_data) {
	AvroFileReaderOptions options;
	if (bind_data.file_options.union_by_name) {
		throw NotImplementedException("'union_by_name' not implemented for Avro reader yet");
	}
	bind_data.reader_bind = bind_data.multi_file_reader->BindReader(context, return_types, names, *bind_data.file_list,
	                                                                bind_data, options, bind_data.file_options);
	D_ASSERT(names.size() == return_types.size());
}

optional_idx AvroMultiFileInfo::MaxThreads(const MultiFileBindData &bind_data_p,
                                           const MultiFileGlobalState &global_state, FileExpandResult expand_result) {
	if (expand_result == FileExpandResult::MULTIPLE_FILES) {
		// always launch max threads if we are reading multiple files
		return {};
	}
	// Otherwise, only one thread
	return 1;
}

struct AvroFileGlobalState : public GlobalTableFunctionState {
public:
	AvroFileGlobalState() = default;
	~AvroFileGlobalState() override = default;

public:
	//! TODO: this should contain the state of the current file being scanned
	//! so we can parallelize over a single file
	set<idx_t> files;
};

unique_ptr<GlobalTableFunctionState> AvroMultiFileInfo::InitializeGlobalState(ClientContext &context,
                                                                              MultiFileBindData &bind_data,
                                                                              MultiFileGlobalState &global_state) {
	return make_uniq<AvroFileGlobalState>();
}

//! The Avro Local File State, basically refers to the Scan of one Avro File
//! This is done by calling the Avro Scan directly on one file.
struct AvroFileLocalState : public LocalTableFunctionState {
public:
	explicit AvroFileLocalState(ExecutionContext &execution_context) : execution_context(execution_context) {};

public:
	shared_ptr<AvroReader> file_scan;
	ExecutionContext &execution_context;
};

unique_ptr<LocalTableFunctionState> AvroMultiFileInfo::InitializeLocalState(ExecutionContext &context,
                                                                            GlobalTableFunctionState &function_state) {
	return make_uniq<AvroFileLocalState>(context);
}

shared_ptr<BaseFileReader> AvroMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           BaseUnionData &union_data,
                                                           const MultiFileBindData &bind_data) {
	throw NotImplementedException("'union_by_name' is not implemented for the Avro reader yet");
}

shared_ptr<BaseFileReader> AvroMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                                           const OpenFileInfo &file, idx_t file_idx,
                                                           const MultiFileBindData &bind_data) {
	return make_shared_ptr<AvroReader>(context, file);
}

shared_ptr<BaseFileReader> AvroMultiFileInfo::CreateReader(ClientContext &context, const OpenFileInfo &file,
                                                           BaseFileReaderOptions &options,
                                                           const MultiFileOptions &file_options) {
	return make_shared_ptr<AvroReader>(context, file);
}

bool AvroReader::TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate_p,
                                   LocalTableFunctionState &lstate_p) {
	auto &gstate = gstate_p.Cast<AvroFileGlobalState>();
	auto &lstate = lstate_p.Cast<AvroFileLocalState>();
	if (gstate.files.count(file_list_idx.GetIndex())) {
		// Return false because we don't currently support more than one thread
		// scanning a file.
		return false;
	}
	gstate.files.insert(file_list_idx.GetIndex());
	lstate.file_scan = shared_ptr_cast<BaseFileReader, AvroReader>(shared_from_this());
	return true;
}

void AvroReader::Scan(ClientContext &context, GlobalTableFunctionState &global_state,
                      LocalTableFunctionState &local_state_p, DataChunk &chunk) {
	Read(chunk);
}

unique_ptr<NodeStatistics> AvroMultiFileInfo::GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) {
	//! FIXME: Here is where we might set statistics, for optimizations if we have them
	return make_uniq<NodeStatistics>();
}

} // namespace duckdb
--------------------------------------------------------------------------------
/src/avro_reader.cpp:
--------------------------------------------------------------------------------
#include "avro_reader.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/storage/caching_file_system.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/multi_file/multi_file_data.hpp"

namespace duckdb {

static AvroType TransformSchema(avro_schema_t &avro_schema, unordered_set<string> parent_schema_names) {
	switch (avro_typeof(avro_schema)) {
	case AVRO_NULL:
		return AvroType(AVRO_NULL, LogicalType::SQLNULL);
	case AVRO_BOOLEAN:
		return AvroType(AVRO_BOOLEAN, LogicalType::BOOLEAN);
	case AVRO_INT32:
		return AvroType(AVRO_INT32, LogicalType::INTEGER);
	case AVRO_INT64:
		return AvroType(AVRO_INT64, LogicalType::BIGINT);
	case AVRO_FLOAT:
		return AvroType(AVRO_FLOAT, LogicalType::FLOAT);
	case AVRO_DOUBLE:
		return AvroType(AVRO_DOUBLE, LogicalType::DOUBLE);
	case AVRO_BYTES:
		return AvroType(AVRO_BYTES, LogicalType::BLOB);
	case AVRO_STRING:
		return AvroType(AVRO_STRING, LogicalType::VARCHAR);
	case AVRO_UNION: {
		auto num_children = avro_schema_union_size(avro_schema);
		child_list_t<AvroType> union_children;
		idx_t non_null_child_idx = 0;
		unordered_map<idx_t, optional_idx> union_child_map;
		for (idx_t child_idx = 0; child_idx < num_children; child_idx++) {
			auto child_schema = avro_schema_union_branch(avro_schema, child_idx);
			auto child_type = TransformSchema(child_schema, parent_schema_names);
			// inspect the child type before it is moved into union_children
			if (child_type.duckdb_type.id() != LogicalTypeId::SQLNULL) {
				union_child_map[child_idx] = non_null_child_idx++;
			}
			union_children.push_back(
			    std::pair<string, AvroType>(StringUtil::Format("u%llu", child_idx), std::move(child_type)));
		}
		return AvroType(AVRO_UNION, LogicalTypeId::UNION, std::move(union_children), union_child_map);
	}
	case AVRO_RECORD: {
		auto schema_name = string(avro_schema_name(avro_schema));
		if (parent_schema_names.find(schema_name) != parent_schema_names.end()) {
			throw InvalidInputException("Recursive Avro types not supported: %s", schema_name);
		}
		parent_schema_names.insert(schema_name);

		auto num_children = avro_schema_record_size(avro_schema);
		if (num_children == 0) {
			// this we just ignore but we need a marker so we don't get our offsets
			// wrong
			return AvroType(AVRO_RECORD, LogicalTypeId::SQLNULL);
		}
		child_list_t<AvroType> struct_children;
		for (idx_t child_idx = 0; child_idx < num_children; child_idx++) {
			auto child_schema = avro_schema_record_field_get_by_index(avro_schema, child_idx);
			auto child_type = TransformSchema(child_schema, parent_schema_names);
			auto child_name = avro_schema_record_field_name(avro_schema, child_idx);
			if (!child_name || strlen(child_name) == 0) {
				throw InvalidInputException("Empty avro field name");
			}

			struct_children.push_back(std::pair<string, AvroType>(child_name, std::move(child_type)));
		}

		return AvroType(AVRO_RECORD, LogicalTypeId::STRUCT, std::move(struct_children));
	}
	case AVRO_ENUM: {
		auto size = avro_schema_enum_number_of_symbols(avro_schema);
		Vector levels(LogicalType::VARCHAR, size);
		auto levels_data = FlatVector::GetData<string_t>(levels);
		for (idx_t enum_idx = 0; enum_idx < size; enum_idx++) {
			levels_data[enum_idx] = StringVector::AddString(levels, avro_schema_enum_get(avro_schema, enum_idx));
		}
		levels.Verify(size);
		return AvroType(AVRO_ENUM, LogicalType::ENUM(levels, size));
	}
	case AVRO_FIXED: {
		return AvroType(AVRO_FIXED, LogicalType::BLOB);
	}
	case AVRO_ARRAY: {
		auto child_schema = avro_schema_array_items(avro_schema);
		auto child_type = TransformSchema(child_schema, parent_schema_names);
		child_list_t<AvroType> list_children;
		list_children.push_back(std::pair<string, AvroType>("list_entry", std::move(child_type)));
		return AvroType(AVRO_ARRAY, LogicalTypeId::LIST, std::move(list_children));
	}
	case AVRO_MAP: {
		auto child_schema = avro_schema_map_values(avro_schema);
		auto child_type = TransformSchema(child_schema, parent_schema_names);
		child_list_t<AvroType> map_children;
		map_children.push_back(std::pair<string, AvroType>("list_entry", std::move(child_type)));
		return AvroType(AVRO_MAP, LogicalTypeId::MAP, std::move(map_children));
	}
	case AVRO_LINK: {
		auto target = avro_schema_link_target(avro_schema);
		return TransformSchema(target, parent_schema_names);
	}
	default:
		throw NotImplementedException("Unknown Avro Type %s", avro_schema_type_name(avro_schema));
	}
}

AvroReader::AvroReader(ClientContext &context, OpenFileInfo file) : BaseFileReader(file) {
	auto caching_file_system = CachingFileSystem::Get(context);

	auto caching_file_handle = caching_file_system.OpenFile(this->file, FileOpenFlags::FILE_FLAGS_READ);
	allocated_data = Allocator::Get(context).Allocate(caching_file_handle->GetFileSize());
	auto total_size = allocated_data.GetSize();
	auto data = allocated_data.get();

	auto buf_handle = caching_file_handle->Read(data, total_size);
	auto buffer_data = buf_handle.Ptr();

	D_ASSERT(buf_handle.IsValid());
	D_ASSERT(buffer_data == data);
	auto avro_reader = avro_reader_memory(const_char_ptr_cast(buffer_data), total_size);

	if (avro_reader_reader(avro_reader, &reader)) {
		throw InvalidInputException(avro_strerror());
	}

	auto avro_schema = avro_file_reader_get_writer_schema(reader);
	avro_type = TransformSchema(avro_schema, {});
	duckdb_type = AvroType::TransformAvroType(avro_type);
	read_vec = make_uniq<Vector>(duckdb_type);

	auto interface = avro_generic_class_from_schema(avro_schema);
	avro_generic_value_new(interface, &value);
	avro_value_iface_decref(interface);

	vector<LogicalType> types;
	vector<string> names;
	// special handling for root structs, we pull up the entries
	if (duckdb_type.id() == LogicalTypeId::STRUCT) {
		for (idx_t child_idx = 0; child_idx < StructType::GetChildCount(duckdb_type); child_idx++) {
			names.push_back(StructType::GetChildName(duckdb_type, child_idx));
			types.push_back(StructType::GetChildType(duckdb_type, child_idx));
		}
	} else {
		auto schema_name = avro_schema_name(avro_schema);
		names.push_back(schema_name ? schema_name : "avro_schema");
		types.push_back(duckdb_type);
	}

	columns = MultiFileColumnDefinition::ColumnsFromNamesAndTypes(names, types);
	avro_schema_decref(avro_schema);
}

static void TransformValue(avro_value *avro_val, const AvroType &avro_type, Vector &target, idx_t out_idx) {

	switch (avro_type.duckdb_type.id()) {
	case LogicalTypeId::SQLNULL: {
		FlatVector::SetNull(target, out_idx, true);
		break;
	}
	case LogicalTypeId::BOOLEAN: {
		int bool_val;
		if (avro_value_get_boolean(avro_val, &bool_val)) {
			throw InvalidInputException(avro_strerror());
		}
		FlatVector::GetData<bool>(target)[out_idx] = bool_val != 0;
		break;
	}
	case LogicalTypeId::INTEGER: {
		if (avro_value_get_int(avro_val, &FlatVector::GetData<int32_t>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::BIGINT: {
		if (avro_value_get_long(avro_val, &FlatVector::GetData<int64_t>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::FLOAT: {
		if (avro_value_get_float(avro_val, &FlatVector::GetData<float>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::DOUBLE: {
		if (avro_value_get_double(avro_val, &FlatVector::GetData<double>(target)[out_idx])) {
			throw InvalidInputException(avro_strerror());
		}
		break;
	}
	case LogicalTypeId::BLOB:
		switch (avro_type.avro_type) {
		case AVRO_FIXED: {
			size_t fixed_size;
			const void *fixed_data;
			if (avro_value_get_fixed(avro_val, &fixed_data, &fixed_size)) {
				throw InvalidInputException(avro_strerror());
			}
			FlatVector::GetData<string_t>(target)[out_idx] =
			    StringVector::AddStringOrBlob(target, const_char_ptr_cast(fixed_data), fixed_size);
			break;
		}
		case AVRO_BYTES: {
			avro_wrapped_buffer blob_buf = AVRO_WRAPPED_BUFFER_EMPTY;
			if (avro_value_grab_bytes(avro_val, &blob_buf)) {
				throw InvalidInputException(avro_strerror());
			}
			FlatVector::GetData<string_t>(target)[out_idx] =
			    StringVector::AddStringOrBlob(target, const_char_ptr_cast(blob_buf.buf), blob_buf.size);
			blob_buf.free(&blob_buf);
			break;
		}
		default:
			throw NotImplementedException("Unknown Avro blob type");
		}
		break;

	case LogicalTypeId::VARCHAR: {
		avro_wrapped_buffer str_buf = AVRO_WRAPPED_BUFFER_EMPTY;
		if (avro_value_grab_string(avro_val, &str_buf)) {
			throw InvalidInputException(avro_strerror());
		}
		// avro strings are null-terminated
		D_ASSERT(const_char_ptr_cast(str_buf.buf)[str_buf.size - 1] == '\0');
		if (Utf8Proc::Analyze(const_char_ptr_cast(str_buf.buf), str_buf.size - 1) == UnicodeType::INVALID) {
			throw InvalidInputException("Avro file contains invalid unicode string");
		}
		FlatVector::GetData<string_t>(target)[out_idx] =
		    StringVector::AddString(target, const_char_ptr_cast(str_buf.buf), str_buf.size - 1);
		str_buf.free(&str_buf);
		break;
	}
	case LogicalTypeId::STRUCT: {
		size_t child_count;
		if (avro_value_get_size(avro_val, &child_count)) {
			throw InvalidInputException(avro_strerror());
		}
		D_ASSERT(child_count == StructType::GetChildCount(target.GetType()));
		D_ASSERT(child_count == avro_type.children.size());

		for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
			avro_value child_value;
			if (avro_value_get_by_index(avro_val, child_idx, &child_value, nullptr)) {
				throw InvalidInputException(avro_strerror());
			}
			TransformValue(&child_value, avro_type.children[child_idx].second,
			               *StructVector::GetEntries(target)[child_idx], out_idx);
		}
		break;
	}

	case LogicalTypeId::MAP: {
		size_t entry_count;
		if (avro_value_get_size(avro_val, &entry_count)) {
			throw InvalidInputException(avro_strerror());
		}

		D_ASSERT(avro_type.children.size() == 1);
		auto child_offset = ListVector::GetListSize(target);
		ListVector::Reserve(target, child_offset + entry_count);

		auto &key_vector = MapVector::GetKeys(target);
		auto &value_vector = MapVector::GetValues(target);

		D_ASSERT(key_vector.GetType().id() == LogicalTypeId::VARCHAR);
		auto string_ptr = FlatVector::GetData<string_t>(key_vector);
		for (idx_t entry_idx = 0; entry_idx < entry_count; entry_idx++) {
			avro_value child_value;
			const char *map_key;
			if (avro_value_get_by_index(avro_val, entry_idx, &child_value, &map_key)) {
				throw InvalidInputException(avro_strerror());
			}
			D_ASSERT(map_key);
			string_ptr[child_offset + entry_idx] = StringVector::AddString(key_vector, map_key);
			TransformValue(&child_value, avro_type.children[0].second, value_vector, child_offset + entry_idx);
		}
		auto list_vector = ListVector::GetData(target);

		list_vector[out_idx].offset = child_offset;
		list_vector[out_idx].length = entry_count;
		ListVector::SetListSize(target, child_offset + entry_count);
		break;
	}

	case LogicalTypeId::UNION: {
		int discriminant;
		avro_value union_value;
		if (avro_value_get_discriminant(avro_val, &discriminant) ||
		    avro_value_get_current_branch(avro_val, &union_value)) {
			throw InvalidInputException(avro_strerror());
		}
		if (discriminant >= avro_type.children.size()) {
			throw InvalidInputException("Invalid union tag");
		}

		if (avro_type.children[discriminant].second.duckdb_type == LogicalTypeId::SQLNULL) {
			FlatVector::SetNull(target, out_idx, true);
			break;
		}

		if (target.GetType().id() == LogicalTypeId::UNION) {
			auto duckdb_child_index = avro_type.union_child_map.at(discriminant).GetIndex();
			auto &tags = UnionVector::GetTags(target);
			FlatVector::GetData<union_tag_t>(tags)[out_idx] = duckdb_child_index;
			auto &union_vector = UnionVector::GetMember(target, duckdb_child_index);

			// set all other union members to NULL for this row
			for (idx_t child_idx = 1; child_idx < StructVector::GetEntries(target).size(); child_idx++) {
				if (child_idx != duckdb_child_index + 1) { // duckdb child index is bigger because of the tag
					FlatVector::SetNull(*StructVector::GetEntries(target)[child_idx], out_idx, true);
				}
			}

			TransformValue(&union_value, avro_type.children[discriminant].second, union_vector, out_idx);
		} else { // directly recurse, we have dissolved the union
			TransformValue(&union_value, avro_type.children[discriminant].second, target, out_idx);
		}

		break;
	}
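	// Worked example for the union path above (illustrative; types assumed, not
	// taken from the original source): for the Avro union ["null","int","string"],
	// TransformSchema produces children u0 -> SQLNULL, u1 -> INTEGER,
	// u2 -> VARCHAR and a union_child_map of {1 -> 0, 2 -> 1}. A value with
	// discriminant 0 is emitted as NULL; discriminants 1 and 2 select members 0
	// and 1 of the DuckDB UNION(u1 INTEGER, u2 VARCHAR), with the non-selected
	// member vector set to NULL for that row.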
	case LogicalTypeId::ENUM: {
		auto enum_type = EnumType::GetPhysicalType(target.GetType());
		int enum_val;

		if (avro_value_get_enum(avro_val, &enum_val)) {
			throw InvalidInputException(avro_strerror());
		}
		if (enum_val < 0 || enum_val >= EnumType::GetSize(target.GetType())) {
			throw InvalidInputException("Enum value out of range");
		}

		switch (enum_type) {
		case PhysicalType::UINT8:
			FlatVector::GetData<uint8_t>(target)[out_idx] = enum_val;
			break;
		case PhysicalType::UINT16:
			FlatVector::GetData<uint16_t>(target)[out_idx] = enum_val;
			break;
		case PhysicalType::UINT32:
			FlatVector::GetData<uint32_t>(target)[out_idx] = enum_val;
			break;
		default:
			throw InternalException("Unsupported Enum Internal Type");
		}
		break;
	}

	case LogicalTypeId::LIST: {
		size_t list_len;

		if (avro_value_get_size(avro_val, &list_len)) {
			throw InvalidInputException(avro_strerror());
		}
		auto &child_vector = ListVector::GetEntry(target);
		auto child_offset = ListVector::GetListSize(target);
		ListVector::Reserve(target, child_offset + list_len);

		for (idx_t child_idx = 0; child_idx < list_len; child_idx++) {
			avro_value_t child_value;
			if (avro_value_get_by_index(avro_val, child_idx, &child_value, nullptr)) {
				throw InvalidInputException(avro_strerror());
			}
			TransformValue(&child_value, avro_type.children[0].second, child_vector, child_offset + child_idx);
		}
		auto list_vector_data = ListVector::GetData(target);
		list_vector_data[out_idx].length = list_len;
		list_vector_data[out_idx].offset = child_offset;
		ListVector::SetListSize(target, child_offset + list_len);

		break;
	}

	default:
		throw NotImplementedException(avro_type.duckdb_type.ToString());
	}
}

void AvroReader::Read(DataChunk &output) {
	idx_t out_idx = 0;

	while (avro_file_reader_read_value(reader, &value) == 0) {
		TransformValue(&value, avro_type, *read_vec, out_idx++);
		if (out_idx == STANDARD_VECTOR_SIZE) {
			break;
		}
	}
	// pull up root struct into output chunk
	if (duckdb_type.id() == LogicalTypeId::STRUCT) {
		for (idx_t col_idx = 0; col_idx < column_indexes.size(); col_idx++) {
			if (column_indexes[col_idx].GetPrimaryIndex() >= columns.size()) {
				continue; // to be filled in later
			}
			output.data[col_idx].Reference(
			    *StructVector::GetEntries(*read_vec)[column_indexes[col_idx].GetPrimaryIndex()]);
		}
	} else {
		output.data[column_indexes[0].GetPrimaryIndex()].Reference(*read_vec);
	}
	output.SetCardinality(out_idx);
}

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_extension.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb.hpp"

namespace duckdb {

class AvroExtension : public Extension {
public:
	void Load(DuckDB &db) override;
	std::string Name() override;
	std::string Version() const override;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_multi_file_info.hpp:
--------------------------------------------------------------------------------
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// avro_multi_file_info.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb/common/multi_file/multi_file_function.hpp"

namespace duckdb {

//! We might have avro specific options one day
class AvroFileReaderOptions : public BaseFileReaderOptions {};

struct AvroMultiFileInfo : MultiFileReaderInterface {
	static unique_ptr<MultiFileReaderInterface> InitializeInterface(ClientContext &context, MultiFileReader &reader,
	                                                                MultiFileList &file_list);

	unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
	                                                    optional_ptr<TableFunctionInfo> info) override;
	bool ParseCopyOption(ClientContext &context, const string &key, const vector<string> &values,
	                     BaseFileReaderOptions &options, vector<string> &expected_names,
	                     vector<LogicalType> &expected_types) override;

	bool ParseOption(ClientContext &context, const string &key, const Value &val, MultiFileOptions &file_options,
	                 BaseFileReaderOptions &options) override;

	unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &multi_file_data,
	                                                 unique_ptr<BaseFileReaderOptions> options) override;

	//! This is where the actual binding must happen, so in this function we either:
	//! 1. union_by_name = False. We set the schema/name depending on the first file
	//! 2. union_by_name = True. Not implemented yet; BindReader throws.
	void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
	                MultiFileBindData &bind_data) override;

	optional_idx MaxThreads(const MultiFileBindData &bind_data_p, const MultiFileGlobalState &global_state,
	                        FileExpandResult expand_result) override;

	unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context, MultiFileBindData &bind_data,
	                                                           MultiFileGlobalState &global_state) override;

	unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &context,
	                                                         GlobalTableFunctionState &function_state) override;

	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        BaseUnionData &union_data, const MultiFileBindData &bind_data_p) override;

	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
	                                        const OpenFileInfo &file, idx_t file_idx,
	                                        const MultiFileBindData &bind_data) override;

	shared_ptr<BaseFileReader> CreateReader(ClientContext &context, const OpenFileInfo &file,
	                                        BaseFileReaderOptions &options,
	                                        const MultiFileOptions &file_options) override;

	unique_ptr<NodeStatistics> GetCardinality(const MultiFileBindData &bind_data, idx_t file_count) override;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_reader.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb/common/helper.hpp"
#include "avro_type.hpp"
#include "duckdb/common/multi_file/base_file_reader.hpp"

namespace duckdb {

class AvroReader : public BaseFileReader {
public:
	AvroReader(ClientContext &context, const OpenFileInfo file);

	~AvroReader() {
		avro_value_decref(&value);
		avro_file_reader_close(reader);
	}

public:
	void Read(DataChunk &output);

	string GetReaderType() const override {
		return "Avro";
	}

	bool TryInitializeScan(ClientContext &context, GlobalTableFunctionState &gstate,
	                       LocalTableFunctionState &lstate) override;
	void Scan(ClientContext &context, GlobalTableFunctionState &global_state, LocalTableFunctionState &local_state,
	          DataChunk &chunk) override;

public:
	avro_file_reader_t reader;
	avro_value_t value;
	unique_ptr<Vector> read_vec;

	AllocatedData allocated_data;
	AvroType avro_type;
	LogicalType duckdb_type;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/src/include/avro_type.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "duckdb/common/types.hpp"
#include <avro.h>
#include "duckdb/common/optional_idx.hpp"

namespace duckdb {

struct AvroType {
public:
	AvroType() : duckdb_type(LogicalType::INVALID) {
	}
	AvroType(avro_type_t avro_type_p, LogicalType duckdb_type_p, child_list_t<AvroType> children_p = {},
	         unordered_map<idx_t, optional_idx> union_child_map_p = {})
	    : duckdb_type(duckdb_type_p), avro_type(avro_type_p), children(children_p),
	      union_child_map(union_child_map_p) {
	}

public:
	bool operator==(const AvroType &other) const {
		return duckdb_type == other.duckdb_type && avro_type == other.avro_type && children == other.children &&
		       union_child_map == other.union_child_map;
	}

public:
	// we use special transformation rules for unions with null:
	// 1) the null does not become a union entry and
	// 2) if there is only one entry the union disappears and is replaced by its
	// child
	static LogicalType TransformAvroType(const AvroType &avro_type) {
		child_list_t<LogicalType> children;

		switch (avro_type.duckdb_type.id()) {
		case LogicalTypeId::STRUCT: {
			for (auto &child : avro_type.children) {
				children.push_back(std::pair<string, LogicalType>(child.first, TransformAvroType(child.second)));
			}
			D_ASSERT(!children.empty());
			return LogicalType::STRUCT(std::move(children));
		}
		case LogicalTypeId::LIST:
			return LogicalType::LIST(TransformAvroType(avro_type.children[0].second));
		case LogicalTypeId::MAP: {
			child_list_t<LogicalType> children;
			children.push_back(std::pair<string, LogicalType>("key", LogicalType::VARCHAR));
			children.push_back(
			    std::pair<string, LogicalType>("value", TransformAvroType(avro_type.children[0].second)));
			return LogicalType::MAP(LogicalType::STRUCT(std::move(children)));
		}
		case LogicalTypeId::UNION: {
			for (auto &child : avro_type.children) {
				if (child.second.duckdb_type == LogicalTypeId::SQLNULL) {
					continue;
				}
				children.push_back(std::pair<string, LogicalType>(child.first, TransformAvroType(child.second)));
			}
			if (children.size() == 1) {
				return children[0].second;
			}
			if (children.empty()) {
				throw InvalidInputException("Empty union type");
			}
			return LogicalType::UNION(std::move(children));
		}
		default:
			return LogicalType(avro_type.duckdb_type);
		}
	}

public:
	LogicalType duckdb_type;
	avro_type_t avro_type;
	child_list_t<AvroType> children;
	unordered_map<idx_t, optional_idx> union_child_map;
};

} // namespace duckdb
--------------------------------------------------------------------------------
/test/4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro
--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
# Testing this extension
This directory contains all the tests for this extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the avro extension this should probably be the goal too.

The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests:
```bash
make test
```
or
```bash
make test_debug
```
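A minimal sketch of what such a test could look like (hypothetical file name and contents; `users.avro` and its two rows come from `create_test_file.py` below):

```
# name: test/sql/users_example.test
# group: [avro]

require avro

query TI
SELECT name, favorite_number FROM read_avro('test/users.avro') ORDER BY name;
----
Alyssa	256
Ben	7
```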
--------------------------------------------------------------------------------
/test/all_nullable_list.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/all_nullable_list.avro
--------------------------------------------------------------------------------
/test/avro.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/avro.avro
--------------------------------------------------------------------------------
/test/bigdata.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/bigdata.avro
--------------------------------------------------------------------------------
/test/broken_record.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/broken_record.avro
--------------------------------------------------------------------------------
/test/create_test_file.py:
--------------------------------------------------------------------------------
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter


json_schema = """
{"namespace": "example.avro",
 "type": "record",
 "name": "User",
 "fields": [
     {"name": "name", "type": "string"},
     {"name": "favorite_number", "type": ["int", "null"]},
     {"name": "favorite_color", "type": ["string", "null"]}
 ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), schema)
writer.append({"name": "Alyssa", "favorite_number": 256})
writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
writer.close()

reader = DataFileReader(open("users.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{"namespace": "example2.avro",
 "type": "int",
 "name": "my_int"
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("root-int.avro", "wb"), DatumWriter(), schema)
writer.append(42)
writer.append(43)

writer.close()

reader = DataFileReader(open("root-int.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "single_union", "type": ["int"]}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("single-union.avro", "wb"), DatumWriter(), schema)
writer.append({ "single_union":42})

writer.close()

reader = DataFileReader(open("single-union.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "null_first", "type": ["null","int"]}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("null_first.avro", "wb"), DatumWriter(), schema)
writer.append({ "null_first":42})
writer.append({})


writer.close()

reader = DataFileReader(open("null_first.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "null_last", "type": ["int","null"]}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("null_last.avro", "wb"), DatumWriter(), schema)
writer.append({ "null_last":42})
writer.append({})


writer.close()

reader = DataFileReader(open("null_last.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {"name": "null", "type": "null"},
      {"name": "boolean", "type": "boolean"},
      {"name": "int", "type": "int"},
      {"name": "long", "type": "long"},
      {"name": "float", "type": "float"},
      {"name": "double", "type": "double"},
      {"name": "bytes", "type": "bytes"},
      {"name": "string", "type": "string"}
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("primitive_types.avro", "wb"), DatumWriter(), schema)

writer.append({ 'null':None, 'boolean': False, 'int': -2147483648, 'long' : -9223372036854775808, 'float' : -3.4028235e+38, 'double' : -1.7976931348623157e+308, 'bytes' : 'thisisalongblob\x00withnullbytes'.encode(), 'string' : "🦆🦆🦆🦆🦆🦆"})
writer.append({ 'null':None, 'boolean': True, 'int': 2147483647, 'long' : 9223372036854775807, 'float' : 3.4028235e+38, 'double' : 1.7976931348623157e+308, 'bytes': '\x00\x00\x00a'.encode(), 'string' : 'goo'})


writer.close()

reader = DataFileReader(open("primitive_types.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{
    "type": "record",
    "name": "MySchema",
    "namespace": "com.company",
    "fields": [
        {
            "name": "color",
            "type": {
                "type": "enum",
                "name": "Color",
                "symbols": [
                    "UNKNOWN",
                    "GREEN",
                    "RED"
                ]
            },
            "default": "UNKNOWN"
        }
    ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("enum.avro", "wb"), DatumWriter(), schema)

writer.append({ 'color': 'GREEN'})
writer.append({ 'color': 'GREEN'})
writer.append({ 'color': 'RED'})
writer.append({ 'color': 'UNKNOWN'})
writer.append({ 'color': 'UNKNOWN'})

writer.close()

reader = DataFileReader(open("enum.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "md5",
         "type": {"type": "fixed", "size": 32, "name": "md5"}
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("fixed.avro", "wb"), DatumWriter(), schema)

writer.append({ 'md5' : '47336f3f2497b70ac046cf23298e20a7'.encode()})
writer.append({ 'md5' : 'a789a15a7ff7db4a0d1b186363ef0771'.encode()})
writer.append({ 'md5' : 'c9db7c67a6acb5a65c78b19e9e01d7b0'.encode()})
writer.append({ 'md5' : 'ac441296bcbd44442301204a8f061cf2'.encode()})

writer.close()

reader = DataFileReader(open("fixed.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": {
            "type": "array",
            "items" : "string",
            "default": []
         }
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("string_array.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : ['is', 'cool','array']})
writer.append({ 'string_arr' : ['data']})

writer.close()

reader = DataFileReader(open("string_array.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "long_map",
         "type": {
            "type": "map",
            "values" : "long",
            "default": {}
         }
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("long_map.avro", "wb"), DatumWriter(), schema)

writer.append({ 'long_map' : {'one': 42}})
writer.append({ 'long_map' : {'two': 43}})
writer.append({ 'long_map' : {'three': 44}})

writer.close()

reader = DataFileReader(open("long_map.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": ["null", {
            "type": "array",
            "items" : "string",
            "default": []
         }]
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("nullable_string_array.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : ['is', 'cool','array']})
writer.append({ 'string_arr' : ['data']})

writer.close()

reader = DataFileReader(open("nullable_string_array.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": {
            "type": "array",
            "items" : ["string", "null"],
            "default": []
         }
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("nullable_entry_string_array.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,None, 'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : [None]})
writer.append({ 'string_arr' : [None, None, None]})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : [None, 'is', 'cool',None, 'array',None]})
writer.append({ 'string_arr' : ['data',None]})

writer.close()

reader = DataFileReader(open("nullable_entry_string_array.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "string_arr",
         "type": ["null", {
            "type": "array",
            "items" : ["string", "null"],
            "default": []
         }]
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("all_nullable_list.avro", "wb"), DatumWriter(), schema)

writer.append({ 'string_arr' : ['Hello' ,None, 'World']})
writer.append({ 'string_arr' : ['this']})
writer.append({ 'string_arr' : [None]})
writer.append({ 'string_arr' : [None, None, None]})
writer.append({ 'string_arr' : []})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : None})
writer.append({ 'string_arr' : [None, 'is', 'cool',None, 'array',None]})
writer.append({ 'string_arr' : ['data',None]})

writer.close()

reader = DataFileReader(open("all_nullable_list.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{ "type": "record",
  "name": "root",
  "fields": [
      {
         "name": "nested_ints",
         "type": ["null", {
            "type": "array",
            "items" : ["null", {
               "type": "array",
               "items" : ["int", "null"],
               "default": []
            }],
            "default": []
         }]
      }
  ]
}
"""

schema = avro.schema.parse(json_schema)

writer = DataFileWriter(open("nested_nullable_lists.avro", "wb"), DatumWriter(), schema)

writer.append({ 'nested_ints' : None})
writer.append({ 'nested_ints' : [None]})
writer.append({ 'nested_ints' : [[None], [None]]})
writer.append({ 'nested_ints' : [None, None]})
writer.append({ 'nested_ints' : [[42]]})
writer.append({ 'nested_ints' : [[42], [43]]})
writer.append({ 'nested_ints' : [[42, 43]]})
writer.append({ 'nested_ints' : [[42, 43], None, [44, 45]]})
writer.append({ 'nested_ints' : [[42, None, 43, None], None, [44, None, 45, None], None, [46]]})

writer.close()

reader = DataFileReader(open("nested_nullable_lists.avro", "rb"), DatumReader())
for user in reader:
    print(user)
reader.close()


json_schema = """
{
  "type": "record",
  "name": "LongList",
  "fields" : [
    {"name":
"value", "type": "long"}, 513 | {"name": "next", "type": ["null", "LongList"]} 514 | ] 515 | } 516 | """ 517 | 518 | schema = avro.schema.parse(json_schema) 519 | 520 | writer = DataFileWriter(open("recursive.avro", "wb"), DatumWriter(), schema) 521 | 522 | 523 | writer.append({ 'value': 42}) 524 | writer.append({ 'value': 43, 'next' : {'value': 44}}) 525 | writer.append({ 'value': 43, 'next' : {'value': 44, 'next' : {'value': 45}}}) 526 | 527 | writer.close() 528 | 529 | reader = DataFileReader(open("recursive.avro", "rb"), DatumReader()) 530 | for user in reader: 531 | print(user) 532 | reader.close() 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | json_schema = """ 541 | { "type": "record", 542 | "name": "root", 543 | "fields": [ 544 | {"name": "n", "type": "null"} 545 | ] 546 | } 547 | """ 548 | 549 | schema = avro.schema.parse(json_schema) 550 | 551 | writer = DataFileWriter(open("broken_record.avro", "wb"), DatumWriter(), schema) 552 | 553 | writer.append({}) 554 | writer.append({}) 555 | 556 | # writer.append({ 'value': 42}) 557 | # writer.append({ 'value': 43, 'next' : {'value': 44}}) 558 | # writer.append({ 'value': 43, 'next' : {'value': 44, 'next' : {'value': 45}}}) 559 | 560 | writer.close() 561 | 562 | reader = DataFileReader(open("broken_record.avro", "rb"), DatumReader()) 563 | for user in reader: 564 | print(user) 565 | reader.close() 566 | 567 | 568 | 569 | 570 | 571 | 572 | # record 573 | # detect recursive types or what happens here? 574 | 575 | 576 | # union by name 577 | 578 | 579 | json_schema = """ 580 | { "type": "record", 581 | "name": "root", 582 | "fields": [ 583 | {"name": "one", "type": "int"}, 584 | {"name": "two", "type": "double"}, 585 | {"name": "three", "type": "string"} 586 | ] 587 | } 588 | """ 589 | 590 | schema = avro.schema.parse(json_schema) 591 | 592 | writer = DataFileWriter(open("union-name-1.avro", "wb"), DatumWriter(), schema) 593 | 594 | 595 | 596 | writer.append({ 'one' : 10, 'two' : 2.0, 'three': 's30'}) 597 | writer.append({ 'one' : 11, 'two' : 2.1, 'three': 's31'}) 598 | 599 | 600 | writer.close() 601 | 602 | reader = DataFileReader(open("union-name-1.avro", "rb"), DatumReader()) 603 | for user in reader: 604 | print(user) 605 | reader.close() 606 | 607 | 608 | 609 | 610 | json_schema = """ 611 | { "type": "record", 612 | "name": "root", 613 | "fields": [ 614 | {"name": "two", "type": "double"}, 615 | {"name": "one", "type": "int"}, 616 | {"name": "three", "type": "string"} 617 | ] 618 | } 619 | """ 620 | 621 | schema = avro.schema.parse(json_schema) 622 | 623 | writer = DataFileWriter(open("union-name-2.avro", "wb"), DatumWriter(), schema) 624 | 625 | 626 | 627 | writer.append({ 'one' : 12, 'two' : 2.2, 'three': 's32'}) 628 | writer.append({ 'one' : 13, 'two' : 2.3, 'three': 's33'}) 629 | 630 | 631 | writer.close() 632 | 633 | reader = DataFileReader(open("union-name-2.avro", "rb"), DatumReader()) 634 | for user in reader: 635 | print(user) 636 | reader.close() 637 | 638 | 639 | 640 | json_schema = """ 641 | { "type": "record", 642 | "name": "root", 643 | "fields": [ 644 | {"name": "three", "type": "string"}, 645 | {"name": "two", "type": "double"}, 646 | {"name": "one", "type": "int"} 647 | ] 648 | } 649 | """ 650 | 651 | schema = avro.schema.parse(json_schema) 652 | 653 | writer = DataFileWriter(open("union-name-3.avro", "wb"), DatumWriter(), schema) 654 | 655 | 656 | 657 | writer.append({ 'one' : 14, 'two' : 2.4, 'three': 's34'}) 658 | writer.append({ 'one' : 15, 'two' : 2.5, 'three': 's35'}) 659 | 660 | 661 | writer.close() 662 | 663 | 
reader = DataFileReader(open("union-name-3.avro", "rb"), DatumReader()) 664 | for user in reader: 665 | print(user) 666 | reader.close() 667 | 668 | 669 | 670 | 671 | json_schema = """ 672 | { 673 | "type": "record", 674 | "name": "Request", 675 | "namespace": "example.avro", 676 | "fields": [ 677 | { 678 | "name": "request_id", 679 | "type": "string" 680 | }, 681 | { 682 | "name": "client_version", 683 | "type": { 684 | "type": "record", 685 | "name": "Version", 686 | "fields": [ 687 | { 688 | "name": "major", 689 | "type": "int" 690 | }, 691 | { 692 | "name": "minor", 693 | "type": "int" 694 | } 695 | ] 696 | } 697 | }, 698 | { 699 | "name": "server_version", 700 | "type": "Version" 701 | } 702 | ] 703 | } 704 | """ 705 | 706 | 707 | 708 | schema = avro.schema.parse(json_schema) 709 | 710 | writer = DataFileWriter(open("reuse-1.avro", "wb"), DatumWriter(), schema) 711 | 712 | 713 | writer.append({ 'request_id' : 'hello', 'client_version' : {'major': 4, 'minor' : 2}, 'server_version': {'major': 8, 'minor' : 5}}) 714 | writer.append({ 'request_id' : 'world', 'client_version' : {'major': 5, 'minor' : 3}, 'server_version': {'major': 9, 'minor' : 6}}) 715 | 716 | 717 | writer.close() 718 | 719 | reader = DataFileReader(open("reuse-1.avro", "rb"), DatumReader()) 720 | for user in reader: 721 | print(user) 722 | reader.close() 723 | 724 | 725 | 726 | 727 | json_schema = """ 728 | { 729 | "type": "record", 730 | "name": "Request", 731 | "namespace": "example.avro", 732 | "fields": [ 733 | { 734 | "name": "version", 735 | "type": { 736 | "type": "record", 737 | "name": "Version", 738 | "fields": [ 739 | { "name": "major", "type": "int" }, 740 | { "name": "minor", "type": "int" } 741 | ] 742 | } 743 | }, 744 | { 745 | "name": "details", 746 | "type": { 747 | "type": "record", 748 | "name": "Details", 749 | "fields": [ 750 | { "name": "release_version", "type": "Version" } 751 | ] 752 | } 753 | } 754 | ] 755 | } 756 | """ 757 | 758 | 759 | 760 | schema = avro.schema.parse(json_schema) 761 | 762 | writer = DataFileWriter(open("reuse-2.avro", "wb"), DatumWriter(), schema) 763 | 764 | 765 | writer.append({ 'version' : {'major': 4, 'minor' : 2}, 'details': {'release_version': {'major': 8, 'minor' : 5}}}) 766 | writer.append({ 'version' : {'major': 5, 'minor' : 3}, 'details': {'release_version': {'major': 9, 'minor' : 6}}}) 767 | 768 | 769 | writer.close() 770 | 771 | reader = DataFileReader(open("reuse-2.avro", "rb"), DatumReader()) 772 | for user in reader: 773 | print(user) 774 | reader.close() 775 | 776 | 777 | 778 | 779 | json_schema = """ 780 | {"type": "record", 781 | "name": "root", 782 | "fields": [ 783 | {"name": "c0", "type": "long"}, 784 | {"name": "c1", "type": "long"}, 785 | {"name": "c2", "type": "long"}, 786 | {"name": "c3", "type": "long"}, 787 | {"name": "c4", "type": "long"}, 788 | {"name": "c5", "type": "long"}, 789 | {"name": "c6", "type": "long"}, 790 | {"name": "c7", "type": "long"}, 791 | {"name": "c8", "type": "long"}, 792 | {"name": "c9", "type": "long"} 793 | ] 794 | } 795 | """ 796 | 797 | n = 100000 798 | 799 | schema = avro.schema.parse(json_schema) 800 | 801 | writer = DataFileWriter(open("bigdata.avro", "wb"), DatumWriter(), schema, codec="deflate") 802 | for r in range(1000000): 803 | writer.append({f'c{i}': 10000000*i + r for i in range(10)}) 804 | 805 | writer.close() 806 | 807 | 808 | count = 0 809 | reader = DataFileReader(open("bigdata.avro", "rb"), DatumReader()) 810 | for user in reader: 811 | count = count + 1 812 | reader.close() 813 | print(count) 814 | 815 | 
816 | 817 | 818 | 819 | 820 | -------------------------------------------------------------------------------- /test/empty_record.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/empty_record.avro -------------------------------------------------------------------------------- /test/enum.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/enum.avro -------------------------------------------------------------------------------- /test/fixed.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/fixed.avro -------------------------------------------------------------------------------- /test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro -------------------------------------------------------------------------------- /test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro -------------------------------------------------------------------------------- /test/iceberg/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro -------------------------------------------------------------------------------- /test/iceberg/cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro -------------------------------------------------------------------------------- /test/iceberg/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro -------------------------------------------------------------------------------- /test/iceberg/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro -------------------------------------------------------------------------------- /test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro -------------------------------------------------------------------------------- /test/logical_types.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/logical_types.avro -------------------------------------------------------------------------------- /test/long_map.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/long_map.avro -------------------------------------------------------------------------------- /test/manifest.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/manifest.avro -------------------------------------------------------------------------------- /test/nested_nullable_lists.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/nested_nullable_lists.avro -------------------------------------------------------------------------------- /test/null_first.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/null_first.avro -------------------------------------------------------------------------------- /test/null_last.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/null_last.avro -------------------------------------------------------------------------------- /test/nullable_entry_string_array.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/nullable_entry_string_array.avro -------------------------------------------------------------------------------- /test/nullable_string_array.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/nullable_string_array.avro -------------------------------------------------------------------------------- /test/part-r-00000.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/part-r-00000.avro -------------------------------------------------------------------------------- /test/primitive_types.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/primitive_types.avro -------------------------------------------------------------------------------- /test/query_small.avro: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/query_small.avro -------------------------------------------------------------------------------- /test/recursive.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/recursive.avro -------------------------------------------------------------------------------- /test/reuse-1.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/reuse-1.avro -------------------------------------------------------------------------------- /test/reuse-2.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/reuse-2.avro -------------------------------------------------------------------------------- /test/root-int.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/root-int.avro -------------------------------------------------------------------------------- /test/single-union.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/single-union.avro -------------------------------------------------------------------------------- /test/sql/avro.test: -------------------------------------------------------------------------------- 1 | # name: test/sql/avro.test 2 | # description: test avro extension 3 | # group: [avro] 4 | 5 | require avro 6 | 7 | statement ok 8 | PRAGMA enable_verification 9 | 10 | # usual suspect, the userdata1 file 11 | query II 12 | select column_name, column_type from (DESCRIBE FROM read_avro('test/userdata1.avro')); 13 | ---- 14 | registration_dttm VARCHAR 15 | id BIGINT 16 | first_name VARCHAR 17 | last_name VARCHAR 18 | email VARCHAR 19 | gender VARCHAR 20 | ip_address VARCHAR 21 | cc BIGINT 22 | country VARCHAR 23 | birthdate VARCHAR 24 | salary DOUBLE 25 | title VARCHAR 26 | comments VARCHAR 27 | 28 | 29 | query I 30 | FROM read_avro('test/userdata1.avro') SELECT COUNT(*) 31 | ---- 32 | 1000 33 | 34 | query III 35 | FROM read_avro('test/userdata1.avro') SELECT first_name, cc, salary ORDER BY registration_dttm LIMIT 10; 36 | ---- 37 | Lillian 201713786459078 282503.77 38 | Chris 5602220700741429 NULL 39 | Nicholas 3575506969751259 192076.79 40 | Johnny 5602239825516409 169429.76 41 | Bruce NULL 118244.57 42 | Heather NULL 164117.18 43 | Larry 3531208154739438 139177.38 44 | Roy 3589146577885209 262816.87 45 | James 3589416270039051 211553.57 46 | Sean NULL NULL 47 | 48 | # usual suspect, the userdata1 file, this time with a filename arg 49 | query II 50 | select column_name, column_type from (DESCRIBE FROM read_avro('test/userdata1.avro', filename=true)); 51 | ---- 52 | registration_dttm VARCHAR 53 | id BIGINT 54 | first_name VARCHAR 55 | last_name VARCHAR 56 | email VARCHAR 57 | gender VARCHAR 58 | ip_address VARCHAR 59 | cc BIGINT 60 | country VARCHAR 61 | birthdate VARCHAR 62 | salary DOUBLE 63 | title VARCHAR 64 | comments VARCHAR 65 | filename VARCHAR 66 | 67 | 68 | query IIII 69 | FROM read_avro('test/userdata1.avro', filename=true) 
SELECT first_name, cc, salary, filename ORDER BY registration_dttm LIMIT 10; 70 | ---- 71 | Lillian 201713786459078 282503.77 test/userdata1.avro 72 | Chris 5602220700741429 NULL test/userdata1.avro 73 | Nicholas 3575506969751259 192076.79 test/userdata1.avro 74 | Johnny 5602239825516409 169429.76 test/userdata1.avro 75 | Bruce NULL 118244.57 test/userdata1.avro 76 | Heather NULL 164117.18 test/userdata1.avro 77 | Larry 3531208154739438 139177.38 test/userdata1.avro 78 | Roy 3589146577885209 262816.87 test/userdata1.avro 79 | James 3589416270039051 211553.57 test/userdata1.avro 80 | Sean NULL NULL test/userdata1.avro 81 | 82 | 83 | 84 | # with filename wildcard 85 | query III 86 | FROM read_avro('test/userdata*.avro', filename=true) SELECT filename[6:], count(*), max(salary) GROUP BY filename ORDER BY filename; 87 | ---- 88 | userdata1.avro 1000 286592.99 89 | userdata2.avro 998 286587.01 90 | userdata3.avro 1000 286735.82 91 | userdata4.avro 1000 286147.64 92 | userdata5.avro 1000 286384.03 93 | 94 | 95 | query II 96 | select column_name, column_type from (DESCRIBE FROM read_avro('test/users.avro')); 97 | ---- 98 | name VARCHAR 99 | favorite_number INTEGER 100 | favorite_color VARCHAR 101 | 102 | statement error 103 | from read_avro(['test/userdata1.avro', 'test/users.avro']) 104 | ---- 105 | schema mismatch in glob 106 | 107 | # example from readme 108 | query III 109 | FROM read_avro('test/users.avro') 110 | ---- 111 | Alyssa 256 NULL 112 | Ben 7 red 113 | 114 | query II 115 | select column_name, column_type from (DESCRIBE FROM read_avro('test/single-union.avro')); 116 | ---- 117 | single_union INTEGER 118 | 119 | # example from readme 120 | query I 121 | FROM read_avro('test/single-union.avro') 122 | ---- 123 | 42 124 | 125 | query II 126 | select column_name, column_type from (DESCRIBE FROM read_avro('test/null_first.avro')); 127 | ---- 128 | null_first INTEGER 129 | 130 | # example from readme 131 | query I 132 | FROM read_avro('test/null_first.avro') 133 | ---- 134 | 42 135 | NULL 136 | 137 | query II 138 | select column_name, column_type from (DESCRIBE FROM read_avro('test/null_last.avro')); 139 | ---- 140 | null_last INTEGER 141 | 142 | # example from readme 143 | query I 144 | FROM read_avro('test/null_last.avro') 145 | ---- 146 | 42 147 | NULL 148 | 149 | query II 150 | select column_name, column_type from (DESCRIBE FROM read_avro('test/primitive_types.avro')); 151 | ---- 152 | null INTEGER 153 | boolean BOOLEAN 154 | int INTEGER 155 | long BIGINT 156 | float FLOAT 157 | double DOUBLE 158 | bytes BLOB 159 | string VARCHAR 160 | 161 | # example from readme 162 | query IIIIIIII 163 | FROM read_avro('test/primitive_types.avro') 164 | ---- 165 | NULL 0 -2147483648 -9223372036854775808 -3.4028235e+38 -1.7976931348623157e+308 thisisalongblob\x00withnullbytes 🦆🦆🦆🦆🦆🦆 166 | NULL 1 2147483647 9223372036854775807 3.4028235e+38 1.7976931348623157e+308 \x00\x00\x00a goo 167 | 168 | 169 | 170 | query II 171 | select column_name, column_type from (DESCRIBE FROM read_avro('test/enum.avro')); 172 | ---- 173 | color ENUM('UNKNOWN', 'GREEN', 'RED') 174 | 175 | query I 176 | FROM read_avro('test/enum.avro') 177 | ---- 178 | GREEN 179 | GREEN 180 | RED 181 | UNKNOWN 182 | UNKNOWN 183 | 184 | query II 185 | select column_name, column_type from (DESCRIBE FROM read_avro('test/fixed.avro')); 186 | ---- 187 | md5 BLOB 188 | 189 | query I 190 | FROM read_avro('test/fixed.avro') 191 | ---- 192 | 47336f3f2497b70ac046cf23298e20a7 193 | a789a15a7ff7db4a0d1b186363ef0771 194 | c9db7c67a6acb5a65c78b19e9e01d7b0 
195 | ac441296bcbd44442301204a8f061cf2 196 | 197 | 198 | 199 | query II 200 | select column_name, column_type from (DESCRIBE FROM read_avro('test/string_array.avro')); 201 | ---- 202 | string_arr VARCHAR[] 203 | 204 | query I 205 | FROM read_avro('test/string_array.avro') 206 | ---- 207 | [Hello, World] 208 | [this] 209 | [] 210 | [is, cool, array] 211 | [data] 212 | 213 | 214 | query II 215 | select column_name, column_type from (DESCRIBE FROM read_avro('test/nullable_string_array.avro')); 216 | ---- 217 | string_arr VARCHAR[] 218 | 219 | query I 220 | FROM read_avro('test/nullable_string_array.avro') 221 | ---- 222 | [Hello, World] 223 | [this] 224 | [] 225 | NULL 226 | NULL 227 | [is, cool, array] 228 | [data] 229 | 230 | 231 | query II 232 | select column_name, column_type from (DESCRIBE FROM read_avro('test/nullable_entry_string_array.avro')); 233 | ---- 234 | string_arr VARCHAR[] 235 | 236 | query I 237 | FROM read_avro('test/nullable_entry_string_array.avro') 238 | ---- 239 | [Hello, NULL, World] 240 | [this] 241 | [NULL] 242 | [NULL, NULL, NULL] 243 | [] 244 | [NULL, is, cool, NULL, array, NULL] 245 | [data, NULL] 246 | 247 | 248 | query II 249 | select column_name, column_type from (DESCRIBE FROM read_avro('test/all_nullable_list.avro')); 250 | ---- 251 | string_arr VARCHAR[] 252 | 253 | query I 254 | FROM read_avro('test/all_nullable_list.avro') 255 | ---- 256 | [Hello, NULL, World] 257 | [this] 258 | [NULL] 259 | [NULL, NULL, NULL] 260 | [] 261 | NULL 262 | NULL 263 | [NULL, is, cool, NULL, array, NULL] 264 | [data, NULL] 265 | 266 | 267 | query II 268 | select column_name, column_type from (DESCRIBE FROM read_avro('test/nested_nullable_lists.avro')); 269 | ---- 270 | nested_ints INTEGER[][] 271 | 272 | query I 273 | FROM read_avro('test/nested_nullable_lists.avro') 274 | ---- 275 | NULL 276 | [NULL] 277 | [[NULL], [NULL]] 278 | [NULL, NULL] 279 | [[42]] 280 | [[42], [43]] 281 | [[42, 43]] 282 | [[42, 43], NULL, [44, 45]] 283 | [[42, NULL, 43, NULL], NULL, [44, NULL, 45, NULL], NULL, [46]] 284 | 285 | 286 | query II 287 | select column_name, column_type from (DESCRIBE FROM read_avro('test/long_map.avro')); 288 | ---- 289 | long_map MAP(VARCHAR, BIGINT) 290 | 291 | query I 292 | FROM read_avro('test/long_map.avro') 293 | ---- 294 | {one=42} 295 | {two=43} 296 | {three=44} 297 | 298 | 299 | statement error 300 | from read_avro('does-not-exist.avro') 301 | ---- 302 | No files found that match the pattern 303 | 304 | statement error 305 | from read_avro('CMakeLists.txt') 306 | ---- 307 | Incorrect Avro container file magic number 308 | 309 | 310 | statement error 311 | FROM read_avro('test/recursive.avro') 312 | ---- 313 | Recursive Avro types not supported: LongList 314 | 315 | query II 316 | select column_name, column_type from (DESCRIBE FROM read_avro('test/broken_record.avro')); 317 | ---- 318 | n INTEGER 319 | 320 | query I 321 | FROM read_avro('test/broken_record.avro') 322 | ---- 323 | NULL 324 | NULL 325 | 326 | 327 | 328 | 329 | query II 330 | select column_name, column_type from (DESCRIBE FROM read_avro('test/query_small.avro')); 331 | ---- 332 | avro_schema UNION(u0 STRUCT("data" BLOB), u1 STRUCT(fatal BOOLEAN, "name" VARCHAR, description VARCHAR, "position" BIGINT), u2 STRUCT(bytesScanned BIGINT, totalBytes BIGINT), u3 STRUCT(totalBytes BIGINT)) 333 | 334 | query I 335 | FROM read_avro('test/query_small.avro') 336 | ---- 337 | {'data': 
'100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A100,200,300,400\\x0A300,400,500,600\\x0A'} 338 | {'bytesScanned': 1024, 'totalBytes': 1024} 339 | {'totalBytes': 1024} 340 | 341 | 342 | 343 | 344 | query II 345 | select column_name, column_type from (DESCRIBE FROM read_avro('test/avro.avro')); 346 | ---- 347 | visitor STRUCT(cookie_id VARCHAR, segments STRUCT(id INTEGER, expiration BIGINT)[], edges MAP(VARCHAR, BIGINT), behaviors MAP(VARCHAR, MAP(VARCHAR, INTEGER)), birthdate BIGINT, association_ids MAP(VARCHAR, VARCHAR)) 348 | events STRUCT(cookie_id VARCHAR, tstamp BIGINT, edge VARCHAR, changes UNION(u0 STRUCT(daystamp VARCHAR, context VARCHAR, "type" ENUM('ADX', 'RETARGET'), count INTEGER), u1 STRUCT(operation ENUM('ADD', 'REPLACE', 'UPDATE', 'REMOVE'), association_id VARCHAR, network VARCHAR, segments INTEGER[])))[] 349 | 350 | 351 | query II 352 | FROM read_avro('test/avro.avro') 353 | ---- 354 | {'cookie_id': 133263e9e100000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 133263e9e100000, 'tstamp': 1403721385042, 'edge': batchimport, 'changes': {'operation': REMOVE, 'association_id': NULL, 'network': et, 'segments': [49118]}}, {'cookie_id': 133263e9e100000, 'tstamp': 1403721385042, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49117]}}] 355 | {'cookie_id': 134adb391b00000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 134adb391b00000, 'tstamp': 1403721376988, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 356 | {'cookie_id': 1317beb84b00000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 1317beb84b00000, 'tstamp': 1403721380452, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 357 | {'cookie_id': 12b811f59080000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 12b811f59080000, 'tstamp': 1403721375367, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 358 | {'cookie_id': 134338dcf180000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 
'association_ids': {}} [{'cookie_id': 134338dcf180000, 'tstamp': 1403721380483, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 359 | {'cookie_id': 12aa3637e280000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 12aa3637e280000, 'tstamp': 1403721383922, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 360 | {'cookie_id': 133bc9432a80000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 133bc9432a80000, 'tstamp': 1403721385810, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 361 | {'cookie_id': 134dbec12c80000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 134dbec12c80000, 'tstamp': 1403721376778, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49117]}}] 362 | {'cookie_id': 12bd0b8c6201000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 12bd0b8c6201000, 'tstamp': 1403721384549, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49118]}}] 363 | {'cookie_id': 13114ef2b401000, 'segments': [], 'edges': {}, 'behaviors': {}, 'birthdate': 0, 'association_ids': {}} [{'cookie_id': 13114ef2b401000, 'tstamp': 1403721375994, 'edge': batchimport, 'changes': {'operation': ADD, 'association_id': NULL, 'network': et, 'segments': [49117]}}] 364 | 365 | 366 | 367 | 368 | query II 369 | select column_name, column_type from (DESCRIBE FROM read_avro('test/part-r-00000.avro')); 370 | ---- 371 | string VARCHAR 372 | simple_map MAP(VARCHAR, INTEGER) 373 | complex_map MAP(VARCHAR, MAP(VARCHAR, VARCHAR)) 374 | union_string_null VARCHAR 375 | union_int_long_null UNION(u0 INTEGER, u1 BIGINT) 376 | union_float_double UNION(u0 FLOAT, u1 DOUBLE) 377 | fixed3 BLOB 378 | fixed2 BLOB 379 | enum ENUM('SPADES', 'HEARTS', 'DIAMONDS', 'CLUBS') 380 | record STRUCT(value_field VARCHAR) 381 | array_of_boolean BOOLEAN[] 382 | bytes BLOB 383 | 384 | statement ok 385 | FROM read_avro('test/part-r-00000.avro') 386 | 387 | # iceberg yay 388 | query II 389 | select column_name, column_type from (DESCRIBE FROM read_avro('test/4551fe85-feb8-43ec-8408-730e593c8b12-m0.avro')); 390 | ---- 391 | status INTEGER 392 | snapshot_id BIGINT 393 | data_file STRUCT(file_path VARCHAR, file_format VARCHAR, "partition" INTEGER, record_count BIGINT, file_size_in_bytes BIGINT, block_size_in_bytes BIGINT, file_ordinal INTEGER, sort_columns INTEGER[], column_sizes STRUCT("key" INTEGER, "value" BIGINT)[], value_counts STRUCT("key" INTEGER, "value" BIGINT)[], null_value_counts STRUCT("key" INTEGER, "value" BIGINT)[], lower_bounds STRUCT("key" INTEGER, "value" BLOB)[], upper_bounds STRUCT("key" INTEGER, "value" BLOB)[], key_metadata BLOB, split_offsets BIGINT[]) 394 | 395 | # usual suspect, the userdata1 file, this time with a wildcard 396 | query II 397 | select column_name, column_type from (DESCRIBE FROM read_avro('test/userdata1*.avro')); 398 | ---- 399 | registration_dttm VARCHAR 400 | id BIGINT 401 | first_name VARCHAR 402 | last_name VARCHAR 403 | email VARCHAR 404 | gender VARCHAR 405 | ip_address VARCHAR 406 | cc BIGINT 407 | country VARCHAR 408 | birthdate VARCHAR 409 | salary DOUBLE 410 | title VARCHAR 411 | comments VARCHAR 412 | 413 | 414 | # 
union by name
415 | statement error
416 | FROM read_avro('test/union-name-*.avro', filename=true, union_by_name=true) order by one;
417 | ----
418 | Not implemented Error: 'union_by_name' not implemented for Avro reader yet
419 | 
420 | 
421 | # TODO: add test where schemas can't be combined
422 | 
423 | # files with different schemas that can be safely combined are okay
424 | query III
425 | select * exclude filename FROM read_avro('test/union-name-*.avro', filename=true) order by all;
426 | ----
427 | 10 2.0 s30
428 | 11 2.1 s31
429 | 12 2.2 s32
430 | 13 2.3 s33
431 | 14 2.4 s34
432 | 15 2.5 s35
433 | 
434 | 
435 | 
436 | query II
437 | select column_name, column_type from (DESCRIBE FROM read_avro('test/reuse-1.avro'));
438 | ----
439 | request_id VARCHAR
440 | client_version STRUCT(major INTEGER, minor INTEGER)
441 | server_version STRUCT(major INTEGER, minor INTEGER)
442 | 
443 | query III
444 | FROM read_avro('test/reuse-1.avro')
445 | ----
446 | hello {'major': 4, 'minor': 2} {'major': 8, 'minor': 5}
447 | world {'major': 5, 'minor': 3} {'major': 9, 'minor': 6}
448 | 
449 | 
450 | 
451 | query II
452 | select column_name, column_type from (DESCRIBE FROM read_avro('test/reuse-2.avro'));
453 | ----
454 | version STRUCT(major INTEGER, minor INTEGER)
455 | details STRUCT(release_version STRUCT(major INTEGER, minor INTEGER))
456 | 
457 | query II
458 | FROM read_avro('test/reuse-2.avro')
459 | ----
460 | {'major': 4, 'minor': 2} {'release_version': {'major': 8, 'minor': 5}}
461 | {'major': 5, 'minor': 3} {'release_version': {'major': 9, 'minor': 6}}
462 | 
463 | 
464 | query II
465 | select column_name, column_type from (DESCRIBE FROM read_avro('test/union.avro'));
466 | ----
467 | event UNION(u0 STRUCT(id VARCHAR, "timestamp" BIGINT, "data" VARCHAR), u1 STRUCT(id VARCHAR, "timestamp" BIGINT, updatedData VARCHAR), u2 STRUCT(id VARCHAR, "timestamp" BIGINT))
468 | 
469 | query I
470 | FROM read_avro('test/union.avro')
471 | ----
472 | {'id': 1, 'timestamp': 1704367260, 'data': New record created}
473 | {'id': 1, 'timestamp': 1704367360, 'updatedData': Record updated}
474 | {'id': 1, 'timestamp': 1704367460}
475 | 
476 | 
--------------------------------------------------------------------------------
/test/sql/bigdata.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/bigdata.test
2 | # description: test avro extension on a larger file
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | 
8 | query I
9 | FROM read_avro('test/bigdata.avro') SELECT count(*)
10 | ----
11 | 1000000
12 | 
13 | 
14 | query IIIIIIIIII
15 | FROM read_avro('test/bigdata.avro') ORDER BY c1 LIMIT 10
16 | ----
17 | 0 10000000 20000000 30000000 40000000 50000000 60000000 70000000 80000000 90000000
18 | 1 10000001 20000001 30000001 40000001 50000001 60000001 70000001 80000001 90000001
19 | 2 10000002 20000002 30000002 40000002 50000002 60000002 70000002 80000002 90000002
20 | 3 10000003 20000003 30000003 40000003 50000003 60000003 70000003 80000003 90000003
21 | 4 10000004 20000004 30000004 40000004 50000004 60000004 70000004 80000004 90000004
22 | 5 10000005 20000005 30000005 40000005 50000005 60000005 70000005 80000005 90000005
23 | 6 10000006 20000006 30000006 40000006 50000006 60000006 70000006 80000006 90000006
24 | 7 10000007 20000007 30000007 40000007 50000007 60000007 70000007 80000007 90000007
25 | 8 10000008 20000008 30000008 40000008 50000008 60000008 70000008 80000008 90000008
26 | 9 10000009 20000009 30000009 40000009 50000009 60000009 70000009 80000009 90000009
27 | 
28 | 
29 | query IIIIIIIIII
30 | FROM read_avro('test/bigdata.avro') SELECT SUM(columns(*))
31 | ----
32 | 499999500000 10499999500000 20499999500000 30499999500000 40499999500000 50499999500000 60499999500000 70499999500000 80499999500000 90499999500000
33 | 
--------------------------------------------------------------------------------
/test/sql/external_file_cache.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/external_file_cache.test
2 | # description: test avro extension and external file cache
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | query IIII
8 | from duckdb_external_file_cache();
9 | ----
10 | 
11 | query I
12 | FROM read_avro('test/bigdata.avro') SELECT count(*)
13 | ----
14 | 1000000
15 | 
16 | query IIIIIIIIII
17 | FROM read_avro('test/bigdata.avro') ORDER BY c1 LIMIT 10
18 | ----
19 | 0 10000000 20000000 30000000 40000000 50000000 60000000 70000000 80000000 90000000
20 | 1 10000001 20000001 30000001 40000001 50000001 60000001 70000001 80000001 90000001
21 | 2 10000002 20000002 30000002 40000002 50000002 60000002 70000002 80000002 90000002
22 | 3 10000003 20000003 30000003 40000003 50000003 60000003 70000003 80000003 90000003
23 | 4 10000004 20000004 30000004 40000004 50000004 60000004 70000004 80000004 90000004
24 | 5 10000005 20000005 30000005 40000005 50000005 60000005 70000005 80000005 90000005
25 | 6 10000006 20000006 30000006 40000006 50000006 60000006 70000006 80000006 90000006
26 | 7 10000007 20000007 30000007 40000007 50000007 60000007 70000007 80000007 90000007
27 | 8 10000008 20000008 30000008 40000008 50000008 60000008 70000008 80000008 90000008
28 | 9 10000009 20000009 30000009 40000009 50000009 60000009 70000009 80000009 90000009
29 | 
30 | 
31 | statement ok
32 | from read_avro('test/userdata1.avro');
33 | 
34 | query IIII
35 | from duckdb_external_file_cache() order by path;
36 | ----
37 | test/bigdata.avro 17647257 0 true
38 | test/userdata1.avro 93561 0 true
--------------------------------------------------------------------------------
/test/sql/iceberg.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/iceberg.test
2 | # description: test avro extension on iceberg metadata files
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | statement ok
8 | PRAGMA enable_verification
9 | 
10 | query II
11 | select column_name, column_type from (DESCRIBE FROM read_avro('test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro'));
12 | ----
13 | status INTEGER
14 | snapshot_id BIGINT
15 | sequence_number BIGINT
16 | data_file STRUCT("content" INTEGER, file_path VARCHAR, file_format VARCHAR, "partition" INTEGER, record_count BIGINT, file_size_in_bytes BIGINT, column_sizes STRUCT("key" INTEGER, "value" BIGINT)[], value_counts STRUCT("key" INTEGER, "value" BIGINT)[], null_value_counts STRUCT("key" INTEGER, "value" BIGINT)[], nan_value_counts STRUCT("key" INTEGER, "value" BIGINT)[], lower_bounds STRUCT("key" INTEGER, "value" BLOB)[], upper_bounds STRUCT("key" INTEGER, "value" BLOB)[], key_metadata BLOB, split_offsets BIGINT[], equality_ids INTEGER[], sort_order_id INTEGER)
17 | 
18 | query IIIIIIIII
19 | FROM (FROM read_avro('test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro') SELECT status, snapshot_id, sequence_number, data_file.*) SELECT status, snapshot_id, sequence_number, content, file_path, file_format, partition, record_count, file_size_in_bytes
20 | ----
21 | 2 7635660646343998149 NULL 0 lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET NULL 60175 1390176
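
# editorial addition (hedged): the nested data_file fields can also be
# projected directly with dot notation; this only asserts the query runs,
# it does not check its output
statement ok
FROM read_avro('test/iceberg/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro')
SELECT data_file.file_path, data_file.record_count;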
22 | 
23 | 
24 | 
25 | query II
26 | select column_name, column_type from (DESCRIBE FROM read_avro('test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro'));
27 | ----
28 | manifest_path VARCHAR
29 | manifest_length BIGINT
30 | partition_spec_id INTEGER
31 | content INTEGER
32 | sequence_number BIGINT
33 | min_sequence_number BIGINT
34 | added_snapshot_id BIGINT
35 | added_data_files_count INTEGER
36 | existing_data_files_count INTEGER
37 | deleted_data_files_count INTEGER
38 | added_rows_count BIGINT
39 | existing_rows_count BIGINT
40 | deleted_rows_count BIGINT
41 | partitions STRUCT(contains_null BOOLEAN, contains_nan BOOLEAN, lower_bound BLOB, upper_bound BLOB)[]
42 | 
43 | 
44 | 
45 | query IIIIIIIIIIIIII
46 | FROM read_avro('test/iceberg/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro') ORDER BY manifest_path
47 | ----
48 | lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 7687 0 0 2 2 7635660646343998149 0 0 1 0 0 60175 []
49 | lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 7692 0 0 2 2 7635660646343998149 1 0 0 51793 0 0 []
50 | 
51 | 
52 | 
--------------------------------------------------------------------------------
/test/sql/test_missing_file.test:
--------------------------------------------------------------------------------
1 | # name: test/sql/test_missing_file.test
2 | # description: test avro extension with a missing input file
3 | # group: [avro]
4 | 
5 | require avro
6 | 
7 | statement error
8 | from read_avro('not_exists');
9 | ----
10 | IO Error: No files found that match the pattern "not_exists"
--------------------------------------------------------------------------------
/test/string_array.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/string_array.avro
--------------------------------------------------------------------------------
/test/union-name-1.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union-name-1.avro
--------------------------------------------------------------------------------
/test/union-name-2.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union-name-2.avro
--------------------------------------------------------------------------------
/test/union-name-3.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union-name-3.avro
--------------------------------------------------------------------------------
/test/union.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/union.avro
--------------------------------------------------------------------------------
/test/userdata1.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata1.avro
--------------------------------------------------------------------------------
/test/userdata2.avro:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata2.avro -------------------------------------------------------------------------------- /test/userdata3.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata3.avro -------------------------------------------------------------------------------- /test/userdata4.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata4.avro -------------------------------------------------------------------------------- /test/userdata5.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/userdata5.avro -------------------------------------------------------------------------------- /test/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdb/duckdb-avro/7e530267b1fa6c7e1c6b6ff48afafb27166af87b/test/users.avro -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "avro-c" 4 | ], 5 | "vcpkg-configuration": { 6 | "overlay-ports": [ 7 | "./vcpkg_ports" 8 | ], 9 | "registries": [ 10 | { 11 | "kind": "git", 12 | "repository": "https://github.com/duckdb/vcpkg-duckdb-ports", 13 | "baseline": "0f9bf648ba1ee29291890a1ca9a49a80bba017eb", 14 | "packages": [ 15 | "vcpkg-cmake", 16 | "avro-c" 17 | ] 18 | } 19 | ] 20 | }, 21 | "builtin-baseline": "5e5d0e1cd7785623065e77eff011afdeec1a3574" 22 | } -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/build-tools.patch: -------------------------------------------------------------------------------- 1 | --- a/CMakeLists.txt 2 | +++ b/CMakeLists.txt 3 | @@ -1484,7 +1484,7 @@ function(my_install_man COMPONENT SRC_FILE LINK_NAMES) 4 | endif() 5 | endfunction() 6 | 7 | - 8 | +if(BUILD_TOOLS) 9 | ############################################################################# 10 | # libgnu (getopt_long) 11 | ############################################################################# 12 | @@ -1982,6 +1982,7 @@ if(UNIX) 13 | my_install_man(scripts_Documentation src/scripts/xzless.1 "${XZLESS_LINKS}") 14 | endif() 15 | 16 | +endif() 17 | 18 | ############################################################################# 19 | # Documentation 20 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO tukaani-project/xz 4 | REF "v${VERSION}" 5 | SHA512 0f814f4282c87cb74a8383199c1e55ec1bf49519daaf07f7b376cb644770b75cc9257c809b661405fcfd6cda28c54d799c67eb9e169665c35b1b87529468085e 6 | HEAD_REF master 7 | PATCHES 8 | build-tools.patch 9 | ) 10 | 11 | vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS 12 | FEATURES 13 | tools BUILD_TOOLS 14 | ) 15 | 16 | if(VCPKG_TARGET_ARCHITECTURE STREQUAL "wasm32") 17 | 
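# editorial note (assumption): when cross-compiling to wasm32 the usual
# byte-order probe cannot run, so little-endian is pinned explicitly, and
# position-independent code is forced since the static library is later
# linked into a dynamically loaded extension module: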
set(WASM_OPTIONS -DCMAKE_C_BYTE_ORDER=LITTLE_ENDIAN -DCMAKE_CXX_BYTE_ORDER=LITTLE_ENDIAN -DCMAKE_POSITION_INDEPENDENT_CODE=ON) 18 | endif() 19 | 20 | vcpkg_cmake_configure( 21 | SOURCE_PATH "${SOURCE_PATH}" 22 | OPTIONS 23 | ${FEATURE_OPTIONS} 24 | ${WASM_OPTIONS} 25 | -DBUILD_TESTING=OFF 26 | -DCREATE_XZ_SYMLINKS=OFF 27 | -DCREATE_LZMA_SYMLINKS=OFF 28 | -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT= # using flags from (vcpkg) toolchain 29 | -DENABLE_NLS=OFF # nls is not supported by this port, yet 30 | MAYBE_UNUSED_VARIABLES 31 | CMAKE_MSVC_DEBUG_INFORMATION_FORMAT 32 | CREATE_XZ_SYMLINKS 33 | CREATE_LZMA_SYMLINKS 34 | ENABLE_NLS 35 | ) 36 | vcpkg_cmake_install() 37 | vcpkg_copy_pdbs() 38 | 39 | set(exec_prefix "\${prefix}") 40 | set(libdir "\${prefix}/lib") 41 | set(includedir "\${prefix}/include") 42 | set(PACKAGE_URL https://tukaani.org/xz/) 43 | set(PACKAGE_VERSION "${VERSION}") 44 | if(NOT VCPKG_TARGET_IS_WINDOWS) 45 | set(PTHREAD_CFLAGS -pthread) 46 | endif() 47 | set(prefix "${CURRENT_INSTALLED_DIR}") 48 | configure_file("${SOURCE_PATH}/src/liblzma/liblzma.pc.in" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/liblzma.pc" @ONLY) 49 | if (NOT VCPKG_BUILD_TYPE) 50 | set(prefix "${CURRENT_INSTALLED_DIR}/debug") 51 | configure_file("${SOURCE_PATH}/src/liblzma/liblzma.pc.in" "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/liblzma.pc" @ONLY) 52 | endif() 53 | vcpkg_fixup_pkgconfig() 54 | 55 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/liblzma) 56 | 57 | if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") 58 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/lzma.h" "defined(LZMA_API_STATIC)" "1") 59 | else() 60 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/lzma.h" "defined(LZMA_API_STATIC)" "0") 61 | endif() 62 | 63 | file(REMOVE_RECURSE 64 | "${CURRENT_PACKAGES_DIR}/debug/include" 65 | "${CURRENT_PACKAGES_DIR}/debug/share" 66 | "${CURRENT_PACKAGES_DIR}/share/man" 67 | ) 68 | 69 | set(TOOLS xz xzdec lzmadec lzmainfo) 70 | foreach(_tool IN LISTS TOOLS) 71 | if(NOT EXISTS "${CURRENT_PACKAGES_DIR}/bin/${_tool}${VCPKG_TARGET_EXECUTABLE_SUFFIX}") 72 | list(REMOVE_ITEM TOOLS ${_tool}) 73 | endif() 74 | endforeach() 75 | if(TOOLS) 76 | vcpkg_copy_tools(TOOL_NAMES ${TOOLS} AUTO_CLEAN) 77 | endif() 78 | 79 | if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") 80 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/bin" "${CURRENT_PACKAGES_DIR}/debug/bin") 81 | endif() 82 | 83 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/vcpkg-cmake-wrapper.cmake" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 84 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 85 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/COPYING") 86 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/usage: -------------------------------------------------------------------------------- 1 | liblzma is compatible with built-in CMake targets: 2 | 3 | find_package(LibLZMA REQUIRED) 4 | target_link_libraries(main PRIVATE LibLZMA::LibLZMA) 5 | 6 | liblzma provides CMake targets: 7 | 8 | find_package(liblzma CONFIG REQUIRED) 9 | target_link_libraries(main PRIVATE liblzma::liblzma) 10 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/vcpkg-cmake-wrapper.cmake: -------------------------------------------------------------------------------- 1 | cmake_policy(PUSH) 2 | cmake_policy(SET CMP0012 NEW) 3 | cmake_policy(SET CMP0057 NEW) 4 | set(z_vcpkg_liblzma_fixup_needed 0) 5 | if(NOT "CONFIG" IN_LIST ARGS AND NOT 
"NO_MODULE" IN_LIST ARGS AND NOT CMAKE_DISABLE_FIND_PACKAGE_LibLZMA) 6 | get_filename_component(z_vcpkg_liblzma_prefix "${CMAKE_CURRENT_LIST_DIR}" DIRECTORY) 7 | get_filename_component(z_vcpkg_liblzma_prefix "${z_vcpkg_liblzma_prefix}" DIRECTORY) 8 | find_path(LIBLZMA_INCLUDE_DIR NAMES lzma.h PATHS "${z_vcpkg_liblzma_prefix}/include" NO_DEFAULT_PATH) 9 | # liblzma doesn't use a debug postfix, but FindLibLZMA.cmake expects it 10 | find_library(LIBLZMA_LIBRARY_RELEASE NAMES lzma PATHS "${z_vcpkg_liblzma_prefix}/lib" NO_DEFAULT_PATH) 11 | find_library(LIBLZMA_LIBRARY_DEBUG NAMES lzma PATHS "${z_vcpkg_liblzma_prefix}/debug/lib" NO_DEFAULT_PATH) 12 | unset(z_vcpkg_liblzma_prefix) 13 | if(CMAKE_VERSION VERSION_LESS 3.16) 14 | # Older versions of FindLibLZMA.cmake need a single lib in LIBLZMA_LIBRARY. 15 | set(z_vcpkg_liblzma_fixup_needed 1) 16 | set(LIBLZMA_LIBRARY "${LIBLZMA_LIBRARY_RELEASE}" CACHE INTERNAL "") 17 | elseif(NOT TARGET LibLZMA::LibLZMA) 18 | set(z_vcpkg_liblzma_fixup_needed 1) 19 | endif() 20 | # Known values, and required. Skip expensive tests. 21 | set(LIBLZMA_HAS_AUTO_DECODER 1 CACHE INTERNAL "") 22 | set(LIBLZMA_HAS_EASY_ENCODER 1 CACHE INTERNAL "") 23 | set(LIBLZMA_HAS_LZMA_PRESET 1 CACHE INTERNAL "") 24 | endif() 25 | 26 | _find_package(${ARGS}) 27 | 28 | if(z_vcpkg_liblzma_fixup_needed) 29 | include(SelectLibraryConfigurations) 30 | select_library_configurations(LIBLZMA) 31 | if(NOT TARGET LibLZMA::LibLZMA) 32 | # Backfill LibLZMA::LibLZMA to versions of cmake before 3.14 33 | add_library(LibLZMA::LibLZMA UNKNOWN IMPORTED) 34 | if(DEFINED LIBLZMA_INCLUDE_DIRS) 35 | set_target_properties(LibLZMA::LibLZMA PROPERTIES 36 | INTERFACE_INCLUDE_DIRECTORIES "${LIBLZMA_INCLUDE_DIRS}") 37 | endif() 38 | set_property(TARGET LibLZMA::LibLZMA APPEND PROPERTY 39 | IMPORTED_CONFIGURATIONS RELEASE) 40 | set_target_properties(LibLZMA::LibLZMA PROPERTIES 41 | IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "C" 42 | IMPORTED_LOCATION_RELEASE "${LIBLZMA_LIBRARY_RELEASE}") 43 | if(EXISTS "${LIBLZMA_LIBRARY}") 44 | set_target_properties(LibLZMA::LibLZMA PROPERTIES 45 | IMPORTED_LINK_INTERFACE_LANGUAGES "C" 46 | IMPORTED_LOCATION "${LIBLZMA_LIBRARY}") 47 | endif() 48 | endif() 49 | if(LIBLZMA_LIBRARY_DEBUG) 50 | # Backfill debug variant to versions of cmake before 3.16 51 | set_property(TARGET LibLZMA::LibLZMA APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) 52 | set_target_properties(LibLZMA::LibLZMA PROPERTIES IMPORTED_LOCATION_DEBUG "${LIBLZMA_LIBRARY_DEBUG}") 53 | endif() 54 | endif() 55 | if(LIBLZMA_LIBRARIES AND NOT "Threads::Threads" IN_LIST LIBLZMA_LIBRARIES AND NOT EMSCRIPTEN) 56 | set(THREADS_PREFER_PTHREAD_FLAG TRUE) 57 | find_package(Threads) 58 | list(APPEND LIBLZMA_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) 59 | if(TARGET LibLZMA::LibLZMA) 60 | set_property(TARGET LibLZMA::LibLZMA APPEND PROPERTY INTERFACE_LINK_LIBRARIES Threads::Threads) 61 | endif() 62 | endif() 63 | unset(z_vcpkg_liblzma_fixup_needed) 64 | cmake_policy(POP) 65 | -------------------------------------------------------------------------------- /vcpkg_ports/liblzma/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "liblzma", 3 | "version": "5.6.3", 4 | "description": "Compression library with an API similar to that of zlib.", 5 | "homepage": "https://tukaani.org/xz/", 6 | "license": null, 7 | "dependencies": [ 8 | { 9 | "name": "vcpkg-cmake" 10 | }, 11 | { 12 | "name": "vcpkg-cmake-config" 13 | } 14 | ], 15 | "features": { 16 | "tools": { 17 | "description": "Build 
tools" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/fix_clang-cl_build.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | index 672561e62..b6930b834 100644 3 | --- a/CMakeLists.txt 4 | +++ b/CMakeLists.txt 5 | @@ -38,7 +38,7 @@ if(NOT CMAKE_CXX_STANDARD) 6 | endif(NOT CMAKE_CXX_STANDARD) 7 | 8 | # https://github.com/izenecloud/cmake/blob/master/SetCompilerWarningAll.cmake 9 | -if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 10 | +if(MSVC) 11 | # Use the highest warning level for Visual Studio. 12 | set(CMAKE_CXX_WARNING_LEVEL 4) 13 | if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") 14 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/no-werror.patch: -------------------------------------------------------------------------------- 1 | --- a/CMakeLists.txt 2 | +++ b/CMakeLists.txt 3 | @@ -68,7 +68,7 @@ 4 | 5 | # Use -Werror for clang only. 6 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 7 | - if(NOT CMAKE_CXX_FLAGS MATCHES "-Werror") 8 | + if(0) 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") 10 | endif(NOT CMAKE_CXX_FLAGS MATCHES "-Werror") 11 | endif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 12 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/pkgconfig.diff: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | index c3062e2..05477e9 100644 3 | --- a/CMakeLists.txt 4 | +++ b/CMakeLists.txt 5 | @@ -417,4 +417,18 @@ if(SNAPPY_INSTALL) 6 | "${PROJECT_BINARY_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake" 7 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" 8 | ) 9 | + 10 | + cmake_policy(SET CMP0057 NEW) 11 | + set(LIBS_PRIVATE "") 12 | + foreach(lib IN LISTS CMAKE_CXX_IMPLICIT_LINK_LIBRARIES) 13 | + if(lib IN_LIST CMAKE_C_IMPLICIT_LINK_LIBRARIES) 14 | + continue() 15 | + elseif(EXISTS "${lib}") 16 | + string(APPEND LIBS_PRIVATE " ${CMAKE_LINK_LIBRARY_FILE_FLAG}${lib}") 17 | + else() 18 | + string(APPEND LIBS_PRIVATE " ${CMAKE_LINK_LIBRARY_FLAG}${lib}") 19 | + endif() 20 | + endforeach() 21 | + configure_file(snappy.pc.in "${CMAKE_CURRENT_BINARY_DIR}/snappy.pc" @ONLY) 22 | + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/snappy.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") 23 | endif(SNAPPY_INSTALL) 24 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/portfile.cmake: -------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO google/snappy 4 | REF ${VERSION} 5 | SHA512 e7290d79ddd45605aafd02cba9eaa32309c94af04f137552a97a915c391f185dccab9b7b21a01b28f3f446be420232c3c22d91c06e0be6e1e2e32d645174798c 6 | HEAD_REF master 7 | PATCHES 8 | fix_clang-cl_build.patch 9 | no-werror.patch 10 | pkgconfig.diff 11 | ) 12 | file(COPY "${CURRENT_PORT_DIR}/snappy.pc.in" DESTINATION "${SOURCE_PATH}") 13 | 14 | if(VCPKG_TARGET_ARCHITECTURE STREQUAL "wasm32") 15 | set(WASM_OPTIONS -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_FLAGS=-fPIC -DCMAKE_C_FLAGS=-fPIC) 16 | endif() 17 | 18 | vcpkg_cmake_configure( 19 | SOURCE_PATH "${SOURCE_PATH}" 20 | OPTIONS 21 | -DSNAPPY_BUILD_TESTS=OFF 22 | -DSNAPPY_BUILD_BENCHMARKS=OFF 23 | ${WASM_OPTIONS} 24 | 25 | # These variables can be overriden in a custom triplet, see usage file 26 | 
-DSNAPPY_HAVE_SSSE3=OFF 27 | -DSNAPPY_HAVE_X86_CRC32=OFF 28 | -DSNAPPY_HAVE_NEON_CRC32=OFF 29 | -DSNAPPY_HAVE_BMI2=OFF 30 | -DSNAPPY_HAVE_NEON=OFF 31 | ) 32 | 33 | vcpkg_cmake_install() 34 | vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/Snappy) 35 | vcpkg_copy_pdbs() 36 | vcpkg_fixup_pkgconfig() 37 | 38 | file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") 39 | 40 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 41 | 42 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/COPYING") 43 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/snappy.pc.in: -------------------------------------------------------------------------------- 1 | prefix=${pcfiledir}/../.. 2 | exec_prefix=${prefix} 3 | libdir=${prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: snappy 7 | Description: A fast compressor/decompressor. 8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -lsnappy 10 | Cflags: -I${includedir} 11 | Libs.private: @LIBS_PRIVATE@ -------------------------------------------------------------------------------- /vcpkg_ports/snappy/usage: -------------------------------------------------------------------------------- 1 | snappy provides CMake targets: 2 | 3 | find_package(Snappy CONFIG REQUIRED) 4 | target_link_libraries(main PRIVATE Snappy::snappy) 5 | 6 | Optimizations based on hardware support are disabled by default. 7 | You can enable them by adding corresponding flags to VCPKG_CMAKE_CONFIGURE_OPTIONS inside a custom triplet file, for example: 8 | 9 | if("${PORT}" STREQUAL "snappy") 10 | list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DSNAPPY_HAVE_SSSE3=ON -DSNAPPY_HAVE_BMI2=ON) 11 | endif() 12 | 13 | For a full list of possible options, see project's root CMakeLists.txt. 
14 | -------------------------------------------------------------------------------- /vcpkg_ports/snappy/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "snappy", 3 | "version": "1.2.1", 4 | "description": "A fast compressor/decompressor.", 5 | "homepage": "https://github.com/google/snappy", 6 | "license": null, 7 | "dependencies": [ 8 | { 9 | "name": "vcpkg-cmake" 10 | }, 11 | { 12 | "name": "vcpkg-cmake-config" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/0001-Prevent-invalid-inclusions-when-HAVE_-is-set-to-0.patch: -------------------------------------------------------------------------------- 1 | diff --git a/zconf.h.cmakein b/zconf.h.cmakein 2 | index a7f24cc..a1b359b 100644 3 | --- a/zconf.h.cmakein 4 | +++ b/zconf.h.cmakein 5 | @@ -434,11 +434,19 @@ typedef uLong FAR uLongf; 6 | #endif 7 | 8 | #ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */ 9 | -# define Z_HAVE_UNISTD_H 10 | +# if ~(~HAVE_UNISTD_H + 0) == 0 && ~(~HAVE_UNISTD_H + 1) == 1 11 | +# define Z_HAVE_UNISTD_H 12 | +# elif HAVE_UNISTD_H != 0 13 | +# define Z_HAVE_UNISTD_H 14 | +# endif 15 | #endif 16 | 17 | #ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */ 18 | -# define Z_HAVE_STDARG_H 19 | +# if ~(~HAVE_STDARG_H + 0) == 0 && ~(~HAVE_STDARG_H + 1) == 1 20 | +# define Z_HAVE_STDARG_H 21 | +# elif HAVE_STDARG_H != 0 22 | +# define Z_HAVE_STDARG_H 23 | +# endif 24 | #endif 25 | 26 | #ifdef STDC 27 | diff --git a/zconf.h.in b/zconf.h.in 28 | index 5e1d68a..32f53c8 100644 29 | --- a/zconf.h.in 30 | +++ b/zconf.h.in 31 | @@ -432,11 +432,19 @@ typedef uLong FAR uLongf; 32 | #endif 33 | 34 | #ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */ 35 | -# define Z_HAVE_UNISTD_H 36 | +# if ~(~HAVE_UNISTD_H + 0) == 0 && ~(~HAVE_UNISTD_H + 1) == 1 37 | +# define Z_HAVE_UNISTD_H 38 | +# elif HAVE_UNISTD_H != 0 39 | +# define Z_HAVE_UNISTD_H 40 | +# endif 41 | #endif 42 | 43 | #ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */ 44 | -# define Z_HAVE_STDARG_H 45 | +# if ~(~HAVE_STDARG_H + 0) == 0 && ~(~HAVE_STDARG_H + 1) == 1 46 | +# define Z_HAVE_STDARG_H 47 | +# elif HAVE_STDARG_H != 0 48 | +# define Z_HAVE_STDARG_H 49 | +# endif 50 | #endif 51 | 52 | #ifdef STDC 53 | 54 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/0002-build-static-or-shared-not-both.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | --- a/CMakeLists.txt 3 | +++ b/CMakeLists.txt 4 | @@ -123,9 +123,11 @@ set(ZLIB_SRCS 5 | ) 6 | 7 | if(NOT MINGW) 8 | + if(BUILD_SHARED_LIBS) 9 | set(ZLIB_DLL_SRCS 10 | win32/zlib1.rc # If present will override custom build rule below. 
11 | ) 12 | + endif() 13 | endif() 14 | 15 | # parse the full version number from zlib.h and include in ZLIB_FULL_VERSION 16 | @@ -146,15 +148,17 @@ if(MINGW) 17 | -I ${CMAKE_CURRENT_BINARY_DIR} 18 | -o ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj 19 | -i ${CMAKE_CURRENT_SOURCE_DIR}/win32/zlib1.rc) 20 | + if(BUILD_SHARED_LIBS) 21 | set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) 22 | + endif() 23 | endif(MINGW) 24 | 25 | -add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 26 | +add_library(zlib ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 27 | target_include_directories(zlib PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) 28 | -add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 29 | -target_include_directories(zlibstatic PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) 30 | +if (BUILD_SHARED_LIBS) 31 | set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) 32 | set_target_properties(zlib PROPERTIES SOVERSION 1) 33 | +endif() 34 | 35 | if(NOT CYGWIN) 36 | # This property causes shared libraries on Linux to have the full version 37 | @@ -169,7 +173,7 @@ endif() 38 | 39 | if(UNIX) 40 | # On unix-like platforms the library is almost always called libz 41 | - set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) 42 | + set_target_properties(zlib PROPERTIES OUTPUT_NAME z) 43 | if(NOT APPLE AND NOT(CMAKE_SYSTEM_NAME STREQUAL AIX)) 44 | set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") 45 | endif() 46 | @@ -179,7 +183,7 @@ elseif(BUILD_SHARED_LIBS AND WIN32) 47 | endif() 48 | 49 | if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL ) 50 | - install(TARGETS zlib zlibstatic 51 | + install(TARGETS zlib 52 | RUNTIME DESTINATION "${INSTALL_BIN_DIR}" 53 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" 54 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ) 55 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/0003-android-and-mingw-fixes.patch: -------------------------------------------------------------------------------- 1 | diff --git a/CMakeLists.txt b/CMakeLists.txt 2 | --- a/CMakeLists.txt 3 | +++ b/CMakeLists.txt 4 | @@ -58,7 +58,7 @@ endif() 5 | # 6 | check_include_file(unistd.h Z_HAVE_UNISTD_H) 7 | 8 | -if(MSVC) 9 | +if(WIN32) 10 | set(CMAKE_DEBUG_POSTFIX "d") 11 | add_definitions(-D_CRT_SECURE_NO_DEPRECATE) 12 | add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) 13 | @@ -135,7 +135,7 @@ file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h _zlib_h_contents) 14 | string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" 15 | "\\1" ZLIB_FULL_VERSION ${_zlib_h_contents}) 16 | 17 | -if(MINGW) 18 | +if(MINGW AND NOT ANDROID) 19 | # This gets us DLL resource information when compiling on MinGW. 
20 | if(NOT CMAKE_RC_COMPILER) 21 | set(CMAKE_RC_COMPILER windres.exe) 22 | @@ -151,7 +151,7 @@ if(MINGW) 23 | if(BUILD_SHARED_LIBS) 24 | set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) 25 | endif() 26 | -endif(MINGW) 27 | +endif(MINGW AND NOT ANDROID) 28 | 29 | add_library(zlib ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) 30 | target_include_directories(zlib PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) 31 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/portfile.cmake: -------------------------------------------------------------------------------- 1 | # When this port is updated, the minizip port should be updated at the same time 2 | vcpkg_from_github( 3 | OUT_SOURCE_PATH SOURCE_PATH 4 | REPO madler/zlib 5 | REF v${VERSION} 6 | SHA512 8c9642495bafd6fad4ab9fb67f09b268c69ff9af0f4f20cf15dfc18852ff1f312bd8ca41de761b3f8d8e90e77d79f2ccacd3d4c5b19e475ecf09d021fdfe9088 7 | HEAD_REF master 8 | PATCHES 9 | 0001-Prevent-invalid-inclusions-when-HAVE_-is-set-to-0.patch 10 | 0002-build-static-or-shared-not-both.patch 11 | 0003-android-and-mingw-fixes.patch 12 | ) 13 | 14 | # This is generated during the cmake build 15 | file(REMOVE "${SOURCE_PATH}/zconf.h") 16 | 17 | if(VCPKG_TARGET_ARCHITECTURE STREQUAL "wasm32") 18 | set(WASM_OPTIONS -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_FLAGS=-fPIC -DCMAKE_C_FLAGS=-fPIC) 19 | endif() 20 | 21 | vcpkg_cmake_configure( 22 | SOURCE_PATH "${SOURCE_PATH}" 23 | OPTIONS 24 | -DSKIP_INSTALL_FILES=ON 25 | -DZLIB_BUILD_EXAMPLES=OFF 26 | ${WASM_OPTIONS} 27 | OPTIONS_DEBUG 28 | -DSKIP_INSTALL_HEADERS=ON 29 | ) 30 | 31 | vcpkg_cmake_install() 32 | file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/vcpkg-cmake-wrapper.cmake" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 33 | 34 | # Install the pkgconfig file 35 | if(NOT DEFINED VCPKG_BUILD_TYPE OR VCPKG_BUILD_TYPE STREQUAL "release") 36 | if(VCPKG_TARGET_IS_WINDOWS) 37 | vcpkg_replace_string("${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-rel/zlib.pc" "-lz" "-lzlib") 38 | endif() 39 | file(COPY "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-rel/zlib.pc" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/pkgconfig") 40 | endif() 41 | if(NOT DEFINED VCPKG_BUILD_TYPE OR VCPKG_BUILD_TYPE STREQUAL "debug") 42 | if(VCPKG_TARGET_IS_WINDOWS) 43 | vcpkg_replace_string("${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-dbg/zlib.pc" "-lz" "-lzlibd") 44 | endif() 45 | file(COPY "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-dbg/zlib.pc" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig") 46 | endif() 47 | 48 | vcpkg_fixup_pkgconfig() 49 | vcpkg_copy_pdbs() 50 | 51 | if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") 52 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/zconf.h" "ifdef ZLIB_DLL" "if 0") 53 | else() 54 | vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/include/zconf.h" "ifdef ZLIB_DLL" "if 1") 55 | endif() 56 | 57 | file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") 58 | file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) 59 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/usage: -------------------------------------------------------------------------------- 1 | The package zlib is compatible with built-in CMake targets: 2 | 3 | find_package(ZLIB REQUIRED) 4 | target_link_libraries(main PRIVATE ZLIB::ZLIB) 5 | 
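
Taken together, the usage files for these three ports document the CMake targets they export. A minimal consumer sketch follows, assuming the vcpkg toolchain file is in use; the project name avro_demo and the source file main.c are illustrative placeholders, not files from this repository:

cmake_minimum_required(VERSION 3.16)
project(avro_demo C)

# ZLIB and LibLZMA resolve through CMake's built-in find modules, which the
# vcpkg-cmake-wrapper.cmake files in these ports fix up; Snappy installs its
# own config package (see vcpkg_ports/snappy/usage).
find_package(ZLIB REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(Snappy CONFIG REQUIRED)

add_executable(avro_demo main.c)
target_link_libraries(avro_demo PRIVATE ZLIB::ZLIB LibLZMA::LibLZMA Snappy::snappy)
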
-------------------------------------------------------------------------------- /vcpkg_ports/zlib/vcpkg-cmake-wrapper.cmake: -------------------------------------------------------------------------------- 1 | find_path(ZLIB_INCLUDE_DIR NAMES zlib.h PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include" NO_DEFAULT_PATH) 2 | find_library(ZLIB_LIBRARY_RELEASE NAMES zlib z PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib" NO_DEFAULT_PATH) 3 | find_library(ZLIB_LIBRARY_DEBUG NAMES zlibd z PATHS "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/debug/lib" NO_DEFAULT_PATH) 4 | 5 | if(NOT ZLIB_INCLUDE_DIR OR NOT (ZLIB_LIBRARY_RELEASE OR ZLIB_LIBRARY_DEBUG)) 6 | message(FATAL_ERROR "Broken installation of vcpkg port zlib") 7 | endif() 8 | 9 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 10 | 11 | if(CMAKE_VERSION VERSION_LESS 3.4) 12 | include(SelectLibraryConfigurations) 13 | select_library_configurations(ZLIB) 14 | unset(ZLIB_FOUND) 15 | endif() 16 | _find_package(${ARGS}) 17 | -------------------------------------------------------------------------------- /vcpkg_ports/zlib/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "zlib", 3 | "version": "1.3.1", 4 | "description": "A compression library", 5 | "homepage": "https://www.zlib.net/", 6 | "license": "Zlib", 7 | "dependencies": [ 8 | "vcpkg-cmake" 9 | ] 10 | } 11 | --------------------------------------------------------------------------------
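
The snappy usage file above notes that hardware optimizations are disabled by default and are opted into through a custom triplet. A sketch of such a triplet, assuming a static x64 Linux build; the file name x64-linux-avx.cmake and the chosen flags are illustrative assumptions, not part of this repository:

# Overlay triplet, e.g. triplets/x64-linux-avx.cmake, selected with vcpkg's
# --overlay-triplets and --triplet options.
set(VCPKG_TARGET_ARCHITECTURE x64)
set(VCPKG_CRT_LINKAGE dynamic)
set(VCPKG_LIBRARY_LINKAGE static)
set(VCPKG_CMAKE_SYSTEM_NAME Linux)

# Per-port configure options, following the pattern in vcpkg_ports/snappy/usage.
if("${PORT}" STREQUAL "snappy")
    list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DSNAPPY_HAVE_SSSE3=ON -DSNAPPY_HAVE_BMI2=ON)
endif()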