├── src
│   ├── test
│   │   ├── resources
│   │   │   ├── .gitignore
│   │   │   └── META-INF
│   │   │       └── services
│   │   │           └── za.co.absa.abris.avro.sql.SchemaConverter
│   │   └── scala
│   │       └── za
│   │           └── co
│   │               └── absa
│   │                   └── abris
│   │                       ├── avro
│   │                       │   ├── sql
│   │                       │   │   ├── DummySchemaConverter.scala
│   │                       │   │   ├── package.scala
│   │                       │   │   ├── CatalystDataToAvroSpec.scala
│   │                       │   │   ├── SchemaEvolutionSpec.scala
│   │                       │   │   └── AvroDataToCatalystSpec.scala
│   │                       │   ├── utils
│   │                       │   │   └── AvroSchemaEncoder.scala
│   │                       │   ├── registry
│   │                       │   │   ├── TestRegistryClient.scala
│   │                       │   │   └── SchemaSubjectSpec.scala
│   │                       │   ├── errors
│   │                       │   │   ├── FailFastExceptionHandlerSpec.scala
│   │                       │   │   ├── PermissiveRecordExceptionHandlerSpec.scala
│   │                       │   │   └── SpecificRecordExceptionHandlerSpec.scala
│   │                       │   ├── schemas
│   │                       │   │   └── SchemaLoaderSpec.scala
│   │                       │   ├── parsing
│   │                       │   │   └── utils
│   │                       │   │       └── AvroSchemaUtilsSpec.scala
│   │                       │   ├── read
│   │                       │   │   └── confluent
│   │                       │   │       ├── SchemaManagerFactorySpec.scala
│   │                       │   │       └── SchemaManagerSpec.scala
│   │                       │   └── format
│   │                       │       └── SparkAvroConversionsSpec.scala
│   │                       └── config
│   │                           ├── ToAvroConfigSpec.scala
│   │                           ├── InternalToAvroConfigSpec.scala
│   │                           ├── InternalFromAvroConfigSpec.scala
│   │                           └── FromAvroConfigSpec.scala
│   └── main
│       ├── resources
│       │   ├── log4j.properties
│       │   └── META-INF
│       │       └── services
│       │           └── za.co.absa.abris.avro.sql.SchemaConverter
│       ├── scala
│       │   ├── za
│       │   │   └── co
│       │   │       └── absa
│       │   │           └── abris
│       │   │               ├── avro
│       │   │               │   ├── read
│       │   │               │   │   └── confluent
│       │   │               │   │       ├── ConfluentConstants.scala
│       │   │               │   │       ├── SchemaManagerFactory.scala
│       │   │               │   │       └── SchemaManager.scala
│       │   │               │   ├── sql
│       │   │               │   │   ├── SchemaConverter.scala
│       │   │               │   │   ├── DefaultSchemaConverter.scala
│       │   │               │   │   ├── CatalystDataToAvro.scala
│       │   │               │   │   └── AvroDataToCatalyst.scala
│       │   │               │   ├── registry
│       │   │               │   │   ├── SchemaVersion.scala
│       │   │               │   │   ├── SchemaCoordinate.scala
│       │   │               │   │   ├── AbrisRegistryClient.scala
│       │   │               │   │   ├── ConfluentRegistryClient.scala
│       │   │               │   │   ├── ConfluentMockRegistryClient.scala
│       │   │               │   │   ├── AbstractConfluentRegistryClient.scala
│       │   │               │   │   └── SchemaSubject.scala
│       │   │               │   ├── errors
│       │   │               │   │   ├── DeserializationExceptionHandler.scala
│       │   │               │   │   ├── FailFastExceptionHandler.scala
│       │   │               │   │   ├── SpecificRecordExceptionHandler.scala
│       │   │               │   │   └── PermissiveRecordExceptionHandler.scala
│       │   │               │   ├── format
│       │   │               │   │   └── SparkAvroConversions.scala
│       │   │               │   ├── functions.scala
│       │   │               │   └── parsing
│       │   │               │       └── utils
│       │   │               │           └── AvroSchemaUtils.scala
│       │   │               ├── config
│       │   │               │   ├── InternalToAvroConfig.scala
│       │   │               │   └── InternalFromAvroConfig.scala
│       │   │               └── examples
│       │   │                   ├── data
│       │   │                   │   └── generation
│       │   │                   │       ├── FixedString.scala
│       │   │                   │       ├── ComplexRecordsGenerator.scala
│       │   │                   │       └── TestSchemas.scala
│       │   │                   ├── utils
│       │   │                   │   ├── CompatibleRowEncoder.scala
│       │   │                   │   └── ExamplesUtils.scala
│       │   │                   ├── ConfluentKafkaAvroReader.scala
│       │   │                   └── ConfluentKafkaAvroWriter.scala
│       │   └── org
│       │       └── apache
│       │           └── spark
│       │               └── sql
│       │                   └── avro
│       │                       ├── AbrisAvroSerializer.scala
│       │                       └── AbrisAvroDeserializer.scala
│       └── avro
│           ├── native-simple-outer-schema.avsc
│           └── native-complete-schema.avsc
├── .gitignore
├── .github
│   └── workflows
│       ├── test-and-verify.yml
│       ├── compatibility-check.yml
│       └── ci-check-jacoco.yml
├── documentation
│   ├── python-documentation.md
│   ├── vanilla-avro-documentation.md
│   └── confluent-avro-documentation.md
├── scalastyle-config.xml
├── .editorconfig
├── LICENSE.md
└── README.md
/src/test/resources/.gitignore:
--------------------------------------------------------------------------------
1 | /sampleData/
2 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=ERROR,stdout
2 | log4j.logger.com.endeca=INFO
3 | # Logger for crawl metrics
4 | log4j.logger.com.endeca.itl.web.metrics=INFO
5 |
6 | log4j.logger.org.apache.kafka.clients.consumer.internals.Fetcher=WARN
7 | log4j.logger.org.apache.spark.ContextCleaner=WARN
8 | log4j.logger.za.co.absa.abris.avro.read.confluent.SchemaManager$=WARN
9 | log4j.logger.za.co.absa.abris.avro.subject.SubjectNameStrategyAdapterFactory$=WARN
10 |
11 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
12 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
13 | log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
3 | .cache-main
4 | .cache-tests
5 |
6 | # use glob syntax.
7 | syntax: glob
8 | *.ser
9 | *.class
10 | *~
11 | *.bak
12 | #*.off
13 | *.old
14 |
15 | # eclipse conf file
16 | .settings
17 | .classpath
18 | .manager
19 | .scala_dependencies
20 |
21 | # idea
22 | .idea
23 | *.iml
24 |
25 | # building
26 | target
27 | build
28 | null
29 | tmp*
30 | temp*
31 | dist
32 | test-output
33 | build.log
34 |
35 | # other scm
36 | .svn
37 | .CVS
38 | .hg*
39 |
40 | # switch to regexp syntax.
41 | # syntax: regexp
42 | # ^\.pc/
43 |
44 | #SHITTY output not in target directory
45 | build.log
46 |
47 | .cache*
48 | dependency-reduced-pom.xml
49 |
50 | _testOutput
51 | output
52 | /keystore/
53 |
--------------------------------------------------------------------------------
/.github/workflows/test-and-verify.yml:
--------------------------------------------------------------------------------
1 | name: Test and verify
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 |
9 | jobs:
10 | build:
11 | runs-on: ubuntu-latest
12 |
13 | strategy:
14 | matrix:
15 | spark: [ 3.2, 3.3, 3.4, 3.5 ]
16 | scala: [ 2.12, 2.13 ]
17 |
18 | name: Spark ${{ matrix.spark }}, Scala ${{ matrix.scala }}
19 |
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Set up JDK 1.8
23 | uses: actions/setup-java@v1
24 | with:
25 | java-version: 1.8
26 | - name: Run tests
27 | run: mvn clean verify -Plicense-check,spark-${{ matrix.spark }},scala-${{ matrix.scala }}
28 |
--------------------------------------------------------------------------------
/src/test/resources/META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2022 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | za.co.absa.abris.avro.sql.DummySchemaConverter
17 |
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2022 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | za.co.absa.abris.avro.sql.DefaultSchemaConverter
17 |
--------------------------------------------------------------------------------
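Both service files use Java's standard ServiceLoader mechanism: the file is named after the fully qualified SchemaConverter trait, and each non-comment line names an implementation, so the test classpath registers DummySchemaConverter while the main classpath registers DefaultSchemaConverter. As a minimal sketch (the lookup ABRiS itself performs is not shown in this dump; the helper name is illustrative), a registered converter could be located by its shortName like this:

    import java.util.ServiceLoader
    import scala.collection.JavaConverters._
    import za.co.absa.abris.avro.sql.SchemaConverter

    // Discover every SchemaConverter registered via META-INF/services and
    // select one by the shortName its implementation declares.
    def findConverter(name: String): Option[SchemaConverter] =
      ServiceLoader.load(classOf[SchemaConverter]).asScala.find(_.shortName == name)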
/src/main/scala/za/co/absa/abris/avro/read/confluent/ConfluentConstants.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.read.confluent
18 |
19 | object ConfluentConstants {
20 |
21 | val MAGIC_BYTE = 0x0
22 | val SCHEMA_ID_SIZE_BYTES = 4
23 | }
24 |
--------------------------------------------------------------------------------
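These two constants describe the Confluent wire format: every framed payload starts with a single magic byte (0x0) followed by a four-byte big-endian schema id, and the Avro-encoded record fills the remainder. A minimal sketch of splitting such a payload (the helper is illustrative, not ABRiS API):

    import java.nio.ByteBuffer
    import za.co.absa.abris.avro.read.confluent.ConfluentConstants

    // Split a Confluent-framed payload into its schema id and raw Avro bytes.
    def splitPayload(payload: Array[Byte]): (Int, Array[Byte]) = {
      val buffer = ByteBuffer.wrap(payload)                 // big-endian by default
      require(buffer.get() == ConfluentConstants.MAGIC_BYTE.toByte, "unknown magic byte")
      val schemaId = buffer.getInt()                        // SCHEMA_ID_SIZE_BYTES = 4
      val avroBytes = new Array[Byte](buffer.remaining())
      buffer.get(avroBytes)
      (schemaId, avroBytes)
    }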
/src/main/scala/za/co/absa/abris/avro/sql/SchemaConverter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.sql.types.DataType
21 |
22 | trait SchemaConverter {
23 | val shortName: String
24 | def toSqlType(avroSchema: Schema): DataType
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/registry/SchemaVersion.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | /**
 20 |  * Version of a schema stored in the Confluent Schema Registry.
21 | */
22 | trait SchemaVersion
23 | case class NumVersion(num: Int) extends SchemaVersion
24 | case class LatestVersion() extends SchemaVersion
25 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/errors/DeserializationExceptionHandler.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.sql.avro.AbrisAvroDeserializer
21 |
22 | trait DeserializationExceptionHandler extends Serializable {
23 |
24 | def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any
25 |
26 | }
27 |
--------------------------------------------------------------------------------
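Implementations of this trait decide what a failed deserialization yields; the handlers later in this dump rethrow (FailFastExceptionHandler), substitute a null row (PermissiveRecordExceptionHandler), or substitute a default record (SpecificRecordExceptionHandler). A sketch of a custom handler in the same style (the class name is hypothetical):

    import org.apache.avro.Schema
    import org.apache.avro.generic.GenericData
    import org.apache.spark.internal.Logging
    import org.apache.spark.sql.avro.AbrisAvroDeserializer
    import za.co.absa.abris.avro.errors.DeserializationExceptionHandler

    // Log the failure, then substitute an empty record for the damaged one,
    // mirroring the permissive handler shown further down.
    class LoggingNullRecordHandler extends DeserializationExceptionHandler with Logging {
      def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = {
        logWarning("Malformed record detected, substituting an empty record.", exception)
        deserializer.deserialize(new GenericData.Record(readerSchema))
      }
    }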
/src/main/scala/za/co/absa/abris/avro/registry/SchemaCoordinate.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | /**
 20 |  * Coordinate that unambiguously identifies a schema in the schema registry.
21 | */
22 | trait SchemaCoordinate
23 |
24 | case class IdCoordinate(schemaId: Int) extends SchemaCoordinate
25 |
26 | case class SubjectCoordinate(subject: SchemaSubject, version: SchemaVersion) extends SchemaCoordinate
27 |
--------------------------------------------------------------------------------
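Together with SchemaVersion above, these case classes let a caller pin a schema either globally by id or per subject by version, for example:

    import za.co.absa.abris.avro.registry.{IdCoordinate, LatestVersion, NumVersion}

    // Pin a schema by its global registry id...
    val byId = IdCoordinate(schemaId = 42)

    // ...or describe the version half of a SubjectCoordinate (the SchemaSubject
    // constructor lives in SchemaSubject.scala, which this dump does not show):
    val pinned = NumVersion(3)      // exactly version 3
    val moving = LatestVersion()    // whatever is newest at resolution time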
/src/main/scala/za/co/absa/abris/avro/sql/DefaultSchemaConverter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 | import org.apache.avro.Schema
19 | import org.apache.spark.sql.avro.SchemaConverters
20 | import org.apache.spark.sql.types.DataType
21 |
22 | class DefaultSchemaConverter extends SchemaConverter {
23 | override val shortName: String = "default"
24 | override def toSqlType(avroSchema: Schema): DataType = SchemaConverters.toSqlType(avroSchema).dataType
25 | }
26 |
--------------------------------------------------------------------------------
/.github/workflows/compatibility-check.yml:
--------------------------------------------------------------------------------
1 | name: Binary Compatibility
2 |
3 | on:
4 | push:
5 | branches: [ master, branch-3.2 ]
6 | pull_request:
7 | branches: [ master, branch-3.2 ]
8 |
9 | jobs:
10 | build:
11 |
12 | runs-on: ubuntu-latest
13 | strategy:
14 | fail-fast: false
15 | matrix:
16 | spark: [ 3.2 ]
17 | scala: [ 2.12, 2.13 ]
18 | name: Spark ${{ matrix.spark }}, Scala ${{ matrix.scala }}
19 | steps:
20 | - uses: actions/checkout@v2
21 | - name: Set up JDK 1.8
22 | uses: actions/setup-java@v1
23 | with:
24 | java-version: 1.8
25 | - uses: actions/cache@v2
26 | with:
27 | path: ~/.m2/repository
28 | key: ${{ runner.os }}-${{ matrix.scala }}-${{ hashFiles('**/pom.xml') }}
29 | restore-keys: |
30 | ${{ runner.os }}-${{ matrix.scala }}-
31 | - name: Switch scala version
32 | run: mvn scala-cross-build:change-version -Pscala-${{ matrix.scala }}
33 | - name: Check binary compatibility
34 | run: mvn clean test -DskipTests -Pcompatibility-check,scala-${{ matrix.scala }}
35 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/config/InternalToAvroConfig.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.config
18 |
19 | import org.apache.avro.Schema
20 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
21 | import za.co.absa.abris.config.ToAvroConfig.Key
22 |
23 | private[abris] class InternalToAvroConfig(map: Map[String, Any]) {
24 |
25 | val schema: Schema = AvroSchemaUtils.parse(map(Key.Schema).asInstanceOf[String])
26 |
27 | val schemaId: Option[Int] = map.get(Key.SchemaId).map(_.asInstanceOf[Int])
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/errors/FailFastExceptionHandler.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.SparkException
21 | import org.apache.spark.sql.avro.AbrisAvroDeserializer
22 |
23 | class FailFastExceptionHandler extends DeserializationExceptionHandler {
24 |
25 | def handle(exception: Throwable, avroDeserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = {
26 | throw new SparkException("Malformed record detected.", exception)
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/sql/DummySchemaConverter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}
21 | import za.co.absa.abris.avro.sql.DummySchemaConverter._
22 |
23 | class DummySchemaConverter extends SchemaConverter {
24 | override val shortName: String = name
25 | override def toSqlType(avroSchema: Schema): DataType = dataType
26 | }
27 |
28 | object DummySchemaConverter {
29 | val name: String = "dummy"
30 | val dataType: DataType = StructType(Seq(StructField("long", LongType)))
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/avro/native-simple-outer-schema.avsc:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright 2018 ABSA Group Limited
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | {
19 | "type": "record",
20 | "name": "NativeSimpleOuter",
21 | "namespace": "all_types.test",
22 | "fields": [
23 | {
24 | "name": "name",
25 | "type": "string"
26 | },
27 | {
28 | "name": "nested",
29 | "type": {
30 | "type": "record",
31 | "name": "Nested",
32 | "fields": [
33 | {
34 | "name": "int",
35 | "type": "int"
36 | },
37 | {
38 | "name": "long",
39 | "type": "long"
40 | }
41 | ]
42 | }
43 | }
44 | ]
45 | }
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/data/generation/FixedString.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples.data.generation
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.avro.generic.GenericFixed
21 | import za.co.absa.commons.annotation.DeveloperApi
22 |
23 | @DeveloperApi
24 | object FixedString {
25 | def getClassName(): String = new FixedString("").getClass.getName
26 | }
27 |
28 | /**
29 | * Utility class for writing Avro fixed fields.
30 | */
31 | @DeveloperApi
32 | class FixedString(value: String) extends GenericFixed {
33 | override def getSchema(): Schema = null
34 | override def bytes(): Array[Byte] = value.getBytes
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/avro/AbrisAvroSerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package org.apache.spark.sql.avro
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.sql.types.DataType
21 | import za.co.absa.commons.annotation.DeveloperApi
22 |
23 | /**
 24 |  * Simple wrapper to access a Spark package-private class
25 | */
26 | @DeveloperApi
27 | class AbrisAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean) {
28 |
29 | private val serializer: AvroSerializer = new AvroSerializer(rootCatalystType, rootAvroType, nullable)
30 |
31 | def serialize(catalystData: Any): Any = {
32 | serializer.serialize(catalystData)
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
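Because AvroSerializer is package-private to Spark, this wrapper must live in the org.apache.spark.sql.avro package to reach it. A minimal usage sketch, assuming the Catalyst input is an InternalRow (which is what the underlying serializer consumes for a record root):

    import org.apache.avro.SchemaBuilder
    import org.apache.spark.sql.avro.AbrisAvroSerializer
    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

    // Matching Avro and Catalyst shapes: a record with one int field.
    val avroSchema = SchemaBuilder.record("r").fields()
      .name("i").`type`().intType().noDefault()
      .endRecord()
    val catalystType = StructType(Seq(StructField("i", IntegerType)))

    val serializer = new AbrisAvroSerializer(catalystType, avroSchema, nullable = false)
    val avroRecord = serializer.serialize(InternalRow(1))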
/src/main/scala/za/co/absa/abris/avro/registry/AbrisRegistryClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | import io.confluent.kafka.schemaregistry.client.SchemaMetadata
20 | import org.apache.avro.Schema
21 |
22 | trait AbrisRegistryClient {
23 |
24 | def getAllVersions(subject: String): java.util.List[Integer]
25 |
26 | def testCompatibility(subject: String, schema: Schema): Boolean
27 |
28 | def register(subject: String, schema: Schema): Int
29 |
30 | def getLatestSchemaMetadata(subject: String): SchemaMetadata
31 |
32 | def getSchemaMetadata(subject: String, version: Int): SchemaMetadata
33 |
34 | def getById(schemaId: Int): Schema
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/errors/SpecificRecordExceptionHandler.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.avro.specific.SpecificRecordBase
21 | import org.apache.spark.internal.Logging
22 | import org.apache.spark.sql.avro.AbrisAvroDeserializer
23 |
24 | class SpecificRecordExceptionHandler(defaultRecord: SpecificRecordBase) extends DeserializationExceptionHandler with Logging {
25 |
26 | def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = {
27 | logWarning("Malformed record detected. Replacing with default record.", exception)
28 | deserializer.deserialize(defaultRecord)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/utils/AvroSchemaEncoder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.utils
18 |
19 | import org.apache.spark.sql.{Encoder, Row}
20 | import za.co.absa.abris.avro.format.SparkAvroConversions
21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
22 | import za.co.absa.abris.examples.data.generation.ComplexRecordsGenerator
23 | import za.co.absa.abris.examples.utils.CompatibleRowEncoder
24 |
25 | class AvroSchemaEncoder {
26 |
27 | def getEncoder: Encoder[Row] = {
28 | val avroSchema = AvroSchemaUtils.parse(ComplexRecordsGenerator.usedAvroSchema)
29 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema)
30 | CompatibleRowEncoder.apply(sparkSchema)
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/errors/PermissiveRecordExceptionHandler.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import org.apache.avro.Schema
 20 | import org.apache.avro.generic.GenericData
 21 | import org.apache.spark.internal.Logging
 22 | import org.apache.spark.sql.avro.AbrisAvroDeserializer
 23 |
 24 | class PermissiveRecordExceptionHandler() extends DeserializationExceptionHandler with Logging {
 25 |
 26 |   def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = {
 27 |     logWarning("Malformed record detected. Replacing with full null row.", exception)
 28 |     val record = new GenericData.Record(readerSchema)
 29 |     deserializer.deserialize(record)
 30 |   }
 31 | }
 32 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/registry/TestRegistryClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 | import io.confluent.kafka.schemaregistry.client.SchemaMetadata
19 | import org.apache.avro.Schema
20 |
21 | import java.util
22 |
23 | class TestRegistryClient(config: Map[String, String]) extends AbrisRegistryClient {
24 |
25 | override def getAllVersions(subject: String): util.List[Integer] = ???
26 |
27 | override def testCompatibility(subject: String, schema: Schema): Boolean = ???
28 |
29 | override def register(subject: String, schema: Schema): Int = ???
30 |
31 | override def getLatestSchemaMetadata(subject: String): SchemaMetadata = ???
32 |
33 | override def getSchemaMetadata(subject: String, version: Int): SchemaMetadata = ???
34 |
35 | override def getById(schemaId: Int): Schema = ???
36 | }
37 |
--------------------------------------------------------------------------------
/.github/workflows/ci-check-jacoco.yml:
--------------------------------------------------------------------------------
1 | name: CI check JaCoCo code-coverage
2 |
3 | on:
4 | pull_request:
5 | branches: [ master ]
6 | types: [ opened, edited, synchronize, reopened ]
7 |
8 | jobs:
9 | test:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout code
13 | uses: actions/checkout@v2
14 | - name: Setup Scala
15 | uses: olafurpg/setup-scala@v10
16 | with:
17 | java-version: "adopt@1.8"
18 | - name: Build and run tests
19 | run: mvn clean verify -Pcode-coverage
20 | - name: Add coverage to PR
21 | id: jacoco
22 | uses: madrapps/jacoco-report@v1.4
23 | with:
24 | paths: ${{ github.workspace }}/target/site/jacoco/jacoco.xml
25 | token: ${{ secrets.GITHUB_TOKEN }}
26 | min-coverage-overall: 0.0
27 | min-coverage-changed-files: 80.0
28 | title: JaCoCo code coverage report
29 | update-comment: true
30 | - name: Get the Coverage info
31 | run: |
32 | echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}"
33 | echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}"
34 | - name: Fail PR if changed files coverage is less than 80%
35 | if: ${{ steps.jacoco.outputs.coverage-changed-files < 80.0 }}
36 | uses: actions/github-script@v6
37 | with:
38 | script: |
39 | core.setFailed('Changed files coverage is less than 80%!')
40 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/registry/ConfluentRegistryClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 | import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
19 | import io.confluent.kafka.serializers.KafkaAvroDeserializerConfig
20 |
21 | import scala.collection.JavaConverters._
22 |
23 | class ConfluentRegistryClient(client: SchemaRegistryClient) extends AbstractConfluentRegistryClient(client) {
24 |
25 | def this(configs: Map[String,String]) = this(ConfluentRegistryClient.createClient(configs))
26 | }
27 |
28 | object ConfluentRegistryClient {
29 |
30 | private def createClient(configs: Map[String,String]) = {
31 | val settings = new KafkaAvroDeserializerConfig(configs.asJava)
32 | val urls = settings.getSchemaRegistryUrls
33 | val maxSchemaObject = settings.getMaxSchemasPerSubject
34 |
35 | new CachedSchemaRegistryClient(urls, maxSchemaObject, configs.asJava)
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
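The secondary constructor builds a CachedSchemaRegistryClient from a plain config map; KafkaAvroDeserializerConfig reads the standard Confluent settings from it, so at minimum "schema.registry.url" must be present. For example (the URL and subject below are placeholders):

    import za.co.absa.abris.avro.registry.ConfluentRegistryClient

    // "schema.registry.url" is the standard Confluent client setting consumed
    // by KafkaAvroDeserializerConfig.
    val client = new ConfluentRegistryClient(Map("schema.registry.url" -> "http://localhost:8081"))
    val latest = client.getLatestSchemaMetadata("my-topic-value")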
/src/main/scala/za/co/absa/abris/config/InternalFromAvroConfig.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.config
18 |
19 | import org.apache.avro.Schema
 20 | import za.co.absa.abris.avro.errors.{DeserializationExceptionHandler, FailFastExceptionHandler}
21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
22 | import za.co.absa.abris.config.FromAvroConfig.Key
23 |
24 | private[abris] class InternalFromAvroConfig(map: Map[String, Any]) {
25 |
26 | val readerSchema: Schema = AvroSchemaUtils.parse(map(Key.ReaderSchema).asInstanceOf[String])
27 |
28 | val writerSchema: Option[Schema] = map
29 | .get(Key.WriterSchema)
30 | .map(s => AvroSchemaUtils.parse(s.asInstanceOf[String]))
31 |
32 | val schemaConverter: Option[String] = map
33 | .get(Key.SchemaConverter)
34 | .map(_.asInstanceOf[String])
35 |
36 | val deserializationHandler: DeserializationExceptionHandler = map
37 | .get(Key.ExceptionHandler)
38 | .map(s => s.asInstanceOf[DeserializationExceptionHandler])
39 | .getOrElse(new FailFastExceptionHandler)
40 | }
41 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/sql/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro
18 |
19 | import org.apache.spark.sql.DataFrame
20 | import org.scalatest.matchers.should.Matchers._
21 |
22 | package object sql {
23 |
24 | /**
 25 |    * Asserts that both data frames contain the same data.
 26 |    *
 27 |    * @param expectedFrame frame with the expected content
 28 |    * @param actualFrame   frame under test
29 | */
30 | def shouldEqualByData(expectedFrame: DataFrame, actualFrame: DataFrame): Unit = {
31 |
32 | def columnNames(frame: DataFrame) = frame.schema.fields.map(_.name)
33 |
34 | val expectedColNames = columnNames(expectedFrame)
35 | val actualColNames = columnNames(actualFrame)
36 |
37 | expectedColNames shouldEqual actualColNames
38 |
39 | expectedColNames.foreach(col => {
40 | val expectedColumn = expectedFrame.select(col).collect().map(row => row.toSeq.head)
41 | val actualColumn = actualFrame.select(col).collect().map(row => row.toSeq.head)
42 |
 43 |       for ((expected, actual) <- expectedColumn.zip(actualColumn)) {
44 | actual shouldEqual expected
45 | }
46 | })
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/registry/ConfluentMockRegistryClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException
20 | import io.confluent.kafka.schemaregistry.client.{MockSchemaRegistryClient, SchemaMetadata, SchemaRegistryClient}
21 |
22 | import java.io.IOException
23 |
24 |
25 | class ConfluentMockRegistryClient(client: SchemaRegistryClient) extends AbstractConfluentRegistryClient(client) {
26 |
27 | def this() = this(new MockSchemaRegistryClient())
28 |
29 | /**
 30 |    * MockSchemaRegistryClient throws a different exception than the real client; this is a workaround.
31 | */
32 | @throws[IOException]
33 | @throws[RestClientException]
34 | override def getLatestSchemaMetadata(subject: String): SchemaMetadata = {
35 | try client.getLatestSchemaMetadata(subject)
36 | catch {
37 | case e: IOException if e.getMessage == "No schema registered under subject!" =>
38 | throw new RestClientException("No schema registered under subject!", 404, 40401)
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/utils/CompatibleRowEncoder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples.utils
18 |
19 | import org.apache.spark.sql.{Encoder, Row}
20 | import org.apache.spark.sql.types.StructType
21 |
22 | import scala.util.Try
23 |
24 | object CompatibleRowEncoder {
25 | def apply(schema: StructType): Encoder[Row] = {
26 | // Spark < 3.5.0
27 | val rowEncoderTry = Try {
28 | val rowEncoderClass = Class.forName("org.apache.spark.sql.catalyst.encoders.RowEncoder")
29 | val applyMethod = rowEncoderClass.getMethod("apply", classOf[StructType])
30 | applyMethod.invoke(null, schema).asInstanceOf[Encoder[Row]]
31 | }
32 |
33 | // Spark >= 3.5.0
34 | rowEncoderTry.orElse(Try {
35 | val encodersClass = Class.forName("org.apache.spark.sql.Encoders")
36 | val rowMethod = encodersClass.getMethod("row", classOf[StructType])
37 | rowMethod.invoke(null, schema).asInstanceOf[Encoder[Row]]
38 | }).getOrElse {
39 | throw new IllegalStateException("Neither RowEncoder.apply nor Encoders.row is available in the Spark version.")
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
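The reflection keeps the examples compiling and running on both sides of the Spark 3.5.0 encoder API change: RowEncoder.apply exists before 3.5.0, Encoders.row from 3.5.0 on. Usage is a single call (the schema below is illustrative):

    import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
    import za.co.absa.abris.examples.utils.CompatibleRowEncoder

    val schema = StructType(Seq(StructField("id", LongType), StructField("name", StringType)))
    // Resolves to RowEncoder.apply or Encoders.row, whichever the runtime Spark provides.
    val encoder = CompatibleRowEncoder(schema)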
/src/test/scala/za/co/absa/abris/config/ToAvroConfigSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.config
18 |
19 | import org.scalatest.flatspec.AnyFlatSpec
20 | import org.scalatest.matchers.should.Matchers
21 | import za.co.absa.abris.config.ToAvroConfig.Key
22 |
23 | class ToAvroConfigSpec extends AnyFlatSpec with Matchers {
24 |
25 | behavior of "ToAvroConfig"
26 |
27 | it should "provide map with all set configurations" in {
28 | val config = ToAvroConfig()
29 | .withSchema("foo")
30 | .withSchemaId(42)
31 |
32 | val map = config.abrisConfig()
33 | map(Key.Schema) shouldBe "foo"
34 | map(Key.SchemaId) shouldBe 42
35 | }
36 |
37 | it should "support the legacy constructor and methods" in {
38 | val config = new ToAvroConfig("foo", Some(2))
39 |
40 | config.schemaString() shouldBe "foo"
41 | config.schemaId() shouldBe Some(2)
42 |
43 | val map = config.abrisConfig()
44 | map(Key.Schema) shouldBe "foo"
45 | map(Key.SchemaId) shouldBe 2
46 | }
47 |
48 | it should "throw when validation fails" in {
49 | val config = ToAvroConfig()
50 |
51 | val thrown = intercept[IllegalArgumentException] {
52 | config.validate()
53 | }
54 | thrown.getMessage.contains(Key.Schema) shouldBe true
55 | }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/errors/FailFastExceptionHandlerSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import org.apache.spark.SparkException
20 | import org.apache.spark.sql.avro.{AbrisAvroDeserializer, SchemaConverters}
21 | import org.apache.spark.sql.types.DataType
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest.matchers.should.Matchers
24 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
25 | import za.co.absa.abris.examples.data.generation.TestSchemas
26 |
27 |
28 | class FailFastExceptionHandlerSpec extends AnyFlatSpec with Matchers {
29 |
30 | it should "should throw spark exception on error" in {
31 |
32 | val deserializationExceptionHandler = new FailFastExceptionHandler
33 | val schema = AvroSchemaUtils.parse(TestSchemas.COMPLEX_SCHEMA_SPEC)
34 | val dataType: DataType = SchemaConverters.toSqlType(schema).dataType
35 | val deserializer = new AbrisAvroDeserializer(schema, dataType)
36 |
 37 |     val exceptionThrown = the[SparkException] thrownBy {
 38 |       deserializationExceptionHandler.handle(new Exception, deserializer, schema)
 39 |     }
 40 |     exceptionThrown.getMessage should equal("Malformed record detected.")
 41 |   }
 42 | }
 43 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/format/SparkAvroConversions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.format
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.sql.avro.SchemaConverters
21 | import org.apache.spark.sql.types._
22 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
23 |
24 |
25 | /**
26 | * This class provides conversions between Avro and Spark schemas and data.
27 | */
28 | object SparkAvroConversions {
29 |
30 | /**
 31 |    * Converts a Spark SQL type into an Avro schema, using the given name and namespace for the schema.
32 | */
33 | def toAvroSchema(
34 | structType: StructType,
35 | schemaName: String,
36 | schemaNamespace: String): Schema = {
37 | SchemaConverters.toAvroType(structType, false, schemaName, schemaNamespace)
38 | }
39 |
40 | /**
 41 |    * Translates an Avro schema into a Spark StructType.
 42 |    *
 43 |    * Relies on the Spark-Avro library to do the job.
44 | */
45 | def toSqlType(schema: String): StructType = toSqlType(AvroSchemaUtils.parse(schema))
46 |
47 | /**
 48 |    * Translates an Avro schema into a Spark StructType.
 49 |    *
 50 |    * Relies on the Spark-Avro library to do the job.
51 | */
52 | def toSqlType(schema: Schema): StructType = {
53 | SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType]
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
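A short round trip through both directions of the conversion (the schema JSON and names are illustrative):

    import za.co.absa.abris.avro.format.SparkAvroConversions

    val avroJson =
      """{"type":"record","name":"Person","fields":[{"name":"age","type":"int"}]}"""

    // Avro -> Spark: parse the JSON and translate it to a StructType.
    val structType = SparkAvroConversions.toSqlType(avroJson)

    // Spark -> Avro: rebuild a record schema with an explicit name and namespace.
    val rebuilt = SparkAvroConversions.toAvroSchema(structType, "Person", "example.namespace")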
/src/test/scala/za/co/absa/abris/avro/sql/CatalystDataToAvroSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import org.apache.avro.SchemaBuilder
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
22 | import org.apache.spark.sql.functions.col
23 | import org.scalatest.BeforeAndAfterEach
24 | import org.scalatest.flatspec.AnyFlatSpec
25 | import org.scalatest.matchers.should.Matchers
26 | import za.co.absa.abris.avro.functions._
27 | import za.co.absa.abris.config.ToAvroConfig
28 |
29 | class CatalystDataToAvroSpec extends AnyFlatSpec with Matchers with BeforeAndAfterEach {
30 | it should "be serializable" in {
31 | val schema = SchemaBuilder
32 | .record("foo")
33 | .namespace("test_namespace")
34 | .fields()
35 | .name("int").`type`().intType().noDefault()
36 | .endRecord()
37 | .toString
38 | val config = ToAvroConfig().withSchema(schema)
39 | val catalystDataToAvro = to_avro(col("col"), config).expr
40 |
41 | val javaSerializer = new JavaSerializer(new SparkConf())
42 | javaSerializer.newInstance().serialize(catalystDataToAvro)
43 |
44 | val kryoSerializer = new KryoSerializer(new SparkConf())
45 | kryoSerializer.newInstance().serialize(catalystDataToAvro)
46 |
47 | // test successful if no exception is thrown
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/schemas/SchemaLoaderSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.schemas
18 |
19 | import org.apache.commons.io.FileUtils
20 | import org.scalatest.flatspec.AnyFlatSpec
21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
22 | import za.co.absa.abris.examples.data.generation.TestSchemas
23 |
24 | import java.io.File
25 | import java.nio.charset.Charset
26 |
27 | class SchemaLoaderSpec extends AnyFlatSpec {
28 |
29 | private val testDir = new File("testDirSchemaLoader")
30 |
31 | behavior of "SchemaLoader"
32 |
33 | it should "retrieve schemas from file systems" in {
34 | val expectedSchemaString = TestSchemas.COMPLEX_SCHEMA_SPEC
35 | val expectedSchema = AvroSchemaUtils.parse(expectedSchemaString)
36 | val schemaFileName = "testSchemaName"
37 | val destination = writeIntoFS(expectedSchemaString, schemaFileName)
38 | val loadedSchema = AvroSchemaUtils.load(destination.getAbsolutePath)
39 |
40 | FileUtils.deleteQuietly(new File(destination.getAbsolutePath))
41 | FileUtils.deleteDirectory(testDir)
42 |
43 | assert(expectedSchema.equals(loadedSchema))
44 | }
45 |
46 | private def writeIntoFS(schema: String, name: String): File = {
47 | val destination = new File(testDir, name)
48 | FileUtils.write(destination, schema, Charset.defaultCharset)
49 | destination
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/config/InternalToAvroConfigSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.config
18 |
19 | import org.apache.avro.SchemaBuilder
20 | import org.scalatest.flatspec.AnyFlatSpec
21 | import org.scalatest.matchers.should.Matchers
22 |
23 | class InternalToAvroConfigSpec extends AnyFlatSpec with Matchers {
24 |
25 | import InternalToAvroConfigSpec._
26 |
27 | behavior of "InternalToAvroConfig"
28 |
29 | it should "convert and provide all set properties" in {
30 | val config = ToAvroConfig()
31 | .withSchema(avroSchema.toString)
32 | .withSchemaId(42)
33 |
34 | val intConfig = new InternalToAvroConfig(config.abrisConfig())
35 |
36 | val schema = intConfig.schema
37 | schema.getName shouldBe "foo"
38 | schema.getNamespace shouldBe "test_namespace"
39 | schema.getFields.size() shouldBe 2
40 |
41 | intConfig.schemaId shouldBe Some(42)
42 | }
43 |
44 | it should "return None for optional properties that were not set" in {
45 | val config = ToAvroConfig()
46 | .withSchema(avroSchema.toString)
47 |
48 | val intConfig = new InternalToAvroConfig(config.abrisConfig())
49 |
50 | intConfig.schemaId shouldBe None
51 | }
52 | }
53 |
54 | object InternalToAvroConfigSpec {
55 |
56 | val avroSchema = SchemaBuilder
57 | .record("foo")
58 | .namespace("test_namespace")
59 | .fields()
60 | .name("int").`type`().intType().noDefault()
61 | .name("bytes_name").`type`().stringType().noDefault()
62 | .endRecord()
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/registry/AbstractConfluentRegistryClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | import io.confluent.kafka.schemaregistry.avro.AvroSchema
20 | import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
21 | import org.apache.avro.Schema
22 |
23 | import java.util
24 |
25 |
26 | abstract class AbstractConfluentRegistryClient(client: SchemaRegistryClient) extends AbrisRegistryClient {
27 |
28 | override def getAllVersions(subject: String): util.List[Integer] =
29 | client.getAllVersions(subject)
30 |
31 | override def testCompatibility(subject: String, schema: Schema): Boolean =
32 | client.testCompatibility(subject, new AvroSchema(schema))
33 |
34 | override def register(subject: String, schema: Schema): Int =
35 | client.register(subject, new AvroSchema(schema))
36 |
37 | override def getLatestSchemaMetadata(subject: String): SchemaMetadata =
38 | client.getLatestSchemaMetadata(subject)
39 |
40 | override def getSchemaMetadata(subject: String, version: Int): SchemaMetadata =
41 | client.getSchemaMetadata(subject, version)
42 |
43 | override def getById(schemaId: Int): Schema = {
44 | val parsedSchema = client.getSchemaById(schemaId)
45 | parsedSchema match {
46 | case schema: AvroSchema => schema.rawSchema()
47 | case schema => throw new UnsupportedOperationException(s"Only AvroSchema is supported," +
48 | s" got schema type ${schema.schemaType()}")
49 | }
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/avro/native-complete-schema.avsc:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * Copyright 2018 ABSA Group Limited
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | {
19 | "namespace": "all_types.test",
20 | "type": "record",
21 | "name": "NativeComplete",
22 | "fields": [
23 | {
24 | "name": "bytes",
25 | "type": "bytes"
26 | },
27 | {
28 | "name": "string",
29 | "type": [
30 | "string",
31 | "null"
32 | ],
33 | "default": "blue"
34 | },
35 | {
36 | "name": "int",
37 | "type": [
38 | "int",
39 | "null"
40 | ]
41 | },
42 | {
43 | "name": "long",
44 | "type": [
45 | "long",
46 | "null"
47 | ]
48 | },
49 | {
50 | "name": "double",
51 | "type": [
52 | "double",
53 | "null"
54 | ]
55 | },
56 | {
57 | "name": "float",
58 | "type": [
59 | "float",
60 | "null"
61 | ]
62 | },
63 | {
64 | "name": "boolean",
65 | "type": [
66 | "boolean",
67 | "null"
68 | ]
69 | },
70 | {
71 | "name": "array",
72 | "type": {
73 | "type": "array",
74 | "items": "string"
75 | }
76 | },
77 | {
78 | "name": "map",
79 | "type": {
80 | "type": "map",
81 | "values": {
82 | "type": "array",
83 | "items": "long"
84 | }
85 | }
86 | },
87 | {
88 | "name": "fixed",
89 | "type": {
90 | "type": "fixed",
91 | "size": 40,
92 | "name": "Fixed"
93 | }
94 | }
95 | ]
96 | }
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/ConfluentKafkaAvroReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples
18 |
19 | import org.apache.spark.sql.SparkSession
20 | import org.apache.spark.sql.functions.col
21 | import org.apache.spark.sql.streaming.Trigger
22 | import za.co.absa.abris.avro.read.confluent.SchemaManagerFactory
23 | import za.co.absa.abris.avro.registry.SchemaSubject
24 | import za.co.absa.abris.config.AbrisConfig
25 |
26 | import scala.concurrent.duration.DurationInt
27 |
28 | object ConfluentKafkaAvroReader {
29 |
30 | val kafkaTopicName = "test_topic"
31 |
32 | def main(args: Array[String]): Unit = {
33 |
34 |     // Demonstrates the SchemaManager API: check whether a subject is already registered
35 |     val schemaManager = SchemaManagerFactory.create(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081"))
36 |     val schemaExists = schemaManager.exists(SchemaSubject.usingTopicNameStrategy(kafkaTopicName))
37 | 
38 | val spark = SparkSession
39 | .builder()
40 |       .appName("ReaderJob")
41 | .master("local[2]")
42 | .getOrCreate()
43 |
44 |
45 | spark.sparkContext.setLogLevel("INFO")
46 |
47 | val dataFrame = spark
48 | .readStream
49 | .format("kafka")
50 | .option("kafka.bootstrap.servers", "localhost:9092")
51 | .option("subscribe", kafkaTopicName)
52 | .option("startingOffsets", "earliest")
53 | .load()
54 |
55 | val abrisConfig = AbrisConfig
56 | .fromConfluentAvro
57 | .downloadReaderSchemaByLatestVersion
58 | .andTopicNameStrategy(kafkaTopicName)
59 | .usingSchemaRegistry("http://localhost:8081")
60 |
61 | import za.co.absa.abris.avro.functions.from_avro
62 | val deserialized = dataFrame.select(from_avro(col("value"), abrisConfig) as "data")
63 |
64 | deserialized.printSchema()
65 |
66 | deserialized
67 | .writeStream
68 | .format("console")
69 | .trigger(Trigger.ProcessingTime(5.seconds))
70 | .option("truncate", "false")
71 | .start()
72 | .awaitTermination()
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/registry/SchemaSubjectSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | import org.scalatest.BeforeAndAfter
20 | import org.scalatest.flatspec.AnyFlatSpec
21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
22 |
23 | class SchemaSubjectSpec extends AnyFlatSpec with BeforeAndAfter {
24 |
25 | private val schema = AvroSchemaUtils.parse(
26 | """{
27 | |"type": "record",
28 | |"name": "Blah",
29 | |"namespace" : "Bleh",
30 | |"fields": [{ "name": "name", "type": "string" }]
31 | |}""".stripMargin)
32 |
33 | behavior of "SchemaSubject"
34 |
35 | it should "retrieve the correct subject name for TopicName strategy" in {
36 |
37 | assertResult("foo_topic-value")(
38 | SchemaSubject.usingTopicNameStrategy("foo_topic").asString
39 | )
40 |
41 | assertResult("foo_topic-key")(
42 | SchemaSubject.usingTopicNameStrategy("foo_topic", isKey = true).asString
43 | )
44 | }
45 |
46 | it should "retrieve the correct subject name for RecordName strategy" in {
47 |
48 | assertResult("foo_namespace.foo_name")(
49 | SchemaSubject.usingRecordNameStrategy("foo_name", "foo_namespace").asString
50 | )
51 | }
52 |
53 | it should "retrieve the correct subject name for TopicRecordName strategy" in {
54 |
55 | assertResult("topic-foo_namespace.foo_name")(
56 | SchemaSubject.usingTopicRecordNameStrategy("topic", "foo_name", "foo_namespace").asString
57 | )
58 | }
59 |
60 | it should "retrieve name and namespace for RecordName strategy from schema" in {
61 |
62 | assertResult("Bleh.Blah")(
63 | SchemaSubject.usingRecordNameStrategy(schema).asString
64 | )
65 | }
66 |
67 | it should "retrieve name and namespace for TopicRecordName strategy from schema" in {
68 |
69 | assertResult("foo_topic-Bleh.Blah")(
70 | SchemaSubject.usingTopicRecordNameStrategy("foo_topic", schema).asString
71 | )
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/config/InternalFromAvroConfigSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.config
18 |
19 | import org.apache.avro.SchemaBuilder
20 | import org.scalatest.flatspec.AnyFlatSpec
21 | import org.scalatest.matchers.should.Matchers
22 |
23 | class InternalFromAvroConfigSpec extends AnyFlatSpec with Matchers {
24 |
25 | import InternalFromAvroConfigSpec._
26 |
27 | behavior of "InternalFromAvroConfig"
28 |
29 | it should "convert and provide all set properties" in {
30 | val config = FromAvroConfig()
31 | .withReaderSchema(avroReaderSchema.toString)
32 | .withWriterSchema(avroWriterSchema.toString)
33 |
34 | val intConfig = new InternalFromAvroConfig(config.abrisConfig())
35 |
36 | val readerSchema = intConfig.readerSchema
37 | readerSchema.getName shouldBe "reader"
38 | readerSchema.getNamespace shouldBe "test_namespace"
39 | readerSchema.getFields.size() shouldBe 2
40 |
41 | val writerSchema = intConfig.writerSchema.get
42 | writerSchema.getName shouldBe "writer"
43 | writerSchema.getNamespace shouldBe "test_namespace"
44 | writerSchema.getFields.size() shouldBe 1
45 | }
46 |
47 | it should "return None for optional properties that were not set" in {
48 | val config = FromAvroConfig()
49 | .withReaderSchema(avroReaderSchema.toString)
50 |
51 | val intConfig = new InternalFromAvroConfig(config.abrisConfig())
52 |
53 | intConfig.writerSchema shouldBe None
54 | }
55 | }
56 |
57 | object InternalFromAvroConfigSpec {
58 |
59 | val avroReaderSchema = SchemaBuilder
60 | .record("reader")
61 | .namespace("test_namespace")
62 | .fields()
63 | .name("int").`type`().intType().noDefault()
64 | .name("bytes_name").`type`().stringType().noDefault()
65 | .endRecord()
66 |
67 | val avroWriterSchema = SchemaBuilder
68 | .record("writer")
69 | .namespace("test_namespace")
70 | .fields()
71 | .name("int").`type`().intType().noDefault()
72 | .endRecord()
73 | }
74 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/config/FromAvroConfigSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.config
18 |
19 | import org.scalatest.flatspec.AnyFlatSpec
20 | import org.scalatest.matchers.should.Matchers
21 | import za.co.absa.abris.config.FromAvroConfig.Key
22 |
23 | class FromAvroConfigSpec extends AnyFlatSpec with Matchers {
24 |
25 | behavior of "FromAvroConfig"
26 |
27 | it should "provide all set configurations" in {
28 | val dummySchemaConverter = "dummy"
29 | val config = FromAvroConfig()
30 | .withWriterSchema("foo")
31 | .withReaderSchema("bar")
32 | .withSchemaConverter(dummySchemaConverter)
33 | .withSchemaRegistryConfig(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "url"))
34 |
35 | val map = config.abrisConfig()
36 | map(Key.WriterSchema) shouldBe "foo"
37 | map(Key.ReaderSchema) shouldBe "bar"
38 | map(Key.SchemaConverter) shouldBe dummySchemaConverter
39 |
40 | config.schemaRegistryConf().get(AbrisConfig.SCHEMA_REGISTRY_URL) shouldBe "url"
41 | }
42 |
43 | it should "support the legacy constructor and methods" in {
44 | val config = new FromAvroConfig("foo", Some(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "url")))
45 |
46 | config.schemaString() shouldBe "foo"
47 | config.schemaRegistryConf().get(AbrisConfig.SCHEMA_REGISTRY_URL) shouldBe "url"
48 |
49 | val map = config.abrisConfig()
50 | map(Key.ReaderSchema) shouldBe "foo"
51 | }
52 |
53 | it should "throw when validation fails" in {
54 | val config = FromAvroConfig()
55 |
56 | val thrown = intercept[IllegalArgumentException] {
57 | config.validate()
58 | }
59 | thrown.getMessage.contains(Key.ReaderSchema) shouldBe true
60 |
61 | val config2 = FromAvroConfig()
62 | .withWriterSchema("foo")
63 | .withReaderSchema("bar")
64 | .withSchemaRegistryConfig(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "url"))
65 |
66 | val thrown2 = intercept[IllegalArgumentException] {
67 | config2.validate()
68 | }
69 | thrown2.getMessage.contains(Key.WriterSchema) shouldBe true
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/functions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro
18 |
19 | import org.apache.spark.sql.Column
20 | import za.co.absa.abris.avro.sql.{AvroDataToCatalyst, CatalystDataToAvro}
21 | import za.co.absa.abris.config.{AbrisConfig, FromAvroConfig, ToAvroConfig}
22 |
23 |
24 | // scalastyle:off: object.name
25 | object functions {
26 | // scalastyle:on: object.name
27 | // scalastyle:off: method.name
28 |
29 | /**
30 |    * Converts a Spark SQL column into a column of Avro-serialized binary data.
31 | * @param column containing data for conversion
32 | * @param config Abris configuration
33 | * @return column containing data in avro format
34 | */
35 | def to_avro(column: Column, config: ToAvroConfig): Column = {
36 | config.validate()
37 |
38 | new Column(CatalystDataToAvro(
39 | column.expr,
40 | config.abrisConfig()
41 | ))
42 | }
43 |
44 | /**
45 |    * Converts a Spark SQL column into a column of Avro-serialized binary data, using the provided schema.
46 | * @param column containing data for conversion
47 | * @param schema avro schema
48 | * @return column containing data in avro format
49 | */
50 | def to_avro(column: Column, schema: String): Column = {
51 | val config = AbrisConfig
52 | .toSimpleAvro
53 | .provideSchema(schema)
54 |
55 | to_avro(column, config)
56 | }
57 |
58 | /**
59 |    * Converts a column of Avro-serialized binary data into its Spark SQL representation.
60 | * @param column column containing data for conversion
61 | * @param config Abris configuration
62 | * @return column with converted data
63 | */
64 | def from_avro(column: Column, config: FromAvroConfig): Column = {
65 | config.validate()
66 |
67 | new Column(AvroDataToCatalyst(
68 | column.expr,
69 | config.abrisConfig(),
70 | config.schemaRegistryConf()
71 | ))
72 | }
73 |
74 | /**
75 |    * Converts a column of Avro-serialized binary data into its Spark SQL representation, using the provided schema.
76 | * @param column column containing data for conversion
77 | * @param schema avro schema
78 | * @return column with converted data
79 | */
80 | def from_avro(column: Column, schema: String): Column = {
81 | val config = AbrisConfig
82 | .fromSimpleAvro
83 | .provideSchema(schema)
84 |
85 | from_avro(column, config)
86 | }
87 |
88 | // scalastyle:on: method.name
89 | }
90 |
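91 | // Illustrative usage (a sketch only; see the documentation and za.co.absa.abris.examples for runnable examples):
92 | //   val avroFrame  = dataFrame.select(to_avro(struct(dataFrame.columns.map(col): _*), toAvroConfig) as "value")
93 | //   val dataFrame2 = avroFrame.select(from_avro(col("value"), fromAvroConfig) as "data")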
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/registry/SchemaSubject.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.registry
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.avro.Schema.Type
21 |
22 | /**
23 |  * Represents a Confluent Schema Registry subject created using one of the naming strategies
24 | *
25 | * https://docs.confluent.io/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work
26 | *
27 | */
28 | class SchemaSubject(val asString: String) {
29 | override def toString: String = asString
30 | }
31 |
32 | object SchemaSubject {
33 |
34 | def usingTopicNameStrategy(
35 | topicName: String,
36 | isKey: Boolean = false
37 | ): SchemaSubject = {
38 | val suffix = if (isKey) "-key" else "-value"
39 | new SchemaSubject(topicName + suffix)
40 | }
41 |
42 | def usingRecordNameStrategy(
43 | recordName: String,
44 | recordNamespace: String
45 | ): SchemaSubject = {
46 | val dummySchema = createDummySchema(recordName, recordNamespace)
47 | new SchemaSubject(getRecordName(dummySchema))
48 | }
49 |
50 | def usingRecordNameStrategy(
51 | schema: Schema
52 | ): SchemaSubject = {
53 | new SchemaSubject(getRecordName(schema))
54 | }
55 |
56 | def usingTopicRecordNameStrategy(
57 | topicName: String,
58 | recordName: String,
59 | recordNamespace: String
60 | ): SchemaSubject = {
61 | val dummySchema = createDummySchema(recordName, recordNamespace)
62 | new SchemaSubject(topicName + "-" + getRecordName(dummySchema))
63 | }
64 |
65 | def usingTopicRecordNameStrategy(
66 | topicName: String,
67 | schema: Schema
68 | ): SchemaSubject = {
69 | new SchemaSubject(topicName + "-" + getRecordName(schema))
70 | }
71 |
72 | private def getRecordName(schema: Schema): String =
73 | if (schema.getType == Type.RECORD) {
74 | schema.getFullName
75 | } else {
76 | throw new IllegalArgumentException(s"Schema must be of type RECORD not ${schema.getType}")
77 | }
78 |
79 | private def createDummySchema(name: String, namespace: String) =
80 | Schema.createRecord(name, "", namespace, false)
81 | }
82 |
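83 | // Illustrative outputs of the strategies above (values as asserted in SchemaSubjectSpec):
84 | //   usingTopicNameStrategy("foo_topic")                                -> "foo_topic-value"
85 | //   usingTopicNameStrategy("foo_topic", isKey = true)                  -> "foo_topic-key"
86 | //   usingRecordNameStrategy("foo_name", "foo_namespace")               -> "foo_namespace.foo_name"
87 | //   usingTopicRecordNameStrategy("topic", "foo_name", "foo_namespace") -> "topic-foo_namespace.foo_name"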
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/errors/PermissiveRecordExceptionHandlerSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import org.apache.spark.sql.SparkSession
20 | import org.apache.spark.sql.avro.{AbrisAvroDeserializer, SchemaConverters}
21 | import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
22 | import org.apache.spark.sql.types.{DataType, StructType}
23 | import org.scalatest.flatspec.AnyFlatSpec
24 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
25 | import za.co.absa.abris.examples.data.generation.TestSchemas
26 |
27 | class PermissiveRecordExceptionHandlerSpec extends AnyFlatSpec {
28 |
29 | private val spark = SparkSession
30 | .builder()
31 | .appName("unitTest")
32 | .master("local[1]")
33 | .config("spark.driver.bindAddress", "localhost")
34 | .config("spark.ui.enabled", "false")
35 | .getOrCreate()
36 |
37 | it should "receive empty dataframe row back" in {
38 |
39 | val expectedNestedFieldSchema = new StructType()
40 | .add("int", "int")
41 | .add("long", "long")
42 | val expectedNestedStructSchema = new StructType()
43 | .add("name", "string")
44 | .add("nested", expectedNestedFieldSchema)
45 |
46 | val expectedNestedFieldInternalRow = new SpecificInternalRow(expectedNestedFieldSchema)
47 | expectedNestedFieldInternalRow.setNullAt(0)
48 | expectedNestedFieldInternalRow.setNullAt(1)
49 |
50 | val expectedNestedStructInternalRow = new SpecificInternalRow(expectedNestedStructSchema)
51 | expectedNestedStructInternalRow.setNullAt(0)
52 | expectedNestedStructInternalRow.setNullAt(1)
53 |
54 |     // actual
55 | val deserializationExceptionHandler = new PermissiveRecordExceptionHandler()
56 | val schema = AvroSchemaUtils.parse(TestSchemas.NATIVE_SIMPLE_OUTER_SCHEMA)
57 | val dataType: DataType = SchemaConverters.toSqlType(schema).dataType
58 |
59 | val actualResult = deserializationExceptionHandler
60 | .handle(new Exception, new AbrisAvroDeserializer(schema, dataType), schema)
61 |
62 | assert(actualResult == expectedNestedStructInternalRow)
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/sql/CatalystDataToAvro.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import org.apache.avro.generic.GenericDatumWriter
20 | import org.apache.avro.io.{BinaryEncoder, EncoderFactory}
21 | import org.apache.spark.sql.avro.AbrisAvroSerializer
22 | import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
23 | import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
24 | import org.apache.spark.sql.types.{BinaryType, DataType}
25 | import za.co.absa.abris.avro.read.confluent.ConfluentConstants
26 | import za.co.absa.abris.config.InternalToAvroConfig
27 |
28 | import java.io.ByteArrayOutputStream
29 | import java.nio.ByteBuffer
30 |
31 | private[abris] case class CatalystDataToAvro(
32 | child: Expression,
33 | abrisConfig: Map[String,Any]
34 | ) extends UnaryExpression {
35 |
36 | override def dataType: DataType = BinaryType
37 |
38 | @transient private lazy val config = new InternalToAvroConfig(abrisConfig)
39 |
40 | @transient private lazy val serializer: AbrisAvroSerializer =
41 | new AbrisAvroSerializer(child.dataType, config.schema, child.nullable)
42 |
43 | @transient private lazy val writer =
44 | new GenericDatumWriter[Any](config.schema)
45 |
46 | @transient private var encoder: BinaryEncoder = _
47 |
48 | @transient private lazy val out = new ByteArrayOutputStream
49 |
50 | override def nullSafeEval(input: Any): Any = {
51 | out.reset()
52 |
53 | config.schemaId.foreach { id =>
54 | attachSchemaId(id, out)
55 | }
56 |
57 | encoder = EncoderFactory.get().directBinaryEncoder(out, encoder)
58 | val avroData = serializer.serialize(input)
59 | writer.write(avroData, encoder)
60 | encoder.flush()
61 | out.toByteArray
62 | }
63 |
64 | private def attachSchemaId(id: Int, outStream: ByteArrayOutputStream) = {
65 | outStream.write(ConfluentConstants.MAGIC_BYTE)
66 | outStream.write(ByteBuffer.allocate(ConfluentConstants.SCHEMA_ID_SIZE_BYTES).putInt(id).array())
67 | }
68 |
69 | override def prettyName: String = "to_avro"
70 |
71 | override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
72 | val expr = ctx.addReferenceObj("this", this)
73 | defineCodeGen(ctx, ev, input =>
74 | s"(byte[]) $expr.nullSafeEval($input)")
75 | }
76 |
77 | override protected def withNewChildInternal(newChild: Expression): Expression =
78 | copy(child = newChild)
79 | }
80 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/errors/SpecificRecordExceptionHandlerSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.errors
18 |
19 | import all_types.test.{NativeSimpleOuter, Nested}
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.avro.{AbrisAvroDeserializer, SchemaConverters}
22 | import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
23 | import org.apache.spark.sql.types.{DataType, StructType}
24 | import org.apache.spark.unsafe.types.UTF8String
25 | import org.scalatest.flatspec.AnyFlatSpec
26 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
27 | import za.co.absa.abris.examples.data.generation.TestSchemas
28 |
29 | class SpecificRecordExceptionHandlerSpec extends AnyFlatSpec {
30 |
31 | private val spark = SparkSession
32 | .builder()
33 | .appName("unitTest")
34 | .master("local[1]")
35 | .config("spark.driver.bindAddress", "localhost")
36 | .config("spark.ui.enabled", "false")
37 | .getOrCreate()
38 |
39 | it should "receive empty dataframe row back" in {
40 | // provided
41 | val providedDefaultRecord = NativeSimpleOuter.newBuilder()
42 | .setName("name")
43 | .setNested(Nested.newBuilder()
44 | .setInt$(1)
45 | .setLong$(1)
46 | .build())
47 | .build()
48 |
49 | // expected
50 | val expectedNestedFieldSchema = new StructType()
51 | .add("int", "int")
52 | .add("long", "long")
53 | val expectedNestedStructSchema = new StructType()
54 | .add("name", "string")
55 | .add("nested", expectedNestedFieldSchema)
56 |
57 | val expectedNestedFieldInternalRow = new SpecificInternalRow(expectedNestedFieldSchema)
58 | expectedNestedFieldInternalRow.setInt(0, 1)
59 | expectedNestedFieldInternalRow.setLong(1, 1L)
60 |
61 | val expectedNestedStructInternalRow = new SpecificInternalRow(expectedNestedStructSchema)
62 | expectedNestedStructInternalRow.update(0, UTF8String.fromString("name"))
63 | expectedNestedStructInternalRow.update(1, expectedNestedFieldInternalRow)
64 |
65 |     // actual
66 | val deserializationExceptionHandler = new SpecificRecordExceptionHandler(providedDefaultRecord)
67 | val schema = AvroSchemaUtils.parse(TestSchemas.NATIVE_SIMPLE_OUTER_SCHEMA)
68 | val dataType: DataType = SchemaConverters.toSqlType(schema).dataType
69 |
70 | val actualResult = deserializationExceptionHandler
71 | .handle(new Exception, new AbrisAvroDeserializer(schema, dataType), schema)
72 |
73 | assert(actualResult == expectedNestedStructInternalRow)
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/data/generation/ComplexRecordsGenerator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples.data.generation
18 |
19 | import org.apache.spark.sql.Row
20 | import za.co.absa.commons.annotation.DeveloperApi
21 |
22 | import java.nio.ByteBuffer
23 | import scala.util.Random
24 | import scala.collection.JavaConverters._
25 |
26 | /**
27 | * This class provides methods to generate example/test data.
28 | * Not part of the library core.
29 | */
30 | // scalastyle:off magic.number
31 | @DeveloperApi
32 | object ComplexRecordsGenerator {
33 |
34 | private val random = new Random()
35 |
36 | val usedAvroSchema: String = TestSchemas.NATIVE_COMPLETE_SCHEMA
37 |
38 | def generateUnparsedRows(howMany: Int): List[Row] = {
39 | val result = new Array[Row](howMany)
40 | for (i <- 0 until howMany) {
41 | result(i) = Row.fromSeq(getDataSeq())
42 | }
43 | result.toList
44 | }
45 |
46 | private def getDataSeq(): Seq[Any] = {
47 | val map = Map[String, Seq[Long]](
48 | "entry1" -> randomSeqOfLongs(20),
49 | "entry2" -> randomSeqOfLongs(30))
50 | Seq(
51 | ByteBuffer.wrap(randomString(20).getBytes).array(),
52 | randomString(30),
53 |       java.lang.Integer.valueOf(random.nextInt()),
54 |       java.lang.Long.valueOf(random.nextLong()),
55 |       java.lang.Double.valueOf(random.nextDouble()),
56 |       java.lang.Float.valueOf(random.nextFloat()),
57 |       java.lang.Boolean.valueOf(random.nextBoolean()),
58 | randomSeqOfStrings(10, 15),
59 | map,
60 | new FixedString(randomString(40)).bytes())
61 | }
62 |
63 | private def randomListOfLongs(listSize: Int) = {
64 | val array = new Array[Long](listSize)
65 | for (i <- 0 until listSize) {
66 | array(i) = random.nextLong()
67 | }
68 | new java.util.ArrayList(array.toList.asJava)
69 | }
70 |
71 | private def randomSeqOfLongs(listSize: Int) = {
72 | randomListOfLongs(listSize).asScala.toSeq
73 | }
74 |
75 | private def randomListOfStrings(listSize: Int, stringLength: Int) = {
76 | val array = new Array[String](listSize)
77 | for (i <- 0 until listSize) {
78 | array(i) = randomString(stringLength)
79 | }
80 | new java.util.ArrayList(array.toList.asJava)
81 | }
82 |
83 | private def randomSeqOfStrings(listSize: Int, stringLength: Int) = {
84 | randomListOfStrings(listSize, stringLength).asScala
85 | }
86 |
87 | private def randomString(length: Int): String = {
88 | val randomStream = Random.alphanumeric
89 | randomStream.take(length).mkString
90 | }
91 | }
92 | // scalastyle:on magic.number
93 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/ConfluentKafkaAvroWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples
18 |
19 | import org.apache.spark.sql.functions.{col, struct}
20 | import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession}
21 | import za.co.absa.abris.avro.format.SparkAvroConversions
22 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
23 | import za.co.absa.abris.config.AbrisConfig
24 | import za.co.absa.abris.examples.data.generation.ComplexRecordsGenerator
25 | import za.co.absa.abris.examples.utils.CompatibleRowEncoder
26 |
27 |
28 | object ConfluentKafkaAvroWriter {
29 |
30 | val kafkaTopicName = "test_topic"
31 |
32 | val dummyDataRows = 5
33 | val dummyDataPartitions = 1
34 |
35 | def main(args: Array[String]): Unit = {
36 |
37 | val spark = SparkSession
38 | .builder()
39 |       .appName("WriterJob")
40 | .master("local[2]")
41 | .getOrCreate()
42 |
43 | spark.sparkContext.setLogLevel("INFO")
44 |
45 | val dataFrame = generateRandomDataFrame(spark)
46 |
47 | dataFrame.show(false)
48 |
49 | val schemaString = ComplexRecordsGenerator.usedAvroSchema
50 |
51 | // to serialize all columns in dataFrame we need to put them in a spark struct
52 | val allColumns = struct(dataFrame.columns.map(col).toIndexedSeq: _*)
53 |
54 | val abrisConfig = AbrisConfig
55 | .toConfluentAvro
56 | .provideAndRegisterSchema(schemaString)
57 | .usingTopicNameStrategy(kafkaTopicName)
58 | .usingSchemaRegistry("http://localhost:8081")
59 |
60 | import za.co.absa.abris.avro.functions.to_avro
61 |
62 | val avroFrame = dataFrame.select(to_avro(allColumns, abrisConfig) as "value")
63 |
64 | avroFrame
65 | .write
66 | .format("kafka")
67 | .option("kafka.bootstrap.servers", "localhost:9092")
68 | .option("topic", kafkaTopicName)
69 | .save()
70 | }
71 |
72 | private def generateRandomDataFrame(spark: SparkSession): DataFrame = {
73 | import spark.implicits._
74 |
75 | implicit val encoder: Encoder[Row] = getEncoder
76 |
77 | val rows = createRows(dummyDataRows)
78 | spark.sparkContext.parallelize(rows, dummyDataPartitions).toDF()
79 | }
80 |
81 | private def createRows(howMany: Int): List[Row] = {
82 | ComplexRecordsGenerator.generateUnparsedRows(howMany)
83 | }
84 |
85 | private def getEncoder: Encoder[Row] = {
86 | val avroSchema = AvroSchemaUtils.parse(ComplexRecordsGenerator.usedAvroSchema)
87 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema)
88 | CompatibleRowEncoder.apply(sparkSchema)
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/avro/AbrisAvroDeserializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package org.apache.spark.sql.avro
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.spark.sql.types.DataType
21 | import za.co.absa.commons.annotation.DeveloperApi
22 |
25 |
26 | /**
27 |  * Compatibility layer handling different versions of AvroDeserializer.
28 |  * Living in this package also allows access to the package-private class.
29 |  */
30 | @DeveloperApi
31 | class AbrisAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) {
32 |
33 | private val deserializer = {
34 | val clazz = classOf[AvroDeserializer]
35 | val schemaClz = classOf[Schema]
36 | val dataTypeClz = classOf[DataType]
37 | val stringClz = classOf[String]
38 | val booleanClz = classOf[Boolean]
39 |
40 | clazz.getConstructors.collectFirst {
41 | case currCtor if currCtor.getParameterTypes sameElements
42 | Array(schemaClz, dataTypeClz) =>
43 | // Spark 2.4
44 | currCtor.newInstance(rootAvroType, rootCatalystType)
45 | case currCtor if currCtor.getParameterTypes sameElements
46 | Array(schemaClz, dataTypeClz, stringClz) =>
47 | // Spark 3.0 - Spark 3.5.0 (including)
48 | currCtor.newInstance(rootAvroType, rootCatalystType, "LEGACY")
49 | case currCtor if currCtor.getParameterTypes sameElements
50 | Array(schemaClz, dataTypeClz, stringClz, booleanClz) =>
51 | // Spark 3.5.1 - 3.5.2
52 | currCtor.newInstance(rootAvroType, rootCatalystType, "LEGACY", false: java.lang.Boolean)
53 | case currCtor if currCtor.getParameterTypes.toSeq sameElements
54 | Array(schemaClz, dataTypeClz, stringClz, booleanClz, stringClz) =>
55 | // Spark 4.0.0-SNAPSHOT+
56 | currCtor.newInstance(rootAvroType, rootCatalystType, "LEGACY", false: java.lang.Boolean, "")
57 | } match {
58 | case Some(value: AvroDeserializer) =>
59 | value
60 | case _ =>
61 | throw new NoSuchMethodException(
62 | s"""Supported constructors for AvroDeserializer are:
63 | |${clazz.getConstructors.toSeq.mkString(System.lineSeparator())}""".stripMargin)
64 | }
65 |
66 | }
67 |
68 | private val ru = scala.reflect.runtime.universe
69 | private val rm = ru.runtimeMirror(getClass.getClassLoader)
70 | private val classSymbol = rm.classSymbol(deserializer.getClass)
71 | private val deserializeMethodSymbol = classSymbol.info.decl(ru.TermName("deserialize")).asMethod
72 | private val deserializeMethod = rm.reflect(deserializer).reflectMethod(deserializeMethodSymbol)
73 |
74 | def deserialize(data: Any): Any = {
75 | deserializeMethod(data) match {
76 | case Some(x) => x // Spark 3.1 +
77 | case x => x // Spark 3.0 -
78 | }
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/read/confluent/SchemaManagerFactory.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 |
18 | package za.co.absa.abris.avro.read.confluent
19 |
20 | import org.apache.spark.internal.Logging
21 | import za.co.absa.abris.avro.registry.{AbrisRegistryClient, ConfluentMockRegistryClient, ConfluentRegistryClient}
22 | import za.co.absa.abris.config.AbrisConfig
23 | import za.co.absa.commons.annotation.DeveloperApi
24 |
25 | import scala.collection.concurrent
26 | import scala.util.Try
27 | import scala.util.control.NonFatal
28 |
29 | /**
30 |  * This thread-safe factory creates [[SchemaManager]] instances and also manages the underlying
31 |  * registry client instances, caching the references so that a new client does not have to be
32 |  * created on every call (the clients themselves cache schemas).
33 |  * The factory also allows using a custom registry client via the abris.registryClient.class property.
34 |  */
35 | object SchemaManagerFactory extends Logging {
36 |
37 | private val clientInstances: concurrent.Map[Map[String,String], AbrisRegistryClient] = concurrent.TrieMap()
38 |
39 | @DeveloperApi
40 | def addSRClientInstance(configs: Map[String, String], client: AbrisRegistryClient): Unit = {
41 | clientInstances.put(configs, client)
42 | }
43 |
44 | @DeveloperApi
45 | def resetSRClientInstance(): Unit = {
46 | clientInstances.clear()
47 | }
48 |
49 | def create(configs: Map[String,String]): SchemaManager = new SchemaManager(getOrCreateRegistryClient(configs))
50 |
51 | private def getOrCreateRegistryClient(configs: Map[String,String]): AbrisRegistryClient = {
52 | clientInstances.getOrElseUpdate(configs, {
53 | if (configs.contains(AbrisConfig.REGISTRY_CLIENT_CLASS)) {
54 | try {
55 | val clazz = Class.forName(configs(AbrisConfig.REGISTRY_CLIENT_CLASS))
56 | logInfo(msg = s"Configuring new Schema Registry client of type '${clazz.getCanonicalName}'")
57 | Try(clazz.getConstructor(classOf[Map[String, String]]).newInstance(configs))
58 | .recover { case _: NoSuchMethodException =>
59 | clazz.getConstructor().newInstance()
60 | }
61 | .get
62 | .asInstanceOf[AbrisRegistryClient]
63 | } catch {
64 | case e if NonFatal(e) =>
65 | throw new IllegalArgumentException("Custom registry client must implement AbrisRegistryClient " +
66 | "and have parameterless or Map[String, String] accepting constructor", e)
67 | }
68 | } else if (configs(AbrisConfig.SCHEMA_REGISTRY_URL).startsWith("mock://")) {
69 | logInfo(msg = s"Configuring new Schema Registry client of type ConfluentMockRegistryClient")
70 | new ConfluentMockRegistryClient()
71 | } else {
72 | logInfo(msg = s"Configuring new Schema Registry client of type ConfluentRegistryClient")
73 | new ConfluentRegistryClient(configs)
74 | }
75 | })
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/documentation/python-documentation.md:
--------------------------------------------------------------------------------
1 | # Using ABRiS with Python and PySpark
2 | ABRiS is a Scala library, but with a bit of effort it can be used in Python as well.
3 | 
4 | We provide some examples below, but most of the documentation is written for Scala, so if you need more, check the Scala examples and convert the code to Python.
5 | 
6 | PySpark uses [Py4J](https://www.py4j.org/) as an interface between Scala and Python, so you can check its documentation to get a better idea of how to translate the code,
7 | but mostly it should be clear from the following examples.
8 |
9 | ### Examples
10 |
11 | ```python
12 | from pyspark import SparkContext
13 | from pyspark.sql.column import Column, _to_java_column
14 |
15 | def from_avro(col, config):
16 | """
17 | avro deserialize
18 |
19 | :param col (PySpark column / str): column name "key" or "value"
20 | :param config (za.co.absa.abris.config.FromAvroConfig): abris config, generated from abris_config helper function
21 | :return: PySpark Column
22 | """
23 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm
24 | abris_avro = jvm_gateway.za.co.absa.abris.avro
25 |
26 | return Column(abris_avro.functions.from_avro(_to_java_column(col), config))
27 |
28 | def from_avro_abris_config(config_map, topic, is_key):
29 | """
30 | Create from avro abris config with a schema url
31 |
32 | :param config_map (dict[str, str]): configuration map to pass to deserializer, ex: {'schema.registry.url': 'http://localhost:8081'}
33 | :param topic (str): kafka topic
34 | :param is_key (bool): boolean
35 | :return: za.co.absa.abris.config.FromAvroConfig
36 | """
37 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm
38 | scala_map = jvm_gateway.PythonUtils.toScalaMap(config_map)
39 |
40 | return jvm_gateway.za.co.absa.abris.config \
41 | .AbrisConfig \
42 | .fromConfluentAvro() \
43 | .downloadReaderSchemaByLatestVersion() \
44 | .andTopicNameStrategy(topic, is_key) \
45 | .usingSchemaRegistry(scala_map)
46 |
47 | def to_avro(col, config):
48 | """
49 | avro serialize
50 | :param col (PySpark column / str): column name "key" or "value"
51 | :param config (za.co.absa.abris.config.ToAvroConfig): abris config, generated from abris_config helper function
52 | :return: PySpark Column
53 | """
54 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm
55 | abris_avro = jvm_gateway.za.co.absa.abris.avro
56 |
57 | return Column(abris_avro.functions.to_avro(_to_java_column(col), config))
58 |
59 | def to_avro_abris_config(config_map, topic, is_key):
60 | """
61 | Create to avro abris config with a schema url
62 |
63 | :param config_map (dict[str, str]): configuration map to pass to the serializer, ex: {'schema.registry.url': 'http://localhost:8081'}
64 | :param topic (str): kafka topic
65 | :param is_key (bool): boolean
66 | :return: za.co.absa.abris.config.ToAvroConfig
67 | """
68 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm
69 | scala_map = jvm_gateway.PythonUtils.toScalaMap(config_map)
70 |
71 | return jvm_gateway.za.co.absa.abris.config \
72 | .AbrisConfig \
73 | .toConfluentAvro() \
74 | .downloadSchemaByLatestVersion() \
75 | .andTopicNameStrategy(topic, is_key) \
76 | .usingSchemaRegistry(scala_map)
77 | ```
78 |
79 | A complete example that loads the data from Kafka:
80 |
81 | ```python
82 | df = spark.read.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "topic_name").load()
83 |
84 | from_avro_abris_settings = from_avro_abris_config({'schema.registry.url': 'http://localhost:8081'}, 'topic_name', False)
85 | df2 = df.withColumn("parsed", from_avro("value", from_avro_abris_settings))
86 | df2.show()
87 | ```
88 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/parsing/utils/AvroSchemaUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.parsing.utils
18 |
19 | import org.apache.avro.{Schema, SchemaBuilder}
20 | import org.apache.commons.io.IOUtils
21 | import org.apache.hadoop.conf.Configuration
22 | import org.apache.hadoop.fs.{FileSystem, Path}
23 | import org.apache.spark.sql.DataFrame
24 | import org.apache.spark.sql.avro.SchemaConverters
25 | import org.apache.spark.sql.functions.struct
26 |
27 | import java.nio.charset.Charset
28 | import scala.collection.JavaConverters._
29 |
30 | /**
31 |  * This object provides utility methods for working with Avro schemas.
32 | */
33 | object AvroSchemaUtils {
34 |
35 | /**
36 | * Parses a plain Avro schema into an org.apache.avro.Schema implementation.
37 | */
38 | def parse(schema: String): Schema = new Schema.Parser().parse(schema)
39 |
40 | /**
41 | * Loads an Avro org.apache.avro.Schema from the path.
42 | */
43 | def load(path: String): Schema = {
44 | parse(loadPlain(path))
45 | }
46 |
47 | /**
48 | * Loads an Avro's plain schema from the path.
49 | */
50 | def loadPlain(path: String): String = {
51 | if (path == null) {
52 |       throw new IllegalArgumentException("Null path provided. " +
53 |         "Please make sure you provide a valid path to an existing Avro schema located in a file system.")
54 | }
55 | val hdfs = FileSystem.get(new Configuration())
56 | val stream = hdfs.open(new Path(path))
57 | try IOUtils.readLines(stream, Charset.defaultCharset).asScala.mkString("\n") finally stream.close()
58 | }
59 |
60 | def toAvroSchema(
61 | dataFrame: DataFrame,
62 | columnName: String,
63 | recordName: String = "topLevelRecord",
64 | nameSpace: String = ""
65 | ): Schema = {
66 | val fieldIndex = dataFrame.schema.fieldIndex(columnName)
67 | val field = dataFrame.schema.fields(fieldIndex)
68 |
69 | SchemaConverters.toAvroType(field.dataType, field.nullable, recordName, nameSpace)
70 | }
71 |
72 | def toAvroSchema(
73 | dataFrame: DataFrame,
74 | columnNames: Seq[String]
75 | ): Schema = toAvroSchema(dataFrame, columnNames, "topLevelRecord", "")
76 |
77 | def toAvroSchema(
78 | dataFrame: DataFrame,
79 | columnNames: Seq[String],
80 | recordName: String,
81 | nameSpace: String
82 | ): Schema = {
83 | val allColumns = struct(columnNames.map(dataFrame.col): _*)
84 | val expression = allColumns.expr
85 |
86 | SchemaConverters.toAvroType(expression.dataType, expression.nullable, recordName, nameSpace)
87 | }
88 |
89 | def toAvroSchema(
90 | dataFrame: DataFrame
91 | ): Schema = toAvroSchema(dataFrame, "topLevelRecord", "")
92 |
93 | def toAvroSchema(
94 | dataFrame: DataFrame,
95 | recordName: String,
96 | nameSpace: String
97 | ): Schema =
98 | toAvroSchema(dataFrame, dataFrame.columns.toIndexedSeq, recordName, nameSpace)
99 |
100 | def wrapSchema(schema: Schema, name: String, namespace: String): Schema = {
101 | SchemaBuilder.record(name)
102 | .namespace(namespace)
103 | .fields().name(schema.getName).`type`(schema).noDefault()
104 | .endRecord()
105 | }
106 |
107 | }
108 |
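109 | // Illustrative (a sketch): wrapSchema nests an existing schema inside a new record; e.g. wrapping Avro's
110 | // "int" schema with name "wrapper" and namespace "ns" yields a record "ns.wrapper" whose single field,
111 | // named after the wrapped schema ("int"), has type int.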
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/parsing/utils/AvroSchemaUtilsSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.parsing.utils
18 |
19 | import org.apache.avro.Schema.Type
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.functions.{col, struct}
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest.matchers.should.Matchers
24 |
25 |
26 | class AvroSchemaUtilsSpec extends AnyFlatSpec with Matchers {
27 |
28 | private val spark = SparkSession
29 | .builder()
30 | .appName("unitTest")
31 | .master("local[2]")
32 | .config("spark.driver.bindAddress", "localhost")
33 | .config("spark.ui.enabled", "false")
34 | .getOrCreate()
35 |
36 | import spark.implicits._
37 |
38 |
39 | val dataFrame = Seq((1, "bat", true), (2, "mouse", false)).toDF("number", "word", "bool")
40 |
41 |
42 | behavior of "AvroSchemaUtils"
43 |
44 | it should "convert the schema of whole dataframe" in {
45 |
46 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame)
47 |
48 | schema.getType shouldBe Type.RECORD
49 | schema.getFullName shouldBe "topLevelRecord"
50 | schema.getFields.get(0).schema().getType shouldBe Type.INT
51 | schema.getFields.get(1).schema().getType shouldBe Type.UNION
52 | schema.getFields.get(2).schema().getType shouldBe Type.BOOLEAN
53 |
54 |
55 | val schema2 = AvroSchemaUtils.toAvroSchema(dataFrame, "foo", "bar")
56 | schema2.getType shouldBe Type.RECORD
57 | schema2.getFullName shouldBe "bar.foo"
58 | }
59 |
60 | it should "convert the schema of multiple selected columns" in {
61 |
62 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame, Seq("bool","number"))
63 |
64 | schema.getType shouldBe Type.RECORD
65 | schema.getFullName shouldBe "topLevelRecord"
66 | schema.getFields.size() shouldBe 2
67 | schema.getFields.get(0).schema().getType shouldBe Type.BOOLEAN
68 | schema.getFields.get(1).schema().getType shouldBe Type.INT
69 |
70 |
71 | val schema2 = AvroSchemaUtils.toAvroSchema(dataFrame, Seq("bool","number"), "foo", "bar")
72 | schema2.getType shouldBe Type.RECORD
73 | schema2.getFullName shouldBe "bar.foo"
74 | }
75 |
76 | it should "convert the schema of one selected simple column" in {
77 |
78 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame, "bool")
79 |
80 | schema.getType shouldBe Type.BOOLEAN
81 | schema.getFullName shouldBe "boolean"
82 |
83 | val schema2 = AvroSchemaUtils.toAvroSchema(dataFrame, "bool", "foo", "bar")
84 | schema2.getType shouldBe Type.BOOLEAN
85 | schema2.getFullName shouldBe "boolean"
86 | }
87 |
88 | it should "convert the schema of one selected struct column" in {
89 |
90 | val structDataFrame = dataFrame.select(struct(col("bool"), col("number")) as "str")
91 |
92 | val schema = AvroSchemaUtils.toAvroSchema(structDataFrame, "str")
93 |
94 | schema.getType shouldBe Type.RECORD
95 | schema.getFullName shouldBe "topLevelRecord"
96 | schema.getFields.size() shouldBe 2
97 | schema.getFields.get(0).schema().getType shouldBe Type.BOOLEAN
98 | schema.getFields.get(1).schema().getType shouldBe Type.INT
99 |
100 | val schema2 = AvroSchemaUtils.toAvroSchema(structDataFrame, "str", "foo", "bar")
101 |
102 | schema2.getType shouldBe Type.RECORD
103 | schema2.getFullName shouldBe "bar.foo"
104 | schema2.getFields.size() shouldBe 2
105 | schema2.getFields.get(0).schema().getType shouldBe Type.BOOLEAN
106 | schema2.getFields.get(1).schema().getType shouldBe Type.INT
107 | }
108 |
109 | }
110 |
--------------------------------------------------------------------------------
/documentation/vanilla-avro-documentation.md:
--------------------------------------------------------------------------------
1 | # ABRiS - vanilla Avro documentation
2 |
3 | - [Avro to Spark](#avro-to-spark)
4 | - [Spark to Avro](#spark-to-avro)
5 | ## Avro to Spark
6 |
7 | ### Providing Avro schema
8 | ```scala
9 | import za.co.absa.abris.avro.functions.from_avro
10 |
11 | def readAvro(dataFrame: DataFrame, schemaString: String): DataFrame = {
12 |
13 | dataFrame.select(from_avro(col("value"), schemaString) as 'data).select("data.*")
14 | }
15 | ```
16 | In this example, the Avro binary data are in ```dataFrame```, inside the **value** column.
17 | The Avro schema is provided as the string ```schemaString```.
18 | 
19 | After the Avro data are converted to the Spark SQL representation, they are stored in the **data** column.
20 | This column is immediately flattened in the next select, so the result will be a ```DataFrame``` containing only the deserialized Avro data.
21 |
22 | ### Using Schema Registry
23 | First we need to provide the Schema Registry configuration:
24 | ```scala
25 | val fromAvroConfig1: FromAvroConfig = AbrisConfig
26 | .fromSimpleAvro
27 | .downloadSchemaById(66)
28 | .usingSchemaRegistry("http://registry-url")
29 |
30 | // or
31 | val fromAvroConfig2: FromAvroConfig = AbrisConfig
32 | .fromSimpleAvro
33 | .downloadSchemaByLatestVersion
34 | .andTopicRecordNameStrategy("topic", "recordName", "namespace")
35 | .usingSchemaRegistry("http://registry-url")
36 |
37 | // or
38 | val fromAvroConfig3: FromAvroConfig = AbrisConfig
39 | .fromSimpleAvro
40 | .downloadSchemaByVersion(3)
41 | .andTopicNameStrategy("topicFoo", isKey=true) // Use isKey=true for the key schema and isKey=false for the value schema
42 | .usingSchemaRegistry("http://registry-url")
43 |
44 | // ...
45 | ```
46 | There are several ways to configure this.
47 | Each step in the configurator offers you several options; just choose what you want to do.
48 | At the end you should get an instance of `FromAvroConfig` that you can use like this:
49 |
50 |
51 | ```scala
52 | import za.co.absa.abris.avro.functions.from_avro
53 |
54 | def readAvro(dataFrame: DataFrame, fromAvroConfig: FromAvroConfig): DataFrame = {
55 |
56 | dataFrame.select(from_avro(col("value"), fromAvroConfig) as 'data).select("data.*")
57 | }
58 | ```
59 |
60 | ## Spark to Avro
61 |
62 | ### Providing Avro schema
63 | ```scala
64 | import za.co.absa.abris.avro.functions.to_avro
65 |
66 | def writeAvro(dataFrame: DataFrame, schemaString: String): DataFrame = {
67 |
68 | val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*)
69 | dataFrame.select(to_avro(allColumns, schemaString) as 'value)
70 | }
71 | ```
72 | If you provide the Avro schema as a second argument, ABRiS will use it to convert Spark data into Avro.
73 | Please make sure that the data types in the Spark DataFrame and in the schema are compatible.
74 |
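If you do not have a schema at hand, one option (a sketch) is to generate it from the DataFrame itself and reuse the `writeAvro` helper above; see the [Confluent documentation](confluent-avro-documentation.md#generate-schema-from-data-and-register) for details on schema generation:

```scala
import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils

// generate an Avro schema covering all columns and serialize with it
val schemaString = AvroSchemaUtils.toAvroSchema(dataFrame).toString
val avroDf = writeAvro(dataFrame, schemaString)
```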
75 | ### Using schema registry
76 | If you want to use Schema Registry you need to provide a configuration:
77 |
78 | ```scala
79 | val toAvroConfig1: ToAvroConfig = AbrisConfig
80 | .toSimpleAvro
81 | .provideAndRegisterSchema(schemaString)
82 | .usingTopicRecordNameStrategy("fooTopic") // record name is taken from the schema
83 | .usingSchemaRegistry("http://registry-url")
84 |
85 | // or
86 | val toAvroConfig2: ToAvroConfig = AbrisConfig
87 | .toSimpleAvro
88 | .downloadSchemaByVersion(4)
89 | .andTopicNameStrategy("fooTopic")
90 | .usingSchemaRegistry("http://registry-url")
91 |
92 | // or
93 | val toAvroConfig3: ToAvroConfig = AbrisConfig
94 | .toSimpleAvro
95 | .downloadSchemaById(66)
96 | .usingSchemaRegistry("http://registry-url")
97 |
98 | // ...
99 | ```
100 | There are several ways to configure this.
101 | Each step in the configurator offers you some options; just choose what you want to do.
102 | At the end you should get an instance of `ToAvroConfig` that you can use like this:
103 | ```scala
104 | import za.co.absa.abris.avro.functions.to_avro
105 |
106 | def writeAvro(dataFrame: DataFrame, toAvroConfig: ToAvroConfig): DataFrame = {
107 |
108 | val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*)
109 | dataFrame.select(to_avro(allColumns, toAvroConfig) as 'value)
110 | }
111 | ```
112 |
113 | ### Generate schema from data and register
114 |
115 | See [here](confluent-avro-documentation.md#generate-schema-from-data-and-register)
116 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/read/confluent/SchemaManagerFactorySpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.read.confluent
18 |
19 | import org.scalatest.BeforeAndAfterEach
20 | import org.scalatest.flatspec.AnyFlatSpec
21 | import za.co.absa.abris.avro.registry.{AbrisRegistryClient, ConfluentMockRegistryClient, ConfluentRegistryClient, TestRegistryClient}
22 | import za.co.absa.abris.config.AbrisConfig
23 |
24 | import scala.reflect.runtime.{universe => ru}
25 |
26 | class SchemaManagerFactorySpec extends AnyFlatSpec with BeforeAndAfterEach {
27 |
28 | private val schemaRegistryConfig1 = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://dummy")
29 |
30 | private val schemaRegistryConfig2 = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://dummy_sr_2")
31 |
32 | private val schemaRegistryConfig3 = Map(
33 | AbrisConfig.SCHEMA_REGISTRY_URL -> "http://dummy_sr_2",
34 | AbrisConfig.REGISTRY_CLIENT_CLASS -> "za.co.absa.abris.avro.registry.TestRegistryClient"
35 | )
36 |
37 | override def beforeEach(): Unit = {
38 | super.beforeEach()
39 | SchemaManagerFactory.resetSRClientInstance() // Reset factory state
40 | }
41 |
42 | behavior of "SchemaManagerFactory"
43 |
44 | it should "create a schema manager for the given Schema Registry configs " +
45 | "and cache the Schema Registry Client reference for subsequent usages" in {
46 | val schemaManagerRef1 = SchemaManagerFactory.create(schemaRegistryConfig1)
47 | val schemaManagerRef2 = SchemaManagerFactory.create(schemaRegistryConfig1)
48 |
49 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader)
50 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm
51 |
52 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
53 | val res2 = m.reflect(schemaManagerRef2).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
54 | assert(res1.eq(res2))
55 | }
56 |
57 | it should "create a schema manager with a different schema registry client depending on the configs passed" in {
58 | val schemaManagerRef1 = SchemaManagerFactory.create(schemaRegistryConfig1)
59 | val schemaManagerRef2 = SchemaManagerFactory.create(schemaRegistryConfig2)
60 |
61 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader)
62 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm
63 |
64 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
65 | val res2 = m.reflect(schemaManagerRef2).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
66 | assert(!res1.eq(res2))
67 | }
68 |
69 | it should "create a schema manager with a custom schema registry client depending on the configs passed" in {
70 | val schemaManagerRef1 = SchemaManagerFactory.create(schemaRegistryConfig1)
71 | val schemaManagerRef3 = SchemaManagerFactory.create(schemaRegistryConfig3)
72 |
73 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader)
74 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm
75 |
76 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
77 | val res3 = m.reflect(schemaManagerRef3).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
78 | assert(!res1.eq(res3))
79 | assert(res1.isInstanceOf[ConfluentRegistryClient])
80 | assert(res3.isInstanceOf[TestRegistryClient])
81 | }
82 |
83 | it should "create mock client when url starts with mock://" in {
84 | val config = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "mock://dummy_sr_2")
85 |
86 | val schemaManagerRef1 = SchemaManagerFactory.create(config)
87 |
88 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader)
89 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm
90 |
91 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient]
92 | assert(res1.isInstanceOf[ConfluentMockRegistryClient])
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/format/SparkAvroConversionsSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.format
18 |
19 | import org.apache.avro.Schema.Type
20 | import org.apache.avro.SchemaBuilder
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest.matchers.should.Matchers
24 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
25 | import za.co.absa.abris.examples.data.generation.TestSchemas
26 |
27 | import scala.collection.JavaConverters._
28 |
29 | class SparkAvroConversionsSpec extends AnyFlatSpec with Matchers {
30 |
31 | // scalastyle:off magic.number
32 |
33 | private val structType = StructType(
34 | Seq(
35 | StructField("int1", IntegerType, false),
36 | StructField("long1", LongType, false),
37 | StructField("map1", new MapType(StringType, IntegerType, false), false),
38 | StructField("array1", new ArrayType(LongType, false), false),
39 | StructField("struct2", StructType(
40 | Seq(
41 | StructField("string2", StringType, true),
42 | StructField("string3", StringType, false)
43 | )
44 | ), false),
45 | StructField("double1", DoubleType, false),
46 | StructField("struct3", StructType(
47 | Seq(
48 | StructField("int2", IntegerType, false),
49 | StructField("float1", FloatType, false)
50 | )
51 | ), false),
52 | StructField("bytes", BinaryType, false)
53 | )
54 | )
55 |
56 | behavior of "SparkAvroConversions"
57 |
58 | it should "convert Avro schemas to SQL types" in {
59 | val schema = AvroSchemaUtils.parse(TestSchemas.COMPLEX_SCHEMA_SPEC)
60 | val sql = SparkAvroConversions.toSqlType(schema)
61 | val schemaFromSql = SparkAvroConversions.toAvroSchema(sql, schema.getName, schema.getNamespace)
62 |
63 | schema.getFields.asScala.foreach(field =>
64 | assert(schema.getField(field.name).toString == schemaFromSql.getField(field.name).toString))
65 | }
66 |
67 | it should "convert SQL types to Avro schemas" in {
68 |     val schemaName = "test_name"
69 |     val schemaNamespace = "test_namespace"
70 |
71 | val schema = SparkAvroConversions.toAvroSchema(structType, schemaName, schemaNamespace)
72 |
73 | assert(schema.getName == schemaName)
74 | assert(schema.getNamespace == schemaNamespace)
75 | assert(schema.getField("int1").schema().getType == Type.INT)
76 | assert(schema.getField("long1").schema().getType == Type.LONG)
77 | assert(schema.getField("map1").schema().getType == Type.MAP)
78 | assert(schema.getField("array1").schema().getType == Type.ARRAY)
79 | assert(schema.getField("struct2").schema().getType == Type.RECORD)
80 | assert(schema.getField("double1").schema().getType == Type.DOUBLE)
81 | assert(schema.getField("struct3").schema().getType == Type.RECORD)
82 | assert(schema.getField("bytes").schema().getType == Type.BYTES)
83 |
84 | val map1 = schema.getField("map1").schema()
85 | assert(map1.getValueType.getType == Type.INT)
86 |
87 | val array1 = schema.getField("array1").schema()
88 | assert(array1.getElementType.getType == Type.LONG)
89 |
90 | val struct2 = schema.getField("struct2").schema()
91 | assert(struct2.getField("string2").schema().getType == Type.UNION) // nullable fields are "unioned" with null
92 | assert(struct2.getField("string3").schema().getType == Type.STRING)
93 |
94 | val struct3 = schema.getField("struct3").schema()
95 | assert(struct3.getField("int2").schema().getType == Type.INT)
96 | assert(struct3.getField("float1").schema().getType == Type.FLOAT)
97 | }
98 |
99 | it should "convert fixed and bytes type" in {
100 |
101 | val avroSchema = SchemaBuilder
102 | .record("test_record")
103 | .namespace("test_namespace")
104 | .fields()
105 | .name("fixed_name").`type`().fixed("fixed_name").size(3).noDefault()
106 | .name("bytes_name").`type`().bytesType().noDefault()
107 | .endRecord()
108 |
109 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema)
110 |
111 | sparkSchema.fields(0) shouldBe StructField("fixed_name", BinaryType, false)
112 | sparkSchema.fields(1) shouldBe StructField("bytes_name", BinaryType, false)
113 | }
114 |
115 | // scalastyle:on magic.number
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/utils/ExamplesUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples.utils
18 |
19 | import java.io.FileInputStream
20 | import java.util.Properties
21 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter}
22 | import org.apache.spark.sql.{DataFrameWriter, Row, SparkSession}
23 | import org.slf4j.LoggerFactory
24 | import za.co.absa.commons.annotation.DeveloperApi
25 |
26 | import scala.collection.JavaConverters._
27 |
28 | @DeveloperApi
29 | object ExamplesUtils {
30 |
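  // Properties prefixed with "option." (e.g. "option.kafka.bootstrap.servers", an
  // illustrative key) are forwarded to the reader/writer with the prefix stripped.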
31 | private val OPTION_PREFIX = "option."
32 |
33 | private val logger = LoggerFactory.getLogger(ExamplesUtils.getClass)
34 |
35 | def checkArgs(args: Array[String]): Unit = {
36 | if (args.length != 1) {
37 | logger.error("No properties file specified.")
38 | System.exit(1)
39 | }
40 | }
41 |
42 | def getSparkSession(properties: Properties, jobNameProp: String, jobMasterProp: String,
43 | logLevelProp: String): SparkSession = {
44 |
45 | val spark = SparkSession
46 | .builder()
47 | .appName(properties.getProperty(jobNameProp))
48 | .master(properties.getProperty(jobMasterProp))
49 | .getOrCreate()
50 |
51 | spark.sparkContext.setLogLevel(properties.getProperty(logLevelProp))
52 | spark
53 | }
54 |
55 | def loadProperties(args: Array[String]): Properties = {
56 | logger.debug("Loading properties from: " + args(0))
57 | val properties = ExamplesUtils.loadProperties(args(0))
58 | for (key <- properties.asScala.keysIterator) {
59 | logger.debug(s"\t$key = ${properties.getProperty(key)}")
60 | }
61 | properties
62 | }
63 |
64 | def loadProperties(path: String): Properties = {
65 | val properties = new Properties()
66 | properties.load(new FileInputStream(path))
67 | properties
68 | }
69 |
70 | private def getKeys(properties: Properties) = {
71 | properties.keySet().asScala
72 | .filter(key => key.toString.startsWith(OPTION_PREFIX))
73 | .map(key => (key.toString, key.toString.drop(OPTION_PREFIX.length())))
74 | }
75 |
76 | implicit class ReaderStreamOptions(stream: DataStreamReader) {
77 | def addOptions(properties: Properties): DataStreamReader = {
78 | getKeys(properties)
79 | .foreach(keys => {
80 | logger.debug(s"DataStreamReader: setting option: ${keys._2} = ${properties.getProperty(keys._1)}")
81 | stream.option(keys._2, properties.getProperty(keys._1))
82 | })
83 | stream
84 | }
85 | }
86 |
87 | implicit class WriterRowOptions(stream: DataFrameWriter[Row]) {
88 | def addOptions(properties: Properties): DataFrameWriter[Row] = {
89 | getKeys(properties)
90 | .foreach(keys => {
91 | logger.debug(s"DataFrameWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}")
92 | stream.option(keys._2, properties.getProperty(keys._1))
93 | })
94 | stream
95 | }
96 | }
97 |
98 | implicit class WriterOptions(stream: DataFrameWriter[Array[Byte]]) {
99 | def addOptions(properties: Properties): DataFrameWriter[Array[Byte]] = {
100 | getKeys(properties)
101 | .foreach(keys => {
102 | logger.debug(s"DataFrameWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}")
103 | stream.option(keys._2, properties.getProperty(keys._1))
104 | })
105 | stream
106 | }
107 | }
108 |
109 | implicit class WriterRowStreamOptions(stream: DataStreamWriter[Row]) {
110 | def addOptions(properties: Properties): DataStreamWriter[Row] = {
111 | getKeys(properties)
112 | .foreach(keys => {
113 | logger.debug(s"DataStreamWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}")
114 | stream.option(keys._2, properties.getProperty(keys._1))
115 | })
116 | stream
117 | }
118 | }
119 |
120 | implicit class WriterStreamOptions(stream: DataStreamWriter[Array[Byte]]) {
121 | def addOptions(properties: Properties): DataStreamWriter[Array[Byte]] = {
122 | getKeys(properties)
123 | .foreach(keys => {
124 | logger.debug(s"DataStreamWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}")
125 | stream.option(keys._2, properties.getProperty(keys._1))
126 | })
127 | stream
128 | }
129 | }
130 | }
131 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/read/confluent/SchemaManagerSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.read.confluent
18 |
19 | import org.scalatest.BeforeAndAfter
20 | import org.scalatest.flatspec.AnyFlatSpec
21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
22 | import za.co.absa.abris.avro.registry.{ConfluentMockRegistryClient, LatestVersion, NumVersion, SchemaSubject}
23 | import za.co.absa.abris.config.AbrisConfig
24 |
25 | class SchemaManagerSpec extends AnyFlatSpec with BeforeAndAfter {
26 |
27 | private val schema = AvroSchemaUtils.parse(
28 | "{\"type\": \"record\", \"name\": \"Blah\", \"fields\": [{ \"name\": \"name\", \"type\": \"string\" }]}")
29 |
30 |
31 | val recordByteSchema = AvroSchemaUtils.parse("""{
32 | "namespace": "all-types.test",
33 | "type": "record",
34 | "name": "record_name",
35 | "fields":[
36 | {"name": "int", "type": ["int", "null"] }
37 | ]
38 | }""")
39 |
40 | val recordEvolvedByteSchema1 = AvroSchemaUtils.parse("""{
41 | "namespace": "all-types.test",
42 | "type": "record",
43 | "name": "record_name",
44 | "fields":[
45 | {"name": "int", "type": ["int", "null"] },
46 | {"name": "favorite_color", "type": "string", "default": "green"}
47 | ]
48 | }""")
49 |
50 | val recordEvolvedByteSchema2 = AvroSchemaUtils.parse("""{
51 | "namespace": "all-types.test",
52 | "type": "record",
53 | "name": "record_name",
54 | "fields":[
55 | {"name": "int", "type": ["int", "null"] },
56 | {"name": "favorite_color", "type": "string", "default": "green"},
57 | {"name": "favorite_badger", "type": "string", "default": "Honey badger"}
58 | ]
59 | }""")
60 |
61 | val registryUrl = "dummyUrl"
62 | val registryConfig = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> registryUrl)
63 |
64 | behavior of "SchemaManager"
65 |
66 |   it should "return the correct schema by id or subject and version" in {
67 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient())
68 | val subject1 = SchemaSubject.usingTopicNameStrategy("foo")
69 | val subject2 = SchemaSubject.usingTopicNameStrategy("bar")
70 |
71 | val id1 = schemaManager.register(subject1, schema) // id1, version 1
72 | val id2 = schemaManager.register(subject2, recordByteSchema) // id2, version 1
73 | val id3 = schemaManager.register(subject2, recordEvolvedByteSchema1) // id3, version 2
74 | val id4 = schemaManager.register(subject2, recordEvolvedByteSchema2) // id4, version 3
75 |
76 | assert(schemaManager.getSchemaById(id1) == schema)
77 | assert(schemaManager.getSchemaById(id2) == recordByteSchema)
78 | assert(schemaManager.getSchemaById(id3) == recordEvolvedByteSchema1)
79 | assert(schemaManager.getSchemaById(id4) == recordEvolvedByteSchema2)
80 |
81 | assert(schemaManager.getSchemaBySubjectAndVersion(subject1, NumVersion(1)) == schema)
82 | assert(schemaManager.getSchemaBySubjectAndVersion(subject1, LatestVersion()) == schema)
83 |
84 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, NumVersion(1)) == recordByteSchema)
85 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, NumVersion(2)) == recordEvolvedByteSchema1)
86 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, NumVersion(3)) == recordEvolvedByteSchema2)
87 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, LatestVersion()) == recordEvolvedByteSchema2)
88 | }
89 |
90 | it should "find already existing schema" in {
91 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient())
92 |
93 | val subject = SchemaSubject.usingTopicNameStrategy("dummy_topic")
94 |
95 | schemaManager.register(subject, recordByteSchema)
96 | schemaManager.register(subject, recordEvolvedByteSchema1)
97 | schemaManager.register(subject, recordEvolvedByteSchema2)
98 |
99 | val maybeId = schemaManager.findEquivalentSchema(recordEvolvedByteSchema1, subject)
100 |
101 | val resultSchema = schemaManager.getSchemaById(maybeId.get)
102 |
103 | assert(resultSchema.equals(recordEvolvedByteSchema1))
104 | }
105 |
106 | "exists" should "return true when schema is in registry" in {
107 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient())
108 |
109 | val subject = SchemaSubject.usingTopicNameStrategy("dummy_topic")
110 | schemaManager.register(subject, recordByteSchema)
111 | val schemaExists = schemaManager.exists(subject)
112 |
113 |     assert(schemaExists)
114 | }
115 |
116 | "exists" should "return false when schema is not in registry" in {
117 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient())
118 | val schemaExists = schemaManager.exists(SchemaSubject.usingTopicNameStrategy("foo"))
119 |
120 |     assert(!schemaExists)
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/read/confluent/SchemaManager.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.read.confluent
18 |
19 | import java.security.InvalidParameterException
20 |
21 | import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException
22 | import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
23 | import org.apache.avro.Schema
24 | import org.apache.spark.internal.Logging
25 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
26 | import za.co.absa.abris.avro.registry._
27 |
28 | import scala.collection.JavaConverters._
29 | import scala.util.{Failure, Success, Try}
30 |
31 | /**
32 | * This class provides methods to integrate with remote schemas through Schema Registry.
33 | *
34 | * This can be considered an "enriched" facade to the Schema Registry client.
35 | *
36 | * This is NOT THREAD SAFE, which means that multiple threads operating on this object
37 |  * (e.g. calling 'configureSchemaRegistry' with different parameters) would operate
38 |  * on the same Schema Registry client, thus leading to inconsistent behavior.
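 *
 * Example (a sketch, using the in-memory mock client from the tests):
 * {{{
 * val manager = new SchemaManager(new ConfluentMockRegistryClient())
 * val subject = SchemaSubject.usingTopicNameStrategy("some_topic")
 * val id      = manager.register(subject, schema) // given an org.apache.avro.Schema
 * val fetched = manager.getSchemaById(id)         // download it back
 * }}}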
39 | */
40 | class SchemaManager(schemaRegistryClient: AbrisRegistryClient) extends Logging {
41 |
42 | def this(schemaRegistryClient: SchemaRegistryClient) = this(new ConfluentRegistryClient(schemaRegistryClient))
43 |
44 | def getSchema(coordinate: SchemaCoordinate): Schema = coordinate match {
45 | case IdCoordinate(id) => getSchemaById(id)
46 | case SubjectCoordinate(subject, version) => getSchemaBySubjectAndVersion(subject, version)
47 | }
48 |
49 | def getSchemaById(schemaId: Int): Schema = schemaRegistryClient.getById(schemaId)
50 |
51 | /**
52 |    * @param version - NumVersion(versionNumber) or LatestVersion() for the latest version
53 | */
54 | def getSchemaBySubjectAndVersion(subject: SchemaSubject, version: SchemaVersion): Schema = {
55 | val metadata = getSchemaMetadataBySubjectAndVersion(subject, version)
56 |
57 | AvroSchemaUtils.parse(metadata.getSchema)
58 | }
59 |
60 | /**
61 |    * @param version - NumVersion(versionNumber) or LatestVersion() for the latest version
62 | */
63 | def getSchemaMetadataBySubjectAndVersion(subject: SchemaSubject, version: SchemaVersion): SchemaMetadata =
64 | version match {
65 | case NumVersion(versionInt) => schemaRegistryClient.getSchemaMetadata(subject.asString, versionInt)
66 | case LatestVersion() => schemaRegistryClient.getLatestSchemaMetadata(subject.asString)
67 | }
68 |
69 | def register(subject: SchemaSubject, schemaString: String): Int =
70 | register(subject, AvroSchemaUtils.parse(schemaString))
71 |
72 | /**
73 | * Register new schema for a subject.
74 | *
75 |    * @throws InvalidParameterException when the new schema is not compatible with the already existing one.
76 | * @return registered schema id
77 | */
78 | def register(subject: SchemaSubject, schema: Schema): Int = {
79 | if (!exists(subject) || isCompatible(schema, subject)) {
80 |       logInfo(s"SchemaManager.register: Registering schema for subject: $subject")
81 | schemaRegistryClient.register(subject.asString, schema)
82 | } else {
83 | throw new InvalidParameterException(s"Schema registration failed. Schema for subject:'$subject' " +
84 | s"already exists and it is not compatible with schema you are trying to register.")
85 | }
86 | }
87 |
88 | /**
89 |    * Checks if a given subject exists in Schema Registry.
90 | */
91 | def exists(subject: SchemaSubject): Boolean = {
92 | Try(schemaRegistryClient.getLatestSchemaMetadata(subject.asString)) match {
93 | case Success(_) => true
94 | case Failure(e: RestClientException) if e.getStatus == 404 => false
95 | case Failure(e) => throw e
96 | }
97 | }
98 |
99 | /**
100 | * Checks if a new schema is compatible with the latest schema registered for a given subject.
101 | */
102 | private def isCompatible(newSchema: Schema, subject: SchemaSubject): Boolean = {
103 | schemaRegistryClient.testCompatibility(subject.asString, newSchema)
104 | }
105 |
106 | def getAllSchemasWithMetadata(subject: SchemaSubject): List[SchemaMetadata] = {
107 | val versions = Try(schemaRegistryClient.getAllVersions(subject.asString)) match {
108 | case Success(l) => l.asScala.toList
109 | case Failure(e: RestClientException) if e.getStatus == 404 => List.empty[Integer]
110 | case Failure(e) => throw e
111 | }
112 |
113 | versions.map(schemaRegistryClient.getSchemaMetadata(subject.asString, _))
114 | }
115 |
116 | def findEquivalentSchema(schema: Schema, subject: SchemaSubject): Option[Int] = {
117 | val maybeIdenticalSchemaMetadata =
118 | getAllSchemasWithMetadata(subject)
119 | .find{
120 | sm => AvroSchemaUtils.parse(sm.getSchema).equals(schema)
121 | }
122 |
123 | maybeIdenticalSchemaMetadata.map(_.getId)
124 | }
125 |
126 | def getIfExistsOrElseRegisterSchema(schema: Schema, subject: SchemaSubject): Int = {
127 | val maybeSchemaId = findEquivalentSchema(schema, subject)
128 | maybeSchemaId.getOrElse(register(subject, schema))
129 | }
130 | }
131 |
--------------------------------------------------------------------------------
/documentation/confluent-avro-documentation.md:
--------------------------------------------------------------------------------
1 | # ABRiS - Confluent Avro documentation
2 |
3 | - [Avro to Spark](#Avro-to-Spark)
4 | - [Spark to Avro](#spark-to-avro)
5 |
6 | The main difference between Confluent Avro and vanilla Avro is whether the schema id is expected in the Avro payload.
7 | In Confluent Avro the payload always has to start with the schema id (the sketch below shows the layout).
8 |
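The payload layout is one magic byte, a 4-byte schema id, and then the Avro binary body. The sketch below mirrors the header parsing ABRiS performs internally in `AvroDataToCatalyst` (the constants correspond to `ConfluentConstants`):

```scala
import java.nio.ByteBuffer

// extract the schema id from a Confluent-compliant payload (a sketch)
def schemaIdOf(payload: Array[Byte]): Int = {
  val buffer = ByteBuffer.wrap(payload)
  if (buffer.get() != 0x0) {      // ConfluentConstants.MAGIC_BYTE
    throw new IllegalArgumentException("Unknown magic byte!")
  }
  buffer.getInt()                 // big-endian 4-byte schema id
}
```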
9 | ## Avro to Spark
10 | When converting from Confluent Avro to Spark, there may be two schemas: the *reader schema* and the *writer schema*.
11 | - The writer schema is the one used to convert the data to Avro; it is identified by the id in the Avro payload.
12 | - The reader schema is the one specified by you.
13 |
14 | The two schemas must be compatible (see the example below).
15 |
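For example, the following pair of schemas (borrowed from this project's schema-evolution tests) is compatible: the reader schema adds a field with a default value, so records written with the old schema still deserialize, and the new column is filled with the default.

```scala
// writer schema: the data were serialized with only an "int" field
val writerSchema = """{
  "namespace": "all-types.test", "type": "record", "name": "record_name",
  "fields": [ {"name": "int", "type": ["int", "null"]} ]
}"""

// reader schema: the added field has a default, so old records read back with "green"
val readerSchema = """{
  "namespace": "all-types.test", "type": "record", "name": "record_name",
  "fields": [
    {"name": "int", "type": ["int", "null"]},
    {"name": "favorite_color", "type": "string", "default": "green"}
  ]
}"""
```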
16 | There are several ways to get the reader schema; most of them are shown in the following config examples:
17 |
18 | ```scala
19 | // Provide an avro schema as json
20 | val fromAvroConfig1: FromAvroConfig = AbrisConfig
21 | .fromConfluentAvro
22 | .provideReaderSchema("{ ...schema json...}")
23 | .usingSchemaRegistry("http://registry-url")
24 |
25 | // Specify a schema id
26 | val fromAvroConfig2: FromAvroConfig = AbrisConfig
27 | .fromConfluentAvro
28 | .downloadReaderSchemaById(66)
29 | .usingSchemaRegistry("http://registry-url")
30 |
31 | // Use the schema with the latest version.
32 | val fromAvroConfig3: FromAvroConfig = AbrisConfig
33 | .fromConfluentAvro
34 | .downloadReaderSchemaByLatestVersion
35 | .andTopicNameStrategy("topicName", isKey=true) // Use isKey=true for the key schema and isKey=false for the value schema
36 | .usingSchemaRegistry("http://registry-url")
37 | ```
38 | Once you have a `FromAvroConfig`, you just need to pass it to the ABRiS function:
39 | ```scala
40 | import za.co.absa.abris.avro.functions.from_avro
41 |
42 | def readAvro(dataFrame: DataFrame, fromAvroConfig: FromAvroConfig): DataFrame = {
43 |
44 | dataFrame.select(from_avro(col("value"), fromAvroConfig) as 'data).select("data.*")
45 | }
46 | ```
47 |
48 | ## Spark to Avro
49 | When converting data to Avro there is only one schema in play, but you have several options for providing it:
50 | - You can provide it as a string and let ABRiS register the schema for you.
51 | - You can specify a schema that is already in the registry. In that case ABRiS will download it and no registration is necessary.
52 |
53 | ABRiS registers the schema only if an identical schema is not already registered;
54 | in other words: register if not exists (see the sketch below).
55 |
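A simplified sketch of this logic, as implemented by `SchemaManager.getIfExistsOrElseRegisterSchema` (assuming a `schemaManager` created via `SchemaManagerFactory.create`):

```scala
import org.apache.avro.Schema
import za.co.absa.abris.avro.registry.SchemaSubject

def getIfExistsOrElseRegister(schema: Schema, subject: SchemaSubject): Int = {
  // look for a semantically identical schema among the already registered versions
  val existingId: Option[Int] = schemaManager.findEquivalentSchema(schema, subject)
  // reuse its id if found, otherwise register the schema and obtain a new id
  existingId.getOrElse(schemaManager.register(subject, schema))
}
```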
56 | Some configuration examples:
57 | ```scala
58 | // Provide avro schema string with record name strategy
59 | val toAvroConfig1: ToAvroConfig = AbrisConfig
60 | .toConfluentAvro
61 | .provideAndRegisterSchema("{ ...schema json... }")
62 | .usingRecordNameStrategy() // name and namespace taken from schema
63 | .usingSchemaRegistry("http://registry-url")
64 |
65 | // Provide avro schema string with topic name strategy
66 | val toAvroConfig2: ToAvroConfig = AbrisConfig
67 | .toConfluentAvro
68 | .provideAndRegisterSchema("{ ...schema json... }")
69 | .usingTopicNameStrategy("fooTopic") // Assumes value schema by default. Use isKey=true for the key schema
70 | .usingSchemaRegistry("http://registry-url")
71 |
72 | // Use already existing schema by id
73 | val toAvroConfig3: ToAvroConfig = AbrisConfig
74 | .toConfluentAvro
75 | .downloadSchemaById(66)
76 | .usingSchemaRegistry("http://registry-url")
77 |
78 | // Use latest version of already existing schema
79 | val toAvroConfig4: ToAvroConfig = AbrisConfig
80 | .toConfluentAvro
81 | .downloadSchemaByLatestVersion
82 | .andTopicNameStrategy("fooTopic")
83 | .usingSchemaRegistry("http://registry-url")
84 | ```
85 | Once you have a config you can use it like this:
86 | ```scala
87 | import za.co.absa.abris.avro.functions.to_avro
88 |
89 | def writeAvro(dataFrame: DataFrame, toAvroConfig: ToAvroConfig): DataFrame = {
90 |
91 | val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*)
92 | dataFrame.select(to_avro(allColumns, toAvroConfig) as 'value)
93 | }
94 | ```
95 |
96 | ### Generate schema from data and register
97 | Unlike previous versions of ABRiS, the schema is not automatically generated during evaluation for every record, but
98 | must be provided in the configuration.
99 |
100 | Given a dataframe, the Avro schema can be generated as shown below.
101 |
102 | ```scala
103 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils
104 |
105 | // generate schema for all columns in a dataframe
106 | AvroSchemaUtils.toAvroSchema(dataFrame)
107 |
108 | // generate schema for one column in a dataframe
109 | AvroSchemaUtils.toAvroSchema(dataFrame, "input")
110 |
111 | // generate schema for multiple columns in a dataframe
112 | AvroSchemaUtils.toAvroSchema(dataFrame, Seq("input", "numbers"))
113 | ```
114 | All of the above methods also have a variant where you can specify the `recordName` and `nameSpace` instead of using the default ones, for example:
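```scala
// generate a schema for one column with an explicit record name and namespace
// ("MyRecord" and "com.example" are just illustrative values)
AvroSchemaUtils.toAvroSchema(dataFrame, "input", "MyRecord", "com.example")
```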
115 |
116 | When the schema is generated, it can be registered, and the schema id obtained:
117 |
118 | ```scala
119 | val schemaRegistryClientConfig = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081")
120 | val schemaManager = SchemaManagerFactory.create(schemaRegistryClientConfig)
121 |
122 | // register schema with topic name strategy
123 | def registerSchema1(schema: Schema, schemaManager: SchemaManager): Int = {
124 | val subject = SchemaSubject.usingTopicNameStrategy("topic", isKey=true) // Use isKey=true for the key schema and isKey=false for the value schema
125 | schemaManager.register(subject, schema)
126 | }
127 |
128 | // register schema with record name strategy
129 | def registerSchema2(schema: Schema, schemaManager: SchemaManager): Int = {
130 | val subject = SchemaSubject.usingRecordNameStrategy(schema)
131 | schemaManager.register(subject, schema)
132 | }
133 |
134 | // register schema with topic record name strategy
135 | def registerSchema3(schema: Schema, schemaManager: SchemaManager): Int = {
136 | val subject = SchemaSubject.usingTopicRecordNameStrategy("topic", schema)
137 | schemaManager.register(subject, schema)
138 | }
139 | ```
140 |
141 | Once you have the schema id, you can pass it to the configuration:
142 | ```scala
143 | def createConfig(schemaId: Int): ToAvroConfig = AbrisConfig
144 | .toConfluentAvro
145 | .downloadSchemaById(schemaId)
146 | .usingSchemaRegistry("http://localhost:8081")
147 | ```
148 |
149 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/avro/sql/AvroDataToCatalyst.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import org.apache.avro.Schema
20 | import org.apache.avro.generic.GenericDatumReader
21 | import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
22 | import org.apache.kafka.common.errors.SerializationException
23 | import org.apache.spark.sql.avro.AbrisAvroDeserializer
24 | import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenerator, CodegenContext, ExprCode}
25 | import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
26 | import org.apache.spark.sql.types.{BinaryType, DataType}
27 | import za.co.absa.abris.avro.errors.DeserializationExceptionHandler
28 | import za.co.absa.abris.avro.read.confluent.{ConfluentConstants, SchemaManagerFactory}
29 | import za.co.absa.abris.config.InternalFromAvroConfig
30 |
31 | import java.nio.ByteBuffer
32 | import java.util.ServiceLoader
33 | import scala.collection.mutable
34 | import scala.util.control.NonFatal
35 | import scala.util.{Failure, Success, Try}
36 |
37 | private[abris] case class AvroDataToCatalyst(
38 | child: Expression,
39 | abrisConfig: Map[String, Any],
40 | schemaRegistryConf: Option[Map[String, String]]
41 | ) extends UnaryExpression with ExpectsInputTypes {
42 |
43 | @transient private lazy val schemaConverter = loadSchemaConverter(config.schemaConverter)
44 |
45 | override def inputTypes: Seq[BinaryType.type] = Seq(BinaryType)
46 |
47 | override lazy val dataType: DataType = schemaConverter.toSqlType(readerSchema)
48 |
49 | override def nullable: Boolean = true
50 |
51 | private val confluentCompliant = schemaRegistryConf.isDefined
52 |
53 | @transient private lazy val config = new InternalFromAvroConfig(abrisConfig)
54 |
55 | @transient private lazy val schemaManager = SchemaManagerFactory.create(schemaRegistryConf.get)
56 |
57 | @transient private lazy val readerSchema = config.readerSchema
58 |
59 | @transient private lazy val writerSchemaOption = config.writerSchema
60 |
61 | @transient private lazy val deserializationHandler: DeserializationExceptionHandler = config.deserializationHandler
62 |
63 | @transient private lazy val vanillaReader: GenericDatumReader[Any] =
64 | new GenericDatumReader[Any](writerSchemaOption.getOrElse(readerSchema), readerSchema)
65 |
66 | @transient private lazy val confluentReaderCache: mutable.HashMap[Int, GenericDatumReader[Any]] =
67 | new mutable.HashMap[Int, GenericDatumReader[Any]]()
68 |
69 | @transient private var decoder: BinaryDecoder = _
70 |
71 | @transient private lazy val deserializer = new AbrisAvroDeserializer(readerSchema, dataType)
72 |
73 | // Reused result object (usually of type IndexedRecord)
74 | @transient private var result: Any = _
75 |
76 | override def nullSafeEval(input: Any): Any = {
77 | val binary = input.asInstanceOf[Array[Byte]]
78 | try {
79 | val intermediateData = decode(binary)
80 |
81 | deserializer.deserialize(intermediateData)
82 |
83 | } catch {
84 | // There could be multiple possible exceptions here, e.g. java.io.IOException,
85 | // AvroRuntimeException, ArrayIndexOutOfBoundsException, etc.
86 | // To make it simple, catch all the exceptions here.
87 | case NonFatal(e) => deserializationHandler.handle(e, deserializer, readerSchema)
88 | }
89 | }
90 |
91 | override def prettyName: String = "from_avro"
92 |
93 | override protected def flatArguments: Iterator[Any] = {
94 | def isMap(x: Any) = x match {
95 | case _: Map[_, _] => true
96 | case _ => false
97 | }
98 |
99 | super.flatArguments.filter {
100 | case Some(x) if isMap(x) => false // don't print schemaRegistryConf
101 | case _ => true
102 | }
103 | }
104 |
105 | override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
106 | val expr = ctx.addReferenceObj("this", this)
107 | defineCodeGen(ctx, ev, input =>
108 | s"(${boxedType(ctx, dataType)})$expr.nullSafeEval($input)")
109 | }
110 |
111 | /**
112 | * The method boxedType(...) is placed in different classes in Spark 2.3 and 2.4
113 | */
114 | private def boxedType(ctx: CodegenContext, dataType: DataType): String = {
115 | val tryBoxedTypeSpark2_4 = Try {
116 | CodeGenerator
117 | .getClass
118 | .getMethod("boxedType", classOf[DataType])
119 | .invoke(CodeGenerator, dataType)
120 | }
121 |
122 | val boxedType = tryBoxedTypeSpark2_4.getOrElse {
123 | classOf[CodegenContext]
124 | .getMethod("boxedType", classOf[DataType])
125 | .invoke(ctx, dataType)
126 | }
127 |
128 | boxedType.asInstanceOf[String]
129 | }
130 |
131 | private def decode(payload: Array[Byte]): Any = if (confluentCompliant) {
132 | decodeConfluentAvro(payload)
133 | } else {
134 | decodeVanillaAvro(payload)
135 | }
136 |
137 | private def decodeConfluentAvro(payload: Array[Byte]): Any = {
138 |
139 | val buffer = ByteBuffer.wrap(payload)
140 | if (buffer.get() != ConfluentConstants.MAGIC_BYTE) {
141 | throw new SerializationException("Unknown magic byte!")
142 | }
143 |
144 | val schemaId = buffer.getInt()
145 |
146 | val start = buffer.position() + buffer.arrayOffset()
147 | val length = buffer.limit() - 1 - ConfluentConstants.SCHEMA_ID_SIZE_BYTES
148 | decoder = DecoderFactory.get().binaryDecoder(buffer.array(), start, length, decoder)
149 |
150 | val reader = confluentReaderCache.getOrElseUpdate(schemaId, {
151 | val writerSchema = downloadWriterSchema(schemaId)
152 | new GenericDatumReader[Any](writerSchema, readerSchema)
153 | })
154 |
155 | result = reader.read(result, decoder)
156 | result
157 | }
158 |
159 | private def downloadWriterSchema(id: Int): Schema = {
160 | Try(schemaManager.getSchemaById(id)) match {
161 | case Success(schema) => schema
162 | case Failure(e) => throw new RuntimeException("Not able to download writer schema", e)
163 | }
164 | }
165 |
166 | private def decodeVanillaAvro(payload: Array[Byte]): Any = {
167 |
168 | decoder = DecoderFactory.get().binaryDecoder(payload, 0, payload.length, decoder)
169 | result = vanillaReader.read(result, decoder)
170 | result
171 | }
172 |
173 | override protected def withNewChildInternal(newChild: Expression): Expression =
174 | copy(child = newChild)
175 |
176 | private def loadSchemaConverter(nameOpt: Option[String]) = {
177 | import scala.collection.JavaConverters._
178 | nameOpt match {
179 | case Some(name) => ServiceLoader.load(classOf[SchemaConverter]).asScala
180 | .find(c => c.shortName == name || c.getClass.getName == name)
181 | .getOrElse(throw new ClassNotFoundException(s"Could not find schema converter $name"))
182 | case None => new DefaultSchemaConverter()
183 | }
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/sql/SchemaEvolutionSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import org.apache.spark.sql.functions.{col, lit, struct}
20 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
21 | import org.scalatest.BeforeAndAfterEach
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest.matchers.should.Matchers
24 | import za.co.absa.abris.avro.format.SparkAvroConversions
25 | import za.co.absa.abris.avro.functions._
26 | import za.co.absa.abris.avro.read.confluent.SchemaManagerFactory
27 | import za.co.absa.abris.avro.registry.{ConfluentMockRegistryClient, SchemaSubject}
28 | import za.co.absa.abris.config.AbrisConfig
29 |
30 | class SchemaEvolutionSpec extends AnyFlatSpec with Matchers with BeforeAndAfterEach
31 | {
32 | private val spark = SparkSession
33 | .builder()
34 | .appName("unitTest")
35 | .master("local[2]")
36 | .config("spark.driver.bindAddress", "localhost")
37 | .config("spark.ui.enabled", "false")
38 | .getOrCreate()
39 |
40 | private val dummyUrl = "dummyUrl"
41 | private val schemaRegistryConfig = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl)
42 |
43 | override def beforeEach(): Unit = {
44 | val mockedSchemaRegistryClient = new ConfluentMockRegistryClient()
45 | SchemaManagerFactory.addSRClientInstance(schemaRegistryConfig, mockedSchemaRegistryClient)
46 | }
47 |
48 | val recordByteSchema = """{
49 | "namespace": "all-types.test",
50 | "type": "record",
51 | "name": "record_name",
52 | "fields":[
53 | {"name": "int", "type": ["int", "null"] }
54 | ]
55 | }"""
56 |
57 | val recordEvolvedByteSchema = """{
58 | "namespace": "all-types.test",
59 | "type": "record",
60 | "name": "record_name",
61 | "fields":[
62 | {"name": "int", "type": ["int", "null"] },
63 | {"name": "favorite_color", "type": "string", "default": "green"}
64 | ]
65 | }"""
66 |
67 | private def createTestData(avroSchema: String): DataFrame = {
68 | val testInts = Seq(42, 66, 77, 321, 789) // scalastyle:ignore
69 | val rows = testInts.map(i => Row.fromSeq(Seq(i)))
70 | val rdd = spark.sparkContext.parallelize(rows, 2)
71 |
72 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema)
73 |
74 | spark.createDataFrame(rdd, sparkSchema)
75 | }
76 |
77 | it should "convert to avro with old schema and back with evolved schema (providing the schema)" in {
78 |
79 | val allData = createTestData(recordByteSchema)
80 | val dataFrame: DataFrame = allData.select(struct(allData.col(allData.columns.head)) as "integers")
81 |
82 | val toCAConfig = AbrisConfig
83 | .toConfluentAvro
84 | .provideAndRegisterSchema(recordByteSchema)
85 | .usingTopicRecordNameStrategy("test_topic")
86 | .usingSchemaRegistry(dummyUrl)
87 |
88 | val avroBytes = dataFrame
89 | .select(to_avro(col("integers"), toCAConfig) as "avroBytes")
90 |
91 | avroBytes.collect() // force evaluation
92 |
93 | val fromCAConfig = AbrisConfig
94 | .fromConfluentAvro
95 | .provideReaderSchema(recordEvolvedByteSchema)
96 | .usingSchemaRegistry(dummyUrl)
97 |
98 | val result = avroBytes
99 | .select(from_avro(col("avroBytes"), fromCAConfig)
100 | as "integersWithDefault")
101 |
102 | val expectedStruct = struct(allData.col(allData.columns.head), lit("green"))
103 | val expectedResult: DataFrame = allData.select(expectedStruct as "integersWithDefault")
104 |
105 | shouldEqualByData(expectedResult, result)
106 | }
107 |
108 | it should "convert to avro with old schema and back with evolved schema (all from schema registry)" in {
109 |
110 | val allData = createTestData(recordByteSchema)
111 | val dataFrame: DataFrame = allData.select(struct(allData.col(allData.columns.head)) as "integers")
112 |
113 | val toCAConfig = AbrisConfig
114 | .toConfluentAvro
115 | .provideAndRegisterSchema(recordByteSchema)
116 | .usingTopicRecordNameStrategy("test_topic")
117 | .usingSchemaRegistry(dummyUrl)
118 |
119 | val avroBytes = dataFrame.select(to_avro(col("integers"), toCAConfig) as "avroBytes")
120 |
121 |     // To avoid a race condition between schema registration and reading, the data are collected from Spark to Scala
122 | val avroRows = avroBytes.collect()
123 |
124 | val schemaManager = SchemaManagerFactory.create(schemaRegistryConfig)
125 | val subject = SchemaSubject.usingTopicRecordNameStrategy(
126 | "test_topic",
127 | "record_name",
128 | "all-types.test"
129 | )
130 |
131 | schemaManager.register(subject, recordEvolvedByteSchema)
132 |
133 |     // Now that the latest version of the schema is registered, convert the data back to a Spark DataFrame
134 | val avroDF = spark.sparkContext.parallelize(avroRows.toIndexedSeq, 2)
135 | val outputAvro = spark.createDataFrame(avroDF, avroBytes.schema)
136 |
137 | val fromCAConfig = AbrisConfig
138 | .fromConfluentAvro
139 | .downloadReaderSchemaByLatestVersion
140 | .andTopicRecordNameStrategy("test_topic", "record_name", "all-types.test")
141 | .usingSchemaRegistry(dummyUrl)
142 |
143 | val result = outputAvro.select(from_avro(col("avroBytes"), fromCAConfig) as "integersWithDefault")
144 |
145 | val expectedStruct = struct(allData.col(allData.columns.head), lit("green"))
146 | val expectedResult: DataFrame = allData.select(expectedStruct as "integersWithDefault")
147 |
148 | shouldEqualByData(expectedResult, result)
149 | }
150 |
151 | it should "convert to simple avro with old schema and back with evolved reader schema (providing the schema)" in {
152 |
153 | val allData = createTestData(recordByteSchema)
154 | val dataFrame: DataFrame = allData.select(struct(allData.col(allData.columns.head)) as "integers")
155 |
156 | // Serialize record with a writer schema
157 | val toCAConfig = AbrisConfig
158 | .toSimpleAvro
159 | .provideSchema(recordByteSchema)
160 |
161 | val avroBytes = dataFrame
162 | .select(to_avro(col("integers"), toCAConfig) as "avroBytes")
163 |
164 | avroBytes.collect() // force evaluation
165 |
166 | // Deserialize record specifying a reader and a writer schema
167 | // Avro will decode using the writer schema and then match with the
168 | // reader schema. Thus e.g. new fields with a default value will also show up.
169 | val fromCAConfig = AbrisConfig
170 | .fromSimpleAvro
171 | .provideSchema(recordEvolvedByteSchema)
172 | .withWriterSchema(recordByteSchema)
173 |
174 | val result = avroBytes
175 | .select(from_avro(col("avroBytes"), fromCAConfig)
176 | as "integersWithDefault")
177 |
178 | val expectedStruct = struct(allData.col(allData.columns.head), lit("green"))
179 | val expectedResult: DataFrame = allData.select(expectedStruct as "integersWithDefault")
180 |
181 | shouldEqualByData(expectedResult, result)
182 | }
183 | }
184 |
--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
1 | <!-- Scalastyle standard configuration: the XML markup of this file was lost in extraction; -->
2 | <!-- only parameter values survive. Recoverable limits include: file length 300, -->
3 | <!-- line length 120 (tab size 2), class/object name pattern [A-Z][A-Za-z0-9]*, -->
4 | <!-- method/field name pattern ^[a-z][A-Za-z0-9]*$, illegal imports sun._ and java.awt._, -->
5 | <!-- max method parameters 8, allowed magic numbers -1,0,1,2,3, a regex ban on \bprint(|ln|f)\(, -->
6 | <!-- method length limit 50, apparently cyclomatic complexity 10 and at most 30 methods/types, -->
7 | <!-- and a duplicate-string-literal check (2 occurrences allowed, "" ignored). -->
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/sql/AvroDataToCatalystSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2021 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.avro.sql
18 |
19 | import all_types.test.{Fixed, NativeComplete}
20 | import org.apache.spark.SparkException
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
23 | import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession}
24 | import org.apache.spark.sql.functions.col
25 | import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
26 | import org.scalatest.BeforeAndAfterEach
27 | import org.scalatest.flatspec.AnyFlatSpec
28 | import org.scalatest.matchers.should.Matchers
29 | import za.co.absa.abris.avro.errors.{FailFastExceptionHandler, SpecificRecordExceptionHandler}
30 | import za.co.absa.abris.avro.format.SparkAvroConversions
31 | import za.co.absa.abris.avro.functions._
32 | import za.co.absa.abris.avro.utils.AvroSchemaEncoder
33 | import za.co.absa.abris.config.{AbrisConfig, FromAvroConfig}
34 | import za.co.absa.abris.examples.data.generation.TestSchemas
35 |
36 | import java.util.Collections
37 | import java.nio.ByteBuffer
38 | import java.util
39 | import scala.collection.JavaConverters._
40 |
41 | class AvroDataToCatalystSpec extends AnyFlatSpec with Matchers with BeforeAndAfterEach {
42 |
43 | private val spark = SparkSession
44 | .builder()
45 | .appName("unitTest")
46 | .master("local[2]")
47 | .config("spark.driver.bindAddress", "localhost")
48 | .config("spark.ui.enabled", "false")
49 | .getOrCreate()
50 |
51 | import spark.implicits._
52 |
53 | private val avroSchemaEncoder = new AvroSchemaEncoder
54 | implicit private val encoder: Encoder[Row] = avroSchemaEncoder.getEncoder
55 |
56 | it should "not print schema registry configs in the spark plan" in {
57 | val sensitiveData = "username:password"
58 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA
59 | val dummyUrl = "dummyUrl"
60 |
61 | val fromAvroConfig = FromAvroConfig()
62 | .withReaderSchema(schemaString)
63 | .withSchemaRegistryConfig(Map(
64 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl,
65 | "basic.auth.user.info" -> sensitiveData
66 | ))
67 |
68 | val column = from_avro(col("avroBytes"), fromAvroConfig)
69 | column.expr.toString() should not include sensitiveData
70 | }
71 |
72 | it should "use the default schema converter by default" in {
73 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA
74 | val dummyUrl = "dummyUrl"
75 | val expectedDataType = StructType(Seq(
76 | StructField("int", IntegerType, nullable = false),
77 | StructField("long", LongType, nullable = false)
78 | ))
79 |
80 | val fromAvroConfig = FromAvroConfig()
81 | .withReaderSchema(schemaString)
82 | .withSchemaRegistryConfig(Map(
83 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl
84 | ))
85 |
86 | val column = from_avro(col("avroBytes"), fromAvroConfig)
87 | column.expr.dataType shouldBe expectedDataType
88 | }
89 |
90 | it should "use a custom schema converter identified by the short name" in {
91 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA
92 | val dummyUrl = "dummyUrl"
93 |
94 | val fromAvroConfig = FromAvroConfig()
95 | .withReaderSchema(schemaString)
96 | .withSchemaRegistryConfig(Map(
97 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl
98 | ))
99 | .withSchemaConverter(DummySchemaConverter.name)
100 |
101 | val column = from_avro(col("avroBytes"), fromAvroConfig)
102 | column.expr.dataType shouldBe DummySchemaConverter.dataType
103 | }
104 |
105 | it should "use a custom schema converter identified by the fully qualified name" in {
106 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA
107 | val dummyUrl = "dummyUrl"
108 |
109 | val fromAvroConfig = FromAvroConfig()
110 | .withReaderSchema(schemaString)
111 | .withSchemaRegistryConfig(Map(
112 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl
113 | ))
114 | .withSchemaConverter("za.co.absa.abris.avro.sql.DummySchemaConverter")
115 |
116 | val column = from_avro(col("avroBytes"), fromAvroConfig)
117 | column.expr.dataType shouldBe DummySchemaConverter.dataType
118 | }
119 |
120 | it should "throw an error if the specified custom schema converter does not exist" in {
121 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA
122 | val dummyUrl = "dummyUrl"
123 |
124 | val fromAvroConfig = FromAvroConfig()
125 | .withReaderSchema(schemaString)
126 | .withSchemaRegistryConfig(Map(
127 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl
128 | ))
129 | .withSchemaConverter("nonexistent")
130 |
131 | val ex = intercept[ClassNotFoundException](from_avro(col("avroBytes"), fromAvroConfig).expr.dataType)
132 | ex.getMessage should include ("nonexistent")
133 | }
134 |
135 | it should "be serializable" in {
136 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA
137 | val config = FromAvroConfig().withReaderSchema(schemaString)
138 | val avroDataToCatalyst = from_avro(col("col"), config).expr
139 |
140 | val javaSerializer = new JavaSerializer(new SparkConf())
141 | javaSerializer.newInstance().serialize(avroDataToCatalyst)
142 |
143 | val kryoSerializer = new KryoSerializer(new SparkConf())
144 | kryoSerializer.newInstance().serialize(avroDataToCatalyst)
145 |
146 | // test successful if no exception is thrown
147 | }
148 |
149 | it should "throw a Spark exception when unable to deserialize " in {
150 |
151 | val providedData = Seq(Row("$£%^".getBytes()))
152 | val providedDataFrame: DataFrame = spark.sparkContext.parallelize(providedData, 2).toDF() as "bytes"
153 |
154 | val dummyUrl = "dummyUrl"
155 | val fromConfig = AbrisConfig
156 | .fromConfluentAvro
157 | .provideReaderSchema(TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA)
158 | .usingSchemaRegistry(dummyUrl)
159 | .withExceptionHandler(new FailFastExceptionHandler)
160 |
161 | the[SparkException] thrownBy providedDataFrame.select(from_avro(col("bytes"), fromConfig)).collect()
162 | }
163 |
164 | it should "replace undeserializable record with default SpecificRecord" in {
165 | // provided
166 | val providedData = Seq(
167 | Row("$£%^".getBytes())
168 | )
169 | val providedDataFrame: DataFrame = spark.sparkContext.parallelize(providedData, 2).toDF() as "bytes"
170 |
171 | val providedDefaultRecord = NativeComplete.newBuilder()
172 | .setBytes(ByteBuffer.wrap(Array[Byte](1,2,3)))
173 | .setString("default-record")
174 | .setInt$(1)
175 | .setLong$(2L)
176 | .setDouble$(3.0)
177 | .setFloat$(4.0F)
178 | .setBoolean$(true)
179 | .setArray(Collections.singletonList("arrayItem1"))
180 | .setMap(Collections.singletonMap[CharSequence, util.List[java.lang.Long]](
181 | "key1",
182 | Collections.singletonList[java.lang.Long](1L)))
183 | .setFixed(new Fixed(Array.fill[Byte](40){1}))
184 | .build()
185 |
186 | // expected
187 | val expectedData = Seq(
188 | Row(Array[Byte](1,2,3),
189 | "default-record",
190 | 1,
191 | 2L,
192 | 3.0,
193 | 4F,
194 | true,
195 | Collections.singletonList("arrayItem1"),
196 | Collections.singletonMap[CharSequence, util.List[java.lang.Long]](
197 | "key1",
198 | Collections.singletonList[java.lang.Long](1L)),
199 | Array.fill[Byte](40){1}
200 | )).asJava
201 |
202 | val expectedDataFrame: DataFrame = spark.createDataFrame(expectedData, SparkAvroConversions.toSqlType(NativeComplete.SCHEMA$))
203 |
204 | // actual
205 | val dummyUrl = "dummyUrl"
206 | val fromConfig = AbrisConfig
207 | .fromConfluentAvro
208 | .provideReaderSchema(NativeComplete.SCHEMA$.toString())
209 | .usingSchemaRegistry(dummyUrl)
210 | .withExceptionHandler(new SpecificRecordExceptionHandler(providedDefaultRecord))
211 |
212 | val actualDataFrame = providedDataFrame
213 | .select(from_avro(col("bytes"), fromConfig).as("actual"))
214 | .select(col("actual.*"))
215 |
216 | shouldEqualByData(expectedDataFrame, actualDataFrame)
217 | }
218 | }
219 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # top-most EditorConfig file
2 | root = true
3 |
4 | [*]
5 | charset = utf-8
6 | end_of_line = lf
7 | trim_trailing_whitespace = true
8 |
9 | [*.xml]
10 | indent_size = 4
11 | indent_style = space
12 | insert_final_newline = true
13 |
14 | [*.{java,scala,js,json,css}]
15 | indent_size = 2
16 | indent_style = space
17 | insert_final_newline = true
18 | max_line_length = 120
19 |
20 | [*.md]
21 | trim_trailing_whitespace = false
22 |
23 | [*.scala]
24 | ij_continuation_indent_size = 2
25 | ij_scala_align_composite_pattern = true
26 | ij_scala_align_extends_with = 0
27 | ij_scala_align_group_field_declarations = false
28 | ij_scala_align_if_else = false
29 | ij_scala_align_in_columns_case_branch = false
30 | ij_scala_align_multiline_binary_operation = false
31 | ij_scala_align_multiline_chained_methods = false
32 | ij_scala_align_multiline_for = true
33 | ij_scala_align_multiline_parameters = false
34 | ij_scala_align_multiline_parameters_in_calls = false
35 | ij_scala_align_multiline_parenthesized_expression = false
36 | ij_scala_align_parameter_types_in_multiline_declarations = 0
37 | ij_scala_align_tuple_elements = false
38 | ij_scala_alternate_continuation_indent_for_params = 4
39 | ij_scala_binary_operation_wrap = off
40 | ij_scala_blank_lines_after_anonymous_class_header = 0
41 | ij_scala_blank_lines_after_class_header = 0
42 | ij_scala_blank_lines_after_imports = 1
43 | ij_scala_blank_lines_after_package = 1
44 | ij_scala_blank_lines_around_class = 1
45 | ij_scala_blank_lines_around_class_in_inner_scopes = 0
46 | ij_scala_blank_lines_around_field = 0
47 | ij_scala_blank_lines_around_field_in_inner_scopes = 0
48 | ij_scala_blank_lines_around_field_in_interface = 0
49 | ij_scala_blank_lines_around_method = 1
50 | ij_scala_blank_lines_around_method_in_inner_scopes = 1
51 | ij_scala_blank_lines_around_method_in_interface = 0
52 | ij_scala_blank_lines_before_class_end = 0
53 | ij_scala_blank_lines_before_imports = 1
54 | ij_scala_blank_lines_before_method_body = 0
55 | ij_scala_blank_lines_before_package = 0
56 | ij_scala_block_brace_style = end_of_line
57 | ij_scala_block_comment_at_first_column = true
58 | ij_scala_call_parameters_new_line_after_lparen = 0
59 | ij_scala_call_parameters_right_paren_on_new_line = false
60 | ij_scala_call_parameters_wrap = off
61 | ij_scala_case_clause_brace_force = never
62 | ij_scala_catch_on_new_line = false
63 | ij_scala_class_annotation_wrap = split_into_lines
64 | ij_scala_class_brace_style = end_of_line
65 | ij_scala_closure_brace_force = never
66 | ij_scala_do_not_align_block_expr_params = true
67 | ij_scala_do_not_indent_case_clause_body = false
68 | ij_scala_do_not_indent_tuples_close_brace = true
69 | ij_scala_do_while_brace_force = never
70 | ij_scala_else_on_new_line = false
71 | ij_scala_enable_scaladoc_formatting = true
72 | ij_scala_enforce_functional_syntax_for_unit = true
73 | ij_scala_extends_keyword_wrap = off
74 | ij_scala_extends_list_wrap = off
75 | ij_scala_field_annotation_wrap = split_into_lines
76 | ij_scala_finally_brace_force = never
77 | ij_scala_finally_on_new_line = false
78 | ij_scala_for_brace_force = never
79 | ij_scala_for_statement_wrap = off
80 | ij_scala_formatter = 0
81 | ij_scala_if_brace_force = never
82 | ij_scala_implicit_value_class_suffix = Ops
83 | ij_scala_indent_braced_function_args = true
84 | ij_scala_indent_case_from_switch = true
85 | ij_scala_indent_first_parameter = true
86 | ij_scala_indent_first_parameter_clause = false
87 | ij_scala_indent_type_arguments = true
88 | ij_scala_indent_type_parameters = true
89 | ij_scala_indent_yield_after_one_line_enumerators = true
90 | ij_scala_keep_blank_lines_before_right_brace = 2
91 | ij_scala_keep_blank_lines_in_code = 2
92 | ij_scala_keep_blank_lines_in_declarations = 2
93 | ij_scala_keep_comments_on_same_line = true
94 | ij_scala_keep_first_column_comment = false
95 | ij_scala_keep_indents_on_empty_lines = false
96 | ij_scala_keep_line_breaks = true
97 | ij_scala_keep_one_line_lambdas_in_arg_list = false
98 | ij_scala_keep_simple_blocks_in_one_line = false
99 | ij_scala_keep_simple_methods_in_one_line = false
100 | ij_scala_keep_xml_formatting = false
101 | ij_scala_line_comment_add_space = false
102 | ij_scala_line_comment_at_first_column = true
103 | ij_scala_method_annotation_wrap = split_into_lines
104 | ij_scala_method_brace_force = never
105 | ij_scala_method_brace_style = end_of_line
106 | ij_scala_method_call_chain_wrap = off
107 | ij_scala_method_parameters_new_line_after_left_paren = false
108 | ij_scala_method_parameters_right_paren_on_new_line = false
109 | ij_scala_method_parameters_wrap = off
110 | ij_scala_modifier_list_wrap = false
111 | ij_scala_multiline_string_align_dangling_closing_quotes = false
112 | ij_scala_multiline_string_closing_quotes_on_new_line = false
113 | ij_scala_multiline_string_insert_margin_on_enter = true
114 | ij_scala_multiline_string_margin_char = |
115 | ij_scala_multiline_string_margin_indent = 2
116 | ij_scala_multiline_string_opening_quotes_on_new_line = true
117 | ij_scala_multiline_string_process_margin_on_copy_paste = true
118 | ij_scala_newline_after_annotations = false
119 | ij_scala_not_continuation_indent_for_params = false
120 | ij_scala_parameter_annotation_wrap = off
121 | ij_scala_parentheses_expression_new_line_after_left_paren = false
122 | ij_scala_parentheses_expression_right_paren_on_new_line = false
123 | ij_scala_place_closure_parameters_on_new_line = false
124 | ij_scala_place_self_type_on_new_line = true
125 | ij_scala_prefer_parameters_wrap = false
126 | ij_scala_preserve_space_after_method_declaration_name = false
127 | ij_scala_reformat_on_compile = false
128 | ij_scala_replace_case_arrow_with_unicode_char = false
129 | ij_scala_replace_for_generator_arrow_with_unicode_char = false
130 | ij_scala_replace_lambda_with_greek_letter = false
131 | ij_scala_replace_map_arrow_with_unicode_char = false
132 | ij_scala_scalafmt_fallback_to_default_settings = false
133 | ij_scala_scalafmt_reformat_on_files_save = false
134 | ij_scala_scalafmt_show_invalid_code_warnings = true
135 | ij_scala_scalafmt_use_intellij_formatter_for_range_format = true
136 | ij_scala_sd_align_exception_comments = true
137 | ij_scala_sd_align_list_item_content = true
138 | ij_scala_sd_align_other_tags_comments = true
139 | ij_scala_sd_align_parameters_comments = true
140 | ij_scala_sd_align_return_comments = true
141 | ij_scala_sd_blank_line_after_parameters_comments = false
142 | ij_scala_sd_blank_line_after_return_comments = false
143 | ij_scala_sd_blank_line_before_parameters = false
144 | ij_scala_sd_blank_line_before_tags = true
145 | ij_scala_sd_blank_line_between_parameters = false
146 | ij_scala_sd_keep_blank_lines_between_tags = false
147 | ij_scala_sd_preserve_spaces_in_tags = false
148 | ij_scala_space_after_comma = true
149 | ij_scala_space_after_for_semicolon = true
150 | ij_scala_space_after_modifiers_constructor = false
151 | ij_scala_space_after_type_colon = true
152 | ij_scala_space_before_brace_method_call = true
153 | ij_scala_space_before_class_left_brace = true
154 | ij_scala_space_before_for_parentheses = true
155 | ij_scala_space_before_if_parentheses = true
156 | ij_scala_space_before_infix_like_method_parentheses = false
157 | ij_scala_space_before_infix_method_call_parentheses = false
158 | ij_scala_space_before_infix_operator_like_method_call_parentheses = true
159 | ij_scala_space_before_method_call_parentheses = false
160 | ij_scala_space_before_method_left_brace = true
161 | ij_scala_space_before_method_parentheses = false
162 | ij_scala_space_before_type_colon = false
163 | ij_scala_space_before_type_parameter_in_def_list = false
164 | ij_scala_space_before_type_parameter_leading_context_bound_colon = false
165 | ij_scala_space_before_type_parameter_leading_context_bound_colon_hk = true
166 | ij_scala_space_before_type_parameter_list = false
167 | ij_scala_space_before_type_parameter_rest_context_bound_colons = true
168 | ij_scala_space_before_while_parentheses = true
169 | ij_scala_space_inside_closure_braces = true
170 | ij_scala_space_inside_self_type_braces = true
171 | ij_scala_space_within_empty_method_call_parentheses = false
172 | ij_scala_spaces_around_at_in_patterns = false
173 | ij_scala_spaces_in_imports = false
174 | ij_scala_spaces_in_one_line_blocks = false
175 | ij_scala_spaces_within_brackets = false
176 | ij_scala_spaces_within_for_parentheses = false
177 | ij_scala_spaces_within_if_parentheses = false
178 | ij_scala_spaces_within_method_call_parentheses = false
179 | ij_scala_spaces_within_method_parentheses = false
180 | ij_scala_spaces_within_parentheses = false
181 | ij_scala_spaces_within_while_parentheses = false
182 | ij_scala_special_else_if_treatment = true
183 | ij_scala_trailing_comma_arg_list_enabled = true
184 | ij_scala_trailing_comma_import_selector_enabled = false
185 | ij_scala_trailing_comma_mode = trailing_comma_keep
186 | ij_scala_trailing_comma_params_enabled = true
187 | ij_scala_trailing_comma_pattern_arg_list_enabled = false
188 | ij_scala_trailing_comma_tuple_enabled = false
189 | ij_scala_trailing_comma_tuple_type_enabled = false
190 | ij_scala_trailing_comma_type_params_enabled = false
191 | ij_scala_try_brace_force = never
192 | ij_scala_type_annotation_exclude_constant = true
193 | ij_scala_type_annotation_exclude_in_dialect_sources = true
194 | ij_scala_type_annotation_exclude_in_test_sources = false
195 | ij_scala_type_annotation_exclude_member_of_anonymous_class = false
196 | ij_scala_type_annotation_exclude_member_of_private_class = false
197 | ij_scala_type_annotation_exclude_when_type_is_stable = true
198 | ij_scala_type_annotation_function_parameter = false
199 | ij_scala_type_annotation_implicit_modifier = true
200 | ij_scala_type_annotation_local_definition = false
201 | ij_scala_type_annotation_private_member = false
202 | ij_scala_type_annotation_protected_member = true
203 | ij_scala_type_annotation_public_member = true
204 | ij_scala_type_annotation_structural_type = true
205 | ij_scala_type_annotation_underscore_parameter = false
206 | ij_scala_type_annotation_unit_type = true
207 | ij_scala_use_alternate_continuation_indent_for_params = false
208 | ij_scala_use_scala3_indentation_based_syntax = true
209 | ij_scala_use_scaladoc2_formatting = false
210 | ij_scala_variable_annotation_wrap = off
211 | ij_scala_while_brace_force = never
212 | ij_scala_while_on_new_line = false
213 | ij_scala_wrap_before_with_keyword = false
214 | ij_scala_wrap_first_method_in_call_chain = false
215 | ij_scala_wrap_long_lines = false
216 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # ABRiS - Avro Bridge for Spark
4 |
5 | - Pain-free Spark/Avro integration.
6 |
7 | - Seamlessly integrate with Confluent platform, including Schema Registry with all available [naming strategies](https://docs.confluent.io/current/schema-registry/serializer-formatter.html#how-the-naming-strategies-work) and schema evolution.
8 |
9 | - Seamlessly convert your Avro records from anywhere (e.g. Kafka, Parquet, HDFS) into Spark Rows.
10 |
11 | - Convert your Dataframes into Avro records without even specifying a schema.
12 |
13 | - Go back and forth between Avro and Spark representations (since Spark 2.4).
14 |
15 |
16 | ### Coordinates for Maven POM dependency
17 |
18 | | Scala | Abris |
19 | |:------:|:-------:|
20 | | 2.11 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.11) |
21 | | 2.12 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.12) |
22 | | 2.13 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.13/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.13) |
23 |
24 | ## Supported versions
25 |
26 | | Abris | Spark | Scala |
27 | |:-----: |:-------------:|:-----: |
28 | | 6.2.0 - 6.x.x | 3.2.1 - 3.5.x | 2.12 / 2.13 |
29 | | 6.0.0 - 6.1.1 | 3.2.0 | 2.12 / 2.13 |
30 | | 5.0.0 - 5.x.x | 3.0.x / 3.1.x | 2.12 |
31 | | 5.0.0 - 5.x.x | 2.4.x | 2.11 / 2.12 |
32 |
33 | From version 6.0.0, ABRiS only supports Spark 3.2 and later.
34 |
35 | ABRiS 5.0.x is still supported for older versions of Spark (see [branch-5](https://github.com/AbsaOSS/ABRiS/tree/branch-5)).
36 |
37 | ## Older Versions
38 | This is documentation for Abris **version 6**. Documentation for older versions is located in corresponding branches:
39 | [branch-5](https://github.com/AbsaOSS/ABRiS/tree/branch-5),
40 | [branch-4](https://github.com/AbsaOSS/ABRiS/tree/branch-4),
41 | [branch-3.2](https://github.com/AbsaOSS/ABRiS/tree/branch-3.2).
42 |
43 | ## Confluent Schema Registry Version
44 | Abris by default uses Confluent client version 6.2.0.
45 |
46 | ## Installation
47 | Abris needs `spark-avro` to run, so make sure you include the `spark-avro` dependency when using Abris.
48 | The versions of `spark-avro` and Spark should be identical.
49 |
50 | Example: submitting a Spark job:
51 | ```
52 | ./bin/spark-submit \
53 | --packages org.apache.spark:spark-avro_2.12:3.5.0,za.co.absa:abris_2.12:6.4.0 \
54 | ...rest of submit params...
55 | ```
56 |
57 | Example: using Abris in a Maven project:
58 | ```xml
59 | <dependency>
60 |     <groupId>org.apache.spark</groupId>
61 |     <artifactId>spark-core_2.12</artifactId>
62 |     <version>3.5.0</version>
63 |     <scope>provided</scope>
64 | </dependency>
65 | <dependency>
66 |     <groupId>org.apache.spark</groupId>
67 |     <artifactId>spark-avro_2.12</artifactId>
68 |     <version>3.5.0</version>
69 | </dependency>
70 | <dependency>
71 |     <groupId>za.co.absa</groupId>
72 |     <artifactId>abris_2.12</artifactId>
73 |     <version>6.4.0</version>
74 | </dependency>
75 | ```
76 |
77 | Example: using Abris in an SBT project:
78 | ```Scala
79 | libraryDependencies ++= Seq(
80 | "org.apache.spark" %% "spark-core" % "3.5.0" % Provided,
81 | "org.apache.spark" %% "spark-avro" % "3.5.0",
82 | "za.co.absa" %% "abris" % "6.4.0"
83 | )
84 | ```
85 |
86 |
87 | ## Usage
88 |
89 | In its most basic form, the ABRiS API is almost identical to Spark's built-in Avro support, but it provides additional functionality:
90 | mainly Schema Registry support and seamless integration with the Confluent Avro data format.
91 |
92 | The API consists of two Spark SQL expressions (`to_avro` and `from_avro`) and a fluent configurator (`AbrisConfig`).
93 |
94 | Using the configurator you can choose from four basic config types:
95 | * `toSimpleAvro`, `toConfluentAvro`, `fromSimpleAvro` and `fromConfluentAvro`
96 |
97 | and then configure what you want to do, mainly how to obtain the Avro schema.
98 |
99 | Example of usage:
100 | ```Scala
101 | val abrisConfig = AbrisConfig
102 | .fromConfluentAvro
103 | .downloadReaderSchemaByLatestVersion
104 | .andTopicNameStrategy("topic123")
105 | .usingSchemaRegistry("http://localhost:8081")
106 |
107 | import za.co.absa.abris.avro.functions.from_avro
108 | val deserialized = dataFrame.select(from_avro(col("value"), abrisConfig) as 'data)
109 | ```
110 |
111 | Detailed instructions for many use cases are in separate documents:
112 |
113 | - [How to use Abris with vanilla avro (with examples)](documentation/vanilla-avro-documentation.md)
114 | - [How to use Abris with Confluent avro (with examples)](documentation/confluent-avro-documentation.md)
115 | - [How to use Abris in Python (with examples)](documentation/python-documentation.md)
116 |
117 | Full runnable examples can be found in the `za.co.absa.abris.examples` package. You can also take a look at the unit tests in the `za.co.absa.abris.avro.sql` package.
118 |
119 | **IMPORTANT**: Spark dependencies have `provided` scope in the `pom.xml`, so when running the examples, please make sure that you either instruct your IDE to include dependencies with
120 | `provided` scope, or change the scope directly.
121 |
122 | ### Confluent Avro format
123 | The format of Avro binary data is defined in [Avro specification](http://avro.apache.org/docs/current/spec.html).
124 | The Confluent format extends it by prepending the schema id to the actual record.
125 | The Confluent expressions in this library expect this format: they prepend the id after the Avro data is generated, and strip it before the data is parsed.
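
For reference, the Confluent wire format lays out each message as follows:

```
byte 0      magic byte (always 0)
bytes 1-4   schema id (4-byte big-endian integer)
bytes 5...  Avro binary payload
```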
126 |
127 | You can find more about Confluent and Schema Registry in [Confluent documentation](https://docs.confluent.io/current/schema-registry/index.html).
128 |
129 |
130 | ### Schema Registry security and other additional settings
131 |
132 | The only mandatory Schema Registry client setting is the URL,
133 | but if you need to provide more, the configurator lets you pass in a whole map.
134 |
135 | For example, you may want to provide `basic.auth.user.info` and `basic.auth.credentials.source` required for user authentication.
136 | You can do it this way:
137 |
138 | ```scala
139 | val registryConfig = Map(
140 | AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081",
141 | "basic.auth.credentials.source" -> "USER_INFO",
142 | "basic.auth.user.info" -> "srkey:srvalue"
143 | )
144 |
145 | val abrisConfig = AbrisConfig
146 | .fromConfluentAvro
147 | .downloadReaderSchemaByLatestVersion
148 | .andTopicNameStrategy("topic123")
149 | .usingSchemaRegistry(registryConfig) // use the map instead of just url
150 | ```
151 |
152 | ## Other Features
153 |
154 | ### Generating Avro schema from Spark data frame column
155 | There is a helper method that allows you to generate an Avro schema automatically from a Spark dataframe column.
156 | Assuming you have a dataframe containing a column "input", you can generate the schema for the data in that column like this:
157 | ```scala
158 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame, "input")
159 | ```
160 |
161 | ### Using schema manager to directly download or register schema
162 | You can use the SchemaManager directly to perform operations against the schema registry.
163 | The configuration is identical to the Schema Registry Client's.
164 | The SchemaManager is just a wrapper around the client, providing helpful methods and abstractions.
165 |
166 | ```scala
167 | val schemaRegistryClientConfig = Map( ...configuration... )
168 | val schemaManager = SchemaManagerFactory.create(schemaRegistryClientConfig)
169 |
170 | // Downloading schema:
171 | val schema = schemaManager.getSchemaById(42)
172 |
173 | // Registering schema:
174 | val schemaString = "{...avro schema json...}"
175 | val subject = SchemaSubject.usingTopicNameStrategy("fooTopic")
176 | val schemaId = schemaManager.register(subject, schemaString)
177 |
178 | // and more, check SchemaManager's methods
179 | ```
180 |
181 | ### De-serialisation Error Handling
182 | There are three ways ABRiS handles de-serialisation errors:
183 |
184 | #### FailFast (Default)
185 | If no de-serialisation handler is provided, a failure will result in a Spark exception being thrown
186 | and the error being output. This is the default behaviour.
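
Since this is the default, no extra configuration is needed, but the handler can also be set explicitly. A minimal sketch, reusing the `registryConfig` map from above:

```scala
val abrisConfig = AbrisConfig
  .fromConfluentAvro
  .downloadReaderSchemaByLatestVersion
  .andTopicNameStrategy("topic123")
  .usingSchemaRegistry(registryConfig)
  .withExceptionHandler(new FailFastExceptionHandler)
```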
187 |
188 | #### SpecificRecordHandler
189 | The second option requires providing a default record that will be output in the event of a failure.
190 | The default record acts as a marker that can be filtered out downstream of ABRiS, meaning the Spark job will not stop.
191 | Beware, however, that a null or empty record will also result in an error, so choose a record with distinctive content.
192 |
193 | This can be provided as such:
194 | ```scala
195 | val abrisConfig = AbrisConfig
196 | .fromConfluentAvro
197 | .downloadReaderSchemaByLatestVersion
198 | .andTopicNameStrategy("topic123")
199 | .usingSchemaRegistry(registryConfig)
200 | .withExceptionHandler(new SpecificRecordExceptionHandler(providedDefaultRecord))
202 | ```
203 |
204 | This is available only for Confluent-based configurations, not for standard Avro.
205 |
206 | #### PermissiveRecordExceptionHandler
207 | The third option is to use the `PermissiveRecordExceptionHandler`. In case of a deserialization failure, this handler replaces the problematic record with a fully null record, instead of throwing an exception. This allows the data processing pipeline to continue without interruption.
208 |
209 | The main use case for this option is when you want to prioritize continuity of processing over individual record integrity. It's especially useful when dealing with large datasets where occasional malformed records could be tolerated.
210 |
211 | Here's how to use it:
212 |
213 | ```scala
214 | val abrisConfig = AbrisConfig
215 | .fromConfluentAvro
216 | .downloadReaderSchemaByLatestVersion
217 | .andTopicNameStrategy("topic123")
218 | .usingSchemaRegistry(registryConfig)
219 | .withExceptionHandler(new PermissiveRecordExceptionHandler())
221 | ```
222 |
223 | With this configuration, in the event of a deserialization error, the `PermissiveRecordExceptionHandler` will log a warning, substitute the malformed record with a fully null one, and allow the data processing pipeline to continue.
224 |
225 |
226 | ### Data Conversions
227 | This library also provides convenient methods to convert between Avro and Spark schemas.
228 |
229 | If you have an Avro schema which you want to convert into a Spark SQL one - to generate your Dataframes, for instance - you can do as follows:
230 |
231 | ```scala
232 | val avroSchema: Schema = AvroSchemaUtils.load("path_to_avro_schema")
233 | val sqlSchema: StructType = SparkAvroConversions.toSqlType(avroSchema)
234 | ```
235 |
236 | You can also do the inverse operation by running:
237 |
238 | ```scala
239 | import org.apache.spark.sql.types.{StringType, StructField, StructType}
240 | val sqlSchema = StructType(Seq(StructField("name", StringType, nullable = true)))
241 | val avroSchema = SparkAvroConversions.toAvroSchema(sqlSchema, "avro_schema_name", "avro_schema_namespace")
241 | ```
242 |
243 | #### Custom data conversions
244 | If you would like to use custom logic to convert from Avro to Spark, you can implement the `SchemaConverter` trait.
245 | The custom class is loaded in ABRiS using the service provider interface (SPI), so you need to register your class in your
246 | `META-INF/services` resource directory. You can then configure the custom class with its short name or the fully qualified name.
247 |
248 | **Example**
249 |
250 | Custom schema converter implementation
251 | ```scala
252 | package za.co.absa.abris.avro.sql
253 | import org.apache.avro.Schema
254 | import org.apache.spark.sql.types.DataType
255 |
256 | class CustomSchemaConverter extends SchemaConverter {
257 | override val shortName: String = "custom"
258 | override def toSqlType(avroSchema: Schema): DataType = ???
259 | }
260 | ```
261 |
262 | Provider configuration file `META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter`:
263 | ```
264 | za.co.absa.abris.avro.sql.CustomSchemaConverter
265 | ```
266 |
267 | Abris configuration
268 | ```scala
269 | val abrisConfig = AbrisConfig
270 | .fromConfluentAvro
271 | .downloadReaderSchemaByLatestVersion
272 | .andTopicNameStrategy("topic123")
273 | .usingSchemaRegistry(registryConfig)
274 | .withSchemaConverter("custom")
275 | ```
276 |
277 | ## Multiple schemas in one topic
278 | The naming strategies RecordName and TopicRecordName allow one topic to receive different payloads,
279 | i.e. payloads containing different schemas that do not have to be compatible,
280 | as explained [here](https://docs.confluent.io/current/schema-registry/docs/serializer-formatter.html#subject-name-strategy).
281 |
282 | When you read such data from Kafka, it will be stored as a binary column in a dataframe,
283 | but once you convert it to Spark types it cannot all live in one dataframe,
284 | because all rows in a dataframe must have the same schema.
285 |
286 | So if you have multiple incompatible types of Avro data in a dataframe, you must first sort them out into several dataframes,
287 | one for each schema. Then you can use Abris to convert the Avro data, as sketched below.
288 |
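A minimal sketch of one way to do the splitting, assuming a Kafka dataframe with a binary `value` column in Confluent format; the `withSchemaId` helper is hypothetical, not part of the ABRiS API:

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, conv, hex, substring}

// Hypothetical helper: the Confluent header stores the schema id as a
// 4-byte big-endian integer right after the magic byte, so it can be read
// without deserializing the Avro payload.
def withSchemaId(kafkaDf: DataFrame): DataFrame =
  kafkaDf.withColumn("schemaId",
    conv(hex(substring(col("value"), 2, 4)), 16, 10).cast("int"))

// One dataframe per schema id; each can then be converted with from_avro:
// val dfForSchema42 = withSchemaId(kafkaDf).filter(col("schemaId") === 42)
```
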
289 | ## How to measure code coverage
290 | ```shell
291 | mvn clean verify -Pcode-coverage,scala-2.12
292 | # or
293 | mvn clean verify -Pcode-coverage,scala-2.13
294 | ```
295 | Code coverage reports will be generated at:
296 | ```
297 | {local-path}/ABRiS/target/jacoco
298 | ```
299 |
300 | ---
301 |
302 | Copyright 2018 ABSA Group Limited
303 |
304 | Licensed under the Apache License, Version 2.0 (the "License");
305 | you may not use this file except in compliance with the License.
306 | You may obtain a copy of the License at
307 |
308 | http://www.apache.org/licenses/LICENSE-2.0
309 |
310 | Unless required by applicable law or agreed to in writing, software
311 | distributed under the License is distributed on an "AS IS" BASIS,
312 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
313 | See the License for the specific language governing permissions and
314 | limitations under the License.
315 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/abris/examples/data/generation/TestSchemas.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2018 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.abris.examples.data.generation
18 |
19 | import all_types.test.{NativeComplete, NativeSimpleOuter}
20 | import za.co.absa.commons.annotation.DeveloperApi
21 |
22 | /**
23 | * Provides several Avro schemas.
24 | *
25 | * Used for tests and examples.
26 | */
27 | @DeveloperApi
28 | object TestSchemas {
29 |
30 | case class ErrorMessage(errType: String, errCode: String, errMsg: String, errCol: String, rawValues: Seq[String],
31 | mappings: Seq[Mapping] = Seq())
32 |
33 | case class Mapping(mappingTableColumn: String, mappedDatasetColumn: String)
34 |
35 | val NATIVE_SIMPLE_OUTER_SCHEMA = NativeSimpleOuter.SCHEMA$.toString()
36 |
37 | val NATIVE_SIMPLE_NESTED_SCHEMA = """{
38 | "namespace": "all-types.test",
39 | "type":"record",
40 | "name":"nested",
41 | "fields":
42 | [
43 | {"name":"int", "type":"int"},
44 | {"name":"long","type":"long"}
45 | ]
46 | }"""
47 |
48 | val NATIVE_COMPLETE_SCHEMA = NativeComplete.SCHEMA$.toString()
49 |
50 | val NATIVE_COMPLETE_SCHEMA_WITHOUT_FIXED = """{
51 | "namespace": "all-types.test",
52 | "type": "record",
53 | "name": "native_complete",
54 | "fields":[
55 | {"name": "bytes", "type": "bytes" },
56 | { "name": "string", "type": ["string", "null"] },
57 | { "name": "int", "type": ["int", "null"] },
58 | { "name": "long", "type": ["long", "null"] },
59 | { "name": "double", "type": ["double", "null"] },
60 | { "name": "float", "type": ["float", "null"] },
61 | { "name": "boolean", "type": ["boolean","null"] },
62 | { "name": "array", "type": {"type": "array", "items": "string"} },
63 | {"name": "map", "type": { "type": "map", "values": {"type": "array", "items": "long"}}}
64 | ]
65 | }"""
66 |
67 | val NATIVE_SCHEMA_SPEC = """{
68 | "namespace": "all-types.test",
69 | "type": "record",
70 | "name": "native",
71 | "fields":[
72 | { "name": "string", "type": ["string", "null"] },
73 | { "name": "int", "type": ["int", "null"] },
74 | { "name": "long", "type": ["long", "null"] },
75 | { "name": "double", "type": ["double", "null"] },
76 | { "name": "float", "type": ["float", "null"] },
77 | { "name": "boolean", "type": ["boolean","null"] }
78 | ]
79 | }"""
80 |
81 | val ARRAY_SCHEMA_SPEC = """{
82 | "namespace": "all-types.test",
83 | "type": "record",
84 | "name": "array",
85 | "fields":[
86 | { "name": "array", "type": {"type": "array", "items": "string"} }
87 | ]
88 | }"""
89 |
90 | val MAP_SCHEMA_SPEC = """{
91 | "namespace": "all-types.test",
92 | "type": "record",
93 | "name": "map",
94 | "fields":[
95 | {"name": "map", "type": { "type": "map", "values": {"type": "array", "items": "long"}}}
96 | ]
97 | }"""
98 |
99 | val BYTES_SCHEMA_SPEC = """{
100 | "namespace": "all-types.test",
101 | "type": "record",
102 | "name": "bytes",
103 | "fields":[
104 | {"name": "bytes", "type": "bytes" }
105 | ]
106 | }"""
107 |
108 | val FIXED_SCHEMA_SPEC = """{
109 | "namespace": "all-types.test",
110 | "type": "record",
111 | "name": "fixed_name",
112 | "fields":[
113 | {"name": "fixed", "type": {"type": "fixed", "size": 13, "name": "fixed"}}
114 | ]
115 | }"""
116 |
117 | val DECIMAL_SCHEMA_SPEC = """{
118 | "namespace": "all-types.test",
119 | "type": "record",
120 | "name": "decimal",
121 | "fields":[
122 | {"name": "decimal", "type": {"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}}
123 | ]
124 | }"""
125 |
126 | val DATE_SCHEMA_SPEC = """{
127 | "namespace": "all-types.test",
128 | "type": "record",
129 | "name": "date",
130 | "fields":[
131 | {"name": "date", "type": {"type": "int", "logicalType": "date"}}
132 | ]
133 | }"""
134 |
135 | val MILLISECOND_SCHEMA_SPEC = """{
136 | "namespace": "all-types.test",
137 | "type": "record",
138 | "name": "millisecond",
139 | "fields":[
140 | {"name": "millisecond", "type": {"type": "int", "logicalType": "time-millis"}}
141 | ]
142 | }"""
143 |
144 | val MICROSECOND_SCHEMA_SPEC = """{
145 | "namespace": "all-types.test",
146 | "type": "record",
147 | "name": "microsecond",
148 | "fields":[
149 | {"name": "microsecond", "type": {"type": "long", "logicalType": "time-micros"}}
150 | ]
151 | }"""
152 |
153 | val TIMESTAMP_MILLIS_SCHEMA_SPEC = """{
154 | "namespace": "all-types.test",
155 | "type": "record",
156 | "name": "timestamp_millis",
157 | "fields":[
158 | {"name": "timestampMillis", "type": {"type": "long", "logicalType": "timestamp-millis"}}
159 | ]
160 | }"""
161 |
162 | val TIMESTAMP_MICROS_SCHEMA_SPEC = """{
163 | "namespace": "all-types.test",
164 | "type": "record",
165 | "name": "timestamp_micros",
166 | "fields":[
167 | {"name": "timestampMicros", "type": {"type": "long", "logicalType": "timestamp-micros"}}
168 | ]
169 | }"""
170 |
171 | val DURATION_MICROS_SCHEMA_SPEC = """{
172 | "namespace": "all-types.test",
173 | "type": "record",
174 | "name": "duration_micros",
175 | "fields":[
176 | {"name": "duration", "type": {"type": "fixed", "size": 12, "name": "name", "logicalType": "duration"}}
177 | ]
178 | }"""
179 |
180 | val COMPLEX_SCHEMA_SPEC = """{
181 | "type":"record",
182 | "name":"complex",
183 | "namespace":"all-types.test",
184 | "fields":
185 | [
186 | {"name":"name","type":"string"},
187 | {"name":"regions","type":
188 | {"type":"map","values":
189 | {"type":"array","items":
190 | {"type":"record","name":"City","fields":
191 | [
192 | {"name":"name","type":"string"},
193 | {"name":"neighborhoods","type":
194 | {"type":"array","items":
195 | {"type":"record","name":"Neighborhood","fields":
196 | [
197 | {"name":"name","type":"string"},
198 | {"name":"streets","type":
199 | {"type":"array","items":
200 | {"type":"record","name":"Street","fields":
201 | [
202 | {"name":"name","type":"string"},
203 | {"name":"zip","type":"string"}
204 | ]
205 | }
206 | }
207 | }
208 | ]
209 | }
210 | }
211 | }
212 | ]
213 | }
214 | }
215 | }
216 | }
217 | ]
218 | }"""
219 |
220 | val COMPLEX_SCHEMA_STREET_SPEC = """
221 | {
222 | "namespace":"test_city",
223 | "type":"record",
224 | "name":"Street",
225 | "fields":
226 | [
227 | {"name":"name","type":"string"},
228 | {"name":"zip","type":"string"}
229 | ]
230 | }"""
231 |
232 | val COMPLEX_SCHEMA_NEIGHBORHOOD_SPEC = """
233 | {
234 | "namespace":"test_neighborhood",
235 | "type":"record",
236 | "name":"Neighborhood",
237 | "fields":
238 | [
239 | {"name":"name","type":"string"},
240 | {"name":"streets",
241 | "type":
242 | {
243 | "type":"array",
244 | "items":
245 | {
246 | "type":"record",
247 | "name":"Street",
248 | "fields":
249 | [
250 | {"name":"name","type":"string"},
251 | {"name":"zip","type":"string"}
252 | ]
253 | }
254 | }
255 | }
256 | ]
257 | }"""
258 |
259 | val COMPLEX_SCHEMA_CITY_SPEC = """
260 | {
261 | "namespace":"test_city",
262 | "type":"record",
263 | "name":"City",
264 | "fields":
265 | [
266 | {"name":"name","type":"string"},
267 | {"name":"neighborhoods","type":
268 | {
269 | "type":"array",
270 | "items":
271 | {
272 | "type":"record",
273 | "name":"Neighborhood",
274 | "fields":
275 | [
276 | {"name":"name","type":"string"},
277 | {"name":"streets","type":
278 | {
279 | "type":"array",
280 | "items":
281 | {
282 | "type":"record",
283 | "name":"Street",
284 | "fields":
285 | [
286 | {"name":"name","type":"string"},
287 | {"name":"zip","type":"string"}
288 | ]
289 | }
290 | }
291 | }
292 | ]
293 | }
294 | }
295 | }
296 | ]
297 | }"""
298 |
299 | val CASE_CLASSES_SCHEMA = """
300 | {
301 | "type":"record",
302 | "name":"name",
303 | "namespace":"namespace",
304 | "fields":[
305 | {
306 | "name":"errCol",
307 | "type":[
308 | {
309 | "type":"array",
310 | "items":[
311 | {
312 | "type":"record",
313 | "name":"errCol",
314 | "namespace":"namespace.errCol",
315 | "fields":[
316 | {
317 | "name":"errType",
318 | "type":[
319 | "string",
320 | "null"
321 | ]
322 | },
323 | {
324 | "name":"errCode",
325 | "type":[
326 | "string",
327 | "null"
328 | ]
329 | },
330 | {
331 | "name":"errMsg",
332 | "type":[
333 | "string",
334 | "null"
335 | ]
336 | },
337 | {
338 | "name":"errCol",
339 | "type":[
340 | "string",
341 | "null"
342 | ]
343 | },
344 | {
345 | "name":"rawValues",
346 | "type":[
347 | {
348 | "type":"array",
349 | "items":[
350 | "string",
351 | "null"
352 | ]
353 | },
354 | "null"
355 | ]
356 | },
357 | {
358 | "name":"mappings",
359 | "type":[
360 | {
361 | "type":"array",
362 | "items":[
363 | {
364 | "type":"record",
365 | "name":"mappings",
366 | "namespace":"namespace.errCol.mappings",
367 | "fields":[
368 | {
369 | "name":"mappingTableColumn",
370 | "type":[
371 | "string",
372 | "null"
373 | ]
374 | },
375 | {
376 | "name":"mappedDatasetColumn",
377 | "type":[
378 | "string",
379 | "null"
380 | ]
381 | }
382 | ]
383 | },
384 | "null"
385 | ]
386 | },
387 | "null"
388 | ]
389 | }
390 | ]
391 | },
392 | "null"
393 | ]
394 | },
395 | "null"
396 | ]
397 | }
398 | ]
399 | }
400 | """
401 | }
402 |
--------------------------------------------------------------------------------