├── src ├── test │ ├── resources │ │ ├── .gitignore │ │ └── META-INF │ │ │ └── services │ │ │ └── za.co.absa.abris.avro.sql.SchemaConverter │ └── scala │ │ └── za │ │ └── co │ │ └── absa │ │ └── abris │ │ ├── avro │ │ ├── sql │ │ │ ├── DummySchemaConverter.scala │ │ │ ├── package.scala │ │ │ ├── CatalystDataToAvroSpec.scala │ │ │ ├── SchemaEvolutionSpec.scala │ │ │ └── AvroDataToCatalystSpec.scala │ │ ├── utils │ │ │ └── AvroSchemaEncoder.scala │ │ ├── registry │ │ │ ├── TestRegistryClient.scala │ │ │ └── SchemaSubjectSpec.scala │ │ ├── errors │ │ │ ├── FailFastExceptionHandlerSpec.scala │ │ │ ├── PermissiveRecordExceptionHandlerSpec.scala │ │ │ └── SpecificRecordExceptionHandlerSpec.scala │ │ ├── schemas │ │ │ └── SchemaLoaderSpec.scala │ │ ├── parsing │ │ │ └── utils │ │ │ │ └── AvroSchemaUtilsSpec.scala │ │ ├── read │ │ │ └── confluent │ │ │ │ ├── SchemaManagerFactorySpec.scala │ │ │ │ └── SchemaManagerSpec.scala │ │ └── format │ │ │ └── SparkAvroConversionsSpec.scala │ │ └── config │ │ ├── ToAvroConfigSpec.scala │ │ ├── InternalToAvroConfigSpec.scala │ │ ├── InternalFromAvroConfigSpec.scala │ │ └── FromAvroConfigSpec.scala └── main │ ├── resources │ ├── log4j.properties │ └── META-INF │ │ └── services │ │ └── za.co.absa.abris.avro.sql.SchemaConverter │ ├── scala │ ├── za │ │ └── co │ │ │ └── absa │ │ │ └── abris │ │ │ ├── avro │ │ │ ├── read │ │ │ │ └── confluent │ │ │ │ │ ├── ConfluentConstants.scala │ │ │ │ │ ├── SchemaManagerFactory.scala │ │ │ │ │ └── SchemaManager.scala │ │ │ ├── sql │ │ │ │ ├── SchemaConverter.scala │ │ │ │ ├── DefaultSchemaConverter.scala │ │ │ │ ├── CatalystDataToAvro.scala │ │ │ │ └── AvroDataToCatalyst.scala │ │ │ ├── registry │ │ │ │ ├── SchemaVersion.scala │ │ │ │ ├── SchemaCoordinate.scala │ │ │ │ ├── AbrisRegistryClient.scala │ │ │ │ ├── ConfluentRegistryClient.scala │ │ │ │ ├── ConfluentMockRegistryClient.scala │ │ │ │ ├── AbstractConfluentRegistryClient.scala │ │ │ │ └── SchemaSubject.scala │ │ │ ├── errors │ │ │ │ ├── DeserializationExceptionHandler.scala │ │ │ │ ├── FailFastExceptionHandler.scala │ │ │ │ ├── SpecificRecordExceptionHandler.scala │ │ │ │ └── PermissiveRecordExceptionHandler.scala │ │ │ ├── format │ │ │ │ └── SparkAvroConversions.scala │ │ │ ├── functions.scala │ │ │ └── parsing │ │ │ │ └── utils │ │ │ │ └── AvroSchemaUtils.scala │ │ │ ├── config │ │ │ ├── InternalToAvroConfig.scala │ │ │ └── InternalFromAvroConfig.scala │ │ │ └── examples │ │ │ ├── data │ │ │ └── generation │ │ │ │ ├── FixedString.scala │ │ │ │ ├── ComplexRecordsGenerator.scala │ │ │ │ └── TestSchemas.scala │ │ │ ├── utils │ │ │ ├── CompatibleRowEncoder.scala │ │ │ └── ExamplesUtils.scala │ │ │ ├── ConfluentKafkaAvroReader.scala │ │ │ └── ConfluentKafkaAvroWriter.scala │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── avro │ │ ├── AbrisAvroSerializer.scala │ │ └── AbrisAvroDeserializer.scala │ └── avro │ ├── native-simple-outer-schema.avsc │ └── native-complete-schema.avsc ├── .gitignore ├── .github └── workflows │ ├── test-and-verify.yml │ ├── compatibility-check.yml │ └── ci-check-jacoco.yml ├── documentation ├── python-documentation.md ├── vanilla-avro-documentation.md └── confluent-avro-documentation.md ├── scalastyle-config.xml ├── .editorconfig ├── LICENSE.md └── README.md /src/test/resources/.gitignore: -------------------------------------------------------------------------------- 1 | /sampleData/ 2 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: 
-------------------------------------------------------------------------------- 1 | log4j.rootLogger=ERROR,stdout 2 | log4j.logger.com.endeca=INFO 3 | # Logger for crawl metrics 4 | log4j.logger.com.endeca.itl.web.metrics=INFO 5 | 6 | log4j.logger.org.apache.kafka.clients.consumer.internals.Fetcher=WARN 7 | log4j.logger.org.apache.spark.ContextCleaner=WARN 8 | log4j.logger.za.co.absa.abris.avro.read.confluent.SchemaManager$=WARN 9 | log4j.logger.za.co.absa.abris.avro.subject.SubjectNameStrategyAdapterFactory$=WARN 10 | 11 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 12 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | 3 | .cache-main 4 | .cache-tests 5 | 6 | # use glob syntax. 7 | syntax: glob 8 | *.ser 9 | *.class 10 | *~ 11 | *.bak 12 | #*.off 13 | *.old 14 | 15 | # eclipse conf file 16 | .settings 17 | .classpath 18 | .manager 19 | .scala_dependencies 20 | 21 | # idea 22 | .idea 23 | *.iml 24 | 25 | # building 26 | target 27 | build 28 | null 29 | tmp* 30 | temp* 31 | dist 32 | test-output 33 | build.log 34 | 35 | # other scm 36 | .svn 37 | .CVS 38 | .hg* 39 | 40 | # switch to regexp syntax. 41 | # syntax: regexp 42 | # ^\.pc/ 43 | 44 | #SHITTY output not in target directory 45 | build.log 46 | 47 | .cache* 48 | dependency-reduced-pom.xml 49 | 50 | _testOutput 51 | output 52 | /keystore/ 53 | -------------------------------------------------------------------------------- /.github/workflows/test-and-verify.yml: -------------------------------------------------------------------------------- 1 | name: Test and verify 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | spark: [ 3.2, 3.3, 3.4, 3.5 ] 16 | scala: [ 2.12, 2.13 ] 17 | 18 | name: Spark ${{ matrix.spark }}, Scala ${{ matrix.scala }} 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up JDK 1.8 23 | uses: actions/setup-java@v1 24 | with: 25 | java-version: 1.8 26 | - name: Run tests 27 | run: mvn clean verify -Plicense-check,spark-${{ matrix.spark }},scala-${{ matrix.scala }} 28 | -------------------------------------------------------------------------------- /src/test/resources/META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2022 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | za.co.absa.abris.avro.sql.DummySchemaConverter 17 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2022 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | za.co.absa.abris.avro.sql.DefaultSchemaConverter 17 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/read/confluent/ConfluentConstants.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.read.confluent 18 | 19 | object ConfluentConstants { 20 | 21 | val MAGIC_BYTE = 0x0 22 | val SCHEMA_ID_SIZE_BYTES = 4 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/sql/SchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.sql.types.DataType 21 | 22 | trait SchemaConverter { 23 | val shortName: String 24 | def toSqlType(avroSchema: Schema): DataType 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/SchemaVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | /** 20 | * Version of schema stored in Confluent Schema Registry 21 | */ 22 | trait SchemaVersion 23 | case class NumVersion(num: Int) extends SchemaVersion 24 | case class LatestVersion() extends SchemaVersion 25 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/errors/DeserializationExceptionHandler.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.sql.avro.AbrisAvroDeserializer 21 | 22 | trait DeserializationExceptionHandler extends Serializable { 23 | 24 | def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/SchemaCoordinate.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | /** 20 | * Coordinate that unambiguously identifies a schema in the schema registry 21 | */ 22 | trait SchemaCoordinate 23 | 24 | case class IdCoordinate(schemaId: Int) extends SchemaCoordinate 25 | 26 | case class SubjectCoordinate(subject: SchemaSubject, version: SchemaVersion) extends SchemaCoordinate 27 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/sql/DefaultSchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | import org.apache.avro.Schema 19 | import org.apache.spark.sql.avro.SchemaConverters 20 | import org.apache.spark.sql.types.DataType 21 | 22 | class DefaultSchemaConverter extends SchemaConverter { 23 | override val shortName: String = "default" 24 | override def toSqlType(avroSchema: Schema): DataType = SchemaConverters.toSqlType(avroSchema).dataType 25 | } 26 | -------------------------------------------------------------------------------- /.github/workflows/compatibility-check.yml: -------------------------------------------------------------------------------- 1 | name: Binary Compatibility 2 | 3 | on: 4 | push: 5 | branches: [ master, branch-3.2 ] 6 | pull_request: 7 | branches: [ master, branch-3.2 ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | spark: [ 3.2 ] 17 | scala: [ 2.12, 2.13 ] 18 | name: Spark ${{ matrix.spark }}, Scala ${{ matrix.scala }} 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up JDK 1.8 22 | uses: actions/setup-java@v1 23 | with: 24 | java-version: 1.8 25 | - uses: actions/cache@v2 26 | with: 27 | path: ~/.m2/repository 28 | key: ${{ runner.os }}-${{ matrix.scala }}-${{ hashFiles('**/pom.xml') }} 29 | restore-keys: | 30 | ${{ runner.os }}-${{ matrix.scala }}- 31 | - name: Switch scala version 32 | run: mvn scala-cross-build:change-version -Pscala-${{ matrix.scala }} 33 | - name: Check binary compatibility 34 | run: mvn clean test -DskipTests -Pcompatibility-check,scala-${{ matrix.scala }} 35 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/config/InternalToAvroConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.config 18 | 19 | import org.apache.avro.Schema 20 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 21 | import za.co.absa.abris.config.ToAvroConfig.Key 22 | 23 | private[abris] class InternalToAvroConfig(map: Map[String, Any]) { 24 | 25 | val schema: Schema = AvroSchemaUtils.parse(map(Key.Schema).asInstanceOf[String]) 26 | 27 | val schemaId: Option[Int] = map.get(Key.SchemaId).map(_.asInstanceOf[Int]) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/errors/FailFastExceptionHandler.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.SparkException 21 | import org.apache.spark.sql.avro.AbrisAvroDeserializer 22 | 23 | class FailFastExceptionHandler extends DeserializationExceptionHandler { 24 | 25 | def handle(exception: Throwable, avroDeserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = { 26 | throw new SparkException("Malformed record detected.", exception) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/sql/DummySchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType} 21 | import za.co.absa.abris.avro.sql.DummySchemaConverter._ 22 | 23 | class DummySchemaConverter extends SchemaConverter { 24 | override val shortName: String = name 25 | override def toSqlType(avroSchema: Schema): DataType = dataType 26 | } 27 | 28 | object DummySchemaConverter { 29 | val name: String = "dummy" 30 | val dataType: DataType = StructType(Seq(StructField("long", LongType))) 31 | } 32 | -------------------------------------------------------------------------------- /src/main/avro/native-simple-outer-schema.avsc: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright 2018 ABSA Group Limited 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | { 19 | "type": "record", 20 | "name": "NativeSimpleOuter", 21 | "namespace": "all_types.test", 22 | "fields": [ 23 | { 24 | "name": "name", 25 | "type": "string" 26 | }, 27 | { 28 | "name": "nested", 29 | "type": { 30 | "type": "record", 31 | "name": "Nested", 32 | "fields": [ 33 | { 34 | "name": "int", 35 | "type": "int" 36 | }, 37 | { 38 | "name": "long", 39 | "type": "long" 40 | } 41 | ] 42 | } 43 | } 44 | ] 45 | } -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/data/generation/FixedString.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.examples.data.generation 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.avro.generic.GenericFixed 21 | import za.co.absa.commons.annotation.DeveloperApi 22 | 23 | @DeveloperApi 24 | object FixedString { 25 | def getClassName(): String = new FixedString("").getClass.getName 26 | } 27 | 28 | /** 29 | * Utility class for writing Avro fixed fields. 
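 * * A minimal usage sketch for test-data generation (the record and the field name "fixed" below are hypothetical, not taken from this codebase): * {{{ * val record = new org.apache.avro.generic.GenericData.Record(schemaWithFixedField) // assumed to declare a fixed-type field * record.put("fixed", new FixedString("some payload")) * }}} * Note that getSchema() returns null, so instances only carry raw bytes for an externally known writer schema.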
30 | */ 31 | @DeveloperApi 32 | class FixedString(value: String) extends GenericFixed { 33 | override def getSchema(): Schema = null 34 | override def bytes(): Array[Byte] = value.getBytes 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/avro/AbrisAvroSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.sql.avro 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.sql.types.DataType 21 | import za.co.absa.commons.annotation.DeveloperApi 22 | 23 | /** 24 | * Simple wrapper to access spark package private class 25 | */ 26 | @DeveloperApi 27 | class AbrisAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean) { 28 | 29 | private val serializer: AvroSerializer = new AvroSerializer(rootCatalystType, rootAvroType, nullable) 30 | 31 | def serialize(catalystData: Any): Any = { 32 | serializer.serialize(catalystData) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/AbrisRegistryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | import io.confluent.kafka.schemaregistry.client.SchemaMetadata 20 | import org.apache.avro.Schema 21 | 22 | trait AbrisRegistryClient { 23 | 24 | def getAllVersions(subject: String): java.util.List[Integer] 25 | 26 | def testCompatibility(subject: String, schema: Schema): Boolean 27 | 28 | def register(subject: String, schema: Schema): Int 29 | 30 | def getLatestSchemaMetadata(subject: String): SchemaMetadata 31 | 32 | def getSchemaMetadata(subject: String, version: Int): SchemaMetadata 33 | 34 | def getById(schemaId: Int): Schema 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/errors/SpecificRecordExceptionHandler.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.avro.specific.SpecificRecordBase 21 | import org.apache.spark.internal.Logging 22 | import org.apache.spark.sql.avro.AbrisAvroDeserializer 23 | 24 | class SpecificRecordExceptionHandler(defaultRecord: SpecificRecordBase) extends DeserializationExceptionHandler with Logging { 25 | 26 | def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = { 27 | logWarning("Malformed record detected. Replacing with default record.", exception) 28 | deserializer.deserialize(defaultRecord) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/utils/AvroSchemaEncoder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.utils 18 | 19 | import org.apache.spark.sql.{Encoder, Row} 20 | import za.co.absa.abris.avro.format.SparkAvroConversions 21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 22 | import za.co.absa.abris.examples.data.generation.ComplexRecordsGenerator 23 | import za.co.absa.abris.examples.utils.CompatibleRowEncoder 24 | 25 | class AvroSchemaEncoder { 26 | 27 | def getEncoder: Encoder[Row] = { 28 | val avroSchema = AvroSchemaUtils.parse(ComplexRecordsGenerator.usedAvroSchema) 29 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema) 30 | CompatibleRowEncoder.apply(sparkSchema) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/errors/PermissiveRecordExceptionHandler.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.avro.generic.{GenericData, GenericRecord} 21 | import org.apache.avro.specific.SpecificRecordBase 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.sql.avro.AbrisAvroDeserializer 24 | 25 | class PermissiveRecordExceptionHandler() extends DeserializationExceptionHandler with Logging { 26 | 27 | def handle(exception: Throwable, deserializer: AbrisAvroDeserializer, readerSchema: Schema): Any = { 28 | logWarning("Malformed record detected. Replacing with full null row.", exception) 29 | val record = new GenericData.Record(readerSchema) 30 | deserializer.deserialize(record) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/registry/TestRegistryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | import io.confluent.kafka.schemaregistry.client.SchemaMetadata 19 | import org.apache.avro.Schema 20 | 21 | import java.util 22 | 23 | class TestRegistryClient(config: Map[String, String]) extends AbrisRegistryClient { 24 | 25 | override def getAllVersions(subject: String): util.List[Integer] = ??? 
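 // Every member is deliberately left as ???, which throws scala.NotImplementedError when invoked; this stub presumably exists only so tests can exercise the Map[String, String] config constructor through which custom registry clients are instantiated reflectively (see SchemaManagerFactorySpec).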
26 | 27 | override def testCompatibility(subject: String, schema: Schema): Boolean = ??? 28 | 29 | override def register(subject: String, schema: Schema): Int = ??? 30 | 31 | override def getLatestSchemaMetadata(subject: String): SchemaMetadata = ??? 32 | 33 | override def getSchemaMetadata(subject: String, version: Int): SchemaMetadata = ??? 34 | 35 | override def getById(schemaId: Int): Schema = ??? 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/ci-check-jacoco.yml: -------------------------------------------------------------------------------- 1 | name: CI check JaCoCo code-coverage 2 | 3 | on: 4 | pull_request: 5 | branches: [ master ] 6 | types: [ opened, edited, synchronize, reopened ] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v2 14 | - name: Setup Scala 15 | uses: olafurpg/setup-scala@v10 16 | with: 17 | java-version: "adopt@1.8" 18 | - name: Build and run tests 19 | run: mvn clean verify -Pcode-coverage 20 | - name: Add coverage to PR 21 | id: jacoco 22 | uses: madrapps/jacoco-report@v1.4 23 | with: 24 | paths: ${{ github.workspace }}/target/site/jacoco/jacoco.xml 25 | token: ${{ secrets.GITHUB_TOKEN }} 26 | min-coverage-overall: 0.0 27 | min-coverage-changed-files: 80.0 28 | title: JaCoCo code coverage report 29 | update-comment: true 30 | - name: Get the Coverage info 31 | run: | 32 | echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}" 33 | echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}" 34 | - name: Fail PR if changed files coverage is less than 80% 35 | if: ${{ steps.jacoco.outputs.coverage-changed-files < 80.0 }} 36 | uses: actions/github-script@v6 37 | with: 38 | script: | 39 | core.setFailed('Changed files coverage is less than 80%!') 40 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/ConfluentRegistryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient} 19 | import io.confluent.kafka.serializers.KafkaAvroDeserializerConfig 20 | 21 | import scala.collection.JavaConverters._ 22 | 23 | class ConfluentRegistryClient(client: SchemaRegistryClient) extends AbstractConfluentRegistryClient(client) { 24 | 25 | def this(configs: Map[String,String]) = this(ConfluentRegistryClient.createClient(configs)) 26 | } 27 | 28 | object ConfluentRegistryClient { 29 | 30 | private def createClient(configs: Map[String,String]) = { 31 | val settings = new KafkaAvroDeserializerConfig(configs.asJava) 32 | val urls = settings.getSchemaRegistryUrls 33 | val maxSchemaObject = settings.getMaxSchemasPerSubject 34 | 35 | new CachedSchemaRegistryClient(urls, maxSchemaObject, configs.asJava) 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/config/InternalFromAvroConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.config 18 | 19 | import org.apache.avro.Schema 20 | import za.co.absa.abris.avro.errors.{FailFastExceptionHandler, DeserializationExceptionHandler} 21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 22 | import za.co.absa.abris.config.FromAvroConfig.Key 23 | 24 | private[abris] class InternalFromAvroConfig(map: Map[String, Any]) { 25 | 26 | val readerSchema: Schema = AvroSchemaUtils.parse(map(Key.ReaderSchema).asInstanceOf[String]) 27 | 28 | val writerSchema: Option[Schema] = map 29 | .get(Key.WriterSchema) 30 | .map(s => AvroSchemaUtils.parse(s.asInstanceOf[String])) 31 | 32 | val schemaConverter: Option[String] = map 33 | .get(Key.SchemaConverter) 34 | .map(_.asInstanceOf[String]) 35 | 36 | val deserializationHandler: DeserializationExceptionHandler = map 37 | .get(Key.ExceptionHandler) 38 | .map(s => s.asInstanceOf[DeserializationExceptionHandler]) 39 | .getOrElse(new FailFastExceptionHandler) 40 | } 41 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/sql/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro 18 | 19 | import org.apache.spark.sql.DataFrame 20 | import org.scalatest.matchers.should.Matchers._ 21 | 22 | package object sql { 23 | 24 | /** 25 | * Asserts that both DataFrames contain the same data, comparing them column by column. 26 | * 27 | * @param expectedFrame the DataFrame holding the expected data 28 | * @param actualFrame the DataFrame produced by the code under test 29 | */ 30 | def shouldEqualByData(expectedFrame: DataFrame, actualFrame: DataFrame): Unit = { 31 | 32 | def columnNames(frame: DataFrame) = frame.schema.fields.map(_.name) 33 | 34 | val expectedColNames = columnNames(expectedFrame) 35 | val actualColNames = columnNames(actualFrame) 36 | 37 | expectedColNames shouldEqual actualColNames 38 | 39 | expectedColNames.foreach(col => { 40 | val expectedColumn = expectedFrame.select(col).collect().map(row => row.toSeq.head) 41 | val actualColumn = actualFrame.select(col).collect().map(row => row.toSeq.head) 42 | 43 | for ((expected, actual) <- expectedColumn.zip(actualColumn)) { 44 | actual shouldEqual expected 45 | } 46 | }) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/ConfluentMockRegistryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException 20 | import io.confluent.kafka.schemaregistry.client.{MockSchemaRegistryClient, SchemaMetadata, SchemaRegistryClient} 21 | 22 | import java.io.IOException 23 | 24 | 25 | class ConfluentMockRegistryClient(client: SchemaRegistryClient) extends AbstractConfluentRegistryClient(client) { 26 | 27 | def this() = this(new MockSchemaRegistryClient()) 28 | 29 | /** 30 | * MockSchemaRegistryClient throws a different exception than the client it mocks; this is a workaround 31 | */ 32 | @throws[IOException] 33 | @throws[RestClientException] 34 | override def getLatestSchemaMetadata(subject: String): SchemaMetadata = { 35 | try client.getLatestSchemaMetadata(subject) 36 | catch { 37 | case e: IOException if e.getMessage == "No schema registered under subject!" => 38 | throw new RestClientException("No schema registered under subject!", 404, 40401) 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/utils/CompatibleRowEncoder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.examples.utils 18 | 19 | import org.apache.spark.sql.{Encoder, Row} 20 | import org.apache.spark.sql.types.StructType 21 | 22 | import scala.util.Try 23 | 24 | object CompatibleRowEncoder { 25 | def apply(schema: StructType): Encoder[Row] = { 26 | // Spark < 3.5.0 27 | val rowEncoderTry = Try { 28 | val rowEncoderClass = Class.forName("org.apache.spark.sql.catalyst.encoders.RowEncoder") 29 | val applyMethod = rowEncoderClass.getMethod("apply", classOf[StructType]) 30 | applyMethod.invoke(null, schema).asInstanceOf[Encoder[Row]] 31 | } 32 | 33 | // Spark >= 3.5.0 34 | rowEncoderTry.orElse(Try { 35 | val encodersClass = Class.forName("org.apache.spark.sql.Encoders") 36 | val rowMethod = encodersClass.getMethod("row", classOf[StructType]) 37 | rowMethod.invoke(null, schema).asInstanceOf[Encoder[Row]] 38 | }).getOrElse { 39 | throw new IllegalStateException("Neither RowEncoder.apply nor Encoders.row is available in the Spark version.") 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/config/ToAvroConfigSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.config 18 | 19 | import org.scalatest.flatspec.AnyFlatSpec 20 | import org.scalatest.matchers.should.Matchers 21 | import za.co.absa.abris.config.ToAvroConfig.Key 22 | 23 | class ToAvroConfigSpec extends AnyFlatSpec with Matchers { 24 | 25 | behavior of "ToAvroConfig" 26 | 27 | it should "provide map with all set configurations" in { 28 | val config = ToAvroConfig() 29 | .withSchema("foo") 30 | .withSchemaId(42) 31 | 32 | val map = config.abrisConfig() 33 | map(Key.Schema) shouldBe "foo" 34 | map(Key.SchemaId) shouldBe 42 35 | } 36 | 37 | it should "support the legacy constructor and methods" in { 38 | val config = new ToAvroConfig("foo", Some(2)) 39 | 40 | config.schemaString() shouldBe "foo" 41 | config.schemaId() shouldBe Some(2) 42 | 43 | val map = config.abrisConfig() 44 | map(Key.Schema) shouldBe "foo" 45 | map(Key.SchemaId) shouldBe 2 46 | } 47 | 48 | it should "throw when validation fails" in { 49 | val config = ToAvroConfig() 50 | 51 | val thrown = intercept[IllegalArgumentException] { 52 | config.validate() 53 | } 54 | thrown.getMessage.contains(Key.Schema) shouldBe true 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/errors/FailFastExceptionHandlerSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import org.apache.spark.SparkException 20 | import org.apache.spark.sql.avro.{AbrisAvroDeserializer, SchemaConverters} 21 | import org.apache.spark.sql.types.DataType 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 25 | import za.co.absa.abris.examples.data.generation.TestSchemas 26 | 27 | 28 | class FailFastExceptionHandlerSpec extends AnyFlatSpec with Matchers { 29 | 30 | it should "throw a Spark exception on error" in { 31 | 32 | val deserializationExceptionHandler = new FailFastExceptionHandler 33 | val schema = AvroSchemaUtils.parse(TestSchemas.COMPLEX_SCHEMA_SPEC) 34 | val dataType: DataType = SchemaConverters.toSqlType(schema).dataType 35 | val deserializer = new AbrisAvroDeserializer(schema, dataType) 36 | 37 | an[SparkException] should be thrownBy (deserializationExceptionHandler.handle(new Exception, deserializer, schema)) 38 | val exceptionThrown = the[SparkException] thrownBy (deserializationExceptionHandler.handle(new Exception, deserializer, schema)) 39 | exceptionThrown.getMessage should equal("Malformed record detected.") 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/format/SparkAvroConversions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.format 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.sql.avro.SchemaConverters 21 | import org.apache.spark.sql.types._ 22 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 23 | 24 | 25 | /** 26 | * This object provides conversions between Avro and Spark schemas and data. 27 | */ 28 | object SparkAvroConversions { 29 | 30 | /** 31 | * Converts a Spark SQL type into an Avro schema, using specific names and namespaces for the schema. 32 | */ 33 | def toAvroSchema( 34 | structType: StructType, 35 | schemaName: String, 36 | schemaNamespace: String): Schema = { 37 | SchemaConverters.toAvroType(structType, false, schemaName, schemaNamespace) 38 | } 39 | 40 | /** 41 | * Translates an Avro Schema into a Spark StructType. 42 | * 43 | * Relies on the Spark-Avro library to do the job. 44 | */ 45 | def toSqlType(schema: String): StructType = toSqlType(AvroSchemaUtils.parse(schema)) 46 | 47 | /** 48 | * Translates an Avro Schema into a Spark StructType. 49 | * 50 | * Relies on the Spark-Avro library to do the job.
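 * * A minimal sketch of the conversion (the inline record schema below is illustrative only, not part of this codebase): * {{{ * val avroSchema = new org.apache.avro.Schema.Parser().parse( * "{\"type\": \"record\", \"name\": \"Hypothetical\", \"fields\": [{\"name\": \"id\", \"type\": \"int\"}]}") * SparkAvroConversions.toSqlType(avroSchema) // roughly StructType(StructField("id", IntegerType, nullable = false)) * }}}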
51 | */ 52 | def toSqlType(schema: Schema): StructType = { 53 | SchemaConverters.toSqlType(schema).dataType.asInstanceOf[StructType] 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/sql/CatalystDataToAvroSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import org.apache.avro.SchemaBuilder 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} 22 | import org.apache.spark.sql.functions.col 23 | import org.scalatest.BeforeAndAfterEach 24 | import org.scalatest.flatspec.AnyFlatSpec 25 | import org.scalatest.matchers.should.Matchers 26 | import za.co.absa.abris.avro.functions._ 27 | import za.co.absa.abris.config.ToAvroConfig 28 | 29 | class CatalystDataToAvroSpec extends AnyFlatSpec with Matchers with BeforeAndAfterEach { 30 | it should "be serializable" in { 31 | val schema = SchemaBuilder 32 | .record("foo") 33 | .namespace("test_namespace") 34 | .fields() 35 | .name("int").`type`().intType().noDefault() 36 | .endRecord() 37 | .toString 38 | val config = ToAvroConfig().withSchema(schema) 39 | val catalystDataToAvro = to_avro(col("col"), config).expr 40 | 41 | val javaSerializer = new JavaSerializer(new SparkConf()) 42 | javaSerializer.newInstance().serialize(catalystDataToAvro) 43 | 44 | val kryoSerializer = new KryoSerializer(new SparkConf()) 45 | kryoSerializer.newInstance().serialize(catalystDataToAvro) 46 | 47 | // test successful if no exception is thrown 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/schemas/SchemaLoaderSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.schemas 18 | 19 | import org.apache.commons.io.FileUtils 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 22 | import za.co.absa.abris.examples.data.generation.TestSchemas 23 | 24 | import java.io.File 25 | import java.nio.charset.Charset 26 | 27 | class SchemaLoaderSpec extends AnyFlatSpec { 28 | 29 | private val testDir = new File("testDirSchemaLoader") 30 | 31 | behavior of "SchemaLoader" 32 | 33 | it should "retrieve schemas from file systems" in { 34 | val expectedSchemaString = TestSchemas.COMPLEX_SCHEMA_SPEC 35 | val expectedSchema = AvroSchemaUtils.parse(expectedSchemaString) 36 | val schemaFileName = "testSchemaName" 37 | val destination = writeIntoFS(expectedSchemaString, schemaFileName) 38 | val loadedSchema = AvroSchemaUtils.load(destination.getAbsolutePath) 39 | 40 | FileUtils.deleteQuietly(new File(destination.getAbsolutePath)) 41 | FileUtils.deleteDirectory(testDir) 42 | 43 | assert(expectedSchema.equals(loadedSchema)) 44 | } 45 | 46 | private def writeIntoFS(schema: String, name: String): File = { 47 | val destination = new File(testDir, name) 48 | FileUtils.write(destination, schema, Charset.defaultCharset) 49 | destination 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/config/InternalToAvroConfigSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.config 18 | 19 | import org.apache.avro.SchemaBuilder 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import org.scalatest.matchers.should.Matchers 22 | 23 | class InternalToAvroConfigSpec extends AnyFlatSpec with Matchers { 24 | 25 | import InternalToAvroConfigSpec._ 26 | 27 | behavior of "InternalToAvroConfig" 28 | 29 | it should "convert and provide all set properties" in { 30 | val config = ToAvroConfig() 31 | .withSchema(avroSchema.toString) 32 | .withSchemaId(42) 33 | 34 | val intConfig = new InternalToAvroConfig(config.abrisConfig()) 35 | 36 | val schema = intConfig.schema 37 | schema.getName shouldBe "foo" 38 | schema.getNamespace shouldBe "test_namespace" 39 | schema.getFields.size() shouldBe 2 40 | 41 | intConfig.schemaId shouldBe Some(42) 42 | } 43 | 44 | it should "return None for optional properties that were not set" in { 45 | val config = ToAvroConfig() 46 | .withSchema(avroSchema.toString) 47 | 48 | val intConfig = new InternalToAvroConfig(config.abrisConfig()) 49 | 50 | intConfig.schemaId shouldBe None 51 | } 52 | } 53 | 54 | object InternalToAvroConfigSpec { 55 | 56 | val avroSchema = SchemaBuilder 57 | .record("foo") 58 | .namespace("test_namespace") 59 | .fields() 60 | .name("int").`type`().intType().noDefault() 61 | .name("bytes_name").`type`().stringType().noDefault() 62 | .endRecord() 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/AbstractConfluentRegistryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | import io.confluent.kafka.schemaregistry.avro.AvroSchema 20 | import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient} 21 | import org.apache.avro.Schema 22 | 23 | import java.util 24 | 25 | 26 | abstract class AbstractConfluentRegistryClient(client: SchemaRegistryClient) extends AbrisRegistryClient { 27 | 28 | override def getAllVersions(subject: String): util.List[Integer] = 29 | client.getAllVersions(subject) 30 | 31 | override def testCompatibility(subject: String, schema: Schema): Boolean = 32 | client.testCompatibility(subject, new AvroSchema(schema)) 33 | 34 | override def register(subject: String, schema: Schema): Int = 35 | client.register(subject, new AvroSchema(schema)) 36 | 37 | override def getLatestSchemaMetadata(subject: String): SchemaMetadata = 38 | client.getLatestSchemaMetadata(subject) 39 | 40 | override def getSchemaMetadata(subject: String, version: Int): SchemaMetadata = 41 | client.getSchemaMetadata(subject, version) 42 | 43 | override def getById(schemaId: Int): Schema = { 44 | val parsedSchema = client.getSchemaById(schemaId) 45 | parsedSchema match { 46 | case schema: AvroSchema => schema.rawSchema() 47 | case schema => throw new UnsupportedOperationException(s"Only AvroSchema is supported," + 48 | s" got schema type ${schema.schemaType()}") 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/avro/native-complete-schema.avsc: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * Copyright 2018 ABSA Group Limited 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | { 19 | "namespace": "all_types.test", 20 | "type": "record", 21 | "name": "NativeComplete", 22 | "fields": [ 23 | { 24 | "name": "bytes", 25 | "type": "bytes" 26 | }, 27 | { 28 | "name": "string", 29 | "type": [ 30 | "string", 31 | "null" 32 | ], 33 | "default": "blue" 34 | }, 35 | { 36 | "name": "int", 37 | "type": [ 38 | "int", 39 | "null" 40 | ] 41 | }, 42 | { 43 | "name": "long", 44 | "type": [ 45 | "long", 46 | "null" 47 | ] 48 | }, 49 | { 50 | "name": "double", 51 | "type": [ 52 | "double", 53 | "null" 54 | ] 55 | }, 56 | { 57 | "name": "float", 58 | "type": [ 59 | "float", 60 | "null" 61 | ] 62 | }, 63 | { 64 | "name": "boolean", 65 | "type": [ 66 | "boolean", 67 | "null" 68 | ] 69 | }, 70 | { 71 | "name": "array", 72 | "type": { 73 | "type": "array", 74 | "items": "string" 75 | } 76 | }, 77 | { 78 | "name": "map", 79 | "type": { 80 | "type": "map", 81 | "values": { 82 | "type": "array", 83 | "items": "long" 84 | } 85 | } 86 | }, 87 | { 88 | "name": "fixed", 89 | "type": { 90 | "type": "fixed", 91 | "size": 40, 92 | "name": "Fixed" 93 | } 94 | } 95 | ] 96 | } -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/ConfluentKafkaAvroReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.examples 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.functions.col 21 | import org.apache.spark.sql.streaming.Trigger 22 | import za.co.absa.abris.avro.read.confluent.SchemaManagerFactory 23 | import za.co.absa.abris.avro.registry.SchemaSubject 24 | import za.co.absa.abris.config.AbrisConfig 25 | 26 | import scala.concurrent.duration.DurationInt 27 | 28 | object ConfluentKafkaAvroReader { 29 | 30 | val kafkaTopicName = "test_topic" 31 | 32 | def main(args: Array[String]): Unit = { 33 | 34 | val schemaManager = SchemaManagerFactory.create(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081")) 35 | val schemaExists = schemaManager.exists(SchemaSubject.usingTopicNameStrategy("foo")) // demonstrates a subject-existence check; the result is not used below 36 | 37 | 38 | val spark = SparkSession 39 | .builder() 40 | .appName("ReaderJob") 41 | .master("local[2]") 42 | .getOrCreate() 43 | 44 | 45 | spark.sparkContext.setLogLevel("INFO") 46 | 47 | val dataFrame = spark 48 | .readStream 49 | .format("kafka") 50 | .option("kafka.bootstrap.servers", "localhost:9092") 51 | .option("subscribe", kafkaTopicName) 52 | .option("startingOffsets", "earliest") 53 | .load() 54 | 55 | val abrisConfig = AbrisConfig 56 | .fromConfluentAvro 57 | .downloadReaderSchemaByLatestVersion 58 | .andTopicNameStrategy(kafkaTopicName) 59 | .usingSchemaRegistry("http://localhost:8081") 60 | 61 | import za.co.absa.abris.avro.functions.from_avro 62 | val deserialized = dataFrame.select(from_avro(col("value"), abrisConfig) as "data") 63 | 64 | deserialized.printSchema() 65 | 66 | deserialized 67 | .writeStream 68 | .format("console") 69 | .trigger(Trigger.ProcessingTime(5.seconds)) 70 | .option("truncate", "false") 71 | .start() 72 | .awaitTermination() 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/registry/SchemaSubjectSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | import org.scalatest.BeforeAndAfter 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 22 | 23 | class SchemaSubjectSpec extends AnyFlatSpec with BeforeAndAfter { 24 | 25 | private val schema = AvroSchemaUtils.parse( 26 | """{ 27 | |"type": "record", 28 | |"name": "Blah", 29 | |"namespace" : "Bleh", 30 | |"fields": [{ "name": "name", "type": "string" }] 31 | |}""".stripMargin) 32 | 33 | behavior of "SchemaSubject" 34 | 35 | it should "retrieve the correct subject name for TopicName strategy" in { 36 | 37 | assertResult("foo_topic-value")( 38 | SchemaSubject.usingTopicNameStrategy("foo_topic").asString 39 | ) 40 | 41 | assertResult("foo_topic-key")( 42 | SchemaSubject.usingTopicNameStrategy("foo_topic", isKey = true).asString 43 | ) 44 | } 45 | 46 | it should "retrieve the correct subject name for RecordName strategy" in { 47 | 48 | assertResult("foo_namespace.foo_name")( 49 | SchemaSubject.usingRecordNameStrategy("foo_name", "foo_namespace").asString 50 | ) 51 | } 52 | 53 | it should "retrieve the correct subject name for TopicRecordName strategy" in { 54 | 55 | assertResult("topic-foo_namespace.foo_name")( 56 | SchemaSubject.usingTopicRecordNameStrategy("topic", "foo_name", "foo_namespace").asString 57 | ) 58 | } 59 | 60 | it should "retrieve name and namespace for RecordName strategy from schema" in { 61 | 62 | assertResult("Bleh.Blah")( 63 | SchemaSubject.usingRecordNameStrategy(schema).asString 64 | ) 65 | } 66 | 67 | it should "retrieve name and namespace for TopicRecordName strategy from schema" in { 68 | 69 | assertResult("foo_topic-Bleh.Blah")( 70 | SchemaSubject.usingTopicRecordNameStrategy("foo_topic", schema).asString 71 | ) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/config/InternalFromAvroConfigSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.config 18 | 19 | import org.apache.avro.SchemaBuilder 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import org.scalatest.matchers.should.Matchers 22 | 23 | class InternalFromAvroConfigSpec extends AnyFlatSpec with Matchers { 24 | 25 | import InternalFromAvroConfigSpec._ 26 | 27 | behavior of "InternalFromAvroConfig" 28 | 29 | it should "convert and provide all set properties" in { 30 | val config = FromAvroConfig() 31 | .withReaderSchema(avroReaderSchema.toString) 32 | .withWriterSchema(avroWriterSchema.toString) 33 | 34 | val intConfig = new InternalFromAvroConfig(config.abrisConfig()) 35 | 36 | val readerSchema = intConfig.readerSchema 37 | readerSchema.getName shouldBe "reader" 38 | readerSchema.getNamespace shouldBe "test_namespace" 39 | readerSchema.getFields.size() shouldBe 2 40 | 41 | val writerSchema = intConfig.writerSchema.get 42 | writerSchema.getName shouldBe "writer" 43 | writerSchema.getNamespace shouldBe "test_namespace" 44 | writerSchema.getFields.size() shouldBe 1 45 | } 46 | 47 | it should "return None for optional properties that were not set" in { 48 | val config = FromAvroConfig() 49 | .withReaderSchema(avroReaderSchema.toString) 50 | 51 | val intConfig = new InternalFromAvroConfig(config.abrisConfig()) 52 | 53 | intConfig.writerSchema shouldBe None 54 | } 55 | } 56 | 57 | object InternalFromAvroConfigSpec { 58 | 59 | val avroReaderSchema = SchemaBuilder 60 | .record("reader") 61 | .namespace("test_namespace") 62 | .fields() 63 | .name("int").`type`().intType().noDefault() 64 | .name("bytes_name").`type`().stringType().noDefault() 65 | .endRecord() 66 | 67 | val avroWriterSchema = SchemaBuilder 68 | .record("writer") 69 | .namespace("test_namespace") 70 | .fields() 71 | .name("int").`type`().intType().noDefault() 72 | .endRecord() 73 | } 74 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/config/FromAvroConfigSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.config 18 | 19 | import org.scalatest.flatspec.AnyFlatSpec 20 | import org.scalatest.matchers.should.Matchers 21 | import za.co.absa.abris.config.FromAvroConfig.Key 22 | 23 | class FromAvroConfigSpec extends AnyFlatSpec with Matchers { 24 | 25 | behavior of "FromAvroConfig" 26 | 27 | it should "provide all set configurations" in { 28 | val dummySchemaConverter = "dummy" 29 | val config = FromAvroConfig() 30 | .withWriterSchema("foo") 31 | .withReaderSchema("bar") 32 | .withSchemaConverter(dummySchemaConverter) 33 | .withSchemaRegistryConfig(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "url")) 34 | 35 | val map = config.abrisConfig() 36 | map(Key.WriterSchema) shouldBe "foo" 37 | map(Key.ReaderSchema) shouldBe "bar" 38 | map(Key.SchemaConverter) shouldBe dummySchemaConverter 39 | 40 | config.schemaRegistryConf().get(AbrisConfig.SCHEMA_REGISTRY_URL) shouldBe "url" 41 | } 42 | 43 | it should "support the legacy constructor and methods" in { 44 | val config = new FromAvroConfig("foo", Some(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "url"))) 45 | 46 | config.schemaString() shouldBe "foo" 47 | config.schemaRegistryConf().get(AbrisConfig.SCHEMA_REGISTRY_URL) shouldBe "url" 48 | 49 | val map = config.abrisConfig() 50 | map(Key.ReaderSchema) shouldBe "foo" 51 | } 52 | 53 | it should "throw when validation fails" in { 54 | val config = FromAvroConfig() 55 | 56 | val thrown = intercept[IllegalArgumentException] { 57 | config.validate() 58 | } 59 | thrown.getMessage.contains(Key.ReaderSchema) shouldBe true 60 | 61 | val config2 = FromAvroConfig() 62 | .withWriterSchema("foo") 63 | .withReaderSchema("bar") 64 | .withSchemaRegistryConfig(Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "url")) 65 | 66 | val thrown2 = intercept[IllegalArgumentException] { 67 | config2.validate() 68 | } 69 | thrown2.getMessage.contains(Key.WriterSchema) shouldBe true 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/functions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro 18 | 19 | import org.apache.spark.sql.Column 20 | import za.co.absa.abris.avro.sql.{AvroDataToCatalyst, CatalystDataToAvro} 21 | import za.co.absa.abris.config.{AbrisConfig, FromAvroConfig, ToAvroConfig} 22 | 23 | 24 | // scalastyle:off: object.name 25 | object functions { 26 | // scalastyle:on: object.name 27 | // scalastyle:off: method.name 28 | 29 | /** 30 | * Converts the data of a Spark SQL column into binary Avro format. 31 | * @param column column containing data for conversion 32 | * @param config Abris configuration 33 | * @return column containing data in Avro format 34 | */ 35 | def to_avro(column: Column, config: ToAvroConfig): Column = { 36 | config.validate() 37 | 38 | new Column(CatalystDataToAvro( 39 | column.expr, 40 | config.abrisConfig() 41 | )) 42 | } 43 | 44 | /** 45 | * Converts the data of a Spark SQL column into binary Avro format using the provided schema. 46 | * @param column column containing data for conversion 47 | * @param schema Avro schema (as a JSON string) 48 | * @return column containing data in Avro format 49 | */ 50 | def to_avro(column: Column, schema: String): Column = { 51 | val config = AbrisConfig 52 | .toSimpleAvro 53 | .provideSchema(schema) 54 | 55 | to_avro(column, config) 56 | } 57 | 58 | /** 59 | * Converts a column of binary Avro data into its Spark SQL representation. 60 | * @param column column containing data for conversion 61 | * @param config Abris configuration 62 | * @return column with converted data 63 | */ 64 | def from_avro(column: Column, config: FromAvroConfig): Column = { 65 | config.validate() 66 | 67 | new Column(AvroDataToCatalyst( 68 | column.expr, 69 | config.abrisConfig(), 70 | config.schemaRegistryConf() 71 | )) 72 | } 73 | 74 | /** 75 | * Converts a column of binary Avro data into its Spark SQL representation using the provided schema. 76 | * @param column column containing data for conversion 77 | * @param schema Avro schema (as a JSON string) 78 | * @return column with converted data 79 | */ 80 | def from_avro(column: Column, schema: String): Column = { 81 | val config = AbrisConfig 82 | .fromSimpleAvro 83 | .provideSchema(schema) 84 | 85 | from_avro(column, config) 86 | } 87 | 88 | // scalastyle:on: method.name 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/registry/SchemaSubject.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.abris.avro.registry 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.avro.Schema.Type 21 | 22 | /** 23 | * Represents Confluent Schema Registry Subject created using naming strategy 24 | * 25 | * https://docs.confluent.io/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work 26 | * 27 | */ 28 | class SchemaSubject(val asString: String) { 29 | override def toString: String = asString 30 | } 31 | 32 | object SchemaSubject{ 33 | 34 | def usingTopicNameStrategy( 35 | topicName: String, 36 | isKey: Boolean = false 37 | ): SchemaSubject = { 38 | val suffix = if (isKey) "-key" else "-value" 39 | new SchemaSubject(topicName + suffix) 40 | } 41 | 42 | def usingRecordNameStrategy( 43 | recordName: String, 44 | recordNamespace: String 45 | ): SchemaSubject = { 46 | val dummySchema = createDummySchema(recordName, recordNamespace) 47 | new SchemaSubject(getRecordName(dummySchema)) 48 | } 49 | 50 | def usingRecordNameStrategy( 51 | schema: Schema 52 | ): SchemaSubject = { 53 | new SchemaSubject(getRecordName(schema)) 54 | } 55 | 56 | def usingTopicRecordNameStrategy( 57 | topicName: String, 58 | recordName: String, 59 | recordNamespace: String 60 | ): SchemaSubject = { 61 | val dummySchema = createDummySchema(recordName, recordNamespace) 62 | new SchemaSubject(topicName + "-" + getRecordName(dummySchema)) 63 | } 64 | 65 | def usingTopicRecordNameStrategy( 66 | topicName: String, 67 | schema: Schema 68 | ): SchemaSubject = { 69 | new SchemaSubject(topicName + "-" + getRecordName(schema)) 70 | } 71 | 72 | private def getRecordName(schema: Schema): String = 73 | if (schema.getType == Type.RECORD) { 74 | schema.getFullName 75 | } else { 76 | throw new IllegalArgumentException(s"Schema must be of type RECORD not ${schema.getType}") 77 | } 78 | 79 | private def createDummySchema(name: String, namespace: String) = 80 | Schema.createRecord(name, "", namespace, false) 81 | } 82 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/errors/PermissiveRecordExceptionHandlerSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.avro.{AbrisAvroDeserializer, SchemaConverters} 21 | import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow 22 | import org.apache.spark.sql.types.{DataType, StructType} 23 | import org.scalatest.flatspec.AnyFlatSpec 24 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 25 | import za.co.absa.abris.examples.data.generation.TestSchemas 26 | 27 | class PermissiveRecordExceptionHandlerSpec extends AnyFlatSpec { 28 | 29 | private val spark = SparkSession 30 | .builder() 31 | .appName("unitTest") 32 | .master("local[1]") 33 | .config("spark.driver.bindAddress", "localhost") 34 | .config("spark.ui.enabled", "false") 35 | .getOrCreate() 36 | 37 | it should "receive empty dataframe row back" in { 38 | 39 | val expectedNestedFieldSchema = new StructType() 40 | .add("int", "int") 41 | .add("long", "long") 42 | val expectedNestedStructSchema = new StructType() 43 | .add("name", "string") 44 | .add("nested", expectedNestedFieldSchema) 45 | 46 | val expectedNestedFieldInternalRow = new SpecificInternalRow(expectedNestedFieldSchema) 47 | expectedNestedFieldInternalRow.setNullAt(0) 48 | expectedNestedFieldInternalRow.setNullAt(1) 49 | 50 | val expectedNestedStructInternalRow = new SpecificInternalRow(expectedNestedStructSchema) 51 | expectedNestedStructInternalRow.setNullAt(0) 52 | expectedNestedStructInternalRow.setNullAt(1) 53 | 54 | //actual 55 | val deserializationExceptionHandler = new PermissiveRecordExceptionHandler() 56 | val schema = AvroSchemaUtils.parse(TestSchemas.NATIVE_SIMPLE_OUTER_SCHEMA) 57 | val dataType: DataType = SchemaConverters.toSqlType(schema).dataType 58 | 59 | val actualResult = deserializationExceptionHandler 60 | .handle(new Exception, new AbrisAvroDeserializer(schema, dataType), schema) 61 | 62 | assert(actualResult == expectedNestedStructInternalRow) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/sql/CatalystDataToAvro.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import org.apache.avro.generic.GenericDatumWriter 20 | import org.apache.avro.io.{BinaryEncoder, EncoderFactory} 21 | import org.apache.spark.sql.avro.AbrisAvroSerializer 22 | import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} 23 | import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} 24 | import org.apache.spark.sql.types.{BinaryType, DataType} 25 | import za.co.absa.abris.avro.read.confluent.ConfluentConstants 26 | import za.co.absa.abris.config.InternalToAvroConfig 27 | 28 | import java.io.ByteArrayOutputStream 29 | import java.nio.ByteBuffer 30 | 31 | private[abris] case class CatalystDataToAvro( 32 | child: Expression, 33 | abrisConfig: Map[String,Any] 34 | ) extends UnaryExpression { 35 | 36 | override def dataType: DataType = BinaryType 37 | 38 | @transient private lazy val config = new InternalToAvroConfig(abrisConfig) 39 | 40 | @transient private lazy val serializer: AbrisAvroSerializer = 41 | new AbrisAvroSerializer(child.dataType, config.schema, child.nullable) 42 | 43 | @transient private lazy val writer = 44 | new GenericDatumWriter[Any](config.schema) 45 | 46 | @transient private var encoder: BinaryEncoder = _ 47 | 48 | @transient private lazy val out = new ByteArrayOutputStream 49 | 50 | override def nullSafeEval(input: Any): Any = { 51 | out.reset() 52 | 53 | config.schemaId.foreach { id => 54 | attachSchemaId(id, out) 55 | } 56 | 57 | encoder = EncoderFactory.get().directBinaryEncoder(out, encoder) 58 | val avroData = serializer.serialize(input) 59 | writer.write(avroData, encoder) 60 | encoder.flush() 61 | out.toByteArray 62 | } 63 | 64 | private def attachSchemaId(id: Int, outStream: ByteArrayOutputStream) = { 65 | outStream.write(ConfluentConstants.MAGIC_BYTE) 66 | outStream.write(ByteBuffer.allocate(ConfluentConstants.SCHEMA_ID_SIZE_BYTES).putInt(id).array()) 67 | } 68 | 69 | override def prettyName: String = "to_avro" 70 | 71 | override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { 72 | val expr = ctx.addReferenceObj("this", this) 73 | defineCodeGen(ctx, ev, input => 74 | s"(byte[]) $expr.nullSafeEval($input)") 75 | } 76 | 77 | override protected def withNewChildInternal(newChild: Expression): Expression = 78 | copy(child = newChild) 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/errors/SpecificRecordExceptionHandlerSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.errors 18 | 19 | import all_types.test.{NativeSimpleOuter, Nested} 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.avro.{AbrisAvroDeserializer, SchemaConverters} 22 | import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow 23 | import org.apache.spark.sql.types.{DataType, StructType} 24 | import org.apache.spark.unsafe.types.UTF8String 25 | import org.scalatest.flatspec.AnyFlatSpec 26 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 27 | import za.co.absa.abris.examples.data.generation.TestSchemas 28 | 29 | class SpecificRecordExceptionHandlerSpec extends AnyFlatSpec { 30 | 31 | private val spark = SparkSession 32 | .builder() 33 | .appName("unitTest") 34 | .master("local[1]") 35 | .config("spark.driver.bindAddress", "localhost") 36 | .config("spark.ui.enabled", "false") 37 | .getOrCreate() 38 | 39 | it should "receive empty dataframe row back" in { 40 | // provided 41 | val providedDefaultRecord = NativeSimpleOuter.newBuilder() 42 | .setName("name") 43 | .setNested(Nested.newBuilder() 44 | .setInt$(1) 45 | .setLong$(1) 46 | .build()) 47 | .build() 48 | 49 | // expected 50 | val expectedNestedFieldSchema = new StructType() 51 | .add("int", "int") 52 | .add("long", "long") 53 | val expectedNestedStructSchema = new StructType() 54 | .add("name", "string") 55 | .add("nested", expectedNestedFieldSchema) 56 | 57 | val expectedNestedFieldInternalRow = new SpecificInternalRow(expectedNestedFieldSchema) 58 | expectedNestedFieldInternalRow.setInt(0, 1) 59 | expectedNestedFieldInternalRow.setLong(1, 1L) 60 | 61 | val expectedNestedStructInternalRow = new SpecificInternalRow(expectedNestedStructSchema) 62 | expectedNestedStructInternalRow.update(0, UTF8String.fromString("name")) 63 | expectedNestedStructInternalRow.update(1, expectedNestedFieldInternalRow) 64 | 65 | //actual 66 | val deserializationExceptionHandler = new SpecificRecordExceptionHandler(providedDefaultRecord) 67 | val schema = AvroSchemaUtils.parse(TestSchemas.NATIVE_SIMPLE_OUTER_SCHEMA) 68 | val dataType: DataType = SchemaConverters.toSqlType(schema).dataType 69 | 70 | val actualResult = deserializationExceptionHandler 71 | .handle(new Exception, new AbrisAvroDeserializer(schema, dataType), schema) 72 | 73 | assert(actualResult == expectedNestedStructInternalRow) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/data/generation/ComplexRecordsGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.examples.data.generation 18 | 19 | import org.apache.spark.sql.Row 20 | import za.co.absa.commons.annotation.DeveloperApi 21 | 22 | import java.nio.ByteBuffer 23 | import scala.util.Random 24 | import scala.collection.JavaConverters._ 25 | 26 | /** 27 | * This class provides methods to generate example/test data. 28 | * Not part of the library core. 29 | */ 30 | // scalastyle:off magic.number 31 | @DeveloperApi 32 | object ComplexRecordsGenerator { 33 | 34 | private val random = new Random() 35 | 36 | val usedAvroSchema: String = TestSchemas.NATIVE_COMPLETE_SCHEMA 37 | 38 | def generateUnparsedRows(howMany: Int): List[Row] = { 39 | val result = new Array[Row](howMany) 40 | for (i <- 0 until howMany) { 41 | result(i) = Row.fromSeq(getDataSeq()) 42 | } 43 | result.toList 44 | } 45 | 46 | private def getDataSeq(): Seq[Any] = { 47 | val map = Map[String, Seq[Long]]( 48 | "entry1" -> randomSeqOfLongs(20), 49 | "entry2" -> randomSeqOfLongs(30)) 50 | Seq( 51 | ByteBuffer.wrap(randomString(20).getBytes).array(), 52 | randomString(30), 53 | new java.lang.Integer(random.nextInt()), 54 | new java.lang.Long(random.nextLong()), 55 | new java.lang.Double(random.nextDouble()), 56 | new java.lang.Float(random.nextFloat()), 57 | new java.lang.Boolean(random.nextBoolean()), 58 | randomSeqOfStrings(10, 15), 59 | map, 60 | new FixedString(randomString(40)).bytes()) 61 | } 62 | 63 | private def randomListOfLongs(listSize: Int) = { 64 | val array = new Array[Long](listSize) 65 | for (i <- 0 until listSize) { 66 | array(i) = random.nextLong() 67 | } 68 | new java.util.ArrayList(array.toList.asJava) 69 | } 70 | 71 | private def randomSeqOfLongs(listSize: Int) = { 72 | randomListOfLongs(listSize).asScala.toSeq 73 | } 74 | 75 | private def randomListOfStrings(listSize: Int, stringLength: Int) = { 76 | val array = new Array[String](listSize) 77 | for (i <- 0 until listSize) { 78 | array(i) = randomString(stringLength) 79 | } 80 | new java.util.ArrayList(array.toList.asJava) 81 | } 82 | 83 | private def randomSeqOfStrings(listSize: Int, stringLength: Int) = { 84 | randomListOfStrings(listSize, stringLength).asScala 85 | } 86 | 87 | private def randomString(length: Int): String = { 88 | val randomStream = Random.alphanumeric 89 | randomStream.take(length).mkString 90 | } 91 | } 92 | // scalastyle:on magic.number 93 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/ConfluentKafkaAvroWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.examples 18 | 19 | import org.apache.spark.sql.functions.{col, struct} 20 | import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession} 21 | import za.co.absa.abris.avro.format.SparkAvroConversions 22 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 23 | import za.co.absa.abris.config.AbrisConfig 24 | import za.co.absa.abris.examples.data.generation.ComplexRecordsGenerator 25 | import za.co.absa.abris.examples.utils.CompatibleRowEncoder 26 | 27 | 28 | object ConfluentKafkaAvroWriter { 29 | 30 | val kafkaTopicName = "test_topic" 31 | 32 | val dummyDataRows = 5 33 | val dummyDataPartitions = 1 34 | 35 | def main(args: Array[String]): Unit = { 36 | 37 | val spark = SparkSession 38 | .builder() 39 | .appName("WriterJob") 40 | .master("local[2]") 41 | .getOrCreate() 42 | 43 | spark.sparkContext.setLogLevel("INFO") 44 | 45 | val dataFrame = generateRandomDataFrame(spark) 46 | 47 | dataFrame.show(false) 48 | 49 | val schemaString = ComplexRecordsGenerator.usedAvroSchema 50 | 51 | // to serialize all columns of the DataFrame we need to put them in a Spark struct 52 | val allColumns = struct(dataFrame.columns.map(col).toIndexedSeq: _*) 53 | 54 | val abrisConfig = AbrisConfig 55 | .toConfluentAvro 56 | .provideAndRegisterSchema(schemaString) 57 | .usingTopicNameStrategy(kafkaTopicName) 58 | .usingSchemaRegistry("http://localhost:8081") 59 | 60 | import za.co.absa.abris.avro.functions.to_avro 61 | 62 | val avroFrame = dataFrame.select(to_avro(allColumns, abrisConfig) as "value") 63 | 64 | avroFrame 65 | .write 66 | .format("kafka") 67 | .option("kafka.bootstrap.servers", "localhost:9092") 68 | .option("topic", kafkaTopicName) 69 | .save() 70 | } 71 | 72 | private def generateRandomDataFrame(spark: SparkSession): DataFrame = { 73 | import spark.implicits._ 74 | 75 | implicit val encoder: Encoder[Row] = getEncoder 76 | 77 | val rows = createRows(dummyDataRows) 78 | spark.sparkContext.parallelize(rows, dummyDataPartitions).toDF() 79 | } 80 | 81 | private def createRows(howMany: Int): List[Row] = { 82 | ComplexRecordsGenerator.generateUnparsedRows(howMany) 83 | } 84 | 85 | private def getEncoder: Encoder[Row] = { 86 | val avroSchema = AvroSchemaUtils.parse(ComplexRecordsGenerator.usedAvroSchema) 87 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema) 88 | CompatibleRowEncoder.apply(sparkSchema) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/avro/AbrisAvroDeserializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package org.apache.spark.sql.avro 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.spark.sql.types.DataType 21 | import za.co.absa.commons.annotation.DeveloperApi 22 | 23 | 24 | 25 | 26 | /** 27 | * Compatibility layer handling the different constructor versions of AvroDeserializer. 28 | * Being in the org.apache.spark.sql.avro package also allows access to its package-private classes. 29 | */ 30 | @DeveloperApi 31 | class AbrisAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) { 32 | 33 | private val deserializer = { 34 | val clazz = classOf[AvroDeserializer] 35 | val schemaClz = classOf[Schema] 36 | val dataTypeClz = classOf[DataType] 37 | val stringClz = classOf[String] 38 | val booleanClz = classOf[Boolean] 39 | 40 | clazz.getConstructors.collectFirst { 41 | case currCtor if currCtor.getParameterTypes sameElements 42 | Array(schemaClz, dataTypeClz) => 43 | // Spark 2.4 44 | currCtor.newInstance(rootAvroType, rootCatalystType) 45 | case currCtor if currCtor.getParameterTypes sameElements 46 | Array(schemaClz, dataTypeClz, stringClz) => 47 | // Spark 3.0 - Spark 3.5.0 (including) 48 | currCtor.newInstance(rootAvroType, rootCatalystType, "LEGACY") 49 | case currCtor if currCtor.getParameterTypes sameElements 50 | Array(schemaClz, dataTypeClz, stringClz, booleanClz) => 51 | // Spark 3.5.1 - 3.5.2 52 | currCtor.newInstance(rootAvroType, rootCatalystType, "LEGACY", false: java.lang.Boolean) 53 | case currCtor if currCtor.getParameterTypes.toSeq sameElements 54 | Array(schemaClz, dataTypeClz, stringClz, booleanClz, stringClz) => 55 | // Spark 4.0.0-SNAPSHOT+ 56 | currCtor.newInstance(rootAvroType, rootCatalystType, "LEGACY", false: java.lang.Boolean, "") 57 | } match { 58 | case Some(value: AvroDeserializer) => 59 | value 60 | case _ => 61 | throw new NoSuchMethodException( 62 | s"""Supported constructors for AvroDeserializer are: 63 | |${clazz.getConstructors.toSeq.mkString(System.lineSeparator())}""".stripMargin) 64 | } 65 | 66 | } 67 | 68 | private val ru = scala.reflect.runtime.universe 69 | private val rm = ru.runtimeMirror(getClass.getClassLoader) 70 | private val classSymbol = rm.classSymbol(deserializer.getClass) 71 | private val deserializeMethodSymbol = classSymbol.info.decl(ru.TermName("deserialize")).asMethod 72 | private val deserializeMethod = rm.reflect(deserializer).reflectMethod(deserializeMethodSymbol) 73 | 74 | def deserialize(data: Any): Any = { 75 | deserializeMethod(data) match { 76 | case Some(x) => x // Spark 3.1 + 77 | case x => x // Spark 3.0 - 78 | } 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/read/confluent/SchemaManagerFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | 18 | package za.co.absa.abris.avro.read.confluent 19 | 20 | import org.apache.spark.internal.Logging 21 | import za.co.absa.abris.avro.registry.{AbrisRegistryClient, ConfluentMockRegistryClient, ConfluentRegistryClient} 22 | import za.co.absa.abris.config.AbrisConfig 23 | import za.co.absa.commons.annotation.DeveloperApi 24 | 25 | import scala.collection.concurrent 26 | import scala.util.Try 27 | import scala.util.control.NonFatal 28 | 29 | /** 30 | * This thread-safe factory creates [[SchemaManager]]s and manages the underlying SchemaRegistryClient instances: 31 | * client references are cached per configuration, so repeated calls with the same configuration reuse a single 32 | * client (and its internal schema cache) instead of creating a new instance on every call. 33 | * The factory also allows using a custom registry client via the abris.registryClient.class property. 34 | */ 35 | object SchemaManagerFactory extends Logging { 36 | 37 | private val clientInstances: concurrent.Map[Map[String,String], AbrisRegistryClient] = concurrent.TrieMap() 38 | 39 | @DeveloperApi 40 | def addSRClientInstance(configs: Map[String, String], client: AbrisRegistryClient): Unit = { 41 | clientInstances.put(configs, client) 42 | } 43 | 44 | @DeveloperApi 45 | def resetSRClientInstance(): Unit = { 46 | clientInstances.clear() 47 | } 48 | 49 | def create(configs: Map[String,String]): SchemaManager = new SchemaManager(getOrCreateRegistryClient(configs)) 50 | 51 | private def getOrCreateRegistryClient(configs: Map[String,String]): AbrisRegistryClient = { 52 | clientInstances.getOrElseUpdate(configs, { 53 | if (configs.contains(AbrisConfig.REGISTRY_CLIENT_CLASS)) { 54 | try { 55 | val clazz = Class.forName(configs(AbrisConfig.REGISTRY_CLIENT_CLASS)) 56 | logInfo(msg = s"Configuring new Schema Registry client of type '${clazz.getCanonicalName}'") 57 | Try(clazz.getConstructor(classOf[Map[String, String]]).newInstance(configs)) 58 | .recover { case _: NoSuchMethodException => 59 | clazz.getConstructor().newInstance() 60 | } 61 | .get 62 | .asInstanceOf[AbrisRegistryClient] 63 | } catch { 64 | case e if NonFatal(e) => 65 | throw new IllegalArgumentException("Custom registry client must implement AbrisRegistryClient " + 66 | "and have parameterless or Map[String, String] accepting constructor", e) 67 | } 68 | } else if (configs(AbrisConfig.SCHEMA_REGISTRY_URL).startsWith("mock://")) { 69 | logInfo(msg = s"Configuring new Schema Registry client of type ConfluentMockRegistryClient") 70 | new ConfluentMockRegistryClient() 71 | } else { 72 | logInfo(msg = s"Configuring new Schema Registry client of type ConfluentRegistryClient") 73 | new ConfluentRegistryClient(configs) 74 | } 75 | }) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /documentation/python-documentation.md: -------------------------------------------------------------------------------- 1 | # Using ABRiS with Python and PySpark 2 | Abris is a Scala library, but with a bit of effort it can be used in Python as well. 4 | We provide some examples, but most of the documentation is written for Scala, so if you need more, check the Scala examples and just convert the code to Python. 6 | PySpark uses [Py4J](https://www.py4j.org/) as the interface between Scala and Python, so you can check its documentation to get a better idea of how to transform the code, 7 | but mostly it should be clear from the following examples.
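The translation is mostly mechanical: each step of a Scala builder chain becomes an explicit method call on the corresponding object reached through the JVM gateway, and default arguments (such as `isKey`) have to be spelled out. A sketch of the pattern, with placeholder topic and registry URL:

```python
from pyspark import SparkContext

# Scala: AbrisConfig.fromConfluentAvro.downloadReaderSchemaByLatestVersion
#          .andTopicNameStrategy("topic_name").usingSchemaRegistry("http://localhost:8081")
jvm_gateway = SparkContext._active_spark_context._gateway.jvm
config = (jvm_gateway.za.co.absa.abris.config.AbrisConfig
          .fromConfluentAvro()
          .downloadReaderSchemaByLatestVersion()
          .andTopicNameStrategy("topic_name", False)  # is_key has no default through Py4J, pass it explicitly
          .usingSchemaRegistry("http://localhost:8081"))
```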
8 | 9 | ### Examples 10 | 11 | ```python 12 | from pyspark import SparkContext 13 | from pyspark.sql.column import Column, _to_java_column 14 | 15 | def from_avro(col, config): 16 | """ 17 | Avro deserialize: convert a column of Avro binary data into its Spark SQL representation 18 | 19 | :param col (PySpark column / str): column name "key" or "value" 20 | :param config (za.co.absa.abris.config.FromAvroConfig): abris config, generated from abris_config helper function 21 | :return: PySpark Column 22 | """ 23 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm 24 | abris_avro = jvm_gateway.za.co.absa.abris.avro 25 | 26 | return Column(abris_avro.functions.from_avro(_to_java_column(col), config)) 27 | 28 | def from_avro_abris_config(config_map, topic, is_key): 29 | """ 30 | Create a FromAvroConfig that downloads the latest reader schema from Schema Registry (topic name strategy) 31 | 32 | :param config_map (dict[str, str]): configuration map to pass to deserializer, ex: {'schema.registry.url': 'http://localhost:8081'} 33 | :param topic (str): kafka topic 34 | :param is_key (bool): True for the key schema, False for the value schema 35 | :return: za.co.absa.abris.config.FromAvroConfig 36 | """ 37 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm 38 | scala_map = jvm_gateway.PythonUtils.toScalaMap(config_map) 39 | 40 | return jvm_gateway.za.co.absa.abris.config \ 41 | .AbrisConfig \ 42 | .fromConfluentAvro() \ 43 | .downloadReaderSchemaByLatestVersion() \ 44 | .andTopicNameStrategy(topic, is_key) \ 45 | .usingSchemaRegistry(scala_map) 46 | 47 | def to_avro(col, config): 48 | """ 49 | Avro serialize: convert a Spark SQL column into a column of Avro binary data 50 | :param col (PySpark column / str): column name "key" or "value" 51 | :param config (za.co.absa.abris.config.ToAvroConfig): abris config, generated from abris_config helper function 52 | :return: PySpark Column 53 | """ 54 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm 55 | abris_avro = jvm_gateway.za.co.absa.abris.avro 56 | 57 | return Column(abris_avro.functions.to_avro(_to_java_column(col), config)) 58 | 59 | def to_avro_abris_config(config_map, topic, is_key): 60 | """ 61 | Create a ToAvroConfig that downloads the latest schema from Schema Registry (topic name strategy) 62 | 63 | :param config_map (dict[str, str]): configuration map to pass to the serializer, ex: {'schema.registry.url': 'http://localhost:8081'} 64 | :param topic (str): kafka topic 65 | :param is_key (bool): True for the key schema, False for the value schema 66 | :return: za.co.absa.abris.config.ToAvroConfig 67 | """ 68 | jvm_gateway = SparkContext._active_spark_context._gateway.jvm 69 | scala_map = jvm_gateway.PythonUtils.toScalaMap(config_map) 70 | 71 | return jvm_gateway.za.co.absa.abris.config \ 72 | .AbrisConfig \ 73 | .toConfluentAvro() \ 74 | .downloadSchemaByLatestVersion() \ 75 | .andTopicNameStrategy(topic, is_key) \ 76 | .usingSchemaRegistry(scala_map) 77 | ``` 78 | 79 | Complete example of loading from Kafka: 80 | 81 | ```python 82 | df = spark.read.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "topic_name").load() 83 | 84 | from_avro_abris_settings = from_avro_abris_config({'schema.registry.url': 'http://localhost:8081'}, 'topic_name', False) 85 | df2 = df.withColumn("parsed", from_avro("value", from_avro_abris_settings)) 86 | df2.show() 87 | ```
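For the writing direction, a minimal sketch reusing the helpers above (it assumes a schema is already registered for the subject `topic_name-value`, for example by a previous writer):

```python
to_avro_abris_settings = to_avro_abris_config({'schema.registry.url': 'http://localhost:8081'}, 'topic_name', False)
df3 = df2.select(to_avro("parsed", to_avro_abris_settings).alias("value"))
df3.write.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("topic", "topic_name").save()
```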
88 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/parsing/utils/AvroSchemaUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.parsing.utils 18 | 19 | import org.apache.avro.{Schema, SchemaBuilder} 20 | import org.apache.commons.io.IOUtils 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.fs.{FileSystem, Path} 23 | import org.apache.spark.sql.DataFrame 24 | import org.apache.spark.sql.avro.SchemaConverters 25 | import org.apache.spark.sql.functions.struct 26 | 27 | import java.nio.charset.Charset 28 | import scala.collection.JavaConverters._ 29 | 30 | /** 31 | * This class provides utility methods to work with Avro schemas. 32 | */ 33 | object AvroSchemaUtils { 34 | 35 | /** 36 | * Parses a plain Avro schema into an org.apache.avro.Schema implementation. 37 | */ 38 | def parse(schema: String): Schema = new Schema.Parser().parse(schema) 39 | 40 | /** 41 | * Loads an org.apache.avro.Schema from the given path. 42 | */ 43 | def load(path: String): Schema = { 44 | parse(loadPlain(path)) 45 | } 46 | 47 | /** 48 | * Loads a plain Avro schema from the given path. 49 | */ 50 | def loadPlain(path: String): String = { 51 | if (path == null) { 52 | throw new IllegalArgumentException("Null path provided. " + 53 | "Please make sure you provide a valid path to an existing Avro schema located in some file system.") 54 | } 55 | val hdfs = FileSystem.get(new Configuration()) 56 | val stream = hdfs.open(new Path(path)) 57 | try IOUtils.readLines(stream, Charset.defaultCharset).asScala.mkString("\n") finally stream.close() 58 | } 59 | 60 | def toAvroSchema( 61 | dataFrame: DataFrame, 62 | columnName: String, 63 | recordName: String = "topLevelRecord", 64 | nameSpace: String = "" 65 | ): Schema = { 66 | val fieldIndex = dataFrame.schema.fieldIndex(columnName) 67 | val field = dataFrame.schema.fields(fieldIndex) 68 | 69 | SchemaConverters.toAvroType(field.dataType, field.nullable, recordName, nameSpace) 70 | } 71 | 72 | def toAvroSchema( 73 | dataFrame: DataFrame, 74 | columnNames: Seq[String] 75 | ): Schema = toAvroSchema(dataFrame, columnNames, "topLevelRecord", "") 76 | 77 | def toAvroSchema( 78 | dataFrame: DataFrame, 79 | columnNames: Seq[String], 80 | recordName: String, 81 | nameSpace: String 82 | ): Schema = { 83 | val allColumns = struct(columnNames.map(dataFrame.col): _*) 84 | val expression = allColumns.expr 85 | 86 | SchemaConverters.toAvroType(expression.dataType, expression.nullable, recordName, nameSpace) 87 | } 88 | 89 | def toAvroSchema( 90 | dataFrame: DataFrame 91 | ): Schema = toAvroSchema(dataFrame, "topLevelRecord", "") 92 | 93 | def toAvroSchema( 94 | dataFrame: DataFrame, 95 | recordName: String, 96 | nameSpace: String 97 | ): Schema = 98 | toAvroSchema(dataFrame, dataFrame.columns.toIndexedSeq, recordName, nameSpace) 99 | 100 | def wrapSchema(schema: Schema, name: String, namespace: String): Schema = { 101 | SchemaBuilder.record(name) 102 | .namespace(namespace) 103 | .fields().name(schema.getName).`type`(schema).noDefault() 104 | .endRecord() 105 | } 106 | 107 | } 108 | --------------------------------------------------------------------------------
/src/test/scala/za/co/absa/abris/avro/parsing/utils/AvroSchemaUtilsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.parsing.utils 18 | 19 | import org.apache.avro.Schema.Type 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.functions.{col, struct} 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | 25 | 26 | class AvroSchemaUtilsSpec extends AnyFlatSpec with Matchers { 27 | 28 | private val spark = SparkSession 29 | .builder() 30 | .appName("unitTest") 31 | .master("local[2]") 32 | .config("spark.driver.bindAddress", "localhost") 33 | .config("spark.ui.enabled", "false") 34 | .getOrCreate() 35 | 36 | import spark.implicits._ 37 | 38 | 39 | val dataFrame = Seq((1, "bat", true), (2, "mouse", false)).toDF("number", "word", "bool") 40 | 41 | 42 | behavior of "AvroSchemaUtils" 43 | 44 | it should "convert the schema of whole dataframe" in { 45 | 46 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame) 47 | 48 | schema.getType shouldBe Type.RECORD 49 | schema.getFullName shouldBe "topLevelRecord" 50 | schema.getFields.get(0).schema().getType shouldBe Type.INT 51 | schema.getFields.get(1).schema().getType shouldBe Type.UNION 52 | schema.getFields.get(2).schema().getType shouldBe Type.BOOLEAN 53 | 54 | 55 | val schema2 = AvroSchemaUtils.toAvroSchema(dataFrame, "foo", "bar") 56 | schema2.getType shouldBe Type.RECORD 57 | schema2.getFullName shouldBe "bar.foo" 58 | } 59 | 60 | it should "convert the schema of multiple selected columns" in { 61 | 62 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame, Seq("bool","number")) 63 | 64 | schema.getType shouldBe Type.RECORD 65 | schema.getFullName shouldBe "topLevelRecord" 66 | schema.getFields.size() shouldBe 2 67 | schema.getFields.get(0).schema().getType shouldBe Type.BOOLEAN 68 | schema.getFields.get(1).schema().getType shouldBe Type.INT 69 | 70 | 71 | val schema2 = AvroSchemaUtils.toAvroSchema(dataFrame, Seq("bool","number"), "foo", "bar") 72 | schema2.getType shouldBe Type.RECORD 73 | schema2.getFullName shouldBe "bar.foo" 74 | } 75 | 76 | it should "convert the schema of one selected simple column" in { 77 | 78 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame, "bool") 79 | 80 | schema.getType shouldBe Type.BOOLEAN 81 | schema.getFullName shouldBe "boolean" 82 | 83 | val schema2 = AvroSchemaUtils.toAvroSchema(dataFrame, "bool", "foo", "bar") 84 | schema2.getType shouldBe Type.BOOLEAN 85 | schema2.getFullName shouldBe "boolean" 86 | } 87 | 88 | it should "convert the schema of one selected struct column" in { 89 | 90 | val structDataFrame = dataFrame.select(struct(col("bool"), col("number")) as "str") 91 | 92 | val schema = AvroSchemaUtils.toAvroSchema(structDataFrame, "str") 93 | 94 | schema.getType shouldBe Type.RECORD 95 | schema.getFullName 
shouldBe "topLevelRecord" 96 | schema.getFields.size() shouldBe 2 97 | schema.getFields.get(0).schema().getType shouldBe Type.BOOLEAN 98 | schema.getFields.get(1).schema().getType shouldBe Type.INT 99 | 100 | val schema2 = AvroSchemaUtils.toAvroSchema(structDataFrame, "str", "foo", "bar") 101 | 102 | schema2.getType shouldBe Type.RECORD 103 | schema2.getFullName shouldBe "bar.foo" 104 | schema2.getFields.size() shouldBe 2 105 | schema2.getFields.get(0).schema().getType shouldBe Type.BOOLEAN 106 | schema2.getFields.get(1).schema().getType shouldBe Type.INT 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /documentation/vanilla-avro-documentation.md: -------------------------------------------------------------------------------- 1 | # ABRiS - vanilla Avro documentation 2 | 3 | - [Avro to Spark](#Avro-to-Spark) 4 | - [Spark to Avro](#spark-to-avro) 5 | ## Avro to Spark 6 | 7 | ### Providing Avro schema 8 | ```scala 9 | import za.co.absa.abris.avro.functions.from_avro 10 | 11 | def readAvro(dataFrame: DataFrame, schemaString: String): DataFrame = { 12 | 13 | dataFrame.select(from_avro(col("value"), schemaString) as 'data).select("data.*") 14 | } 15 | ``` 16 | In this example the Avro binary data are in ```dataFrame``` inside column the **value**. 17 | The Avro schema is provided as a string ```schemaString```. 18 | 19 | After the Avro data are converted to Spark SQL representation they are stored in column the **data**. 20 | This column is immediately flattened in the next select so the result will be a ```DataFrame``` containing only the deserialized avro data. 21 | 22 | ### Using Schema Registry 23 | First we need to provide the Schema Registry configuration: 24 | ```scala 25 | val fromAvroConfig1: FromAvroConfig = AbrisConfig 26 | .fromSimpleAvro 27 | .downloadSchemaById(66) 28 | .usingSchemaRegistry("http://registry-url") 29 | 30 | // or 31 | val fromAvroConfig2: FromAvroConfig = AbrisConfig 32 | .fromSimpleAvro 33 | .downloadSchemaByLatestVersion 34 | .andTopicRecordNameStrategy("topic", "recordName", "namespace") 35 | .usingSchemaRegistry("http://registry-url") 36 | 37 | // or 38 | val fromAvroConfig3: FromAvroConfig = AbrisConfig 39 | .fromSimpleAvro 40 | .downloadSchemaByVersion(3) 41 | .andTopicNameStrategy("topicFoo", isKey=true) // Use isKey=true for the key schema and isKey=false for the value schema 42 | .usingSchemaRegistry("http://registry-url") 43 | 44 | // ... 45 | ``` 46 | There are several ways how to configure this. 47 | Each step in configurator will offer you some options, and you just have to choose what you want to do. 48 | At the end you should get an instance of `FromAvroConfig` that you can use like this: 49 | 50 | 51 | ```scala 52 | import za.co.absa.abris.avro.functions.from_avro 53 | 54 | def readAvro(dataFrame: DataFrame, fromAvroConfig: FromAvroConfig): DataFrame = { 55 | 56 | dataFrame.select(from_avro(col("value"), fromAvroConfig) as 'data).select("data.*") 57 | } 58 | ``` 59 | 60 | ## Spark to Avro 61 | 62 | ### Providing Avro schema 63 | ```scala 64 | import za.co.absa.abris.avro.functions.to_avro 65 | 66 | def writeAvro(dataFrame: DataFrame, schemaString: String): DataFrame = { 67 | 68 | val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*) 69 | dataFrame.select(to_avro(allColumns, schemaString) as 'value) 70 | } 71 | ``` 72 | If you provide the Avro schema as a second argument, ABRiS will use it to convert Spark data into Avro. 
73 | Please make sure that the data types in the Spark DataFrame and in the schema are compatible. 74 | 75 | ### Using Schema Registry 76 | If you want to use Schema Registry, you need to provide a configuration: 77 | 78 | ```scala 79 | val toAvroConfig1: ToAvroConfig = AbrisConfig 80 | .toSimpleAvro 81 | .provideAndRegisterSchema(schemaString) 82 | .usingTopicRecordNameStrategy("fooTopic") // record name is taken from the schema 83 | .usingSchemaRegistry("http://registry-url") 84 | 85 | // or 86 | val toAvroConfig2: ToAvroConfig = AbrisConfig 87 | .toSimpleAvro 88 | .downloadSchemaByVersion(4) 89 | .andTopicNameStrategy("fooTopic") 90 | .usingSchemaRegistry("http://registry-url") 91 | 92 | // or 93 | val toAvroConfig3: ToAvroConfig = AbrisConfig 94 | .toSimpleAvro 95 | .downloadSchemaById(66) 96 | .usingSchemaRegistry("http://registry-url") 97 | 98 | // ... 99 | ``` 100 | There are several ways to configure this. 101 | Each step in the configurator offers a few options; just choose what you want to do. 102 | At the end you should get an instance of `ToAvroConfig` that you can use like this: 103 | ```scala 104 | import za.co.absa.abris.avro.functions.to_avro 105 | 106 | def writeAvro(dataFrame: DataFrame, toAvroConfig: ToAvroConfig): DataFrame = { 107 | 108 | val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*) 109 | dataFrame.select(to_avro(allColumns, toAvroConfig) as 'value) 110 | } 111 | ``` 112 | 113 | ### Generate schema from data and register 114 | 115 | See [here](confluent-avro-documentation.md#generate-schema-from-data-and-register) 116 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/read/confluent/SchemaManagerFactorySpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.abris.avro.read.confluent 18 | 19 | import org.scalatest.BeforeAndAfterEach 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import za.co.absa.abris.avro.registry.{AbrisRegistryClient, ConfluentMockRegistryClient, ConfluentRegistryClient, TestRegistryClient} 22 | import za.co.absa.abris.config.AbrisConfig 23 | 24 | import scala.reflect.runtime.{universe => ru} 25 | 26 | class SchemaManagerFactorySpec extends AnyFlatSpec with BeforeAndAfterEach { 27 | 28 | private val schemaRegistryConfig1 = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://dummy") 29 | 30 | private val schemaRegistryConfig2 = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://dummy_sr_2") 31 | 32 | private val schemaRegistryConfig3 = Map( 33 | AbrisConfig.SCHEMA_REGISTRY_URL -> "http://dummy_sr_2", 34 | AbrisConfig.REGISTRY_CLIENT_CLASS -> "za.co.absa.abris.avro.registry.TestRegistryClient" 35 | ) 36 | 37 | override def beforeEach(): Unit = { 38 | super.beforeEach() 39 | SchemaManagerFactory.resetSRClientInstance() // Reset factory state 40 | } 41 | 42 | behavior of "SchemaManagerFactory" 43 | 44 | it should "create a schema manager for the given Schema Registry configs " + 45 | "and cache the Schema Registry Client reference for subsequent usages" in { 46 | val schemaManagerRef1 = SchemaManagerFactory.create(schemaRegistryConfig1) 47 | val schemaManagerRef2 = SchemaManagerFactory.create(schemaRegistryConfig1) 48 | 49 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader) 50 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm 51 | 52 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 53 | val res2 = m.reflect(schemaManagerRef2).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 54 | assert(res1.eq(res2)) 55 | } 56 | 57 | it should "create a schema manager with a different schema registry client depending on the configs passed" in { 58 | val schemaManagerRef1 = SchemaManagerFactory.create(schemaRegistryConfig1) 59 | val schemaManagerRef2 = SchemaManagerFactory.create(schemaRegistryConfig2) 60 | 61 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader) 62 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm 63 | 64 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 65 | val res2 = m.reflect(schemaManagerRef2).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 66 | assert(!res1.eq(res2)) 67 | } 68 | 69 | it should "create a schema manager with a custom schema registry client depending on the configs passed" in { 70 | val schemaManagerRef1 = SchemaManagerFactory.create(schemaRegistryConfig1) 71 | val schemaManagerRef3 = SchemaManagerFactory.create(schemaRegistryConfig3) 72 | 73 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader) 74 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm 75 | 76 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 77 | val res3 = m.reflect(schemaManagerRef3).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 78 | assert(!res1.eq(res3)) 79 | assert(res1.isInstanceOf[ConfluentRegistryClient]) 80 | assert(res3.isInstanceOf[TestRegistryClient]) 81 | } 82 | 83 | it should "create mock client when url starts with mock://" in { 84 | val config = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "mock://dummy_sr_2") 85 | 86 | val 
schemaManagerRef1 = SchemaManagerFactory.create(config) 87 | 88 | val m = ru.runtimeMirror(schemaManagerRef1.getClass.getClassLoader) 89 | val fieldTerm = ru.typeOf[SchemaManager].decl(ru.TermName("schemaRegistryClient")).asTerm 90 | 91 | val res1 = m.reflect(schemaManagerRef1).reflectField(fieldTerm).get.asInstanceOf[AbrisRegistryClient] 92 | assert(res1.isInstanceOf[ConfluentMockRegistryClient]) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/format/SparkAvroConversionsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.format 18 | 19 | import org.apache.avro.Schema.Type 20 | import org.apache.avro.SchemaBuilder 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 25 | import za.co.absa.abris.examples.data.generation.TestSchemas 26 | 27 | import scala.collection.JavaConverters._ 28 | 29 | class SparkAvroConversionsSpec extends AnyFlatSpec with Matchers { 30 | 31 | // scalastyle:off magic.number 32 | 33 | private val structType = StructType( 34 | Seq( 35 | StructField("int1", IntegerType, false), 36 | StructField("long1", LongType, false), 37 | StructField("map1", new MapType(StringType, IntegerType, false), false), 38 | StructField("array1", new ArrayType(LongType, false), false), 39 | StructField("struct2", StructType( 40 | Seq( 41 | StructField("string2", StringType, true), 42 | StructField("string3", StringType, false) 43 | ) 44 | ), false), 45 | StructField("double1", DoubleType, false), 46 | StructField("struct3", StructType( 47 | Seq( 48 | StructField("int2", IntegerType, false), 49 | StructField("float1", FloatType, false) 50 | ) 51 | ), false), 52 | StructField("bytes", BinaryType, false) 53 | ) 54 | ) 55 | 56 | behavior of "SparkAvroConversions" 57 | 58 | it should "convert Avro schemas to SQL types" in { 59 | val schema = AvroSchemaUtils.parse(TestSchemas.COMPLEX_SCHEMA_SPEC) 60 | val sql = SparkAvroConversions.toSqlType(schema) 61 | val schemaFromSql = SparkAvroConversions.toAvroSchema(sql, schema.getName, schema.getNamespace) 62 | 63 | schema.getFields.asScala.foreach(field => 64 | assert(schema.getField(field.name).toString == schemaFromSql.getField(field.name).toString)) 65 | } 66 | 67 | it should "convert SQL types to Avro schemas" in { 68 | val schemaName = "teste_name" 69 | val schemaNamespace = "teste_namespace" 70 | 71 | val schema = SparkAvroConversions.toAvroSchema(structType, schemaName, schemaNamespace) 72 | 73 | assert(schema.getName == schemaName) 74 | assert(schema.getNamespace == schemaNamespace) 75 | assert(schema.getField("int1").schema().getType == Type.INT) 76 | 
assert(schema.getField("long1").schema().getType == Type.LONG) 77 | assert(schema.getField("map1").schema().getType == Type.MAP) 78 | assert(schema.getField("array1").schema().getType == Type.ARRAY) 79 | assert(schema.getField("struct2").schema().getType == Type.RECORD) 80 | assert(schema.getField("double1").schema().getType == Type.DOUBLE) 81 | assert(schema.getField("struct3").schema().getType == Type.RECORD) 82 | assert(schema.getField("bytes").schema().getType == Type.BYTES) 83 | 84 | val map1 = schema.getField("map1").schema() 85 | assert(map1.getValueType.getType == Type.INT) 86 | 87 | val array1 = schema.getField("array1").schema() 88 | assert(array1.getElementType.getType == Type.LONG) 89 | 90 | val struct2 = schema.getField("struct2").schema() 91 | assert(struct2.getField("string2").schema().getType == Type.UNION) // nullable fields are "unioned" with null 92 | assert(struct2.getField("string3").schema().getType == Type.STRING) 93 | 94 | val struct3 = schema.getField("struct3").schema() 95 | assert(struct3.getField("int2").schema().getType == Type.INT) 96 | assert(struct3.getField("float1").schema().getType == Type.FLOAT) 97 | } 98 | 99 | it should "convert fixed and bytes type" in { 100 | 101 | val avroSchema = SchemaBuilder 102 | .record("test_record") 103 | .namespace("test_namespace") 104 | .fields() 105 | .name("fixed_name").`type`().fixed("fixed_name").size(3).noDefault() 106 | .name("bytes_name").`type`().bytesType().noDefault() 107 | .endRecord() 108 | 109 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema) 110 | 111 | sparkSchema.fields(0) shouldBe StructField("fixed_name", BinaryType, false) 112 | sparkSchema.fields(1) shouldBe StructField("bytes_name", BinaryType, false) 113 | } 114 | 115 | // scalastyle:on magic.number 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/utils/ExamplesUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.examples.utils 18 | 19 | import java.io.FileInputStream 20 | import java.util.Properties 21 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter} 22 | import org.apache.spark.sql.{DataFrameWriter, Row, SparkSession} 23 | import org.slf4j.LoggerFactory 24 | import za.co.absa.commons.annotation.DeveloperApi 25 | 26 | import scala.collection.JavaConverters._ 27 | 28 | @DeveloperApi 29 | object ExamplesUtils { 30 | 31 | private val OPTION_PREFIX = "option." 
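// Keys prefixed with "option." in the loaded properties file are forwarded to Spark
// readers/writers by the addOptions helpers below, with the prefix stripped (see getKeys).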
32 | 33 | private val logger = LoggerFactory.getLogger(ExamplesUtils.getClass) 34 | 35 | def checkArgs(args: Array[String]): Unit = { 36 | if (args.length != 1) { 37 | logger.error("No properties file specified.") 38 | System.exit(1) 39 | } 40 | } 41 | 42 | def getSparkSession(properties: Properties, jobNameProp: String, jobMasterProp: String, 43 | logLevelProp: String): SparkSession = { 44 | 45 | val spark = SparkSession 46 | .builder() 47 | .appName(properties.getProperty(jobNameProp)) 48 | .master(properties.getProperty(jobMasterProp)) 49 | .getOrCreate() 50 | 51 | spark.sparkContext.setLogLevel(properties.getProperty(logLevelProp)) 52 | spark 53 | } 54 | 55 | def loadProperties(args: Array[String]): Properties = { 56 | logger.debug("Loading properties from: " + args(0)) 57 | val properties = ExamplesUtils.loadProperties(args(0)) 58 | for (key <- properties.asScala.keysIterator) { 59 | logger.debug(s"\t$key = ${properties.getProperty(key)}") 60 | } 61 | properties 62 | } 63 | 64 | def loadProperties(path: String): Properties = { 65 | val properties = new Properties() 66 | properties.load(new FileInputStream(path)) 67 | properties 68 | } 69 | 70 | private def getKeys(properties: Properties) = { 71 | properties.keySet().asScala 72 | .filter(key => key.toString.startsWith(OPTION_PREFIX)) 73 | .map(key => (key.toString, key.toString.drop(OPTION_PREFIX.length()))) 74 | } 75 | 76 | implicit class ReaderStreamOptions(stream: DataStreamReader) { 77 | def addOptions(properties: Properties): DataStreamReader = { 78 | getKeys(properties) 79 | .foreach(keys => { 80 | logger.debug(s"DataStreamReader: setting option: ${keys._2} = ${properties.getProperty(keys._1)}") 81 | stream.option(keys._2, properties.getProperty(keys._1)) 82 | }) 83 | stream 84 | } 85 | } 86 | 87 | implicit class WriterRowOptions(stream: DataFrameWriter[Row]) { 88 | def addOptions(properties: Properties): DataFrameWriter[Row] = { 89 | getKeys(properties) 90 | .foreach(keys => { 91 | logger.debug(s"DataFrameWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}") 92 | stream.option(keys._2, properties.getProperty(keys._1)) 93 | }) 94 | stream 95 | } 96 | } 97 | 98 | implicit class WriterOptions(stream: DataFrameWriter[Array[Byte]]) { 99 | def addOptions(properties: Properties): DataFrameWriter[Array[Byte]] = { 100 | getKeys(properties) 101 | .foreach(keys => { 102 | logger.debug(s"DataFrameWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}") 103 | stream.option(keys._2, properties.getProperty(keys._1)) 104 | }) 105 | stream 106 | } 107 | } 108 | 109 | implicit class WriterRowStreamOptions(stream: DataStreamWriter[Row]) { 110 | def addOptions(properties: Properties): DataStreamWriter[Row] = { 111 | getKeys(properties) 112 | .foreach(keys => { 113 | logger.debug(s"DataStreamWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}") 114 | stream.option(keys._2, properties.getProperty(keys._1)) 115 | }) 116 | stream 117 | } 118 | } 119 | 120 | implicit class WriterStreamOptions(stream: DataStreamWriter[Array[Byte]]) { 121 | def addOptions(properties: Properties): DataStreamWriter[Array[Byte]] = { 122 | getKeys(properties) 123 | .foreach(keys => { 124 | logger.debug(s"DataStreamWriter: setting option: ${keys._2} = ${properties.getProperty(keys._1)}") 125 | stream.option(keys._2, properties.getProperty(keys._1)) 126 | }) 127 | stream 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- 
/src/test/scala/za/co/absa/abris/avro/read/confluent/SchemaManagerSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.read.confluent 18 | 19 | import org.scalatest.BeforeAndAfter 20 | import org.scalatest.flatspec.AnyFlatSpec 21 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 22 | import za.co.absa.abris.avro.registry.{ConfluentMockRegistryClient, LatestVersion, NumVersion, SchemaSubject} 23 | import za.co.absa.abris.config.AbrisConfig 24 | 25 | class SchemaManagerSpec extends AnyFlatSpec with BeforeAndAfter { 26 | 27 | private val schema = AvroSchemaUtils.parse( 28 | "{\"type\": \"record\", \"name\": \"Blah\", \"fields\": [{ \"name\": \"name\", \"type\": \"string\" }]}") 29 | 30 | 31 | val recordByteSchema = AvroSchemaUtils.parse("""{ 32 | "namespace": "all-types.test", 33 | "type": "record", 34 | "name": "record_name", 35 | "fields":[ 36 | {"name": "int", "type": ["int", "null"] } 37 | ] 38 | }""") 39 | 40 | val recordEvolvedByteSchema1 = AvroSchemaUtils.parse("""{ 41 | "namespace": "all-types.test", 42 | "type": "record", 43 | "name": "record_name", 44 | "fields":[ 45 | {"name": "int", "type": ["int", "null"] }, 46 | {"name": "favorite_color", "type": "string", "default": "green"} 47 | ] 48 | }""") 49 | 50 | val recordEvolvedByteSchema2 = AvroSchemaUtils.parse("""{ 51 | "namespace": "all-types.test", 52 | "type": "record", 53 | "name": "record_name", 54 | "fields":[ 55 | {"name": "int", "type": ["int", "null"] }, 56 | {"name": "favorite_color", "type": "string", "default": "green"}, 57 | {"name": "favorite_badger", "type": "string", "default": "Honey badger"} 58 | ] 59 | }""") 60 | 61 | val registryUrl = "dummyUrl" 62 | val registryConfig = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> registryUrl) 63 | 64 | behavior of "SchemaManager" 65 | 66 | it should "return the correct schema by id or subject and version" in { 67 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient()) 68 | val subject1 = SchemaSubject.usingTopicNameStrategy("foo") 69 | val subject2 = SchemaSubject.usingTopicNameStrategy("bar") 70 | 71 | val id1 = schemaManager.register(subject1, schema) // id1, version 1 72 | val id2 = schemaManager.register(subject2, recordByteSchema) // id2, version 1 73 | val id3 = schemaManager.register(subject2, recordEvolvedByteSchema1) // id3, version 2 74 | val id4 = schemaManager.register(subject2, recordEvolvedByteSchema2) // id4, version 3 75 | 76 | assert(schemaManager.getSchemaById(id1) == schema) 77 | assert(schemaManager.getSchemaById(id2) == recordByteSchema) 78 | assert(schemaManager.getSchemaById(id3) == recordEvolvedByteSchema1) 79 | assert(schemaManager.getSchemaById(id4) == recordEvolvedByteSchema2) 80 | 81 | assert(schemaManager.getSchemaBySubjectAndVersion(subject1, NumVersion(1)) == schema) 82 |
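// subject1 has only one registered version, so its latest version is still version 1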
assert(schemaManager.getSchemaBySubjectAndVersion(subject1, LatestVersion()) == schema) 83 | 84 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, NumVersion(1)) == recordByteSchema) 85 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, NumVersion(2)) == recordEvolvedByteSchema1) 86 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, NumVersion(3)) == recordEvolvedByteSchema2) 87 | assert(schemaManager.getSchemaBySubjectAndVersion(subject2, LatestVersion()) == recordEvolvedByteSchema2) 88 | } 89 | 90 | it should "find an already existing schema" in { 91 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient()) 92 | 93 | val subject = SchemaSubject.usingTopicNameStrategy("dummy_topic") 94 | 95 | schemaManager.register(subject, recordByteSchema) 96 | schemaManager.register(subject, recordEvolvedByteSchema1) 97 | schemaManager.register(subject, recordEvolvedByteSchema2) 98 | 99 | val maybeId = schemaManager.findEquivalentSchema(recordEvolvedByteSchema1, subject) 100 | 101 | val resultSchema = schemaManager.getSchemaById(maybeId.get) 102 | 103 | assert(resultSchema.equals(recordEvolvedByteSchema1)) 104 | } 105 | 106 | "exists" should "return true when schema is in registry" in { 107 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient()) 108 | 109 | val subject = SchemaSubject.usingTopicNameStrategy("dummy_topic") 110 | schemaManager.register(subject, recordByteSchema) 111 | val schemaExists = schemaManager.exists(subject) 112 | 113 | assert(schemaExists) 114 | } 115 | 116 | "exists" should "return false when schema is not in registry" in { 117 | val schemaManager = new SchemaManager(new ConfluentMockRegistryClient()) 118 | val schemaExists = schemaManager.exists(SchemaSubject.usingTopicNameStrategy("foo")) 119 | 120 | assert(!schemaExists) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/read/confluent/SchemaManager.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.read.confluent 18 | 19 | import java.security.InvalidParameterException 20 | 21 | import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException 22 | import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient} 23 | import org.apache.avro.Schema 24 | import org.apache.spark.internal.Logging 25 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 26 | import za.co.absa.abris.avro.registry._ 27 | 28 | import scala.collection.JavaConverters._ 29 | import scala.util.{Failure, Success, Try} 30 | 31 | /** 32 | * This class provides methods to integrate with remote schemas through Schema Registry. 33 | * 34 | * This can be considered an "enriched" facade to the Schema Registry client.
35 | * 36 | * This is NOT THREAD SAFE, which means that multiple threads operating on this object 37 | * (e.g. calling 'configureSchemaRegistry' with different parameters) would operate 38 | * on the same Schema Registry client, thus leading to inconsistent behavior. 39 | */ 40 | class SchemaManager(schemaRegistryClient: AbrisRegistryClient) extends Logging { 41 | 42 | def this(schemaRegistryClient: SchemaRegistryClient) = this(new ConfluentRegistryClient(schemaRegistryClient)) 43 | 44 | def getSchema(coordinate: SchemaCoordinate): Schema = coordinate match { 45 | case IdCoordinate(id) => getSchemaById(id) 46 | case SubjectCoordinate(subject, version) => getSchemaBySubjectAndVersion(subject, version) 47 | } 48 | 49 | def getSchemaById(schemaId: Int): Schema = schemaRegistryClient.getById(schemaId) 50 | 51 | /** 52 | * @param version - NumVersion(n) for a concrete version or LatestVersion() for the latest one 53 | */ 54 | def getSchemaBySubjectAndVersion(subject: SchemaSubject, version: SchemaVersion): Schema = { 55 | val metadata = getSchemaMetadataBySubjectAndVersion(subject, version) 56 | 57 | AvroSchemaUtils.parse(metadata.getSchema) 58 | } 59 | 60 | /** 61 | * @param version - NumVersion(n) for a concrete version or LatestVersion() for the latest one 62 | */ 63 | def getSchemaMetadataBySubjectAndVersion(subject: SchemaSubject, version: SchemaVersion): SchemaMetadata = 64 | version match { 65 | case NumVersion(versionInt) => schemaRegistryClient.getSchemaMetadata(subject.asString, versionInt) 66 | case LatestVersion() => schemaRegistryClient.getLatestSchemaMetadata(subject.asString) 67 | } 68 | 69 | def register(subject: SchemaSubject, schemaString: String): Int = 70 | register(subject, AvroSchemaUtils.parse(schemaString)) 71 | 72 | /** 73 | * Registers a new schema for a subject. 74 | * 75 | * @throws InvalidParameterException when the new schema is not compatible with the already existing one. 76 | * @return registered schema id 77 | */ 78 | def register(subject: SchemaSubject, schema: Schema): Int = { 79 | if (!exists(subject) || isCompatible(schema, subject)) { 80 | logInfo(s"SchemaManager.register: Registering schema for subject: $subject") 81 | schemaRegistryClient.register(subject.asString, schema) 82 | } else { 83 | throw new InvalidParameterException(s"Schema registration failed. Schema for subject:'$subject' " + 84 | s"already exists and it is not compatible with the schema you are trying to register.") 85 | } 86 | } 87 | 88 | /** 89 | * Checks if any schema is already registered under the given subject in Schema Registry. 90 | */ 91 | def exists(subject: SchemaSubject): Boolean = { 92 | Try(schemaRegistryClient.getLatestSchemaMetadata(subject.asString)) match { 93 | case Success(_) => true 94 | case Failure(e: RestClientException) if e.getStatus == 404 => false 95 | case Failure(e) => throw e 96 | } 97 | } 98 | 99 | /** 100 | * Checks if a new schema is compatible with the latest schema registered for a given subject.
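* The check is delegated to the registry client's testCompatibility, so the compatibility level configured for the subject in Schema Registry applies.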
101 | */ 102 | private def isCompatible(newSchema: Schema, subject: SchemaSubject): Boolean = { 103 | schemaRegistryClient.testCompatibility(subject.asString, newSchema) 104 | } 105 | 106 | def getAllSchemasWithMetadata(subject: SchemaSubject): List[SchemaMetadata] = { 107 | val versions = Try(schemaRegistryClient.getAllVersions(subject.asString)) match { 108 | case Success(l) => l.asScala.toList 109 | case Failure(e: RestClientException) if e.getStatus == 404 => List.empty[Integer] 110 | case Failure(e) => throw e 111 | } 112 | 113 | versions.map(schemaRegistryClient.getSchemaMetadata(subject.asString, _)) 114 | } 115 | 116 | def findEquivalentSchema(schema: Schema, subject: SchemaSubject): Option[Int] = { 117 | val maybeIdenticalSchemaMetadata = 118 | getAllSchemasWithMetadata(subject) 119 | .find{ 120 | sm => AvroSchemaUtils.parse(sm.getSchema).equals(schema) 121 | } 122 | 123 | maybeIdenticalSchemaMetadata.map(_.getId) 124 | } 125 | 126 | def getIfExistsOrElseRegisterSchema(schema: Schema, subject: SchemaSubject): Int = { 127 | val maybeSchemaId = findEquivalentSchema(schema, subject) 128 | maybeSchemaId.getOrElse(register(subject, schema)) 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /documentation/confluent-avro-documentation.md: -------------------------------------------------------------------------------- 1 | # ABRiS - Confluent Avro documentation 2 | 3 | - [Avro to Spark](#Avro-to-Spark) 4 | - [Spark to Avro](#spark-to-avro) 5 | 6 | The main difference between Confluent Avro and vanilla Avro is whether the schema id is expected in the Avro payload. 7 | In Confluent Avro there always has to be a schema id at the start of the payload. 8 | 9 | ## Avro to Spark 10 | When converting from Confluent Avro to Spark, there may be two schemas: the *reader schema* and the *writer schema*. 11 | - The writer schema is the one used to convert the data to Avro, and it is the one identified by the id in the Avro payload. 12 | - The reader schema is the one you specify. 13 | 14 | The schemas must be compatible. 15 | 16 | There are several ways to get the reader schema; most of them are shown in the following config examples: 17 | 18 | ```scala 19 | // Provide an avro schema as json 20 | val fromAvroConfig1: FromAvroConfig = AbrisConfig 21 | .fromConfluentAvro 22 | .provideReaderSchema("{ ...schema json...}") 23 | .usingSchemaRegistry("http://registry-url") 24 | 25 | // Specify a schema id 26 | val fromAvroConfig2: FromAvroConfig = AbrisConfig 27 | .fromConfluentAvro 28 | .downloadReaderSchemaById(66) 29 | .usingSchemaRegistry("http://registry-url") 30 | 31 | // Use the schema with the latest version. 32 | val fromAvroConfig3: FromAvroConfig = AbrisConfig 33 | .fromConfluentAvro 34 | .downloadReaderSchemaByLatestVersion 35 | .andTopicNameStrategy("topicName", isKey=true) // Use isKey=true for the key schema and isKey=false for the value schema 36 | .usingSchemaRegistry("http://registry-url") 37 | ``` 38 | Once you have a `FromAvroConfig`, you just need to pass it to the ABRiS function: 39 | ```scala 40 | import za.co.absa.abris.avro.functions.from_avro 41 | 42 | def readAvro(dataFrame: DataFrame, fromAvroConfig: FromAvroConfig): DataFrame = { 43 | 44 | dataFrame.select(from_avro(col("value"), fromAvroConfig) as 'data).select("data.*") 45 | } 46 | ``` 47 | 48 | ## Spark to Avro 49 | When converting data to Avro there is only one schema in play, but you have several options for providing it: 50 | - You can provide it as a string and let ABRiS register the schema for you.
51 | - You can specify a schema that is already in the registry. In that case ABRiS will download it and no registration is necessary. 52 | 53 | When registering the schema, ABRiS will do so only if the same schema is not already registered. 54 | So it is effectively: register if not exists. 55 | 56 | Some configuration examples: 57 | ```scala 58 | // Provide avro schema string with record name strategy 59 | val toAvroConfig1: ToAvroConfig = AbrisConfig 60 | .toConfluentAvro 61 | .provideAndRegisterSchema("{ ...schema json... }") 62 | .usingRecordNameStrategy() // name and namespace taken from schema 63 | .usingSchemaRegistry("http://registry-url") 64 | 65 | // Provide avro schema string with topic name strategy 66 | val toAvroConfig2: ToAvroConfig = AbrisConfig 67 | .toConfluentAvro 68 | .provideAndRegisterSchema("{ ...schema json... }") 69 | .usingTopicNameStrategy("fooTopic") // Assumes value schema by default. Use isKey=true for the key schema 70 | .usingSchemaRegistry("http://registry-url") 71 | 72 | // Use already existing schema by id 73 | val toAvroConfig3: ToAvroConfig = AbrisConfig 74 | .toConfluentAvro 75 | .downloadSchemaById(66) 76 | .usingSchemaRegistry("http://registry-url") 77 | 78 | // Use latest version of already existing schema 79 | val toAvroConfig4: ToAvroConfig = AbrisConfig 80 | .toConfluentAvro 81 | .downloadSchemaByLatestVersion 82 | .andTopicNameStrategy("fooTopic") 83 | .usingSchemaRegistry("http://registry-url") 84 | ``` 85 | Once you have a config you can use it like this: 86 | ```scala 87 | import za.co.absa.abris.avro.functions.to_avro 88 | 89 | def writeAvro(dataFrame: DataFrame, toAvroConfig: ToAvroConfig): DataFrame = { 90 | 91 | val allColumns = struct(dataFrame.columns.head, dataFrame.columns.tail: _*) 92 | dataFrame.select(to_avro(allColumns, toAvroConfig) as 'value) 93 | } 94 | ``` 95 | 96 | ### Generate schema from data and register 97 | Unlike previous versions of ABRiS, the schema is no longer generated automatically for every record during 98 | evaluation; it must be provided in the configuration. 99 | 100 | Given a dataframe, the Avro schema can be generated as shown below. 101 | 102 | ```scala 103 | import za.co.absa.abris.avro.parsing.utils.AvroSchemaUtils 104 | 105 | // generate schema for all columns in a dataframe 106 | AvroSchemaUtils.toAvroSchema(dataFrame) 107 | 108 | // generate schema for one column in a dataframe 109 | AvroSchemaUtils.toAvroSchema(dataFrame, "input") 110 | 111 | // generate schema for multiple columns in a dataframe 112 | AvroSchemaUtils.toAvroSchema(dataFrame, Seq("input", "numbers")) 113 | ``` 114 | All of the above methods also have a variant where you can specify `recordName` and `nameSpace` instead of using the default ones, as shown in the sketch below.
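For example (a sketch based on the signatures exercised in `AvroSchemaUtilsSpec`; the record name and namespace values here are placeholders):

```scala
// generate a schema whose record full name is "com.example.MyRecord"
AvroSchemaUtils.toAvroSchema(dataFrame, "MyRecord", "com.example")

// the same for a selection of columns
AvroSchemaUtils.toAvroSchema(dataFrame, Seq("input", "numbers"), "MyRecord", "com.example")
```
The generated record then carries the full name `com.example.MyRecord` instead of the default `topLevelRecord`.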
115 | 116 | When the schema is generated, it can be registered, and the schema id obtained: 117 | 118 | ```scala 119 | val schemaRegistryClientConfig = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081") 120 | val schemaManager = SchemaManagerFactory.create(schemaRegistryClientConfig) 121 | 122 | // register schema with topic name strategy 123 | def registerSchema1(schema: Schema, schemaManager: SchemaManager): Int = { 124 | val subject = SchemaSubject.usingTopicNameStrategy("topic", isKey=true) // Use isKey=true for the key schema and isKey=false for the value schema 125 | schemaManager.register(subject, schema) 126 | } 127 | 128 | // register schema with record name strategy 129 | def registerSchema2(schema: Schema, schemaManager: SchemaManager): Int = { 130 | val subject = SchemaSubject.usingRecordNameStrategy(schema) 131 | schemaManager.register(subject, schema) 132 | } 133 | 134 | // register schema with topic record name strategy 135 | def registerSchema3(schema: Schema, schemaManager: SchemaManager): Int = { 136 | val subject = SchemaSubject.usingTopicRecordNameStrategy("topic", schema) 137 | schemaManager.register(subject, schema) 138 | } 139 | ``` 140 | 141 | Once you have the schema id, you can pass it to the configuration: 142 | ```scala 143 | def createConfig(schemaId: Int): ToAvroConfig = AbrisConfig 144 | .toConfluentAvro 145 | .downloadSchemaById(schemaId) 146 | .usingSchemaRegistry("http://localhost:8081") 147 | ``` 148 | 149 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/avro/sql/AvroDataToCatalyst.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import org.apache.avro.Schema 20 | import org.apache.avro.generic.GenericDatumReader 21 | import org.apache.avro.io.{BinaryDecoder, DecoderFactory} 22 | import org.apache.kafka.common.errors.SerializationException 23 | import org.apache.spark.sql.avro.AbrisAvroDeserializer 24 | import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenerator, CodegenContext, ExprCode} 25 | import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} 26 | import org.apache.spark.sql.types.{BinaryType, DataType} 27 | import za.co.absa.abris.avro.errors.DeserializationExceptionHandler 28 | import za.co.absa.abris.avro.read.confluent.{ConfluentConstants, SchemaManagerFactory} 29 | import za.co.absa.abris.config.InternalFromAvroConfig 30 | 31 | import java.nio.ByteBuffer 32 | import java.util.ServiceLoader 33 | import scala.collection.mutable 34 | import scala.util.control.NonFatal 35 | import scala.util.{Failure, Success, Try} 36 | 37 | private[abris] case class AvroDataToCatalyst( 38 | child: Expression, 39 | abrisConfig: Map[String, Any], 40 | schemaRegistryConf: Option[Map[String, String]] 41 | ) extends UnaryExpression with ExpectsInputTypes { 42 | 43 | @transient private lazy val schemaConverter = loadSchemaConverter(config.schemaConverter) 44 | 45 | override def inputTypes: Seq[BinaryType.type] = Seq(BinaryType) 46 | 47 | override lazy val dataType: DataType = schemaConverter.toSqlType(readerSchema) 48 | 49 | override def nullable: Boolean = true 50 | 51 | private val confluentCompliant = schemaRegistryConf.isDefined 52 | 53 | @transient private lazy val config = new InternalFromAvroConfig(abrisConfig) 54 | 55 | @transient private lazy val schemaManager = SchemaManagerFactory.create(schemaRegistryConf.get) 56 | 57 | @transient private lazy val readerSchema = config.readerSchema 58 | 59 | @transient private lazy val writerSchemaOption = config.writerSchema 60 | 61 | @transient private lazy val deserializationHandler: DeserializationExceptionHandler = config.deserializationHandler 62 | 63 | @transient private lazy val vanillaReader: GenericDatumReader[Any] = 64 | new GenericDatumReader[Any](writerSchemaOption.getOrElse(readerSchema), readerSchema) 65 | 66 | @transient private lazy val confluentReaderCache: mutable.HashMap[Int, GenericDatumReader[Any]] = 67 | new mutable.HashMap[Int, GenericDatumReader[Any]]() 68 | 69 | @transient private var decoder: BinaryDecoder = _ 70 | 71 | @transient private lazy val deserializer = new AbrisAvroDeserializer(readerSchema, dataType) 72 | 73 | // Reused result object (usually of type IndexedRecord) 74 | @transient private var result: Any = _ 75 | 76 | override def nullSafeEval(input: Any): Any = { 77 | val binary = input.asInstanceOf[Array[Byte]] 78 | try { 79 | val intermediateData = decode(binary) 80 | 81 | deserializer.deserialize(intermediateData) 82 | 83 | } catch { 84 | // There could be multiple possible exceptions here, e.g. java.io.IOException, 85 | // AvroRuntimeException, ArrayIndexOutOfBoundsException, etc. 86 | // To make it simple, catch all the exceptions here. 
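// The configured DeserializationExceptionHandler decides the failure policy, e.g.
// FailFastExceptionHandler fails the Spark task, while SpecificRecordExceptionHandler
// substitutes a caller-provided default record so processing can continue.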
87 | case NonFatal(e) => deserializationHandler.handle(e, deserializer, readerSchema) 88 | } 89 | } 90 | 91 | override def prettyName: String = "from_avro" 92 | 93 | override protected def flatArguments: Iterator[Any] = { 94 | def isMap(x: Any) = x match { 95 | case _: Map[_, _] => true 96 | case _ => false 97 | } 98 | 99 | super.flatArguments.filter { 100 | case Some(x) if isMap(x) => false // don't print schemaRegistryConf 101 | case _ => true 102 | } 103 | } 104 | 105 | override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { 106 | val expr = ctx.addReferenceObj("this", this) 107 | defineCodeGen(ctx, ev, input => 108 | s"(${boxedType(ctx, dataType)})$expr.nullSafeEval($input)") 109 | } 110 | 111 | /** 112 | * The method boxedType(...) is placed in different classes in Spark 2.3 and 2.4 113 | */ 114 | private def boxedType(ctx: CodegenContext, dataType: DataType): String = { 115 | val tryBoxedTypeSpark2_4 = Try { 116 | CodeGenerator 117 | .getClass 118 | .getMethod("boxedType", classOf[DataType]) 119 | .invoke(CodeGenerator, dataType) 120 | } 121 | 122 | val boxedType = tryBoxedTypeSpark2_4.getOrElse { 123 | classOf[CodegenContext] 124 | .getMethod("boxedType", classOf[DataType]) 125 | .invoke(ctx, dataType) 126 | } 127 | 128 | boxedType.asInstanceOf[String] 129 | } 130 | 131 | private def decode(payload: Array[Byte]): Any = if (confluentCompliant) { 132 | decodeConfluentAvro(payload) 133 | } else { 134 | decodeVanillaAvro(payload) 135 | } 136 | 137 | private def decodeConfluentAvro(payload: Array[Byte]): Any = { 138 | 139 | val buffer = ByteBuffer.wrap(payload) 140 | if (buffer.get() != ConfluentConstants.MAGIC_BYTE) { 141 | throw new SerializationException("Unknown magic byte!") 142 | } 143 | 144 | val schemaId = buffer.getInt() 145 | 146 | val start = buffer.position() + buffer.arrayOffset() 147 | val length = buffer.limit() - 1 - ConfluentConstants.SCHEMA_ID_SIZE_BYTES 148 | decoder = DecoderFactory.get().binaryDecoder(buffer.array(), start, length, decoder) 149 | 150 | val reader = confluentReaderCache.getOrElseUpdate(schemaId, { 151 | val writerSchema = downloadWriterSchema(schemaId) 152 | new GenericDatumReader[Any](writerSchema, readerSchema) 153 | }) 154 | 155 | result = reader.read(result, decoder) 156 | result 157 | } 158 | 159 | private def downloadWriterSchema(id: Int): Schema = { 160 | Try(schemaManager.getSchemaById(id)) match { 161 | case Success(schema) => schema 162 | case Failure(e) => throw new RuntimeException("Not able to download writer schema", e) 163 | } 164 | } 165 | 166 | private def decodeVanillaAvro(payload: Array[Byte]): Any = { 167 | 168 | decoder = DecoderFactory.get().binaryDecoder(payload, 0, payload.length, decoder) 169 | result = vanillaReader.read(result, decoder) 170 | result 171 | } 172 | 173 | override protected def withNewChildInternal(newChild: Expression): Expression = 174 | copy(child = newChild) 175 | 176 | private def loadSchemaConverter(nameOpt: Option[String]) = { 177 | import scala.collection.JavaConverters._ 178 | nameOpt match { 179 | case Some(name) => ServiceLoader.load(classOf[SchemaConverter]).asScala 180 | .find(c => c.shortName == name || c.getClass.getName == name) 181 | .getOrElse(throw new ClassNotFoundException(s"Could not find schema converter $name")) 182 | case None => new DefaultSchemaConverter() 183 | } 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/sql/SchemaEvolutionSpec.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import org.apache.spark.sql.functions.{col, lit, struct} 20 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 21 | import org.scalatest.BeforeAndAfterEach 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | import za.co.absa.abris.avro.format.SparkAvroConversions 25 | import za.co.absa.abris.avro.functions._ 26 | import za.co.absa.abris.avro.read.confluent.SchemaManagerFactory 27 | import za.co.absa.abris.avro.registry.{ConfluentMockRegistryClient, SchemaSubject} 28 | import za.co.absa.abris.config.AbrisConfig 29 | 30 | class SchemaEvolutionSpec extends AnyFlatSpec with Matchers with BeforeAndAfterEach 31 | { 32 | private val spark = SparkSession 33 | .builder() 34 | .appName("unitTest") 35 | .master("local[2]") 36 | .config("spark.driver.bindAddress", "localhost") 37 | .config("spark.ui.enabled", "false") 38 | .getOrCreate() 39 | 40 | private val dummyUrl = "dummyUrl" 41 | private val schemaRegistryConfig = Map(AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl) 42 | 43 | override def beforeEach(): Unit = { 44 | val mockedSchemaRegistryClient = new ConfluentMockRegistryClient() 45 | SchemaManagerFactory.addSRClientInstance(schemaRegistryConfig, mockedSchemaRegistryClient) 46 | } 47 | 48 | val recordByteSchema = """{ 49 | "namespace": "all-types.test", 50 | "type": "record", 51 | "name": "record_name", 52 | "fields":[ 53 | {"name": "int", "type": ["int", "null"] } 54 | ] 55 | }""" 56 | 57 | val recordEvolvedByteSchema = """{ 58 | "namespace": "all-types.test", 59 | "type": "record", 60 | "name": "record_name", 61 | "fields":[ 62 | {"name": "int", "type": ["int", "null"] }, 63 | {"name": "favorite_color", "type": "string", "default": "green"} 64 | ] 65 | }""" 66 | 67 | private def createTestData(avroSchema: String): DataFrame = { 68 | val testInts = Seq(42, 66, 77, 321, 789) // scalastyle:ignore 69 | val rows = testInts.map(i => Row.fromSeq(Seq(i))) 70 | val rdd = spark.sparkContext.parallelize(rows, 2) 71 | 72 | val sparkSchema = SparkAvroConversions.toSqlType(avroSchema) 73 | 74 | spark.createDataFrame(rdd, sparkSchema) 75 | } 76 | 77 | it should "convert to avro with old schema and back with evolved schema (providing the schema)" in { 78 | 79 | val allData = createTestData(recordByteSchema) 80 | val dataFrame: DataFrame = allData.select(struct(allData.col(allData.columns.head)) as "integers") 81 | 82 | val toCAConfig = AbrisConfig 83 | .toConfluentAvro 84 | .provideAndRegisterSchema(recordByteSchema) 85 | .usingTopicRecordNameStrategy("test_topic") 86 | .usingSchemaRegistry(dummyUrl) 87 | 88 | val avroBytes = dataFrame 89 | .select(to_avro(col("integers"), toCAConfig) as "avroBytes") 90 | 91 | avroBytes.collect() // force evaluation 92 | 93 | val fromCAConfig = 
AbrisConfig 94 | .fromConfluentAvro 95 | .provideReaderSchema(recordEvolvedByteSchema) 96 | .usingSchemaRegistry(dummyUrl) 97 | 98 | val result = avroBytes 99 | .select(from_avro(col("avroBytes"), fromCAConfig) 100 | as "integersWithDefault") 101 | 102 | val expectedStruct = struct(allData.col(allData.columns.head), lit("green")) 103 | val expectedResult: DataFrame = allData.select(expectedStruct as "integersWithDefault") 104 | 105 | shouldEqualByData(expectedResult, result) 106 | } 107 | 108 | it should "convert to avro with old schema and back with evolved schema (all from schema registry)" in { 109 | 110 | val allData = createTestData(recordByteSchema) 111 | val dataFrame: DataFrame = allData.select(struct(allData.col(allData.columns.head)) as "integers") 112 | 113 | val toCAConfig = AbrisConfig 114 | .toConfluentAvro 115 | .provideAndRegisterSchema(recordByteSchema) 116 | .usingTopicRecordNameStrategy("test_topic") 117 | .usingSchemaRegistry(dummyUrl) 118 | 119 | val avroBytes = dataFrame.select(to_avro(col("integers"), toCAConfig) as "avroBytes") 120 | 121 | // To avoid race conditions between schema registration and reading, the data are collected from Spark into Scala 122 | val avroRows = avroBytes.collect() 123 | 124 | val schemaManager = SchemaManagerFactory.create(schemaRegistryConfig) 125 | val subject = SchemaSubject.usingTopicRecordNameStrategy( 126 | "test_topic", 127 | "record_name", 128 | "all-types.test" 129 | ) 130 | 131 | schemaManager.register(subject, recordEvolvedByteSchema) 132 | 133 | // Now that the latest version of the schema is registered, we convert the data back to a Spark DataFrame 134 | val avroDF = spark.sparkContext.parallelize(avroRows.toIndexedSeq, 2) 135 | val outputAvro = spark.createDataFrame(avroDF, avroBytes.schema) 136 | 137 | val fromCAConfig = AbrisConfig 138 | .fromConfluentAvro 139 | .downloadReaderSchemaByLatestVersion 140 | .andTopicRecordNameStrategy("test_topic", "record_name", "all-types.test") 141 | .usingSchemaRegistry(dummyUrl) 142 | 143 | val result = outputAvro.select(from_avro(col("avroBytes"), fromCAConfig) as "integersWithDefault") 144 | 145 | val expectedStruct = struct(allData.col(allData.columns.head), lit("green")) 146 | val expectedResult: DataFrame = allData.select(expectedStruct as "integersWithDefault") 147 | 148 | shouldEqualByData(expectedResult, result) 149 | } 150 | 151 | it should "convert to simple avro with old schema and back with evolved reader schema (providing the schema)" in { 152 | 153 | val allData = createTestData(recordByteSchema) 154 | val dataFrame: DataFrame = allData.select(struct(allData.col(allData.columns.head)) as "integers") 155 | 156 | // Serialize record with a writer schema 157 | val toCAConfig = AbrisConfig 158 | .toSimpleAvro 159 | .provideSchema(recordByteSchema) 160 | 161 | val avroBytes = dataFrame 162 | .select(to_avro(col("integers"), toCAConfig) as "avroBytes") 163 | 164 | avroBytes.collect() // force evaluation 165 | 166 | // Deserialize record specifying a reader and a writer schema 167 | // Avro will decode using the writer schema and then resolve it against the 168 | // reader schema. Thus e.g. new fields with a default value will also show up.
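// (A vanilla Avro payload, unlike a Confluent one, carries no schema id, hence the
// writer schema is supplied explicitly via withWriterSchema below.)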
169 | val fromCAConfig = AbrisConfig 170 | .fromSimpleAvro 171 | .provideSchema(recordEvolvedByteSchema) 172 | .withWriterSchema(recordByteSchema) 173 | 174 | val result = avroBytes 175 | .select(from_avro(col("avroBytes"), fromCAConfig) 176 | as "integersWithDefault") 177 | 178 | val expectedStruct = struct(allData.col(allData.columns.head), lit("green")) 179 | val expectedResult: DataFrame = allData.select(expectedStruct as "integersWithDefault") 180 | 181 | shouldEqualByData(expectedResult, result) 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- [The XML markup of this file was lost in extraction. It defined the "Scalastyle standard configuration" rule set; only bare parameter values survive, e.g. a maximum line length of 120 (tab size 2), naming patterns such as [A-Z][A-Za-z0-9]* and ^[a-z][A-Za-z0-9]*$, banned imports sun._,java.awt._, magic-number exceptions -1,0,1,2,3, and a ban on print/println/printf calls matching \bprint(|ln|f)\(.] -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/abris/avro/sql/AvroDataToCatalystSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.abris.avro.sql 18 | 19 | import all_types.test.{Fixed, NativeComplete} 20 | import org.apache.spark.SparkException 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} 23 | import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession} 24 | import org.apache.spark.sql.functions.col 25 | import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} 26 | import org.scalatest.BeforeAndAfterEach 27 | import org.scalatest.flatspec.AnyFlatSpec 28 | import org.scalatest.matchers.should.Matchers 29 | import za.co.absa.abris.avro.errors.{FailFastExceptionHandler, SpecificRecordExceptionHandler} 30 | import za.co.absa.abris.avro.format.SparkAvroConversions 31 | import za.co.absa.abris.avro.functions._ 32 | import za.co.absa.abris.avro.utils.AvroSchemaEncoder 33 | import za.co.absa.abris.config.{AbrisConfig, FromAvroConfig} 34 | import za.co.absa.abris.examples.data.generation.TestSchemas 35 | 36 | import java.util.Collections 37 | import java.nio.ByteBuffer 38 | import java.util 39 | import scala.collection.JavaConverters._ 40 | 41 | class AvroDataToCatalystSpec extends AnyFlatSpec with Matchers with BeforeAndAfterEach { 42 | 43 | private val spark = SparkSession 44 | .builder() 45 | .appName("unitTest") 46 | .master("local[2]") 47 | .config("spark.driver.bindAddress", "localhost") 48 | .config("spark.ui.enabled", "false") 49 | .getOrCreate() 50 | 51 | import spark.implicits._ 52 | 53 | private val avroSchemaEncoder = new AvroSchemaEncoder 54 | implicit private val encoder: Encoder[Row] = avroSchemaEncoder.getEncoder 55 | 56 | it should "not print schema registry configs in the spark plan" in { 57 | val sensitiveData = "username:password" 58 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA 59 | val dummyUrl = "dummyUrl" 60 | 61 | val fromAvroConfig = FromAvroConfig() 62 | .withReaderSchema(schemaString) 63 | .withSchemaRegistryConfig(Map( 64 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl, 65 | "basic.auth.user.info" -> sensitiveData 66 | )) 67 | 68 | val column = from_avro(col("avroBytes"), fromAvroConfig) 69 | column.expr.toString() should not include sensitiveData 70 | } 71 | 72 | it should "use the default schema converter by default" in { 73 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA 74 | val dummyUrl = "dummyUrl" 75 | val expectedDataType = StructType(Seq( 76 | StructField("int", IntegerType, nullable = false), 77 | StructField("long", LongType, nullable = false) 78 | )) 79 | 80 | val fromAvroConfig = FromAvroConfig() 81 | .withReaderSchema(schemaString) 82 | .withSchemaRegistryConfig(Map( 83 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl 84 | )) 85 | 86 | val column = from_avro(col("avroBytes"), fromAvroConfig) 87 | column.expr.dataType shouldBe expectedDataType 88 | } 89 | 90 | it should "use a custom schema converter identified by the short name" in { 91 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA 92 | val dummyUrl = "dummyUrl" 93 | 94 | val fromAvroConfig = FromAvroConfig() 95 | .withReaderSchema(schemaString) 96 | .withSchemaRegistryConfig(Map( 97 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl 98 | )) 99 | .withSchemaConverter(DummySchemaConverter.name) 100 | 101 | val column = from_avro(col("avroBytes"), fromAvroConfig) 102 | column.expr.dataType shouldBe DummySchemaConverter.dataType 103 | } 104 | 105 | it should "use a custom schema converter identified by the fully qualified name" in { 106 | val schemaString = 
TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA 107 | val dummyUrl = "dummyUrl" 108 | 109 | val fromAvroConfig = FromAvroConfig() 110 | .withReaderSchema(schemaString) 111 | .withSchemaRegistryConfig(Map( 112 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl 113 | )) 114 | .withSchemaConverter("za.co.absa.abris.avro.sql.DummySchemaConverter") 115 | 116 | val column = from_avro(col("avroBytes"), fromAvroConfig) 117 | column.expr.dataType shouldBe DummySchemaConverter.dataType 118 | } 119 | 120 | it should "throw an error if the specified custom schema converter does not exist" in { 121 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA 122 | val dummyUrl = "dummyUrl" 123 | 124 | val fromAvroConfig = FromAvroConfig() 125 | .withReaderSchema(schemaString) 126 | .withSchemaRegistryConfig(Map( 127 | AbrisConfig.SCHEMA_REGISTRY_URL -> dummyUrl 128 | )) 129 | .withSchemaConverter("nonexistent") 130 | 131 | val ex = intercept[ClassNotFoundException](from_avro(col("avroBytes"), fromAvroConfig).expr.dataType) 132 | ex.getMessage should include ("nonexistent") 133 | } 134 | 135 | it should "be serializable" in { 136 | val schemaString = TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA 137 | val config = FromAvroConfig().withReaderSchema(schemaString) 138 | val avroDataToCatalyst = from_avro(col("col"), config).expr 139 | 140 | val javaSerializer = new JavaSerializer(new SparkConf()) 141 | javaSerializer.newInstance().serialize(avroDataToCatalyst) 142 | 143 | val kryoSerializer = new KryoSerializer(new SparkConf()) 144 | kryoSerializer.newInstance().serialize(avroDataToCatalyst) 145 | 146 | // test successful if no exception is thrown 147 | } 148 | 149 | it should "throw a Spark exception when unable to deserialize " in { 150 | 151 | val providedData = Seq(Row("$£%^".getBytes())) 152 | val providedDataFrame: DataFrame = spark.sparkContext.parallelize(providedData, 2).toDF() as "bytes" 153 | 154 | val dummyUrl = "dummyUrl" 155 | val fromConfig = AbrisConfig 156 | .fromConfluentAvro 157 | .provideReaderSchema(TestSchemas.NATIVE_SIMPLE_NESTED_SCHEMA) 158 | .usingSchemaRegistry(dummyUrl) 159 | .withExceptionHandler(new FailFastExceptionHandler) 160 | 161 | the[SparkException] thrownBy providedDataFrame.select(from_avro(col("bytes"), fromConfig )).collect() 162 | } 163 | 164 | it should "replace undeserializable record with default SpecificRecord" in { 165 | // provided 166 | val providedData = Seq( 167 | Row("$£%^".getBytes()) 168 | ) 169 | val providedDataFrame: DataFrame = spark.sparkContext.parallelize(providedData, 2).toDF() as "bytes" 170 | 171 | val providedDefaultRecord = NativeComplete.newBuilder() 172 | .setBytes(ByteBuffer.wrap(Array[Byte](1,2,3))) 173 | .setString("default-record") 174 | .setInt$(1) 175 | .setLong$(2L) 176 | .setDouble$(3.0) 177 | .setFloat$(4.0F) 178 | .setBoolean$(true) 179 | .setArray(Collections.singletonList("arrayItem1")) 180 | .setMap(Collections.singletonMap[CharSequence, util.List[java.lang.Long]]( 181 | "key1", 182 | Collections.singletonList[java.lang.Long](1L))) 183 | .setFixed(new Fixed(Array.fill[Byte](40){1})) 184 | .build() 185 | 186 | // expected 187 | val expectedData = Seq( 188 | Row(Array[Byte](1,2,3), 189 | "default-record", 190 | 1, 191 | 2L, 192 | 3.0, 193 | 4F, 194 | true, 195 | Collections.singletonList("arrayItem1"), 196 | Collections.singletonMap[CharSequence, util.List[java.lang.Long]]( 197 | "key1", 198 | Collections.singletonList[java.lang.Long](1L)), 199 | Array.fill[Byte](40){1} 200 | )).asJava 201 | 202 | val expectedDataFrame: DataFrame = 
spark.createDataFrame(expectedData, SparkAvroConversions.toSqlType(NativeComplete.SCHEMA$)) 203 | 204 | // actual 205 | val dummyUrl = "dummyUrl" 206 | val fromConfig = AbrisConfig 207 | .fromConfluentAvro 208 | .provideReaderSchema(NativeComplete.SCHEMA$.toString()) 209 | .usingSchemaRegistry(dummyUrl) 210 | .withExceptionHandler(new SpecificRecordExceptionHandler(providedDefaultRecord)) 211 | 212 | val actualDataFrame = providedDataFrame 213 | .select(from_avro(col("bytes"), fromConfig).as("actual")) 214 | .select(col("actual.*")) 215 | 216 | shouldEqualByData(expectedDataFrame, actualDataFrame) 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | [*] 5 | charset = utf-8 6 | end_of_line = lf 7 | trim_trailing_whitespace = true 8 | 9 | [*.xml] 10 | indent_size = 4 11 | indent_style = space 12 | insert_final_newline = true 13 | 14 | [*.{java,scala,js,json,css}] 15 | indent_size = 2 16 | indent_style = space 17 | insert_final_newline = true 18 | max_line_length = 120 19 | 20 | [*.md] 21 | trim_trailing_whitespace = false 22 | 23 | [*.scala] 24 | ij_continuation_indent_size = 2 25 | ij_scala_align_composite_pattern = true 26 | ij_scala_align_extends_with = 0 27 | ij_scala_align_group_field_declarations = false 28 | ij_scala_align_if_else = false 29 | ij_scala_align_in_columns_case_branch = false 30 | ij_scala_align_multiline_binary_operation = false 31 | ij_scala_align_multiline_chained_methods = false 32 | ij_scala_align_multiline_for = true 33 | ij_scala_align_multiline_parameters = false 34 | ij_scala_align_multiline_parameters_in_calls = false 35 | ij_scala_align_multiline_parenthesized_expression = false 36 | ij_scala_align_parameter_types_in_multiline_declarations = 0 37 | ij_scala_align_tuple_elements = false 38 | ij_scala_alternate_continuation_indent_for_params = 4 39 | ij_scala_binary_operation_wrap = off 40 | ij_scala_blank_lines_after_anonymous_class_header = 0 41 | ij_scala_blank_lines_after_class_header = 0 42 | ij_scala_blank_lines_after_imports = 1 43 | ij_scala_blank_lines_after_package = 1 44 | ij_scala_blank_lines_around_class = 1 45 | ij_scala_blank_lines_around_class_in_inner_scopes = 0 46 | ij_scala_blank_lines_around_field = 0 47 | ij_scala_blank_lines_around_field_in_inner_scopes = 0 48 | ij_scala_blank_lines_around_field_in_interface = 0 49 | ij_scala_blank_lines_around_method = 1 50 | ij_scala_blank_lines_around_method_in_inner_scopes = 1 51 | ij_scala_blank_lines_around_method_in_interface = 0 52 | ij_scala_blank_lines_before_class_end = 0 53 | ij_scala_blank_lines_before_imports = 1 54 | ij_scala_blank_lines_before_method_body = 0 55 | ij_scala_blank_lines_before_package = 0 56 | ij_scala_block_brace_style = end_of_line 57 | ij_scala_block_comment_at_first_column = true 58 | ij_scala_call_parameters_new_line_after_lparen = 0 59 | ij_scala_call_parameters_right_paren_on_new_line = false 60 | ij_scala_call_parameters_wrap = off 61 | ij_scala_case_clause_brace_force = never 62 | ij_scala_catch_on_new_line = false 63 | ij_scala_class_annotation_wrap = split_into_lines 64 | ij_scala_class_brace_style = end_of_line 65 | ij_scala_closure_brace_force = never 66 | ij_scala_do_not_align_block_expr_params = true 67 | ij_scala_do_not_indent_case_clause_body = false 68 | ij_scala_do_not_indent_tuples_close_brace = true 69 | ij_scala_do_while_brace_force = never 70 | 
ij_scala_else_on_new_line = false 71 | ij_scala_enable_scaladoc_formatting = true 72 | ij_scala_enforce_functional_syntax_for_unit = true 73 | ij_scala_extends_keyword_wrap = off 74 | ij_scala_extends_list_wrap = off 75 | ij_scala_field_annotation_wrap = split_into_lines 76 | ij_scala_finally_brace_force = never 77 | ij_scala_finally_on_new_line = false 78 | ij_scala_for_brace_force = never 79 | ij_scala_for_statement_wrap = off 80 | ij_scala_formatter = 0 81 | ij_scala_if_brace_force = never 82 | ij_scala_implicit_value_class_suffix = Ops 83 | ij_scala_indent_braced_function_args = true 84 | ij_scala_indent_case_from_switch = true 85 | ij_scala_indent_first_parameter = true 86 | ij_scala_indent_first_parameter_clause = false 87 | ij_scala_indent_type_arguments = true 88 | ij_scala_indent_type_parameters = true 89 | ij_scala_indent_yield_after_one_line_enumerators = true 90 | ij_scala_keep_blank_lines_before_right_brace = 2 91 | ij_scala_keep_blank_lines_in_code = 2 92 | ij_scala_keep_blank_lines_in_declarations = 2 93 | ij_scala_keep_comments_on_same_line = true 94 | ij_scala_keep_first_column_comment = false 95 | ij_scala_keep_indents_on_empty_lines = false 96 | ij_scala_keep_line_breaks = true 97 | ij_scala_keep_one_line_lambdas_in_arg_list = false 98 | ij_scala_keep_simple_blocks_in_one_line = false 99 | ij_scala_keep_simple_methods_in_one_line = false 100 | ij_scala_keep_xml_formatting = false 101 | ij_scala_line_comment_add_space = false 102 | ij_scala_line_comment_at_first_column = true 103 | ij_scala_method_annotation_wrap = split_into_lines 104 | ij_scala_method_brace_force = never 105 | ij_scala_method_brace_style = end_of_line 106 | ij_scala_method_call_chain_wrap = off 107 | ij_scala_method_parameters_new_line_after_left_paren = false 108 | ij_scala_method_parameters_right_paren_on_new_line = false 109 | ij_scala_method_parameters_wrap = off 110 | ij_scala_modifier_list_wrap = false 111 | ij_scala_multiline_string_align_dangling_closing_quotes = false 112 | ij_scala_multiline_string_closing_quotes_on_new_line = false 113 | ij_scala_multiline_string_insert_margin_on_enter = true 114 | ij_scala_multiline_string_margin_char = | 115 | ij_scala_multiline_string_margin_indent = 2 116 | ij_scala_multiline_string_opening_quotes_on_new_line = true 117 | ij_scala_multiline_string_process_margin_on_copy_paste = true 118 | ij_scala_newline_after_annotations = false 119 | ij_scala_not_continuation_indent_for_params = false 120 | ij_scala_parameter_annotation_wrap = off 121 | ij_scala_parentheses_expression_new_line_after_left_paren = false 122 | ij_scala_parentheses_expression_right_paren_on_new_line = false 123 | ij_scala_place_closure_parameters_on_new_line = false 124 | ij_scala_place_self_type_on_new_line = true 125 | ij_scala_prefer_parameters_wrap = false 126 | ij_scala_preserve_space_after_method_declaration_name = false 127 | ij_scala_reformat_on_compile = false 128 | ij_scala_replace_case_arrow_with_unicode_char = false 129 | ij_scala_replace_for_generator_arrow_with_unicode_char = false 130 | ij_scala_replace_lambda_with_greek_letter = false 131 | ij_scala_replace_map_arrow_with_unicode_char = false 132 | ij_scala_scalafmt_fallback_to_default_settings = false 133 | ij_scala_scalafmt_reformat_on_files_save = false 134 | ij_scala_scalafmt_show_invalid_code_warnings = true 135 | ij_scala_scalafmt_use_intellij_formatter_for_range_format = true 136 | ij_scala_sd_align_exception_comments = true 137 | ij_scala_sd_align_list_item_content = true 138 | ij_scala_sd_align_other_tags_comments 
= true 139 | ij_scala_sd_align_parameters_comments = true 140 | ij_scala_sd_align_return_comments = true 141 | ij_scala_sd_blank_line_after_parameters_comments = false 142 | ij_scala_sd_blank_line_after_return_comments = false 143 | ij_scala_sd_blank_line_before_parameters = false 144 | ij_scala_sd_blank_line_before_tags = true 145 | ij_scala_sd_blank_line_between_parameters = false 146 | ij_scala_sd_keep_blank_lines_between_tags = false 147 | ij_scala_sd_preserve_spaces_in_tags = false 148 | ij_scala_space_after_comma = true 149 | ij_scala_space_after_for_semicolon = true 150 | ij_scala_space_after_modifiers_constructor = false 151 | ij_scala_space_after_type_colon = true 152 | ij_scala_space_before_brace_method_call = true 153 | ij_scala_space_before_class_left_brace = true 154 | ij_scala_space_before_for_parentheses = true 155 | ij_scala_space_before_if_parentheses = true 156 | ij_scala_space_before_infix_like_method_parentheses = false 157 | ij_scala_space_before_infix_method_call_parentheses = false 158 | ij_scala_space_before_infix_operator_like_method_call_parentheses = true 159 | ij_scala_space_before_method_call_parentheses = false 160 | ij_scala_space_before_method_left_brace = true 161 | ij_scala_space_before_method_parentheses = false 162 | ij_scala_space_before_type_colon = false 163 | ij_scala_space_before_type_parameter_in_def_list = false 164 | ij_scala_space_before_type_parameter_leading_context_bound_colon = false 165 | ij_scala_space_before_type_parameter_leading_context_bound_colon_hk = true 166 | ij_scala_space_before_type_parameter_list = false 167 | ij_scala_space_before_type_parameter_rest_context_bound_colons = true 168 | ij_scala_space_before_while_parentheses = true 169 | ij_scala_space_inside_closure_braces = true 170 | ij_scala_space_inside_self_type_braces = true 171 | ij_scala_space_within_empty_method_call_parentheses = false 172 | ij_scala_spaces_around_at_in_patterns = false 173 | ij_scala_spaces_in_imports = false 174 | ij_scala_spaces_in_one_line_blocks = false 175 | ij_scala_spaces_within_brackets = false 176 | ij_scala_spaces_within_for_parentheses = false 177 | ij_scala_spaces_within_if_parentheses = false 178 | ij_scala_spaces_within_method_call_parentheses = false 179 | ij_scala_spaces_within_method_parentheses = false 180 | ij_scala_spaces_within_parentheses = false 181 | ij_scala_spaces_within_while_parentheses = false 182 | ij_scala_special_else_if_treatment = true 183 | ij_scala_trailing_comma_arg_list_enabled = true 184 | ij_scala_trailing_comma_import_selector_enabled = false 185 | ij_scala_trailing_comma_mode = trailing_comma_keep 186 | ij_scala_trailing_comma_params_enabled = true 187 | ij_scala_trailing_comma_pattern_arg_list_enabled = false 188 | ij_scala_trailing_comma_tuple_enabled = false 189 | ij_scala_trailing_comma_tuple_type_enabled = false 190 | ij_scala_trailing_comma_type_params_enabled = false 191 | ij_scala_try_brace_force = never 192 | ij_scala_type_annotation_exclude_constant = true 193 | ij_scala_type_annotation_exclude_in_dialect_sources = true 194 | ij_scala_type_annotation_exclude_in_test_sources = false 195 | ij_scala_type_annotation_exclude_member_of_anonymous_class = false 196 | ij_scala_type_annotation_exclude_member_of_private_class = false 197 | ij_scala_type_annotation_exclude_when_type_is_stable = true 198 | ij_scala_type_annotation_function_parameter = false 199 | ij_scala_type_annotation_implicit_modifier = true 200 | ij_scala_type_annotation_local_definition = false 201 | ij_scala_type_annotation_private_member 
= false 202 | ij_scala_type_annotation_protected_member = true 203 | ij_scala_type_annotation_public_member = true 204 | ij_scala_type_annotation_structural_type = true 205 | ij_scala_type_annotation_underscore_parameter = false 206 | ij_scala_type_annotation_unit_type = true 207 | ij_scala_use_alternate_continuation_indent_for_params = false 208 | ij_scala_use_scala3_indentation_based_syntax = true 209 | ij_scala_use_scaladoc2_formatting = false 210 | ij_scala_variable_annotation_wrap = off 211 | ij_scala_while_brace_force = never 212 | ij_scala_while_on_new_line = false 213 | ij_scala_wrap_before_with_keyword = false 214 | ij_scala_wrap_first_method_in_call_chain = false 215 | ij_scala_wrap_long_lines = false 216 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ABRiS - Avro Bridge for Spark 4 | 5 | - Pain free Spark/Avro integration. 6 | 7 | - Seamlessly integrate with Confluent platform, including Schema Registry with all available [naming strategies](https://docs.confluent.io/current/schema-registry/serializer-formatter.html#how-the-naming-strategies-work) and schema evolution. 
8 | 9 | - Seamlessly convert your Avro records from anywhere (e.g. Kafka, Parquet, HDFS) into Spark Rows. 10 | 11 | - Convert your Dataframes into Avro records without even specifying a schema. 12 | 13 | - Go back and forth between Spark and Avro representations (since Spark 2.4). 14 | 15 | 16 | ### Coordinates for Maven POM dependency 17 | 18 | | Scala | Abris | 19 | |:------:|:-------:| 20 | | 2.11 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.11) | 21 | | 2.12 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.12/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.12) | 22 | | 2.13 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.13/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/abris_2.13) | 23 | 24 | ## Supported versions 25 | 26 | | Abris | Spark | Scala | 27 | |:-----: |:-------------:|:-----: | 28 | | 6.2.0 - 6.x.x | 3.2.1 - 3.5.x | 2.12 / 2.13 | 29 | | 6.0.0 - 6.1.1 | 3.2.0 | 2.12 / 2.13 | 30 | | 5.0.0 - 5.x.x | 3.0.x / 3.1.x | 2.12 | 31 | | 5.0.0 - 5.x.x | 2.4.x | 2.11 / 2.12 | 32 | 33 | From version 6.0.0, ABRiS supports only Spark 3.2 and newer. 34 | 35 | ABRiS 5.0.x is still supported for older versions of Spark (see [branch-5](https://github.com/AbsaOSS/ABRiS/tree/branch-5)). 36 | 37 | ## Older Versions 38 | This is documentation for Abris **version 6**. Documentation for older versions is located in the corresponding branches: 39 | [branch-5](https://github.com/AbsaOSS/ABRiS/tree/branch-5), 40 | [branch-4](https://github.com/AbsaOSS/ABRiS/tree/branch-4), 41 | [branch-3.2](https://github.com/AbsaOSS/ABRiS/tree/branch-3.2). 42 | 43 | ## Confluent Schema Registry Version 44 | Abris by default uses Confluent client version 6.2.0. 45 | 46 | ## Installation 47 | Abris needs `spark-avro` to run; make sure you include the `spark-avro` dependency when using Abris. 48 | The versions of `spark-avro` and Spark should be identical. 49 | 50 | Example: submitting a Spark job: 51 | ``` 52 | ./bin/spark-submit \ 53 | --packages org.apache.spark:spark-avro_2.12:3.5.0,za.co.absa:abris_2.12:6.4.0 \ 54 | ...rest of submit params... 55 | ``` 56 | 57 | Example: using Abris in a Maven project: 58 | ```xml 59 | <dependency> 60 | <groupId>org.apache.spark</groupId> 61 | <artifactId>spark-core_2.12</artifactId> 62 | <version>3.5.0</version> 63 | <scope>provided</scope> 64 | </dependency> 65 | <dependency> 66 | <groupId>org.apache.spark</groupId> 67 | <artifactId>spark-avro_2.12</artifactId> 68 | <version>3.5.0</version> 69 | </dependency> 70 | <dependency> 71 | <groupId>za.co.absa</groupId> 72 | <artifactId>abris_2.12</artifactId> 73 | <version>6.4.0</version> 74 | </dependency> 75 | ``` 76 | 77 | Example: using Abris in an SBT project: 78 | ```Scala 79 | libraryDependencies ++= Seq( 80 | "org.apache.spark" %% "spark-core" % "3.5.0" % Provided, 81 | "org.apache.spark" %% "spark-avro" % "3.5.0", 82 | "za.co.absa" %% "abris" % "6.4.0" 83 | ) 84 | ``` 85 | 86 | 87 | ## Usage 88 | 89 | In its most basic form, the ABRiS API is almost identical to Spark's built-in support for Avro, but it provides additional functionality. 90 | Mainly, it supports Schema Registry and integrates seamlessly with the Confluent Avro data format. 91 | 92 | The API consists of two Spark SQL expressions (`to_avro` and `from_avro`) and a fluent configurator (`AbrisConfig`). 93 | 94 | Using the configurator you can choose from four basic config types: 95 | * `toSimpleAvro`, `toConfluentAvro`, `fromSimpleAvro` and `fromConfluentAvro` 96 | 97 | and then configure what you want to do, mainly how to obtain the Avro schema.
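For instance, on the write side the configuration chain is symmetrical. Below is a minimal sketch, not taken from the examples in this repository: it assumes the schema for `topic123` is already registered in a Schema Registry running at `http://localhost:8081`, and that the dataframe has a struct column named `data` (both names are illustrative).

```Scala
val toAvroConfig = AbrisConfig
  .toConfluentAvro
  .downloadSchemaByLatestVersion       // assumes the subject already exists in the registry
  .andTopicNameStrategy("topic123")    // hypothetical topic name
  .usingSchemaRegistry("http://localhost:8081")

import za.co.absa.abris.avro.functions.to_avro
val serialized = dataFrame.select(to_avro(col("data"), toAvroConfig) as 'value)
```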
98 | 99 | Example of usage (read side): 100 | ```Scala 101 | val abrisConfig = AbrisConfig 102 | .fromConfluentAvro 103 | .downloadReaderSchemaByLatestVersion 104 | .andTopicNameStrategy("topic123") 105 | .usingSchemaRegistry("http://localhost:8081") 106 | 107 | import za.co.absa.abris.avro.functions.from_avro 108 | val deserialized = dataFrame.select(from_avro(col("value"), abrisConfig) as 'data) 109 | ``` 110 | 111 | Detailed instructions for many use cases are in separate documents: 112 | 113 | - [How to use Abris with vanilla avro (with examples)](documentation/vanilla-avro-documentation.md) 114 | - [How to use Abris with Confluent avro (with examples)](documentation/confluent-avro-documentation.md) 115 | - [How to use Abris in Python (with examples)](documentation/python-documentation.md) 116 | 117 | Full runnable examples can be found in the ```za.co.absa.abris.examples``` package. You can also take a look at the unit tests in the ```za.co.absa.abris.avro.sql``` package. 118 | 119 | **IMPORTANT**: Spark dependencies have `provided` scope in the `pom.xml`, so when running the examples, please make sure that you either instruct your IDE to include dependencies with 120 | `provided` scope, or change the scope directly. 121 | 122 | ### Confluent Avro format 123 | The format of Avro binary data is defined in the [Avro specification](http://avro.apache.org/docs/current/spec.html). 124 | The Confluent format extends it by prepending the schema id before the actual record. 125 | The Confluent expressions in this library expect this format: they add the id once the Avro data are generated and remove it before the data are parsed. 126 | 127 | You can find more about Confluent and Schema Registry in the [Confluent documentation](https://docs.confluent.io/current/schema-registry/index.html). 128 | 129 | 130 | ### Schema Registry security and other additional settings 131 | 132 | The only mandatory Schema Registry client setting is the URL, 133 | but if you need to provide more, the configurator allows you to pass a whole map. 134 | 135 | For example, you may want to provide `basic.auth.user.info` and `basic.auth.credentials.source`, which are required for user authentication. 136 | You can do it this way: 137 | 138 | ```scala 139 | val registryConfig = Map( 140 | AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081", 141 | "basic.auth.credentials.source" -> "USER_INFO", 142 | "basic.auth.user.info" -> "srkey:srvalue" 143 | ) 144 | 145 | val abrisConfig = AbrisConfig 146 | .fromConfluentAvro 147 | .downloadReaderSchemaByLatestVersion 148 | .andTopicNameStrategy("topic123") 149 | .usingSchemaRegistry(registryConfig) // use the map instead of just the url 150 | ``` 151 | 152 | ## Other Features 153 | 154 | ### Generating Avro schema from Spark data frame column 155 | There is a helper method that allows you to generate an Avro schema automatically from a Spark column. 156 | Assuming you have a data frame containing a column "input", you can generate a schema for the data in that column like this: 157 | ```scala 158 | val schema = AvroSchemaUtils.toAvroSchema(dataFrame, "input") 159 | ``` 160 | 161 | ### Using schema manager to directly download or register schema 162 | You can use the SchemaManager directly to perform operations against the schema registry. 163 | The configuration is identical to the Schema Registry Client. 164 | The SchemaManager is just a wrapper around the client, providing helpful methods and abstractions. 165 | 166 | ```scala 167 | val schemaRegistryClientConfig = Map( ...configuration...
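  // e.g. AbrisConfig.SCHEMA_REGISTRY_URL -> "http://localhost:8081", plus any auth settings shown above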
) 168 | val schemaManager = SchemaManagerFactory.create(schemaRegistryClientConfig) 169 | 170 | // Downloading a schema: 171 | val schema = schemaManager.getSchemaById(42) 172 | 173 | // Registering a schema: 174 | val schemaString = "{...avro schema json...}" 175 | val subject = SchemaSubject.usingTopicNameStrategy("fooTopic") 176 | val schemaId = schemaManager.register(subject, schemaString) 177 | 178 | // and more; check SchemaManager's methods 179 | ``` 180 | 181 | ### De-serialisation Error Handling 182 | There are three ways ABRiS handles de-serialisation errors: 183 | 184 | #### FailFast (Default) 185 | If no de-serialisation handler is provided, a failure results in a Spark exception being thrown 186 | and the error being reported. This is the default behaviour. 187 | 188 | #### SpecificRecordHandler 189 | The second option requires providing a default record that is output in the event of a failure. 190 | The default record acts as a marker to be filtered out downstream of ABRiS, so the Spark job does not stop. 191 | Beware, however: a null or empty record will also result in an error, so choose a default record whose values cannot be mistaken for real data. 192 | 193 | It can be provided like this: 194 | ```scala 195 | val abrisConfig = AbrisConfig 196 | .fromConfluentAvro 197 | .downloadReaderSchemaByLatestVersion 198 | .andTopicNameStrategy("topic123") 199 | .usingSchemaRegistry(registryConfig) 201 | .withExceptionHandler(new SpecificRecordExceptionHandler(providedDefaultRecord)) 202 | ``` 203 | 204 | This is available only for Confluent-based configurations, not for standard Avro. 205 | 206 | #### PermissiveRecordExceptionHandler 207 | The third option is to use the `PermissiveRecordExceptionHandler`. In case of a deserialization failure, this handler replaces the problematic record with a fully null record instead of throwing an exception. This allows the data processing pipeline to continue without interruption. 208 | 209 | The main use case for this option is when you want to prioritize continuity of processing over individual record integrity. It is especially useful when dealing with large datasets where occasional malformed records can be tolerated. 210 | 211 | Here's how to use it: 212 | 213 | ```scala 214 | val abrisConfig = AbrisConfig 215 | .fromConfluentAvro 216 | .downloadReaderSchemaByLatestVersion 217 | .andTopicNameStrategy("topic123") 218 | .usingSchemaRegistry(registryConfig) 220 | .withExceptionHandler(new PermissiveRecordExceptionHandler()) 221 | ``` 222 | 223 | With this configuration, in the event of a deserialization error, the `PermissiveRecordExceptionHandler` will log a warning, substitute the malformed record with a fully null one, and allow the data processing pipeline to continue. 224 | 225 | 226 | ### Data Conversions 227 | This library also provides convenient methods to convert between Avro and Spark schemas. 228 | 229 | If you have an Avro schema which you want to convert into a Spark SQL one - to generate your Dataframes, for instance - you can do so as follows: 230 | 231 | ```scala 232 | val avroSchema: Schema = AvroSchemaUtils.load("path_to_avro_schema") 233 | val sqlSchema: StructType = SparkAvroConversions.toSqlType(avroSchema) 234 | ``` 235 | 236 | You can also do the inverse operation by running: 237 | 238 | ```scala 239 | val sqlSchema = new StructType(new StructField ....
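// For instance, a hypothetical flat schema (an illustrative sketch; field names and types are assumptions):
//   val sqlSchema = StructType(Seq(
//     StructField("int", IntegerType, nullable = false),
//     StructField("long", LongType, nullable = false)))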
240 | val avroSchema = SparkAvroConversions.toAvroSchema(sqlSchema, avro_schema_name, avro_schema_namespace) 241 | ``` 242 | 243 | #### Custom data conversions 244 | If you would like to use custom logic to convert from Avro to Spark, you can implement the `SchemaConverter` trait. 245 | The custom class is loaded in ABRiS using the service provider interface (SPI), so you need to register your class in your 246 | `META-INF/services` resource directory. You can then configure the custom class with its short name or its fully qualified name. 247 | 248 | **Example** 249 | 250 | Custom schema converter implementation: 251 | ```scala 252 | package za.co.absa.abris.avro.sql 253 | import org.apache.avro.Schema 254 | import org.apache.spark.sql.types.DataType 255 | 256 | class CustomSchemaConverter extends SchemaConverter { 257 | override val shortName: String = "custom" 258 | override def toSqlType(avroSchema: Schema): DataType = ??? 259 | } 260 | ``` 261 | 262 | Provider configuration file `META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter`: 263 | ``` 264 | za.co.absa.abris.avro.sql.CustomSchemaConverter 265 | ``` 266 | 267 | Abris configuration: 268 | ```scala 269 | val abrisConfig = AbrisConfig 270 | .fromConfluentAvro 271 | .downloadReaderSchemaByLatestVersion 272 | .andTopicNameStrategy("topic123") 273 | .usingSchemaRegistry(registryConfig) 274 | .withSchemaConverter("custom") 275 | ``` 276 | 277 | ## Multiple schemas in one topic 278 | The naming strategies RecordName and TopicRecordName allow a single topic to receive different payloads, 279 | i.e. payloads containing different schemas that do not have to be compatible, 280 | as explained [here](https://docs.confluent.io/current/schema-registry/docs/serializer-formatter.html#subject-name-strategy). 281 | 282 | When you read such data from Kafka, they will be stored as a binary column in a dataframe, 283 | but once you convert them to Spark types they cannot live in one dataframe, 284 | because all rows in a dataframe must have the same schema. 285 | 286 | So if you have multiple incompatible types of Avro data in a dataframe, you must first split them into several dataframes, 287 | one for each schema. Then you can use Abris to convert the Avro data. 288 | 289 | ## How to measure code coverage 290 | ```shell 291 | mvn clean verify -Pcode-coverage,scala-2.12 292 | or 293 | mvn clean verify -Pcode-coverage,scala-2.13 294 | ``` 295 | Code coverage reports will be generated at: 296 | ``` 297 | {local-path}/ABRiS/target/jacoco 298 | ``` 299 | 300 | --- 301 | 302 | Copyright 2018 ABSA Group Limited 303 | 304 | Licensed under the Apache License, Version 2.0 (the "License"); 305 | you may not use this file except in compliance with the License. 306 | You may obtain a copy of the License at 307 | 308 | http://www.apache.org/licenses/LICENSE-2.0 309 | 310 | Unless required by applicable law or agreed to in writing, software 311 | distributed under the License is distributed on an "AS IS" BASIS, 312 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 313 | See the License for the specific language governing permissions and 314 | limitations under the License. 
315 | -------------------------------------------------------------------------------- /src/main/scala/za/co/absa/abris/examples/data/generation/TestSchemas.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.abris.examples.data.generation 18 | 19 | import all_types.test.{NativeComplete, NativeSimpleOuter} 20 | import za.co.absa.commons.annotation.DeveloperApi 21 | 22 | /** 23 | * Provides several Avro schemas. 24 | * 25 | * Used for tests and examples. 26 | */ 27 | @DeveloperApi 28 | object TestSchemas { 29 | 30 | case class ErrorMessage(errType: String, errCode: String, errMsg: String, errCol: String, rawValues: Seq[String], 31 | mappings: Seq[Mapping] = Seq()) 32 | 33 | case class Mapping(mappingTableColumn: String, mappedDatasetColumn: String) 34 | 35 | val NATIVE_SIMPLE_OUTER_SCHEMA = NativeSimpleOuter.SCHEMA$.toString() 36 | 37 | val NATIVE_SIMPLE_NESTED_SCHEMA = """{ 38 | "namespace": "all-types.test", 39 | "type":"record", 40 | "name":"nested", 41 | "fields": 42 | [ 43 | {"name":"int", "type":"int"}, 44 | {"name":"long","type":"long"} 45 | ] 46 | }""" 47 | 48 | val NATIVE_COMPLETE_SCHEMA = NativeComplete.SCHEMA$.toString() 49 | 50 | val NATIVE_COMPLETE_SCHEMA_WITHOUT_FIXED = """{ 51 | "namespace": "all-types.test", 52 | "type": "record", 53 | "name": "native_complete", 54 | "fields":[ 55 | {"name": "bytes", "type": "bytes" }, 56 | { "name": "string", "type": ["string", "null"] }, 57 | { "name": "int", "type": ["int", "null"] }, 58 | { "name": "long", "type": ["long", "null"] }, 59 | { "name": "double", "type": ["double", "null"] }, 60 | { "name": "float", "type": ["float", "null"] }, 61 | { "name": "boolean", "type": ["boolean","null"] }, 62 | { "name": "array", "type": {"type": "array", "items": "string"} }, 63 | {"name": "map", "type": { "type": "map", "values": {"type": "array", "items": "long"}}} 64 | ] 65 | }""" 66 | 67 | val NATIVE_SCHEMA_SPEC = """{ 68 | "namespace": "all-types.test", 69 | "type": "record", 70 | "name": "native", 71 | "fields":[ 72 | { "name": "string", "type": ["string", "null"] }, 73 | { "name": "int", "type": ["int", "null"] }, 74 | { "name": "long", "type": ["long", "null"] }, 75 | { "name": "double", "type": ["double", "null"] }, 76 | { "name": "float", "type": ["float", "null"] }, 77 | { "name": "boolean", "type": ["boolean","null"] } 78 | ] 79 | }""" 80 | 81 | val ARRAY_SCHEMA_SPEC = """{ 82 | "namespace": "all-types.test", 83 | "type": "record", 84 | "name": "array", 85 | "fields":[ 86 | { "name": "array", "type": {"type": "array", "items": "string"} } 87 | ] 88 | }""" 89 | 90 | val MAP_SCHEMA_SPEC = """{ 91 | "namespace": "all-types.test", 92 | "type": "record", 93 | "name": "map", 94 | "fields":[ 95 | {"name": "map", "type": { "type": "map", "values": {"type": "array", "items": "long"}}} 96 | ] 97 | }""" 98 | 99 | val BYTES_SCHEMA_SPEC = """{ 100 
| "namespace": "all-types.test", 101 | "type": "record", 102 | "name": "bytes", 103 | "fields":[ 104 | {"name": "bytes", "type": "bytes" } 105 | ] 106 | }""" 107 | 108 | val FIXED_SCHEMA_SPEC = """{ 109 | "namespace": "all-types.test", 110 | "type": "record", 111 | "name": "fixed_name", 112 | "fields":[ 113 | {"name": "fixed", "type": {"type": "fixed", "size": 13, "name": "fixed"}} 114 | ] 115 | }""" 116 | 117 | val DECIMAL_SCHEMA_SPEC = """{ 118 | "namespace": "all-types.test", 119 | "type": "record", 120 | "name": "decimal", 121 | "fields":[ 122 | {"name": "decimal", "type": {"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2}} 123 | ] 124 | }""" 125 | 126 | val DATE_SCHEMA_SPEC = """{ 127 | "namespace": "all-types.test", 128 | "type": "record", 129 | "name": "date", 130 | "fields":[ 131 | {"name": "date", "type": {"type": "int", "logicalType": "date"}} 132 | ] 133 | }""" 134 | 135 | val MILLISECOND_SCHEMA_SPEC = """{ 136 | "namespace": "all-types.test", 137 | "type": "record", 138 | "name": "millisecond", 139 | "fields":[ 140 | {"name": "millisecond", "type": {"type": "int", "logicalType": "time-millis"}} 141 | ] 142 | }""" 143 | 144 | val MICROSECOND_SCHEMA_SPEC = """{ 145 | "namespace": "all-types.test", 146 | "type": "record", 147 | "name": "microsecond", 148 | "fields":[ 149 | {"name": "microsecond", "type": {"type": "long", "logicalType": "time-micros"}} 150 | ] 151 | }""" 152 | 153 | val TIMESTAMP_MILLIS_SCHEMA_SPEC = """{ 154 | "namespace": "all-types.test", 155 | "type": "record", 156 | "name": "timestamp_millis", 157 | "fields":[ 158 | {"name": "timestampMillis", "type": {"type": "long", "logicalType": "timestamp-millis"}} 159 | ] 160 | }""" 161 | 162 | val TIMESTAMP_MICROS_SCHEMA_SPEC = """{ 163 | "namespace": "all-types.test", 164 | "type": "record", 165 | "name": "timestamp_micros", 166 | "fields":[ 167 | {"name": "timestampMicros", "type": {"type": "long", "logicalType": "timestamp-micros"}} 168 | ] 169 | }""" 170 | 171 | val DURATION_MICROS_SCHEMA_SPEC = """{ 172 | "namespace": "all-types.test", 173 | "type": "record", 174 | "name": "duration_micros", 175 | "fields":[ 176 | {"name": "duration", "type": {"type": "fixed", "size": 12, "name": "name", "logicalType": "duration"}} 177 | ] 178 | }""" 179 | 180 | val COMPLEX_SCHEMA_SPEC = """{ 181 | "type":"record", 182 | "name":"complex", 183 | "namespace":"all-types.test", 184 | "fields": 185 | [ 186 | {"name":"name","type":"string"}, 187 | {"name":"regions","type": 188 | {"type":"map","values": 189 | {"type":"array","items": 190 | {"type":"record","name":"City","fields": 191 | [ 192 | {"name":"name","type":"string"}, 193 | {"name":"neighborhoods","type": 194 | {"type":"array","items": 195 | {"type":"record","name":"Neighborhood","fields": 196 | [ 197 | {"name":"name","type":"string"}, 198 | {"name":"streets","type": 199 | {"type":"array","items": 200 | {"type":"record","name":"Street","fields": 201 | [ 202 | {"name":"name","type":"string"}, 203 | {"name":"zip","type":"string"} 204 | ] 205 | } 206 | } 207 | } 208 | ] 209 | } 210 | } 211 | } 212 | ] 213 | } 214 | } 215 | } 216 | } 217 | ] 218 | }""" 219 | 220 | val COMPLEX_SCHEMA_STREET_SPEC = """ 221 | { 222 | "namespace":"test_city", 223 | "type":"record", 224 | "name":"Street", 225 | "fields": 226 | [ 227 | {"name":"name","type":"string"}, 228 | {"name":"zip","type":"string"} 229 | ] 230 | }""" 231 | 232 | val COMPLEX_SCHEMA_NEIGHBORHOOD_SPEC = """ 233 | { 234 | "namespace":"test_neighborhood", 235 | "type":"record", 236 | "name":"Neighborhood", 237 | 
"fields": 238 | [ 239 | {"name":"name","type":"string"}, 240 | {"name":"streets", 241 | "type": 242 | { 243 | "type":"array", 244 | "items": 245 | { 246 | "type":"record", 247 | "name":"Street", 248 | "fields": 249 | [ 250 | {"name":"name","type":"string"}, 251 | {"name":"zip","type":"string"} 252 | ] 253 | } 254 | } 255 | } 256 | ] 257 | }""" 258 | 259 | val COMPLEX_SCHEMA_CITY_SPEC = """ 260 | { 261 | "namespace":"test_city", 262 | "type":"record", 263 | "name":"City", 264 | "fields": 265 | [ 266 | {"name":"name","type":"string"}, 267 | {"name":"neighborhoods","type": 268 | { 269 | "type":"array", 270 | "items": 271 | { 272 | "type":"record", 273 | "name":"Neighborhood", 274 | "fields": 275 | [ 276 | {"name":"name","type":"string"}, 277 | {"name":"streets","type": 278 | { 279 | "type":"array", 280 | "items": 281 | { 282 | "type":"record", 283 | "name":"Street", 284 | "fields": 285 | [ 286 | {"name":"name","type":"string"}, 287 | {"name":"zip","type":"string"} 288 | ] 289 | } 290 | } 291 | } 292 | ] 293 | } 294 | } 295 | } 296 | ] 297 | }""" 298 | 299 | val CASE_CLASSES_SCHEMA = """ 300 | { 301 | "type":"record", 302 | "name":"name", 303 | "namespace":"namespace", 304 | "fields":[ 305 | { 306 | "name":"errCol", 307 | "type":[ 308 | { 309 | "type":"array", 310 | "items":[ 311 | { 312 | "type":"record", 313 | "name":"errCol", 314 | "namespace":"namespace.errCol", 315 | "fields":[ 316 | { 317 | "name":"errType", 318 | "type":[ 319 | "string", 320 | "null" 321 | ] 322 | }, 323 | { 324 | "name":"errCode", 325 | "type":[ 326 | "string", 327 | "null" 328 | ] 329 | }, 330 | { 331 | "name":"errMsg", 332 | "type":[ 333 | "string", 334 | "null" 335 | ] 336 | }, 337 | { 338 | "name":"errCol", 339 | "type":[ 340 | "string", 341 | "null" 342 | ] 343 | }, 344 | { 345 | "name":"rawValues", 346 | "type":[ 347 | { 348 | "type":"array", 349 | "items":[ 350 | "string", 351 | "null" 352 | ] 353 | }, 354 | "null" 355 | ] 356 | }, 357 | { 358 | "name":"mappings", 359 | "type":[ 360 | { 361 | "type":"array", 362 | "items":[ 363 | { 364 | "type":"record", 365 | "name":"mappings", 366 | "namespace":"namespace.errCol.mappings", 367 | "fields":[ 368 | { 369 | "name":"mappingTableColumn", 370 | "type":[ 371 | "string", 372 | "null" 373 | ] 374 | }, 375 | { 376 | "name":"mappedDatasetColumn", 377 | "type":[ 378 | "string", 379 | "null" 380 | ] 381 | } 382 | ] 383 | }, 384 | "null" 385 | ] 386 | }, 387 | "null" 388 | ] 389 | } 390 | ] 391 | }, 392 | "null" 393 | ] 394 | }, 395 | "null" 396 | ] 397 | } 398 | ] 399 | } 400 | """ 401 | } 402 | --------------------------------------------------------------------------------