├── .editorconfig ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── setup.cfg ├── setup.py ├── src └── aws_schema_registry │ ├── __init__.py │ ├── adapter │ ├── __init__.py │ └── kafka.py │ ├── avro.py │ ├── client.py │ ├── codec.py │ ├── exception.py │ ├── jsonschema.py │ ├── naming.py │ ├── schema.py │ └── serde.py ├── tests ├── integration │ ├── conftest.py │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── src │ │ │ └── main │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── disasteraware │ │ │ │ └── aws │ │ │ │ └── schemaregistry │ │ │ │ └── App.java │ │ ├── test_java_integration.py │ │ ├── user.avsc │ │ └── user.json │ └── kafka_test │ │ ├── README.md │ │ ├── docker-compose.yml │ │ ├── test_kafka_integration.py │ │ ├── user.json │ │ ├── user.v1.avsc │ │ └── user.v2.avsc ├── test_avro.py ├── test_client.py ├── test_codec.py └── test_jsonschema.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | insert_final_newline = true 6 | indent_style = space 7 | end_of_line = lf 8 | trim_trailing_whitespace = true 9 | 10 | [*.py] 11 | indent_size = 4 12 | 13 | [*.{yml}] 14 | indent_size = 2 15 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | release: 9 | branches: [ main ] 10 | types: [ created ] 11 | 12 | jobs: 13 | lint: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - uses: actions/setup-python@v2 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install tox 25 | 26 | - run: tox -e flake8 27 | 28 | typecheck: 29 | needs: lint 30 | runs-on: ubuntu-latest 31 | 32 | steps: 33 
| - uses: actions/checkout@v2 34 | 35 | - uses: actions/setup-python@v2 36 | 37 | - name: Install dependencies 38 | run: | 39 | python -m pip install --upgrade pip 40 | pip install tox 41 | 42 | - run: tox -e mypy 43 | 44 | test: 45 | needs: lint 46 | runs-on: ${{ matrix.operating-system }} 47 | timeout-minutes: 15 48 | 49 | strategy: 50 | fail-fast: false 51 | matrix: 52 | operating-system: [ubuntu-latest, macos-latest] 53 | python-version: ['3.8', '3.9', '3.10', '3.11'] 54 | 55 | steps: 56 | - uses: actions/checkout@v2 57 | 58 | - name: Set up Python ${{ matrix.python-version }} 59 | uses: actions/setup-python@v2 60 | with: 61 | python-version: ${{ matrix.python-version }} 62 | 63 | - name: Install dependencies 64 | run: | 65 | python -m pip install --upgrade pip 66 | pip install tox tox-gh-actions 67 | 68 | - name: Test 69 | run: tox 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | venv* 3 | .venv* 4 | env 5 | .env 6 | .tox 7 | .toxenv 8 | __pycache__ 9 | *.egg-info 10 | build 11 | dist 12 | 13 | # IDEs 14 | .vscode 15 | .idea 16 | *.iml 17 | 18 | tests/integration/java/target 19 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.4 2 | 3.10.12 3 | 3.9.7 4 | 3.8.12 5 | 3.7.12 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Glue Schema Registry for Python 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/aws-glue-schema-registry.svg)](https://pypi.org/project/aws-glue-schema-registry) 4 | [![PyPI](https://img.shields.io/pypi/pyversions/aws-glue-schema-registry)](https://pypi.org/project/aws-glue-schema-registry) 5 | [![main](https://github.com/DisasterAWARE/aws-glue-schema-registry-python/actions/workflows/main.yml/badge.svg)](https://github.com/DisasterAWARE/aws-glue-schema-registry-python/actions/workflows/main.yml) 6 | 7 | Use the AWS Glue Schema Registry in Python projects. 8 | 9 | This library is a partial port of [aws-glue-schema-registry](https://github.com/awslabs/aws-glue-schema-registry) which implements a subset of its features with full compatibility. 
10 | 11 | ## Feature Support 12 | 13 | Feature | Java Library | Python Library | Notes 14 | :------ | :----------- | :------------- | :---- 15 | Serialization and deserialization using schema registry | ✔️ | ✔️ 16 | Avro message format | ✔️ | ✔️ 17 | JSON Schema message format | ✔️ | ✔️ 18 | Kafka Streams support | ✔️ | | N/A for Python, Kafka Streams is Java-only 19 | Compression | ✔️ | ✔️ | 20 | Local schema cache | ✔️ | ✔️ 21 | Schema auto-registration | ✔️ | ✔️ 22 | Evolution checks | ✔️ | ✔️ 23 | Migration from a third party Schema Registry | ✔️ | ✔️ 24 | Flink support | ✔️ | ❌ 25 | Kafka Connect support | ✔️ | | N/A for Python, Kafka Connect is Java-only 26 | 27 | ## Installation - PyPI (Recommended) 28 | 29 | `pip install aws-glue-schema-registry` 30 | 31 | ## Installation - local 32 | 33 | Clone this repository and run: 34 | 35 | ``` 36 | python setup.py install -e . 37 | ``` 38 | 39 | This library includes opt-in extra dependencies that enable support for certain features. For example, to use the schema registry with [kafka-python](https://pypi.org/project/kafka-python/), you should install the `kafka-python` extra: 40 | 41 | ``` 42 | python setup.py install -e .[kafka-python] 43 | ``` 44 | 45 | Extra name | Purpose 46 | :--------- | :------ 47 | kafka-python | Provides adapter classes to plug into `kafka-python` 48 | 49 | ## Usage 50 | 51 | First use `boto3` to create a low-level AWS Glue client: 52 | 53 | ```python 54 | import boto3 55 | 56 | # Pass your AWS credentials or profile information here 57 | session = boto3.Session(access_key_id=xxx, secret_access_key=xxx, region_name='us-west-2') 58 | 59 | glue_client = session.client('glue') 60 | ``` 61 | 62 | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration for more information on configuring boto3. 
63 | 64 | Send Kafka messages with `SchemaRegistrySerializer`: 65 | 66 | ```python 67 | from aws_schema_registry import DataAndSchema, SchemaRegistryClient 68 | from aws_schema_registry.avro import AvroSchema 69 | 70 | # In this example we will use kafka-python as our Kafka client, 71 | # so we need to have the `kafka-python` extras installed and use 72 | # the kafka adapter. 73 | from aws_schema_registry.adapter.kafka import KafkaSerializer 74 | from kafka import KafkaProducer 75 | 76 | # Create the schema registry client, which is a façade around the boto3 glue client 77 | client = SchemaRegistryClient(glue_client, 78 | registry_name='my-registry') 79 | 80 | # Create the serializer 81 | serializer = KafkaSerializer(client) 82 | 83 | # Create the producer 84 | producer = KafkaProducer(value_serializer=serializer) 85 | 86 | # Our producer needs a schema to send along with the data. 87 | # In this example we're using Avro, so we'll load an .avsc file. 88 | with open('user.avsc', 'r') as schema_file: 89 | schema = AvroSchema(schema_file.read()) 90 | 91 | # Send message data along with schema 92 | data = { 93 | 'name': 'John Doe', 94 | 'favorite_number': 6 95 | } 96 | producer.send('my-topic', value=(data, schema)) 97 | # the value MUST be a tuple when we're using the KafkaSerializer 98 | ``` 99 | 100 | Read Kafka messages with `SchemaRegistryDeserializer`: 101 | 102 | ```python 103 | from aws_schema_registry import SchemaRegistryClient 104 | 105 | # In this example we will use kafka-python as our Kafka client, 106 | # so we need to have the `kafka-python` extras installed and use 107 | # the kafka adapter. 
108 | from aws_schema_registry.adapter.kafka import KafkaDeserializer 109 | from kafka import KafkaConsumer 110 | 111 | # Create the schema registry client, which is a façade around the boto3 glue client 112 | client = SchemaRegistryClient(glue_client, 113 | registry_name='my-registry') 114 | 115 | # Create the deserializer 116 | deserializer = KafkaDeserializer(client) 117 | 118 | # Create the consumer 119 | consumer = KafkaConsumer('my-topic', value_deserializer=deserializer) 120 | 121 | # Now use the consumer normally 122 | for message in consumer: 123 | # The deserializer produces DataAndSchema instances 124 | value: DataAndSchema = message.value 125 | # which are NamedTuples with a `data` and `schema` property 126 | value.data == value[0] 127 | value.schema == value[1] 128 | # and can be deconstructed 129 | data, schema = value 130 | ``` 131 | 132 | ## Contributing 133 | 134 | Clone this repository and install development dependencies: 135 | 136 | ``` 137 | pip install -e .[dev] 138 | ``` 139 | 140 | Run the linter and tests with tox before committing. After committing, check Github Actions to see the result of the automated checks. 141 | 142 | ### Linting 143 | 144 | Lint the code with: 145 | 146 | ``` 147 | flake8 148 | ``` 149 | 150 | Run the type checker with: 151 | 152 | ``` 153 | mypy 154 | ``` 155 | 156 | ### Tests 157 | 158 | Tests go under the `tests/` directory. All tests outside of `tests/integration` are unit tests with no external dependencies. 159 | 160 | Tests under `tests/integration` are integration test that interact with external resources and/or real AWS schema registries. They generally run slower and require some additional configuration. 
161 | 162 | Run just the unit tests with: 163 | 164 | ``` 165 | pytest --ignore tests/integration 166 | ``` 167 | 168 | All integration tests use the following environment variables: 169 | 170 | - `AWS_ACCESS_KEY_ID` 171 | - `AWS_SECRET_ACCESS_KEY` 172 | - `AWS_SESSION_TOKEN` 173 | - `AWS_REGION` 174 | - `AWS_PROFILE` 175 | - `CLEANUP_REGISTRY`: Set to any value to prevent the test from destroying the registry created during the test, allowing you to inspect its contents. 176 | 177 | If no `AWS_` environment variables are set, `boto3` will try to load credentials from your default AWS profile. 178 | 179 | See individual integration test directories for additional requirements and setup instructions. 180 | 181 | ### Tox 182 | 183 | This project uses [Tox](https://tox.wiki/en/latest/) to run tests across multiple Python versions. 184 | 185 | Install Tox with: 186 | 187 | ``` 188 | pip install tox 189 | ``` 190 | 191 | and run it with: 192 | 193 | ``` 194 | tox 195 | ``` 196 | 197 | Note that Tox requires the tested python versions to be installed. One convenient way to manage this is using [pyenv](https://github.com/pyenv/pyenv#installation). See the `.python-versions` file for the Python versions that need to be installed. 198 | 199 | 200 | ### Releases 201 | 202 | Assuming pypi permissions: 203 | 204 | ``` 205 | python -m build 206 | twine upload -r testpypi dist/* 207 | twine upload dist/* 208 | ``` -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = aws-glue-schema-registry 3 | version = 1.1.3 4 | description = Use the AWS Glue Schema Registry. 
5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | author = Corentin Debost 8 | author_email = develop@disasteraware.com 9 | license = Apache Software License 10 | license_files = LICENSE 11 | classifiers = 12 | Development Status :: 4 - Beta 13 | Intended Audience :: Developers 14 | License :: OSI Approved :: Apache Software License 15 | Programming Language :: Python :: 3 16 | Programming Language :: Python :: 3 :: Only 17 | Programming Language :: Python :: 3.8 18 | Programming Language :: Python :: 3.9 19 | Programming Language :: Python :: 3.10 20 | Programming Language :: Python :: 3.11 21 | Topic :: Software Development :: Libraries 22 | Typing :: Typed 23 | keywords = aws, glue, schema, registry, avro 24 | project_urls = 25 | Source=https://github.com/DisasterAWARE/aws-glue-schema-registry-python 26 | 27 | [options] 28 | packages = 29 | aws_schema_registry 30 | aws_schema_registry.adapter 31 | package_dir = 32 | =src 33 | python_requires = >=3.8 34 | install_requires = 35 | boto3>=1.17.102 36 | typing-extensions>=3.7.4.3;python_version<"3.8" 37 | fastavro>=1.4.5 38 | orjson~=3.6.0;python_version<"3.11" 39 | orjson>=3.7.7;python_version>="3.11" 40 | fastjsonschema~=2.15 41 | setup_requires = 42 | setuptools 43 | 44 | [options.extras_require] 45 | dev = 46 | pytest>=6 47 | flake8>=3 48 | kafka-python = 49 | kafka-python>=2 50 | 51 | [mypy] 52 | files = src,tests 53 | 54 | [mypy-kafka.*] 55 | ignore_missing_imports = True 56 | 57 | [mypy-boto3.*] 58 | ignore_missing_imports = True 59 | 60 | [mypy-fastjsonschema.*] 61 | ignore_missing_imports = True 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == '__main__': 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- 
/src/aws_schema_registry/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import SchemaRegistryClient 2 | from .exception import SchemaRegistryException 3 | from .schema import ( 4 | CompatibilityMode, DataFormat, Schema, SchemaVersion, ValidationError 5 | ) 6 | from .serde import DataAndSchema, KafkaDeserializer, KafkaSerializer 7 | 8 | __version__ = '1.0.0' 9 | 10 | __all__ = [ 11 | 'CompatibilityMode', 12 | 'DataAndSchema', 13 | 'DataFormat', 14 | 'KafkaDeserializer', 15 | 'KafkaSerializer', 16 | 'Schema', 17 | 'SchemaRegistryClient', 18 | 'SchemaRegistryException', 19 | 'SchemaVersion', 20 | 'ValidationError' 21 | ] 22 | -------------------------------------------------------------------------------- /src/aws_schema_registry/adapter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DisasterAWARE/aws-glue-schema-registry-python/e514f86151abc0b9247342ecc9b4a6bea67c3fe4/src/aws_schema_registry/adapter/__init__.py -------------------------------------------------------------------------------- /src/aws_schema_registry/adapter/kafka.py: -------------------------------------------------------------------------------- 1 | """Adapter for kafka-python. 
2 | 3 | https://pypi.org/project/kafka-python/ 4 | """ 5 | 6 | from kafka.serializer import Serializer, Deserializer 7 | 8 | from aws_schema_registry import ( 9 | KafkaSerializer as _KafkaSerializer, 10 | KafkaDeserializer as _KafkaDeserializer 11 | ) 12 | 13 | 14 | class KafkaSerializer(Serializer): 15 | def __init__(self, *args, **kwargs): 16 | self._serializer = _KafkaSerializer(*args, **kwargs) 17 | 18 | def serialize(self, topic, value): 19 | return self._serializer.serialize(topic, value) 20 | 21 | 22 | class KafkaDeserializer(Deserializer): 23 | def __init__(self, *args, **kwargs): 24 | self._deserializer = _KafkaDeserializer(*args, **kwargs) 25 | 26 | def deserialize(self, topic, bytes_): 27 | return self._deserializer.deserialize(topic, bytes_) 28 | -------------------------------------------------------------------------------- /src/aws_schema_registry/avro.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from io import BytesIO 4 | import json 5 | from typing import Union 6 | 7 | import fastavro 8 | 9 | from aws_schema_registry.schema import DataFormat, Schema, ValidationError 10 | 11 | 12 | class AvroSchema(Schema): 13 | """Implementation of the `Schema` protocol for Avro schemas. 
14 | 15 | Arguments: 16 | definition: the schema, either as a parsed dict or a string 17 | return_record_name: if true, when reading a union of records, 18 | the result will be a tuple where the first value is the 19 | name of the record and the second value is the record 20 | itself 21 | """ 22 | 23 | def __init__(self, definition: Union[str, dict], 24 | return_record_name: bool = False): 25 | if isinstance(definition, str): 26 | self._dict = json.loads(definition) 27 | else: 28 | self._dict = definition 29 | self._parsed = fastavro.parse_schema(self._dict) 30 | self.return_record_name = return_record_name 31 | 32 | def __hash__(self): 33 | return hash(str(self)) 34 | 35 | def __eq__(self, other): 36 | return isinstance(other, AvroSchema) and \ 37 | self._parsed == other._parsed and \ 38 | self.return_record_name == other.return_record_name 39 | 40 | def __str__(self): 41 | return json.dumps(self._dict) 42 | 43 | def __repr__(self): 44 | return '' % self._dict 45 | 46 | @property 47 | def data_format(self) -> DataFormat: 48 | return 'AVRO' 49 | 50 | @property 51 | def fqn(self) -> str: 52 | # https://github.com/fastavro/fastavro/issues/415 53 | return self._parsed.get('name', self._parsed['type']) 54 | 55 | def read(self, bytes_: bytes): 56 | b = BytesIO(bytes_) 57 | value = fastavro.schemaless_reader( 58 | b, 59 | self._parsed, 60 | return_record_name=self.return_record_name 61 | ) 62 | b.close() 63 | return value 64 | 65 | def write(self, data) -> bytes: 66 | b = BytesIO() 67 | fastavro.schemaless_writer(b, self._parsed, data) 68 | value = b.getvalue() 69 | b.close() 70 | return value 71 | 72 | def validate(self, data): 73 | try: 74 | fastavro.validate(data, self._parsed) 75 | except Exception as e: 76 | # the message will contain space characters, json.loads + str is a 77 | # (relatively inefficient) way to remove them 78 | detail: list[str] = json.loads(str(e)) 79 | raise ValidationError(str(detail)) from e 80 | 
-------------------------------------------------------------------------------- /src/aws_schema_registry/client.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import logging 3 | import time 4 | import random 5 | import string 6 | from typing import ContextManager, Mapping, Optional 7 | from uuid import UUID 8 | 9 | from aws_schema_registry.schema import ( 10 | CompatibilityMode, DataFormat, SchemaVersion 11 | ) 12 | from aws_schema_registry.exception import SchemaRegistryException 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | SCHEMA_VERSION_NOT_FOUND_MSG = 'Schema version is not found.' 17 | SCHEMA_NOT_FOUND_MSG = 'Schema is not found.' 18 | 19 | DEFAULT_COMPATIBILITY_MODE: CompatibilityMode = 'BACKWARD' 20 | 21 | 22 | def schema_name_from_arn(arn: str) -> str: 23 | return arn.split('/')[-1] 24 | 25 | 26 | class SchemaRegistryClient: 27 | """Façade that makes the registry API easier to use. 28 | 29 | Simplifies the large boto glue client to just operations on a 30 | single registry at a time and hides HTTP communication details. 31 | 32 | Arguments: 33 | glue_client: glue client created by `botocore`/`boto3`. 34 | registry_name: the name of the registry this client will work 35 | against. If not specified, defaults to the default registry 36 | which is named 'default-registry'. 37 | max_wait_attempts: maximum number of times to check whether a 38 | newly created schema has become available before reporting 39 | an error. 40 | wait_interval_seconds: delay in seconds between checking 41 | whether a newly created schema has become available. 
42 | """ 43 | 44 | def __init__( 45 | self, 46 | glue_client, 47 | registry_name: str = 'default-registry', 48 | max_wait_attempts: int = 10, 49 | wait_interval_seconds: float = 3 50 | ): 51 | self.glue_client = glue_client 52 | self.registry_name = registry_name 53 | self.max_wait_attempts = max_wait_attempts 54 | self.wait_interval_seconds = wait_interval_seconds 55 | 56 | def get_schema_version(self, version_id: UUID) -> SchemaVersion: 57 | """Get a schema version from the registry by id. 58 | 59 | Arguments: 60 | version_id: the schema version's unique id. 61 | 62 | Returns: 63 | SchemaVersion 64 | """ 65 | try: 66 | res = self.glue_client.get_schema_version( 67 | SchemaVersionId=str(version_id) 68 | ) 69 | except Exception as e: 70 | raise SchemaRegistryException( 71 | f'Failed to get schema version by id {version_id}' 72 | ) from e 73 | if ( 74 | res['SchemaVersionId'] is None or 75 | res['Status'] != 'AVAILABLE' 76 | ): 77 | raise SchemaRegistryException( 78 | f"Schema Found but status is {res['Status']}" 79 | ) 80 | return SchemaVersion( 81 | schema_name=schema_name_from_arn(res['SchemaArn']), 82 | version_id=UUID(res['SchemaVersionId']), 83 | definition=res['SchemaDefinition'], 84 | data_format=res['DataFormat'], 85 | status=res['Status'], 86 | version_number=res['VersionNumber'] 87 | ) 88 | 89 | def get_schema_by_definition( 90 | self, 91 | definition: str, 92 | schema_name: str 93 | ) -> SchemaVersion: 94 | """Get a schema version from the registry by schema definition. 95 | 96 | Arguments: 97 | definition: the stringified schema definition. 98 | schema_name: the name of the schema. 
99 | 100 | Returns: 101 | SchemaVersion 102 | """ 103 | try: 104 | LOG.debug( 105 | 'Getting schema version id for: name = %s, definition = %s', 106 | schema_name, definition 107 | ) 108 | res = self.glue_client.get_schema_by_definition( 109 | SchemaId={ 110 | 'SchemaName': schema_name, 111 | 'RegistryName': self.registry_name 112 | }, 113 | SchemaDefinition=definition 114 | ) 115 | if ( 116 | res['SchemaVersionId'] is None or 117 | res['Status'] != 'AVAILABLE' 118 | ): 119 | raise SchemaRegistryException( 120 | f"Schema Found but status is {res['Status']}" 121 | ) 122 | return SchemaVersion( 123 | schema_name=schema_name_from_arn(res['SchemaArn']), 124 | version_id=UUID(res['SchemaVersionId']), 125 | definition=definition, 126 | data_format=res['DataFormat'], 127 | status=res['Status'] 128 | ) 129 | except Exception as e: 130 | raise SchemaRegistryException( 131 | 'Failed to get schemaVersionId by schema definition for schema' 132 | f' name = {schema_name}' 133 | ) from e 134 | 135 | def get_or_register_schema_version( 136 | self, 137 | definition: str, 138 | schema_name: str, 139 | data_format: DataFormat, 140 | compatibility_mode: CompatibilityMode = DEFAULT_COMPATIBILITY_MODE, 141 | metadata: Optional[Mapping[str, str]] = None 142 | ) -> SchemaVersion: 143 | """Get Schema Version ID by following below steps: 144 | 145 | 1) If schema version id exists in registry then get it from registry 146 | 2) If schema version id does not exist in registry 147 | then if schema exists but version doesn't exist 148 | then 149 | 2.1) Register schema version 150 | else if schema does not exist 151 | then 152 | 2.2) create schema and register schema version 153 | 154 | Arguments: 155 | definition: the stringified schema definition. 156 | schema_name: the name of the schema in the registry. 157 | data_format: which format to use if creating the schema. 158 | Has no effect if the schema by name already exists. 
    def get_or_register_schema_version(
        self,
        definition: str,
        schema_name: str,
        data_format: DataFormat,
        compatibility_mode: CompatibilityMode = DEFAULT_COMPATIBILITY_MODE,
        metadata: Optional[Mapping[str, str]] = None
    ) -> SchemaVersion:
        """Get Schema Version ID by following below steps:

        1) If schema version id exists in registry then get it from registry
        2) If schema version id does not exist in registry
           then if schema exists but version doesn't exist
           then
           2.1) Register schema version
           else if schema does not exist
           then
           2.2) create schema and register schema version

        Arguments:
            definition: the stringified schema definition.
            schema_name: the name of the schema in the registry.
            data_format: which format to use if creating the schema.
                Has no effect if the schema by name already exists.
            compatibility_mode: which compatibility mode to use if
                creating the schema. Has no effect if the schema by
                name already exists.
            metadata: optional metadata to add to the schema version
                if registering a new version. Has no effect if a
                schema version matching the specified definition already
                exists.

        Returns:
            SchemaVersion: the AVAILABLE version for the definition.

        Raises:
            SchemaRegistryException: if lookup and registration both fail.
        """
        try:
            version = self.get_schema_by_definition(
                definition, schema_name
            )
        except SchemaRegistryException as e:
            # The boto error is chained as __cause__; its message text
            # distinguishes "whole schema missing" from "only this
            # version missing".
            # NOTE(review): this depends on the exact AWS error strings
            # (SCHEMA_NOT_FOUND_MSG / SCHEMA_VERSION_NOT_FOUND_MSG)
            # staying stable across Glue API revisions — confirm when
            # upgrading boto.
            cause_msg = str(e.__cause__)
            if SCHEMA_VERSION_NOT_FOUND_MSG in cause_msg:
                LOG.debug(cause_msg)
                # Schema exists; this definition becomes a new version.
                version_id = self.register_schema_version(
                    definition, schema_name, metadata
                )
            elif SCHEMA_NOT_FOUND_MSG in cause_msg:
                LOG.debug(cause_msg)
                # Schema itself is missing; create it with this
                # definition as its first version.
                version_id = self.create_schema(
                    schema_name, data_format, definition, compatibility_mode,
                    metadata
                )
            else:
                raise SchemaRegistryException(
                    'Exception occurred while fetching or registering schema'
                    f' definition = {definition}, schema name = {schema_name}'
                ) from e
            # Re-fetch so the returned SchemaVersion carries the full
            # details (definition, version number, status).
            version = self.get_schema_version(version_id)
        return version
206 | 207 | Returns: 208 | UUID: the id of the new schema version 209 | """ 210 | try: 211 | res = self.glue_client.register_schema_version( 212 | SchemaId={ 213 | 'SchemaName': schema_name, 214 | 'RegistryName': self.registry_name 215 | }, 216 | SchemaDefinition=definition 217 | ) 218 | version_id = UUID(res['SchemaVersionId']) 219 | LOG.info('Registered the schema version with schema version ' 220 | 'id = %s and with version number = %s and status %s', 221 | version_id, res['VersionNumber'], res['Status']) 222 | if res['Status'] != 'AVAILABLE': 223 | self._wait_for_schema_evolution_check_to_complete(version_id) 224 | except Exception as e: 225 | raise SchemaRegistryException( 226 | 'Register schema :: Call failed when registering the schema' 227 | f' with the schema registry for schema name = {schema_name}', 228 | ) from e 229 | if metadata: 230 | self.put_schema_version_metadata(version_id, metadata) 231 | return version_id 232 | 233 | def _wait_for_schema_evolution_check_to_complete( 234 | self, 235 | schema_version_id: UUID 236 | ): 237 | time.sleep(self.wait_interval_seconds) 238 | for _ in range(self.max_wait_attempts): 239 | res = self.glue_client.get_schema_version( 240 | SchemaVersionId=str(schema_version_id) 241 | ) 242 | status = res['Status'] 243 | if status == 'AVAILABLE': 244 | break 245 | elif status != 'PENDING': 246 | raise SchemaRegistryException( 247 | 'Schema evolution check failed.' 248 | f' schemaVersionId {schema_version_id} is in' 249 | f' {status} status.' 
250 | ) 251 | else: 252 | raise SchemaRegistryException( 253 | 'Retries exhausted for schema evolution check for ' 254 | f'schemaVersionId = {schema_version_id}' 255 | ) 256 | 257 | def put_schema_version_metadata( 258 | self, 259 | version_id: UUID, 260 | metadata: Mapping[str, str] 261 | ): 262 | for k, v in metadata.items(): 263 | try: 264 | self.glue_client.put_schema_version_metadata( 265 | SchemaVersionId=str(version_id), 266 | MetadataKeyValue={ 267 | 'MetadataKey': k, 268 | 'MetadataValue': v 269 | } 270 | ) 271 | except Exception as e: 272 | raise SchemaRegistryException( 273 | 'Put schema version metadata :: Call failed when put' 274 | f' metadata key = {k} value = {v} to schema for schema' 275 | f' versionid = {version_id}' 276 | ) from e 277 | 278 | def create_schema( 279 | self, 280 | name: str, 281 | data_format: DataFormat, 282 | definition: str, 283 | compatibility_mode: CompatibilityMode = DEFAULT_COMPATIBILITY_MODE, 284 | metadata: Optional[Mapping[str, str]] = None 285 | ) -> UUID: 286 | """Create a new schema and return the version id.""" 287 | try: 288 | LOG.info('Creating schema with name: %s and definition: %s', 289 | name, definition) 290 | res = self.glue_client.create_schema( 291 | SchemaName=name, 292 | RegistryId={ 293 | 'RegistryName': self.registry_name 294 | }, 295 | DataFormat=data_format, 296 | Compatibility=compatibility_mode, 297 | Description='', 298 | Tags={}, 299 | SchemaDefinition=definition 300 | ) 301 | version_id = UUID(res['SchemaVersionId']) 302 | if metadata: 303 | self.put_schema_version_metadata(version_id, metadata) 304 | except Exception as e: 305 | if type(e).__name__ == 'AlreadyExistsException': 306 | LOG.warn('Schema is already created, this could be caused by ' 307 | 'multiple producers racing to auto-create schema.') 308 | version_id = self.register_schema_version( 309 | definition, name, metadata 310 | ) 311 | else: 312 | raise SchemaRegistryException( 313 | f'Create schema {name} failed' 314 | ) from e 315 | 
class TemporaryRegistry(ContextManager):
    """A real schema registry for use in tests and experiments.

    This class implements the ContextManager protocol, creating the registry
    on enter and destroying it on exit.

    Usage:

    ```python
    with TemporaryRegistry(glue_client, 'MyRegistry') as r:
        # registry is created on enter
        print(r.name)  # the "real" (suffixed) registry name
        # registry is destroyed on exit
    ```

    Arguments:
        glue_client: glue client created by `botocore`/`boto3`.
        name: human-readable name for the created registry. The name will be
            suffixed by a random identifier to reduce the frequency of
            collisions.
        description: description for the created registry.
        autoremove: whether to destroy the created registry. Defaults to True.
    """

    DEFAULT_DESCRIPTION = 'Temporary registry created with the aws-glue-schema-registry Python library.'  # NOQA

    def __init__(self, glue_client,
                 name: str = 'temporary-registry',
                 description: str = DEFAULT_DESCRIPTION,
                 autoremove: bool = True):
        self.glue_client = glue_client
        # Suffix with a timestamp plus 16 random alphanumerics so
        # concurrent test runs are unlikely to collide.
        date = datetime.utcnow().strftime('%y-%m-%d-%H-%M')
        r = ''.join(random.choices(string.digits + string.ascii_letters,
                                   k=16))
        self.name = f'{name}-{date}-{r}'
        self.description = description
        self.autoremove = autoremove

    def __enter__(self):
        # Lazy %-style logger args (fix: was eager '%' interpolation,
        # which formats even when INFO is disabled).
        LOG.info('creating registry %s...', self.name)
        self.glue_client.create_registry(
            RegistryName=self.name,
            Description=self.description
        )
        return self

    def __exit__(self, *args):
        if self.autoremove:
            LOG.info('deleting registry %s...', self.name)
            self.glue_client.delete_registry(
                RegistryId={'RegistryName': self.name}
            )
VERSION_BYTE = b'\x03'
"""Expected value of the magic version byte.

If the leading byte of the encoded data has a different value,
that signifies one of the following:

1. The data was encoded by a different version of the encoder
2. The data was encoded by a different library (e.g. the Java library)
   that is no longer compatible with this library
3. The data was encoded for another schema registry
   (e.g. Confluent Schema Registry)
4. The data was written by a schema-less producer
"""

COMPRESSION_ENABLED_BYTE = b'\x05'
"""Compression byte when using ZLIB compression."""

COMPRESSION_DISABLED_BYTE = b'\x00'
"""Compression byte when compression is disabled."""

SCHEMA_VERSION_ID_SIZE = 16
"""Number of bytes reserved for the schema version uuid."""


class CodecException(Exception):
    """Raised when encoding or decoding fails."""


class UnknownEncodingException(CodecException):
    """Raised when decoding data with an unknown encoding."""


def encode(data: bytes,
           schema_version_id: UUID,
           compression=None) -> bytes:
    """Encode data and schema information into bytes.

    Arguments:
        data (bytes): the payload itself.
        schema_version_id (UUID): version id of the schema used to
            serialize the data.
        compression (Any): whether to compress the payload data.
            Any truthy value can be passed to enable compression.

            Currently only ZLIB compression is supported. In future
            versions this parameter may take specific values to
            differentiate between different compression algorithms.

    Returns:
        bytes
    """
    if compression:
        compression_byte = COMPRESSION_ENABLED_BYTE
        data = zlib.compress(data)
    else:
        compression_byte = COMPRESSION_DISABLED_BYTE
    # Layout: magic byte | compression byte | 16-byte version id | payload.
    return VERSION_BYTE + compression_byte + schema_version_id.bytes + data


def decode(bytes_: bytes) -> tuple[bytes, UUID]:
    """Decode bytes into data and schema information.

    Arguments:
        bytes_ (bytes): encoded bytes.

    Returns:
        tuple[bytes, UUID]: a two-item tuple consisting of the decoded
            and decompressed data, and the schema version id

    Raises:
        UnknownEncodingException: if the leading byte of the encoded
            data is not recognized, implying the data was encoded with
            an incompatible client or for a different schema registry
        CodecException: if any other error occurs while decoding
    """
    b = BytesIO(bytes_)
    version = b.read(1)
    if version != VERSION_BYTE:
        # Fix: this message was a raw string (r"..."), so {version!r}
        # was emitted literally instead of being interpolated.
        raise UnknownEncodingException(
            f"leading byte {version!r} not recognized"
        )
    compression = b.read(1)
    raw_id = b.read(SCHEMA_VERSION_ID_SIZE)
    if len(raw_id) != SCHEMA_VERSION_ID_SIZE:
        # Fix: previously UUID() raised a bare ValueError on truncated
        # input, violating the documented CodecException contract.
        raise CodecException(
            f'expected {SCHEMA_VERSION_ID_SIZE}-byte schema version id,'
            f' got {len(raw_id)} bytes'
        )
    schema_version = UUID(bytes=raw_id)
    data = b.read()
    if compression == COMPRESSION_ENABLED_BYTE:
        data = zlib.decompress(data)
    elif compression != COMPRESSION_DISABLED_BYTE:
        raise CodecException(
            f'compression byte {compression!r} not recognized'
        )
    return data, schema_version
class JsonSchema(Schema):
    """Implementation of the `Schema` protocol for JSON schemas.

    Arguments:
        definition: the schema, either as a parsed dict or a string
    """

    def __init__(self, definition: Union[str, dict]):
        # Accept either form; normalize to a dict internally.
        if isinstance(definition, str):
            self._dict = orjson.loads(definition)
        else:
            self._dict = definition
        # Compile once at construction so per-record validation is cheap.
        self._compiled_validation_method = fastjsonschema.compile(self._dict)

    def __hash__(self):
        # Hash the canonical serialized form so logically-equal schemas
        # collide; consistent with __eq__ on the underlying dict.
        return hash(str(self))

    def __eq__(self, other):
        return isinstance(other, JsonSchema) and \
            self._dict == other._dict

    def __str__(self):
        # orjson.dumps returns bytes; decode to str.
        return orjson.dumps(self._dict).decode()

    def __repr__(self):
        # NOTE(review): '' % dict always yields '' — this format string
        # looks garbled (likely meant something like '<JsonSchema %s>');
        # confirm against upstream before changing.
        return '' % self._dict

    @property
    def data_format(self) -> DataFormat:
        """Registry data format identifier for JSON schemas."""
        return 'JSON'

    @property
    def fqn(self) -> str:
        # JSON schemas carry no record name here, so the fully-qualified
        # name is empty (record-name-based naming strategies will not
        # produce useful names for JSON schemas).
        return ""

    def read(self, bytes_: bytes):
        """Parse and validate a JSON payload, returning the parsed data."""
        data = orjson.loads(bytes_)
        self.validate(data)
        return data

    def write(self, data) -> bytes:
        """Validate and serialize data to JSON bytes."""
        self.validate(data)
        return orjson.dumps(data)

    def validate(self, data):
        """Raise ValidationError if data does not match this schema."""
        try:
            self._compiled_validation_method(data)
        except fastjsonschema.exceptions.JsonSchemaValueException as e:
            raise ValidationError(str(e)) from e
class SchemaNamingStrategy(Protocol):
    """Callable that picks the registry schema name for a value.

    Arguments:
        topic: the name of the topic the value is being written into.
        is_key: whether the value is a Kafka key or value.
        schema: the schema of the (unserialized) value being written.

    Returns:
        str: the schema name to use
    """
    def __call__(self, topic: str, is_key: bool, schema: Schema) -> str: ...


def topic_name_strategy(topic: str, is_key: bool, schema: Schema) -> str:
    """The default naming strategy: derive the name from the topic.

    Keys map to `-key` and values to `-value`, each prefixed by the
    topic name. Works well when every record on a topic shares one
    schema; does not support mixing schemas on a topic.
    """
    suffix = 'key' if is_key else 'value'
    return f'{topic}-{suffix}'


def record_name_strategy(topic: str, is_key: bool, schema: Schema) -> str:
    """Name the schema after its fully-qualified record name.

    Lets one topic carry records with multiple incompatible schemas,
    provided record names uniquely and consistently identify a schema
    across the whole registry.
    """
    return schema.fqn


def topic_record_name_strategy(topic: str, is_key: bool,
                               schema: Schema) -> str:
    """Name the schema `<topic>-<record fqn>`.

    Like record_name_strategy, but scoped per topic so different topics
    may reuse a record name for incompatible schemas.
    """
    return f'{topic}-{schema.fqn}'
74 | """ 75 | return f'{topic}-{schema.fqn}' 76 | -------------------------------------------------------------------------------- /src/aws_schema_registry/schema.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from dataclasses import dataclass 5 | import sys 6 | from typing import Any, Optional, Hashable 7 | from uuid import UUID 8 | 9 | if sys.version_info[1] < 8: # for py37 10 | from typing_extensions import Literal 11 | else: 12 | from typing import Literal 13 | 14 | DataFormat = Literal['AVRO', 'JSON'] 15 | 16 | CompatibilityMode = Literal['NONE', 'DISABLED', 'BACKWARD', 'BACKWARD_ALL', 17 | 'FORWARD', 'FORWARD_ALL', 'FULL', 'FULL_ALL'] 18 | """Controls the checks performed on new schema versions. 19 | 20 | Values: 21 | NONE: no compatibility checks performed 22 | DISABLED: no new versions can be added to the schema 23 | BACKWARD: consumer can read both current and previous version 24 | BACKWARD_ALL: consumer can read current and all previous 25 | versions 26 | FORWARD: consumer can read both current and subsequent version 27 | FORWARD_ALL: consumer can read both current and all subsequent 28 | versions 29 | FULL: combination of 'BACKWARD' and 'FORWARD' 30 | FULL_ALL: combination of 'BACKWARD_ALL' and 'FORWARD_ALL' 31 | """ 32 | 33 | SchemaStatus = Literal['AVAILABLE', 'PENDING', 'DELETING'] 34 | SchemaVersionStatus = Literal['AVAILABLE', 'PENDING', 'FAILURE', 'DELETING'] 35 | 36 | 37 | class Schema(ABC, Hashable): 38 | """Abstract base class for a schema implementation.""" 39 | 40 | @property 41 | @abstractmethod 42 | def data_format(self) -> DataFormat: 43 | """The data format of this schema.""" 44 | 45 | @property 46 | @abstractmethod 47 | def fqn(self) -> str: 48 | """The fully-qualified name of this schema.""" 49 | 50 | @abstractmethod 51 | def read(self, bytes_: bytes) -> Any: 52 | """Read bytes into a record.""" 53 | 54 | 
@abstractmethod 55 | def write(self, data) -> bytes: 56 | """Write a record into bytes.""" 57 | 58 | def validate(self, data) -> None: 59 | """Raise a ValidationException if the data is invalid.""" 60 | 61 | 62 | class ValidationError(Exception): 63 | """Raised when a schema's `validate` is called on invalid data. 64 | 65 | The error need not contain *every* validation error, just the first that 66 | classifies the data as invalid. 67 | """ 68 | pass 69 | 70 | 71 | @dataclass 72 | class SchemaVersion: 73 | schema_name: str 74 | version_id: UUID 75 | definition: str 76 | data_format: DataFormat 77 | status: SchemaVersionStatus 78 | version_number: Optional[int] = None 79 | 80 | def __hash__(self): 81 | return hash(self.definition) 82 | -------------------------------------------------------------------------------- /src/aws_schema_registry/serde.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import logging 5 | from typing import Any, NamedTuple 6 | from uuid import UUID 7 | 8 | from aws_schema_registry.avro import AvroSchema 9 | from aws_schema_registry.jsonschema import JsonSchema 10 | from aws_schema_registry.client import SchemaRegistryClient 11 | from aws_schema_registry.codec import decode, encode, UnknownEncodingException 12 | from aws_schema_registry.exception import SchemaRegistryException 13 | from aws_schema_registry.naming import ( 14 | SchemaNamingStrategy, topic_name_strategy 15 | ) 16 | from aws_schema_registry.schema import CompatibilityMode, Schema, SchemaVersion 17 | 18 | LOG = logging.getLogger(__name__) 19 | 20 | 21 | class DataAndSchema(NamedTuple): 22 | """Data and its schema. 23 | 24 | Can be used to wrap the data and schema together before calling the 25 | producer's producing methods. 26 | """ 27 | data: Any 28 | schema: Schema 29 | 30 | 31 | class KafkaSerializer: 32 | """Kafka serializer that uses the AWS Schema Registry. 
class KafkaSerializer:
    """Kafka serializer that uses the AWS Schema Registry.

    Arguments:
        client: instance of SchemaRegistryClient
        is_key (optional): whether the serializer is serializing keys as
            opposed to values. Defaults to false. Setting this to the
            appropriate value is important to avoid mixing key and value
            schemas if using the default schema name strategy.
        compatibility_mode (optional): the compatibility mode to use if
            creating a new schema in the registry. Defaults to the
            registry's default compatibility setting if not specified.
        schema_naming_strategy (optional): how to choose the schema name
            when creating new schemas. Defaults to the topic name
            strategy. See the `naming` module for more information and
            alternate strategies.
    """

    def __init__(
        self,
        client: SchemaRegistryClient,
        is_key: bool = False,
        compatibility_mode: CompatibilityMode = 'BACKWARD',
        schema_naming_strategy: SchemaNamingStrategy = topic_name_strategy
    ):
        self.client = client
        self.is_key = is_key
        self.compatibility_mode: CompatibilityMode = compatibility_mode
        self.schema_naming_strategy = schema_naming_strategy
        # Per-instance cache instead of functools.lru_cache on the bound
        # method: lru_cache on a method keys on `self` and keeps every
        # serializer alive for the process lifetime (flake8 B019).
        self._schema_version_cache: dict = {}

    def serialize(self, topic: str, data_and_schema: DataAndSchema):
        """Serialize (data, schema) into registry-encoded bytes.

        Returns None for a None input (tombstones pass through).

        Raises:
            TypeError: if data_and_schema is not a (data, schema) tuple.
        """
        if data_and_schema is None:
            return None
        if not isinstance(data_and_schema, tuple):
            # Fix: a stray comma previously passed TWO arguments to
            # TypeError, producing a tuple instead of one message.
            raise TypeError('KafkaSerializer can only serialize'
                            f' {tuple}, got {type(data_and_schema)}')
        data, schema = data_and_schema
        schema_version = self._get_schema_version(topic, schema)
        serialized = schema.write(data)
        return encode(serialized, schema_version.version_id)

    def _get_schema_version(self, topic: str, schema: Schema) -> SchemaVersion:
        # Memoize per (topic, schema); Schema implementations are
        # required to be Hashable, so the pair is a valid dict key.
        cache_key = (topic, schema)
        try:
            return self._schema_version_cache[cache_key]
        except KeyError:
            pass
        schema_name = self.schema_naming_strategy(topic, self.is_key, schema)
        LOG.info('Fetching schema %s...', schema_name)
        version = self.client.get_or_register_schema_version(
            definition=str(schema),
            schema_name=schema_name,
            data_format=schema.data_format,
            compatibility_mode=self.compatibility_mode
        )
        self._schema_version_cache[cache_key] = version
        return version
class KafkaDeserializer:
    """Kafka deserializer that uses the AWS Schema Registry.

    Arguments:
        client: instance of SchemaRegistryClient.
        return_record_name: if true, when reading a union of records,
            the result will be a tuple where the first value is the
            name of the record and the second value is the record
            itself
        secondary_deserializer: optional deserializer to pass through
            to when processing values with an unrecognized encoding.
            This is primarily used to migrate from other schema
            registries or handle schema-less data. The secondary
            deserializer should either be a callable taking the same
            arguments as deserialize or an object with a matching
            deserialize method.
    """

    def __init__(
        self,
        client: SchemaRegistryClient,
        return_record_name: bool = False,
        secondary_deserializer=None
    ):
        self.client = client
        # NOTE(review): return_record_name is stored but never read in
        # this class — presumably consumed by a schema implementation;
        # confirm before relying on it.
        self.return_record_name = return_record_name
        self.secondary_deserializer = secondary_deserializer
        # Per-instance caches instead of functools.lru_cache on bound
        # methods, which keys on `self` and leaks instances (B019).
        self._version_cache: dict = {}
        self._schema_cache: dict = {}

    def deserialize(self, topic: str, bytes_: bytes):
        """Deserialize registry-encoded bytes into DataAndSchema.

        Returns None for a None payload. Payloads without the registry's
        encoding header are routed to the secondary deserializer when
        one is configured.

        Raises:
            SchemaRegistryException: if the encoding is unrecognized and
                no secondary deserializer is configured.
        """
        if bytes_ is None:
            return None
        try:
            data_bytes, schema_version_id = decode(bytes_)
        except UnknownEncodingException as e:
            if self.secondary_deserializer:
                if callable(self.secondary_deserializer):
                    return self.secondary_deserializer(topic, bytes_)
                return self.secondary_deserializer.deserialize(topic, bytes_)
            raise SchemaRegistryException(
                'no secondary deserializer provided to handle'
                ' unrecognized data encoding'
            ) from e
        writer_schema_version = self._get_schema_version(schema_version_id)
        writer_schema = self._schema_for_version(writer_schema_version)
        return DataAndSchema(writer_schema.read(data_bytes), writer_schema)

    def _get_schema_version(self, version_id: UUID):
        # Look up (and memoize) the writer's schema version by id.
        try:
            return self._version_cache[version_id]
        except KeyError:
            pass
        LOG.info('Fetching schema version %s...', version_id)
        version = self.client.get_schema_version(version_id)
        self._version_cache[version_id] = version
        return version

    def _schema_for_version(self, version: SchemaVersion) -> Schema:
        # Build (and memoize) a Schema implementation for a version.
        # SchemaVersion hashes on its definition, so it is a valid key.
        try:
            return self._schema_cache[version]
        except KeyError:
            pass
        if version.data_format == 'AVRO':
            schema: Schema = AvroSchema(version.definition)
        elif version.data_format == 'JSON':
            schema = JsonSchema(version.definition)
        else:
            # Fix: previously fell through and returned None, which only
            # surfaced later as AttributeError on `.read`; fail loudly.
            raise SchemaRegistryException(
                f'unsupported data format {version.data_format}'
            )
        self._schema_cache[version] = schema
        return schema
This registry does not hold any valuable data and is safe to' 44 | ' delete as long as it is not currently in use by a test', 45 | autoremove=CLEANUP_REGISTRY 46 | ) as registry: 47 | yield registry.name 48 | -------------------------------------------------------------------------------- /tests/integration/java/README.md: -------------------------------------------------------------------------------- 1 | # Java integration tests 2 | 3 | Tests that this libary is compatible with the [Java version](https://github.com/awslabs/aws-glue-schema-registry). 4 | 5 | Requires Java v11+ and Maven to build the Java test project. 6 | -------------------------------------------------------------------------------- /tests/integration/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.disasteraware.aws.schemaregistry 8 | java-integration-test 9 | develop-SNAPSHOT 10 | jar 11 | 12 | java-integration-test 13 | 14 | 15 | UTF-8 16 | 11 17 | 11 18 | 19 | 20 | 21 | 22 | software.amazon.glue 23 | schema-registry-serde 24 | 1.1.4 25 | 26 | 27 | org.apache.avro 28 | avro 29 | 1.10.2 30 | 31 | 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-shade-plugin 38 | 2.3 39 | 40 | ${project.artifactId} 41 | false 42 | 43 | 44 | 45 | com.disasteraware.aws.schemaregistry.App 46 | 47 | 48 | 49 | 50 | 51 | 52 | package 53 | 54 | shade 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /tests/integration/java/src/main/java/com/disasteraware/aws/schemaregistry/App.java: -------------------------------------------------------------------------------- 1 | package com.disasteraware.aws.schemaregistry; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Objects; 7 | 8 | import com.amazonaws.services.schemaregistry.deserializers.GlueSchemaRegistryKafkaDeserializer; 9 | import 
/**
 * Round-trip harness for the Python/Java interop test: reads
 * registry-encoded bytes from stdin, deserializes them with the Java
 * Glue Schema Registry client, re-serializes the record, and writes the
 * resulting bytes to stdout. The DATA_FORMAT environment variable
 * selects the AVRO or JSON code path.
 */
public class App {
    // Credentials resolved from the profile named by AWS_PROFILE.
    static AwsCredentialsProvider credentialsProvider = ProfileCredentialsProvider.builder()
            .profileName(System.getenv("AWS_PROFILE"))
            .build();

    // Shared serializer/deserializer configuration, populated in main().
    // NOTE(review): type parameters appear stripped in this copy; the
    // upstream declaration is presumably Map<String, Object> — confirm.
    static Map configs = new HashMap<>();

    public static void main(String[] args) {
        // All required settings come from the environment; fail fast if
        // any is missing.
        String dataFormat = Objects.requireNonNull(System.getenv("DATA_FORMAT"));
        configs.put(AWSSchemaRegistryConstants.AWS_REGION, Objects.requireNonNull(System.getenv("AWS_REGION")));
        configs.put(AWSSchemaRegistryConstants.REGISTRY_NAME, Objects.requireNonNull(System.getenv("REGISTRY_NAME")));
        configs.put(AWSSchemaRegistryConstants.SCHEMA_NAME, Objects.requireNonNull(System.getenv("SCHEMA_NAME")));
        configs.put(AWSSchemaRegistryConstants.SCHEMA_AUTO_REGISTRATION_SETTING, true);

        if (dataFormat.equals("AVRO")) {
            configs.put(AWSSchemaRegistryConstants.DATA_FORMAT, DataFormat.AVRO.name());
            configs.put(AWSSchemaRegistryConstants.AVRO_RECORD_TYPE, AvroRecordType.GENERIC_RECORD.getName());
            try {
                byte[] bytes;
                GenericRecord record;
                Schema schema;

                bytes = System.in.readAllBytes();

                // Decode the Python-produced payload...
                GlueSchemaRegistryKafkaDeserializer deserializer = new GlueSchemaRegistryKafkaDeserializer(configs);
                record = (GenericRecord) deserializer.deserialize("test", bytes);
                schema = record.getSchema();

                // ...and re-encode it with the Java client so the Python
                // side can verify it can read Java-produced bytes.
                GlueSchemaRegistryKafkaSerializer serializer = new GlueSchemaRegistryKafkaSerializer(configs);
                bytes = serializer.serialize("test", record);

                System.out.write(bytes, 0, bytes.length);
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        } else if (dataFormat.equals("JSON")) {
            configs.put(AWSSchemaRegistryConstants.DATA_FORMAT, DataFormat.JSON.name());
            try {
                byte[] bytes;
                Object record;
                Schema schema; // NOTE(review): declared but unused in this branch

                bytes = System.in.readAllBytes();

                GlueSchemaRegistryKafkaDeserializer deserializer = new GlueSchemaRegistryKafkaDeserializer(configs);
                record = deserializer.deserialize("test", bytes);

                GlueSchemaRegistryKafkaSerializer serializer = new GlueSchemaRegistryKafkaSerializer(configs);
                bytes = serializer.serialize("test", record);

                System.out.write(bytes, 0, bytes.length);
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        } else {
            System.out.println("Only JSON or AVRO are acceptable data formats");
            System.exit(1);
        }
    }
}
SCHEMA = AvroSchema(f.read()) 25 | 26 | with open(os.path.join(os.path.dirname(__file__), 'user.json'), 'r') as f: 27 | JSON_SCHEMA = JsonSchema(f.read()) 28 | 29 | 30 | def _topic_name_schema_type_name_strategy(topic, is_key, schema): 31 | return f"{topic}-{'key' if is_key else 'value'}-{schema.data_format}" 32 | 33 | 34 | @pytest.mark.parametrize("schema", [SCHEMA, JSON_SCHEMA]) 35 | def test_interop_with_java_library(glue_client, registry, 36 | boto_session, schema): 37 | client = SchemaRegistryClient(glue_client, registry_name=registry) 38 | serializer = KafkaSerializer( 39 | client, 40 | schema_naming_strategy=_topic_name_schema_type_name_strategy) 41 | deserializer = KafkaDeserializer(client) 42 | 43 | data = { 44 | 'name': 'John Doe', 45 | 'favorite_number': 6, 46 | 'favorite_color': 'red' 47 | } 48 | serialized: bytes = serializer.serialize( 49 | 'test', DataAndSchema(data, schema) 50 | ) 51 | 52 | if not os.path.exists(JAR_LOCATION): 53 | LOG.info('Test java jar not found at %s, trying to compile...', 54 | JAR_LOCATION) 55 | compile_java() 56 | credentials = boto_session.get_credentials() 57 | proc = subprocess.run( 58 | ['java', '-jar', JAR_LOCATION], 59 | input=serialized, 60 | capture_output=True, 61 | env={ 62 | 'DATA_FORMAT': schema.data_format, 63 | 'AWS_ACCESS_KEY_ID': credentials.access_key, 64 | 'AWS_SECRET_ACCESS_KEY': credentials.secret_key, 65 | 'AWS_SESSION_TOKEN': credentials.token, 66 | 'AWS_REGION': boto_session.region_name, 67 | 'REGISTRY_NAME': registry, 68 | 'SCHEMA_NAME': _topic_name_schema_type_name_strategy( 69 | "test", False, schema) 70 | } 71 | ) 72 | print(proc.stderr) 73 | proc.check_returncode() 74 | deserialized = deserializer.deserialize('test', proc.stdout) 75 | assert deserialized 76 | assert deserialized.data == data 77 | assert deserialized.schema == schema 78 | 79 | 80 | def compile_java(): 81 | LOG.info('Finding mvn...') 82 | find_mvn_proc = subprocess.run(['which', 'mvn'], capture_output=True) 83 | if 
find_mvn_proc.returncode != 0: 84 | raise Exception('Cannot find an installation of maven to compile the' 85 | ' java test code. Compile manually or install mvn.') 86 | mvn = find_mvn_proc.stdout.decode('utf-8').strip() 87 | LOG.info('mvn found at %s', mvn) 88 | LOG.info('compiling...') 89 | compile_proc = subprocess.run( 90 | [mvn, 'clean', 'package'], 91 | cwd=JAVA_CODE_LOCATION 92 | ) 93 | compile_proc.check_returncode() 94 | -------------------------------------------------------------------------------- /tests/integration/java/user.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "aws_schema_registry.integrationtests", 3 | "type": "record", 4 | "name": "User", 5 | "fields": [ 6 | {"name": "name", "type": "string" }, 7 | {"name": "favorite_number", "type": ["int", "null"] }, 8 | {"name": "favorite_color", "type": ["string", "null"] } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/integration/java/user.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "type": "object", 4 | "properties": { 5 | "name": { 6 | "type": "string" 7 | }, 8 | "favorite_number": { 9 | "type": "integer" 10 | }, 11 | "favorite_color": { 12 | "type": "string" 13 | } 14 | }, 15 | "required": [ 16 | "name", 17 | "favorite_number", 18 | "favorite_color" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /tests/integration/kafka_test/README.md: -------------------------------------------------------------------------------- 1 | # Kafka integration tests 2 | 3 | Tests that the serializer and deserializer works correctly with a real Kafka cluster. 4 | 5 | Requires [Docker](https://www.docker.com/). Tested with Docker v20. 
6 | 7 | Run `docker compose -f tests/integration/kafka_test/docker-compose.yml up -d` before running these tests. Destroy the docker stack with `docker compose -f tests/integration/kafka_test/docker-compose.yml down`. 8 | -------------------------------------------------------------------------------- /tests/integration/kafka_test/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | zookeeper: 5 | image: 'public.ecr.aws/bitnami/zookeeper:latest' 6 | ports: 7 | - '2181:2182' 8 | environment: 9 | - ALLOW_ANONYMOUS_LOGIN=yes 10 | 11 | kafka: 12 | image: 'public.ecr.aws/bitnami/kafka:latest' 13 | ports: 14 | - '9092:9092' 15 | links: 16 | - zookeeper 17 | container_name: local_kafka 18 | environment: 19 | - KAFKA_BROKER_ID=1 20 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 21 | - KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:9092 22 | - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9092 23 | - ALLOW_PLAINTEXT_LISTENER=yes -------------------------------------------------------------------------------- /tests/integration/kafka_test/test_kafka_integration.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | 4 | from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer 5 | from kafka.admin import NewTopic 6 | import pytest 7 | 8 | from aws_schema_registry import DataAndSchema, SchemaRegistryClient 9 | from aws_schema_registry.avro import AvroSchema 10 | from aws_schema_registry.jsonschema import JsonSchema 11 | from aws_schema_registry.adapter.kafka import ( 12 | KafkaDeserializer, KafkaSerializer 13 | ) 14 | from aws_schema_registry.naming import (record_name_strategy, 15 | topic_name_strategy) 16 | 17 | BOOTSTRAP_STRING = '127.0.0.1:9092' 18 | 19 | TOPIC_PREFIX = 'SchemaRegistryTests' 20 | NUMBER_OF_PARTITIONS = 1 21 | REPLICATION_FACTOR = 1 22 | 23 | DATE = 
datetime.utcnow().strftime('%y-%m-%d-%H-%M') 24 | 25 | with open(os.path.join(os.path.dirname(__file__), 'user.v1.avsc'), 'r') as f: 26 | SCHEMA_V1 = AvroSchema(f.read()) 27 | with open(os.path.join(os.path.dirname(__file__), 'user.v2.avsc'), 'r') as f: 28 | SCHEMA_V2 = AvroSchema(f.read()) 29 | with open(os.path.join(os.path.dirname(__file__), 'user.json'), 'r') as f: 30 | SCHEMA_JSON = JsonSchema(f.read()) 31 | 32 | PRODUCER_PROPERTIES = { 33 | 'bootstrap_servers': BOOTSTRAP_STRING, 34 | 'acks': 'all', 35 | 'retries': 0, 36 | 'batch_size': 16384, 37 | 'linger_ms': 1, 38 | 'buffer_memory': 33554432, 39 | 'request_timeout_ms': 1000 40 | } 41 | 42 | CONSUMER_PROPERTIES = { 43 | 'bootstrap_servers': BOOTSTRAP_STRING, 44 | 'auto_offset_reset': 'earliest', 45 | 'enable_auto_commit': False 46 | } 47 | 48 | 49 | @pytest.fixture(scope='session') 50 | def topic(): 51 | """The topic to use for testing. Name is partially random.""" 52 | name = f'{TOPIC_PREFIX}-{DATE}' 53 | admin_client = KafkaAdminClient(bootstrap_servers=BOOTSTRAP_STRING) 54 | print('creating topic %s...' % name) 55 | admin_client.create_topics([ 56 | NewTopic(name, NUMBER_OF_PARTITIONS, REPLICATION_FACTOR) 57 | ]) 58 | yield name 59 | print('deleting topic %s...' 
% name) 60 | admin_client.delete_topics([name]) 61 | 62 | 63 | def test_produce_consume_with_ser_de_schema_registry( 64 | glue_client, topic, registry 65 | ): 66 | client = SchemaRegistryClient( 67 | glue_client, registry_name=registry 68 | ) 69 | serializer = KafkaSerializer( 70 | client, schema_naming_strategy=record_name_strategy 71 | ) 72 | 73 | # jsonschema has no fqn, so we use topic_name_strategy for it 74 | # (which also requires a separate producer) 75 | json_serializer = KafkaSerializer( 76 | client, schema_naming_strategy=topic_name_strategy 77 | ) 78 | 79 | deserializer = KafkaDeserializer(client) 80 | 81 | producer = KafkaProducer( 82 | value_serializer=serializer, 83 | **PRODUCER_PROPERTIES 84 | ) 85 | 86 | json_producer = KafkaProducer( 87 | value_serializer=json_serializer, 88 | **PRODUCER_PROPERTIES 89 | ) 90 | 91 | data1 = { 92 | 'name': 'John Doe', 93 | 'favorite_number': 6, 94 | 'favorite_color': 'red' 95 | } 96 | producer.send(topic, DataAndSchema(data1, SCHEMA_V1)) 97 | 98 | data2 = { 99 | 'name': 'John Doe', 100 | 'favorite_number': 6, 101 | 'favorite_colors': ['red', 'blue'] 102 | } 103 | producer.send(topic, DataAndSchema(data2, SCHEMA_V2)) 104 | 105 | data3 = { 106 | 'name': 'John Doe', 107 | 'favorite_number': 6, 108 | 'favorite_colors': ['red', 'blue', "yello"] 109 | } 110 | json_producer.send(topic, DataAndSchema(data3, SCHEMA_JSON)) 111 | 112 | consumer = KafkaConsumer( 113 | topic, 114 | value_deserializer=deserializer, 115 | **CONSUMER_PROPERTIES 116 | ) 117 | batch = consumer.poll(timeout_ms=1000) 118 | assert len(batch) == 1 119 | messages = batch[list(batch.keys())[0]] 120 | assert len(messages) == 3 121 | assert messages[0].value.data == data1 122 | assert messages[0].value.schema == SCHEMA_V1 123 | assert messages[1].value.data == data2 124 | assert messages[1].value.schema == SCHEMA_V2 125 | assert messages[2].value.data == data3 126 | assert messages[2].value.schema == SCHEMA_JSON 127 | 
-------------------------------------------------------------------------------- /tests/integration/kafka_test/user.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "type": "object", 4 | "properties": { 5 | "name": { 6 | "type": "string" 7 | }, 8 | "favorite_number": { 9 | "type": "integer" 10 | }, 11 | "favorite_colors": { 12 | "type": "array", 13 | "items": { 14 | "type": "string" 15 | } 16 | } 17 | }, 18 | "required": [ 19 | "name", 20 | "favorite_number", 21 | "favorite_colors" 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /tests/integration/kafka_test/user.v1.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "aws_schema_registry.integrationtests", 3 | "type": "record", 4 | "name": "User", 5 | "fields": [ 6 | {"name": "name", "type": "string" }, 7 | {"name": "favorite_number", "type": ["int", "null"] }, 8 | {"name": "favorite_color", "type": ["string", "null"] } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /tests/integration/kafka_test/user.v2.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "aws_schema_registry.integrationtests", 3 | "type": "record", 4 | "name": "User", 5 | "fields": [ 6 | {"name": "name", "type": "string" }, 7 | {"name": "favorite_number", "type": ["int", "null"] }, 8 | {"name": "favorite_colors", "type": { 9 | "type": "array", "items": { 10 | "namespace": "com.amazonaws.services.schemaregistry.serializers.avro", 11 | "name": "color", 12 | "type": "string" 13 | } 14 | }, "default": []} 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/test_avro.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 
from aws_schema_registry import ValidationError 4 | from aws_schema_registry.avro import AvroSchema 5 | 6 | 7 | def test_fully_qualified_name(): 8 | s = AvroSchema('{"type": "record", "namespace": "foo", "name": "Bar"}') 9 | assert s.fqn == "foo.Bar" 10 | 11 | 12 | def test_primitive_name(): 13 | # fastavro does not fulfill this part of the Avro spec 14 | s = AvroSchema('{"type": "string"}') 15 | assert s.fqn == 'string' 16 | 17 | 18 | def test_readwrite(): 19 | s = AvroSchema(''' 20 | { 21 | "type": "record", 22 | "name": "JediMaster", 23 | "fields": [ 24 | {"name": "name", "type": "string" }, 25 | {"name": "age", "type": "int" } 26 | ] 27 | }''') 28 | d = { 29 | 'name': 'Yoda', 30 | 'age': 900 31 | } 32 | assert s.read(s.write(d)) == d 33 | 34 | 35 | def test_validation(): 36 | s = AvroSchema(''' 37 | { 38 | "type": "record", 39 | "name": "JediMaster", 40 | "fields": [ 41 | {"name": "name", "type": "string" }, 42 | {"name": "age", "type": "int" } 43 | ] 44 | }''') 45 | with pytest.raises(ValidationError) as e: 46 | s.validate({'name': 'Obi-Wan'}) 47 | assert 'name' not in str(e) 48 | assert 'age' in str(e) 49 | with pytest.raises(ValidationError) as e: 50 | s.validate({'name': 1, 'age': 2}) 51 | assert 'name' in str(e) 52 | assert 'age' not in str(e) 53 | s.validate({'name': 'Jar Jar', 'age': 42, 'sith': True}) 54 | -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | from uuid import UUID, uuid4 3 | 4 | import pytest 5 | 6 | from aws_schema_registry.client import ( 7 | SchemaRegistryClient, SCHEMA_NOT_FOUND_MSG, SCHEMA_VERSION_NOT_FOUND_MSG 8 | ) 9 | from aws_schema_registry.exception import SchemaRegistryException 10 | 11 | REGISTRY_NAME = 'user-topic' 12 | SCHEMA_NAME = 'User-Topic' 13 | JSON_SCHEMA_NAME = 'User-Topic-json' 14 | SCHEMA_ARN = 
f'arn:aws:glue:us-west-2:123:schema/{REGISTRY_NAME}/{SCHEMA_NAME}' 15 | SCHEMA_VERSION_ID = UUID('b7b4a7f0-9c96-4e4a-a687-fb5de9ef0c63') 16 | JSON_SCHEMA_VERSION_ID = UUID('98718bb6-ca2a-4ac6-b841-748cab68b1b1') 17 | SCHEMA_DEF = '{"name": "Test", "type": "record", "fields": []}' 18 | JSON_SCHEMA_DEF = """{ 19 | "$schema": "http://json-schema.org/draft-04/schema#", 20 | "type": "object", 21 | "properties": { 22 | "name": { 23 | "type": "string" 24 | }, 25 | "age": { 26 | "type": "integer" 27 | } 28 | }, 29 | "required": [ 30 | "name", 31 | "age" 32 | ] 33 | }""" 34 | 35 | METADATA = { 36 | 'event-source-1': 'topic1', 37 | 'event-source-2': 'topic2', 38 | 'event-source-3': 'topic3', 39 | 'event-source-4': 'topic4', 40 | 'event-source-5': 'topic5' 41 | } 42 | 43 | 44 | @pytest.fixture 45 | def glue_client(): 46 | return Mock() 47 | 48 | 49 | @pytest.fixture 50 | def client(glue_client): 51 | return SchemaRegistryClient( 52 | glue_client, 53 | registry_name=REGISTRY_NAME, 54 | wait_interval_seconds=0 55 | ) 56 | 57 | 58 | def test_get_schema_version(client, glue_client): 59 | glue_client.get_schema_version = Mock(return_value={ 60 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 61 | 'SchemaDefinition': SCHEMA_DEF, 62 | 'SchemaArn': SCHEMA_ARN, 63 | 'DataFormat': 'AVRO', 64 | 'VersionNumber': 123, 65 | 'Status': 'AVAILABLE' 66 | }) 67 | 68 | version = client.get_schema_version(SCHEMA_VERSION_ID) 69 | 70 | assert version.version_id == SCHEMA_VERSION_ID 71 | 72 | 73 | def test_get_schema_by_definition(client, glue_client): 74 | glue_client.get_schema_by_definition = Mock(return_value={ 75 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 76 | 'SchemaArn': SCHEMA_ARN, 77 | 'DataFormat': 'AVRO', 78 | 'Status': 'AVAILABLE' 79 | }) 80 | 81 | version = client.get_schema_by_definition(SCHEMA_DEF, SCHEMA_NAME) 82 | 83 | assert version.version_id == SCHEMA_VERSION_ID 84 | 85 | 86 | def test_get_or_register_schema_version_creates_schema(client, glue_client): 87 | 
glue_client.get_schema_by_definition = Mock( 88 | side_effect=SchemaRegistryException( 89 | Exception(SCHEMA_NOT_FOUND_MSG) 90 | )) 91 | glue_client.create_schema = Mock(return_value={ 92 | 'RegistryName': REGISTRY_NAME, 93 | 'SchemaName': SCHEMA_NAME, 94 | 'Description': '', 95 | 'DataFormat': 'AVRO', 96 | 'Compatibility': 'BACKWARD', 97 | 'SchemaStatus': 'AVAILABLE', 98 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 99 | 'SchemaVersionStatus': 'AVAILABLE' 100 | }) 101 | glue_client.get_schema_version = Mock(return_value={ 102 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 103 | 'SchemaDefinition': SCHEMA_DEF, 104 | 'DataFormat': 'AVRO', 105 | 'SchemaArn': SCHEMA_ARN, 106 | 'VersionNumber': 123, 107 | 'Status': 'AVAILABLE' 108 | }) 109 | 110 | version = client.get_or_register_schema_version( 111 | definition=SCHEMA_DEF, 112 | schema_name=SCHEMA_NAME, 113 | data_format='AVRO' 114 | ) 115 | 116 | assert version.version_id == SCHEMA_VERSION_ID 117 | 118 | glue_client.get_schema_version = Mock(return_value={ 119 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 120 | 'SchemaDefinition': SCHEMA_DEF, 121 | 'DataFormat': 'JSON', 122 | 'SchemaArn': SCHEMA_ARN, 123 | 'VersionNumber': 123, 124 | 'Status': 'AVAILABLE' 125 | }) 126 | 127 | version = client.get_or_register_schema_version( 128 | definition=SCHEMA_DEF, 129 | schema_name=SCHEMA_NAME, 130 | data_format='JSON' 131 | ) 132 | 133 | assert version.version_id == SCHEMA_VERSION_ID 134 | 135 | 136 | def test_get_or_register_schema_version_registers_version( 137 | client, glue_client 138 | ): 139 | glue_client.get_schema_by_definition = Mock( 140 | side_effect=SchemaRegistryException( 141 | Exception(SCHEMA_VERSION_NOT_FOUND_MSG) 142 | )) 143 | glue_client.register_schema_version = Mock(return_value={ 144 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 145 | 'VersionNumber': 123, 146 | 'Status': 'AVAILABLE' 147 | }) 148 | glue_client.get_schema_version = Mock(return_value={ 149 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 150 | 
'SchemaDefinition': SCHEMA_DEF, 151 | 'DataFormat': 'AVRO', 152 | 'SchemaArn': SCHEMA_ARN, 153 | 'VersionNumber': 123, 154 | 'Status': 'AVAILABLE' 155 | }) 156 | 157 | version = client.get_or_register_schema_version( 158 | definition=SCHEMA_DEF, 159 | schema_name=SCHEMA_NAME, 160 | data_format='AVRO' 161 | ) 162 | 163 | assert version.version_id == SCHEMA_VERSION_ID 164 | 165 | 166 | @pytest.mark.parametrize( 167 | "schema_def,schema_name,schema_ver_id", 168 | [(SCHEMA_DEF, SCHEMA_NAME, SCHEMA_VERSION_ID), 169 | (JSON_SCHEMA_DEF, JSON_SCHEMA_NAME, JSON_SCHEMA_VERSION_ID)]) 170 | def test_register_schema_version(client, glue_client, 171 | schema_name, schema_def, schema_ver_id): 172 | print(schema_name, schema_def, schema_ver_id) 173 | glue_client.register_schema_version = Mock(return_value={ 174 | 'SchemaVersionId': str(schema_ver_id), 175 | 'VersionNumber': 1, 176 | 'Status': 'AVAILABLE' 177 | }) 178 | 179 | version_id = client.register_schema_version(schema_def, schema_name) 180 | 181 | assert version_id == schema_ver_id 182 | 183 | 184 | def test_wait_for_schema_evolution_check_to_complete(client, glue_client): 185 | responses = [ 186 | { 187 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 188 | 'Status': 'PENDING' 189 | }, { 190 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 191 | 'Status': 'AVAILABLE' 192 | } 193 | ] 194 | glue_client.get_schema_version = Mock(side_effect=responses) 195 | 196 | client._wait_for_schema_evolution_check_to_complete(SCHEMA_VERSION_ID) 197 | 198 | 199 | def test_schema_evolution_timeout(client, glue_client): 200 | glue_client.get_schema_version = Mock(return_value={ 201 | 'SchemaVersionId': str(SCHEMA_VERSION_ID), 202 | 'Status': 'PENDING' 203 | }) 204 | 205 | with pytest.raises(SchemaRegistryException): 206 | client._wait_for_schema_evolution_check_to_complete(SCHEMA_VERSION_ID) 207 | 208 | assert glue_client.get_schema_version.call_count == 10 209 | 210 | 211 | def test_put_schema_version_metadata_succeeds(client, glue_client): 212 | 
glue_client.put_schema_version_metadata = Mock( 213 | side_effect=_make_put_schema_version_metadata_response 214 | ) 215 | 216 | client.put_schema_version_metadata(SCHEMA_VERSION_ID, METADATA) 217 | 218 | assert ( 219 | glue_client.put_schema_version_metadata.call_count 220 | == 221 | len(METADATA) 222 | ) 223 | for k, v in METADATA.items(): 224 | glue_client.put_schema_version_metadata.assert_any_call( 225 | SchemaVersionId=str(SCHEMA_VERSION_ID), 226 | MetadataKeyValue={ 227 | 'MetadataKey': k, 228 | 'MetadataValue': v 229 | } 230 | ) 231 | 232 | 233 | def _make_put_schema_version_metadata_response( 234 | SchemaVersionId: str, 235 | MetadataKeyValue: dict 236 | ): 237 | return { 238 | 'SchemaVersionId': SchemaVersionId, 239 | 'MetadataKey': MetadataKeyValue['MetadataKey'], 240 | 'MetadataValue': MetadataKeyValue['MetadataValue'] 241 | } 242 | 243 | 244 | @pytest.mark.parametrize("data_format", ["AVRO", "JSON"]) 245 | def test_create_schema(client, glue_client, data_format): 246 | schema_version_id = uuid4() 247 | glue_client.create_schema = Mock(return_value={ 248 | 'SchemaName': SCHEMA_NAME, 249 | 'DataFormat': data_format, 250 | 'SchemaVersionId': str(schema_version_id) 251 | }) 252 | 253 | version_id = client.create_schema( 254 | SCHEMA_NAME, data_format, SCHEMA_DEF 255 | ) 256 | 257 | assert version_id == schema_version_id 258 | -------------------------------------------------------------------------------- /tests/test_codec.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid4 2 | 3 | import pytest 4 | 5 | from aws_schema_registry.codec import ( 6 | CodecException, encode, decode, 7 | UnknownEncodingException 8 | ) 9 | 10 | 11 | @pytest.mark.parametrize('compression', [None, 'zlib']) 12 | def test_codec(compression): 13 | data = (1024).to_bytes(2, 'big') 14 | schema_version_id = uuid4() 15 | encoded = encode(data, schema_version_id, compression=compression) 16 | decoded = decode(encoded) 17 | assert 
decoded[0] == data 18 | assert decoded[1] == schema_version_id 19 | 20 | 21 | def test_unknown_leading_byte(): 22 | # leading byte '0' is what the Confluent Schema Registry client uses 23 | bytes_ = b'\x00\x05\x00\x00' 24 | with pytest.raises(UnknownEncodingException): 25 | decode(bytes_) 26 | 27 | 28 | def test_unknown_compression_byte(): 29 | bytes_ = b'\x00\x01\x00\x00' 30 | with pytest.raises(CodecException): 31 | decode(bytes_) 32 | -------------------------------------------------------------------------------- /tests/test_jsonschema.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pytest 3 | 4 | from aws_schema_registry import ValidationError 5 | from aws_schema_registry.jsonschema import JsonSchema 6 | 7 | 8 | def test_readwrite(): 9 | s = JsonSchema("""{ 10 | "$schema": "http://json-schema.org/draft-04/schema#", 11 | "type": "object", 12 | "properties": { 13 | "name": { 14 | "type": "string" 15 | }, 16 | "age": { 17 | "type": "integer" 18 | } 19 | }, 20 | "required": [ 21 | "name", 22 | "age" 23 | ] 24 | }""") 25 | 26 | d = { 27 | 'name': 'Yoda', 28 | 'age': 900 29 | } 30 | 31 | assert s.read(s.write(d)) == d 32 | 33 | 34 | def test_validation_during_read_write(): 35 | s = JsonSchema("""{ 36 | "$schema": "http://json-schema.org/draft-04/schema#", 37 | "type": "object", 38 | "properties": { 39 | "name": { 40 | "type": "string" 41 | }, 42 | "age": { 43 | "type": "integer" 44 | } 45 | }, 46 | "required": [ 47 | "name", 48 | "age" 49 | ] 50 | }""") 51 | 52 | with pytest.raises(ValidationError, match=re.escape( 53 | "data.name must be string" 54 | )): 55 | s.read(b'{"name": 1, "age": 2}') 56 | 57 | with pytest.raises(ValidationError, match=re.escape( 58 | "data.name must be string" 59 | )): 60 | s.write({"name": 1, "age": 2}) 61 | 62 | 63 | def test_validation(): 64 | s = JsonSchema("""{ 65 | "$schema": "http://json-schema.org/draft-04/schema#", 66 | "type": "object", 67 | "properties": { 68 | "name": 
{ 69 | "type": "string" 70 | }, 71 | "age": { 72 | "type": "integer" 73 | } 74 | }, 75 | "required": [ 76 | "name", 77 | "age" 78 | ] 79 | }""") 80 | 81 | with pytest.raises( 82 | ValidationError, 83 | # fastjsonschema>=2.18.0 reports only missing properties, so it will 84 | # exclude 'name' 85 | match=r"data must contain \[('name', )?'age'\] properties" 86 | ): 87 | s.validate({'name': 'Obi-Wan'}) 88 | with pytest.raises(ValidationError, match=re.escape( 89 | "data.name must be string" 90 | )): 91 | s.validate({'name': 1, 'age': 2}) 92 | 93 | s.validate({'name': 'Jar Jar', 'age': 42, 'sith': True}) 94 | 95 | s = JsonSchema("""{ 96 | "$schema": "http://json-schema.org/draft-04/schema#", 97 | "type": "object", 98 | "properties": { 99 | "name": { 100 | "type": "string" 101 | }, 102 | "age": { 103 | "type": "integer" 104 | } 105 | }, 106 | "required": [ 107 | "name", 108 | "age" 109 | ], 110 | "additionalProperties": false 111 | }""") 112 | 113 | with pytest.raises(ValidationError, match=re.escape( 114 | "data must not contain {'sith'} properties" 115 | )): 116 | s.validate({'name': 'Jar Jar', 'age': 42, 'sith': True}) 117 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = flake8,mypy,py38,py39,py310,py311 3 | 4 | [gh-actions] 5 | python = 6 | 3.8: py38 7 | 3.9: py39 8 | 3.10: py310 9 | 3.11: py311 10 | 11 | [testenv] 12 | deps = pytest 13 | commands = 14 | pip install . .[dev] .[kafka-python] 15 | python -m pytest --ignore tests/integration 16 | 17 | [testenv:flake8] 18 | deps = flake8 19 | commands = flake8 20 | 21 | [testenv:mypy] 22 | deps = mypy 23 | commands = 24 | pip install . 
.[dev] .[kafka-python] 25 | python -m mypy 26 | 27 | [pytest] 28 | python_files = test_* 29 | testpaths = tests 30 | 31 | [flake8] 32 | exclude = venv*,.venv*,env,.env,.tox,.toxenv,.git,__pycache__ 33 | --------------------------------------------------------------------------------