├── Chapter08
    ├── .gitignore
    ├── pulumi
    │   ├── Pulumi.contracts.yaml
    │   ├── Pulumi.yaml
    │   └── __main__.py
    ├── pulumi_introduction
    │   ├── Pulumi.introduction.yaml
    │   ├── Pulumi.yaml
    │   └── __main__.py
    ├── requirements.txt
    ├── parse_contract.py
    ├── validate_contract.py
    ├── generate-json-schema.py
    ├── contracts
    │   ├── Customer-v3-incompatible.yaml
    │   ├── Customer-invalid.yaml
    │   ├── Customer.yaml
    │   └── Customer-v2.yaml
    ├── schema_registry
    │   ├── create-schema.py
    │   ├── update-schema-v2.py
    │   ├── update-schema-v3-incompatible.py
    │   └── docker-compose.yml
    ├── validate-data.py
    ├── anonymize.py
    └── lib
    │   ├── data_contracts.py
    │   └── test_data_contracts.py
├── Chapter06
    ├── customer.proto
    ├── customer_avro.json
    ├── customer.yaml
    └── customer_jsonschema.json
├── Chapter05
    └── customer.yaml
├── Chapter09
    └── customer.yaml
├── Chapter07
    ├── customer.yaml
    └── data-platform-gateway-schema.yaml
├── LICENSE
├── Chapter03
    └── order_events.yaml
└── README.md


/Chapter08/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | venv/
3 | customer.schema.json
4 | 


--------------------------------------------------------------------------------
/Chapter08/pulumi/Pulumi.contracts.yaml:
--------------------------------------------------------------------------------
1 | config:
2 |   gcp:project: my-google-project-2468
3 | 


--------------------------------------------------------------------------------
/Chapter08/pulumi_introduction/Pulumi.introduction.yaml:
--------------------------------------------------------------------------------
1 | config:
2 |   gcp:project: my-google-project-2468
3 | 


--------------------------------------------------------------------------------
/Chapter06/customer.proto:
--------------------------------------------------------------------------------
1 | // Protobuf definition of the Customer record.
2 | // The syntax must be declared: without it protoc assumes proto2, which
3 | // rejects fields that carry no `optional`/`required`/`repeated` label.
4 | syntax = "proto3";
5 | 
6 | // A customer of our e-commerce website.
7 | message Customer {
8 |   string id       = 1;  // The unique identifier for the customer.
9 |   string name     = 2;  // The name of the customer.
10 |   string email    = 3;  // The email address of the customer.
11 |   string language = 4;  // The language preference of the customer.
12 | }


--------------------------------------------------------------------------------
/Chapter08/pulumi/Pulumi.yaml:
--------------------------------------------------------------------------------
1 | name: data-contracts
2 | runtime:
3 |   name: python
4 |   options:
5 |     virtualenv: ../venv
6 | description: Data Contracts book
7 | 


--------------------------------------------------------------------------------
/Chapter08/requirements.txt:
--------------------------------------------------------------------------------
1 | pyyaml==6.0
2 | pulumi>=3.0.0,<4.0.0
3 | pulumi-gcp>=6.0.0,<7.0.0
4 | jsonschema==4.17.3
5 | confluent-kafka==2.1.0
6 | requests==2.31.0
7 | 


--------------------------------------------------------------------------------
/Chapter08/pulumi_introduction/Pulumi.yaml:
--------------------------------------------------------------------------------
1 | name: data-contracts
2 | runtime:
3 |   name: python
4 |   options:
5 |     virtualenv: ../venv
6 | description: Data Contracts book example
7 | 


--------------------------------------------------------------------------------
/Chapter08/parse_contract.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import yaml
 4 | 
 5 | with open("contracts/Customer.yaml", "r") as stream:
 6 |     contract = yaml.safe_load(stream)
 7 | 
 8 | print(
 9 |     f'Successfully parsed the `{contract["name"]}` contract, which is owned by `{contract["owner"]}`.')
10 | 


--------------------------------------------------------------------------------
/Chapter08/validate_contract.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import yaml
 4 | 
 5 | with open("contracts/Customer-invalid.yaml", "r") as stream:
 6 |     contract = yaml.safe_load(stream)
 7 | 
 8 | if 'owner' not in contract:
 9 |     raise ValueError(f'`{contract["name"]}` contract does not have an owner')
10 | 
11 | print(
12 |     f'Successfully parsed the `{contract["name"]}` contract, which is owned by `{contract["owner"]}`.')
13 | 


--------------------------------------------------------------------------------
/Chapter05/customer.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | owner: product-team@data-contracts.com
 4 | version: 1
 5 | fields:
 6 |   name:
 7 |     type: string
 8 |     description: The name of the customer.
 9 |     personal_data: true
10 |     anonymization_strategy: hex
11 |   email:
12 |     type: string
13 |     description: The email address of the customer.
14 |     personal_data: true
15 |     anonymization_strategy: email
16 | 


--------------------------------------------------------------------------------
/Chapter08/generate-json-schema.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | sys.path.append("lib/")
 5 | 
 6 | import json
 7 | from data_contracts import DataContract
 8 | 
 9 | data_contract = DataContract("contracts/Customer.yaml")
10 | 
11 | output_file = 'customer.schema.json'
12 | with open(output_file, 'w') as writer:
13 |     writer.write(json.dumps(data_contract.json_schema(), indent=2))
14 |     print(f'Written JSON Schema of `{data_contract.name()}` data contract to `{output_file}`')
15 | 


--------------------------------------------------------------------------------
/Chapter09/customer.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | owner: product-team@data-contracts.com
 4 | version: 1
 5 | warehouse_path: app.customers
 6 | fields:
 7 |   name:
 8 |     type: string
 9 |     description: The name of the customer.
10 |     personal_data: true
11 |     anonymization_strategy: hex
12 |   email:
13 |     type: string
14 |     description: The email address of the customer.
15 |     personal_data: true
16 |     anonymization_strategy: email
17 | 


--------------------------------------------------------------------------------
/Chapter07/customer.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer record ingested from Salesforce.
 3 | owner: product-team@data-contracts.com
 4 | version: 2
 5 | warehouse_path: sales_data.salesforce.customers
 6 | backups:
 7 |   schedule: "@daily"
 8 |   expire: P60D
 9 | fields:
10 |   id:
11 |     type: string
12 |     description: The unique identifier for the record.
13 |   name:
14 |     type: string
15 |     description: The name of the customer.
16 |     personal_data: true
17 |     anonymization_strategy: hex
18 |   email:
19 |     type: string
20 |     description: The email address of the customer.
21 |     personal_data: true
22 |     anonymization_strategy: email
23 | 


--------------------------------------------------------------------------------
/Chapter08/contracts/Customer-v3-incompatible.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | owner: product-team@data-contracts.com
 4 | version: 1
 5 | fields:
 6 |   id:
 7 |     type: string
 8 |     description: The unique identifier for the customer.
 9 |     required: true
10 |   name:
11 |     type: string
12 |     description: The name of the customer.
13 |     required: true
14 |     anonymization_strategy: hex
15 |   language:
16 |     type: string
17 |     description: The language preference of the customer.
18 |     enum: [en, fr, es]
19 |   country:
20 |     type: string
21 |     description: The country the customer resides in.
22 | 


--------------------------------------------------------------------------------
/Chapter06/customer_avro.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "type": "record",
 3 |   "name": "Customer",
 4 |   "doc": "A customer of our e-commerce website",
 5 |   "fields": [{
 6 |       "name": "id",
 7 |       "type": "string",
 8 |       "doc": "The unique identifier for the customer."
 9 |     },
10 |     {
11 |       "name": "name",
12 |       "type": "string",
13 |       "doc": "The name of the customer."
14 |     },
15 |     {
16 |       "name": "email",
17 |       "type": "string",
18 |       "doc": "The email address of the customer."
19 |     },
20 |     {
21 |       "name": "language",
22 |       "type": "string",
23 |       "doc": "The language preference of the customer."
24 |     }
25 |   ]
26 | }


--------------------------------------------------------------------------------
/Chapter08/pulumi/__main__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("../lib/")
 3 | 
 4 | from data_contracts import DataContract
 5 | 
 6 | import pulumi
 7 | from pulumi_gcp import bigquery
 8 | 
 9 | data_contract = DataContract("../contracts/Customer.yaml")
10 | 
11 | dataset = bigquery.Dataset(
12 |     "dataProductsDataset",
13 |     dataset_id="data_products",
14 |     friendly_name="Data Products",
15 |     description="A dataset to hold our teams data products.",
16 | )
17 | 
18 | customer_table = bigquery.Table("customerTable",
19 |     dataset_id=dataset.dataset_id,
20 |     table_id=data_contract.name(),
21 |     deletion_protection=False,
22 |     schema=data_contract.bigquery_schema())
23 | 


--------------------------------------------------------------------------------
/Chapter08/contracts/Customer-invalid.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | version: 1
 4 | fields:
 5 |   id:
 6 |     type: string
 7 |     description: The unique identifier for the customer.
 8 |     required: true
 9 |   name:
10 |     type: string
11 |     description: The name of the customer.
12 |     required: true
13 |     anonymization_strategy: hex
14 |   email:
15 |     type: string
16 |     description: The email address of the customer.
17 |     pattern: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
18 |     required: true
19 |     anonymization_strategy: email
20 |   language:
21 |     type: string
22 |     description: The language preference of the customer.
23 |     enum: [en, fr, es]
24 | 


--------------------------------------------------------------------------------
/Chapter08/contracts/Customer.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | owner: product-team@data-contracts.com
 4 | version: 1
 5 | fields:
 6 |   id:
 7 |     type: string
 8 |     description: The unique identifier for the customer.
 9 |     required: true
10 |   name:
11 |     type: string
12 |     description: The name of the customer.
13 |     required: true
14 |     anonymization_strategy: hex
15 |   email:
16 |     type: string
17 |     description: The email address of the customer.
18 |     pattern: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
19 |     required: true
20 |     anonymization_strategy: email
21 |   language:
22 |     type: string
23 |     description: The language preference of the customer.
24 |     enum: [en, fr, es]
25 | 


--------------------------------------------------------------------------------
/Chapter08/pulumi_introduction/__main__.py:
--------------------------------------------------------------------------------
 1 | import pulumi
 2 | from pulumi_gcp import bigquery
 3 | 
 4 | default_dataset = bigquery.Dataset(
 5 |     "defaultDataset",
 6 |     dataset_id="pulumi_introduction",
 7 |     friendly_name="Pulumi Introduction",
 8 |     description="This is an example description",
 9 | )
10 | default_table = bigquery.Table(
11 |     "defaultTable",
12 |     dataset_id=default_dataset.dataset_id,
13 |     table_id="my_table",
14 |     deletion_protection=False,
15 |     schema="""[
16 |   {
17 |     "name": "id",
18 |     "type": "STRING",
19 |     "mode": "REQUIRED",
20 |     "description": "The ID"
21 |   },
22 |   {
23 |     "name": "state",
24 |     "type": "STRING",
25 |     "description": "State where the head office is located"
26 |   }
27 | ]
28 | """)
29 | 


--------------------------------------------------------------------------------
/Chapter08/schema_registry/create-schema.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # This script uses the Confluent Python libraries to register our
 4 | # `Customer.yaml` schema to our schema registry.
 5 | 
 6 | import sys
 7 | sys.path.append("../lib/")
 8 | 
 9 | from data_contracts import DataContract
10 | from confluent_kafka.schema_registry import SchemaRegistryClient
11 | from confluent_kafka.schema_registry.schema_registry_client import Schema
12 | import json
13 | 
14 | data_contract = DataContract("../contracts/Customer.yaml")
15 | 
16 | client = SchemaRegistryClient({"url": "http://localhost:8081"})
17 | schema = Schema(json.dumps(data_contract.json_schema()), schema_type='JSON')
18 | result = client.register_schema(data_contract.name(), schema)
19 | 
20 | print(f"Registered schema `{data_contract.name()}` with ID of {result}")
21 | 


--------------------------------------------------------------------------------
/Chapter08/schema_registry/update-schema-v2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # This script uses the Confluent Python libraries to register our
 4 | # `Customer-v2.yaml` schema to our schema registry.
 5 | 
 6 | import sys
 7 | sys.path.append("../lib/")
 8 | 
 9 | from data_contracts import DataContract
10 | from confluent_kafka.schema_registry import SchemaRegistryClient
11 | from confluent_kafka.schema_registry.schema_registry_client import Schema
12 | import json
13 | 
14 | data_contract = DataContract("../contracts/Customer-v2.yaml")
15 | 
16 | client = SchemaRegistryClient({"url": "http://localhost:8081"})
17 | schema = Schema(json.dumps(data_contract.json_schema()), schema_type='JSON')
18 | result = client.register_schema(data_contract.name(), schema)
19 | 
20 | print(f"Updated schema `{data_contract.name()}` with ID of {result}")
21 | 


--------------------------------------------------------------------------------
/Chapter06/customer.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | owner: product-team@data-contracts.com
 4 | version: 2
 5 | fields:
 6 |   id:
 7 |     type: string
 8 |     description: The unique identifier for the customer.
 9 |     required: true
10 |     primary_key: true
11 |   name:
12 |     type: string
13 |     description: The name of the customer.
14 |     required: true
15 |     personal_data: true
16 |     anonymization_strategy: hex
17 |   email:
18 |     type: string
19 |     description: The email address of the customer.
20 |     pattern: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
21 |     required: true
22 |     personal_data: true
23 |     anonymization_strategy: email
24 |   language:
25 |     type: string
26 |     description: The language preference of the customer.
27 |     enum: [en, fr, es]


--------------------------------------------------------------------------------
/Chapter08/contracts/Customer-v2.yaml:
--------------------------------------------------------------------------------
 1 | name: Customer
 2 | description: A customer of our e-commerce website.
 3 | owner: product-team@data-contracts.com
 4 | version: 1
 5 | fields:
 6 |   id:
 7 |     type: string
 8 |     description: The unique identifier for the customer.
 9 |     required: true
10 |   name:
11 |     type: string
12 |     description: The name of the customer.
13 |     required: true
14 |     anonymization_strategy: hex
15 |   email:
16 |     type: string
17 |     description: The email address of the customer.
18 |     pattern: "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
19 |     required: true
20 |     anonymization_strategy: email
21 |   language:
22 |     type: string
23 |     description: The language preference of the customer.
24 |     enum: [en, fr, es]
25 |   country:
26 |     type: string
27 |     description: The country the customer resides in.
28 | 


--------------------------------------------------------------------------------
/Chapter06/customer_jsonschema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "$schema": "http://json-schema.org/draft-07/schema#",
 3 |   "type": "object",
 4 |   "title": "customer",
 5 |   "description": "A customer of our e-commerce website",
 6 |   "properties": {
 7 |     "id": {
 8 |       "type": "string",
 9 |       "description": "The unique identifier for the customer."
10 |     },
11 |     "name": {
12 |       "type": "string",
13 |       "description": "The name of the customer."
14 |     },
15 |     "email": {
16 |       "type": "string",
17 |       "description": "The email address of the customer.",
18 |       "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
19 |     },
20 |     "language": {
21 |       "type": "string",
22 |       "description": "The language preference of the customer.",
23 |       "enum": ["en", "fr", "es"]
24 |     }
25 |   },
26 |   "required": ["id", "name", "email", "language"]
27 | }


--------------------------------------------------------------------------------
/Chapter07/data-platform-gateway-schema.yaml:
--------------------------------------------------------------------------------
 1 | name: transaction_fee_calculated
 2 | namespace: core_banking
 3 | doc: Records the calculation of a transaction fee that would be collected from a merchant.
 4 | primary_keys:
 5 |   - event_id
 6 | fields:
 7 |   - name: event_id
 8 |     doc: Unique deterministic ID of the event.
 9 |     type: string
10 |     required: true
11 |   - name: created_at
12 |     doc: RFC 3339 time at which the event was emitted.
13 |     type: timestamp
14 |     required: true
15 |   - name: payment_currency
16 |     doc: >
17 |       ISO 4217 currency code of the payment which this fee is being charged for.
18 |       Examples include AUD, CAD, DKK, EUR, GBP, NZD, SEK, USD
19 |     type: string
20 |   - name: amount
21 |     doc: The amount (in minor currency unit) of the calculated fee.
22 |     type: long
23 |   - name: net_amount
24 |     doc: The amount (in minor currency unit) of the calculated fee, minus tax
25 |     type: long
26 | 


--------------------------------------------------------------------------------
/Chapter08/schema_registry/update-schema-v3-incompatible.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # This script uses the Confluent Python libraries to attempt to register our
 4 | # `Customer-v3-incompatible.yaml` schema to our schema registry. It will fail
 5 | # due to the compatibility checks performed by the schema registry and throw an
 6 | # exception.
 7 | 
 8 | import sys
 9 | sys.path.append("../lib/")
10 | 
11 | from data_contracts import DataContract
12 | from confluent_kafka.schema_registry import SchemaRegistryClient
13 | from confluent_kafka.schema_registry.schema_registry_client import Schema
14 | import json
15 | 
16 | data_contract = DataContract("../contracts/Customer-v3-incompatible.yaml")
17 | 
18 | client = SchemaRegistryClient({"url": "http://localhost:8081"})
19 | schema = Schema(json.dumps(data_contract.json_schema()), schema_type='JSON')
20 | result = client.register_schema(data_contract.name(), schema)
21 | 
22 | print(f"Updated schema `{data_contract.name()}` with ID of {result}")
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Packt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Chapter08/validate-data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | sys.path.append("lib/")
 5 | 
 6 | from data_contracts import DataContract
 7 | from jsonschema import validate
 8 | from jsonschema.exceptions import ValidationError
 9 | 
10 | events = [
11 |     # Valid events
12 |     {"id": "DC12", "name" : "Andrew", "email" : "andrew@data-contracts.com", "language": "en"},
13 |     {"id": "DC13", "name" : "Deborah", "email" : "deborah@data-contracts.com"},
14 |     # Missing email, which is a required field
15 |     {"id": "DC14", "name" : "Bukayo", "language": "en"},
16 |     # Email does not pass regex validation
17 |     {"id": "DC15", "name" : "Bukayo", "email" : "bukayo", "language": "en"},
18 |     # `nl` is not a valid language code
19 |     {"id": "DC16", "name" : "Vivianne", "email" : "vivianne@data-contracts.com", "language": "nl"},
20 | ]
21 | 
22 | data_contract = DataContract("contracts/Customer.yaml")
23 | 
24 | for event in events:
25 |     try:
26 |         validate(event, data_contract.json_schema())
27 |         print(f"✅ Successfully validated event {event}")
28 |     except ValidationError as e:
29 |         print(f"❗ Error validating event {event}\n{e}")
30 | 


--------------------------------------------------------------------------------
/Chapter03/order_events.yaml:
--------------------------------------------------------------------------------
 1 | description: An event generated when an order is created
 2 | owner: product-team@data-contracts.com
 3 | version: 1
 4 | slos:
 5 |   completeness_percent: 100
 6 |   timeliness_mins: 60
 7 |   availability_percent: 95
 8 | lakehouse_path: order_events
 9 | fields:
10 |   id:
11 |     type: string
12 |     description: The unique identifier for the order
13 |   created_at:
14 |     type: timestamp
15 |     description: The date and time the order was created
16 |   items:
17 |     type: array
18 |     fields:
19 |       product_id:
20 |         type: string
21 |         description: The unique identifier for the product
22 |       price:
23 |         type: float
24 |         description: The price of the product, in cents
25 |       quantity:
26 |         type: integer
27 |         description: The amount of this product ordered
28 |       discount_id:
29 |         type: string
30 |         description: The unique identifier for the discount
31 |       discount_percent:
32 |         type: float
33 |         description: The percentage discount applied to this item, represented as a number between 0 and 1
34 |   order_total:
35 |     type: float
36 |     description: The total cost of the order, in cents
37 | 


--------------------------------------------------------------------------------
/Chapter08/anonymize.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | sys.path.append("lib/")
 5 | 
 6 | from data_contracts import DataContract
 7 | 
 8 | # Anonymize the event based on the rules specified in the data contract
 9 | def anonymize(event: dict, data_contract: DataContract):
10 |     anonymized = event.copy()
11 |     for name, metadata in data_contract.fields().items():
12 |         if 'anonymization_strategy' in metadata:
13 |             if metadata['anonymization_strategy'] == 'email':
14 |                 anonymized[name] = f"anonymized+{event['id']}@data-contracts.com"
15 |             if metadata['anonymization_strategy'] == 'hex':
16 |                 anonymized[name] = event[name].encode("utf-8").hex()
17 | 
18 |     return anonymized
19 | 
20 | events = [
21 |     {"id": "DC12", "name" : "Andrew", "email" : "andrew@data-contracts.com", "language": "en"},
22 |     {"id": "DC13", "name" : "Deborah", "email" : "deborah@data-contracts.com"},
23 |     {"id": "DC14", "name" : "Bukayo", "email" : "bukayo@data-contracts.com", "language": "en"},
24 | ]
25 | 
26 | data_contract = DataContract("contracts/Customer.yaml")
27 | 
28 | for event in events:
29 |     anonymized = anonymize(event, data_contract)
30 |     print(f"Anonymizing:\t{event}\n\t\t{anonymized}")
31 | 


--------------------------------------------------------------------------------
/Chapter08/schema_registry/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: '2'
 3 | services:
 4 |   zookeeper:
 5 |     image: confluentinc/cp-zookeeper:7.3.3
 6 |     hostname: zookeeper
 7 |     container_name: zookeeper
 8 |     ports:
 9 |       - "2181:2181"
10 |     environment:
11 |       ZOOKEEPER_CLIENT_PORT: 2181
12 |       ZOOKEEPER_TICK_TIME: 2000
13 | 
14 |   broker:
15 |     image: confluentinc/cp-server:7.3.3
16 |     hostname: broker
17 |     container_name: broker
18 |     depends_on:
19 |       - zookeeper
20 |     ports:
21 |       - "9092:9092"
22 |       - "9101:9101"
23 |     environment:
24 |       KAFKA_BROKER_ID: 1
25 |       KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
26 |       KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
27 |       KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
28 |       KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter
29 |       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
30 |       KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
31 |       KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1
32 |       KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1
33 |       KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
34 |       KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
35 |       KAFKA_JMX_PORT: 9101
36 |       KAFKA_JMX_HOSTNAME: localhost
37 |       KAFKA_CONFLUENT_SCHEMA_REGISTRY_URL: http://schema-registry:8081
38 |       CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: 'broker:29092'
39 |       CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
40 |       CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous'
41 | 
42 |   schema-registry:
43 |     image: confluentinc/cp-schema-registry:7.3.3
44 |     hostname: schema-registry
45 |     container_name: schema-registry
46 |     depends_on:
47 |       - broker
48 |     ports:
49 |       - "8081:8081"
50 |     environment:
51 |       SCHEMA_REGISTRY_HOST_NAME: schema-registry
52 |       SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092'
53 |       SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081
54 |       SCHEMA_REGISTRY_SCHEMA_COMPATIBILITY_LEVEL: FORWARD
55 | 


--------------------------------------------------------------------------------
/Chapter08/lib/data_contracts.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 | import json
 3 | 
 4 | 
 5 | class DataContract:
 6 |     '''
 7 |     An object containing a data contract.
 8 | 
 9 |     Args:
10 |         path (str): The path to the data contract YAML file
11 |     '''
12 |     def __init__(self, path: str):
13 |         with open(path, "r") as stream:
14 |             self.contract = yaml.safe_load(stream)
15 |         
16 |         if 'owner' not in self.contract:
17 |             raise ValueError(f'`{self.name()}` contract does not have an owner')
18 | 
19 |     def name(self) -> str:
20 |         '''
21 |         Returns:
22 |             The name of the data contract.
23 |         '''
24 |         return self.contract['name']
25 |     
26 |     def fields(self) -> dict:
27 |         '''
28 |         Returns:
29 |             The fields that make up the schema.
30 |         '''
31 |         return self.contract['fields']
32 | 
33 |     def bigquery_schema(self) -> str:
34 |         '''
35 |         Generate a BigQuery schema from the data contract.
36 | 
37 |         Returns:
38 |             The BigQuery schema as JSON
39 |         '''
40 |         bq_schema = []
41 |         for name, metadata in self.fields().items():
42 |             schema = {
43 |                 'name': name,
44 |                 'type': metadata['type'].upper(),
45 |                 'description': metadata['description']
46 |             }
47 |             if 'required' in metadata and metadata['required'] is True:
48 |                 schema['mode'] = 'REQUIRED'
49 |             bq_schema.append(schema)
50 | 
51 |         return json.dumps(bq_schema, indent=2)
52 | 
53 |     def json_schema(self) -> dict:
54 |         '''
55 |         Generate a JSON Schema from the data contract.
56 | 
57 |         Returns:
58 |             The JSON Schema
59 |         '''
60 |         properties = {}
61 |         required = []
62 |         for name, metadata in self.fields().items():
63 |             properties[name] = {
64 |                 'description': metadata['description'],
65 |                 'type': metadata['type']
66 |             }
67 |             if 'enum' in metadata:
68 |                 properties[name]['enum'] = metadata['enum']
69 |             if 'pattern' in metadata:
70 |                 properties[name]['pattern'] = metadata['pattern']
71 | 
72 |             if 'required' in metadata and metadata['required'] is True:
73 |                 required.append(name)
74 | 
75 |         schema = {
76 |             "$schema": "https://json-schema.org/draft/2020-12/schema",
77 |             "title": self.name(),
78 |             "description": self.contract['description'],
79 |             "type": "object",
80 |             "properties": properties,
81 |             "required": required,
82 |             "additionalProperties": True
83 |         }
84 |         return schema
85 | 


--------------------------------------------------------------------------------
/Chapter08/lib/test_data_contracts.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from data_contracts import DataContract
 4 | 
 5 | 
 6 | class TestDataContracts(unittest.TestCase):
 7 |     maxDiff = None
 8 | 
 9 |     def test_invalid_contract(self):
10 |         with self.assertRaises(ValueError):
11 |             DataContract("../contracts/Customer-invalid.yaml")
12 | 
13 |     def test_name(self):
14 |         data_contract = DataContract("../contracts/Customer.yaml")
15 |         self.assertEqual(data_contract.name(), 'Customer')
16 | 
17 |     def test_bigquery_schema(self):
18 |         data_contract = DataContract("../contracts/Customer.yaml")
19 | 
20 |         expected = """[
21 |   {
22 |     "name": "id",
23 |     "type": "STRING",
24 |     "description": "The unique identifier for the customer.",
25 |     "mode": "REQUIRED"
26 |   },
27 |   {
28 |     "name": "name",
29 |     "type": "STRING",
30 |     "description": "The name of the customer.",
31 |     "mode": "REQUIRED"
32 |   },
33 |   {
34 |     "name": "email",
35 |     "type": "STRING",
36 |     "description": "The email address of the customer.",
37 |     "mode": "REQUIRED"
38 |   },
39 |   {
40 |     "name": "language",
41 |     "type": "STRING",
42 |     "description": "The language preference of the customer."
43 |   }
44 | ]"""
45 |         actual = data_contract.bigquery_schema()
46 |         self.assertEqual(actual, expected)
47 | 
48 |     def test_json_schema(self):
49 |         data_contract = DataContract("../contracts/Customer.yaml")
50 | 
51 |         expected = {
52 |             "$schema": "https://json-schema.org/draft/2020-12/schema",
53 |             "title": "Customer",
54 |             "description": "A customer of our e-commerce website.",
55 |             "type": "object",
56 |             "properties": {
57 |                 "id": {
58 |                     "description": "The unique identifier for the customer.",
59 |                     "type": "string"
60 |                 },
61 |                 "name": {
62 |                     "description": "The name of the customer.",
63 |                     "type": "string"
64 |                 },
65 |                 "email": {
66 |                     "description": "The email address of the customer.",
67 |                     "type": "string",
68 |                     "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
69 |                 },
70 |                 "language": {
71 |                     "description": "The language preference of the customer.",
72 |                     "type": "string",
73 |                     "enum": [
74 |                         "en",
75 |                         "fr",
76 |                         "es"
77 |                     ]
78 |                 }
79 |             },
80 |             "required": [
81 |                 "id",
82 |                 "name",
83 |                 "email"
84 |             ],
85 |             "additionalProperties": True
86 |         }
87 |         actual = data_contract.json_schema()
88 |         self.assertEqual(actual, expected)
89 | 
90 | 
91 | if __name__ == '__main__':  # Only when executed directly, not on import
92 |     unittest.main()
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center"><a href="https://packt.link/mlsumgh"><img src="https://static.packt-cdn.com/assets/images/ML Summit Banner v3 1200x627.png" alt="Machine Learning Summit 2025"/></a></p>
  2 | 
  3 | ## Machine Learning Summit 2025
  4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges**
  5 | 
  6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of:
  7 | - **Agentic and Generative AI**
  8 | - **Applied Machine Learning in the Real World**
  9 | - **ML Engineering and Optimization**
 10 | 
 11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh)
 12 | 
 13 | ---
 14 | 
 15 | ## Join Our Newsletters 📬
 16 | 
 17 | ### DataPro  
 18 | *The future of AI is unfolding. Don’t fall behind.*
 19 | 
 20 | <p><a href="https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes"><img src="https://static.packt-cdn.com/assets/images/DataPro NL QR Code.png" alt="DataPro QR" width="150"/></a></p>
 21 | 
 22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers.  
 23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast.
 24 | 
 25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat.
 26 | 
 27 | ---
 28 | 
 29 | ### BIPro  
 30 | *Business runs on data. Make sure yours tells the right story.*
 31 | 
 32 | <p><a href="https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes"><img src="https://static.packt-cdn.com/assets/images/BIPro NL QR Code.png" alt="BIPro QR" width="150"/></a></p>
 33 | 
 34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders.  
 35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**.
 36 | 
 37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing.
 38 | 
 39 | # Driving Data Quality with Data Contracts
 40 | 
 41 | <a href="https://www.amazon.com/Driving-Data-Quality-Contracts-comprehensive/dp/1837635005/ref=tmm_pap_swatch_0?_encoding=UTF8&qid=&sr=&utm_source=github&utm_medium=repository&utm_campaign=9781801810135"><img src="https://m.media-amazon.com/images/I/81+QpXfjD9L._SL1500_.jpg" alt="Driving Data Quality with Data Contracts book cover" height="256px" align="right"></a>
 42 | 
 43 | This is the code repository for [Driving Data Quality with Data Contracts](https://www.amazon.com/Driving-Data-Quality-Contracts-comprehensive/dp/1837635005/ref=tmm_pap_swatch_0?_encoding=UTF8&qid=&sr=&utm_source=github&utm_medium=repository&utm_campaign=9781801810135), published by Packt.
 44 | 
 45 | **A comprehensive guide to building reliable, trusted, and effective data platforms**
 46 | 
 47 | ## What is this book about?
 48 | 
 49 | This book covers the following exciting features:
 50 | * Gain insights into the intricacies and shortcomings of today’s data architectures
 51 | * Understand exactly how data contracts can solve prevalent data challenges
 52 | * Drive a fundamental transformation of your data culture by implementing data contracts
 53 | * Discover what goes into a data contract and why it’s important
 54 | * Design a modern data architecture that leverages the power of data contracts
 55 | * Explore sample implementations to get practical knowledge of using data contracts
 56 | * Embrace best practices for the successful deployment of data contracts
 57 | 
 58 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1837635005) today!
 59 | 
 60 | <a href="https://www.packtpub.com/?utm_source=github&utm_medium=banner&utm_campaign=GitHubBanner"><img src="https://raw.githubusercontent.com/PacktPublishing/GitHub/master/GitHub.png" 
 61 | alt="https://www.packtpub.com/" border="5" /></a>
 62 | 
 63 | ## Instructions and Navigation
 64 | All of the code is organized into folders. For example, Chapter08.
 65 | 
 66 | The code will look like the following:
 67 | ```
 68 | import pulumi 
 69 | from pulumi_gcp import bigquery 
 70 |       
 71 | default_dataset = bigquery.Dataset(  
 72 |    "defaultDataset", 
 73 |     dataset_id="pulumi_introduction", 
 74 |     friendly_name="Pulumi Introduction", 
 75 |     description="This is an example description", 
 76 | ) 
 77 | ```
 78 | 
 79 | **Following is what you need for this book:**
 80 | If you’re a data engineer, data leader, architect, or practitioner thinking about your data architecture and looking to design one that enables your organization to get the most value from your data, this book is for you. Additionally, staff engineers, product managers, and software engineering leaders and executives will also find valuable insights.
 81 | 
 82 | With the following software and hardware list you can run all code files present in the book (Chapter 08).
 83 | ### Software and Hardware List
 84 | | Chapter | Software required | OS required |
 85 | | -------- | ------------------------------------ | ----------------------------------- |
 86 | | 8 | Python 3.9.12 | Windows, macOS, or Linux |
 87 | | 8 | Docker | Windows, macOS, or Linux |
 88 | | 8 | Google Cloud Platform | Windows, macOS, or Linux |
 89 | 
 90 | 
 91 | ### Related products
 92 | * Data Modeling with Snowflake [[Packt]](https://www.packtpub.com/product/data-modeling-with-snowflake/9781837634453?utm_source=github&utm_medium=repository&utm_campaign=9781837634453) [[Amazon]](https://www.amazon.com/dp/1837634459)
 93 | 
 94 | * Data Literacy in Practice  [[Packt]](https://www.packtpub.com/product/data-literacy-in-practice/9781803246758?utm_source=github&utm_medium=repository&utm_campaign=9781803246758) [[Amazon]](https://www.amazon.com/dp/1803246758)
 95 | 
 96 | ## Get to Know the Author
 97 | **Andrew Jones**
 98 | is a principal engineer at GoCardless, one of Europe’s leading fintechs. He has over 15 years of experience in the industry, with the first half primarily as a software engineer, before he moved into the data infrastructure and data engineering space. Joining GoCardless as its first data engineer, he led his team to build their data platform from scratch. After initially following a typical data architecture and getting frustrated with facing the same old challenges he’d faced for years, he started thinking there must be a better way, which led to him coining and defining the ideas around data contracts. Andrew is a regular speaker and writer, and he is passionate about helping organizations get maximum value from data.
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------