├── v1
│   └── .keep
├── v2
│   └── .keep
├── datacontractcli
│   └── .keep
├── .python-version
├── constraint.txt
├── .gitignore
├── requirements.txt
├── drop-bad-data.sql
├── connection.yml
├── bad-data.sql
├── initdb.d
│   └── init.sql
├── README.md
├── .devcontainer
│   ├── devcontainer.json
│   └── Dockerfile
├── data.sql
├── docker-compose.yaml
└── lib
    └── data_contract.py

/v1/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/v2/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/datacontractcli/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
3.11.9
--------------------------------------------------------------------------------
/constraint.txt:
--------------------------------------------------------------------------------
cython<3
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
venv
schema.hcl
*.proto
.soda
datacontractcli/customers.yaml
datacontractcli/catalog/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pyyaml==6.0.1
soda-core==3.3.5
soda-core-contracts==3.3.5
soda-core-postgres==3.3.5
uv==0.8.15
--------------------------------------------------------------------------------
/drop-bad-data.sql:
--------------------------------------------------------------------------------
DELETE FROM customers WHERE id = 'C16';
DELETE FROM customers WHERE id = 'C01' AND created = '2022-08-25 18:02:52';
--------------------------------------------------------------------------------
/connection.yml:
--------------------------------------------------------------------------------
name: local_postgres
type: postgres
connection:
  host: postgres
  database: public
  username: postgres
  password: secret
--------------------------------------------------------------------------------
/bad-data.sql:
--------------------------------------------------------------------------------
INSERT INTO customers(id, size, created, distance)
VALUES
  ('C16', 'XML', '2022-08-25 18:02:52', 74),
  ('C01', 'M', '2022-08-25 18:02:52', 74);
--------------------------------------------------------------------------------
/initdb.d/init.sql:
--------------------------------------------------------------------------------
create user mark with encrypted password 'secret';
create role administrator;
create role marketing;
grant marketing to mark;
grant administrator to postgres;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Implementing Data Contracts

This is the companion code for my [Implementing Data Contracts course](https://resources.andrew-jones.com/b/79NPq).

It provides a foundation that you will build upon in the course.

For more information on the course, data contracts, or me, feel free to [contact me](https://resources.andrew-jones.com/contact)!
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
{
  "name": "Data Contract Workshop",
  "dockerComposeFile": [
    "../docker-compose.yaml"
  ],
  "service": "dev",
  "workspaceFolder": "/workspace",
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.python",
        "redhat.vscode-yaml",
        "ms-vscode.live-server"
      ],
      "settings": {
        "python.defaultInterpreterPath": "/usr/local/bin/python"
      }
    }
  }
}
--------------------------------------------------------------------------------
/data.sql:
--------------------------------------------------------------------------------
INSERT INTO customers(id, size, created, distance)
VALUES
  ('C01', 'M', '2020-05-20 13:02:52', 74),
  ('C02', 'M', '2021-07-19 13:02:52', 234),
  ('C03', 'L', '2022-01-10 10:15:30', 112),
  ('C04', 'S', '2023-03-22 09:45:00', 54),
  ('C05', 'L', '2019-11-05 16:23:10', 298),
  ('C06', 'M', '2021-05-15 08:34:25', 185),
  ('C07', 'L', '2020-08-30 17:50:55', 90),
  ('C08', 'S', '2022-11-21 14:27:38', 123),
  ('C09', 'M', '2018-07-13 11:05:42', 76),
  ('C10', 'S', '2021-12-31 20:18:07', 201),
  ('C11', 'L', '2020-06-18 13:22:43', 110),
  ('C12', 'M', '2023-02-14 12:30:00', 64),
  ('C13', 'S', '2019-09-09 07:12:56', 45),
  ('C14', 'M', '2022-04-17 18:45:12', 320),
  ('C15', 'M', '2020-12-25 21:05:33', 89);
--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
FROM mcr.microsoft.com/devcontainers/python:0-3.11

COPY requirements.txt /tmp/pip-tmp/
RUN pip install --no-cache-dir -r /tmp/pip-tmp/requirements.txt

# Install psql
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
    && apt-get -y install --no-install-recommends postgresql-client
# Use environment vars for connecting
ENV PGHOST='postgres'
ENV PGUSER='postgres'
ENV PGPASSWORD='secret'
ENV PGDATABASE='public'

# Install Atlas
RUN curl -sSf https://atlasgo.sh | sh -s -- -y

# Install buf
ENV BIN="/usr/local/bin"
ENV BUF_VERSION="1.57.0"
RUN curl -sSL \
    "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m)" \
    -o "${BIN}/buf" && chmod +x "${BIN}/buf"
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
services:
  postgres:
    container_name: postgres
    image: postgres:16.2
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: secret
      POSTGRES_DB: public
      PGDATA: /data/postgres
      PGUSER: postgres # Required for health check
    healthcheck:
      test: [ "CMD-SHELL", "pg_isready" ]
      interval: 2s
      timeout: 2s
      retries: 20
    command: postgres -c listen_addresses='*'
    volumes:
      - ./initdb.d/init.sql:/docker-entrypoint-initdb.d/init.sql
    ports:
      - "5432:5432"
    networks:
      - postgres

  dev:
    container_name: dev
    build:
      context: .
      dockerfile: .devcontainer/Dockerfile
    volumes:
      - .:/workspace:cached
    command: sleep infinity
    depends_on:
      postgres:
        condition: service_healthy
    networks:
      - postgres

networks:
  postgres:
    name: postgres
    driver: bridge
--------------------------------------------------------------------------------
/lib/data_contract.py:
--------------------------------------------------------------------------------
import yaml

def to_atlas_schema(contract):
    hcl = f"""schema "public" {{}}

table "{contract["dataset"]}" {{
  schema = schema.public
"""
    for column in contract["columns"]:
        hcl += f"""
  column "{column['name']}" {{
    type = {column['data_type'].lower()}
  }}
"""
    hcl += "}"
    return hcl

protobuf_type_map = {
    "varchar": "string",
    "integer": "int64",
    # protobuf has no built-in scalar timestamp type, so we'll just
    # use string for this example
    "timestamp": "string",
}

def to_protobuf(contract):
    proto = f"message {contract['dataset']} {{\n"

    for i, column in enumerate(contract["columns"], start=1):
        proto_type = protobuf_type_map.get(column['data_type'].lower())
        proto += f"\toptional {proto_type} {column['name']} = {i};\n"

    proto += "}"
    return proto

def to_datacontract_cli(contract):
    data = dict(
        dataContractSpecification = '0.9.3',
        id = contract["dataset"],
        info = dict(
            title = contract["dataset"],
            version = str(contract["version"]),
            description = contract["description"],
            owner = contract["owner"]["email"],
        ),
        servers = dict(
            workshop = dict(
                type = 'postgres',
                host = 'localhost',
                port = 5432,
                database = 'postgres',
                schema = 'public',
            ),
        ),
        models = dict()
    )

    fields = dict()
    for column in contract["columns"]:
        fields[column["name"]] = dict(
            description = column["description"],
            type = column["data_type"].lower()
        )

    data["models"][contract["dataset"]] = {
        'fields': fields,
        'description': contract["description"],
    }

    return data

def to_pace(contract):
    # The schema
    fields = []
    for column in contract["columns"]:
        fields.append({
            'name_parts': [column["name"]],
            'type': column["data_type"].lower(),
        })

    # The access policies
    conditions = []
    for policy in contract["access"]:
        conditions.append(dict(
            principals = policy.get("principals", []),
            condition = policy.get("condition")
        ))

    filters = [dict(
        generic_filter = dict(
            conditions = conditions
        )
    )]

    return dict(
        metadata = dict(
            title = f"public.{contract['dataset']}",
        ),
        source = dict(
            ref = dict(
                integration_fqn = f"public.{contract['dataset']}",
                platform = dict(
                    id = "postgres",
                    platform_type = "POSTGRES",
                )
            ),
            fields = fields,
        ),
        rule_sets = [dict(
            target = dict(
                ref = dict(
                    integration_fqn = f"public.{contract['dataset']}_view",
                ),
            ),
            filters = filters,
        )]
    )
--------------------------------------------------------------------------------
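
Usage sketch (not a file in the repository): the helpers in lib/data_contract.py all take a parsed contract dictionary, but the contract YAML itself is written during the course and is not checked in here (datacontractcli/customers.yaml is gitignored). The snippet below is a minimal sketch that assumes a contract shaped around the keys these functions read (dataset, version, description, owner.email, the columns list, and the access list); every value in it is made up for illustration and may differ from what you write in the course.

import yaml

from lib.data_contract import (
    to_atlas_schema,
    to_datacontract_cli,
    to_pace,
    to_protobuf,
)

# Hypothetical contract: the shape mirrors the keys the helpers read,
# the values are illustrative only.
contract = yaml.safe_load("""
dataset: customers
version: 1
description: Customers of our business
owner:
  email: mark@example.com
columns:
  - name: id
    data_type: VARCHAR
    description: Unique customer identifier
  - name: size
    data_type: VARCHAR
    description: T-shirt size of the customer
  - name: created
    data_type: TIMESTAMP
    description: When the customer record was created
  - name: distance
    data_type: INTEGER
    description: Distance to the customer in km
access:
  - principals: [marketing]
    condition: "size = 'L'"
""")

print(to_atlas_schema(contract))                 # HCL schema and table definition for Atlas
print(to_protobuf(contract))                     # protobuf message definition
print(yaml.dump(to_datacontract_cli(contract)))  # Data Contract Specification document
print(yaml.dump(to_pace(contract)))              # PACE-style policy resource

Run it from the repository root (for example inside the dev container at /workspace) so that lib/ is importable; in the course the same dictionary would come from loading your own contract file with yaml.safe_load.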