├── .gitignore ├── LICENSE ├── README.md ├── architecture └── architecture-latest.excalidraw ├── community └── README.md ├── data └── optd │ └── optd_airline_por.csv ├── datacontract.com ├── .gitignore ├── README.md ├── contracts │ ├── data-contract-flight-route-quality.yaml │ ├── data-contract-flight-route-schema.yaml │ ├── data-contract-flight-route.yaml │ └── orders-latest-npii.yaml ├── data ├── soda-checks.yml ├── soda-conf.yml └── sql │ └── duckdb-ddl-create-view-from-csv.sql ├── img └── data-contracts-producers-and-consumers-2023-05.webp ├── quality └── ge │ └── 01-example-pyspark-ge-hc.py ├── schemata ├── mvp │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── document.sh │ ├── great_expectations │ │ ├── .gitignore │ │ ├── checkpoints │ │ │ └── geonames_routes.yml │ │ ├── expectations │ │ │ ├── .ge_store_backend_id │ │ │ └── geonames_routes.json │ │ ├── great_expectations.yml │ │ └── plugins │ │ │ └── custom_data_docs │ │ │ └── styles │ │ │ └── data_docs_custom_styles.css │ ├── opencontract │ │ └── v1 │ │ │ └── org │ │ │ └── schemata │ │ │ └── protobuf │ │ │ └── schemata.proto │ ├── schema │ │ └── route.proto │ ├── score.sh │ ├── src │ │ └── python │ │ │ └── pyspark.py │ └── validate.sh └── quickstart │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── document.sh │ ├── org │ └── schemata │ │ └── protobuf │ │ └── schemata.proto │ ├── schema │ ├── activity.proto │ ├── brand.proto │ ├── campaign.proto │ ├── category.proto │ ├── entities.proto │ ├── product.proto │ └── user.proto │ ├── score.sh │ └── validate.sh └── smithy └── quickstart ├── .gitignore ├── README.md ├── build.gradle.kts ├── model └── weather.smithy └── smithy-build.json /.gitignore: -------------------------------------------------------------------------------- 1 | /.smithy*.log 2 | /.vscode/ 3 | /tmp/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Data Engineering helpers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Data contracts 2 | ============== 3 | 4 | # Table of Contents 5 | * [Data contracts](#data-contracts) 6 | * [Overview](#overview) 7 | * [Other repositories of Data Engineering helpers](#other-repositories-of-data-engineering-helpers) 8 | * [Specifications](#specifications) 9 | * [Definitions](#definitions) 10 | * [Definition by Andrew Jones](#definition-by-andrew-jones) 11 | * [Definition by Atlan](#definition-by-atlan) 12 | * [Definition by Charles Verleyen](#definition-by-charles-verleyen) 13 | * [Definition by Jean-Georges Perrin](#definition-by-jean-georges-perrin) 14 | * [Definition by David Jayatillake](#definition-by-david-jayatillake) 15 | * [References](#references) 16 | * [Use cases](#use-cases) 17 | * [Web sites, blogs](#web-sites-blogs) 18 | * [A game about quality for data](#a-game-about-quality-for-data) 19 | * [Data contracts for the warehouse on Substack](#data-contracts-for-the-warehouse-on-substack) 20 | * [Data products, Chad Sanderson on Substack](#data-products-chad-sanderson-on-substack) 21 | * [Data contracts demystified, by Atlan](#data-contracts-demystified-by-atlan) 22 | * [Bitol organization](#bitol-organization) 23 | * [Books and collections of articles](#books-and-collections-of-articles) 24 | * [Data Contracts: Developing Production-Grade Pipelines at Scale](#data-contracts-developing-production-grade-pipelines-at-scale) 25 | * [Implementing Data Mesh](#implementing-data-mesh) 26 | * [Illustrated Guide to Data Products in Action](#illustrated-guide-to-data-products-in-action) 27 | * [Astrafy end-to-end implementation of data contracts](#astrafy-end-to-end-implementation-of-data-contracts) 28 | * [Driving Data Quality with Data Contracts](#driving-data-quality-with-data-contracts) 29 | * [Data Contracts Using Schema Registry](#data-contracts-using-schema-registry) 30 | * [Anatomy of a Data Product](#anatomy-of-a-data-product) 31 | * [Awesome data contracts](#awesome-data-contracts) 32 | * [Articles](#articles) 33 | * [Data Contract Implementation Patterns](#data-contract-implementation-patterns) 34 | * [Data Contracts: The Key to reliable and product-oriented data](#data-contracts-the-key-to-reliable-and-product-oriented-data) 35 | * [Data Quality at Petabyte Scale: Building Trust in the Data Lifecycle](#data-quality-at-petabyte-scale-building-trust-in-the-data-lifecycle) 36 | * [Behind the scenes of data contracts](#behind-the-scenes-of-data-contracts) 37 | * [Data as a Product and Data Contract](#data-as-a-product-and-data-contract) 38 | * [Data Contract 101](#data-contract-101) 39 | * [Data Contracts: the Mesh Glue](#data-contracts-the-mesh-glue) 40 | * [Data contracts for non-tech readers](#data-contracts-for-non-tech-readers) 41 | * [Tables as Interfaces](#tables-as-interfaces) 42 | * [DBT Model Contracts: Importance and Pitfalls](#dbt-model-contracts-importance-and-pitfalls) 43 | * [DBT implementing data contracts](#dbt-implementing-data-contracts) 44 | * [Excerpts](#excerpts) 45 | * [PayPal open sources its data contract templates](#paypal-open-sources-its-data-contract-templates) 46 | * [Data contracts, the missing foundation](#data-contracts-the-missing-foundation) 47 | * [An engineering guide to data creation and data quality, a data contract perspective](#an-engineering-guide-to-data-creation-and-data-quality-a-data-contract-perspective) 48 | * [Data contracts for the
warehouse](#data-contracts-for-the-warehouse) 49 | * [Need for an Open Standard for the Semantic Layer](#need-for-an-open-standard-for-the-semantic-layer) 50 | * [Data contracts wrapped 2022](#data-contracts-wrapped-2022) 51 | * [Data contracts in practice](#data-contracts-in-practice) 52 | * [An Engineer's guide to Data Contracts](#an-engineers-guide-to-data-contracts) 53 | * [The production-grade Data Pipeline](#the-production-grade-data-pipeline) 54 | * [Yet another post on Data Contracts](#yet-another-post-on-data-contracts) 55 | * [Fine, let us talk about data contracts](#fine-let-us-talk-about-data-contracts) 56 | * [Data contracts - From zero to hero](#data-contracts---from-zero-to-hero) 57 | * [Contracts have consequences](#contracts-have-consequences) 58 | * [Data Person: Attorney At Law](#data-person-attorney-at-law) 59 | * [The rise of data contracts](#the-rise-of-data-contracts) 60 | * [Interfaces and breaking stuff](#interfaces-and-breaking-stuff) 61 | * [Implementing Data Contracts: 7 Key Learnings](#implementing-data-contracts-7-key-learnings) 62 | * [Shifting left on governance: DataHub and schema annotations](#shifting-left-on-governance-datahub-and-schema-annotations) 63 | * [Data contracts at GoCardless, 6 months on](#data-contracts-at-gocardless-6-months-on) 64 | * [Improving data quality with data contracts](#improving-data-quality-with-data-contracts) 65 | * [Tools and frameworks](#tools-and-frameworks) 66 | * [Schemata](#schemata) 67 | * [OpenDataMesh](#opendatamesh) 68 | * [Datacontract.com specification and CLI](#datacontractcom-specification-and-cli) 69 | * [Bitol \- Open Data Contract Standard (ODCS)](#bitol---open-data-contract-standard-odcs) 70 | * [PayPal data contract templates](#paypal-data-contract-templates) 71 | * [PolyExpose: a simplistic Polyglot data tool](#polyexpose-a-simplistic-polyglot-data-tool) 72 | * [SQLMesh](#sqlmesh) 73 | * [Nessie](#nessie) 74 | * [Kolle](#kolle) 75 | * [Smithy](#smithy) 76 | * [Avro / Schema Registry](#avro--schema-registry) 77 | * [Support by cloud vendors](#support-by-cloud-vendors) 78 | * [Protocol buffers (Protobuf)](#protocol-buffers-protobuf) 79 | * [Buz](#buz) 80 | * [Benthos](#benthos) 81 | * [Memphis](#memphis) 82 | * [API specifications](#api-specifications) 83 | * [Schema.org](#schemaorg) 84 | * [Vendor solutions](#vendor-solutions) 85 | * [DBT](#dbt) 86 | * [AWS](#aws) 87 | * [Google](#google) 88 | * [Collibra](#collibra) 90 | * [DataMesh Manager](#datamesh-manager) 91 | * [Exploration / Proof-of-Concept (PoC)](#exploration--proof-of-concept-poc) 92 | 93 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go) 94 | 95 | # Overview 96 | [This project](https://github.com/data-engineering-helpers/data-contracts) 97 | intends to document requirements and reference material for implementing 98 | data contracts from the perspective of data engineering on a 99 | modern data stack (MDS). 100 | 101 | Data contracts are essential to decouple data producers from data consumers, 102 | while having both parties take responsibility for their respective parts. 103 | 104 | Even though the members of the GitHub organization may be employed by 105 | some companies, they speak on their personal behalf and do not represent 106 | these companies.
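To make that decoupling concrete, here is a minimal, illustrative sketch in Python; the `Order` entity and its fields are hypothetical, not taken from the contracts in this repository. Both parties depend on the shared, versioned contract rather than on each other: the producer is responsible for honoring it, and the consumer is entitled to rely on it.

```python
# Illustrative sketch only: the Order entity and its fields are hypothetical.
# The contract is the shared, versioned artifact; the producer and the
# consumer depend on it, not on each other's internals.
from dataclasses import dataclass


@dataclass(frozen=True)
class Order:
    """The agreed interface: changing it is a negotiation, not a surprise."""
    order_id: int
    currency: str   # ISO 4217 code, e.g. "EUR"
    amount: float   # total amount, expressed in the given currency


def produce() -> list[Order]:
    # Producer side: emits records that honor the contract
    return [Order(order_id=1, currency="EUR", amount=42.0)]


def consume(orders: list[Order]) -> float:
    # Consumer side: relies on the contracted structure, no defensive parsing
    return sum(order.amount for order in orders)


print(consume(produce()))  # 42.0
```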
107 | 108 | ## Other repositories of Data Engineering helpers 109 | * [Data Engineering Helpers - Knowledge Sharing - Data products](https://github.com/data-engineering-helpers/data-products) 110 | * [Data Engineering Helpers - Knowledge Sharing - Data quality](https://github.com/data-engineering-helpers/data-quality) 111 | * [Data Engineering Helpers - Knowledge Sharing - Architecture principles](https://github.com/data-engineering-helpers/architecture-principles) 112 | * [Data Engineering Helpers - Knowledge Sharing - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle) 113 | * [Data Engineering Helpers - Knowledge Sharing - Data management](https://github.com/data-engineering-helpers/data-management) 114 | * [Data Engineering Helpers - Knowledge Sharing - Data lakehouse](https://github.com/data-engineering-helpers/data-lakehouse) 115 | * [Data Engineering Helpers - Knowledge Sharing - Metadata](https://github.com/data-engineering-helpers/metadata) 116 | * [Data Engineering Helpers - Knowledge Sharing - Data pipeline deployment](https://github.com/data-engineering-helpers/data-pipeline-deployment) 117 | * [Data Engineering Helpers - Knowledge Sharing - Semantic layer](https://github.com/data-engineering-helpers/semantic-layer) 118 | 119 | # Specifications 120 | * Open Data Contract Standard (ODCS) 121 | * Reader-friendly, dedicated site: https://bitol-io.github.io/open-data-contract-standard/latest/ 122 | * GitHub home page: https://github.com/bitol-io/open-data-contract-standard 123 | 124 | * Innoq's Data Contract specification: https://datacontract.com/ 125 | 126 | * Linux Foundation's Open Data Product Specification (ODPS): https://opendataproducts.org/ 127 | 128 | * Innoq's specification for Data Products: https://dataproduct-specification.com/ 129 | 130 | * Data contract as code (DCaC) principle: the data contracts must be specified 131 | with an 132 | [Interface Definition Language (IDL)](https://en.wikipedia.org/wiki/Interface_description_language), 133 | for instance [Smithy](https://smithy.io/), [Protobuf](https://protobuf.dev/), 134 | [OpenDataMesh](https://dpds.opendatamesh.org/), 135 | [Avro](https://avro.apache.org/docs/) or [dbt schema](https://getdbt.com) 136 | 137 | * Shift-left principle: as much metadata as possible should be 138 | written directly within the IDL-based data contracts, potentially 139 | through annotations and/or naming conventions in comments 140 | 141 | * The idea behind the two above-mentioned principles is to have the IDL-based 142 | specifications materialize the 143 | [single version of the truth (SVOT)](https://en.wikipedia.org/wiki/Single_version_of_the_truth) 144 | for the data sets, while benefitting from all the automation and tooling 145 | that open standards such as OpenDataMesh, Smithy and Protobuf bring 146 | 147 | * The data contracts should support at least the following features: 148 | * Data validation / Data quality - From the data contracts, we should be 149 | able to generate specifications for specific tools such as 150 | [Great Expectations](https://greatexpectations.io/), 151 | [Deequ](https://github.com/awslabs/deequ), 152 | [dbt data testing](https://www.getdbt.com/product/data-testing/) 153 | or [SODA data quality platform](https://www.soda.io/data-quality-platform) 154 | * Generation of data schemas for a few specific compute engines such as 155 | [Spark data types](https://spark.apache.org/docs/latest/sql-ref-datatypes.html), 156 | [Flink data
types](https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/types/), 157 | [Python Dataclasses](https://docs.python.org/3/library/dataclasses.html), 158 | [Pandera](https://pandera.readthedocs.io/en/stable/), 159 | [Pydantic](https://docs.pydantic.dev/) 160 | or [Pandas](https://towardsdatascience.com/data-quality-check-for-your-data-analysis-tutorial-with-pandas-7ee96d7dc4b6); a minimal illustrative sketch is given after the Definitions section below 161 | 162 | # Definitions 163 | 164 | ## Definition by Andrew Jones 165 | * Source: [andrew-jones.com/blog/data-contracts](https://andrew-jones.com/blog/data-contracts/) (also available at [data-contracts.com](https://data-contracts.com/)) 166 | 167 | > A data contract is an agreed **interface** between the generators of data and its consumers. 168 | > It sets the **expectations** around that data, defines how it should be **governed**, 169 | > and facilitates the **explicit** generation of quality data that meets the business requirements. 170 | 171 | ## Definition by Atlan 172 | * Source: https://atlan.com/data-contracts/ 173 | 174 | > A data contract outlines how data can get exchanged between two parties. 175 | > It defines the structure, format, and rules of exchange in a distributed 176 | > data architecture. These formal agreements make sure that there are not 177 | > any uncertainties or undocumented assumptions about data. 178 | 179 | ## Definition by Charles Verleyen 180 | * Source: 181 | https://medium.astrafy.io/data-quality-with-great-expectations-e41504d93e17 182 | 183 | ![Data contracts: API-based agreements](img/data-contracts-producers-and-consumers-2023-05.webp) 184 | 185 | > Without high-quality data, every analytics initiative will be underwhelming 186 | > at best and actively damaging the business at worst. Data contracts 187 | > are API-based agreements between producers and consumers designed to solve 188 | > exactly that problem. Data Contracts are not a new concept. 189 | > They are simply new implementations of a very old idea 190 | > — that producers and consumers should work together to generate high-quality, 191 | > semantically valid data from the ground up. 192 | 193 | ## Definition by Jean-Georges Perrin 194 | * Source: https://medium.com/profitoptics/data-contract-101-568a9adbf9a9 195 | 196 | > A data contract acts as an agreement between multiple parties; specifically, 197 | > a data producer and its consumer(s). A data contract: 198 | > * Creates a link between data producers and data consumers. 199 | > * Creates a link between a logical representation of the data and its physical implementation. 200 | > * Describes “meta meta” data: rules, quality, and behavior (yes, there are two metas in this sentence). 201 | 202 | ## Definition by David Jayatillake 203 | * Source: https://davidsj.substack.com/p/yet-another-post-on-data-contracts 204 | 205 | > In short, **a Data Contract is an enforceable agreement on structure 206 | > and format between the producer and consumer of data**. 207 | > You could even define it in a simpler way: 208 | > **a Data Contract is a guarantee on structure and format by a producer 209 | > of data**.
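The definitions above converge on the same mechanics: an explicit, machine-checkable guarantee on structure and format. As a minimal sketch, here is what such a guarantee can look like when rendered as a [Pydantic](https://docs.pydantic.dev/) model, one of the schema targets listed in the Specifications section; the `FlightRoute` entity and its constraints are hypothetical, loosely inspired by the flight-route contract examples stored in this repository.

```python
# Minimal, illustrative sketch only: the FlightRoute entity and its
# constraints are hypothetical, not the actual contract of this repository.
from pydantic import BaseModel, Field, ValidationError


class FlightRoute(BaseModel):
    """Agreed structure and format between a producer and its consumers."""
    airline_code: str = Field(min_length=2, max_length=3)  # IATA/ICAO airline code
    origin: str = Field(min_length=3, max_length=3)        # IATA airport code
    destination: str = Field(min_length=3, max_length=3)   # IATA airport code
    weekly_frequency: int = Field(ge=0)                    # must be non-negative


# The producer validates the records before publishing them
record = {"airline_code": "AF", "origin": "NCE", "destination": "CDG", "weekly_frequency": 35}
route = FlightRoute(**record)  # passes: the guarantee holds

try:
    FlightRoute(**{**record, "weekly_frequency": -1})
except ValidationError as error:
    print(error)  # the broken guarantee is caught on the producer side
```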
210 | 211 | # References 212 | * [Data contracts - (WIP) Community management](https://github.com/data-engineering-helpers/data-contracts/blob/main/community/README.md) 213 | * [Material for Data platform - Data products](https://github.com/data-engineering-helpers/data-products) 214 | * [Architecture principles for data engineering pipelines on the Modern Data Stack (MDS)](https://github.com/data-engineering-helpers/architecture-principles) 215 | * [Material for the Data platform - Architecture principles](https://github.com/data-engineering-helpers/architecture-principles/blob/main/material/README.md) 216 | * Specifications/principles for a 217 | [data engineering pipeline deployment tool](https://github.com/data-engineering-helpers/data-pipeline-deployment) 218 | * [`dppctl`, the Data Processing Pipeline (DPP) CLI utility](https://github.com/data-engineering-helpers/dppctl), a Minimal Viable Product (MVP) in Go 219 | * [Material for the Data platform - Metadata](https://github.com/data-engineering-helpers/metadata/blob/main/README.md) 220 | * [Material for the Data platform - Data quality](https://github.com/data-engineering-helpers/data-quality/blob/main/README.md) 221 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md) 222 | * Quickstart guides: 223 | * [Schemata quickstart guide](schemata/quickstart/) 224 | * [Smithy quickstart guide](smithy/quickstart/) 225 | 226 | ## Use cases 227 | * [Geonames Quality Assurance (QA) framework](https://github.com/data-engineering-helpers/data-quality/blob/main/README.md#geonames) 228 | * [OpenTravelData (OPTD) Quality Assurance framework](https://github.com/data-engineering-helpers/data-quality/blob/main/README.md#opentraveldata-optd) 229 | 230 | ## Web sites, blogs 231 | 232 | ### A game about quality for data 233 | * Title: A game about quality for data 234 | * Author: Joe Leach 235 | * Link to the gamified use case: https://joelbth.codeberg.page/community-notices/ 236 | 237 | ### Data contracts for the warehouse on Substack 238 | * Link to the web site/blog: https://dataproducts.substack.com/p/data-contracts-for-the-warehouse 239 | 240 | ### Data products, Chad Sanderson on Substack 241 | * Link to Chad Sanderson's profile: https://substack.com/profile/12566999-chad-sanderson 242 | * Link to the newsletter subscription form: https://dataproducts.substack.com/ 243 | 244 | ### Data contracts demystified, by Atlan 245 | * Title: Data Contracts: The Key to Scaling Distributed Data Architecture and Reducing Data Chaos 246 | * Date: April 2023 247 | * Link to the web site: https://atlan.com/data-contracts/ 248 | 249 | ### Bitol organization 250 | * GitHub organization: https://github.com/bitol-io 251 | * Supporting groups: Bitol (https://github.com/bitol-io) and AIDA user group (https://aidausergroup.org) 252 | * The basics of a data contract: a data contract defines the agreement between a data producer 253 | and consumers. A data contract contains several sections: fundamentals, schema, data quality, 254 | Service-level agreement (SLA), security & stakeholders, and custom properties.
255 | * Main author: Jean-Georges Perrin 256 | ([Jean-Georges Perrin on LinkedIn](https://www.linkedin.com/in/jgperrin/), 257 | [Jean-Georges Perrin on Medium](https://medium.com/@jgperrin), 258 | [Jean-Georges Perrin on GitHub](https://github.com/jgperrin)): 259 | * Articles: 260 | * 2023-12-09 - [Why the Need for Standardizing Data Contracts?](https://medium.com/abeadata/why-the-need-for-standardizing-data-contracts-133bc3491148) 261 | * 2023-11-30 - [Linux Foundation AI & Data - Bitol Joins LF AI & Data as New Sandbox Project](https://lfaidata.foundation/blog/2023/11/30/bitol-joins-lf-ai-data-as-new-sandbox-project/) 262 | * 2023-11-30 - [AIDAUG - Bitol Joins LF AI & Data as New Sandbox Project](https://aidausergroup.org/2023/11/30/bitol-joins-lf-ai-data-as-new-sandbox-project/) 263 | * 2023-10-01 - [Data Contracts: A Bridge Connecting Two Worlds](https://medium.com/@atanas.iliev.ai/data-contracts-a-bridge-connecting-two-worlds-404eff1d970d) 264 | * 2023-09-10 - [Data Contracts 101](https://medium.com/p/568a9adbf9a9) 265 | * 2023-08-10 - [Welcome to the Open Data Contract Standard](https://jgp.ai/2023/08/09/welcome-to-the-open-data-contract-standard/) 266 | * 2023-05-11 - [Data Contracts – Everything You Need to Know](https://www.montecarlodata.com/blog-data-contracts-explained/) 267 | * 2023-05-07 - [Data Engineering Weekly #130 - Data Contract in the Wild with PayPal’s Data Contract Template](https://www.dataengineeringweekly.com/p/data-engineering-weekly-130) 268 | * 2023-05-06 - [PayPal releases its Data Contract as an open-source template for anyone to use (article in Thai)](https://discuss.dataengineercafe.io/t/paypal-data-contract-open-source-template/581/1) 269 | * 2023-05-05 - [Jonathan Neo (j__neo ) on Reddit](https://www.reddit.com/r/dataengineering/comments/137glbo/comment/jixw5hj/?utm_source=reddit&utm_medium=web2x&context=3) 270 | * 2023-05-01 - [PayPal open sources its data contract template](https://jgp.ai/2023/05/01/paypal-open-sources-its-data-contract-template/) 271 | 272 | ## Books and collections of articles 273 | 274 | ### Data Contracts: Developing Production-Grade Pipelines at Scale 275 | * Title: Data Contracts: Developing Production-Grade Pipelines at Scale 276 | * Author: Chad Sanderson 277 | ([Chad Sanderson on LinkedIn](https://www.linkedin.com/in/chad-sanderson/)) 278 | * Date: 2025 279 | * Link to a LinkedIn post: 280 | https://www.linkedin.com/posts/chad-sanderson_big-news-weve-spent-the-last-year-writing-activity-7290411822044565505-IwkV 281 | * Link to the early release page: https://www.gable.ai/data-contracts-book 282 | 283 | ### Implementing Data Mesh 284 | * Title: Implementing Data Mesh 285 | * Date: December 2023 286 | * Authors: 287 | + Jean-Georges Perrin 288 | ([Jean-Georges Perrin on LinkedIn](https://www.linkedin.com/in/jgperrin/), 289 | [Jean-Georges Perrin on Medium](https://medium.com/@jgperrin)) 290 | + Eric Broda 291 | * Link to O'Reilly page for the book: 292 | https://www.oreilly.com/library/view/implementing-data-mesh/9781098156213/ 293 | * Publisher: O'Reilly 294 | 295 | ### Illustrated Guide to Data Products in Action 296 | * Title: Where Exactly Data Becomes Product: Illustrated Guide to Data Products in Action 297 | * Date: Aug.
2024 298 | * Authors: 299 | Animesh Kumar 300 | ([Animesh Kumar on LinkedIn](https://www.linkedin.com/in/anismiles/), 301 | [Animesh Kumar on Substack](https://substack.com/@moderndata101)) 302 | and 303 | Travis Thompson 304 | ([Travis Thompson on LinkedIn](https://www.linkedin.com/in/travis-w-thompson/), 305 | [Travis Thompson on Substack](https://substack.com/@travisthompsonsays)) 306 | * Link to the article: 307 | https://moderndata101.substack.com/p/where-exactly-data-becomes-product 308 | * Publisher: Substack 309 | 310 | ### Astrafy end-to-end implementation of data contracts 311 | * Title: Implementation of the Data Contracts with dbt, Google Cloud & Great Expectations 312 | * Link to the LinkedIn post summarizing the Medium posts: 313 | https://www.linkedin.com/posts/astrafy_datacontracts-dbt-greatexpectations-activity-7087097534392745987-_1RR 314 | * Author: Łukasz Ściga 315 | ([Łukasz Ściga on LinkedIn](https://www.linkedin.com/in/lukasz-sciga/), 316 | [Łukasz Ściga on Medium](https://medium.com/@lukasz-sciga)) 317 | * Publisher: Medium 318 | * Medium posts: 319 | * [Medium - Implementation of the Data Contracts with dbt, Google Cloud & Great Expectations - Part 1](https://medium.astrafy.io/implementation-of-the-data-contracts-with-dbt-google-cloud-great-expectations-part-1-939774fc7284) 320 | * [Medium - Implementation of the Data Contracts with dbt, Google Cloud & Great Expectations - Part 2](https://medium.astrafy.io/implementation-of-the-data-contracts-with-dbt-google-cloud-great-expectations-part-2-112c96c2914a) 321 | * [Medium - Implementation of the Data Contracts with dbt, Google Cloud & Great Expectations - Part 3](https://medium.astrafy.io/implementation-of-the-data-contracts-with-dbt-google-cloud-great-expectations-part-3-7c2675d549df) 322 | 323 | ### Driving Data Quality with Data Contracts 324 | * Title: Driving Data Quality with Data Contracts: A comprehensive guide to building reliable, trusted, and effective data platforms 325 | * Author: [Andrew Jones](https://andrew-jones.com/) 326 | * Date: 30 June 2023 327 | * Publisher: Packt 328 | * ASIN: B0C37FPH3D 329 | * Article on the book: 330 | https://andrew-jones.com/blog/data-contracts-the-book-out-now/ 331 | * Book on Google Books: https://books.google.fr/books?id=OkTJEAAAQBAJ 332 | * GitHub companion repository: 333 | https://github.com/PacktPublishing/Driving-Data-Quality-with-Data-Contracts 334 | 335 | ### Data Contracts Using Schema Registry 336 | * Title: Kafka for Developers - Data Contracts Using Schema Registry 337 | * Date: March 2023 338 | * Author: Dilip Sundarraj 339 | * ISBN: 9781837633487 340 | * Link on the book page on Packt: 341 | https://www.packtpub.com/product/kafka-for-developers-data-contracts-using-schema-registry-video/9781837633487 342 | * Publisher: Packt Publishing 343 | * GitHub companion repository: 344 | https://github.com/PacktPublishing/Kafka-for-Developers---Data-Contracts-Using-Schema-Registry 345 | 346 | ### Anatomy of a Data Product 347 | * Title: Anatomy of a Data Product 348 | * Date: December 2022 349 | * Author: Jesse Paquette ( 350 | [Jesse Paquette on LinkedIn](https://www.linkedin.com/in/jessepaquette/), 351 | [Jesse Paquette on Medium](https://jessepaquette.medium.com/)) 352 | * Link to the articles: 353 | * Part 1: https://jessepaquette.medium.com/anatomy-of-a-data-product-part-one-5afa99609699 354 | * Part 2: https://jessepaquette.medium.com/anatomy-of-a-data-product-part-two-9d0c19e4307b 355 | * Part 3:
https://jessepaquette.medium.com/anatomy-of-a-data-product-part-three-801782b2f4bf 356 | * Part 4: https://jessepaquette.medium.com/anatomy-of-a-data-product-part-four-e69706c156e6 357 | * Part 5: https://jessepaquette.medium.com/anatomy-of-a-data-product-part-five-9a1f47c12db4 358 | 359 | ### Awesome data contracts 360 | * Link to the reference documentation on GitHub: 361 | https://github.com/AltimateAI/awesome-data-contracts 362 | 363 | ## Articles 364 | 365 | ### Data Contract Implementation Patterns 366 | * Title: Data Contract Implementation Patterns 367 | * Date: Mar. 2025 368 | * Author: Paolo Platter 369 | ([Paolo Platter on LinkedIn](https://www.linkedin.com/in/paoloplatter/), 370 | [Paolo Platter on Medium](https://p-platter.medium.com/)) 371 | * Link to the article on Medium: 372 | https://medium.com/agile-lab-engineering/data-contract-implementation-patterns-bf4b6d4f21e9 373 | 374 | ### Data Contracts: The Key to reliable and product-oriented data 375 | * Title: Data Contracts: The Key to reliable and product-oriented data 376 | * Date: Feb. 2025 377 | * Authors: 378 | * Pierre-Yves Bonnefoy 379 | * Gaëlle Seret 380 | * Link to the article on Substack: 381 | https://cleandataarchitecture.substack.com/p/data-contracts-the-key-to-reliable 382 | 383 | ### Data Quality at Petabyte Scale: Building Trust in the Data Lifecycle 384 | * Title: Data Quality at Petabyte Scale: Building Trust in the Data Lifecycle 385 | * Date: Feb. 2025 386 | * Author: Zakariah Siyaji 387 | ([Zakariah Siyaji on LinkedIn](https://www.linkedin.com/in/zakariah-siyaji/), 388 | [Zakariah Siyaji on Medium](https://medium.com/@zaki.siyaji)) 389 | * Link to the post on LinkedIn by Chad Sanderson: 390 | https://www.linkedin.com/posts/chad-sanderson_many-companies-talk-about-implementing-data-activity-7296212049565515777-dnCn/ 391 | * Link to the article on Medium: 392 | https://medium.com/glassdoor-engineering/data-quality-at-petabyte-scale-building-trust-in-the-data-lifecycle-7052361307a4 393 | * Also referenced in 394 | [Data Engineering Helpers - Knowledge Sharing - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md#data-quality-at-petabyte-scale-building-trust-in-the-data-lifecycle) 395 | 396 | ### Behind the scenes of data contracts 397 | * Title: Behind the scenes of data contracts 398 | * Author: Pietro La Torre 399 | * Date: Nov.
2024 400 | * Link to the Substack article: 401 | https://open.substack.com/pub/dataware/p/behind-the-scenes-of-data-contracts 402 | 403 | ### Data as a Product and Data Contract 404 | * Title: Data-as-a-Product and Data-Contract: An evolutionary approach to data maturity 405 | * Date: April 2024 406 | * Author: 407 | [Olivier Wulveryck on LinkedIn](https://www.linkedin.com/in/olivierwulveryck/) 408 | * Link to the blog article: 409 | https://blog.owulveryck.info/2024/04/09/data-as-a-product-and-data-contract-an-evolutionary-approach-to-data-maturity.html 410 | 411 | ### Data Contract 101 412 | * Title: Data contract 101 413 | * Date: September 2023 414 | * Author: Jean-Georges Perrin 415 | ([Jean-Georges Perrin on LinkedIn](https://www.linkedin.com/in/jgperrin/), 416 | [Jean-Georges Perrin on Medium](https://medium.com/@jgperrin)) 417 | * Link to the article: 418 | https://medium.com/profitoptics/data-contract-101-568a9adbf9a9 419 | * Publisher: Medium 420 | 421 | ### Data Contracts: the Mesh Glue 422 | * Title: Data Contracts: the Mesh Glue 423 | * Author: Luis Velasco 424 | ([Luis Velasco on LinkedIn](https://www.linkedin.com/in/luisvelascouk/), 425 | [Luis Velasco on Medium](https://velascoluis.medium.com/), 426 | [Luis Velasco on GitHub](https://github.com/velascoluis)) 427 | * Date: July 2023 428 | * Link to the article: 429 | https://towardsdatascience.com/data-contracts-the-mesh-glue-c1b533e2a664 430 | * Publisher: Medium 431 | 432 | ### Data contracts for non-tech readers 433 | * Title: Data contracts for non-tech readers: a restaurant analogy 434 | * Author: Samy Doreau 435 | ([Samy Doreau on LinkedIn](https://www.linkedin.com/in/samydoreau/)) 436 | * Date: July 2023 437 | * Link to the article: 438 | https://infinitelambda.com/data-contracts-non-tech-restaurant/ 439 | * Publisher: Infinite Lambda 440 | 441 | ### Tables as Interfaces 442 | * Title: Tables as Interfaces 443 | * Date: July 2023 444 | * Author: David Jayatillake 445 | ([David Jayatillake on LinkedIn](https://www.linkedin.com/in/david-jayatillake/), 446 | [David Jayatillake on Substack](https://substack.com/@davidsj)) 447 | * Link to the article: 448 | https://davidsj.substack.com/p/tables-as-interfaces 449 | * Publisher: Substack 450 | 451 | ### DBT Model Contracts: Importance and Pitfalls 452 | * Title: DBT Model Contracts: Importance and Pitfalls 453 | * Date: May 2023 454 | * Author: Ramon Marrero 455 | ([Ramon Marrero on LinkedIn](https://www.linkedin.com/in/ramon-marrero-almonte/), 456 | [Ramon Marrero on Medium](https://medium.com/@ramon-marrero)) 457 | * Link to the article: 458 | https://medium.com/geekculture/dbt-model-contracts-importance-and-pitfalls-20b113358ad7 459 | * Publisher: Medium 460 | 461 | ### DBT implementing data contracts 462 | * Title: The next big step forwards for analytics engineering 463 | * Date: April 2023 464 | * Author: Tristan Handy 465 | ([Tristan Handy on LinkedIn](https://www.linkedin.com/in/tristanhandy/), 466 | [Tristan Handy on DBT's web site](https://www.getdbt.com/author/tristan-handy/)) 467 | * Link to the article: 468 | https://www.getdbt.com/blog/analytics-engineering-next-step-forwards/ 469 | * Publisher: DBT 470 | 471 | #### Excerpts 472 | dbt Core v1.5 is slated for release at the end of April, and it will include three new constructs: 473 | * [Access](https://docs.getdbt.com/docs/collaborate/govern/model-access): 474 | Choose which models ought to be “private” (implementation
details, handling complexity within one team or domain) and “public” (an intentional interface, shared with other teams). Other groups and projects can only ref a model — that is, take a critical dependency on it — in accordance with its access. 475 | * [Contracts](https://docs.getdbt.com/docs/collaborate/govern/model-contracts): 476 | Define the structure of a model explicitly. If your model’s SQL doesn’t match the specified column names and data types, it will fail to build. Breaking changes (removing, renaming, retyping a column) will be caught during CI. On data platforms that support build-time constraints, ensure that columns are not null or pass custom checks while a model is being built, in addition to more flexible testing after. 477 | * [Versions](https://docs.getdbt.com/docs/collaborate/govern/model-versions): 478 | A single model can have multiple versioned definitions, with the same name for downstream reference. When a mature model with an enforced contract and public access needs to undergo a breaking change, rather than breaking downstream queriers immediately, facilitate their migration by bumping the version and communicating a deprecation window. 479 | 480 | In the future, individual teams will own their own data. Data engineering will own “core tables” or “conformed dimensions” that will be used by other teams. Ecommerce will own models related to site visits and conversion rate. Ops will own data related to fulfillment. Etc. Each of these teams will reference the public interfaces exposed by other teams as a part of their work, and periodically release upgrades as versions are incremented on upstream dependencies. Teams will review PRs for their own models, and so have more context for what “good” looks like. Monitoring and alerting will happen in alignment with teams and codebases, so there will be real accountability to delivering a high quality, high reliability data product. Teams will manage their own warehouse spend and optimize accordingly. And teams will be able to publish their own metrics all the way into their analytics tool of choice. 
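To illustrate the enforcement mechanism the excerpt describes, here is a hypothetical sketch of the kind of check a CI job performs when a model carries a contract: the columns the model actually produces are compared with the contracted ones, and breaking changes (removed, renamed or retyped columns) fail the build. Column names and types are invented for illustration; dbt implements this natively through its model contract configuration.

```python
# Hypothetical sketch of a contract check run in CI -- dbt implements this
# natively with model contracts; this only illustrates the principle.
CONTRACTED_COLUMNS = {  # agreed interface: column name -> data type
    "order_id": "bigint",
    "customer_id": "bigint",
    "ordered_at": "timestamp",
}


def breaking_changes(actual_columns: dict[str, str]) -> list[str]:
    """List the contract violations introduced by a model change."""
    errors = []
    for column, expected_type in CONTRACTED_COLUMNS.items():
        if column not in actual_columns:
            errors.append(f"column '{column}' was removed or renamed")
        elif actual_columns[column] != expected_type:
            errors.append(
                f"column '{column}' was retyped from "
                f"{expected_type} to {actual_columns[column]}"
            )
    return errors


# A rename (ordered_at -> created_at) and a retype (customer_id) are both
# caught before the change reaches downstream consumers
for violation in breaking_changes(
    {"order_id": "bigint", "customer_id": "varchar", "created_at": "timestamp"}
):
    print(f"breaking change: {violation}")
```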
481 | 482 | ### PayPal open sources its data contract templates 483 | * Title: PayPal open sources its data contract templates 484 | * Date: May 2023 485 | * Author: Jean-Georges Perrin 486 | * See also [Data Contract 101 section above](#data-contract-101) 487 | * Link to the article: 488 | https://jgp.ai/2023/05/01/paypal-open-sources-its-data-contract-template/ 489 | * Publisher: Jean-Georges Perrin's blog 490 | 491 | ### Data contracts, the missing foundation 492 | * Title: Data contracts: The missing foundation 493 | * Date: March 2023 494 | * Author: Tom Baeyens 495 | ([Tom Baeyens on LinkedIn](https://www.linkedin.com/in/tombaeyens/), 496 | [Tom Baeyens on Medium](https://medium.com/@tombaeyens)) 497 | * Link to the article: 498 | https://medium.com/@tombaeyens/data-contracts-the-missing-foundation-3c7a98544d2a 499 | * Publisher: Medium 500 | 501 | ### An engineering guide to data creation and data quality, a data contract perspective 502 | * Title: An Engineering Guide to Data Creation and Data Quality - 503 | A Data Contract perspective 504 | * Dates: March and May 2023 505 | * Author: Ananth Packkildurai 506 | ([Ananth Packkildurai on LinkedIn](https://www.linkedin.com/in/ananthdurai/), 507 | [Ananth Packkildurai on Substack](https://substack.com/profile/3520227-ananth-packkildurai), 508 | [Ananth Packkildurai on GitHub](https://github.com/ananthdurai)) 509 | * Part 1: 510 | https://www.dataengineeringweekly.com/p/an-engineering-guide-to-data-creation 511 | * Part 2: 512 | https://www.dataengineeringweekly.com/p/an-engineering-guide-to-data-quality 513 | * Publisher: 514 | [Data Engineering Weekly (DEW) newsletter on Substack](https://www.dataengineeringweekly.com/) 515 | * Note that Ananth Packkildurai is the main contributor of 516 | [Schemata](#schemata) 517 | 518 | ### Data contracts for the warehouse 519 | * Title: Data contracts for the warehouse 520 | * Date: January 2023 521 | * Authors: 522 | * Chad Sanderson 523 | ([Chad Sanderson on LinkedIn](https://www.linkedin.com/in/chad-sanderson/), 524 | [Chad Sanderson on Substack](https://substack.com/profile/12566999-chad-sanderson)) 525 | * Daniel Dicker 526 | ([Daniel Dicker on LinkedIn](https://www.linkedin.com/in/danieldicker/), 527 | [Daniel Dicker on Substack](https://substack.com/profile/48987754-daniel-dicker)) 528 | * Link to the web site/blog: 529 | https://dataproducts.substack.com/p/data-contracts-for-the-warehouse 530 | * Publisher: Substack 531 | 532 | ### Need for an Open Standard for the Semantic Layer 533 | * Title: The Need for an Open Standard for the Semantic Layer 534 | * Date: October 2023 535 | * Author: Artyom Keydunov 536 | ([Artyom Keydunov on LinkedIn](https://www.linkedin.com/in/keydunov/)) 537 | * Link to the article: 538 | https://cube.dev/blog/the-need-for-an-open-standard-for-the-semantic-layer 539 | * Publisher: [Cube blog](https://cube.dev/) 540 | 541 | ### Data contracts wrapped 2022 542 | * Title: Data contracts wrapped - 2022 543 | * Date: December 2022 544 | * Author: Shirshanka Das 545 | ([Shirshanka Das on LinkedIn](https://www.linkedin.com/in/shirshankadas/)) 546 | * Link to the article: 547 | https://medium.com/datahub-project/data-contracts-wrapped-2022-470e0c43365d 548 | * Publisher: Medium 549 | 550 | ### Data contracts in practice 551 | * Title: Data contracts in practice 552 | * Date: December 2022 553 | * Author: Andrea Gioia 554 | ([Andrea Gioia on LinkedIn](https://www.linkedin.com/in/andreagioia/), 555 | [Andrea Gioia on Medium](https://medium.com/@andrea_gioia), 556 |
[Andrea Gioia on GitHub](https://github.com/andrea-gioia)) 557 | * Link to the article: 558 | https://medium.com/better-programming/data-contracts-in-practice-93e58d324f34 559 | * Publisher: Medium 560 | * Note that Andrea Gioia is the main contributor of 561 | [OpenDataMesh](#opendatamesh) 562 | 563 | ### An Engineer's guide to Data Contracts 564 | * Title: An Engineer's guide to Data Contracts 565 | * Date: October 2022 566 | * Authors: 567 | * Chad Sanderson 568 | ([Chad Sanderson on LinkedIn](https://www.linkedin.com/in/chad-sanderson/), 569 | [Chad Sanderson on Substack](https://substack.com/profile/12566999-chad-sanderson)) 570 | * Adrian Kreuziger 571 | * Part 1: 572 | https://dataproducts.substack.com/p/an-engineers-guide-to-data-contracts 573 | * Part 2: 574 | https://dataproducts.substack.com/p/an-engineers-guide-to-data-contracts-6df 575 | * Publisher: Substack 576 | 577 | ### The production-grade Data Pipeline 578 | * Title: The production-grade Data Pipeline 579 | * Date: September 2022 580 | * Author: Chad Sanderson 581 | ([Chad Sanderson on LinkedIn](https://www.linkedin.com/in/chad-sanderson/), 582 | [Chad Sanderson on Substack](https://substack.com/profile/12566999-chad-sanderson)) 583 | * Link to the article: 584 | https://dataproducts.substack.com/p/the-production-grade-data-pipeline 585 | * Publisher: Substack 586 | 587 | ### Yet another post on Data Contracts 588 | * Title: Yet another post on Data Contracts 589 | * Date: September 2022 590 | * Author: David Jayatillake 591 | ([David Jayatillake on Substack](https://substack.com/profile/64081583-david-jayatillake), 592 | [David Jayatillake on LinkedIn](https://www.linkedin.com/in/david-jayatillake/)) 593 | * Part 1: https://davidsj.substack.com/p/yet-another-post-on-data-contracts 594 | * Part 2: https://davidsj.substack.com/p/yet-another-post-on-data-contracts-9f0 595 | * Part 3: https://davidsj.substack.com/p/yet-another-post-on-data-contracts-dad 596 | * Publisher: Substack 597 | 598 | ### Fine, let us talk about data contracts 599 | * Title: Fine, let's talk about data contracts 600 | * Date: September 2022 601 | * Author: Benn Stancil 602 | ([Benn Stancil on Substack](https://benn.substack.com/about), 603 | [Benn Stancil on LinkedIn](https://www.linkedin.com/in/benn-stancil/)) 604 | * Link to the article: https://benn.substack.com/p/data-contracts 605 | * Publisher: Substack 606 | 607 | ### Data contracts - From zero to hero 608 | * Title: Data contracts - From zero to hero 609 | * Date: September 2022 610 | * Author: Mehdi Ouazza 611 | ([Mehdi Ouazza on LinkedIn](https://www.linkedin.com/in/mehd-io/)) 612 | * Link to the article: 613 | https://towardsdatascience.com/data-contracts-from-zero-to-hero-343717ac4d5e 614 | * Publisher: Medium 615 | 616 | ### Contracts have consequences 617 | * Title: Contracts have consequences 618 | * Date: September 2022 619 | * Author: Tristan Handy 620 | ([Tristan Handy on Substack](https://substack.com/profile/1135298-tristan-handy)) 621 | * Link to the article: 622 | https://roundup.getdbt.com/p/contracts-have-consequences 623 | * Publisher: Substack 624 | 625 | ### Data Person: Attorney At Law 626 | * Title: Data Person: Attorney At Law 627 | * Date: September 2022 628 | * Author: Stephen Bailey 629 | ([Stephen Bailey on Substack](https://substack.com/profile/16953086-stephen-bailey), 630 | [Stephen Bailey on LinkedIn](https://www.linkedin.com/in/stkbailey/)) 631 | * Link to the article: 632 | https://stkbailey.substack.com/p/data-person-attorney-at-law 633 | * Publisher:
Substack 634 | 635 | ### The rise of data contracts 636 | * Title: The rise of data contracts 637 | * Date: August 2022 638 | * Author: Chad Sanderson 639 | ([Chad Sanderson on LinkedIn](https://www.linkedin.com/in/chad-sanderson/), 640 | [Chad Sanderson on Substack](https://substack.com/profile/12566999-chad-sanderson)) 641 | * Link to the article: 642 | https://dataproducts.substack.com/p/the-rise-of-data-contracts 643 | * Publisher: Substack 644 | 645 | ### Interfaces and breaking stuff 646 | * Title: Interfaces and breaking stuff 647 | * Date: July 2022 648 | * Author: Tristan Handy 649 | ([Tristan Handy on Substack](https://substack.com/profile/1135298-tristan-handy), 650 | [Tristan Handy on LinkedIn](https://www.linkedin.com/in/tristanhandy/)) 651 | * Link to the article: 652 | https://roundup.getdbt.com/p/interfaces-and-breaking-stuff 653 | * Publisher: Substack 654 | 655 | ### Implementing Data Contracts: 7 Key Learnings 656 | * Title: Implementing Data Contracts: 7 Key Learnings 657 | * Date: July 2022 658 | * Author: Barr Moses, CEO at Monte Carlo 659 | ([Barr Moses on LinkedIn](https://www.linkedin.com/in/barrmoses/), 660 | [Barr Moses on Medium](https://medium.com/@barrmoses)) 661 | * Link to the article: 662 | https://barrmoses.medium.com/implementing-data-contracts-7-key-learnings-d214a5947d5e 663 | * Publisher: Medium 664 | 665 | ### Shifting left on governance: DataHub and schema annotations 666 | * Title: Shifting left on governance: DataHub and schema annotations 667 | * Date: May 2022 668 | * Author: Joshua Shinavier 669 | ([Joshua Shinavier on LinkedIn](https://www.linkedin.com/in/joshuashinavier/)) 670 | * Link to the article: 671 | https://engineering.linkedin.com/blog/2022/shifting-left-on-governance--datahub-and-schema-annotations 672 | * Publisher: LinkedIn 673 | 674 | ### Data contracts at GoCardless, 6 months on 675 | * Title: Data contracts at GoCardless, 6 months on 676 | * Date: May 2022 677 | * Author: Andrew Jones 678 | ([Andrew Jones on LinkedIn](https://www.linkedin.com/in/andrewrhysjones/), 679 | [Andrew Jones on Medium](https://andrew-jones.medium.com/)) 680 | * Link to the article: 681 | https://medium.com/gocardless-tech/data-contracts-at-gocardless-6-months-on-bbf24a37206e 682 | * Publisher: Medium 683 | 684 | ### Improving data quality with data contracts 685 | * Title: Improving data quality with data contracts 686 | * Date: December 2021 687 | * Author: Andrew Jones 688 | ([Andrew Jones on LinkedIn](https://www.linkedin.com/in/andrewrhysjones/), 689 | [Andrew Jones on Medium](https://andrew-jones.medium.com/)) 690 | * Link to the article: 691 | https://medium.com/gocardless-tech/improving-data-quality-with-data-contracts-238041e35698 692 | * Publisher: Medium 693 | 694 | # Tools and frameworks 695 | 696 | ## Schemata 697 | * Homepage: [GitHub - Schemata](https://github.com/ananthdurai/schemata) 698 | * Schema modeling framework for decentralized domain-driven ownership of data. 699 | It combines a set of standard metadata definitions for each schema 700 | and data field and a scoring algorithm to provide a feedback loop 701 | on how efficient the data modeling of the data warehouse is. 702 | It supports ProtoBuf, dbt and Avro formats.
It may support OpenDataMesh 703 | and/or Smithy in the future 704 | * Main contributor: Ananth Packkildurai 705 | ([Ananth Packkildurai on LinkedIn](https://www.linkedin.com/in/ananthdurai/), 706 | [Ananth Packkildurai on Substack](https://substack.com/profile/3520227-ananth-packkildurai), 707 | [Ananth Packkildurai on GitHub](https://github.com/ananthdurai)) 708 | * See also: 709 | + [An engineering guide to data creation and data quality, a data contract perspective (in this page)](#an-engineering-guide-to-data-creation-and-data-quality-a-data-contract-perspective) 710 | + [Data contracts - Schemata quickstart guide](schemata/quickstart/) 711 | 712 | ## OpenDataMesh 713 | * Homepage: https://dpds.opendatamesh.org 714 | * An open specification that declaratively defines a data product 715 | in all its components using a JSON or YAML descriptor document. 716 | It is released under the Apache 2.0 license. 717 | * Main contributor: Andrea Gioia 718 | ([Andrea Gioia on LinkedIn](https://www.linkedin.com/in/andreagioia/), 719 | [Andrea Gioia on Medium](https://medium.com/@andrea_gioia), 720 | [Andrea Gioia on GitHub](https://github.com/andrea-gioia)) 721 | * See also 722 | [Data contracts in practice (in this page)](#data-contracts-in-practice) 723 | 724 | ## Datacontract.com specification and CLI 725 | * `datacontract` command-line (CLI) utility: https://github.com/datacontract/cli 726 | * Datacontract.com specification: 727 | https://datacontract.com/ 728 | * Inspired by the Open Data Contract Standard (ODCS); see below 729 | 730 | ## Bitol - Open Data Contract Standard (ODCS) 731 | * See also the [Bitol organization section above](#bitol-organization) 732 | * Home page: https://github.com/bitol-io/open-data-contract-standard 733 | + Open Data Contract Standard (ODCS)/specification: 734 | https://github.com/bitol-io/open-data-contract-standard/blob/main/docs/README.md 735 | + Examples: 736 | https://github.com/bitol-io/open-data-contract-standard/blob/main/examples/README.md 737 | * ODCS inherits from the PayPal Credit data contract template (see below) 738 | * See above the Datacontract.com specification and CLI 739 | for an actual implementation 740 | * Overview: A data contract defines the agreement between a data producer and consumers. 741 | A data contract contains several sections: fundamentals, schema, data quality, 742 | Service-level agreement (SLA), security & stakeholders, custom properties 743 | 744 | ## PayPal data contract templates 745 | * Homepage: https://github.com/paypal/data-contract-template 746 | * This project describes the data contract being used in the implementation 747 | of Data Mesh at PayPal. It is released under the Apache 2.0 license. 748 | 749 | ## PolyExpose: a simplistic Polyglot data tool 750 | * Homepage: https://github.com/velascoluis/polyexpose 751 | * Prototype, simplistic Python package implementing the following concepts: 752 | + To ensure reusability, the Data Mesh answer is to introduce 753 | the concept of polyglot data, an abstraction to clearly differentiate 754 | between the data semantics and the data consumption format/syntax. 755 | + This is a very elegant approach with a very clear separation of 756 | responsibilities between the semantics and the underlying technology; 757 | but, as Data Mesh does not prescribe any kind of technical architecture, 758 | this can sometimes be challenging to visualize or implement.
+ The idea of [this repository](https://github.com/velascoluis/polyexpose) 760 | is to present a potential technology architecture that implements 761 | this pattern using as many open source components as possible 762 | * Main contributor: Luis Velasco 763 | ([Luis Velasco on LinkedIn](https://www.linkedin.com/in/luisvelascouk/), 764 | [Luis Velasco on Medium](https://velascoluis.medium.com/), 765 | [Luis Velasco on GitHub](https://github.com/velascoluis)) 766 | * See also 767 | [Data Contracts: the Mesh Glue (in this page)](#data-contracts-the-mesh-glue) 768 | 769 | ## SQLMesh 770 | * Homepage: https://sqlmesh.com/ 771 | * GitHub page: https://github.com/TobikoData/sqlmesh 772 | * Documentation: https://sqlmesh.readthedocs.io/en/stable/ 773 | * Article from Nov. 2023: https://tobikodata.com/why-data-teams-are-adopting-declarative-pipelines.html 774 | * Overview: SQLMesh is a data transformation framework that brings the benefits of DevOps to data teams. 775 | It enables data scientists, analysts, and engineers to efficiently run and deploy data transformations 776 | written in SQL or Python. 777 | 778 | ## Nessie 779 | * Homepage: https://projectnessie.org/ 780 | * GitHub page: https://github.com/projectnessie/nessie/ 781 | * Documentation: https://projectnessie.org/nessie-latest/ 782 | * Article from Apr. 2024, by [Ciro Greco](https://www.linkedin.com/in/cirogreco/): 783 | https://towardsdatascience.com/write-audit-publish-for-data-lakes-in-pure-python-no-jvm-25fbd971b17d 784 | * Overview: Transactional catalog for data lakes 785 | + Git-inspired data version control 786 | + Cross-table transactions and visibility 787 | + Open data lake approach, supporting Hive, Spark, Dremio, AWS Athena, etc. 788 | + Works with Apache Iceberg tables 789 | + Run as a Docker image or on Kubernetes 790 | + Fork it on GitHub 791 | 792 | ## Kolle 793 | * GitHub page: https://github.com/metaheed/kolle 794 | 795 | Zero/low-code business model representation automation. 796 | Kolle is for working on data models, data contracts, data quality, 797 | data profiling, and data lineage, rather than on technical tooling or platforms. 798 | 799 | ## Smithy 800 | * Homepage: https://smithy.io/ 801 | * Smithy is a language (IDL) for defining services and SDKs. 802 | * Main contributor: AWS 803 | * See also [Data contracts - Smithy quickstart guide](smithy/quickstart/) 804 | 805 | ## Avro / Schema Registry 806 | * [Schema Registry overview](https://docs.confluent.io/platform/current/schema-registry/index.html) 807 | * [Avro overview](https://avro.apache.org/docs/) 808 | * A minimal record-validation sketch is given below, after the Benthos entry 809 | 810 | ### Support by cloud vendors 811 | * [AWS Glue Schema Registry](https://docs.aws.amazon.com/glue/latest/dg/schema-registry.html) 812 | 813 | ## Protocol buffers (Protobuf) 814 | * [Protobuf homepage](https://protobuf.dev/) 815 | * Main contributor: Google 816 | 817 | ## Buz 818 | * [Buz homepage](https://buz.dev) 819 | * [GitHub - Buz](https://github.com/silverton-io/buz) 820 | * Overview: Buz is a system for collecting events from various sources, 821 | validating data quality, and delivering them to where they need to be. 822 | 823 | ## Benthos 824 | * [Benthos homepage](https://www.benthos.dev/) 825 | * [GitHub - Benthos](https://github.com/benthosdev/benthos) 826 | * Overview: Benthos is a high performance and resilient stream processor, 827 | able to connect various sources and sinks in a range of brokering patterns 828 | and perform hydration, enrichments, transformations and filters on payloads.
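As referenced from the Avro / Schema Registry entry above, the sketch below shows minimal record validation against an Avro schema, assuming the `fastavro` Python package; the `FlightRoute` schema is hypothetical (the same invented entity as in the sketch after the Definitions section). A schema registry would additionally version such schemas and check compatibility between versions.

```python
# Minimal sketch, assuming the fastavro package; the schema is hypothetical.
from fastavro.validation import validate, ValidationError

FLIGHT_ROUTE_SCHEMA = {
    "type": "record",
    "name": "FlightRoute",
    "fields": [
        {"name": "airline_code", "type": "string"},
        {"name": "origin", "type": "string"},
        {"name": "destination", "type": "string"},
        {"name": "weekly_frequency", "type": "int"},
    ],
}

good = {"airline_code": "AF", "origin": "NCE", "destination": "CDG", "weekly_frequency": 35}
validate(good, FLIGHT_ROUTE_SCHEMA)  # returns True: the record honors the contract

bad = {"airline_code": "AF", "origin": "NCE", "weekly_frequency": "often"}
try:
    validate(bad, FLIGHT_ROUTE_SCHEMA)
except ValidationError as error:
    print(error)  # the missing field and the wrong type are both rejected
```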
828 | 829 | ## Memphis 830 | * [Memphis homepage](https://memphis.dev) 831 | * [GitHub - Memphis](https://github.com/memphisdev/memphis) 832 | * Overview: A simple, robust, and durable cloud-native message broker wrapped 833 | with an entire ecosystem that enables cost-effective, fast, and 834 | reliable development of modern queue-based use cases. 835 | Memphis enables the building of modern queue-based applications that require 836 | large volumes of streamed and enriched data, modern protocols, zero ops, 837 | rapid development, extreme cost reduction, and a significantly 838 | lower amount of dev time for data-oriented developers and data engineers. 839 | 840 | ## API specifications 841 | * [OpenAPI](https://www.openapis.org/) 842 | * [AsyncAPI spec](https://www.asyncapi.com/docs/reference/specification/v2.6.0) 843 | * [DatastoreAPI](https://dpds.opendatamesh.org/resources/standards/dsapi-spec/) 844 | 845 | ## Schema.org 846 | * Homepage: https://schema.org/ 847 | 848 | # Vendor solutions 849 | 850 | ## DBT 851 | 852 | ## AWS 853 | 854 | ## Google 855 | 856 | ## Collibra 857 | 860 | ## DataMesh Manager 861 | * Home page: https://datamesh-manager.com 862 | 863 | 864 | # Exploration / Proof-of-Concept (PoC) 865 | * [GitHub - Smithy - Quickstart guide](smithy/quickstart/) 866 | -------------------------------------------------------------------------------- /architecture/architecture-latest.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://excalidraw.com", 5 | "elements": [ 6 | { 7 | "type": "rectangle", 8 | "version": 194, 9 | "versionNonce": 1999645924, 10 | "isDeleted": false, 11 | "id": "R6RLCdCAkWGJLZcvUIFG5", 12 | "fillStyle": "hachure", 13 | "strokeWidth": 1, 14 | "strokeStyle": "dotted", 15 | "roughness": 0, 16 | "opacity": 90, 17 | "angle": 0, 18 | "x": 1045.8328170776367, 19 | "y": -1947.0825776826591, 20 | "strokeColor": "#000000", 21 | "backgroundColor": "transparent", 22 | "width": 169.2230224609375, 23 | "height": 210.71142578125, 24 | "seed": 2013700956, 25 | "groupIds": [], 26 | "roundness": { 27 | "type": 3 28 | }, 29 | "boundElements": [ 30 | { 31 | "type": "text", 32 | "id": "tjdn3IMEVCM3vkGvgNgdT" 33 | } 34 | ], 35 | "updated": 1681835624070, 36 | "link": null, 37 | "locked": false 38 | }, 39 | { 40 | "type": "text", 41 | "version": 239, 42 | "versionNonce": 1960870748, 43 | "isDeleted": false, 44 | "id": "tjdn3IMEVCM3vkGvgNgdT", 45 | "fillStyle": "hachure", 46 | "strokeWidth": 1, 47 | "strokeStyle": "dotted", 48 | "roughness": 0, 49 | "opacity": 90, 50 | "angle": 0, 51 | "x": 1059.5243682861328, 52 | "y": -1901.7268647920341, 53 | "strokeColor": "#000000", 54 | "backgroundColor": "transparent", 55 | "width": 141.8399200439453, 56 | "height": 120, 57 | "seed": 260410204, 58 | "groupIds": [], 59 | "roundness": null, 60 | "boundElements": [], 61 | "updated": 1681835624070, 62 | "link": null, 63 | "locked": false, 64 | "fontSize": 16, 65 | "fontFamily": 1, 66 | "text": "Data contracts\nas code\n--\nOpenDataMesh \n(ODM), Protobuf, \nSmithy, Avro, dbt", 67 | "textAlign": "center", 68 | "verticalAlign": "middle", 69 | "containerId": "R6RLCdCAkWGJLZcvUIFG5", 70 | "originalText": "Data contracts\nas code\n--\nOpenDataMesh (ODM), Protobuf, Smithy, Avro, dbt", 71 | "lineHeight": 1.25, 72 | "baseline": 114 73 | }, 74 | { 75 | "type": "ellipse", 76 | "version": 278, 77 | "versionNonce": 2045391716, 78 | "isDeleted": false, 79 | "id":
"vhlYJ_xxo9G8rICFYqgXX", 80 | "fillStyle": "solid", 81 | "strokeWidth": 1, 82 | "strokeStyle": "solid", 83 | "roughness": 1, 84 | "opacity": 100, 85 | "angle": 0, 86 | "x": 803.3993983361962, 87 | "y": -1713.267854985719, 88 | "strokeColor": "#000000", 89 | "backgroundColor": "#ced4da", 90 | "width": 155.54253472222206, 91 | "height": 37.11371527777783, 92 | "seed": 896850279, 93 | "groupIds": [ 94 | "fGsf0DPm0OG4GZgSfq9aR" 95 | ], 96 | "roundness": null, 97 | "boundElements": [], 98 | "updated": 1681835592317, 99 | "link": null, 100 | "locked": false 101 | }, 102 | { 103 | "type": "line", 104 | "version": 360, 105 | "versionNonce": 395322588, 106 | "isDeleted": false, 107 | "id": "82VsMeu4AQlumZoORA-ZY", 108 | "fillStyle": "solid", 109 | "strokeWidth": 1, 110 | "strokeStyle": "solid", 111 | "roughness": 1, 112 | "opacity": 100, 113 | "angle": 0, 114 | "x": 804.3455788917503, 115 | "y": -1857.2088272079413, 116 | "strokeColor": "#ced4da", 117 | "backgroundColor": "#ced4da", 118 | "width": 157.8862847222224, 119 | "height": 163.75, 120 | "seed": 211752585, 121 | "groupIds": [ 122 | "fGsf0DPm0OG4GZgSfq9aR" 123 | ], 124 | "roundness": null, 125 | "boundElements": [], 126 | "updated": 1681835592318, 127 | "link": null, 128 | "locked": false, 129 | "startBinding": null, 130 | "endBinding": null, 131 | "lastCommittedPoint": null, 132 | "startArrowhead": null, 133 | "endArrowhead": null, 134 | "points": [ 135 | [ 136 | 0, 137 | 0 138 | ], 139 | [ 140 | -0.24305555555567082, 141 | 162.33506944444446 142 | ], 143 | [ 144 | 157.64322916666674, 145 | 162.33940972222229 146 | ], 147 | [ 148 | 157.0355902777776, 149 | -1.4105902777777146 150 | ], 151 | [ 152 | 0, 153 | 0 154 | ] 155 | ] 156 | }, 157 | { 158 | "type": "ellipse", 159 | "version": 283, 160 | "versionNonce": 76949220, 161 | "isDeleted": false, 162 | "id": "ZqbGiu7aqb3XCpSoSls3V", 163 | "fillStyle": "solid", 164 | "strokeWidth": 1, 165 | "strokeStyle": "solid", 166 | "roughness": 1, 167 | "opacity": 100, 168 | "angle": 0, 169 | "x": 804.0417594473073, 170 | "y": -1876.68365359683, 171 | "strokeColor": "#000000", 172 | "backgroundColor": "#f6f7f7", 173 | "width": 155.54253472222206, 174 | "height": 37.11371527777783, 175 | "seed": 1752534633, 176 | "groupIds": [ 177 | "fGsf0DPm0OG4GZgSfq9aR" 178 | ], 179 | "roundness": null, 180 | "boundElements": [], 181 | "updated": 1681835592318, 182 | "link": null, 183 | "locked": false 184 | }, 185 | { 186 | "type": "line", 187 | "version": 127, 188 | "versionNonce": 1416571228, 189 | "isDeleted": false, 190 | "id": "BDBFYRka5f1cr_edDkQ0r", 191 | "fillStyle": "solid", 192 | "strokeWidth": 1, 193 | "strokeStyle": "solid", 194 | "roughness": 1, 195 | "opacity": 100, 196 | "angle": 0, 197 | "x": 804.5582525028614, 198 | "y": -1856.9093480412744, 199 | "strokeColor": "#000000", 200 | "backgroundColor": "#ced4da", 201 | "width": 0.6553819444445708, 202 | "height": 165.72916666666669, 203 | "seed": 2127186377, 204 | "groupIds": [ 205 | "fGsf0DPm0OG4GZgSfq9aR" 206 | ], 207 | "roundness": null, 208 | "boundElements": [], 209 | "updated": 1681835592318, 210 | "link": null, 211 | "locked": false, 212 | "startBinding": null, 213 | "endBinding": null, 214 | "lastCommittedPoint": null, 215 | "startArrowhead": null, 216 | "endArrowhead": null, 217 | "points": [ 218 | [ 219 | 0, 220 | 0 221 | ], 222 | [ 223 | -0.6553819444445708, 224 | 165.72916666666669 225 | ] 226 | ] 227 | }, 228 | { 229 | "type": "line", 230 | "version": 113, 231 | "versionNonce": 1282910820, 232 | "isDeleted": false, 233 | "id": 
"FaSiVy3hE-gl32hLhGq59", 234 | "fillStyle": "solid", 235 | "strokeWidth": 1, 236 | "strokeStyle": "solid", 237 | "roughness": 1, 238 | "opacity": 100, 239 | "angle": 0, 240 | "x": 962.2231830584168, 241 | "y": -1857.877229985719, 242 | "strokeColor": "#000000", 243 | "backgroundColor": "#ced4da", 244 | "width": 1.3454861111110858, 245 | "height": 164.96093749999994, 246 | "seed": 634996903, 247 | "groupIds": [ 248 | "fGsf0DPm0OG4GZgSfq9aR" 249 | ], 250 | "roundness": null, 251 | "boundElements": [], 252 | "updated": 1681835592318, 253 | "link": null, 254 | "locked": false, 255 | "startBinding": null, 256 | "endBinding": null, 257 | "lastCommittedPoint": null, 258 | "startArrowhead": null, 259 | "endArrowhead": null, 260 | "points": [ 261 | [ 262 | 0, 263 | 0 264 | ], 265 | [ 266 | -1.3454861111110858, 267 | 164.96093749999994 268 | ] 269 | ] 270 | }, 271 | { 272 | "type": "text", 273 | "version": 142, 274 | "versionNonce": 771225188, 275 | "isDeleted": false, 276 | "id": "TwCM5lWbJvTozKGGQi9qy", 277 | "fillStyle": "hachure", 278 | "strokeWidth": 1, 279 | "strokeStyle": "dotted", 280 | "roughness": 0, 281 | "opacity": 90, 282 | "angle": 0, 283 | "x": 822.8473663330078, 284 | "y": -1794.4074220430107, 285 | "strokeColor": "#000000", 286 | "backgroundColor": "transparent", 287 | "width": 120.51191711425781, 288 | "height": 60, 289 | "seed": 622384860, 290 | "groupIds": [], 291 | "roundness": null, 292 | "boundElements": [], 293 | "updated": 1681838615184, 294 | "link": null, 295 | "locked": false, 296 | "fontSize": 16, 297 | "fontFamily": 1, 298 | "text": "Versioned store\n--\nGit repository", 299 | "textAlign": "center", 300 | "verticalAlign": "top", 301 | "containerId": null, 302 | "originalText": "Versioned store\n--\nGit repository", 303 | "lineHeight": 1.25, 304 | "baseline": 54 305 | }, 306 | { 307 | "type": "line", 308 | "version": 72, 309 | "versionNonce": 653100380, 310 | "isDeleted": false, 311 | "id": "6yoy_chssxcNX6o4fuwmQ", 312 | "fillStyle": "hachure", 313 | "strokeWidth": 1, 314 | "strokeStyle": "dotted", 315 | "roughness": 0, 316 | "opacity": 90, 317 | "angle": 0, 318 | "x": 1045.6968307495117, 319 | "y": -1848.6949586641044, 320 | "strokeColor": "#000000", 321 | "backgroundColor": "transparent", 322 | "width": 83.60516357421875, 323 | "height": 62.147674560546875, 324 | "seed": 702607460, 325 | "groupIds": [], 326 | "roundness": { 327 | "type": 2 328 | }, 329 | "boundElements": [], 330 | "updated": 1681835632189, 331 | "link": null, 332 | "locked": false, 333 | "startBinding": null, 334 | "endBinding": null, 335 | "lastCommittedPoint": null, 336 | "startArrowhead": null, 337 | "endArrowhead": null, 338 | "points": [ 339 | [ 340 | 0, 341 | 0 342 | ], 343 | [ 344 | -83.60516357421875, 345 | 62.147674560546875 346 | ] 347 | ] 348 | }, 349 | { 350 | "type": "rectangle", 351 | "version": 80, 352 | "versionNonce": 1344086236, 353 | "isDeleted": false, 354 | "id": "Dn5aYHhSAxez792C40uBy", 355 | "fillStyle": "hachure", 356 | "strokeWidth": 1, 357 | "strokeStyle": "dotted", 358 | "roughness": 0, 359 | "opacity": 90, 360 | "angle": 0, 361 | "x": 1042.8771286010742, 362 | "y": -1679.950207687542, 363 | "strokeColor": "#000000", 364 | "backgroundColor": "transparent", 365 | "width": 184.03125, 366 | "height": 227.73236083984375, 367 | "seed": 842375652, 368 | "groupIds": [], 369 | "roundness": { 370 | "type": 3 371 | }, 372 | "boundElements": [], 373 | "updated": 1681835673408, 374 | "link": null, 375 | "locked": false 376 | }, 377 | { 378 | "type": "text", 379 | "version": 76, 380 | 
"versionNonce": 1949884252, 381 | "isDeleted": false, 382 | "id": "6hvF0PIu2HjQVwj1Nhr0k", 383 | "fillStyle": "hachure", 384 | "strokeWidth": 1, 385 | "strokeStyle": "dotted", 386 | "roughness": 0, 387 | "opacity": 90, 388 | "angle": 0, 389 | "x": 1093.1760864257812, 390 | "y": -1649.7967653047294, 391 | "strokeColor": "#000000", 392 | "backgroundColor": "transparent", 393 | "width": 74.92796325683594, 394 | "height": 60, 395 | "seed": 1752185820, 396 | "groupIds": [], 397 | "roundness": null, 398 | "boundElements": [], 399 | "updated": 1681835676958, 400 | "link": null, 401 | "locked": false, 402 | "fontSize": 16, 403 | "fontFamily": 1, 404 | "text": "Tooling\n--\nSchemata", 405 | "textAlign": "center", 406 | "verticalAlign": "top", 407 | "containerId": null, 408 | "originalText": "Tooling\n--\nSchemata", 409 | "lineHeight": 1.25, 410 | "baseline": 54 411 | }, 412 | { 413 | "type": "rectangle", 414 | "version": 182, 415 | "versionNonce": 1369745756, 416 | "isDeleted": false, 417 | "id": "wx5fvQsPt4gAmEU5HIdOr", 418 | "fillStyle": "hachure", 419 | "strokeWidth": 1, 420 | "strokeStyle": "dotted", 421 | "roughness": 0, 422 | "opacity": 90, 423 | "angle": 0, 424 | "x": 1414.9919967651367, 425 | "y": -1664.986615158245, 426 | "strokeColor": "#000000", 427 | "backgroundColor": "transparent", 428 | "width": 119.1612548828125, 429 | "height": 40.67926025390625, 430 | "seed": 801920100, 431 | "groupIds": [], 432 | "roundness": { 433 | "type": 3 434 | }, 435 | "boundElements": [ 436 | { 437 | "type": "text", 438 | "id": "EJnWGKpUirRnESrLIDU1c" 439 | }, 440 | { 441 | "id": "zu3uUTh_HpgrLGJdYEtf6", 442 | "type": "arrow" 443 | }, 444 | { 445 | "id": "vk3Vz9QB9LkkJJ2mAG3h0", 446 | "type": "arrow" 447 | }, 448 | { 449 | "id": "Jh_8R5j-akJIUBSq2FIp1", 450 | "type": "arrow" 451 | } 452 | ], 453 | "updated": 1681838308541, 454 | "link": null, 455 | "locked": false 456 | }, 457 | { 458 | "type": "text", 459 | "version": 116, 460 | "versionNonce": 123349604, 461 | "isDeleted": false, 462 | "id": "EJnWGKpUirRnESrLIDU1c", 463 | "fillStyle": "hachure", 464 | "strokeWidth": 1, 465 | "strokeStyle": "dotted", 466 | "roughness": 0, 467 | "opacity": 90, 468 | "angle": 0, 469 | "x": 1439.9646530151367, 470 | "y": -1654.646985031292, 471 | "strokeColor": "#000000", 472 | "backgroundColor": "transparent", 473 | "width": 69.2159423828125, 474 | "height": 20, 475 | "seed": 880498140, 476 | "groupIds": [], 477 | "roundness": null, 478 | "boundElements": [], 479 | "updated": 1681838308541, 480 | "link": null, 481 | "locked": false, 482 | "fontSize": 16, 483 | "fontFamily": 1, 484 | "text": "Protobuf", 485 | "textAlign": "center", 486 | "verticalAlign": "middle", 487 | "containerId": "wx5fvQsPt4gAmEU5HIdOr", 488 | "originalText": "Protobuf", 489 | "lineHeight": 1.25, 490 | "baseline": 14 491 | }, 492 | { 493 | "type": "rectangle", 494 | "version": 129, 495 | "versionNonce": 371091676, 496 | "isDeleted": false, 497 | "id": "qBYazSMOGCFUZTUGs8WUn", 498 | "fillStyle": "hachure", 499 | "strokeWidth": 1, 500 | "strokeStyle": "dotted", 501 | "roughness": 0, 502 | "opacity": 90, 503 | "angle": 0, 504 | "x": 1260.9375534057617, 505 | "y": -1559.9214601289482, 506 | "strokeColor": "#000000", 507 | "backgroundColor": "transparent", 508 | "width": 144, 509 | "height": 50, 510 | "seed": 78534500, 511 | "groupIds": [], 512 | "roundness": { 513 | "type": 3 514 | }, 515 | "boundElements": [ 516 | { 517 | "type": "text", 518 | "id": "3Ddke6M6guYfoQKgG9UaN" 519 | } 520 | ], 521 | "updated": 1681835766065, 522 | "link": null, 523 | "locked": 
false 524 | }, 525 | { 526 | "type": "text", 527 | "version": 89, 528 | "versionNonce": 805184228, 529 | "isDeleted": false, 530 | "id": "3Ddke6M6guYfoQKgG9UaN", 531 | "fillStyle": "hachure", 532 | "strokeWidth": 1, 533 | "strokeStyle": "dotted", 534 | "roughness": 0, 535 | "opacity": 90, 536 | "angle": 0, 537 | "x": 1274.8415832519531, 538 | "y": -1544.9214601289482, 539 | "strokeColor": "#000000", 540 | "backgroundColor": "transparent", 541 | "width": 116.19194030761719, 542 | "height": 20, 543 | "seed": 2014237156, 544 | "groupIds": [], 545 | "roundness": null, 546 | "boundElements": [], 547 | "updated": 1681835766065, 548 | "link": null, 549 | "locked": false, 550 | "fontSize": 16, 551 | "fontFamily": 1, 552 | "text": "OpenDataMesh", 553 | "textAlign": "center", 554 | "verticalAlign": "middle", 555 | "containerId": "qBYazSMOGCFUZTUGs8WUn", 556 | "originalText": "OpenDataMesh", 557 | "lineHeight": 1.25, 558 | "baseline": 14 559 | }, 560 | { 561 | "type": "arrow", 562 | "version": 162, 563 | "versionNonce": 1384062428, 564 | "isDeleted": false, 565 | "id": "zu3uUTh_HpgrLGJdYEtf6", 566 | "fillStyle": "hachure", 567 | "strokeWidth": 1, 568 | "strokeStyle": "dotted", 569 | "roughness": 0, 570 | "opacity": 90, 571 | "angle": 0, 572 | "x": 1428.567766751262, 573 | "y": -1620.999798751995, 574 | "strokeColor": "#000000", 575 | "backgroundColor": "transparent", 576 | "width": 96.85738619706285, 577 | "height": 63.299774169921875, 578 | "seed": 1580188892, 579 | "groupIds": [], 580 | "roundness": { 581 | "type": 2 582 | }, 583 | "boundElements": [], 584 | "updated": 1681838308541, 585 | "link": null, 586 | "locked": false, 587 | "startBinding": { 588 | "elementId": "wx5fvQsPt4gAmEU5HIdOr", 589 | "focus": 0.10828079799569044, 590 | "gap": 3.30755615234375 591 | }, 592 | "endBinding": null, 593 | "lastCommittedPoint": null, 594 | "startArrowhead": null, 595 | "endArrowhead": "triangle", 596 | "points": [ 597 | [ 598 | 0, 599 | 0 600 | ], 601 | [ 602 | -96.85738619706285, 603 | 63.299774169921875 604 | ] 605 | ] 606 | }, 607 | { 608 | "type": "arrow", 609 | "version": 145, 610 | "versionNonce": 1861172708, 611 | "isDeleted": false, 612 | "id": "vk3Vz9QB9LkkJJ2mAG3h0", 613 | "fillStyle": "hachure", 614 | "strokeWidth": 1, 615 | "strokeStyle": "dotted", 616 | "roughness": 0, 617 | "opacity": 90, 618 | "angle": 0, 619 | "x": 1456.3887667062784, 620 | "y": -1617.4171571504326, 621 | "strokeColor": "#000000", 622 | "backgroundColor": "transparent", 623 | "width": 38.903611959193995, 624 | "height": 61.609100341796875, 625 | "seed": 868632156, 626 | "groupIds": [], 627 | "roundness": { 628 | "type": 2 629 | }, 630 | "boundElements": [], 631 | "updated": 1681838308541, 632 | "link": null, 633 | "locked": false, 634 | "startBinding": { 635 | "elementId": "wx5fvQsPt4gAmEU5HIdOr", 636 | "focus": 0.4884878357051805, 637 | "gap": 6.89019775390625 638 | }, 639 | "endBinding": null, 640 | "lastCommittedPoint": null, 641 | "startArrowhead": null, 642 | "endArrowhead": "triangle", 643 | "points": [ 644 | [ 645 | 0, 646 | 0 647 | ], 648 | [ 649 | 38.903611959193995, 650 | 61.609100341796875 651 | ] 652 | ] 653 | }, 654 | { 655 | "type": "rectangle", 656 | "version": 177, 657 | "versionNonce": 812868060, 658 | "isDeleted": false, 659 | "id": "_OJnFfTHeL31iIg9Aia-v", 660 | "fillStyle": "hachure", 661 | "strokeWidth": 1, 662 | "strokeStyle": "dotted", 663 | "roughness": 0, 664 | "opacity": 90, 665 | "angle": 0, 666 | "x": 1426.6882858276367, 667 | "y": -1556.7289552461357, 668 | "strokeColor": "#000000", 669 | 
"backgroundColor": "transparent", 670 | "width": 144, 671 | "height": 50, 672 | "seed": 78534500, 673 | "groupIds": [], 674 | "roundness": { 675 | "type": 3 676 | }, 677 | "boundElements": [ 678 | { 679 | "type": "text", 680 | "id": "_ZV2D_CUSL0lghaz1iJ7D" 681 | }, 682 | { 683 | "id": "Jh_8R5j-akJIUBSq2FIp1", 684 | "type": "arrow" 685 | } 686 | ], 687 | "updated": 1681838304453, 688 | "link": null, 689 | "locked": false 690 | }, 691 | { 692 | "type": "text", 693 | "version": 140, 694 | "versionNonce": 822778716, 695 | "isDeleted": false, 696 | "id": "_ZV2D_CUSL0lghaz1iJ7D", 697 | "fillStyle": "hachure", 698 | "strokeWidth": 1, 699 | "strokeStyle": "dotted", 700 | "roughness": 0, 701 | "opacity": 90, 702 | "angle": 0, 703 | "x": 1474.7843017578125, 704 | "y": -1541.7289552461357, 705 | "strokeColor": "#000000", 706 | "backgroundColor": "transparent", 707 | "width": 47.80796813964844, 708 | "height": 20, 709 | "seed": 2014237156, 710 | "groupIds": [], 711 | "roundness": null, 712 | "boundElements": [], 713 | "updated": 1681835984526, 714 | "link": null, 715 | "locked": false, 716 | "fontSize": 16, 717 | "fontFamily": 1, 718 | "text": "Smithy", 719 | "textAlign": "center", 720 | "verticalAlign": "middle", 721 | "containerId": "_OJnFfTHeL31iIg9Aia-v", 722 | "originalText": "Smithy", 723 | "lineHeight": 1.25, 724 | "baseline": 14 725 | }, 726 | { 727 | "type": "rectangle", 728 | "version": 32, 729 | "versionNonce": 1772416228, 730 | "isDeleted": false, 731 | "id": "6TaJzFHnvGI8IKYzoKRrC", 732 | "fillStyle": "hachure", 733 | "strokeWidth": 1, 734 | "strokeStyle": "dotted", 735 | "roughness": 0, 736 | "opacity": 90, 737 | "angle": 0, 738 | "x": 1599.8814010620117, 739 | "y": -1559.4686403047294, 740 | "strokeColor": "#000000", 741 | "backgroundColor": "transparent", 742 | "width": 111.9368896484375, 743 | "height": 54.54931640625, 744 | "seed": 1088818404, 745 | "groupIds": [], 746 | "roundness": { 747 | "type": 3 748 | }, 749 | "boundElements": [ 750 | { 751 | "type": "text", 752 | "id": "bM6d6QGfsMznUGbtXhPgP" 753 | }, 754 | { 755 | "id": "Jh_8R5j-akJIUBSq2FIp1", 756 | "type": "arrow" 757 | } 758 | ], 759 | "updated": 1681838306470, 760 | "link": null, 761 | "locked": false 762 | }, 763 | { 764 | "type": "text", 765 | "version": 22, 766 | "versionNonce": 1517755236, 767 | "isDeleted": false, 768 | "id": "bM6d6QGfsMznUGbtXhPgP", 769 | "fillStyle": "hachure", 770 | "strokeWidth": 1, 771 | "strokeStyle": "dotted", 772 | "roughness": 0, 773 | "opacity": 90, 774 | "angle": 0, 775 | "x": 1604.9858779907227, 776 | "y": -1552.1939821016044, 777 | "strokeColor": "#000000", 778 | "backgroundColor": "transparent", 779 | "width": 101.72793579101562, 780 | "height": 40, 781 | "seed": 1321059556, 782 | "groupIds": [], 783 | "roundness": null, 784 | "boundElements": [], 785 | "updated": 1681838296406, 786 | "link": null, 787 | "locked": false, 788 | "fontSize": 16, 789 | "fontFamily": 1, 790 | "text": "Great\nExpectations", 791 | "textAlign": "center", 792 | "verticalAlign": "middle", 793 | "containerId": "6TaJzFHnvGI8IKYzoKRrC", 794 | "originalText": "Great\nExpectations", 795 | "lineHeight": 1.25, 796 | "baseline": 34 797 | }, 798 | { 799 | "type": "arrow", 800 | "version": 229, 801 | "versionNonce": 1502994788, 802 | "isDeleted": false, 803 | "id": "Jh_8R5j-akJIUBSq2FIp1", 804 | "fillStyle": "hachure", 805 | "strokeWidth": 1, 806 | "strokeStyle": "dotted", 807 | "roughness": 0, 808 | "opacity": 90, 809 | "angle": 0, 810 | "x": 1485.5741746678514, 811 | "y": -1622.3482942353935, 812 | "strokeColor": 
"#000000", 813 | "backgroundColor": "transparent", 814 | "width": 177.34656229814618, 815 | "height": 59.185638427734375, 816 | "seed": 868632156, 817 | "groupIds": [], 818 | "roundness": { 819 | "type": 2 820 | }, 821 | "boundElements": [], 822 | "updated": 1681838308542, 823 | "link": null, 824 | "locked": false, 825 | "startBinding": { 826 | "elementId": "wx5fvQsPt4gAmEU5HIdOr", 827 | "focus": 0.46248307554124507, 828 | "gap": 1.9590606689453125 829 | }, 830 | "endBinding": { 831 | "elementId": "6TaJzFHnvGI8IKYzoKRrC", 832 | "focus": 0.7252732211905771, 833 | "gap": 3.6940155029296875 834 | }, 835 | "lastCommittedPoint": null, 836 | "startArrowhead": null, 837 | "endArrowhead": "triangle", 838 | "points": [ 839 | [ 840 | 0, 841 | 0 842 | ], 843 | [ 844 | 177.34656229814618, 845 | 59.185638427734375 846 | ] 847 | ] 848 | } 849 | ], 850 | "appState": { 851 | "gridSize": null, 852 | "viewBackgroundColor": "#ffffff" 853 | }, 854 | "files": {} 855 | } -------------------------------------------------------------------------------- /community/README.md: -------------------------------------------------------------------------------- 1 | Data Contracts - Community management 2 | ===================================== 3 | 4 | # Overview 5 | [This page](https://github.com/data-engineering-helpers/data-contracts/blob/main/community/README.md) 6 | helps to manage the community around the 7 | [Data contracts initiative](https://github.com/data-engineering-helpers/data-contracts). 8 | 9 | Data Contracts have the potential to reshape the organization in data teams 10 | (_e.g._, I would not be surprised to see some departments named 11 | "data contracts and governance" rather than just governance in the future). 12 | 13 | This initiative aims at contributing to an open ecosystem (_i.e._, 14 | open standards, open source tools and open knowledge sharing) 15 | for the data contracts. 16 | 17 | If you are interested in contributing, we can add yourself, and/or someone 18 | from your team, as contributors of the 19 | https://github.com/data-engineering-helpers GitHub public organization 20 | (that way, you will will become contributors of the 21 | https://github.com/data-engineering-helpers/data-contracts repository, 22 | among others). 23 | 24 | There have just been a few online meetings, mainly to see each other 25 | for real and present ourselves. 26 | 27 | You are welcome there. There is of course no obligation. And there is no 28 | commitment either, from any of the parties. Our goal is just to exchange 29 | pieces of idea, of information, of knowledge, tips and tricks, so as 30 | to contribute to establish a de facto open standard around Data Contracts, 31 | avoiding proprietary lock-ins. 32 | 33 | So far, we plan to use Open Data Mesh (ODM) as the open specification 34 | for Data Contracts (the main contributor being Andrea) and Schemata 35 | (Ananth being the main contributor) for the tooling/Swiss-knife. 36 | A very early draft of the specifications of what we want to achieve 37 | is https://github.com/data-engineering-helpers/data-contracts#specifications 38 | 39 | # Google Space 40 | * Reference: 41 | [Google Space/Chat - Data Contracts Community](https://mail.google.com/chat#chat/space/AAAA15chWp0) 42 | (if you have no access to that Google Space, a community member has to invite 43 | you there) 44 | 45 | The above-referenced Google Space/Chat is dedicated to knowledge sharing 46 | and work planning for that open community (with the persons detailed 47 | in the section below). 
48 | 49 | >**Note** 50 | This initiative is not tied in any way to any cloud provider or any other 51 | vendor. A few vendor-related members are interested in participating silently 52 | (from a vendor perspective), that is, they absolutely do not interfere 53 | in the discussions with vendor-specific solutions. 54 | 55 | # Profiles 56 | The following list may not be exhaustive, and there is no specific 57 | sorting order (except maybe something approaching a timeline of joining 58 | this community). If you would like to join, contact us (_e.g._, Denis Arnaud) 59 | and we will invite you. 60 | 61 | * Denis Arnaud 62 | ([Denis Arnaud on LinkedIn](https://www.linkedin.com/in/da115/), 63 | [Denis Arnaud on Medium](https://medium.com/@denis_arnaud), 64 | [Denis Arnaud on GitHub](https://github.com/da115115)) 65 | + Principal Engineer at [Decathlon Digital](https://digital.decathlon.net/) 66 | + Greater Lille, Nord, France 67 | 68 | * Ananth Packkildurai 69 | ([Ananth Packkildurai on LinkedIn](https://www.linkedin.com/in/ananthdurai/), 70 | [Ananth Packkildurai on Substack](https://substack.com/profile/3520227-ananth-packkildurai), 71 | [Ananth Packkildurai on GitHub](https://github.com/ananthdurai)) 72 | + Principal Engineer at [Mural](https://mural.co/) 73 | + Richmond, Virginia (VA), USA 74 | 75 | * Andrea Gioia 76 | ([Andrea Gioia on LinkedIn](https://www.linkedin.com/in/andreagioia/), 77 | [Andrea Gioia on Medium](https://medium.com/@andrea_gioia), 78 | [Andrea Gioia on GitHub](https://github.com/andrea-gioia)) 79 | + Partner at [Quantyca](http://www.quantyca.it/) 80 | + Milan, Lombardy, Italy 81 | 82 | * Joël Farvault 83 | ([Joël Farvault on LinkedIn](https://www.linkedin.com/in/joel-farvault-4332331/)) 84 | + Principal Specialist Solution Architect Analytics at [AWS](https://aws.com) 85 | 86 | * Charles Verleyen 87 | ([Charles Verleyen on LinkedIn](http://linkedin.com/in/charlesverleyen), 88 | [Charles Verleyen on Medium](https://medium.com/@charles.xavier.verleyen), 89 | [Charles Verleyen on GitHub](https://github.com/charles-astrafy)) 90 | + CEO and Lead Architect at [Astrafy](https://astrafy.io/) 91 | 92 | * Łukasz Ściga 93 | ([Łukasz Ściga on LinkedIn](https://www.linkedin.com/in/lukasz-sciga/)) 94 | + Data Engineer at [Astrafy](https://astrafy.io/) 95 | + Lausanne, Vaud, Switzerland 96 | 97 | * Médéric Hurier 98 | ([Médéric Hurier on LinkedIn](https://www.linkedin.com/in/fmind-dev/), 99 | [Médéric Hurier on Medium](https://fmind.medium.com/), 100 | [Médéric Hurier on GitHub](https://github.com/fmind/)) 101 | + Lead MLOps Engineer at [Decathlon Digital](https://digital.decathlon.net/) 102 | + Luxembourg, Luxembourg 103 | 104 | * Antoine Chenon 105 | ([Antoine Chenon on LinkedIn](https://www.linkedin.com/in/antoine-chenon-a26371108/), 106 | [Antoine Chenon on GitHub](https://github.com/FreddieMercuryDKT)) 107 | + Data Engineer at [Decathlon Digital](https://digital.decathlon.net/) 108 | + Greater Nantes, France 109 | 110 | * Samy Doreau 111 | ([Samy Doreau on LinkedIn](), 112 | [Samy Doreau on GitHub](https://github.com/iclarke)) 113 | + Data Engineering Consultant at 114 | [Infinite Lambda](https://infinitelambda.com/) 115 | + St Albans, England, United Kingdom 116 | 117 | * Alexander Schober 118 | ([Alexander Schober on LinkedIn](https://www.linkedin.com/in/alexander-schober/)) 119 | + Project Owner - AI Cluster at 120 | [Motius, formerly Siemens AI Lab](https://motius.de/) 121 | + Munich, Bavaria, Germany 122 | 123 | * Endrit Bytyqi 124 | ([Endrit Bytyqi on LinkedIn](https://www.linkedin.com/in/endrit-bytyqi/)) 125 | + Senior Tech Specialist - Data Engineering at 126 | [Motius, formerly Siemens AI Lab](https://motius.de/) 127 | + Munich, Bavaria, Germany 128 | 129 | * Tanya Tylevich 130 | ([Tanya Tylevich on LinkedIn](https://www.linkedin.com/in/tanya-tylevich-9777124/)) 131 | + SVP - Data Engineering at [Goldman Sachs](https://www.goldmansachs.com/) 132 | + West Nyack, New York (NY), USA 133 | 134 | * Benjamin Laot 135 | ([Benjamin Laot on LinkedIn](https://www.linkedin.com/in/benjamin-laot-42a83759/), 136 | [Benjamin Laot on GitHub](https://github.com/BenLaot)) 137 | + Staff Data Engineer at [Decathlon Digital](https://digital.decathlon.net/) 138 | + Greater Lille, Nord, France 139 | 140 | * Simon Auger 141 | ([Simon Auger on LinkedIn](https://www.linkedin.com/in/simon-auger/), 142 | [Simon Auger on GitHub](https://github.com/saugerDecathlon)) 143 | + Senior Data Engineer at [Capgemini Invent](https://www.capgemini.com/about-us/who-we-are/our-brands/capgemini-invent/) 144 | + Greater Paris, Île-de-France, France 145 | 146 | * Sophie Ly 147 | ([Sophie Ly on LinkedIn](https://www.linkedin.com/in/sophie-ly-2a8095110/), 148 | [Sophie Ly on GitHub](https://github.com/dkt-sophie-ly)) 149 | + Data Engineer at [Decathlon Digital](https://digital.decathlon.net/) 150 | + Greater Lille, Nord, France 151 | 152 | * Nicolas Gibaud 153 | ([Nicolas Gibaud on LinkedIn](https://www.linkedin.com/in/nicolas-gibaud-aa929358/), 154 | [Nicolas Gibaud on GitHub](https://github.com/nicolasgibaud)) 155 | + Senior Data Engineer at [Capgemini Invent](https://www.capgemini.com/about-us/who-we-are/our-brands/capgemini-invent/) 156 | + Greater Paris, Île-de-France, France 157 | 158 | * Mohamed Rakza 159 | ([Mohamed Rakza on LinkedIn](https://www.linkedin.com/in/mohamed-rakza-64053a1a/), 160 | [Mohamed Rakza on GitHub](https://github.com/mrakza21)) 161 | + Senior Staff Data Engineer at [Decathlon Digital](https://digital.decathlon.net/) 162 | + Greater Lille, Nord, France 163 | 164 | * Théo Vialard 165 | ([Théo Vialard on LinkedIn](https://www.linkedin.com/in/th%C3%A9o-vialard-093708ab/), 166 | [Théo Vialard on GitHub](https://github.com/tvialard)) 167 | + Senior Data Consultant at [Capgemini Invent](https://www.capgemini.com/about-us/who-we-are/our-brands/capgemini-invent/) 168 | + Greater Paris, Île-de-France, France 169 | 170 | -------------------------------------------------------------------------------- /datacontract.com/.gitignore: -------------------------------------------------------------------------------- 1 | # Data contracts 2 | /datacontract.yaml 3 | /data-contract-flight-route.yaml 4 | /datacontract 5 | # DuckDB 6 | /db.duckdb 7 | # Quality checks 8 | /quality/ 9 | -------------------------------------------------------------------------------- /datacontract.com/README.md: -------------------------------------------------------------------------------- 1 | Data Contracts - Datacontract.com 2 | ================================= 3 | 4 | # Table of Contents (ToC) 5 | * [Data Contracts - Datacontract.com](#data-contracts---datacontractcom) 6 | * [Overview](#overview) 7 | * [References](#references) 8 | * [DuckDB](#duckdb) 9 | * [Soda Core](#soda-core) 10 | * [OpenTravelData (OPTD)](#opentraveldata-optd) 11 | * [Quickstart](#quickstart) 12 | * [Setup](#setup) 13 | * [Create a data contract](#create-a-data-contract) 14 | * [Import a data contract](#import-a-data-contract) 15 | * [Check the compliance of a data contract](#check-the-compliance-of-a-data-contract) 16 | * 
[Edit a data contract in Data Contract Studio](#edit-a-data-contract-in-data-contract-studio) 17 | * [Check the data quality of a table thanks to a data contract](#check-the-data-quality-of-a-table-thanks-to-a-data-contract) 18 | * [Installation](#installation) 19 | * [On MacOS](#on-macos) 20 | * [On Linux](#on-linux) 21 | * [General](#general) 22 | * [Options](#options) 23 | * [DuckDB](#duckdb-1) 24 | * [Soda Core](#soda-core-1) 25 | * [Troubleshooting](#troubleshooting) 26 | * [Soda Core](#soda-core-2) 27 | 28 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go) 29 | 30 | # Overview 31 | [This page](https://github.com/data-engineering-helpers/data-contracts/blob/main/datacontract.com/README.md) 32 | is a deep dive into the [datacontract.com](https://datacontract.com/) ecosystem, 33 | which includes an 34 | [open specification](https://datacontract.com/) and 35 | [some utilities](https://cli.datacontract.com/). 36 | 37 | It is part of the larger 38 | [Data contracts initiative](https://github.com/data-engineering-helpers/data-contracts). 39 | 40 | # References 41 | * Data Contract Specification: https://datacontract.com/ 42 | * Data contract command-line (CLI) utility: 43 | https://cli.datacontract.com/ (GitHub: https://github.com/datacontract/cli) 44 | * Data Contract Studio: https://studio.datacontract.com/ 45 | 46 | ## DuckDB 47 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md) 48 | 49 | ## Soda Core 50 | * Soda Core GitHub repository: https://github.com/sodadata/soda-core 51 | * [Installation of Soda Core](https://github.com/sodadata/soda-core/blob/main/docs/installation.md) 52 | + [Configure Soda Core](https://github.com/sodadata/soda-core/blob/main/docs/configuration.md) 53 | + [Connect Soda to DuckDB](https://docs.soda.io/soda/connect-duckdb.html) 54 | 55 | ## OpenTravelData (OPTD) 56 | * OpenTravelData GitHub repository with latest CSV snapshots: 57 | https://github.com/opentraveldata/opentraveldata/tree/master/opentraveldata/ 58 | * For instance, the sample file used here is about transportation routes: 59 | https://github.com/opentraveldata/opentraveldata/blob/master/opentraveldata/optd_airline_por.csv 60 | - A copy of that CSV file is also available on a public S3 bucket: 61 | [`s3://optd/latest/`](https://s3.console.aws.amazon.com/s3/buckets/optd?region=eu-west-1&tab=objects) 62 | 63 | # Quickstart 64 | 65 | ## Setup 66 | * If not already done so, clone this Git repository and change directory to it: 67 | ```bash 68 | $ mkdir -p ~/dev/infra/data-contracts && \ 69 | git clone https://github.com/data-engineering-helpers/data-contracts.git ~/dev/infra/data-contracts/data-contracts && \ 70 | cd ~/dev/infra/data-contracts/data-contracts/datacontract.com 71 | ``` 72 | 73 | ## Create a data contract 74 | * Generate a new data contract, which will be a collection of 75 | commented samples (which may then be uncommented according to the specific 76 | implementation details): 77 | ```bash 78 | $ datacontract init --file contracts/orders-latest-npii-new.yaml 79 | 📄 data contract written to contracts/orders-latest-npii-new.yaml 80 | ``` 81 | 82 | ## Import a data contract 83 | * Import a data contract from a URL, creating a local copy of it 84 | + From Data Contract Studio: 85 | ```bash 86 | $ datacontract init --from https://studio.datacontract.com/s/c8b27fe3-62dc-4a21-ae41-9471ce7859d7.yaml 87 | 📄 data contract written to datacontract.yaml 88 | ``` 89 | + From some code repository, _e.g._, a Git repository (note that the URL 90 | of the raw YAML file has to be used here; otherwise, the retrieved 91 | file is HTML, not YAML): 92 | ```bash 93 | $ datacontract init --overwrite-file --file data-contract-flight-route.yaml --from https://github.com/data-engineering-helpers/data-contracts/raw/main/datacontract.com/contracts/data-contract-flight-route.yaml 94 | 📄 data contract written to data-contract-flight-route.yaml 95 | ``` 96 | 97 | ## Check the compliance of a data contract 98 | * Check the validity of the data contract: 99 | ```bash 100 | $ datacontract lint --file contracts/data-contract-flight-route.yaml 101 | 🟢 data contract is valid! 102 | ``` 103 | 104 | >**Note** 105 | The schema specification of the 106 | [`contracts/data-contract-flight-route.yaml` data contract](https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route.yaml) 107 | is in its own YAML file, namely 108 | [`contracts/data-contract-flight-route-schema.yaml`](https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route-schema.yaml), 109 | cross-referenced by the data contract in the `schema` section. 110 | That allows using that schema specification with other tools, such as DBT 111 | or Spark. 112 | 113 | ## Edit a data contract in Data Contract Studio 114 | * Open the data contract in Data Contract Studio: 115 | ```bash 116 | $ datacontract open --file contracts/data-contract-flight-route.yaml 117 | 🌐 opening data contract at https://studio.datacontract.com/s/16ff8cbb-7f3f-4ca4-addf-b3a4cbac1500 118 | ``` 119 | 120 | * This results, for the 121 | [`contracts/data-contract-flight-route.yaml` data contract](https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route.yaml), 122 | in 123 | https://studio.datacontract.com/s/16ff8cbb-7f3f-4ca4-addf-b3a4cbac1500 124 | 125 | * The data contract may be edited in the Data Contract Studio directly: 126 | https://studio.datacontract.com/s/16ff8cbb-7f3f-4ca4-addf-b3a4cbac1500/edit 127 | 128 | ## Check the data quality of a table thanks to a data contract 129 | * The data quality check/test is not fully implemented yet. A feature request 130 | has been created for that purpose: 131 | https://github.com/datacontract/cli/issues/2 132 | 133 | >**Note** 134 | The data quality check section is located in its own YAML file, namely 135 | [`contracts/data-contract-flight-route-quality.yaml`](https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route-quality.yaml), 136 | cross-referenced by the data contract in the `quality` section. 137 | That allows using that data quality specification with other tools, 138 | such as Soda or Great Expectations. Below is an example of use with Soda. 139 | Note that this feature (of checking the data quality with external tools) 140 | should be integrated into the `datacontract` CLI at 141 | [some point in the future](https://github.com/datacontract/cli/issues/2).
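* In the meantime, that cross-referenced SodaCL quality file may also be
executed programmatically, for instance from a CI/CD pipeline. Below is a
minimal sketch (an illustration, not part of this repository) relying on the
programmatic `Scan` API of Soda Core; it assumes that `soda-core-duckdb` has
been installed and that the DuckDB database file (`db.duckdb`) has been
initialized (see the [DuckDB sub-section](#duckdb-1) below), with the paths
being relative to this `datacontract.com` directory:
```python
#
# Minimal sketch - a programmatic equivalent of the `soda scan` command
# shown below. Assumptions: soda-core-duckdb is installed and the
# db.duckdb DuckDB database file has been initialized
#
from soda.scan import Scan

scan = Scan()
scan.set_scan_definition_name("data-contract-flight-route")
scan.set_data_source_name("duckdb_local")

# Same data source configuration file as the one used by the SodaCL CLI
scan.add_configuration_yaml_file(file_path="soda-conf.yml")

# The quality section of the data contract, as a SodaCL file
scan.add_sodacl_yaml_file("contracts/data-contract-flight-route-quality.yaml")

# Execute the checks; the exit code is non-zero when checks fail or error out
exit_code = scan.execute()
print(scan.get_logs_text())
scan.assert_no_checks_fail()
```

That sketch mirrors the `soda scan` invocation detailed below; the
`datacontract test` command should provide the same service natively once the
above-mentioned feature request is implemented.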
142 | 143 | * Launch a few data quality checks with Soda Core, with newer versions of the 144 | `datacontract` CLI (which wraps the SodaCL CLI): 145 | + Set up the directory for the quality checks: 146 | ```bash 147 | $ mkdir -p quality 148 | ``` 149 | 150 | + Create or copy the DuckDB database file (`db.duckdb`); see the 151 | [DuckDB sub-section](#duckdb-1) below for how to initialize that DuckDB 152 | database file: 153 | ```bash 154 | $ cp db.duckdb quality/ 155 | ``` 156 | 157 | + Launch the quality checks with the `datacontract` CLI: 158 | ```bash 159 | $ datacontract test --test-options "-d duckdb_local -c quality/soda-conf.yml" --file contracts/data-contract-flight-route.yaml --quality-file contracts/data-contract-flight-route-quality.yaml 160 | [...] 161 | Creating quality directory if needed... 162 | [18:01:15] Soda Core 3.0.51 163 | [18:01:16] Scan summary: 164 | [18:01:16] 2/2 checks PASSED: 165 | [18:01:16] transport_routes in duckdb_local 166 | [18:01:16] row_count between 90000 and 100000 [PASSED] 167 | [18:01:16] invalid_percent(freq) = 0 % [PASSED] 168 | [18:01:16] All is good. No failures. No warnings. No errors. 169 | ``` 170 | 171 | * Launch a few data quality checks with Soda Core, directly with the SodaCL CLI 172 | (without the `datacontract` CLI): 173 | ```bash 174 | $ soda scan -d duckdb_local -c quality/soda-conf.yml contracts/data-contract-flight-route-quality.yaml 175 | [16:16:10] Soda Core 3.0.51 176 | [16:16:11] Scan summary: 177 | [16:16:11] 2/2 checks PASSED: 178 | [16:16:11] transport_routes in duckdb_local 179 | [16:16:11] row_count between 90000 and 100000 [PASSED] 180 | [16:16:11] invalid_percent(freq) = 0 % [PASSED] 181 | [16:16:11] All is good. No failures. No warnings. No errors. 182 | ``` 183 | 184 | # Installation 185 | * Install the `datacontract` CLI (command-line) utility 186 | 187 | ## On MacOS 188 | * With Homebrew: 189 | ```bash 190 | $ brew install datacontract/brew/datacontract 191 | ``` 192 | 193 | ## On Linux 194 | * Specify the platform and architecture of the machine: 195 | ```bash 196 | export platform="$(uname | tr '[:upper:]' '[:lower:]')" 197 | export architecture="$(uname -m|sed 's/x86_/amd/')" 198 | ``` 199 | 200 | * From the [GitHub releases](https://github.com/datacontract/cli/releases) 201 | + Specify the latest version: 202 | ```bash 203 | $ DC_VERSION="0.1.1" 204 | DC_FN="datacontract-v$DC_VERSION-$platform-$architecture.tar.gz" 205 | ``` 206 | + Download the CLI utility: 207 | ```bash 208 | $ mkdir -p /tmp/datacontract 209 | pushd /tmp/datacontract 210 | curl -kL https://github.com/datacontract/cli/releases/download/v$DC_VERSION/$DC_FN -o $DC_FN 211 | ``` 212 | + Install the CLI utility: 213 | ```bash 214 | $ mkdir -p ~/.local/bin 215 | tar zxf $DC_FN && rm -f $DC_FN 216 | mv datacontract ~/.local/bin/ && rm -f LICENSE README.md 217 | popd 218 | ``` 219 | 220 | ## General 221 | * Check the version: 222 | ```bash 223 | $ datacontract --version 224 | datacontract version v0.3.2 225 | ``` 226 | 227 | ## Options 228 | 229 | ### DuckDB 230 | * Install DuckDB: 231 | ```bash 232 | $ python -mpip install -U duckdb 233 | ``` 234 | 235 | * Create, or re-create, the `transport_routes` table in DuckDB: 236 | ```bash 237 | $ duckdb db.duckdb < sql/duckdb-ddl-create-view-from-csv.sql 238 | ls -lFh db.duckdb 239 | -rw-r--r-- 1 user group 268K Sep 27 11:39 db.duckdb 240 | ``` 241 | 242 | * The resulting table may be queried easily 243 | + Schema: 244 | ```bash 245 | $ duckdb db.duckdb "describe transport_routes;" 246 |
┌────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐ 247 | │ column_name │ column_type │ null │ key │ default │ extra │ 248 | │ varchar │ varchar │ varchar │ varchar │ varchar │ int32 │ 249 | ├────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤ 250 | │ transporter_id │ VARCHAR │ YES │ │ │ │ 251 | │ org_por_id │ VARCHAR │ YES │ │ │ │ 252 | │ dst_por_id │ VARCHAR │ YES │ │ │ │ 253 | │ freq │ BIGINT │ YES │ │ │ │ 254 | └────────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘ 255 | ``` 256 | + Number of records: 257 | ```bash 258 | $ duckdb db.duckdb "select count(*) as nb_recs from transport_routes;" 259 | ┌─────────┐ 260 | │ nb_recs │ 261 | │ int64 │ 262 | ├─────────┤ 263 | │ 91081 │ 264 | └─────────┘ 265 | ``` 266 | 267 | ### Soda Core 268 | * Install Soda Core with DuckDB: 269 | ```bash 270 | $ python -mpip install -U soda-core-duckdb 271 | ``` 272 | 273 | * If needed, re-execute the Shell initialization scripts: 274 | ```bash 275 | $ exec bash 276 | ``` 277 | 278 | * Test the connection from Soda to DuckDB: 279 | ```bash 280 | $ soda test-connection -d duckdb_local -c soda-conf.yml -V 281 | [12:05:22] Soda Core 3.0.50 282 | [12:05:22] Reading configuration file "soda-conf.yml" 283 | Successfully connected to 'duckdb_local'. 284 | [12:05:22] Query duckdb_local.test-connection: 285 | SELECT 1 286 | Connection 'duckdb_local' is valid. 287 | ``` 288 | 289 | # Troubleshooting 290 | 291 | ## Soda Core 292 | * Reference: https://github.com/sodadata/soda-core 293 | -------------------------------------------------------------------------------- /datacontract.com/contracts/data-contract-flight-route-quality.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # File: https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route-quality.yaml 3 | # 4 | 5 | checks for transport_routes: 6 | - row_count between 90000 and 100000 7 | - invalid_percent(freq) = 0 %: 8 | valid format: integer 9 | -------------------------------------------------------------------------------- /datacontract.com/contracts/data-contract-flight-route-schema.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # File: https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route-schema.yaml 3 | # 4 | 5 | transport_routes: 6 | description: 7 | type: object 8 | properties: 9 | transporter_id: 10 | type: string 11 | description: 2-letter IATA code of the transport provider 12 | org_por_id: 13 | type: string 14 | description: origin POR 15 | dst_por_id: 16 | type: string 17 | description: destination POR 18 | freq: 19 | type: integer 20 | description: frequency / number of flights 21 | -------------------------------------------------------------------------------- /datacontract.com/contracts/data-contract-flight-route.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # File: https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/contracts/data-contract-flight-route.yaml 3 | # 4 | 5 | dataContractSpecification: 0.9.0 6 | id: data-contract-org-example-ks-flight-route-v0-0-1 7 | info: 8 | title: Data contract for transport routes 9 | version: 0.0.1 10 | description: The POR (points of reference) are specified through IATA 3-letter codes and transport providers through IATA 2-letter codes 11 |
owner: OpenTravelData (OPTD) 12 | dataProduct: OPTD Transport Routes 13 | outputPort: urn:outputport:opentraveldata:optd_airline_por_v1 14 | contact: 15 | name: OpenTravelData (OPTD) 16 | url: https://github.com/orgs/opentraveldata/teams/core 17 | email: denis.arnaud_optd@m4x.org 18 | 19 | ### servers 20 | 21 | servers: 22 | production: 23 | type: s3 24 | location: s3://optd/latest/optd_airline_por.csv 25 | 26 | ### terms 27 | 28 | #terms: 29 | # usage: 30 | # limitations: 31 | # billing: 32 | # noticePeriod: 33 | 34 | 35 | ### schema 36 | 37 | schema: 38 | type: json-schema 39 | specification: "$ref: contracts/data-contract-flight-route-schema.yaml" 40 | 41 | ### examples 42 | 43 | examples: 44 | - type: csv 45 | model: transport_routes 46 | data: |- 47 | transporter_id,org_por_id,dst_por_id,freq 48 | "AF","ALG","CDG",1309 49 | "BA","ATL","LHR",325 50 | "IB","AMS","MAD",113 51 | "LH","AMS","FRA",1622 52 | "LH","AMS","MUC",596 53 | "QF","ADL","SYD",2732 54 | 55 | ### quality 56 | 57 | quality: 58 | type: SodaCL 59 | specification: "$ref: contracts/data-contract-flight-route-quality.yaml" 60 | 61 | -------------------------------------------------------------------------------- /datacontract.com/contracts/orders-latest-npii.yaml: -------------------------------------------------------------------------------- 1 | dataContractSpecification: 0.9.0 2 | id: orders-latest-npii 3 | info: 4 | title: Orders Latest NPII 5 | version: 1.0.0 6 | description: Successful customer orders in the webshop. All orders since 2020-01-01. Orders with their line items are in their current state (no history included). PII data is removed. 7 | owner: Checkout Team 8 | dataProduct: orders 9 | outputPort: bigquery_orders_latest_npii_v1 10 | contact: 11 | name: John Doe (Data Product Owner) 12 | email: john.doe@example.com 13 | servers: 14 | production: 15 | type: BigQuery 16 | project: acme_orders_prod 17 | dataset: bigquery_orders_latest_npii_v1 18 | terms: 19 | usage: > 20 | Data can be used for reports, analytics and machine learning use cases. 21 | Order may be linked and joined by other tables 22 | limitations: > 23 | Not suitable for real-time use cases. 24 | Data may not be used to identify individual customers. 25 | Max data processing per day: 10 TiB 26 | billing: 5000 USD per month 27 | noticePeriod: P3M 28 | schema: 29 | type: dbt # the specification format: dbt, bigquery, avro, protobuf, sql, json-schema, custom 30 | specification: # expressed as string or inline yaml or via "$ref: model.yaml" 31 | version: 2 32 | description: The subset of the output port's data model that we agree to use 33 | models: 34 | - name: orders 35 | description: > 36 | One record per order. Includes cancelled and deleted orders. 37 | columns: 38 | - name: order_id 39 | data_type: string 40 | description: Primary key of the orders table 41 | tests: 42 | - unique 43 | - not_null 44 | - name: order_timestamp 45 | data_type: timestamptz 46 | description: The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful. 47 | tests: 48 | - not_null 49 | - name: order_total 50 | data_type: integer 51 | description: "Total amount of the order in the smallest monetary unit (e.g., cents)." 
52 | tests: 53 | - not_null 54 | - name: line_items 55 | description: > 56 | The items that are part of an order 57 | columns: 58 | - name: lines_item_id 59 | data_type: string 60 | description: Primary key of the lines_item_id table 61 | - name: order_id 62 | data_type: string 63 | description: Foreign key to the orders table 64 | tests: 65 | - relationships: 66 | to: ref('orders') 67 | field: order_id 68 | - name: sku 69 | data_type: string 70 | description: The purchased article number 71 | examples: 72 | - type: csv # csv, json, yaml, custom 73 | model: orders 74 | data: |- # expressed as string or inline yaml or via "$ref: data.csv" 75 | order_id,order_timestamp,order_total 76 | "1001","2023-09-09T08:30:00Z",2500 77 | "1002","2023-09-08T15:45:00Z",1800 78 | "1003","2023-09-07T12:15:00Z",3200 79 | "1004","2023-09-06T19:20:00Z",1500 80 | "1005","2023-09-05T10:10:00Z",4200 81 | "1006","2023-09-04T14:55:00Z",2800 82 | "1007","2023-09-03T21:05:00Z",1900 83 | "1008","2023-09-02T17:40:00Z",3600 84 | "1009","2023-09-01T09:25:00Z",3100 85 | "1010","2023-08-31T22:50:00Z",2700 86 | - type: csv 87 | model: line_items 88 | data: |- 89 | lines_item_id,order_id,sku 90 | "1","1001","5901234123457" 91 | "2","1001","4001234567890" 92 | "3","1002","5901234123457" 93 | "4","1002","2001234567893" 94 | "5","1003","4001234567890" 95 | "6","1003","5001234567892" 96 | "7","1004","5901234123457" 97 | "8","1005","2001234567893" 98 | "9","1005","5001234567892" 99 | "10","1005","6001234567891" 100 | quality: 101 | type: SodaCL # data quality check format: SodaCL, montecarlo, custom 102 | specification: # expressed as string or inline yaml or via "$ref: checks.yaml" 103 | checks for orders: 104 | - freshness(order_timestamp) < 24h 105 | - row_count > 500000 106 | - duplicate_count(order_id) = 0 107 | checks for line_items: 108 | - row_count > 500000 109 | 110 | -------------------------------------------------------------------------------- /datacontract.com/data: -------------------------------------------------------------------------------- 1 | ../data -------------------------------------------------------------------------------- /datacontract.com/soda-checks.yml: -------------------------------------------------------------------------------- 1 | # 2 | # File: https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/soda-checks.yml 3 | # 4 | 5 | checks for transport_routes: 6 | - row_count between 90000 and 100000 7 | - invalid_percent(freq) = 0 %: 8 | valid format: integer 9 | -------------------------------------------------------------------------------- /datacontract.com/soda-conf.yml: -------------------------------------------------------------------------------- 1 | # 2 | # File: https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/soda-conf.yml 3 | # 4 | 5 | data_source duckdb_local: 6 | type: duckdb 7 | path: db.duckdb 8 | read_only: true 9 | 10 | -------------------------------------------------------------------------------- /datacontract.com/sql/duckdb-ddl-create-view-from-csv.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- File: https://github.com/data-engineering-helpers/data-contracts/tree/main/datacontract.com/sql/duckdb-ddl-create-view-from-csv.sql 3 | -- 4 | 5 | drop view if exists transport_routes; 6 | drop table if exists transport_routes; 7 | 8 | create table transport_routes as ( 9 | select airline_code as transporter_id, 10 | apt_org as org_por_id, 11 | apt_dst as 
dst_por_id, 12 | flt_freq as freq 13 | from read_csv_auto("data/optd/optd_airline_por.csv", 14 | header=True, 15 | delim="^", 16 | AUTO_DETECT=TRUE) 17 | ); 18 | 19 | -------------------------------------------------------------------------------- /img/data-contracts-producers-and-consumers-2023-05.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-engineering-helpers/data-contracts/eb17ee6123106923ed1c9958e43460162eda0483/img/data-contracts-producers-and-consumers-2023-05.webp -------------------------------------------------------------------------------- /quality/ge/01-example-pyspark-ge-hc.py: -------------------------------------------------------------------------------- 1 | # 2 | # File: https://github.com/data-engineering-helpers/data-contracts/blob/main/quality/ge/01-example-pyspark-ge-hc.py 3 | # 4 | # Inspired by 5 | # https://towardsdatascience.com/data-contracts-the-mesh-glue-c1b533e2a664 6 | # => https://gist.githubusercontent.com/velascoluis/268f73916f22fbaa070f9aba64c948f6/raw/71b85e11e0e7e81be7a665809ee1655e77669594/evaluate_ge_spark.py 7 | # Original author: Luis Velasco 8 | # 9 | import pyspark 10 | from pyspark.sql import SparkSession 11 | import argparse 12 | import json 13 | import logging 14 | from great_expectations.dataset import SparkDFDataset 15 | 16 | 17 | def evaluate_contract( 18 | table_name: str = None, 19 | gcs_warehouse_dir: str = None, 20 | contract_config_str: str = None, 21 | ): 22 | """ 23 | Evaluate a data contract using Great Expectations with PySpark 24 | Parameters 25 | ---------- 26 | table_name: the Iceberg table against which the contract is evaluated 27 | gcs_warehouse_dir: Iceberg GCS data location 28 | contract_config_str: JSON representation of the contract with the following format 29 | data_contract: 30 | version: 1 31 | environment: pro 32 | name: polyexpose 33 | table_schema: 34 | dq_slos: 35 | mismatch_pct_lt: 0.2 36 | nulls_pct_lt: 0.2 37 | field_name: registration_dttm 38 | nulls_allowed: true 39 | required: true 40 | type: TIMESTAMP 41 | values_in: None 42 | 43 | Returns: 44 | -------- 45 | None 46 | """ 47 | config = pyspark.SparkConf().setAll( 48 | [ 49 | ( 50 | "spark.sql.extensions", 51 | "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", 52 | ), 53 | ( 54 | "spark.sql.catalog.spark_catalog", 55 | "org.apache.iceberg.spark.SparkSessionCatalog", 56 | ), 57 | ("spark.sql.catalog.spark_catalog.type", "hive"), 58 | ("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog"), 59 | ("spark.sql.catalog.local.type", "hadoop"), 60 | ("spark.sql.catalog.local.warehouse", gcs_warehouse_dir), 61 | ] 62 | ) 63 | 64 | spark = SparkSession.builder.config(conf=config).getOrCreate() 65 | spark.sparkContext.setLogLevel("WARN") 66 | contract_config = json.loads(contract_config_str) 67 | logging.info("POLYEXPOSE - Reading table: " + table_name + "\n") 68 | df = spark.table(table_name) 69 | logging.info("POLYEXPOSE - Starting to validate contract \n ") 70 | df_ge = SparkDFDataset(df) 71 | logging.info("POLYEXPOSE - Schema required fields validation \n ") 72 | mandatory_columns = [] 73 | for field in contract_config["data_contract"]["table_schema"]: 74 | if bool(field["required"]): 75 | mandatory_columns.append(field["field_name"]) 76 | for column in mandatory_columns: 77 | try: 78 | assert df_ge.expect_column_to_exist( 79 | column 80 | ).success, f"POLYEXPOSE - Required {column} not found in table: FAILED" 81 | logging.info(f"POLYEXPOSE - {column}
exists - PASSED \n ") 82 | except AssertionError as e: 83 | logging.error(e) 84 | spark.stop() 85 | -------------------------------------------------------------------------------- /schemata/mvp/.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ project files 2 | .idea/ 3 | 4 | # build output 5 | target/ 6 | *.iml 7 | *.desc 8 | 9 | # Python 10 | __pycache__/ 11 | 12 | # 13 | tmp/ 14 | 15 | -------------------------------------------------------------------------------- /schemata/mvp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # 3 | all: build-all 4 | 5 | .PHONY: proto-gen 6 | proto-gen: 7 | protoc --proto_path=opencontract/v1/org --proto_path=schema --descriptor_set_out=model.desc --include_imports --include_source_info ./**/*.proto 8 | 9 | .PHONY: build-all 10 | build-all: proto-gen 11 | 12 | -------------------------------------------------------------------------------- /schemata/mvp/README.md: -------------------------------------------------------------------------------- 1 | Schemata - Minimum Viable Product (MVP) 2 | ======================================= 3 | 4 | # Table of Contents (ToC) 5 | * [References](#references) 6 | * [Initialization](#initialization) 7 | * [Java and Maven](#java-and-maven) 8 | + [Great Expectations](#great-expectations) 9 | * [Data contracts - Schema MVP](#data-contracts---schema-mvp) 10 | * [Additional utilities](#additional-utilities) 11 | * [Generate Schemata JAR artifact](#generate-schemata-jar-artifact) 12 | * [Copy the Schemata Protobuf schema](#copy-the-schemata-protobuf-schema) 13 | * [Schemata utilities to validate and document the data contracts](#schemata-utilities-to-validate-and-document-the-data-contracts) 14 | * [Great Expectations](#great-expectations-1) 15 | 16 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go) 17 | 18 | # References 19 | * [GitHub - Data Engineering helpers - Data contracts (this project)](https://github.com/data-engineering-helpers/data-contracts) 20 | * Source of data: 21 | [GitHub - OpenTravelData (OPTD)](https://github.com/opentraveldata/opentraveldata) 22 | + [GitHub - OPTD - `optd_airline_por.csv` Air routes data file](https://github.com/opentraveldata/opentraveldata/blob/master/opentraveldata/optd_airline_por.csv) 23 | 24 | ## Pre-requisites 25 | * [SDKMan](https://sdkman.io/) 26 | * [Great Expectations (GX)](https://github.com/great-expectations/great_expectations) 27 | + [GX - Support for data contracts](https://github.com/great-expectations/great_expectations#data-contracts-support) 28 | * [JQ](https://stedolan.github.io/jq/) 29 | 30 | # Initialization 31 | 32 | ## Java and Maven 33 | * If not already done so, install Java (JDK 17) and Maven.
The easiest 34 | is to use [SDKMan](https://sdkman.io/) 35 | + To install SDKMan (behind some corporate firewalls, one may need to set up 36 | a proxy with `http` and `https` Shell environment variables): 37 | ```bash 38 | $ curl -s "https://get.sdkman.io" | bash 39 | ``` 40 | + Proceed as requested by SDKMan (check that the SDKMan 41 | initialization is done properly within the Shell init scripts) 42 | 43 | * In order to check the currently installed versions of Java, Maven 44 | and so on, use `sdk list <tool>`, for instance: 45 | + Check the installed versions of Java: 46 | ```bash 47 | $ sdk list java 48 | ``` 49 | + Check the installed versions of Maven: 50 | ```bash 51 | $ sdk list maven 52 | ``` 53 | 54 | * Install the Amazon Corretto JDK 17 (or whichever other JDK you prefer, 55 | but it has to be some JDK with version 17): 56 | ```bash 57 | $ sdk install java 17.0.6-amzn 58 | ``` 59 | 60 | * Install Maven: 61 | ```bash 62 | $ sdk install maven 3.9.1 63 | ``` 64 | 65 | ## Great Expectations 66 | * If not already done so, install Great Expectations (GX): 67 | ```bash 68 | $ python -mpip install -U great_expectations 69 | ``` 70 | + Re-launch the Shell environment. For instance, with Bash: 71 | ```bash 72 | $ exec bash 73 | ``` 74 | 75 | ## Additional utilities 76 | * [JQ](https://stedolan.github.io/jq/) comes in handy to parse JSON structures. 77 | It is packaged on most systems (MacOS, Linux) 78 | 79 | ## Data contracts - Schema MVP 80 | * If not already done so, clone the 81 | [Data contracts Git repository](https://github.com/data-engineering-helpers/data-contracts) 82 | 83 | * Go into the 84 | [Data contracts Schemata MVP directory](https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/mvp): 85 | ```bash 86 | $ pushd ~/dev/infra/data-contracts/schemata/mvp 87 | ``` 88 | 89 | ## Generate Schemata JAR artifact 90 | As of April 2023, Schemata does not release its JAR artifact on public 91 | repositories such as Maven Central. Hence, one has to clone the Schemata 92 | Git repository and generate the JAR artifact locally.
93 | 94 | * If not already done so, clone the Schemata Git repository: 95 | ```bash 96 | $ mkdir -p ~/dev/infra && \ 97 | git clone git@github.com:ananthdurai/schemata.git ~/dev/infra/schemata 98 | ``` 99 | 100 | * Go into the Schemata directory: 101 | ```bash 102 | pushd ~/dev/infra/schemata 103 | ``` 104 | 105 | * Compile and package (generate the JAR artifact for) Schemata: 106 | ```bash 107 | $ make build-all 108 | ``` 109 | 110 | * Check that the JAR artifacts have been correctly generated: 111 | ```bash 112 | $ ls -lFh target/*.jar 113 | -rw-r--r-- 1 user staff 112K Apr 13 16:27 target/original-schemata-1.0.jar 114 | -rw-r--r-- 1 user staff 7.8M Apr 13 16:27 target/schemata-1.0-sources.jar 115 | -rw-r--r-- 1 user staff 20M Apr 13 16:27 target/schemata-1.0.jar 116 | ``` 117 | 118 | * Leave the Schemata directory: 119 | ```bash 120 | $ popd 121 | ``` 122 | 123 | * Go into the 124 | [Data contracts Schemata MVP directory](https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/mvp): 125 | ```bash 126 | $ pushd ~/dev/infra/data-contracts/schemata/mvp 127 | ``` 128 | 129 | * Copy the just-generated Schemata JAR artifact: 130 | ```bash 131 | $ cp ~/dev/infra/schemata/target/schemata-1.0.jar target/ 132 | ``` 133 | 134 | * Leave the Data contracts Schemata MVP directory: 135 | ```bash 136 | $ popd 137 | ``` 138 | 139 | ## Copy the Schemata Protobuf schema 140 | * As mentioned on the 141 | [Schemata README](https://github.com/ananthdurai/schemata#download-and-install-protobuf-open-contract-definitions), 142 | install/copy the Schemata Protobuf schema locally (change the `v1` version 143 | if needed): 144 | ```bash 145 | $ bash -c "$(curl -fsSL https://raw.githubusercontent.com/ananthdurai/schemata/main/install.sh v1)" 146 | ``` 147 | 148 | * The resulting Schemata Protobuf schema should be available from the 149 | local `opencontract/v1/org/` directory: 150 | ```bash 151 | ls -lFh opencontract/v1/org/schemata/protobuf 152 | total 64 153 | -rw-r--r-- 1 user staff 28K Apr 21 09:44 schemata.proto 154 | ``` 155 | 156 | # Schemata utilities to validate and document the data contracts 157 | * Go into the 158 | [Data contracts Schemata MVP directory](https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/mvp): 159 | ```bash 160 | $ pushd ~/dev/infra/data-contracts/schemata/mvp 161 | ``` 162 | 163 | * Generate the data contract descriptors (they are needed by the Shell scripts 164 | and need to be refreshed/re-generated whenever the data contracts change): 165 | ```bash 166 | $ make all 167 | ``` 168 | 169 | * Check that the data contract descriptors have been correctly generated: 170 | ```bash 171 | $ ls -lFh *.desc 172 | -rw-r--r-- 1 user staff 111K Apr 13 17:01 model.desc 173 | $ strings model.desc | head -3 174 | google/protobuf/descriptor.proto 175 | google.protobuf"M 176 | FileDescriptorSet 177 | ``` 178 | 179 | * Validate the model: 180 | ```bash 181 | $ ./validate.sh 182 | ... 183 | Schema validation success. No error to report 184 | ``` 185 | 186 | * Score the model: 187 | ```bash 188 | $ ./score.sh org.opentraveldata.Route 189 | ... 190 | 191 | Schemata score for org.examples.User : 0.19 192 | ``` 193 | 194 | * Document the model 195 | + Raw output: 196 | ```bash 197 | $ ./document.sh 198 | ``` 199 | + Parsed with JQ: 200 | ```bash 201 | $ ./document.sh |grep -v "^[0-9]\{2\}"|jq 202 | [ 203 | { 204 | "name": "org.examples.Category", 205 | "description": "This is the description of the Category table", 206 | ...
207 |   }
208 | ]
209 | ```
210 | 
211 | # Exploring the data
212 | * Launch the Python utility to parse the data set:
213 | ```bash
214 | $ pyspark < src/python/pyspark.py
215 | ...
216 | +------------+-------+-------+--------+
217 | |airline_code|apt_org|apt_dst|flt_freq|
218 | +------------+-------+-------+--------+
219 | |          0B|    AGP|    OTP|     108|
220 | ---
221 | |          0B|    BCM|    MUC|      66|
222 | +------------+-------+-------+--------+
223 | only showing top 20 rows
224 | ```
225 | 
226 | # Great Expectations
227 | 
228 | ## Setup
229 | Although the initial setup has already been done in this project, the steps
230 | to reproduce it are recalled below:
231 | 1. Initialize the GX project:
232 | ```bash
233 | $ great_expectations init
234 | ```
235 | 2. Add the CSV data sample as a data source for GX:
236 | ```bash
237 | $ great_expectations datasource new
238 | ```
239 |    + Name of the data source: `geonames_routes`
240 |    + The type of data source is the local filesystem, to be read with PySpark
241 |    + The base directory is `../../../data/optd`
242 |    + Execute all the steps/cells of the Jupyter notebook and exit it
243 | 3. Create an expectation suite:
244 | ```bash
245 | $ great_expectations suite new
246 | ```
247 |    + Name of the suite: `geonames_routes`
248 |    + Execute all the steps/cells of the Jupyter notebook and exit it
249 |    + Edit the newly generated
250 |      [JSON file](https://github.com/data-engineering-helpers/data-contracts/blob/main/schemata/mvp/great_expectations/expectations/geonames_routes.json)
251 |      (`vi great_expectations/expectations/geonames_routes.json`) and
252 |      add the following section, after the
253 |      `"datasource_name": "geonames_routes",` line and before the
254 |      `"limit": 1000` line:
255 | ```js
256 |   "batch_spec_passthrough": {
257 |     "reader_options": {
258 |       "delimiter": "^",
259 |       "header": true
260 |     }
261 |   },
262 | ```
263 | 
264 | ## Contribute to expectation suites
265 | * Documentation:
266 |   https://docs.greatexpectations.io/docs/guides/validation/checkpoints/how_to_add_validations_data_or_suites_to_a_checkpoint
267 | 
268 | * In order to edit the expectation suite (this opens a Jupyter notebook):
269 | ```bash
270 | $ great_expectations suite edit geonames_routes
271 | ```
272 | 
273 | * Execute the expectation suite through its checkpoint:
274 | ```bash
275 | $ great_expectations checkpoint run geonames_routes
276 | ...
277 | Suite Name Status Expectations met 278 | - geonames_routes ✔ Passed 0 of 0 (100 %) 279 | ``` 280 | 281 | -------------------------------------------------------------------------------- /schemata/mvp/document.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | java -jar target/schemata-1.0.jar document --source=model.desc -p=PROTOBUF 4 | 5 | -------------------------------------------------------------------------------- /schemata/mvp/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /schemata/mvp/great_expectations/checkpoints/geonames_routes.yml: -------------------------------------------------------------------------------- 1 | name: geonames_routes 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: {} 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: geonames_routes 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: optd_airline_por.csv 27 | data_connector_query: 28 | index: -1 29 | expectation_suite_name: geonames_routes 30 | profilers: [] 31 | ge_cloud_id: 32 | expectation_suite_ge_cloud_id: 33 | -------------------------------------------------------------------------------- /schemata/mvp/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 7385a960-805b-4e5d-b8af-5f8a4ba41a54 2 | -------------------------------------------------------------------------------- /schemata/mvp/great_expectations/expectations/geonames_routes.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "geonames_routes", 4 | "expectations": [], 5 | "ge_cloud_id": null, 6 | "meta": { 7 | "citations": [ 8 | { 9 | "batch_request": { 10 | "data_asset_name": "optd_airline_por.csv", 11 | "data_connector_name": "default_inferred_data_connector_name", 12 | "datasource_name": "geonames_routes", 13 | "batch_spec_passthrough": { 14 | "reader_options": { 15 | "delimiter": "^", 16 | "header": true 17 | } 18 | }, 19 | "limit": 1000 20 | }, 21 | "citation_date": "2023-04-21T15:02:31.998735Z", 22 | "comment": "Created suite added via CLI" 23 | } 24 | ], 25 | "great_expectations_version": "0.16.8" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /schemata/mvp/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. 
For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview/#2-configure-your-datasource 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. Read more at https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview 16 | datasources: 17 | geonames_routes: 18 | data_connectors: 19 | default_inferred_data_connector_name: 20 | base_directory: ../../../data/optd 21 | module_name: great_expectations.datasource.data_connector 22 | default_regex: 23 | group_names: 24 | - data_asset_name 25 | pattern: (.*) 26 | class_name: InferredAssetFilesystemDataConnector 27 | default_runtime_data_connector_name: 28 | assets: 29 | my_runtime_asset_name: 30 | batch_spec_passthrough: 31 | reader_options: 32 | delimiter: ^ 33 | header: true 34 | module_name: great_expectations.datasource.data_connector.asset 35 | class_name: Asset 36 | batch_identifiers: 37 | - runtime_batch_identifier_name 38 | module_name: great_expectations.datasource.data_connector 39 | class_name: RuntimeDataConnector 40 | module_name: great_expectations.datasource 41 | class_name: Datasource 42 | execution_engine: 43 | module_name: great_expectations.execution_engine 44 | class_name: SparkDFExecutionEngine 45 | 46 | # This config file supports variable substitution which enables: 1) keeping 47 | # secrets out of source control & 2) environment-based configuration changes 48 | # such as staging vs prod. 49 | # 50 | # When GX encounters substitution syntax (like `my_key: ${my_value}` or 51 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 52 | # to replace the value of `my_key` with the value from an environment 53 | # variable `my_value` or a corresponding key read from this config file, 54 | # which is defined through the `config_variables_file_path`. 55 | # Environment variables take precedence over variables defined here. 56 | # 57 | # Substitution values defined here can be a simple (non-nested) value, 58 | # nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) 59 | # 60 | # 61 | # https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials 62 | 63 | 64 | config_variables_file_path: uncommitted/config_variables.yml 65 | 66 | # The plugins_directory will be added to your python path for custom modules 67 | # used to override and extend Great Expectations. 68 | plugins_directory: plugins/ 69 | 70 | stores: 71 | # Stores are configurable places to store things like Expectations, Validations 72 | # Data Docs, and more. These are for advanced users only - most users can simply 73 | # leave this section alone. 74 | # 75 | # Three stores are required: expectations, validations, and 76 | # evaluation_parameters, and must exist with a valid store entry. Additional 77 | # stores can be configured for uses such as data_docs, etc. 
78 | expectations_store: 79 | class_name: ExpectationsStore 80 | store_backend: 81 | class_name: TupleFilesystemStoreBackend 82 | base_directory: expectations/ 83 | 84 | validations_store: 85 | class_name: ValidationsStore 86 | store_backend: 87 | class_name: TupleFilesystemStoreBackend 88 | base_directory: uncommitted/validations/ 89 | 90 | evaluation_parameter_store: 91 | class_name: EvaluationParameterStore 92 | checkpoint_store: 93 | class_name: CheckpointStore 94 | store_backend: 95 | class_name: TupleFilesystemStoreBackend 96 | suppress_store_backend_id: true 97 | base_directory: checkpoints/ 98 | 99 | profiler_store: 100 | class_name: ProfilerStore 101 | store_backend: 102 | class_name: TupleFilesystemStoreBackend 103 | suppress_store_backend_id: true 104 | base_directory: profilers/ 105 | 106 | expectations_store_name: expectations_store 107 | validations_store_name: validations_store 108 | evaluation_parameter_store_name: evaluation_parameter_store 109 | checkpoint_store_name: checkpoint_store 110 | 111 | data_docs_sites: 112 | # Data Docs make it simple to visualize data quality in your project. These 113 | # include Expectations, Validations & Profiles. The are built for all 114 | # Datasources from JSON artifacts in the local repo including validations & 115 | # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/docs/terms/data_docs 116 | local_site: 117 | class_name: SiteBuilder 118 | show_how_to_buttons: true 119 | store_backend: 120 | class_name: TupleFilesystemStoreBackend 121 | base_directory: uncommitted/data_docs/local_site/ 122 | site_index_builder: 123 | class_name: DefaultSiteIndexBuilder 124 | 125 | anonymous_usage_statistics: 126 | enabled: true 127 | data_context_id: 7385a960-805b-4e5d-b8af-5f8a4ba41a54 128 | include_rendered_content: 129 | expectation_suite: false 130 | expectation_validation_result: false 131 | globally: false 132 | notebooks: 133 | -------------------------------------------------------------------------------- /schemata/mvp/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /schemata/mvp/opencontract/v1/org/schemata/protobuf/schemata.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package org.schemata.schema; 4 | 5 | import "google/protobuf/descriptor.proto"; 6 | 7 | option java_package = "org.schemata.schema"; 8 | option java_outer_classname = "SchemataBuilder"; 9 | 10 | // MessageType captures the type of the stream. There are two types of stream. 
11 | enum SchemaType { 12 | // This is an invalid state. If the entity defined as unknown the validator should throw an exception. 13 | UNKNOWN = 0; 14 | //Entity streams can be mutated in the downstream services. Entity streams often used to represent the current 15 | //state of the entity. In the classical DW concepts Entities typically represents the dimensions. 16 | // Entity must have a primary key field. 17 | ENTITY = 1; 18 | // Event streams are typically immutable in nature. Event streams often used to represent the state change of an Entity. 19 | // In the classical DW concepts Event streams represents the facts. Event streams will not have a primary key field. 20 | EVENT = 2; 21 | } 22 | 23 | enum EventType { 24 | // Invalid Event Type. If the message type is event, set it either LIFECYCLE or ACTIVITY or AGGREGATED 25 | NONE = 0; 26 | // LIFECYCLE event captures the state changes of an Entity. (e.g) User created, User deleted et al. 27 | LIFECYCLE = 1; 28 | // ACTIVITY event captures the events that resulted from one Entity changing the state of another Entity. 29 | // (e.g.) User A purchases Product B. The ACTIVITY event is often the result of a business transaction. 30 | ACTIVITY = 2; 31 | // AGGREGATED event captures the computed metrics over a specified window of time. (e.g) Number of views by a User for a Product. 32 | AGGREGATED = 3; 33 | } 34 | 35 | enum ActivityType { 36 | CREATED = 0; 37 | DELETED = 1; 38 | UPDATED = 2; 39 | } 40 | 41 | enum TimeUnit { 42 | SECONDS = 0; 43 | MINUTES = 1; 44 | HOURS = 2; 45 | } 46 | 47 | // CoreMetadata is the set of attribute apply to both the Message & Field 48 | message CoreMetadata { 49 | // Mandatory Metadata: description of the entity 50 | optional string description = 50001; 51 | // Optional Metadata: additional comments about the entity 52 | optional string comment = 50002; 53 | // Optional Metadata: Any related entity that has "hierarchy" or "has a" relationships. 54 | optional string see_also = 50003; 55 | // Optional Metadata: Additional link reference for further reading. 56 | // It could be a confluent page, An ADR or RFC or a Slack message link. 57 | optional string reference = 50004; 58 | } 59 | 60 | 61 | extend google.protobuf.MessageOptions { 62 | 63 | // message.description is a Mandatory Metadata 64 | CoreMetadata message_core = 60001; 65 | // Mandatory Metadata: owner of the entity. Usually it is the team name. 66 | string owner = 60002; 67 | // Mandatory Metadata: domain = 'core' indicates the entity is common across all the domains. 68 | // Other possible domains are `sales`, `marketing`, `product` etc 69 | string domain = 60003; 70 | // Mandatory Metadata: define the type of the message. 71 | SchemaType schema_type = 60004; 72 | // Status of the entity. You can have `testing`, `production` or `staging` depends on the lifecycle of schema definition. 73 | string status = 60005; 74 | // Slack or Teams channel name to communicate with the team which owns ths entity 75 | string team_channel = 60006; 76 | // Slack or Teams channel name to alert for any validation errors. 77 | string alert_channel = 60007; 78 | // Type of the event. Set if the Type = 'EVENT' 79 | EventType event_type = 60008; 80 | // Compliance Owner approves which team should have access to this Schema. 81 | string compliance_owner = 60009; 82 | // Compliance Owner Slack/ Teams Channel name 83 | string compliance_channel = 600010; 84 | } 85 | 86 | enum ClassificationType { 87 | //Identifiers used by developers to track users for advertising purposes. 
These include Google Play Advertising IDs, 88 | // Amazon Advertising IDs, Apple's identifierForAdvertising (IDFA), and Apple's identifierForVendor (IDFV). 89 | ADVERTISING_ID = 0; 90 | // An age measured in months or years. 91 | AGE = 1; 92 | // A credit card number is 12 to 19 digits long. They are used for payment transactions globally. 93 | CREDIT_CARD_NUMBER = 2; 94 | 95 | // A credit card track number is a variable length alphanumeric string. It is used to store key cardholder information. 96 | CREDIT_CARD_TRACK_NUMBER = 3; 97 | 98 | // A date. This infoType includes most date formats, including the names of common world holidays. 99 | DATE = 4; 100 | 101 | 102 | // A date that is identified by context as a date of birth. Note: Not recommended for use during latency sensitive operations. 103 | DATE_OF_BIRTH = 5; 104 | 105 | // A domain name as defined by the DNS standard. 106 | DOMAIN_NAME = 6; 107 | 108 | // An email address identifies the mailbox that emails are sent to or from. The maximum length of the domain name is 255 characters, and the maximum length of the local-part is 64 characters. 109 | EMAIL_ADDRESS = 7; 110 | 111 | // A person’s ethnic group. 112 | ETHNIC_GROUP = 8; 113 | 114 | // A common female name. Note: Not recommended for use during latency sensitive operations. 115 | FEMALE_NAME = 9; 116 | 117 | // A first name is defined as the first part of a PERSON_NAME. Note: Not recommended for use during latency sensitive operations. 118 | FIRST_NAME = 10; 119 | 120 | // A person’s gender identity. 121 | GENDER = 11; 122 | 123 | // Alphanumeric and special character strings that may be personally identifying but do not belong to a well-defined category, such as user IDs or medical record numbers. 124 | GENERIC_ID = 12; 125 | 126 | // An International Bank Account Number (IBAN) is an internationally agreed-upon method for identifying bank accounts defined by the International Standard of Organization (ISO) 13616:2007 standard. The European Committee for Banking Standards (ECBS) created ISO 13616:2007. An IBAN consists of up to 34 alphanumeric characters, including elements such as a country code or account number. 127 | IBAN_CODE = 13; 128 | 129 | // An HTTP cookie is a standard way of storing data on a per website basis. This detector will find headers containing these cookies. 130 | HTTP_COOKIE = 14; 131 | 132 | // An Integrated Circuit Card Identifier (ICCID) is used to uniquely identify each SIM card. It includes information, such as the country the card is active in and the mobile network code. 133 | ICCID_NUMBER = 15; 134 | 135 | // The International Classification of Diseases, Ninth Revision, Clinical Modification (ICD-9-CM) lexicon is used to assign diagnostic and procedure codes associated with inpatient, outpatient, and physician office use in the United States. The US National Center for Health Statistics (NCHS) created the ICD-9-CM lexicon. It is based on the ICD-9 lexicon, but provides for more morbidity detail. The ICD-9-CM lexicon is updated annually on October 1. 136 | ICD9_CODE = 16; 137 | 138 | // Like ICD-9-CM codes, the International Classification of Diseases, Tenth Revision, Clinical Modification (ICD-10-CM) lexicon is a series of diagnostic codes. The World Health Organization (WHO) publishes the ICD-10-CM lexicon to describe causes of morbidity and mortality. 139 | ICD10_CODE = 17; 140 | 141 | // An International Mobile Equipment Identity (IMEI) hardware identifier, used to identify mobile phones. 
142 | IMEI_HARDWARE_ID = 18; 143 | 144 | // An International Mobile Subscriber Identity (IMEI) identifier, used to identify users on a mobile network. 145 | IMSI_ID = 19; 146 | 147 | // An Internet Protocol (IP) address (either IPv4 or IPv6). 148 | IP_ADDRESS = 20; 149 | 150 | // A last name is defined as the last part of a PERSON_NAME. 151 | LAST_NAME = 21; 152 | 153 | 154 | // A physical address or location. 155 | LOCATION = 22; 156 | 157 | // A media access control address (MAC address), which is an identifier for a network adapter. 158 | MAC_ADDRESS = 23; 159 | // A local media access control address (MAC address), which is an identifier for a network adapter. 160 | MAC_ADDRESS_LOCAL = 24; 161 | //A common male name. 162 | MALE_NAME = 25; 163 | 164 | // Terms that commonly refer to a person's medical condition or health. 165 | MEDICAL_TERM = 26; 166 | 167 | // A name of a chain store, business or organization. 168 | ORGANIZATION_NAME = 27; 169 | 170 | 171 | // A passport number that matches passport numbers for the following countries: Australia, Canada, China, France, Germany, Japan, Korea, Mexico, The Netherlands, Poland, Singapore, Spain, Sweden, Taiwan, United Kingdom, and the United States. 172 | PASSPORT = 28; 173 | 174 | // A full person name, which can include first names, middle names or initials, and last names. Note: Not recommended for use during latency sensitive operations. 175 | PERSON_NAME = 29; 176 | 177 | // A telephone number. 178 | PHONE_NUMBER = 30; 179 | 180 | // A street address. Note: Not recommended for use during latency sensitive operations. 181 | STREET_ADDRESS = 31; 182 | 183 | // A SWIFT code is the same as a Bank Identifier Code (BIC). It's a unique identification code for a particular bank. These codes are used when transferring money between banks, particularly for international wire transfers. Banks also use the codes for exchanging other messages. 184 | SWIFT_CODE = 32; 185 | // A timestamp of a specific time of day. 186 | TIME = 33; 187 | 188 | // A Uniform Resource Locator (URL). 189 | URL = 34; 190 | 191 | // A vehicle identification number (VIN) is a unique 17-digit code assigned to every on-road motor vehicle. 192 | VEHICLE_IDENTIFICATION_NUMBER = 35; 193 | 194 | // The infoType detectors in this section detect credentials and other secret data. 195 | Credentials_And_Secrets = 36; 196 | 197 | 198 | // An authentication token is a machine-readable way of determining whether a particular request has been authorized for a user. This detector currently identifies tokens that comply with OAuth or Bearer authentication. 199 | AUTH_TOKEN = 37; 200 | 201 | // Amazon Web Services account access keys. 202 | AWS_CREDENTIALS = 38; 203 | 204 | // Microsoft Azure certificate credentials for application authentication. 205 | AZURE_AUTH_TOKEN = 39; 206 | 207 | // A basic authentication header is an HTTP header used to identify a user to a server. It is part of the HTTP specification in RFC 1945, section 11. 208 | BASIC_AUTH_HEADER = 40; 209 | 210 | // An encryption key within configuration, code, or log text. 211 | ENCRYPTION_KEY = 41; 212 | 213 | // Google Cloud API key. An encrypted string that is used when calling Google Cloud APIs that don't need to access private user data. 214 | GCP_API_KEY = 42; 215 | 216 | // Google Cloud service account credentials. Credentials that can be used to authenticate with Google API client libraries and service accounts. 217 | GCP_CREDENTIALS = 43; 218 | 219 | // JSON Web Token. JSON Web Token in compact form. 
Represents a set of claims as a JSON object that is digitally signed using JSON Web Signature. 220 | JSON_WEB_TOKEN = 44; 221 | 222 | // Clear text passwords in configs, code, and other text. 223 | PASSWORD = 45; 224 | 225 | // A weakly hashed password is a method of storing a password that is easy to reverse engineer. The presence of such hashes often indicate that a system's security can be improved. 226 | WEAK_PASSWORD_HASH = 46; 227 | 228 | // An XSRF token is an HTTP header that is commonly used to prevent cross-site scripting attacks. Cross-site scripting is a type of security vulnerability that can be exploited by malicious sites. 229 | XSRF_TOKEN = 47; 230 | 231 | 232 | // An Argentine Documento Nacional de Identidad (DNI), or national identity card, is used as the main identity document for citizens. 233 | ARGENTINA_DNI_NUMBER = 100; 234 | 235 | 236 | // An Australian driver's license number. 237 | AUSTRALIA_DRIVERS_LICENSE_NUMBER = 200; 238 | 239 | // A 9-digit Australian Medicare account number is issued to permanent residents of Australia (except for Norfolk island). The primary purpose of this number is to prove Medicare eligibility to receive subsidized care in Australia. 240 | AUSTRALIA_MEDICARE_NUMBER = 201; 241 | 242 | // An Australian passport number. 243 | AUSTRALIA_PASSPORT = 202; 244 | 245 | // An Australian tax file number (TFN) is a number issued by the Australian Tax Office for taxpayer identification. Every taxpaying entity, such as an individual or an organization, is assigned a unique number. 246 | AUSTRALIA_TAX_FILE_NUMBER = 203; 247 | 248 | 249 | // A 12-digit Belgian national identity card number. 250 | BELGIUM_NATIONAL_ID_CARD_NUMBER = 300; 251 | 252 | 253 | // The Brazilian Cadastro de Pessoas Físicas (CPF) number, or Natural Persons Register number, is an 11-digit number used in Brazil for taxpayer identification. 254 | BRAZIL_CPF_NUMBER = 400; 255 | 256 | 257 | // A Canadian bank account number. 258 | CANADA_BANK_ACCOUNT = 500; 259 | 260 | // The British Columbia Personal Health Number (PHN) is issued to citizens, permanent residents, temporary workers, students, and other individuals who are entitled to health care coverage in the Province of British Columbia. 261 | CANADA_BC_PHN = 501; 262 | 263 | // A driver's license number for each of the ten provinces in Canada (the three territories are currently not covered). 264 | CANADA_DRIVERS_LICENSE_NUMBER = 502; 265 | 266 | // The Ontario Health Insurance Plan (OHIP) number is issued to citizens, permanent residents, temporary workers, students, and other individuals who are entitled to health care coverage in the Province of Ontario. 267 | CANADA_OHIP = 503; 268 | 269 | // A Canadian passport number. 270 | CANADA_PASSPORT = 504; 271 | 272 | // The Québec Health Insurance Number (also known as the RAMQ number) is issued to citizens, permanent residents, temporary workers, students, and other individuals who are entitled to health care coverage in the Province of Québec. 273 | CANADA_QUEBEC_HIN = 505; 274 | 275 | // The Canadian Social Insurance Number (SIN) is the main identifier used in Canada for citizens, permanent residents, and people on work or study visas. With a Canadian SIN and mailing address, one can apply for health care coverage, driver's licenses, and other important services. 276 | CANADA_SOCIAL_INSURANCE_NUMBER = 506; 277 | 278 | 279 | // A Chilean Cédula de Identidad (CDI), or identity card, is used as the main identity document for citizens. 
280 | CHILE_CDI_NUMBER = 600; 281 | 282 | 283 | // A Chinese resident identification number. 284 | CHINA_RESIDENT_ID_NUMBER = 700; 285 | 286 | // A Chinese passport number. 287 | CHINA_PASSPORT = 701; 288 | 289 | // A Colombian Cédula de Ciudadanía (CDC), or citizenship card, is used as the main identity document for citizens. 290 | COLOMBIA_CDC_NUMBER = 800; 291 | 292 | 293 | // A Personal Identification Number (CPR, Det Centrale Personregister) is a national ID number in Denmark. It is used with public agencies such as health care and tax authorities. Banks and insurance companies also use it as a customer number. The CPR number is required for people who reside in Denmark, pay tax or own property there. 294 | DENMARK_CPR_NUMBER = 900; 295 | 296 | 297 | // The French Carte Nationale d'Identité Sécurisée (CNI or CNIS) is the French national identity card. It's an official identity document consisting of a 12-digit identification number. This number is commonly used when opening bank accounts and when paying by check. It can sometimes be used instead of a passport or visa within the European Union (EU) and in some other countries. 298 | FRANCE_CNI = 1000; 299 | 300 | // The French Numéro d'Inscription au Répertoire (NIR) is a permanent personal identification number that's also known as the French social security number for services including healthcare and pensions. 301 | FRANCE_NIR = 1001; 302 | 303 | // A French passport number. 304 | FRANCE_PASSPORT = 1002; 305 | 306 | // The French tax identification number is a government-issued ID for all individuals paying taxes in France. 307 | FRANCE_TAX_IDENTIFICATION_NUMBER = 1003; 308 | 309 | 310 | // A Finnish personal identity code, a national government identification number for Finnish citizens used on identity cards, driver's licenses and passports. 311 | FINLAND_NATIONAL_ID_NUMBER = 1100; 312 | 313 | 314 | // A German driver's license number. 315 | GERMANY_DRIVERS_LICENSE_NUMBER = 1200; 316 | 317 | // The German Personalausweis, or identity card, is used as the main identity document for citizens of Germany. 318 | GERMANY_IDENTITY_CARD_NUMBER = 1201; 319 | 320 | // A German passport number. The format of a German passport number is 10 alphanumeric characters, chosen from numerals 0–9 and letters C, F, G, H, J, K, L, M, N, P, R, T, V, W, X, Y, Z. 321 | GERMANY_PASSPORT = 1202; 322 | 323 | // An 11-digit German taxpayer identification number assigned to both natural-born and other legal residents of Germany for the purposes of recording tax payments. 324 | GERMANY_TAXPAYER_IDENTIFICATION_NUMBER = 1203; 325 | 326 | // A German Schufa identification number. Schufa Holding AG is a German credit bureau whose aim is to protect clients from credit risk. 327 | GERMANY_SCHUFA_ID = 1204; 328 | 329 | 330 | // The 香港身份證, or Hong Kong identity card (HKIC), is used as the main identity document for citizens of Hong Kong. 331 | HONG_KONG_ID_NUMBER = 1300; 332 | 333 | 334 | // The Indian Aadhaar number is a 12-digit unique identity number obtained by residents of India, based on their biometric and demographic data. 335 | INDIA_AADHAAR_INDIVIDUAL = 1400; 336 | 337 | // The Indian GST identification number (GSTIN) is a unique identifier required of every business in India for taxation. 338 | INDIA_GST_INDIVIDUAL = 1401; 339 | 340 | // The Indian Personal Permanent Account Number (PAN) is a unique 10-digit alphanumeric identifier used for identification of individuals—particularly people who pay income tax. It's issued by the Indian Income Tax Department. 
The PAN is valid for the lifetime of the holder. 341 | INDIA_PAN_INDIVIDUAL = 1402; 342 | 343 | 344 | // An Indonesian Single Identity Number (Nomor Induk Kependudukan, or NIK) is the national identification number of Indonesia. The NIK is used as the basis for issuing Indonesian resident identity cards (Kartu Tanda Penduduk, or KTP), passports, driver's licenses and other identity documents. 345 | INDONESIA_NIK_NUMBER = 1500; 346 | 347 | 348 | // An Irish driving license number. 349 | IRELAND_DRIVING_LICENSE_NUMBER = 1600; 350 | 351 | // Eircode is an Irish postal code that uniquely identifies an address. 352 | IRELAND_EIRCODE = 1601; 353 | 354 | // An Irish (IE) passport number. 355 | IRELAND_PASSPORT = 1602; 356 | 357 | // The Irish Personal Public Service Number (PPS number, or PPSN) is a unique number for accessing social welfare benefits, public services, and information in Ireland. 358 | IRELAND_PPSN = 1603; 359 | 360 | 361 | // The Israel identity card number is issued to all Israeli citizens at birth by the Ministry of the Interior. Temporary residents are assigned a number when they receive temporary resident status. 362 | ISRAEL_IDENTITY_CARD_NUMBER = 1700; 363 | 364 | 365 | // An Italy fiscal code number is a unique 16-digit code assigned to Italian citizens as a form of identification. 366 | ITALY_FISCAL_CODE = 1800; 367 | 368 | 369 | // A Japanese bank account number. 370 | JAPAN_BANK_ACCOUNT = 1900; 371 | 372 | // A Japanese driver's license number. 373 | JAPAN_DRIVERS_LICENSE_NUMBER = 1901; 374 | 375 | // The Japanese national identification number—sometimes referred to as "My Number"—is a new national ID number as of January 2016. 376 | JAPAN_INDIVIDUAL_NUMBER = 1902; 377 | 378 | // A Japanese passport number. The passport number consists of two alphabetic characters followed by seven digits. 379 | JAPAN_PASSPORT = 1903; 380 | 381 | 382 | // A Korean passport number. 383 | KOREA_PASSPORT = 2000; 384 | 385 | // A South Korean Social Security number. 386 | KOREA_RRN = 2001; 387 | 388 | 389 | // The Mexico Clave Única de Registro de Población (CURP) number, or Unique Population Registry Code or Personal Identification Code number. The CURP number is an 18-character state-issued identification number assigned by the Mexican government to citizens or residents of Mexico and used for taxpayer identification. 390 | MEXICO_CURP_NUMBER = 2100; 391 | 392 | // A Mexican passport number. 393 | MEXICO_PASSPORT = 2101; 394 | 395 | 396 | // A Dutch Burgerservicenummer (BSN), or Citizen's Service Number, is a state-issued identification number that's on driver's licenses, passports, and international ID cards. 397 | NETHERLANDS_BSN_NUMBER = 2200; 398 | 399 | // A Dutch passport number. 400 | NETHERLANDS_PASSPORT = 2201; 401 | 402 | 403 | // Norway‘s Fødselsnummer, National Identification Number, or Birth Number is assigned at birth, or on migration into the country. It is registered with the Norwegian Tax Office. 404 | NORWAY_NI_NUMBER = 2300; 405 | 406 | 407 | // A Paraguayan Cédula de Identidad Civil (CIC), or civil identity card, is used as the main identity document for citizens. 408 | PARAGUAY_CIC_NUMBER = 2400; 409 | 410 | 411 | // A Peruvian Documento Nacional de Identidad (DNI), or national identity card, is used as the main identity document for citizens. 412 | PERU_DNI_NUMBER = 2500; 413 | 414 | 415 | // The PESEL number is the national identification number used in Poland. 
It is mandatory for all permanent residents of Poland, and for temporary residents staying there longer than 2 months. It is assigned to just one person and cannot be changed. 416 | POLAND_PESEL_NUMBER = 2600; 417 | 418 | // The Polish identity card number. is a government identification number for Polish citizens. Every citizen older than 18 years must have an identity card. The local Office of Civic Affairs issues the card, and each card has its own unique number. 419 | POLAND_NATIONAL_ID_NUMBER = 2601; 420 | 421 | // A Polish passport number. Polish passport is an international travel document for Polish citizens. It can also be used as a proof of Polish citizenship. 422 | POLAND_PASSPORT = 2602; 423 | 424 | 425 | // A Portuguese Cartão de cidadão (CDC), or Citizen Card, is used as the main identity, Social Security, health services, taxpayer, and voter document for citizens. 426 | PORTUGAL_CDC_NUMBER = 2700; 427 | 428 | 429 | // A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card. 430 | SINGAPORE_NATIONAL_REGISTRATION_ID_NUMBER = 2800; 431 | 432 | // A Singaporean passport number. 433 | SINGAPORE_PASSPORT = 2801; 434 | 435 | 436 | // A South Africa ID number. 437 | SOUTH_AFRICA_ID_NUMBER = 2900; 438 | 439 | 440 | // The Spanish Código de Identificación Fiscal (CIF) was the tax identification system used in Spain for legal entities until 2008. It was then replaced by the Número de Identificación Fiscal (NIF) for natural and juridical persons. 441 | SPAIN_CIF_NUMBER = 3000; 442 | 443 | // A Spain national identity number. 444 | SPAIN_DNI_NUMBER = 3001; 445 | 446 | // A Spanish driver's license number. 447 | SPAIN_DRIVERS_LICENSE_NUMBER = 3002; 448 | 449 | // The Spanish Número de Identificación de Extranjeros (NIE) is an identification number for foreigners living or doing business in Spain. An NIE number is needed for key transactions such as opening a bank account, buying a car, or setting up a mobile phone contract. 450 | SPAIN_NIE_NUMBER = 3003; 451 | 452 | // The Spanish Número de Identificación Fiscal (NIF) is a government identification number for Spanish citizens. An NIF number is needed for key transactions such as opening a bank account, buying a car, or setting up a mobile phone contract. 453 | SPAIN_NIF_NUMBER = 3004; 454 | 455 | // A Spanish Ordinary Passport (Pasaporte Ordinario) number. There are 4 different types of passports in Spain. This detector is for the Ordinary Passport (Pasaporte Ordinario) type, which is issued for ordinary travel, such as vacations and business trips. 456 | SPAIN_PASSPORT = 3005; 457 | 458 | // The Spanish Social Security number (Número de Afiliación a la Seguridad Social) is a 10-digit sequence that identifies a person in Spain for all interactions with the country's Social Security system. 459 | SPAIN_SOCIAL_SECURITY_NUMBER = 3006; 460 | 461 | 462 | // A Swedish Personal Identity Number (personnummer), a national government identification number for Swedish citizens. 463 | SWEDEN_NATIONAL_ID_NUMBER = 3100; 464 | 465 | // A Swedish passport number. 466 | SWEDEN_PASSPORT = 3101; 467 | 468 | 469 | // A Taiwanese passport number. 470 | TAIWAN_PASSPORT = 3200; 471 | 472 | 473 | // The Thai บัตรประจำตัวประชาชนไทย, or identity card, is used as the main identity document for Thai nationals. 474 | THAILAND_NATIONAL_ID_NUMBER = 3300; 475 | 476 | 477 | // A unique Turkish personal identification number, assigned to every citizen of Turkey. 
478 | TURKEY_ID_NUMBER = 3400; 479 | 480 | 481 | // The Scotland Community Health Index Number (CHI number) is a 10-digit sequence used to uniquely identify a patient within National Health Service Scotland (NHS Scotland). 482 | SCOTLAND_COMMUNITY_HEALTH_INDEX_NUMBER = 3500; 483 | 484 | // A driver's license number for the United Kingdom of Great Britain and Northern Ireland (UK). 485 | UK_DRIVERS_LICENSE_NUMBER = 3501; 486 | 487 | // A National Health Service (NHS) number is the unique number allocated to a registered user of the three public health services in England, Wales, and the Isle of Man. 488 | UK_NATIONAL_HEALTH_SERVICE_NUMBER = 3502; 489 | 490 | // The National Insurance number (NINO) is a number used in the United Kingdom (UK) in the administration of the National Insurance or social security system. It identifies people, and is also used for some purposes in the UK tax system. The number is sometimes referred to as NI No or NINO. 491 | UK_NATIONAL_INSURANCE_NUMBER = 3503; 492 | 493 | // A United Kingdom (UK) passport number. 494 | UK_PASSPORT = 3504; 495 | 496 | // A United Kingdom (UK) Unique Taxpayer Reference (UTR) number. This number, comprised of a string of 10 decimal digits, is an identifier used by the UK government to manage the taxation system. Unlike other identifiers, such as the passport number or social insurance number, the UTR is not listed on official identity cards. 497 | UK_TAXPAYER_REFERENCE = 3505; 498 | 499 | 500 | // An American Bankers' Committee on Uniform Security Identification Procedures (CUSIP) number is a 9-character alphanumeric code that identifies a North American financial security. 501 | AMERICAN_BANKERS_CUSIP_ID = 3600; 502 | 503 | // Drug product name or active ingredient registered by the United States Food and Drug Administration (FDA). 504 | FDA_CODE = 3601; 505 | 506 | // A United States Adoption Taxpayer Identification Number (ATIN) is a type of United States Tax Identification Number (TIN). An ATIN is issued by the Internal Revenue Service (IRS) to individuals who are in the process of legally adopting a US citizen or resident child. 507 | US_ADOPTION_TAXPAYER_IDENTIFICATION_NUMBER = 3602; 508 | 509 | // The American Bankers Association (ABA) Routing Number (also called the transit number) is a nine-digit code. It's used to identify the financial institution that's responsible to credit or entitled to receive credit for a check or electronic transaction. 510 | US_BANK_ROUTING_MICR = 3603; 511 | 512 | // A US Drug Enforcement Administration (DEA) number is assigned to a health care provider by the US DEA. It allows the health care provider to write prescriptions for controlled substances. The DEA number is often used as a general "prescriber number" that is a unique identifier for anyone who can prescribe medication. 513 | US_DEA_NUMBER = 3604; 514 | 515 | // A driver's license number for the United States. Format can vary depending on the issuing state. 516 | US_DRIVERS_LICENSE_NUMBER = 3605; 517 | 518 | // A United States Employer Identification Number (EIN) is also known as a Federal Tax Identification Number, and is used to identify a business entity. 519 | US_EMPLOYER_IDENTIFICATION_NUMBER = 3606; 520 | 521 | // The US National Provider Identifier (NPI) is a unique 10-digit identification number issued to health care providers in the United States by the Centers for Medicare and Medicaid Services (CMS). The NPI has replaced the unique provider identification number (UPIN) as the required identifier for Medicare services. 
It's also used by other payers, including commercial healthcare insurers. 522 | US_HEALTHCARE_NPI = 3607; 523 | 524 | // A United States Individual Taxpayer Identification Number (ITIN) is a type of Tax Identification Number (TIN), issued by the Internal Revenue Service (IRS). An ITIN is a tax processing number only available for certain nonresident and resident aliens, their spouses, and dependents who cannot get a Social Security Number (SSN). 525 | US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER = 3608; 526 | 527 | // A United States passport number. 528 | US_PASSPORT = 3609; 529 | 530 | // A United States Preparer Taxpayer Identification Number (PTIN) is an identification number that all paid tax return preparers must use on US federal tax returns or claims for refund submitted to the US Internal Revenue Service (IRS). 531 | US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER = 3610; 532 | 533 | // A United States Social Security number (SSN) is a 9-digit number issued to US citizens, permanent residents, and temporary residents. This detector will not match against numbers with all zeroes in any digit group (that is, 000-##-####, ###-00-####, or ###-##-0000), against numbers with 666 in the first digit group, or against numbers whose first digit is 9. 534 | US_SOCIAL_SECURITY_NUMBER = 3611; 535 | 536 | // A United States state name. 537 | US_STATE = 3612; 538 | 539 | // A US toll-free telephone number. 540 | US_TOLLFREE_PHONE_NUMBER = 3613; 541 | 542 | // A vehicle identification number (VIN) is a unique 17-digit code assigned to every on-road motor vehicle in North America. 543 | US_VEHICLE_IDENTIFICATION_NUMBER = 3614; 544 | 545 | 546 | // A Uruguayan Cédula de Identidad (CDI), or identity card, is used as the main identity document for citizens. 547 | URUGUAY_CDI_NUMBER = 3700; 548 | 549 | 550 | // A Venezuelan Cédula de Identidad (CDI), or national identity card, is used as the main identity document for citizens. 551 | VENEZUELA_CDI_NUMBER = 3800; 552 | 553 | 554 | } 555 | 556 | extend google.protobuf.FieldOptions { 557 | // message.description is a Mandatory Metadata 558 | CoreMetadata field_core = 70001; 559 | // Set true if the field contains classified data (Optional). 560 | bool is_classified = 70002; 561 | // Set the classification level if is_classified is true (This is Mandatory if is_classified set to true) 562 | string classification_level = 7003; 563 | // Specify the product type. product_type is an useful annotation to represent a field in a business perspective. 564 | // (e.g) user_id can be an INT field, but in the system design it could represent External Users rather than internal users. 565 | string product_type = 70004; 566 | // Set true if the field is a primary key. 
This must be true if the Schema type is Entity
567 | bool is_primary_key = 70005;
568 | // Type of the classification: Refer: https://cloud.google.com/dlp/docs/infotypes-reference
569 | ClassificationType classification_type = 70006;
570 | }
-------------------------------------------------------------------------------- /schemata/mvp/schema/route.proto: --------------------------------------------------------------------------------
1 | // File: https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/mvp/schema/route.proto
2 | 
3 | syntax = "proto3";
4 | 
5 | package org.opentraveldata;
6 | 
7 | import "google/protobuf/descriptor.proto";
8 | import "schemata/protobuf/schemata.proto";
9 | 
10 | option java_package = "org.protocol.schema";
11 | option java_outer_classname = "RouteBuilder";
12 | 
13 | message Route {
14 |   option (org.schemata.schema.message_core).description = "Table of air routes";
15 |   option (org.schemata.schema.message_core).comment = "The POR (points of reference) are specified through IATA 3-letter codes and airlines through IATA 2-letter codes";
16 |   option (org.schemata.schema.owner) = "OpenTravelData (OPTD)";
17 |   option (org.schemata.schema.domain) = "Transport";
18 |   option (org.schemata.schema.schema_type) = ENTITY;
19 | 
20 |   string airline_id = 1
21 |   [(org.schemata.schema.field_core).description = "2-letter IATA code of the airline"];
22 | 
23 |   string org_por_id = 2
24 |   [(org.schemata.schema.field_core).description = "origin POR"];
25 | 
26 |   string dst_por_id = 3
27 |   [(org.schemata.schema.field_core).description = "destination POR"];
28 | 
29 |   int32 freq = 4
30 |   [(org.schemata.schema.field_core).description = "frequency / number of flights"];
31 | }
32 | 
33 | 
-------------------------------------------------------------------------------- /schemata/mvp/score.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | java -jar target/schemata-1.0.jar score -s=model.desc -p=PROTOBUF $1
4 | 
5 | 
-------------------------------------------------------------------------------- /schemata/mvp/src/python/pyspark.py: --------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/mvp/src/python/pyspark.py
3 | #
4 | 
5 | from pyspark.sql import SparkSession
6 | from pyspark.sql.types import StructType, StructField, StringType, IntegerType
7 | 
8 | # Schema - the field names follow the header of the CSV data file
9 | schema = StructType(
10 |     [StructField("airline_code", StringType(), True),
11 |      StructField("apt_org", StringType(), True),
12 |      StructField("apt_dst", StringType(), True),
13 |      StructField("flt_freq", IntegerType(), True)]
14 | )
15 | 
16 | # Data set - source: OpenTravelData (OPTD)
17 | routeFile = "../../data/optd/optd_airline_por.csv"
18 | 
19 | # Initialize the Spark session
20 | spark = (
21 |     SparkSession
22 |     .builder
23 |     .appName("SchemataMVP")
24 |     .getOrCreate()
25 | )
26 | 
27 | # Parse the data file. Note that the schema has to be set through the
28 | # schema() method; it would be silently ignored if it were passed as a
29 | # mere key/value pair to options()
30 | routeData = (
31 |     spark
32 |     .read
33 |     .schema(schema)
34 |     .options(delimiter="^", header=True)
35 |     .csv(routeFile)
36 |     .cache()
37 | )
38 | 
39 | # Display the first rows
40 | routeData.show()
41 | 
42 | # Stop the Spark session
43 | spark.stop()
44 | 
-------------------------------------------------------------------------------- /schemata/mvp/validate.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | java -jar target/schemata-1.0.jar validate -s=model.desc -p=PROTOBUF
4 | 
5 | 
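6 | # Usage example (a sketch; run from the schemata/mvp directory, once the
7 | # model.desc descriptor has been generated with `make all`):
8 | #   $ ./validate.sh
9 | #   Schema validation success. No error to report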
-------------------------------------------------------------------------------- /schemata/quickstart/.gitignore: --------------------------------------------------------------------------------
1 | # IntelliJ project files
2 | .idea/
3 | 
4 | # build output
5 | target/
6 | *.iml
7 | *.desc
8 | 
9 | #
10 | tmp/
11 | 
12 | 
-------------------------------------------------------------------------------- /schemata/quickstart/Makefile: --------------------------------------------------------------------------------
1 | 
2 | #
3 | all: build-all
4 | 
5 | .PHONY: proto-gen
6 | proto-gen:
7 | 	protoc --proto_path=org --proto_path=schema --descriptor_set_out=model.desc --include_imports --include_source_info ./**/*.proto
8 | 
9 | .PHONY: build-all
10 | build-all: proto-gen
11 | 
12 | 
-------------------------------------------------------------------------------- /schemata/quickstart/README.md: --------------------------------------------------------------------------------
1 | Schemata - Quickstart
2 | =====================
3 | 
4 | # Table of Contents (ToC)
5 | * [References](#references)
6 | * [Initialization](#initialization)
7 |   * [Java and Maven](#java-and-maven)
8 |   * [Data contracts - Schema Quickstart](#data-contracts---schema-quickstart)
9 |   * [Additional utilities](#additional-utilities)
10 | * [Generate Schemata JAR artifact](#generate-schemata-jar-artifact)
11 | * [Schemata utilities to validate and document the data contracts](#schemata-utilities-to-validate-and-document-the-data-contracts)
12 | 
13 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
14 | 
15 | # References
16 | * [GitHub - Data Engineering helpers - Data contracts (this project)](https://github.com/data-engineering-helpers/data-contracts)
17 | 
18 | # Initialization
19 | 
20 | ## Java and Maven
21 | * If not already done, install Java (JDK 17) and Maven. The easiest way
22 |   is to use [SDKMan](https://sdkman.io/)
23 |   + To install SDKMan (behind some corporate firewalls, one may need to set up
24 |     a proxy with `http` and `https` Shell environment variables):
25 | ```bash
26 | $ curl -s "https://get.sdkman.io" | bash
27 | ```
28 |   + Proceed as requested by SDKMan (check that the SDKMan
29 |     initialization is done properly within the Shell init scripts)
30 | 
31 | * In order to check the currently installed versions of Java, Maven
32 |   and so on, use `sdk list <tool>`, for instance:
33 |   + Check the installed versions of Java:
34 | ```bash
35 | $ sdk list java
36 | ```
37 |   + Check the installed versions of Maven:
38 | ```bash
39 | $ sdk list maven
40 | ```
41 | 
42 | * Install the Amazon Corretto JDK 17 (or any other JDK of your preference,
43 |   as long as it is a version 17 JDK):
44 | ```bash
45 | $ sdk install java 17.0.6-amzn
46 | ```
47 | 
48 | * Install Maven:
49 | ```bash
50 | $ sdk install maven 3.9.1
51 | ```
52 | 
53 | ## Data contracts - Schema Quickstart
54 | * If not already done, clone the
55 |   [Data contracts Git repository](https://github.com/data-engineering-helpers/data-contracts)
56 | 
57 | * Go into the
58 |   [Data contracts Schemata quickstart directory](https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/quickstart):
59 | ```bash
60 | $ pushd ~/dev/infra/data-contracts/schemata/quickstart
61 | ```
62 | 
63 | ## Additional utilities
64 | * [JQ](https://stedolan.github.io/jq/) comes in handy to parse JSON structures.
65 |   It is packaged on most systems (macOS, Linux); see the short example below
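As an illustration, here is a sketch of typical JQ one-liners, pretty-printing
a JSON stream and extracting a single field from it (the sample JSON payload
is made up for the example):
```bash
# Pretty-print a JSON document read from a pipe
$ echo '{"name": "org.examples.User", "score": 0.19}' | jq .
{
  "name": "org.examples.User",
  "score": 0.19
}
# Extract a single field as raw text (-r strips the JSON quotes)
$ echo '{"name": "org.examples.User", "score": 0.19}' | jq -r .name
org.examples.User
```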
66 | 
67 | # Generate Schemata JAR artifact
68 | As of April 2023, Schemata does not release its JAR artifact on public
69 | repositories such as Maven Central. Hence, one has to clone the Schemata
70 | Git repository and generate the JAR artifact locally.
71 | 
72 | * If not already done, clone the Schemata Git repository:
73 | ```bash
74 | $ mkdir -p ~/dev/infra && \
75 |   git clone git@github.com:ananthdurai/schemata.git ~/dev/infra/schemata
76 | ```
77 | 
78 | * Go into the Schemata directory:
79 | ```bash
80 | $ pushd ~/dev/infra/schemata
81 | ```
82 | 
83 | * Compile and package (generate the JAR artifact for) Schemata:
84 | ```bash
85 | $ make build-all
86 | ```
87 | 
88 | * Check that the JAR artifacts have been correctly generated:
89 | ```bash
90 | $ ls -lFh target/*.jar
91 | -rw-r--r-- 1 user staff 112K Apr 13 16:27 target/original-schemata-1.0.jar
92 | -rw-r--r-- 1 user staff 7.8M Apr 13 16:27 target/schemata-1.0-sources.jar
93 | -rw-r--r-- 1 user staff 20M Apr 13 16:27 target/schemata-1.0.jar
94 | ```
95 | 
96 | * Leave the Schemata directory:
97 | ```bash
98 | $ popd
99 | ```
100 | 
101 | * Go into the
102 |   [Data contracts Schemata quickstart directory](https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/quickstart):
103 | ```bash
104 | $ pushd ~/dev/infra/data-contracts/schemata/quickstart
105 | ```
106 | 
107 | * Copy the newly generated Schemata JAR artifact:
108 | ```bash
109 | $ cp ~/dev/infra/schemata/target/schemata-1.0.jar target/
110 | ```
111 | 
112 | * Leave the Data contracts Schemata quickstart directory:
113 | ```bash
114 | $ popd
115 | ```
116 | 
117 | # Schemata utilities to validate and document the data contracts
118 | * Go into the
119 |   [Data contracts Schemata quickstart directory](https://github.com/data-engineering-helpers/data-contracts/tree/main/schemata/quickstart):
120 | ```bash
121 | $ pushd ~/dev/infra/data-contracts/schemata/quickstart
122 | ```
123 | 
124 | * Generate the data contract descriptors (they are needed by the Shell scripts
125 |   and need to be refreshed/re-generated whenever the data contracts change):
126 | ```bash
127 | $ make all
128 | ```
129 | 
130 | * Check that the data contract descriptors have been correctly generated:
131 | ```bash
132 | $ ls -lFh *.desc
133 | -rw-r--r-- 1 user staff 111K Apr 13 17:01 model.desc
134 | $ strings model.desc | head -3
135 | activity.proto
136 | 
137 | org.examplesB&
138 | org.protocol.schemaB
139 | ```
140 | 
141 | * Validate the model:
142 | ```bash
143 | $ ./validate.sh
144 | ...
145 | Schema validation success. No error to report
146 | ```
147 | 
148 | * Score the model:
149 | ```bash
150 | $ ./score.sh org.examples.User
151 | ...
152 | 
153 | Schemata score for org.examples.User : 0.19
154 | ```
155 | 
156 | * Document the model
157 |   + Raw output:
158 | ```bash
159 | $ ./document.sh
160 | ```
161 |   + Parsed with JQ:
162 | ```bash
163 | $ ./document.sh | grep -v "^[0-9]\{2\}" | jq
164 | [
165 |   {
166 |     "name": "org.examples.Category",
167 |     "description": "This is the description of the Category table",
168 | ...
169 | } 170 | ] 171 | ``` 172 | 173 | 174 | -------------------------------------------------------------------------------- /schemata/quickstart/document.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | java -jar target/schemata-1.0.jar document --source=model.desc -p=PROTOBUF 4 | 5 | -------------------------------------------------------------------------------- /schemata/quickstart/org/schemata/protobuf/schemata.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package org.schemata.schema; 4 | 5 | import "google/protobuf/descriptor.proto"; 6 | 7 | option java_package = "org.schemata.schema"; 8 | option java_outer_classname = "SchemataBuilder"; 9 | 10 | // MessageType captures the type of the stream. There are two types of stream. 11 | enum SchemaType { 12 | // This is an invalid state. If the entity defined as unknown the validator should throw an exception. 13 | UNKNOWN = 0; 14 | //Entity streams can be mutated in the downstream services. Entity streams often used to represent the current 15 | //state of the entity. In the classical DW concepts Entities typically represents the dimensions. 16 | // Entity must have a primary key field. 17 | ENTITY = 1; 18 | // Event streams are typically immutable in nature. Event streams often used to represent the state change of an Entity. 19 | // In the classical DW concepts Event streams represents the facts. Event streams will not have a primary key field. 20 | EVENT = 2; 21 | } 22 | 23 | enum EventType { 24 | // Invalid Event Type. If the message type is event, set it either LIFECYCLE or ACTIVITY or AGGREGATED 25 | NONE = 0; 26 | // LIFECYCLE event captures the state changes of an Entity. (e.g) User created, User deleted et al. 27 | LIFECYCLE = 1; 28 | // ACTIVITY event captures the events that resulted from one Entity changing the state of another Entity. 29 | // (e.g.) User A purchases Product B. The ACTIVITY event is often the result of a business transaction. 30 | ACTIVITY = 2; 31 | // AGGREGATED event captures the computed metrics over a specified window of time. (e.g) Number of views by a User for a Product. 32 | AGGREGATED = 3; 33 | } 34 | 35 | enum ActivityType { 36 | CREATED = 0; 37 | DELETED = 1; 38 | UPDATED = 2; 39 | } 40 | 41 | enum TimeUnit { 42 | SECONDS = 0; 43 | MINUTES = 1; 44 | HOURS = 2; 45 | } 46 | 47 | // CoreMetadata is the set of attribute apply to both the Message & Field 48 | message CoreMetadata { 49 | // Mandatory Metadata: description of the entity 50 | optional string description = 50001; 51 | // Optional Metadata: additional comments about the entity 52 | optional string comment = 50002; 53 | // Optional Metadata: Any related entity that has "hierarchy" or "has a" relationships. 54 | optional string see_also = 50003; 55 | // Optional Metadata: Additional link reference for further reading. 56 | // It could be a confluent page, An ADR or RFC or a Slack message link. 57 | optional string reference = 50004; 58 | } 59 | 60 | 61 | extend google.protobuf.MessageOptions { 62 | 63 | // message.description is a Mandatory Metadata 64 | CoreMetadata message_core = 60001; 65 | // Mandatory Metadata: owner of the entity. Usually it is the team name. 66 | string owner = 60002; 67 | // Mandatory Metadata: domain = 'core' indicates the entity is common across all the domains. 
68 | // Other possible domains are `sales`, `marketing`, `product` etc
69 | string domain = 60003;
70 | // Mandatory Metadata: define the type of the message.
71 | SchemaType schema_type = 60004;
72 | // Status of the entity. You can have `testing`, `production` or `staging`, depending on the lifecycle of the schema definition.
73 | string status = 60005;
74 | // Slack or Teams channel name to communicate with the team which owns this entity
75 | string team_channel = 60006;
76 | // Slack or Teams channel name to alert for any validation errors.
77 | string alert_channel = 60007;
78 | // Type of the event. Set if the Type = 'EVENT'
79 | EventType event_type = 60008;
80 | // Compliance Owner approves which team should have access to this Schema.
81 | string compliance_owner = 60009;
82 | // Compliance Owner Slack/Teams channel name
83 | string compliance_channel = 600010;
84 | }
85 | 
86 | enum ClassificationType {
87 | // Identifiers used by developers to track users for advertising purposes. These include Google Play Advertising IDs,
88 | // Amazon Advertising IDs, Apple's identifierForAdvertising (IDFA), and Apple's identifierForVendor (IDFV).
89 | ADVERTISING_ID = 0;
90 | // An age measured in months or years.
91 | AGE = 1;
92 | // A credit card number is 12 to 19 digits long. They are used for payment transactions globally.
93 | CREDIT_CARD_NUMBER = 2;
94 | 
95 | // A credit card track number is a variable length alphanumeric string. It is used to store key cardholder information.
96 | CREDIT_CARD_TRACK_NUMBER = 3;
97 | 
98 | // A date. This infoType includes most date formats, including the names of common world holidays.
99 | DATE = 4;
100 | 
101 | 
102 | // A date that is identified by context as a date of birth. Note: Not recommended for use during latency sensitive operations.
103 | DATE_OF_BIRTH = 5;
104 | 
105 | // A domain name as defined by the DNS standard.
106 | DOMAIN_NAME = 6;
107 | 
108 | // An email address identifies the mailbox that emails are sent to or from. The maximum length of the domain name is 255 characters, and the maximum length of the local-part is 64 characters.
109 | EMAIL_ADDRESS = 7;
110 | 
111 | // A person’s ethnic group.
112 | ETHNIC_GROUP = 8;
113 | 
114 | // A common female name. Note: Not recommended for use during latency sensitive operations.
115 | FEMALE_NAME = 9;
116 | 
117 | // A first name is defined as the first part of a PERSON_NAME. Note: Not recommended for use during latency sensitive operations.
118 | FIRST_NAME = 10;
119 | 
120 | // A person’s gender identity.
121 | GENDER = 11;
122 | 
123 | // Alphanumeric and special character strings that may be personally identifying but do not belong to a well-defined category, such as user IDs or medical record numbers.
124 | GENERIC_ID = 12;
125 | 
126 | // An International Bank Account Number (IBAN) is an internationally agreed-upon method for identifying bank accounts defined by the International Organization for Standardization (ISO) 13616:2007 standard. The European Committee for Banking Standards (ECBS) created ISO 13616:2007. An IBAN consists of up to 34 alphanumeric characters, including elements such as a country code or account number.
127 | IBAN_CODE = 13;
128 | 
129 | // An HTTP cookie is a standard way of storing data on a per website basis. This detector will find headers containing these cookies.
130 | HTTP_COOKIE = 14;
131 | 
132 | // An Integrated Circuit Card Identifier (ICCID) is used to uniquely identify each SIM card. It includes information, such as the country the card is active in and the mobile network code.
133 | ICCID_NUMBER = 15;
134 | 
135 | // The International Classification of Diseases, Ninth Revision, Clinical Modification (ICD-9-CM) lexicon is used to assign diagnostic and procedure codes associated with inpatient, outpatient, and physician office use in the United States. The US National Center for Health Statistics (NCHS) created the ICD-9-CM lexicon. It is based on the ICD-9 lexicon, but provides for more morbidity detail. The ICD-9-CM lexicon is updated annually on October 1.
136 | ICD9_CODE = 16;
137 | 
138 | // Like ICD-9-CM codes, the International Classification of Diseases, Tenth Revision, Clinical Modification (ICD-10-CM) lexicon is a series of diagnostic codes. The World Health Organization (WHO) publishes the ICD-10-CM lexicon to describe causes of morbidity and mortality.
139 | ICD10_CODE = 17;
140 | 
141 | // An International Mobile Equipment Identity (IMEI) hardware identifier, used to identify mobile phones.
142 | IMEI_HARDWARE_ID = 18;
143 | 
144 | // An International Mobile Subscriber Identity (IMSI) identifier, used to identify users on a mobile network.
145 | IMSI_ID = 19;
146 | 
147 | // An Internet Protocol (IP) address (either IPv4 or IPv6).
148 | IP_ADDRESS = 20;
149 | 
150 | // A last name is defined as the last part of a PERSON_NAME.
151 | LAST_NAME = 21;
152 | 
153 | 
154 | // A physical address or location.
155 | LOCATION = 22;
156 | 
157 | // A media access control address (MAC address), which is an identifier for a network adapter.
158 | MAC_ADDRESS = 23;
159 | // A local media access control address (MAC address), which is an identifier for a network adapter.
160 | MAC_ADDRESS_LOCAL = 24;
161 | // A common male name.
162 | MALE_NAME = 25;
163 | 
164 | // Terms that commonly refer to a person's medical condition or health.
165 | MEDICAL_TERM = 26;
166 | 
167 | // A name of a chain store, business or organization.
168 | ORGANIZATION_NAME = 27;
169 | 
170 | 
171 | // A passport number that matches passport numbers for the following countries: Australia, Canada, China, France, Germany, Japan, Korea, Mexico, The Netherlands, Poland, Singapore, Spain, Sweden, Taiwan, United Kingdom, and the United States.
172 | PASSPORT = 28;
173 | 
174 | // A full person name, which can include first names, middle names or initials, and last names. Note: Not recommended for use during latency sensitive operations.
175 | PERSON_NAME = 29;
176 | 
177 | // A telephone number.
178 | PHONE_NUMBER = 30;
179 | 
180 | // A street address. Note: Not recommended for use during latency sensitive operations.
181 | STREET_ADDRESS = 31;
182 | 
183 | // A SWIFT code is the same as a Bank Identifier Code (BIC). It's a unique identification code for a particular bank. These codes are used when transferring money between banks, particularly for international wire transfers. Banks also use the codes for exchanging other messages.
184 | SWIFT_CODE = 32;
185 | // A timestamp of a specific time of day.
186 | TIME = 33;
187 | 
188 | // A Uniform Resource Locator (URL).
189 | URL = 34;
190 | 
191 | // A vehicle identification number (VIN) is a unique 17-digit code assigned to every on-road motor vehicle.
192 | VEHICLE_IDENTIFICATION_NUMBER = 35;
193 | 
194 | // The infoType detectors in this section detect credentials and other secret data.
195 | Credentials_And_Secrets = 36;
196 | 
197 | 
198 | // An authentication token is a machine-readable way of determining whether a particular request has been authorized for a user. This detector currently identifies tokens that comply with OAuth or Bearer authentication.
199 | AUTH_TOKEN = 37;
200 | 
201 | // Amazon Web Services account access keys.
202 | AWS_CREDENTIALS = 38;
203 | 
204 | // Microsoft Azure certificate credentials for application authentication.
205 | AZURE_AUTH_TOKEN = 39;
206 | 
207 | // A basic authentication header is an HTTP header used to identify a user to a server. It is part of the HTTP specification in RFC 1945, section 11.
208 | BASIC_AUTH_HEADER = 40;
209 | 
210 | // An encryption key within configuration, code, or log text.
211 | ENCRYPTION_KEY = 41;
212 | 
213 | // Google Cloud API key. An encrypted string that is used when calling Google Cloud APIs that don't need to access private user data.
214 | GCP_API_KEY = 42;
215 | 
216 | // Google Cloud service account credentials. Credentials that can be used to authenticate with Google API client libraries and service accounts.
217 | GCP_CREDENTIALS = 43;
218 | 
219 | // JSON Web Token. JSON Web Token in compact form. Represents a set of claims as a JSON object that is digitally signed using JSON Web Signature.
220 | JSON_WEB_TOKEN = 44;
221 | 
222 | // Clear text passwords in configs, code, and other text.
223 | PASSWORD = 45;
224 | 
225 | // A weakly hashed password is a method of storing a password that is easy to reverse engineer. The presence of such hashes often indicates that a system's security can be improved.
226 | WEAK_PASSWORD_HASH = 46;
227 | 
228 | // An XSRF token is an HTTP header that is commonly used to prevent cross-site request forgery attacks. Cross-site request forgery is a type of security vulnerability that can be exploited by malicious sites.
229 | XSRF_TOKEN = 47;
230 | 
231 | 
232 | // An Argentine Documento Nacional de Identidad (DNI), or national identity card, is used as the main identity document for citizens.
233 | ARGENTINA_DNI_NUMBER = 100;
234 | 
235 | 
236 | // An Australian driver's license number.
237 | AUSTRALIA_DRIVERS_LICENSE_NUMBER = 200;
238 | 
239 | // A 9-digit Australian Medicare account number is issued to permanent residents of Australia (except for Norfolk Island). The primary purpose of this number is to prove Medicare eligibility to receive subsidized care in Australia.
240 | AUSTRALIA_MEDICARE_NUMBER = 201;
241 | 
242 | // An Australian passport number.
243 | AUSTRALIA_PASSPORT = 202;
244 | 
245 | // An Australian tax file number (TFN) is a number issued by the Australian Tax Office for taxpayer identification. Every taxpaying entity, such as an individual or an organization, is assigned a unique number.
246 | AUSTRALIA_TAX_FILE_NUMBER = 203;
247 | 
248 | 
249 | // A 12-digit Belgian national identity card number.
250 | BELGIUM_NATIONAL_ID_CARD_NUMBER = 300;
251 | 
252 | 
253 | // The Brazilian Cadastro de Pessoas Físicas (CPF) number, or Natural Persons Register number, is an 11-digit number used in Brazil for taxpayer identification.
254 | BRAZIL_CPF_NUMBER = 400;
255 | 
256 | 
257 | // A Canadian bank account number.
258 | CANADA_BANK_ACCOUNT = 500;
259 | 
260 | // The British Columbia Personal Health Number (PHN) is issued to citizens, permanent residents, temporary workers, students, and other individuals who are entitled to health care coverage in the Province of British Columbia.
261 | CANADA_BC_PHN = 501;
262 | 
263 | // A driver's license number for each of the ten provinces in Canada (the three territories are currently not covered).
264 | CANADA_DRIVERS_LICENSE_NUMBER = 502; 265 | 266 | // The Ontario Health Insurance Plan (OHIP) number is issued to citizens, permanent residents, temporary workers, students, and other individuals who are entitled to health care coverage in the Province of Ontario. 267 | CANADA_OHIP = 503; 268 | 269 | // A Canadian passport number. 270 | CANADA_PASSPORT = 504; 271 | 272 | // The Québec Health Insurance Number (also known as the RAMQ number) is issued to citizens, permanent residents, temporary workers, students, and other individuals who are entitled to health care coverage in the Province of Québec. 273 | CANADA_QUEBEC_HIN = 505; 274 | 275 | // The Canadian Social Insurance Number (SIN) is the main identifier used in Canada for citizens, permanent residents, and people on work or study visas. With a Canadian SIN and mailing address, one can apply for health care coverage, driver's licenses, and other important services. 276 | CANADA_SOCIAL_INSURANCE_NUMBER = 506; 277 | 278 | 279 | // A Chilean Cédula de Identidad (CDI), or identity card, is used as the main identity document for citizens. 280 | CHILE_CDI_NUMBER = 600; 281 | 282 | 283 | // A Chinese resident identification number. 284 | CHINA_RESIDENT_ID_NUMBER = 700; 285 | 286 | // A Chinese passport number. 287 | CHINA_PASSPORT = 701; 288 | 289 | // A Colombian Cédula de Ciudadanía (CDC), or citizenship card, is used as the main identity document for citizens. 290 | COLOMBIA_CDC_NUMBER = 800; 291 | 292 | 293 | // A Personal Identification Number (CPR, Det Centrale Personregister) is a national ID number in Denmark. It is used with public agencies such as health care and tax authorities. Banks and insurance companies also use it as a customer number. The CPR number is required for people who reside in Denmark, pay tax or own property there. 294 | DENMARK_CPR_NUMBER = 900; 295 | 296 | 297 | // The French Carte Nationale d'Identité Sécurisée (CNI or CNIS) is the French national identity card. It's an official identity document consisting of a 12-digit identification number. This number is commonly used when opening bank accounts and when paying by check. It can sometimes be used instead of a passport or visa within the European Union (EU) and in some other countries. 298 | FRANCE_CNI = 1000; 299 | 300 | // The French Numéro d'Inscription au Répertoire (NIR) is a permanent personal identification number that's also known as the French social security number for services including healthcare and pensions. 301 | FRANCE_NIR = 1001; 302 | 303 | // A French passport number. 304 | FRANCE_PASSPORT = 1002; 305 | 306 | // The French tax identification number is a government-issued ID for all individuals paying taxes in France. 307 | FRANCE_TAX_IDENTIFICATION_NUMBER = 1003; 308 | 309 | 310 | // A Finnish personal identity code, a national government identification number for Finnish citizens used on identity cards, driver's licenses and passports. 311 | FINLAND_NATIONAL_ID_NUMBER = 1100; 312 | 313 | 314 | // A German driver's license number. 315 | GERMANY_DRIVERS_LICENSE_NUMBER = 1200; 316 | 317 | // The German Personalausweis, or identity card, is used as the main identity document for citizens of Germany. 318 | GERMANY_IDENTITY_CARD_NUMBER = 1201; 319 | 320 | // A German passport number. The format of a German passport number is 10 alphanumeric characters, chosen from numerals 0–9 and letters C, F, G, H, J, K, L, M, N, P, R, T, V, W, X, Y, Z. 
321 | GERMANY_PASSPORT = 1202; 322 | 323 | // An 11-digit German taxpayer identification number assigned to both natural-born and other legal residents of Germany for the purposes of recording tax payments. 324 | GERMANY_TAXPAYER_IDENTIFICATION_NUMBER = 1203; 325 | 326 | // A German Schufa identification number. Schufa Holding AG is a German credit bureau whose aim is to protect clients from credit risk. 327 | GERMANY_SCHUFA_ID = 1204; 328 | 329 | 330 | // The 香港身份證, or Hong Kong identity card (HKIC), is used as the main identity document for citizens of Hong Kong. 331 | HONG_KONG_ID_NUMBER = 1300; 332 | 333 | 334 | // The Indian Aadhaar number is a 12-digit unique identity number obtained by residents of India, based on their biometric and demographic data. 335 | INDIA_AADHAAR_INDIVIDUAL = 1400; 336 | 337 | // The Indian GST identification number (GSTIN) is a unique identifier required of every business in India for taxation. 338 | INDIA_GST_INDIVIDUAL = 1401; 339 | 340 | // The Indian Personal Permanent Account Number (PAN) is a unique 10-digit alphanumeric identifier used for identification of individuals—particularly people who pay income tax. It's issued by the Indian Income Tax Department. The PAN is valid for the lifetime of the holder. 341 | INDIA_PAN_INDIVIDUAL = 1402; 342 | 343 | 344 | // An Indonesian Single Identity Number (Nomor Induk Kependudukan, or NIK) is the national identification number of Indonesia. The NIK is used as the basis for issuing Indonesian resident identity cards (Kartu Tanda Penduduk, or KTP), passports, driver's licenses and other identity documents. 345 | INDONESIA_NIK_NUMBER = 1500; 346 | 347 | 348 | // An Irish driving license number. 349 | IRELAND_DRIVING_LICENSE_NUMBER = 1600; 350 | 351 | // Eircode is an Irish postal code that uniquely identifies an address. 352 | IRELAND_EIRCODE = 1601; 353 | 354 | // An Irish (IE) passport number. 355 | IRELAND_PASSPORT = 1602; 356 | 357 | // The Irish Personal Public Service Number (PPS number, or PPSN) is a unique number for accessing social welfare benefits, public services, and information in Ireland. 358 | IRELAND_PPSN = 1603; 359 | 360 | 361 | // The Israel identity card number is issued to all Israeli citizens at birth by the Ministry of the Interior. Temporary residents are assigned a number when they receive temporary resident status. 362 | ISRAEL_IDENTITY_CARD_NUMBER = 1700; 363 | 364 | 365 | // An Italy fiscal code number is a unique 16-digit code assigned to Italian citizens as a form of identification. 366 | ITALY_FISCAL_CODE = 1800; 367 | 368 | 369 | // A Japanese bank account number. 370 | JAPAN_BANK_ACCOUNT = 1900; 371 | 372 | // A Japanese driver's license number. 373 | JAPAN_DRIVERS_LICENSE_NUMBER = 1901; 374 | 375 | // The Japanese national identification number—sometimes referred to as "My Number"—is a new national ID number as of January 2016. 376 | JAPAN_INDIVIDUAL_NUMBER = 1902; 377 | 378 | // A Japanese passport number. The passport number consists of two alphabetic characters followed by seven digits. 379 | JAPAN_PASSPORT = 1903; 380 | 381 | 382 | // A Korean passport number. 383 | KOREA_PASSPORT = 2000; 384 | 385 | // A South Korean Social Security number. 386 | KOREA_RRN = 2001; 387 | 388 | 389 | // The Mexico Clave Única de Registro de Población (CURP) number, or Unique Population Registry Code or Personal Identification Code number. 
The CURP number is an 18-character state-issued identification number assigned by the Mexican government to citizens or residents of Mexico and used for taxpayer identification.
390 | MEXICO_CURP_NUMBER = 2100;
391 | 
392 | // A Mexican passport number.
393 | MEXICO_PASSPORT = 2101;
394 | 
395 | 
396 | // A Dutch Burgerservicenummer (BSN), or Citizen's Service Number, is a state-issued identification number that's on driver's licenses, passports, and international ID cards.
397 | NETHERLANDS_BSN_NUMBER = 2200;
398 | 
399 | // A Dutch passport number.
400 | NETHERLANDS_PASSPORT = 2201;
401 | 
402 | 
403 | // Norway's Fødselsnummer, National Identification Number, or Birth Number is assigned at birth, or on migration into the country. It is registered with the Norwegian Tax Office.
404 | NORWAY_NI_NUMBER = 2300;
405 | 
406 | 
407 | // A Paraguayan Cédula de Identidad Civil (CIC), or civil identity card, is used as the main identity document for citizens.
408 | PARAGUAY_CIC_NUMBER = 2400;
409 | 
410 | 
411 | // A Peruvian Documento Nacional de Identidad (DNI), or national identity card, is used as the main identity document for citizens.
412 | PERU_DNI_NUMBER = 2500;
413 | 
414 | 
415 | // The PESEL number is the national identification number used in Poland. It is mandatory for all permanent residents of Poland, and for temporary residents staying there longer than 2 months. It is assigned to just one person and cannot be changed.
416 | POLAND_PESEL_NUMBER = 2600;
417 | 
418 | // The Polish identity card number is a government identification number for Polish citizens. Every citizen older than 18 years must have an identity card. The local Office of Civic Affairs issues the card, and each card has its own unique number.
419 | POLAND_NATIONAL_ID_NUMBER = 2601;
420 | 
421 | // A Polish passport number. A Polish passport is an international travel document for Polish citizens. It can also be used as a proof of Polish citizenship.
422 | POLAND_PASSPORT = 2602;
423 | 
424 | 
425 | // A Portuguese Cartão de cidadão (CDC), or Citizen Card, is used as the main identity, Social Security, health services, taxpayer, and voter document for citizens.
426 | PORTUGAL_CDC_NUMBER = 2700;
427 | 
428 | 
429 | // A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card.
430 | SINGAPORE_NATIONAL_REGISTRATION_ID_NUMBER = 2800;
431 | 
432 | // A Singaporean passport number.
433 | SINGAPORE_PASSPORT = 2801;
434 | 
435 | 
436 | // A South Africa ID number.
437 | SOUTH_AFRICA_ID_NUMBER = 2900;
438 | 
439 | 
440 | // The Spanish Código de Identificación Fiscal (CIF) was the tax identification system used in Spain for legal entities until 2008. It was then replaced by the Número de Identificación Fiscal (NIF) for natural and juridical persons.
441 | SPAIN_CIF_NUMBER = 3000;
442 | 
443 | // A Spain national identity number.
444 | SPAIN_DNI_NUMBER = 3001;
445 | 
446 | // A Spanish driver's license number.
447 | SPAIN_DRIVERS_LICENSE_NUMBER = 3002;
448 | 
449 | // The Spanish Número de Identificación de Extranjeros (NIE) is an identification number for foreigners living or doing business in Spain. An NIE number is needed for key transactions such as opening a bank account, buying a car, or setting up a mobile phone contract.
450 | SPAIN_NIE_NUMBER = 3003;
451 | 
452 | // The Spanish Número de Identificación Fiscal (NIF) is a government identification number for Spanish citizens. An NIF number is needed for key transactions such as opening a bank account, buying a car, or setting up a mobile phone contract.
453 | SPAIN_NIF_NUMBER = 3004; 454 | 455 | // A Spanish Ordinary Passport (Pasaporte Ordinario) number. There are 4 different types of passports in Spain. This detector is for the Ordinary Passport (Pasaporte Ordinario) type, which is issued for ordinary travel, such as vacations and business trips. 456 | SPAIN_PASSPORT = 3005; 457 | 458 | // The Spanish Social Security number (Número de Afiliación a la Seguridad Social) is a 10-digit sequence that identifies a person in Spain for all interactions with the country's Social Security system. 459 | SPAIN_SOCIAL_SECURITY_NUMBER = 3006; 460 | 461 | 462 | // A Swedish Personal Identity Number (personnummer), a national government identification number for Swedish citizens. 463 | SWEDEN_NATIONAL_ID_NUMBER = 3100; 464 | 465 | // A Swedish passport number. 466 | SWEDEN_PASSPORT = 3101; 467 | 468 | 469 | // A Taiwanese passport number. 470 | TAIWAN_PASSPORT = 3200; 471 | 472 | 473 | // The Thai บัตรประจำตัวประชาชนไทย, or identity card, is used as the main identity document for Thai nationals. 474 | THAILAND_NATIONAL_ID_NUMBER = 3300; 475 | 476 | 477 | // A unique Turkish personal identification number, assigned to every citizen of Turkey. 478 | TURKEY_ID_NUMBER = 3400; 479 | 480 | 481 | // The Scotland Community Health Index Number (CHI number) is a 10-digit sequence used to uniquely identify a patient within National Health Service Scotland (NHS Scotland). 482 | SCOTLAND_COMMUNITY_HEALTH_INDEX_NUMBER = 3500; 483 | 484 | // A driver's license number for the United Kingdom of Great Britain and Northern Ireland (UK). 485 | UK_DRIVERS_LICENSE_NUMBER = 3501; 486 | 487 | // A National Health Service (NHS) number is the unique number allocated to a registered user of the three public health services in England, Wales, and the Isle of Man. 488 | UK_NATIONAL_HEALTH_SERVICE_NUMBER = 3502; 489 | 490 | // The National Insurance number (NINO) is a number used in the United Kingdom (UK) in the administration of the National Insurance or social security system. It identifies people, and is also used for some purposes in the UK tax system. The number is sometimes referred to as NI No or NINO. 491 | UK_NATIONAL_INSURANCE_NUMBER = 3503; 492 | 493 | // A United Kingdom (UK) passport number. 494 | UK_PASSPORT = 3504; 495 | 496 | // A United Kingdom (UK) Unique Taxpayer Reference (UTR) number. This number, comprised of a string of 10 decimal digits, is an identifier used by the UK government to manage the taxation system. Unlike other identifiers, such as the passport number or social insurance number, the UTR is not listed on official identity cards. 497 | UK_TAXPAYER_REFERENCE = 3505; 498 | 499 | 500 | // An American Bankers' Committee on Uniform Security Identification Procedures (CUSIP) number is a 9-character alphanumeric code that identifies a North American financial security. 501 | AMERICAN_BANKERS_CUSIP_ID = 3600; 502 | 503 | // Drug product name or active ingredient registered by the United States Food and Drug Administration (FDA). 504 | FDA_CODE = 3601; 505 | 506 | // A United States Adoption Taxpayer Identification Number (ATIN) is a type of United States Tax Identification Number (TIN). An ATIN is issued by the Internal Revenue Service (IRS) to individuals who are in the process of legally adopting a US citizen or resident child. 507 | US_ADOPTION_TAXPAYER_IDENTIFICATION_NUMBER = 3602; 508 | 509 | // The American Bankers Association (ABA) Routing Number (also called the transit number) is a nine-digit code. 
It's used to identify the financial institution that's responsible to credit or entitled to receive credit for a check or electronic transaction.
510 | US_BANK_ROUTING_MICR = 3603;
511 | 
512 | // A US Drug Enforcement Administration (DEA) number is assigned to a health care provider by the US DEA. It allows the health care provider to write prescriptions for controlled substances. The DEA number is often used as a general "prescriber number" that is a unique identifier for anyone who can prescribe medication.
513 | US_DEA_NUMBER = 3604;
514 | 
515 | // A driver's license number for the United States. Format can vary depending on the issuing state.
516 | US_DRIVERS_LICENSE_NUMBER = 3605;
517 | 
518 | // A United States Employer Identification Number (EIN) is also known as a Federal Tax Identification Number, and is used to identify a business entity.
519 | US_EMPLOYER_IDENTIFICATION_NUMBER = 3606;
520 | 
521 | // The US National Provider Identifier (NPI) is a unique 10-digit identification number issued to health care providers in the United States by the Centers for Medicare and Medicaid Services (CMS). The NPI has replaced the unique provider identification number (UPIN) as the required identifier for Medicare services. It's also used by other payers, including commercial healthcare insurers.
522 | US_HEALTHCARE_NPI = 3607;
523 | 
524 | // A United States Individual Taxpayer Identification Number (ITIN) is a type of Tax Identification Number (TIN), issued by the Internal Revenue Service (IRS). An ITIN is a tax processing number only available for certain nonresident and resident aliens, their spouses, and dependents who cannot get a Social Security Number (SSN).
525 | US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER = 3608;
526 | 
527 | // A United States passport number.
528 | US_PASSPORT = 3609;
529 | 
530 | // A United States Preparer Taxpayer Identification Number (PTIN) is an identification number that all paid tax return preparers must use on US federal tax returns or claims for refund submitted to the US Internal Revenue Service (IRS).
531 | US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER = 3610;
532 | 
533 | // A United States Social Security number (SSN) is a 9-digit number issued to US citizens, permanent residents, and temporary residents. This detector will not match against numbers with all zeroes in any digit group (that is, 000-##-####, ###-00-####, or ###-##-0000), against numbers with 666 in the first digit group, or against numbers whose first digit is 9.
534 | US_SOCIAL_SECURITY_NUMBER = 3611;
535 | 
536 | // A United States state name.
537 | US_STATE = 3612;
538 | 
539 | // A US toll-free telephone number.
540 | US_TOLLFREE_PHONE_NUMBER = 3613;
541 | 
542 | // A vehicle identification number (VIN) is a unique 17-digit code assigned to every on-road motor vehicle in North America.
543 | US_VEHICLE_IDENTIFICATION_NUMBER = 3614;
544 | 
545 | 
546 | // A Uruguayan Cédula de Identidad (CDI), or identity card, is used as the main identity document for citizens.
547 | URUGUAY_CDI_NUMBER = 3700;
548 | 
549 | 
550 | // A Venezuelan Cédula de Identidad (CDI), or national identity card, is used as the main identity document for citizens.
551 | VENEZUELA_CDI_NUMBER = 3800;
552 | 
553 | 
554 | }
555 | 
556 | extend google.protobuf.FieldOptions {
557 | // field.description is Mandatory Metadata
558 | CoreMetadata field_core = 70001;
559 | // Set true if the field contains classified data (Optional).
560 | bool is_classified = 70002;
561 | // Set the classification level if is_classified is true (this is mandatory when is_classified is set to true)
562 | string classification_level = 7003;
563 | // Specify the product type. product_type is a useful annotation to represent a field from a business perspective.
564 | // (e.g.) user_id can be an INT field, but in the system design it could represent External Users rather than internal users.
565 | string product_type = 70004;
566 | // Set true if the field is a primary key. This must be true if the Schema type is Entity
567 | bool is_primary_key = 70005;
568 | // Type of the classification. Refer to: https://cloud.google.com/dlp/docs/infotypes-reference
569 | ClassificationType classification_type = 70006;
570 | }
-------------------------------------------------------------------------------- /schemata/quickstart/schema/activity.proto: --------------------------------------------------------------------------------
1 | syntax = "proto3";
2 | 
3 | package org.examples;
4 | 
5 | option java_package = "org.protocol.schema";
6 | option java_outer_classname = "ActivityBuilder";
7 | 
8 | 
9 | 
-------------------------------------------------------------------------------- /schemata/quickstart/schema/brand.proto: --------------------------------------------------------------------------------
1 | syntax = "proto3";
2 | 
3 | package org.examples;
4 | 
5 | import "schemata/protobuf/schemata.proto";
6 | 
7 | option java_package = "org.protocol.schema";
8 | option java_outer_classname = "BrandBuilder";
9 | 
10 | 
11 | message Brand {
12 | 
13 | option(org.schemata.schema.message_core).description = "This is the description of the Brand table";
14 | option(org.schemata.schema.message_core).comment = "The comment added after thought";
15 | option(org.schemata.schema.message_core).see_also = "db.brand MySQL table";
16 | option(org.schemata.schema.owner) = "Platform";
17 | option(org.schemata.schema.domain) = "Core";
18 | option(org.schemata.schema.schema_type) = ENTITY;
19 | option(org.schemata.schema.team_channel) = "#team-platform";
20 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
21 | 
22 | int32 id = 1
23 | [(org.schemata.schema.field_core).description = "Unique identifier for Brand", (org.schemata.schema.is_primary_key) = true];
24 | 
25 | string name = 2
26 | [(org.schemata.schema.field_core).description = "Name of the Brand"];
27 | 
28 | bool is_active = 3
29 | [(org.schemata.schema.field_core).description = "define the active status of the Brand. `true` == active; `false` == inactive", (org.schemata.schema.field_core).comment = "should refactor to non-binary status"];
30 | 
31 | }
32 | 
33 | message BrandEvent {
34 | option(org.schemata.schema.message_core).description = "This is the description of the brand activity table";
35 | option(org.schemata.schema.owner) = "Platform";
36 | option(org.schemata.schema.domain) = "Core";
37 | option(org.schemata.schema.schema_type) = EVENT;
38 | option(org.schemata.schema.event_type) = LIFECYCLE;
39 | option(org.schemata.schema.team_channel) = "#team-platform";
40 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
41 | 
42 | Brand previous_brand_state = 1
43 | [(org.schemata.schema.field_core).description = "Previous version of the Brand entity before the mutation"];
44 | 
45 | Brand current_brand_state = 2
46 | [(org.schemata.schema.field_core).description = "Current version of the Brand entity after the mutation"];
47 | 
48 | org.schemata.schema.ActivityType activity_type = 3
49 | [(org.schemata.schema.field_core).description = "Lifecycle event type for the Brand table"];
50 | }
51 | 
-------------------------------------------------------------------------------- /schemata/quickstart/schema/campaign.proto: --------------------------------------------------------------------------------
1 | syntax = "proto3";
2 | 
3 | package org.examples;
4 | 
5 | import "schemata/protobuf/schemata.proto";
6 | import "product.proto";
7 | import "google/protobuf/timestamp.proto";
8 | import "category.proto";
9 | 
10 | option java_package = "org.protocol.schema";
11 | option java_outer_classname = "CampaignBuilder";
12 | 
13 | 
14 | enum CampaignOrigin {
15 | EMAIL = 0;
16 | SOCIAL_MEDIA = 1;
17 | }
18 | 
19 | message Campaign {
20 | 
21 | option(org.schemata.schema.message_core).description = "This is the description of the Campaign table";
22 | option(org.schemata.schema.message_core).comment = "The comment added after thought";
23 | option(org.schemata.schema.message_core).see_also = "db.campaign MySQL table";
24 | option(org.schemata.schema.owner) = "Marketing";
25 | option(org.schemata.schema.domain) = "Growth";
26 | option(org.schemata.schema.schema_type) = ENTITY;
27 | option(org.schemata.schema.team_channel) = "#team-growth";
28 | option(org.schemata.schema.alert_channel) = "#alerts-growth";
29 | 
30 | int32 id = 1
31 | [(org.schemata.schema.field_core).description = "Unique identifier for Campaign", (org.schemata.schema.is_primary_key) = true];
32 | 
33 | string name = 2
34 | [(org.schemata.schema.field_core).description = "Name of the Campaign"];
35 | 
36 | bool is_active = 3
37 | [(org.schemata.schema.field_core).description = "define the active status of the Campaign. `true` == active; `false` == inactive", (org.schemata.schema.field_core).comment = "should refactor to non-binary status"];
38 | 
39 | }
40 | 
41 | message CampaignEvent {
42 | option(org.schemata.schema.message_core).description = "This is the description of the Campaign activity table";
43 | option(org.schemata.schema.owner) = "Marketing";
44 | option(org.schemata.schema.domain) = "Growth";
45 | option(org.schemata.schema.schema_type) = EVENT;
46 | option(org.schemata.schema.event_type) = LIFECYCLE;
47 | option(org.schemata.schema.team_channel) = "#team-growth";
48 | option(org.schemata.schema.alert_channel) = "#alerts-growth";
49 | 
50 | Campaign previous_campaign_state = 1
51 | [(org.schemata.schema.field_core).description = "Previous version of the Campaign entity before the mutation"];
52 | 
53 | Campaign current_campaign_state = 2
54 | [(org.schemata.schema.field_core).description = "Current version of the Campaign entity after the mutation"];
55 | 
56 | org.schemata.schema.ActivityType activity_type = 3
57 | [(org.schemata.schema.field_core).description = "Lifecycle event type for the Campaign table"];
58 | }
59 | 
60 | message CampaignCategoryTrackerEvent {
61 | option(org.schemata.schema.message_core).description = "This is the description of the Campaign activity table";
62 | option(org.schemata.schema.owner) = "Marketing";
63 | option(org.schemata.schema.domain) = "Growth";
64 | option(org.schemata.schema.schema_type) = EVENT;
65 | option(org.schemata.schema.event_type) = ACTIVITY;
66 | option(org.schemata.schema.team_channel) = "#team-growth";
67 | option(org.schemata.schema.alert_channel) = "#alerts-growth";
68 | 
69 | Campaign campaign = 1 [(org.schemata.schema.field_core).description = "Campaign entity"];
70 | Category category = 2 [(org.schemata.schema.field_core).description = "Category of the targeted campaign"];
71 | CampaignOrigin origin = 3 [(org.schemata.schema.field_core).description = "origin source of the campaign"];
72 | google.protobuf.Timestamp timestamp = 4 [(org.schemata.schema.field_core).description = "Timestamp of the activity"];
73 | 
74 | }
75 | 
76 | message CampaignProductTrackerEvent {
77 | 
78 | option(org.schemata.schema.message_core).description = "This is the description of the Campaign activity table";
79 | option(org.schemata.schema.owner) = "Marketing";
80 | option(org.schemata.schema.domain) = "Growth";
81 | option(org.schemata.schema.schema_type) = EVENT;
82 | option(org.schemata.schema.event_type) = ACTIVITY;
83 | option(org.schemata.schema.team_channel) = "#team-growth";
84 | option(org.schemata.schema.alert_channel) = "#alerts-growth";
85 | 
86 | Campaign campaign = 1 [(org.schemata.schema.field_core).description = "Campaign entity"];
87 | Product product = 2 [(org.schemata.schema.field_core).description = "Product of the targeted campaign"];
88 | CampaignOrigin origin = 3 [(org.schemata.schema.field_core).description = "origin source of the campaign"];
89 | google.protobuf.Timestamp timestamp = 4 [(org.schemata.schema.field_core).description = "Timestamp of the activity"];
90 | }
91 | 
92 | 
-------------------------------------------------------------------------------- /schemata/quickstart/schema/category.proto: --------------------------------------------------------------------------------
1 | syntax = "proto3";
2 | 
3 | package org.examples;
4 | 
5 | import "schemata/protobuf/schemata.proto";
6 | 
7 | option java_package = "org.protocol.schema";
8 | option java_outer_classname = "CategoryBuilder";
9 | 
10 | 
11 | message Category {
12 | 
13 | option(org.schemata.schema.message_core).description = "This is the description of the Category table";
14 | option(org.schemata.schema.message_core).comment = "The comment added after thought";
15 | option(org.schemata.schema.message_core).see_also = "db.category MySQL table";
16 | option(org.schemata.schema.owner) = "Platform";
17 | option(org.schemata.schema.domain) = "Core";
18 | option(org.schemata.schema.schema_type) = ENTITY;
19 | option(org.schemata.schema.team_channel) = "#team-platform";
20 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
21 | 
22 | int32 id = 1
23 | [(org.schemata.schema.field_core).description = "Unique identifier for Category", (org.schemata.schema.is_primary_key) = true];
24 | 
25 | string name = 2
26 | [(org.schemata.schema.field_core).description = "Name of the Category"];
27 | 
28 | bool is_active = 3
29 | [(org.schemata.schema.field_core).description = "define the active status of the Category. `true` == active; `false` == inactive", (org.schemata.schema.field_core).comment = "should refactor to non-binary status"];
30 | 
31 | }
32 | 
33 | message CategoryEvent {
34 | option(org.schemata.schema.message_core).description = "This is the description of the Category activity table";
35 | option(org.schemata.schema.owner) = "Platform";
36 | option(org.schemata.schema.domain) = "Core";
37 | option(org.schemata.schema.schema_type) = EVENT;
38 | option(org.schemata.schema.event_type) = LIFECYCLE;
39 | option(org.schemata.schema.team_channel) = "#team-platform";
40 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
41 | 
42 | Category previous_category_state = 1
43 | [(org.schemata.schema.field_core).description = "Previous version of the Category entity before the mutation"];
44 | 
45 | Category current_category_state = 2
46 | [(org.schemata.schema.field_core).description = "Current version of the Category entity after the mutation"];
47 | 
48 | org.schemata.schema.ActivityType activity_type = 3
49 | [(org.schemata.schema.field_core).description = "Lifecycle event type for the Category table"];
50 | }
51 | 
-------------------------------------------------------------------------------- /schemata/quickstart/schema/entities.proto: --------------------------------------------------------------------------------
1 | syntax = "proto3";
2 | 
3 | package org.examples;
4 | 
5 | import "google/protobuf/descriptor.proto";
6 | import "schemata/protobuf/schemata.proto";
7 | 
8 | option java_package = "org.entities.proto";
9 | option java_outer_classname = "EntityBuilder";
10 | 
11 | message Department {
12 | option(org.schemata.schema.message_core).description = "This is the description of the department table";
13 | option(org.schemata.schema.message_core).comment = "The comment added after thought";
14 | option(org.schemata.schema.owner) = "Growth";
15 | option(org.schemata.schema.domain) = "Core";
16 | option(org.schemata.schema.schema_type) = ENTITY;
17 | 
18 | int32 id = 1
19 | [(org.schemata.schema.field_core).description = "unique identifier for a given department", (org.schemata.schema.is_primary_key) = true];
20 | 
21 | string name = 2
22 | [(org.schemata.schema.field_core).description = "department name"];
23 | }
24 | 
25 | message Person {
26 | option(org.schemata.schema.message_core).description = "This is the description of the users table";
27 | option(org.schemata.schema.message_core).comment = "The comment added after thought";
28 | option(org.schemata.schema.owner) = "Growth";
29 | option(org.schemata.schema.domain) = "Core";
30 | 
option(org.schemata.schema.schema_type) = ENTITY; 31 | 32 | string name = 1 33 | [(org.schemata.schema.field_core).description = "person name"]; 34 | 35 | int32 id = 2 36 | [(org.schemata.schema.field_core).description = "unique identifier for a given person", (org.schemata.schema.is_primary_key) = true]; 37 | 38 | string email = 3 39 | [(org.schemata.schema.field_core).description = "official email address", (org.schemata.schema.is_classified) = true, (org.schemata.schema.classification_level) = "HIGH", (org.schemata.schema.product_type) = "email"]; 40 | 41 | Department dept = 4 42 | [(org.schemata.schema.field_core).description = "department name of the person"] ; 43 | } 44 | -------------------------------------------------------------------------------- /schemata/quickstart/schema/product.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package org.examples; 4 | 5 | import "schemata/protobuf/schemata.proto"; 6 | import "category.proto"; 7 | import "brand.proto"; 8 | 9 | option java_package = "org.protocol.schema"; 10 | option java_outer_classname = "ProductBuilder"; 11 | 12 | 13 | message Product { 14 | 15 | option(org.schemata.schema.message_core).description = "This is the description of the Product table"; 16 | option(org.schemata.schema.message_core).comment = "The comment added after thought"; 17 | option(org.schemata.schema.message_core).see_also = "db.product MySQL table"; 18 | option(org.schemata.schema.owner) = "Platform"; 19 | option(org.schemata.schema.domain) = "Core"; 20 | option(org.schemata.schema.schema_type) = ENTITY; 21 | option(org.schemata.schema.team_channel) = "#team-platform"; 22 | option(org.schemata.schema.alert_channel) = "#alerts-platform"; 23 | 24 | int32 id = 1 25 | [(org.schemata.schema.field_core).description = "Unique identifier for Product", (org.schemata.schema.is_primary_key) = true]; 26 | 27 | string name = 2 28 | [(org.schemata.schema.field_core).description = "Name of the Product"]; 29 | 30 | Category category = 3 31 | [(org.schemata.schema.field_core).description = "Category name of the product"]; 32 | 33 | Brand brand = 4 34 | [(org.schemata.schema.field_core).description = "Brand name of the product"]; 35 | 36 | bool is_active = 5 37 | [(org.schemata.schema.field_core).description = "define the active status of the Product. 
`true` == active; `false` == inactive", (org.schemata.schema.field_core).comment = "should refactor to non-binary status"];
38 | 
39 | }
40 | 
41 | message ProductEvent {
42 | option(org.schemata.schema.message_core).description = "This is the description of the Product activity table";
43 | option(org.schemata.schema.owner) = "Platform";
44 | option(org.schemata.schema.domain) = "Core";
45 | option(org.schemata.schema.schema_type) = EVENT;
46 | option(org.schemata.schema.event_type) = LIFECYCLE;
47 | option(org.schemata.schema.team_channel) = "#team-platform";
48 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
49 | 
50 | Product previous_product_state = 1
51 | [(org.schemata.schema.field_core).description = "Previous version of the Product entity before the mutation"];
52 | 
53 | Product current_product_state = 2
54 | [(org.schemata.schema.field_core).description = "Current version of the Product entity after the mutation"];
55 | 
56 | org.schemata.schema.ActivityType activity_type = 3
57 | [(org.schemata.schema.field_core).description = "Lifecycle event type for the Product table"];
58 | }
59 | 
60 | 
-------------------------------------------------------------------------------- /schemata/quickstart/schema/user.proto: --------------------------------------------------------------------------------
1 | syntax = "proto3";
2 | 
3 | package org.examples;
4 | 
5 | import "schemata/protobuf/schemata.proto";
6 | import "product.proto";
7 | import "google/protobuf/timestamp.proto";
8 | 
9 | 
10 | option java_package = "org.protocol.schema";
11 | option java_outer_classname = "UserBuilder";
12 | 
13 | message User {
14 | 
15 | option(org.schemata.schema.message_core).description = "This is the description of the users table";
16 | option(org.schemata.schema.message_core).comment = "The comment added after thought";
17 | option(org.schemata.schema.message_core).see_also = "db.user MySQL table";
18 | option(org.schemata.schema.owner) = "Platform";
19 | option(org.schemata.schema.domain) = "Core";
20 | option(org.schemata.schema.schema_type) = ENTITY;
21 | option(org.schemata.schema.team_channel) = "#team-platform";
22 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
23 | 
24 | int32 id = 1
25 | [(org.schemata.schema.field_core).description = "Unique identifier for User", (org.schemata.schema.is_primary_key) = true];
26 | 
27 | string name = 2
28 | [(org.schemata.schema.field_core).description = "Name of the user"];
29 | 
30 | string email = 3
31 | [(org.schemata.schema.field_core).description = "email id for the user", (org.schemata.schema.product_type) = "username", (org.schemata.schema.is_classified) = true, (org.schemata.schema.classification_level) = "LEVEL1"];
32 | 
33 | bool is_active = 4
34 | [(org.schemata.schema.field_core).description = "define the active status of the user. `true` == active; `false` == inactive", (org.schemata.schema.field_core).comment = "should refactor to non-binary status"];
35 | 
36 | string timezone = 5
37 | [(org.schemata.schema.field_core).description = "preferred time zone for the user"];
38 | }
39 | 
40 | message UserEvent {
41 | option(org.schemata.schema.message_core).description = "This is the description of the users table";
42 | option(org.schemata.schema.owner) = "Platform";
43 | option(org.schemata.schema.domain) = "Core";
44 | option(org.schemata.schema.schema_type) = EVENT;
45 | option(org.schemata.schema.event_type) = LIFECYCLE;
46 | option(org.schemata.schema.team_channel) = "#team-platform";
47 | option(org.schemata.schema.alert_channel) = "#alerts-platform";
48 | 
49 | User previous_user_state = 1
50 | [(org.schemata.schema.field_core).description = "Previous version of the user entity before the mutation"];
51 | 
52 | User current_user_state = 2
53 | [(org.schemata.schema.field_core).description = "Current version of the user entity after the mutation"];
54 | 
55 | org.schemata.schema.ActivityType activity_type = 3
56 | [(org.schemata.schema.field_core).description = "Lifecycle event type for the Users table"];
57 | 
58 | google.protobuf.Timestamp timestamp = 4 [(org.schemata.schema.field_core).description = "Timestamp of the activity"];
59 | }
60 | 
61 | enum UserActivityType {
62 | VIEW = 0;
63 | READ_REVIEW = 1;
64 | VIEW_DESCRIPTION = 2;
65 | }
66 | 
67 | message UserActivityEvent {
68 | option(org.schemata.schema.message_core).description = "This is the description of the user activity table";
69 | option(org.schemata.schema.owner) = "Product";
70 | option(org.schemata.schema.domain) = "Growth";
71 | option(org.schemata.schema.schema_type) = EVENT;
72 | option(org.schemata.schema.event_type) = ACTIVITY;
73 | option(org.schemata.schema.team_channel) = "#team-growth";
74 | option(org.schemata.schema.alert_channel) = "#alerts-growth";
75 | User user = 1 [(org.schemata.schema.field_core).description = "User entity reference"];
76 | Product product = 2 [(org.schemata.schema.field_core).description = "Product entity reference"];
77 | UserActivityType activity_type = 3 [(org.schemata.schema.field_core).description = "Type of the user activity"];
78 | google.protobuf.Timestamp timestamp = 4 [(org.schemata.schema.field_core).description = "Timestamp of the activity"];
79 | }
80 | 
81 | message UserActivityAggregate {
82 | 
83 | option(org.schemata.schema.message_core).description = "This is the aggregated user activity view count. The event is aggregated by user & product";
84 | option(org.schemata.schema.owner) = "Product";
85 | option(org.schemata.schema.domain) = "Growth";
86 | option(org.schemata.schema.schema_type) = EVENT;
87 | option(org.schemata.schema.event_type) = AGGREGATED;
88 | option(org.schemata.schema.team_channel) = "#team-growth";
89 | option(org.schemata.schema.alert_channel) = "#alerts-growth";
90 | 
91 | User user = 1 [(org.schemata.schema.field_core).description = "User entity reference"];
92 | Product product = 2 [(org.schemata.schema.field_core).description = "Product entity reference"];
93 | int64 count = 3 [(org.schemata.schema.field_core).description = "Aggregated count of the user activity per product", (org.schemata.schema.product_type) = "activity_count"];
94 | int32 windowTime = 4 [(org.schemata.schema.field_core).description = "Max window time for the aggregation"];
95 | org.schemata.schema.TimeUnit window_time_unit = 5 [(org.schemata.schema.field_core).description = "TimeUnit of window for the aggregation"];
96 | google.protobuf.Timestamp timestamp = 6 [(org.schemata.schema.field_core).description = "Timestamp of the activity"];
97 | 
98 | }
99 | 
100 | 
-------------------------------------------------------------------------------- /schemata/quickstart/score.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | java -jar target/schemata-1.0.jar score -s=model.desc -p=PROTOBUF $1
4 | 
5 | 
-------------------------------------------------------------------------------- /schemata/quickstart/validate.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | java -jar target/schemata-1.0.jar validate -s=model.desc -p=PROTOBUF
4 | 
5 | 
-------------------------------------------------------------------------------- /smithy/quickstart/.gitignore: --------------------------------------------------------------------------------
1 | # Gradle builds
2 | /.gradle/
3 | /build/
4 | 
5 | # Temp
6 | /tmp/
7 | 
8 | 
-------------------------------------------------------------------------------- /smithy/quickstart/README.md: --------------------------------------------------------------------------------
1 | Smithy - Quickstart guide
2 | =========================
3 | 
4 | # References
5 | * [GitHub - Data Engineering helpers - Data contracts (this project)](https://github.com/data-engineering-helpers/data-contracts)
6 | * [Smithy - Quickstart guide](https://smithy.io/2.0/quickstart.html)
7 | * [Smithy - Gradle plugin](https://smithy.io/2.0/guides/building-models/gradle-plugin.html)
8 | 
9 | # Quick start
10 | * If needed, install Java, Maven and Gradle. One of the easiest and most flexible
11 | ways is to use [SDKMan](https://sdkman.io/), which allows installing any version of any
12 | JVM-based tool in parallel without interfering with the rest of the operating system.
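  For instance (a minimal sketch; the Java version below is only an illustrative
  assumption, any recent JDK works):
```bash
$ curl -s "https://get.sdkman.io" | bash
$ sdk install java 17.0.7-tem
$ sdk install maven
$ sdk install gradle
```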
13 | 
14 | * Build the Smithy model:
15 | ```bash
16 | $ gradle build
17 | ```
18 | 
19 | 
-------------------------------------------------------------------------------- /smithy/quickstart/build.gradle.kts: --------------------------------------------------------------------------------
1 | // See https://smithy.io/2.0/guides/building-models/gradle-plugin.html
2 | 
3 | plugins {
4 | id("software.amazon.smithy").version("0.6.0")
5 | }
6 | 
7 | repositories {
8 | mavenLocal()
9 | mavenCentral()
10 | }
11 | 
12 | dependencies {
13 | implementation("software.amazon.smithy:smithy-model:1.29.0")
14 | implementation("software.amazon.smithy:smithy-aws-traits:1.29.0")
15 | }
16 | 
17 | configure<software.amazon.smithy.gradle.SmithyExtension> {
18 | // Uncomment this to use a custom projection when building the JAR.
19 | // projection = "foo"
20 | }
21 | 
22 | // Uncomment to disable creating a JAR.
23 | //tasks["jar"].enabled = false
24 | 
-------------------------------------------------------------------------------- /smithy/quickstart/model/weather.smithy: --------------------------------------------------------------------------------
1 | /// File: https://github.com/data-engineering-helpers/data-contracts/blob/main/smithy/quickstart/model/weather.smithy
2 | 
3 | $version: "2"
4 | namespace example.weather
5 | 
6 | use aws.protocols#restJson1
7 | 
8 | /// Provides weather forecasts.
9 | @paginated(
10 | inputToken: "nextToken"
11 | outputToken: "nextToken"
12 | pageSize: "pageSize"
13 | )
14 | 
15 | @restJson1
16 | service Weather {
17 | version: "2006-03-01"
18 | resources: [City]
19 | operations: [GetCurrentTime]
20 | }
21 | 
22 | resource City {
23 | identifiers: { cityId: CityId }
24 | read: GetCity
25 | list: ListCities
26 | resources: [Forecast]
27 | }
28 | 
29 | resource Forecast {
30 | identifiers: { cityId: CityId }
31 | read: GetForecast
32 | }
33 | 
34 | // "pattern" is a trait.
35 | @pattern("^[A-Za-z0-9 ]+$")
36 | string CityId
37 | 
38 | @readonly
39 | @http(code: 200, method: "GET", uri: "/cities/{cityId}")
40 | operation GetCity {
41 | input: GetCityInput
42 | output: GetCityOutput
43 | errors: [NoSuchResource]
44 | }
45 | 
46 | @input
47 | structure GetCityInput {
48 | // "cityId" provides the identifier for the resource and
49 | // has to be marked as required.
50 | @required
51 | @httpLabel
52 | cityId: CityId
53 | }
54 | 
55 | @output
56 | structure GetCityOutput {
57 | // "required" is used on output to indicate if the service
58 | // will always provide a value for the member.
59 | @required
60 | name: String
61 | 
62 | @required
63 | coordinates: CityCoordinates
64 | }
65 | 
66 | // This structure is nested within GetCityOutput.
67 | structure CityCoordinates {
68 | @required
69 | latitude: Float
70 | 
71 | @required
72 | longitude: Float
73 | }
74 | 
75 | // "error" is a trait that is used to specialize
76 | // a structure as an error.
77 | @error("client")
78 | structure NoSuchResource {
79 | @required
80 | resourceType: String
81 | }
82 | 
83 | // The paginated trait indicates that the operation may
84 | // return truncated results.
85 | @readonly 86 | @paginated(items: "items") 87 | @http(code: 200, method: "GET", uri: "/cities") 88 | operation ListCities { 89 | input: ListCitiesInput 90 | output: ListCitiesOutput 91 | } 92 | 93 | @input 94 | structure ListCitiesInput { 95 | @httpQuery("nextToken") 96 | nextToken: String 97 | @httpQuery("pageSize") 98 | pageSize: Integer 99 | } 100 | 101 | @output 102 | structure ListCitiesOutput { 103 | nextToken: String 104 | 105 | @required 106 | items: CitySummaries 107 | } 108 | 109 | // CitySummaries is a list of CitySummary structures. 110 | list CitySummaries { 111 | member: CitySummary 112 | } 113 | 114 | // CitySummary contains a reference to a City. 115 | @references([{resource: City}]) 116 | structure CitySummary { 117 | @required 118 | cityId: CityId 119 | 120 | @required 121 | name: String 122 | } 123 | 124 | @readonly 125 | @http(code: 200, method: "GET", uri: "/currentTime") 126 | operation GetCurrentTime { 127 | input: GetCurrentTimeInput 128 | output: GetCurrentTimeOutput 129 | } 130 | 131 | @input 132 | structure GetCurrentTimeInput {} 133 | 134 | @output 135 | structure GetCurrentTimeOutput { 136 | @required 137 | time: Timestamp 138 | } 139 | 140 | @readonly 141 | @http(code: 200, method: "GET", uri: "/forecast/{cityId}") 142 | operation GetForecast { 143 | input: GetForecastInput 144 | output: GetForecastOutput 145 | } 146 | 147 | // "cityId" provides the only identifier for the resource since 148 | // a Forecast doesn't have its own. 149 | @input 150 | structure GetForecastInput { 151 | @required 152 | @httpLabel 153 | cityId: CityId 154 | } 155 | 156 | @output 157 | structure GetForecastOutput { 158 | chanceOfRain: Float 159 | } 160 | 161 | -------------------------------------------------------------------------------- /smithy/quickstart/smithy-build.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.0" 3 | } 4 | 5 | --------------------------------------------------------------------------------