├── LICENSE ├── README.md ├── dbt └── f1-analytics │ ├── dbt_project.yml │ └── models │ └── drivers │ ├── championship_winners.sql │ └── schema.yml ├── iac ├── gce.tf ├── init_scripts │ ├── airbyte.sh │ ├── open_metadata.sh │ └── superset.sh ├── main.tf ├── service_account.tf └── vars.tf └── platform_architecture.png /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Mahdi Karabiben 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sample Modern Data Platform 2 | 3 | This repository contains the different IaC scripts to deploy a sample Modern Data Platform, accompanied with a sample dbt model. 
4 | 5 | Please refer to [the main article](https://towardsdatascience.com/building-an-end-to-end-open-source-modern-data-platform-c906be2f31bd) that goes through the details of the platform and the reasoning behind the component choices. 6 | 7 | The architecture contains the following components: 8 | 9 | * BigQuery 10 | * Airbyte 11 | * dbt 12 | * Apache Superset 13 | * OpenMetadata 14 | 15 | ## Architecture diagram 16 | 17 | ![The platform's architecture](platform_architecture.png) 18 | 19 | -------------------------------------------------------------------------------- /dbt/f1-analytics/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'f1_analytics' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'f1_analytics' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | target-path: "target" # directory which will store compiled SQL files 23 | clean-targets: # directories to be removed by `dbt clean` 24 | - "target" 25 | - "dbt_packages" 26 | 27 | 28 | # Configuring models 29 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 30 | 31 | # In this config, we tell dbt to build all models in the drivers/ directory 32 | # as views.
These settings can be overridden in the individual model files
33 | # using the `{{ config(...) }}` macro.
34 | models:
35 |   f1_analytics:
36 |     # Config indicated by + and applies to all files under models/drivers/
37 |     drivers:
38 |       +materialized: view
39 |
--------------------------------------------------------------------------------
/dbt/f1-analytics/models/drivers/championship_winners.sql:
--------------------------------------------------------------------------------
1 | -- Season champions: for every year, the driver(s) with the highest points total,
2 | -- together with the driver's full name.
3 | --
4 | -- NOTE(review): the three tables are referenced by hard-coded name; declaring them
5 | -- as dbt sources would let dbt track lineage -- confirm before changing.
6 | WITH season_results AS (
7 |     -- Points scored by each driver in each season.
8 |     -- Columns are qualified explicitly; this assumes `year` lives on races and
9 |     -- `points` on results -- TODO confirm against the table schemas.
10 |     SELECT
11 |         res.driverId,
12 |         rc.year,
13 |         SUM(res.points) AS points_total
14 |     FROM
15 |         `project.f1.results` AS res
16 |     INNER JOIN `project.f1.races` AS rc ON res.raceId = rc.raceId
17 |     GROUP BY
18 |         res.driverId,
19 |         rc.year
20 | ), season_ranking AS (
21 |     -- Rank drivers within a season by points. RANK() gives every driver tied on the
22 |     -- top score a ranking of 1, so a tie produces several rows for that year (which
23 |     -- the `unique` test on `year` in schema.yml will then report).
24 |     SELECT
25 |         driverId,
26 |         year,
27 |         points_total,
28 |         RANK() OVER (PARTITION BY year ORDER BY points_total DESC) AS ranking
29 |     FROM
30 |         season_results
31 | )
32 | SELECT
33 |     -- Explicit column list: same columns, in the same order, as the former `s.*`,
34 |     -- so the view's shape no longer changes silently if season_ranking changes.
35 |     s.driverId,
36 |     s.year,
37 |     s.points_total,
38 |     s.ranking,
39 |     CONCAT(d.forename, ' ', d.surname) AS full_name
40 | FROM
41 |     season_ranking AS s
42 | INNER JOIN `project.f1.drivers` AS d ON d.driverId = s.driverId
43 | WHERE
44 |     s.ranking = 1
--------------------------------------------------------------------------------
/dbt/f1-analytics/models/drivers/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | models:
4 |   - name: championship_winners
5 |     description: "Model containing championship winner for each year"
6 |     columns:
7 |       - name: driverId
8 |         description: "Driver ID"
9 |         tests:
10 |           - not_null
11 |       - name: year
12 |         description: "Championship year"
13 |         tests:
14 |           - not_null
15 |           - unique
16 |       - name: points_total
17 |         description: "Points total for the year"
18 |         tests:
19 |           - not_null
20 |       - name: full_name
21 |         description: "Driver's full name"
22 |         tests:
23 |           - not_null
24 |
--------------------------------------------------------------------------------
/iac/gce.tf:
--------------------------------------------------------------------------------
1 | resource "google_compute_instance" "airbyte-instance" { # VM for Airbyte; bootstrapped at boot by init_scripts/airbyte.sh
2 |   name = "airbyte-instance"
3 |   machine_type = "e2-medium"
4 |
5 |   tags = ["airbyte"]
6 |
7 |   boot_disk {
8 |     initialize_params {
9 |       image = "debian-cloud/debian-10" # init_scripts/airbyte.sh adds the Docker apt repository for "buster"; keep the two in sync
10 |       size = 40
11 |     }
12 |   }
13 |
14 |   network_interface {
15 |     network = "default"
16 |     access_config { # NOTE(review): presumably attaches an external IP to the VM - confirm this exposure is intended
17 |       network_tier = "PREMIUM"
18 |     }
19 |   }
20 |
21 |   metadata_startup_script = file("${path.module}/init_scripts/airbyte.sh")
22 |
23 |   service_account {
24 |     # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
25 |     email = google_service_account.airbyte-service-account.email
26 |     scopes = ["cloud-platform"]
27 |   }
28 | }
29 |
30 | resource "google_compute_instance" "superset-instance" { # VM for Superset; bootstrapped at boot by init_scripts/superset.sh
31 |   name = "superset-instance"
32 |   machine_type = "e2-medium"
33 |
34 |   tags = ["superset"]
35 |
36 |   boot_disk {
37 |     initialize_params {
38 |       image = "debian-cloud/debian-10" # init_scripts/superset.sh adds the Docker apt repository for "buster"; keep the two in sync
39 |       size = 10
40 |     }
41 |   }
42 |
43 |   network_interface {
44 |     network = "default"
45 |     access_config { # NOTE(review): presumably attaches an external IP to the VM - confirm this exposure is intended
46 |       network_tier = "PREMIUM"
47 |     }
48 |   }
49 |
50 |   metadata_startup_script = file("${path.module}/init_scripts/superset.sh")
51 |
52 |   service_account {
53 |     # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
54 |     email = google_service_account.superset-service-account.email
55 |     scopes = ["cloud-platform"]
56 |   }
57 | }
58 |
59 | resource "google_compute_instance" "openmetadata-instance" { # VM for OpenMetadata; bootstrapped at boot by init_scripts/open_metadata.sh
60 |   name = "openmetadata-instance"
61 |   machine_type = "e2-standard-4" # larger machine type than the e2-medium used by the other two instances
62 |
63 |   tags = ["openmetadata"]
64 |
65 |   boot_disk {
66 |     initialize_params {
67 |       image = "debian-cloud/debian-11" # differs from the debian-10 image used by the other two instances
68 |       size = 40
69 |     }
70 |   }
71 |
72 |   network_interface {
73 |     network = "default"
74 |     access_config { # NOTE(review): presumably attaches an external IP to the VM - confirm this exposure is intended
75 |       network_tier = "PREMIUM"
76 |     }
77 |   }
78 |
79 |   metadata_startup_script = file("${path.module}/init_scripts/open_metadata.sh")
80 |
81 |   service_account {
82 |     # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles.
83 |     email = google_service_account.openmetadata-service-account.email
84 |     scopes = ["cloud-platform"]
85 |   }
86 | }
--------------------------------------------------------------------------------
/iac/init_scripts/airbyte.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Startup script referenced by the airbyte-instance resource in iac/gce.tf.
3 | # Setup follows the steps provided by Airbyte
4 | # Documentation link: https://docs.airbyte.io/deploying-airbyte/on-gcp-compute-engine
5 |
6 | # First we install Docker
7 | sudo apt-get update
8 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg2 software-properties-common
9 | curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add --
10 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian buster stable" # "buster" is hard-coded; the airbyte instance in gce.tf boots debian-10
11 | sudo apt-get update
12 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io
13 | sudo usermod -a -G docker $USER # NOTE(review): relies on $USER being set in the startup-script environment - confirm
14 |
15 | # Then we install Docker Compose
16 | sudo apt-get -y install wget
17 | sudo wget https://github.com/docker/compose/releases/download/1.26.2/docker-compose-$(uname -s)-$(uname -m) -O /usr/local/bin/docker-compose # Compose release is pinned to 1.26.2
18 | sudo chmod +x /usr/local/bin/docker-compose
19 | docker-compose
--version
20 |
21 | # Lastly we install and start Airbyte
22 | mkdir airbyte && cd airbyte
23 | wget https://raw.githubusercontent.com/airbytehq/airbyte/master/{.env,docker-compose.yaml}
24 | sudo docker-compose up -d
--------------------------------------------------------------------------------
/iac/init_scripts/open_metadata.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Startup script for the OpenMetadata VM: installs Docker, Docker Compose and pip,
4 | # then installs the OpenMetadata ingestion package and starts OpenMetadata.
5 | # Setup follows the steps provided by OpenMetadata
6 | # Documentation link: https://docs.open-metadata.org/install/run-openmetadata
7 |
8 | # First we install Docker
9 | sudo apt-get update
10 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg2 software-properties-common
11 | curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add --
12 | # Use the codename of the release that is actually running instead of a hard-coded
13 | # "buster": the openmetadata instance in iac/gce.tf boots debian-11, so a fixed
14 | # Debian 10 repository would not match the installed OS.
15 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(. /etc/os-release && echo "$VERSION_CODENAME") stable"
16 | sudo apt-get update
17 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io
18 | sudo usermod -a -G docker $USER
19 |
20 | # Then we install Docker Compose
21 | sudo apt-get -y install wget
22 | sudo wget https://github.com/docker/compose/releases/download/v2.2.2/docker-compose-$(uname -s)-$(uname -m) -O /usr/local/bin/docker-compose
23 | sudo chmod +x /usr/local/bin/docker-compose
24 | docker-compose --version
25 |
26 | # We install pip (apt-get, like every other install step in this script)
27 | sudo apt-get install -y python3-pip
28 |
29 | # We install and start OpenMetadata
30 | mkdir openmetadata-docker && cd openmetadata-docker
31 | pip3 install --upgrade pip setuptools
32 | pip3 install --upgrade 'openmetadata-ingestion[docker]'
33 | python3 -m metadata docker --start
34 |
--------------------------------------------------------------------------------
/iac/init_scripts/superset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # First we install Docker
4 | sudo apt-get update
5 | sudo apt-get install -y apt-transport-https ca-certificates
curl gnupg2 software-properties-common 6 | curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add -- 7 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian buster stable" 8 | sudo apt-get update 9 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io 10 | sudo usermod -a -G docker $USER 11 | 12 | # Then we install Docker Compose 13 | sudo apt-get install -y wget git 14 | sudo wget https://github.com/docker/compose/releases/download/1.26.2/docker-compose-$(uname -s)-$(uname -m) -O /usr/local/bin/docker-compose 15 | sudo chmod +x /usr/local/bin/docker-compose 16 | docker-compose --version 17 | 18 | # Lastly we install and start Superset 19 | sudo git clone https://github.com/apache/superset.git 20 | cd superset 21 | touch ./docker/requirements-local.txt 22 | echo "pybigquery" >> ./docker/requirements-local.txt 23 | sudo docker-compose -f docker-compose-non-dev.yml up -------------------------------------------------------------------------------- /iac/main.tf: -------------------------------------------------------------------------------- 1 | provider "google" { 2 | project = "YOUR_PROJECT" 3 | region = var.region 4 | zone = var.zone 5 | } -------------------------------------------------------------------------------- /iac/service_account.tf: -------------------------------------------------------------------------------- 1 | resource "google_service_account" "airbyte-service-account" { 2 | account_id = "airbyte-service-account" 3 | display_name = "Airbyte Service Account" 4 | } 5 | 6 | resource "google_service_account" "superset-service-account" { 7 | account_id = "superset-service-account" 8 | display_name = "superset Service Account" 9 | } 10 | 11 | resource "google_service_account" "openmetadata-service-account" { 12 | account_id = "openmetadata-service-account" 13 | display_name = "OpenMetadata Service Account" 14 | } -------------------------------------------------------------------------------- 
/iac/vars.tf:
--------------------------------------------------------------------------------
1 | variable "region" { # Region handed to the google provider in main.tf (region = var.region)
2 |   type = string
3 |   default = "europe-west6"
4 | }
5 |
6 | variable "zone" { # Zone handed to the google provider in main.tf (zone = var.zone)
7 |   type = string
8 |   default = "europe-west6-a"
9 | }
--------------------------------------------------------------------------------
/platform_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahdiqb/modern_data_platform/92c9f67242db9b4d9783b3347ef40663fd44dd79/platform_architecture.png
--------------------------------------------------------------------------------