├── .github
│   ├── CODEOWNERS
│   └── workflows
│       ├── cd.yml
│       └── ci.yml
├── .gitignore
├── .tool-versions
├── LICENSE
├── Makefile
├── README.md
├── assets
│   └── images
│       ├── cs1.png
│       ├── cs2.png
│       ├── cs3.png
│       ├── dag.png
│       ├── dash.png
│       ├── data_infra.png
│       ├── det.png
│       ├── det2.png
│       ├── fs.png
│       ├── infra.png
│       ├── proj_1.png
│       ├── proj_2.png
│       ├── secret.png
│       ├── template.png
│       └── tn.png
├── containers
│   └── airflow
│       ├── Dockerfile
│       ├── quarto.sh
│       └── requirements.txt
├── dags
│   └── coincap_elt.py
├── data
│   ├── .gitkeep
│   └── coincap_exchanges.csv
├── docker-compose.yml
├── terraform
│   ├── .terraform.lock.hcl
│   ├── main.tf
│   ├── output.tf
│   └── variable.tf
├── tests
│   └── dags
│       └── test_dag_validity.py
└── visualization
    ├── dashboard.html
    └── dashboard.qmd
/.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is a comment. 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # These owners will be the default owners for everything in 5 | # the repo. Unless a later match takes precedence, 6 | # @josephmachado will be requested for 7 | # review when someone opens a pull request. 8 | 9 | # REPLACE WITH YOUR GITHUB USER ID !!!! 10 | * @josephmachado -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | # Note: Uncomment to start CD 2 | # name: CD 3 | # on: 4 | # push: 5 | # branches: 6 | # - main 7 | # jobs: 8 | # deploy-to-ec2: 9 | # runs-on: ubuntu-latest 10 | # steps: 11 | # - name: Deploy to server 12 | # uses: easingthemes/ssh-deploy@main 13 | # env: 14 | # SSH_PRIVATE_KEY: ${{ secrets.SERVER_SSH_KEY }} 15 | # REMOTE_HOST: ${{ secrets.REMOTE_HOST }} 16 | # REMOTE_USER: ${{ secrets.REMOTE_USER }} 17 | # SOURCE: "./" 18 | # TARGET: "/home/ubuntu/data_engineering_project_template" 19 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Note: Uncomment to start CI 2 | # name: CI 3 | # on: [pull_request] 4 | # jobs: 5 | # run-ci-tests: 6 | # runs-on: ubuntu-latest 7 | # steps: 8 | # - name: checkout repo 9 | # uses: actions/checkout@v2 10 | # - name: Spin up containers 11 | # run: make up 12 | # - name: Run CI test 13 | # run: make ci 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Vim ### 2 | [._]*.s[a-w][a-z] 3 | [._]s[a-w][a-z] 4 | *.un~ 5 | Session.vim 6 | .netrwhist 7 | *~ 8 | 9 | ### SublimeText ### 10 | # cache files for sublime text 11 | *.tmlanguage.cache 12 | *.tmPreferences.cache 13 | *.stTheme.cache 14 | 15 | # workspace files are user-specific 16 | *.sublime-workspace 17 | 18 | # project files should be checked into the repository, unless a significant 19 | # proportion of contributors will probably not be using SublimeText 20 | # *.sublime-project 21 | 22 | # sftp configuration file 23 | sftp-config.json 24 | 25 | # Python 26 | __pycache__ 27 | 28 | # policy 29 | trust-policy.json 30 | 31 | # logs 32 | logs/* 33 | *.log 34 | 35 | temp/* 36 | 37 | __MACOSX 38 | 39 | .dockerignore 40 | 41 | *.pem 42 | 43 | ###### TERRAFORM IGNORE 44 | # Local .terraform directories 45 | **/.terraform/* 46 | 47 | # .tfstate files 48 | *.tfstate 49 | *.tfstate.* 50 | 51 | # Crash log files 52 | crash.log 53 |
crash.*.log 54 | 55 | # Exclude all .tfvars files, which are likely to contain sensitive data, such as 56 | # passwords, private keys, and other secrets. These should not be part of version 57 | # control as they are data points which are potentially sensitive and subject 58 | # to change depending on the environment. 59 | *.tfvars 60 | *.tfvars.json 61 | 62 | # Ignore override files as they are usually used to override resources locally and so 63 | # are not checked in 64 | override.tf 65 | override.tf.json 66 | *_override.tf 67 | *_override.tf.json 68 | 69 | # Include override files you do wish to add to version control using negated pattern 70 | # !example_override.tf 71 | 72 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 73 | # example: *tfplan* 74 | 75 | # Ignore CLI configuration files 76 | .terraformrc 77 | terraform.rc 78 | 79 | 80 | dashboard_files/ 81 | 82 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | python 3.11.1 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Start Data Engineering 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #################################################################################################################### 2 | # Setup containers to run Airflow 3 | 4 | docker-spin-up: 5 | docker compose up airflow-init && docker compose up --build -d 6 | 7 | perms: 8 | sudo mkdir -p logs plugins temp dags tests migrations data visualization && sudo chmod -R u=rwx,g=rwx,o=rwx logs plugins temp dags tests migrations data visualization 9 | 10 | up: perms docker-spin-up 11 | 12 | down: 13 | docker compose down --volumes --rmi all 14 | 15 | restart: down up 16 | 17 | sh: 18 | docker exec -ti webserver bash 19 | 20 | #################################################################################################################### 21 | # Testing, auto formatting, type checks, & Lint checks 22 | 23 | pytest: 24 | docker exec webserver pytest -p no:warnings -v /opt/airflow/tests 25 | 26 | format: 27 | docker exec webserver python -m black -S --line-length 79 . 28 | 29 | isort: 30 | docker exec webserver isort . 31 | 32 | type: 33 | docker exec webserver mypy --ignore-missing-imports /opt/airflow 34 | 35 | lint: 36 | docker exec webserver flake8 /opt/airflow/dags 37 | 38 | ci: isort format type lint pytest 39 | 40 | #################################################################################################################### 41 | # Set up cloud infrastructure 42 | 43 | tf-init: 44 | terraform -chdir=./terraform init 45 | 46 | infra-up: 47 | terraform -chdir=./terraform apply 48 | 49 | infra-down: 50 | terraform -chdir=./terraform destroy 51 | 52 | infra-config: 53 | terraform -chdir=./terraform output 54 | 55 | #################################################################################################################### 56 | # Port forwarding to local machine 57 | 58 | cloud-metabase: 59 | terraform -chdir=./terraform output -raw private_key > private_key.pem && chmod 600 private_key.pem && ssh -o "IdentitiesOnly yes" -i private_key.pem ubuntu@$$(terraform -chdir=./terraform output -raw ec2_public_dns) -N -f -L 3001:$$(terraform -chdir=./terraform output -raw ec2_public_dns):3000 && open http://localhost:3001 && rm private_key.pem 60 | 61 | cloud-airflow: 62 | terraform -chdir=./terraform output -raw private_key > private_key.pem && chmod 600 private_key.pem && ssh -o "IdentitiesOnly yes" -i private_key.pem ubuntu@$$(terraform -chdir=./terraform output -raw ec2_public_dns) -N -f -L 8081:$$(terraform -chdir=./terraform output -raw ec2_public_dns):8080 && open http://localhost:8081 && rm private_key.pem 63 | 64 | #################################################################################################################### 65 | # Helpers 66 | 67 | ssh-ec2: 68 | terraform -chdir=./terraform output -raw private_key > private_key.pem && chmod 600 private_key.pem && ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i private_key.pem ubuntu@$$(terraform -chdir=./terraform output -raw ec2_public_dns) && rm private_key.pem 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | * [Data engineering project template](#data-engineering-project-template) 3 | * [Run Data Pipeline](#run-data-pipeline) 4 | * [Run on codespaces](#run-on-codespaces) 5 | * [Run locally](#run-locally) 6 | * 
[Architecture and services in this template](#architecture-and-services-in-this-template) 7 | * [Using template](#using-template) 8 | * [Writing pipelines](#writing-pipelines) 9 | * [(Optional) Advanced cloud setup](#optional-advanced-cloud-setup) 10 | * [Prerequisites:](#prerequisites) 11 | * [Tear down infra](#tear-down-infra) 12 | 13 | 14 | # Data engineering project template 15 | 16 | A detailed explanation can be found **[`in this post`](https://www.startdataengineering.com/post/data-engineering-projects-with-free-template/)**. 17 | 18 | ## Run Data Pipeline 19 | 20 | Code is available in the **[data_engineering_project_template](https://github.com/josephmachado/data_engineering_project_template/tree/main?tab=readme-ov-file#data-engineering-project-template)** repository. 21 | 22 | ### Run on codespaces 23 | 24 | You can run this data pipeline using GitHub codespaces. Follow the instructions below. 25 | 26 | 1. Create codespaces by going to the **[data_engineering_project_template](https://github.com/josephmachado/data_engineering_project_template/tree/main?tab=readme-ov-file#data-engineering-project-template)** repository, cloning it (or clicking the `Use this template` button) and then clicking on the `Create codespaces on main` button. 27 | 2. Wait for codespaces to start, then in the terminal type `make up`. 28 | 3. Wait for `make up` to complete, and then wait for 30s (for Airflow to start). 29 | 4. After 30s, go to the `ports` tab and click on the link exposing port `8080` to access the Airflow UI (username and password are both `airflow`). 30 | 31 | ![codespaces start](./assets/images/cs1.png) 32 | ![codespaces make up](./assets/images/cs2.png) 33 | ![codespaces open url](./assets/images/cs3.png) 34 | 35 | ### Run locally 36 | 37 | To run locally, you need: 38 | 39 | 1. [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 40 | 2. [GitHub account](https://github.com/) 41 | 3. [Docker](https://docs.docker.com/engine/install/) with at least 4GB of RAM and [Docker Compose](https://docs.docker.com/compose/install/) v1.27.0 or later 42 | 43 | Clone the repo and run the following commands to start the data pipeline: 44 | 45 | ```bash 46 | git clone https://github.com/josephmachado/data_engineering_project_template.git 47 | cd data_engineering_project_template 48 | make up 49 | sleep 30 # wait for Airflow to start 50 | make ci # run checks and tests 51 | ``` 52 | Go to [http://localhost:8080](http://localhost:8080) to see the Airflow UI. Username and password are both `airflow`. 53 | 54 | ## Architecture and services in this template 55 | 56 | This data engineering project template includes the following: 57 | 58 | 1. **`Airflow`**: To schedule and orchestrate DAGs. 59 | 2. **`Postgres`**: To store Airflow's details (which you can see via the Airflow UI); it also has a schema to represent upstream databases. 60 | 3. **`DuckDB`**: To act as our warehouse. 61 | 4. **`Quarto with Plotly`**: To convert code in `markdown` format to HTML files that can be embedded in your app or served as is. 62 | 5. **`cuallee`**: To run data quality checks on the data we extracted from the CoinCap API. 63 | 6. **`minio`**: To provide an S3-compatible open source storage system. 64 | 65 | For simplicity, services 1-5 above are installed and run in a single container defined [here](./containers/airflow/Dockerfile).
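As a quick illustration of how the data quality service (`cuallee`) is applied to the extracted CSV, below is a minimal standalone sketch of the completeness check that `dags/coincap_elt.py` performs. The file path is illustrative, and the snippet assumes it is run inside the Airflow container where these dependencies are installed.

```python
import polars as pl
from cuallee import Check, CheckLevel

# Read the CSV produced by the fetch task (path is illustrative)
df = pl.read_csv("./data/coincap_exchanges.csv")

# ERROR-level check: every row must have a non-null "name"
check = Check(CheckLevel.ERROR, "Completeness")
results = check.is_complete("name").validate(df)

# cuallee reports PASS/FAIL per rule in the "status" column;
# the DAG branches to stop_pipeline when any rule is FAIL
statuses = results["status"].to_list()
print("FAIL" if "FAIL" in statuses else "PASS")
```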
66 | 67 | ![DET](./assets/images/det2.png) 68 | 69 | The `coincap_elt` DAG in the [Airflow UI](http://localhost:8080) will look like the image below: 70 | 71 | ![DAG](./assets/images/dag.png) 72 | 73 | You can see the rendered HTML at [./visualization/dashboard.html](https://github.com/josephmachado/data_engineering_project_template/blob/main/visualization/dashboard.html). 74 | 75 | The file structure of our repo is as shown below: 76 | 77 | ![File structure](./assets/images/fs.png) 78 | 79 | ## Using template 80 | 81 | To use this repo as a template and create your own repository, click on the `Use this template` button. 82 | 83 | ![Template](./assets/images/template.png) 84 | 85 | ## Writing pipelines 86 | 87 | We have a sample pipeline at [coincap_elt.py](./dags/coincap_elt.py) that you can use as a starter to create your own DAGs. The tests are available in the [./tests](./tests) folder. 88 | 89 | Once the `coincap_elt` DAG runs, we can see the dashboard HTML at [./visualization/dashboard.html](./visualization/dashboard.html), which will look like ![Dashboard](./assets/images/dash.png). 90 | 91 | ## (Optional) Advanced cloud setup 92 | 93 | If you want to run your code on an EC2 instance with Terraform, follow the steps below. 94 | 95 | ### Prerequisites: 96 | 97 | 1. [Terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) 98 | 2. [AWS account](https://aws.amazon.com/) 99 | 3. [AWS CLI installed](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) and [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) 100 | 101 | You can create your GitHub repository based on this template by clicking on the `Use this template` button in the **[data_engineering_project_template](https://github.com/josephmachado/data_engineering_project_template)** repository. Clone your repository and replace content in the following files: 102 | 103 | 1. **[CODEOWNERS](https://github.com/josephmachado/data_engineering_project_template/blob/main/.github/CODEOWNERS)**: In this file, change the user id from `@josephmachado` to your GitHub user id. 104 | 2. **[cd.yml](https://github.com/josephmachado/data_engineering_project_template/blob/main/.github/workflows/cd.yml)**: In this file, change the `data_engineering_project_template` part of the `TARGET` parameter to your repository name. 105 | 3. **[variable.tf](https://github.com/josephmachado/data_engineering_project_template/blob/main/terraform/variable.tf)**: In this file, change the default values of the `alert_email_id` and `repo_url` variables to your email and [GitHub repository URL](https://www.theserverside.com/blog/Coffee-Talk-Java-News-Stories-and-Opinions/GitHub-URL-find-use-example) respectively. 106 | 107 | Run the following commands in your project directory.
108 | 109 | ```shell 110 | # Create AWS services with Terraform 111 | make tf-init # Only needed on your first terraform run (or if you add new providers) 112 | make infra-up # type in yes after verifying the changes TF will make 113 | 114 | # Wait until the EC2 instance is initialized; you can check this via your AWS UI 115 | # See "Status Check" on the EC2 console; it should be "2/2 checks passed" before proceeding 116 | # Wait another 5 mins; Airflow takes a while to start up 117 | 118 | make cloud-airflow # this command forwards the Airflow port from EC2 to your machine and opens it in the browser 119 | # the username and password are both airflow 120 | 121 | make cloud-metabase # this command forwards the Metabase port from EC2 to your machine and opens it in the browser 122 | # use https://github.com/josephmachado/data_engineering_project_template/blob/main/env file to connect to the warehouse from metabase 123 | ``` 124 | 125 | For [continuous delivery](https://github.com/josephmachado/data_engineering_project_template/blob/main/.github/workflows/cd.yml) to work, set up the infrastructure with Terraform and define the following repository secrets. You can set up the repository secrets by going to `Settings > Secrets > Actions > New repository secret`. 126 | 127 | 1. **`SERVER_SSH_KEY`**: Get this by running `terraform -chdir=./terraform output -raw private_key` in the project directory and paste the entire content into a new Action secret called SERVER_SSH_KEY. 128 | 2. **`REMOTE_HOST`**: Get this by running `terraform -chdir=./terraform output -raw ec2_public_dns` in the project directory. 129 | 3. **`REMOTE_USER`**: The value for this is **ubuntu**. 130 | 131 | ### Tear down infra 132 | 133 | After you are done, make sure to destroy your cloud infrastructure.
134 | 135 | ```shell 136 | make down # Stop docker containers on your computer 137 | make infra-down # type in yes after verifying the changes TF will make 138 | ``` 139 | 140 | -------------------------------------------------------------------------------- /assets/images/cs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/cs1.png -------------------------------------------------------------------------------- /assets/images/cs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/cs2.png -------------------------------------------------------------------------------- /assets/images/cs3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/cs3.png -------------------------------------------------------------------------------- /assets/images/dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/dag.png -------------------------------------------------------------------------------- /assets/images/dash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/dash.png -------------------------------------------------------------------------------- /assets/images/data_infra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/data_infra.png -------------------------------------------------------------------------------- /assets/images/det.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/det.png -------------------------------------------------------------------------------- /assets/images/det2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/det2.png -------------------------------------------------------------------------------- /assets/images/fs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/fs.png -------------------------------------------------------------------------------- /assets/images/infra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/infra.png 
-------------------------------------------------------------------------------- /assets/images/proj_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/proj_1.png -------------------------------------------------------------------------------- /assets/images/proj_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/proj_2.png -------------------------------------------------------------------------------- /assets/images/secret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/secret.png -------------------------------------------------------------------------------- /assets/images/template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/template.png -------------------------------------------------------------------------------- /assets/images/tn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/assets/images/tn.png -------------------------------------------------------------------------------- /containers/airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.9.2 2 | COPY requirements.txt / 3 | RUN pip install --no-cache-dir -r /requirements.txt 4 | 5 | COPY quarto.sh / 6 | RUN cd / && bash /quarto.sh 7 | -------------------------------------------------------------------------------- /containers/airflow/quarto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -L -o ~/quarto-1.5.43-linux-amd64.tar.gz https://github.com/quarto-dev/quarto-cli/releases/download/v1.5.43/quarto-1.5.43-linux-amd64.tar.gz 4 | mkdir ~/opt 5 | tar -C ~/opt -xvzf ~/quarto-1.5.43-linux-amd64.tar.gz 6 | 7 | mkdir ~/.local/bin 8 | ln -s ~/opt/quarto-1.5.43/bin/quarto ~/.local/bin/quarto 9 | 10 | ( echo ""; echo 'export PATH=$PATH:~/.local/bin\n' ; echo "" ) >> ~/.profile 11 | source ~/.profile 12 | 13 | -------------------------------------------------------------------------------- /containers/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | black==24.4.2 2 | flake8==7.0.0 3 | mypy==1.10.0 4 | isort==5.13.2 5 | moto[all]==5.0.9 6 | pytest==8.2.2 7 | pytest-mock==3.14.0 8 | apache-airflow-client==2.9.0 9 | yoyo-migrations==8.2.0 10 | duckdb==1.0.0 11 | plotly==5.22.0 12 | jupyter==1.0.0 13 | types-requests==2.32.0.20240602 14 | cuallee==0.10.3 15 | polars==0.20.31 16 | -------------------------------------------------------------------------------- /dags/coincap_elt.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from datetime import datetime, timedelta 4 | 5 | import requests 6 | 7 | from airflow import DAG 8 | from 
airflow.decorators import task 9 | from airflow.operators.bash import BashOperator 10 | from cuallee import Check, CheckLevel 11 | import polars as pl 12 | from airflow.operators.dummy import DummyOperator 13 | 14 | with DAG( 15 | 'coincap_elt', 16 | description='A simple DAG to fetch data \ 17 | from CoinCap Exchanges API and write to a file', 18 | schedule_interval=timedelta(days=1), 19 | start_date=datetime(2023, 1, 1), 20 | catchup=False, 21 | ) as dag: 22 | 23 | url = "https://api.coincap.io/v2/exchanges" 24 | file_path = f'{os.getenv("AIRFLOW_HOME")}/data/coincap_exchanges.csv' 25 | 26 | @task 27 | def fetch_coincap_exchanges(url, file_path): 28 | response = requests.get(url) 29 | data = response.json() 30 | exchanges = data['data'] 31 | if exchanges: 32 | keys = exchanges[0].keys() 33 | with open(file_path, 'w') as f: 34 | dict_writer = csv.DictWriter(f, fieldnames=keys) 35 | dict_writer.writeheader() 36 | dict_writer.writerows(exchanges) 37 | 38 | def check_completeness(pl_df, column_name): 39 | check = Check(CheckLevel.ERROR, "Completeness") 40 | validation_results_df = ( 41 | check.is_complete(column_name).validate(pl_df) 42 | ) 43 | return validation_results_df["status"].to_list() 44 | 45 | @task.branch 46 | def check_data_quality(validation_results): 47 | if "FAIL" not in validation_results: 48 | return ['generate_dashboard'] 49 | return ['stop_pipeline'] 50 | 51 | check_data_quality_instance = check_data_quality(check_completeness(pl.read_csv(file_path), "name")) 52 | 53 | stop_pipeline = DummyOperator(task_id='stop_pipeline') 54 | 55 | markdown_path = f'{os.getenv("AIRFLOW_HOME")}/visualization/' 56 | q_cmd = ( 57 | f'cd {markdown_path} && quarto render {markdown_path}/dashboard.qmd' 58 | ) 59 | gen_dashboard = BashOperator( 60 | task_id="generate_dashboard", bash_command=q_cmd 61 | ) 62 | 63 | fetch_coincap_exchanges(url, file_path) >> check_data_quality_instance >> gen_dashboard 64 | check_data_quality_instance >> stop_pipeline 65 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/data_engineering_project_template/e775024f0bdec616a320a85288e8fed9a77dac63/data/.gitkeep -------------------------------------------------------------------------------- /data/coincap_exchanges.csv: -------------------------------------------------------------------------------- 1 | name 2 | dummy 3 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | x-airflow-common: 3 | &airflow-common 4 | build: 5 | context: ./containers/airflow/ 6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 9 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 10 | AIRFLOW__CORE__FERNET_KEY: '' 11 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 12 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 13 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 14 | AIRFLOW_CONN_POSTGRES_DEFAULT: postgres://airflow:airflow@postgres:5432/airflow 15 | 16 | volumes: 17 | - ./dags:/opt/airflow/dags 18 | - ./data:/opt/airflow/data 19 | - ./visualization:/opt/airflow/visualization 20 | - ./logs:/opt/airflow/logs 21 | - ./plugins:/opt/airflow/plugins 22 | - ./tests:/opt/airflow/tests 23 | - ./temp:/opt/airflow/temp 24 | - 
./migrations:/opt/airflow/migrations 25 | user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" 26 | depends_on: 27 | postgres: 28 | condition: service_healthy 29 | 30 | services: 31 | postgres: 32 | container_name: postgres 33 | image: postgres:16 34 | environment: 35 | POSTGRES_USER: airflow 36 | POSTGRES_PASSWORD: airflow 37 | POSTGRES_DB: airflow 38 | healthcheck: 39 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 40 | interval: 5s 41 | retries: 5 42 | restart: always 43 | ports: 44 | - "5432:5432" 45 | 46 | airflow-webserver: 47 | <<: *airflow-common 48 | container_name: webserver 49 | command: webserver 50 | ports: 51 | - 8080:8080 52 | healthcheck: 53 | test: 54 | [ 55 | "CMD", 56 | "curl", 57 | "--fail", 58 | "http://localhost:8080/health" 59 | ] 60 | interval: 10s 61 | timeout: 10s 62 | retries: 5 63 | restart: always 64 | 65 | airflow-scheduler: 66 | <<: *airflow-common 67 | container_name: scheduler 68 | command: scheduler 69 | healthcheck: 70 | test: 71 | [ 72 | "CMD-SHELL", 73 | 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' 74 | ] 75 | interval: 10s 76 | timeout: 10s 77 | retries: 5 78 | restart: always 79 | 80 | airflow-init: 81 | <<: *airflow-common 82 | command: version 83 | environment: 84 | <<: *airflow-common-env 85 | _AIRFLOW_DB_UPGRADE: 'true' 86 | _AIRFLOW_WWW_USER_CREATE: 'true' 87 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 88 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 89 | -------------------------------------------------------------------------------- /terraform/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.36.1" 6 | constraints = "~> 4.16" 7 | hashes = [ 8 | "h1:bVdhic55ukDoSukFwOOqX2q/gZ5efe4aBTMGivEuY4o=", 9 | "zh:19b16047b4f15e9b8538a2b925f1e860463984eed7d9bd78e870f3e884e827a7", 10 | "zh:3c0db06a9a14b05a77f3fe1fc029a5fb153f4966964790ca8e71ecc3427d83f5", 11 | "zh:3c7407a8229005e07bc274cbae6e3a464c441a88810bfc6eceb2414678fd08ae", 12 | "zh:3d96fa82c037fafbd3e7f4edc1de32afb029416650f6e392c39182fc74a9e03a", 13 | "zh:8f4f540c5f63d847c4b802ca84d148bb6275a3b0723deb09bf933a4800bc7209", 14 | "zh:9802cb77472d6bcf24c196ce2ca6d02fac9db91558536325fec85f955b71a8a4", 15 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 16 | "zh:a263352433878c89832c2e38f4fd56cf96ae9969c13b5c710d5ba043cbd95743", 17 | "zh:aca7954a5f458ceb14bf0c04c961c4e1e9706bf3b854a1e90a97d0b20f0fe6d3", 18 | "zh:d78f400332e87a97cce2e080db9d01beb01f38f5402514a6705d6b8167e7730d", 19 | "zh:e14bdc49be1d8b7d2543d5c58078c84b76051085e8e6715a895dcfe6034b6098", 20 | "zh:f2e400b88c8de170bb5027922226da1e9a6614c03f2a6756c15c3b930c2f460c", 21 | ] 22 | } 23 | 24 | provider "registry.terraform.io/hashicorp/tls" { 25 | version = "4.0.3" 26 | hashes = [ 27 | "h1:r+5I08cum0iMcWp+ILHoacJ896BhEEk6vkhDrAT8ifU=", 28 | "zh:0b34a21c535db27a71d1b76a635352ad5dc31d81cdee3d34926629c8919990a1", 29 | "zh:160008f7a89ec99f1a79d05bd9c2f11679edbcac5ecf43f618d0a260d003d5ce", 30 | "zh:2b951a9c75be26ff320d979c121832866db75f08bcbe326cf5290c8497e345a4", 31 | "zh:5032695a3fb47914fa6037f21e7f06d0b684ffddde0c758477a63008476e97d0", 32 | "zh:581f518e5104e5bdbdb9f5af35628dcb04a6fee11c6279e4ac1480d7a2bde39f", 33 | "zh:66f36f9da00e7a39952e9e4cf158b552a98d8afabbb74804bf27b872b4fdefc6", 34 | "zh:a159589b44d353376f86d05f1819952445f9a1c00a3bcc2745217d6f513f5147", 35 | "zh:de077f6b852c8d637dbd9b9542ffd590db3d96f9f718acee6645ac6e22365b3c", 36 | "zh:e0ea922619aea6b97f416fd8c129af944df9f37b81417ae7b970c6895498843e", 37 | "zh:e7705f4f35332f598e6d4078c7e7e734e4728c5ec8642a50672e29d51d8f3be3", 38 | "zh:eb64e7a5ebd37b8f59acc91e033897a5d454998f70e57047c94dbfb649503320", 39 | "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 4.16" 6 | } 7 | } 8 | 9 | required_version = ">= 1.2.0" 10 | } 11 | 12 | provider "aws" { 13 | region = var.aws_region 14 | profile = "default" 15 | } 16 | 17 | # Create security group for access to EC2 from your Anywhere 18 | resource "aws_security_group" "sde_security_group" { 19 | name = "sde_security_group" 20 | description = "Security group to allow inbound SCP & outbound 8080 (Airflow) connections" 21 | 22 | ingress { 23 | description = "Inbound SCP" 24 | from_port = 22 25 | to_port = 22 26 | protocol = "tcp" 27 | cidr_blocks = ["0.0.0.0/0"] 28 | } 29 | 30 | egress { 31 | from_port = 0 32 | to_port = 0 33 | protocol = "-1" 34 | cidr_blocks = ["0.0.0.0/0"] 35 | } 36 | 37 | egress { 38 | from_port = 8080 39 | to_port = 8080 40 | protocol = "tcp" 41 | cidr_blocks = ["0.0.0.0/0"] 42 | } 43 | 44 | tags = { 45 | Name = "sde_security_group" 46 | } 47 | } 48 | 49 | # Create EC2 with IAM role to allow EMR, Redshift, & S3 access and security group 50 | resource "tls_private_key" "custom_key" { 51 | algorithm = "RSA" 52 | rsa_bits = 4096 53 | } 54 | 55 | resource 
"aws_key_pair" "generated_key" { 56 | key_name_prefix = var.key_name 57 | public_key = tls_private_key.custom_key.public_key_openssh 58 | } 59 | 60 | data "aws_ami" "ubuntu" { 61 | most_recent = true 62 | 63 | filter { 64 | name = "name" 65 | values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-20240514"] 66 | } 67 | 68 | filter { 69 | name = "virtualization-type" 70 | values = ["hvm"] 71 | } 72 | 73 | owners = ["099720109477"] # Canonical 74 | } 75 | 76 | resource "aws_instance" "sde_ec2" { 77 | ami = data.aws_ami.ubuntu.id 78 | instance_type = var.instance_type 79 | 80 | key_name = aws_key_pair.generated_key.key_name 81 | security_groups = [aws_security_group.sde_security_group.name] 82 | tags = { 83 | Name = "sde_ec2" 84 | } 85 | 86 | user_data = < /dev/null 105 | 106 | sudo apt-get -y update 107 | sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin 108 | sudo chmod 666 /var/run/docker.sock 109 | 110 | sudo apt install make 111 | 112 | echo 'Clone git repo to EC2' 113 | cd /home/ubuntu && git clone ${var.repo_url} 114 | 115 | echo 'CD to data_engineering_project_template directory' 116 | cd data_engineering_project_template 117 | 118 | echo 'Start containers & Run db migrations' 119 | make up 120 | 121 | echo "-------------------------END SETUP---------------------------" 122 | 123 | EOF 124 | 125 | } 126 | 127 | # EC2 budget constraint 128 | resource "aws_budgets_budget" "ec2" { 129 | name = "budget-ec2-monthly" 130 | budget_type = "COST" 131 | limit_amount = "5" 132 | limit_unit = "USD" 133 | time_period_end = "2087-06-15_00:00" 134 | time_period_start = "2022-10-22_00:00" 135 | time_unit = "MONTHLY" 136 | 137 | notification { 138 | comparison_operator = "GREATER_THAN" 139 | threshold = 100 140 | threshold_type = "PERCENTAGE" 141 | notification_type = "FORECASTED" 142 | subscriber_email_addresses = [var.alert_email_id] 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /terraform/output.tf: -------------------------------------------------------------------------------- 1 | output "aws_region" { 2 | description = "Region set for AWS" 3 | value = var.aws_region 4 | } 5 | 6 | output "ec2_public_dns" { 7 | description = "EC2 public dns." 8 | value = aws_instance.sde_ec2.public_dns 9 | } 10 | 11 | output "private_key" { 12 | description = "EC2 private key." 13 | value = tls_private_key.custom_key.private_key_pem 14 | sensitive = true 15 | } 16 | 17 | output "public_key" { 18 | description = "EC2 public key." 
19 | value = tls_private_key.custom_key.public_key_openssh 20 | } 21 | -------------------------------------------------------------------------------- /terraform/variable.tf: -------------------------------------------------------------------------------- 1 | ## AWS account level config: region 2 | variable "aws_region" { 3 | description = "AWS region" 4 | type = string 5 | default = "us-east-1" 6 | } 7 | 8 | ## Key to allow connection to our EC2 instance 9 | variable "key_name" { 10 | description = "EC2 key name" 11 | type = string 12 | default = "sde-key" 13 | } 14 | 15 | ## EC2 instance type 16 | variable "instance_type" { 17 | description = "Instance type for EMR and EC2" 18 | type = string 19 | default = "m4.xlarge" 20 | } 21 | 22 | ## Alert email receiver 23 | variable "alert_email_id" { 24 | description = "Email id to send alerts to " 25 | type = string 26 | default = "you-name@some-domain.com" 27 | } 28 | 29 | ## Your repository url 30 | variable "repo_url" { 31 | description = "Repository url to clone into production machine" 32 | type = string 33 | default = "https://github.com/josephmachado/data_engineering_project_template.git" 34 | } 35 | -------------------------------------------------------------------------------- /tests/dags/test_dag_validity.py: -------------------------------------------------------------------------------- 1 | from airflow.models import DagBag 2 | 3 | 4 | def test_no_import_errors(): 5 | 6 | dag_bag = DagBag() 7 | assert len(dag_bag.import_errors) == 0, "No Import Failures" 8 | assert dag_bag.size() == 1 9 | -------------------------------------------------------------------------------- /visualization/dashboard.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "CoinCap Exchange Dashboard" 3 | author: "StartDataEngineering" 4 | format: dashboard 5 | --- 6 | 7 | ## Row {height=70%} 8 | 9 | ```{python} 10 | #| title: Coincap Exchange data analysis 11 | 12 | import pandas as pd 13 | import plotly.express as px 14 | import os 15 | # Load the CSV file 16 | file_path = f'{os.getenv("AIRFLOW_HOME")}/data/coincap_exchanges.csv' 17 | import duckdb 18 | 19 | clean_data = duckdb.sql(f"select name, volumeUsd from '{file_path}' order by 2 desc limit 10").df() 20 | # Plot the top 10 exchanges' volumeUSD 21 | fig = px.bar(clean_data, x='name', y='volumeUsd', title='Top 10 Exchanges by VolumeUSD') 22 | fig.show() 23 | 24 | ``` 25 | --------------------------------------------------------------------------------
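A note on extending the tests: `tests/dags/test_dag_validity.py` only asserts that the `DagBag` imports without errors. A minimal sketch of an additional structural test one might add is shown below; the task ids are taken from `dags/coincap_elt.py`, and this file is an illustration rather than part of the template.

```python
from airflow.models import DagBag


def test_coincap_elt_has_expected_tasks():
    dag_bag = DagBag()
    dag = dag_bag.get_dag("coincap_elt")
    assert dag is not None, "coincap_elt DAG should load without errors"
    # Task ids defined in dags/coincap_elt.py
    expected_task_ids = {
        "fetch_coincap_exchanges",
        "check_data_quality",
        "generate_dashboard",
        "stop_pipeline",
    }
    assert expected_task_ids.issubset(set(dag.task_ids))
```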