├── .python-version ├── astro ├── dags │ ├── .airflowignore │ ├── dbt │ │ └── magic_the_gathering │ │ │ ├── analyses │ │ │ └── .gitkeep │ │ │ ├── macros │ │ │ ├── .gitkeep │ │ │ └── export_cards_data.sql │ │ │ ├── seeds │ │ │ └── .gitkeep │ │ │ ├── snapshots │ │ │ └── .gitkeep │ │ │ ├── tests │ │ │ └── .gitkeep │ │ │ ├── .user.yml │ │ │ ├── .gitignore │ │ │ ├── packages.yml │ │ │ ├── package-lock.yml │ │ │ ├── models │ │ │ ├── core │ │ │ │ └── fact_cards.sql │ │ │ └── staging │ │ │ │ ├── sources.yml │ │ │ │ ├── stg_cards.sql │ │ │ │ └── schema.yml │ │ │ ├── profiles.yml │ │ │ ├── README.md │ │ │ └── dbt_project.yml │ ├── gold_ingestion.py │ ├── raw_ingestion.py │ ├── bronze_ingestion.py │ └── silver_ingestion.py ├── packages.txt ├── .astro │ ├── config.yaml │ └── test_dag_integrity_default.py ├── dbt-requirements.txt ├── .dockerignore ├── .gitignore ├── include │ ├── models │ │ ├── purchase_uris.py │ │ ├── all_parts.py │ │ ├── image_uris.py │ │ ├── related_uris.py │ │ ├── prices.py │ │ ├── legalities.py │ │ └── card.py │ ├── lib │ │ ├── motherduck_manager.py │ │ ├── duckdb_manager.py │ │ └── aws_manager.py │ └── ingestion │ │ ├── silver │ │ └── cards.py │ │ ├── bronze │ │ └── cards.py │ │ └── raw │ │ └── cards.py ├── requirements.txt ├── Dockerfile ├── tests │ └── dags │ │ └── test_dag_example.py └── README.md ├── dbt └── magic_the_gathering │ ├── seeds │ └── .gitkeep │ ├── tests │ └── .gitkeep │ ├── analyses │ └── .gitkeep │ ├── macros │ ├── .gitkeep │ └── export_cards_data.sql │ ├── snapshots │ └── .gitkeep │ ├── .user.yml │ ├── .gitignore │ ├── packages.yml │ ├── temp │ └── dbt.duckdb │ ├── package-lock.yml │ ├── models │ ├── core │ │ └── fact_cards.sql │ └── staging │ │ ├── sources.yml │ │ ├── stg_cards.sql │ │ └── schema.yml │ ├── profiles.yml │ ├── README.md │ └── dbt_project.yml ├── images ├── magic_the_gathering.jpg ├── magic_the_gathering_dashboard.jpg └── magic_the_gathering_pipeline_etl.png ├── models ├── purchase_uris.py ├── all_parts.py ├── image_uris.py ├── related_uris.py ├── prices.py ├── legalities.py └── card.py ├── terraform ├── variables.tf ├── main.tf └── s3_bucket.tf ├── pyproject.toml ├── LICENSE ├── lib ├── duckdb_manager.py ├── motherduck_manager.py └── aws_manager.py ├── .gitignore ├── docs └── info_dataset.md ├── README.md └── ingestion ├── silver └── cards.py ├── bronze └── cards.py └── raw └── cards.py /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /astro/dags/.airflowignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/packages.txt: -------------------------------------------------------------------------------- 1 | gcc 2 | python3-dev -------------------------------------------------------------------------------- /dbt/magic_the_gathering/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /dbt/magic_the_gathering/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: astro 3 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/.user.yml: -------------------------------------------------------------------------------- 1 | id: e0fed478-152d-44aa-84d0-430a4900cd71 2 | -------------------------------------------------------------------------------- /astro/dbt-requirements.txt: -------------------------------------------------------------------------------- 1 | dbt-core==1.5.0 2 | dbt-duckdb==1.5.2 3 | duckdb==0.9.2 -------------------------------------------------------------------------------- /dbt/magic_the_gathering/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/.user.yml: -------------------------------------------------------------------------------- 1 | id: e0fed478-152d-44aa-84d0-430a4900cd71 2 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | .env 6 | .duckdb/ 7 | temp/ -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: 
dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /astro/.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | .venv 7 | airflow.db 8 | airflow.cfg 9 | -------------------------------------------------------------------------------- /images/magic_the_gathering.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/images/magic_the_gathering.jpg -------------------------------------------------------------------------------- /dbt/magic_the_gathering/temp/dbt.duckdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/dbt/magic_the_gathering/temp/dbt.duckdb -------------------------------------------------------------------------------- /images/magic_the_gathering_dashboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/images/magic_the_gathering_dashboard.jpg -------------------------------------------------------------------------------- /images/magic_the_gathering_pipeline_etl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/images/magic_the_gathering_pipeline_etl.png -------------------------------------------------------------------------------- /dbt/magic_the_gathering/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/core/fact_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | SELECT 4 | * 5 | FROM {{ ref('stg_cards') }} 6 | ORDER BY usd_price DESC -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/core/fact_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | SELECT 4 | * 5 | FROM {{ ref("stg_cards") }} 6 | ORDER BY usd_price DESC 7 | -------------------------------------------------------------------------------- /astro/.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | .DS_Store 4 | airflow_settings.yaml 5 | __pycache__/ 6 | astro 7 | .venv 8 | airflow-webserver.pid 9 | webserver_config.py 10 | airflow.cfg 11 | airflow.db 12 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/staging/sources.yml: -------------------------------------------------------------------------------- 1 | version: 
2 2 | 3 | sources: 4 | - name: s3-magic-the-gathering 5 | meta: 6 | external_location: "{{ env_var('TRANSFORM_S3_PATH_INPUT') }}" 7 | tables: 8 | - name: cards -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/staging/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: s3-magic-the-gathering 5 | meta: 6 | external_location: "{{ env_var('TRANSFORM_S3_PATH_INPUT') }}" 7 | tables: 8 | - name: cards -------------------------------------------------------------------------------- /models/purchase_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class PurchaseUris(BaseModel): 5 | tcgplayer: str 6 | cardmarket: str 7 | cardhoarder: str 8 | 9 | def __getitem__(self, item): 10 | return getattr(self, item) 11 | -------------------------------------------------------------------------------- /astro/include/models/purchase_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class PurchaseUris(BaseModel): 5 | tcgplayer: str 6 | cardmarket: str 7 | cardhoarder: str 8 | 9 | def __getitem__(self, item): 10 | return getattr(self, item) 11 | -------------------------------------------------------------------------------- /astro/requirements.txt: -------------------------------------------------------------------------------- 1 | # Astro Runtime includes the following pre-installed providers packages: https://docs.astronomer.io/astro/runtime-image-architecture#provider-packages 2 | duckdb==0.9.2 3 | airflow-provider-duckdb==0.2.0 4 | astro-sdk-python[duckdb]==1.6.1 5 | astronomer-cosmos 6 | pydantic 7 | -------------------------------------------------------------------------------- /models/all_parts.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class AllParts(BaseModel): 5 | object: str 6 | id: str 7 | component: str 8 | name: str 9 | type_line: str 10 | uri: str 11 | 12 | def __getitem__(self, item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /models/image_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ImageUris(BaseModel): 5 | small: str 6 | normal: str 7 | large: str 8 | png: str 9 | art_crop: str 10 | border_crop: str 11 | 12 | def __getitem__(self, item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /astro/include/models/all_parts.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class AllParts(BaseModel): 5 | object: str 6 | id: str 7 | component: str 8 | name: str 9 | type_line: str 10 | uri: str 11 | 12 | def __getitem__(self, item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /astro/include/models/image_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ImageUris(BaseModel): 5 | small: str 6 | normal: str 7 | large: str 8 | png: str 9 | art_crop: str 10 | border_crop: str 11 | 12 | def __getitem__(self, 
item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/macros/export_cards_data.sql: -------------------------------------------------------------------------------- 1 | {% macro export_cards_data(table) %} 2 | {% set s3_path = env_var('TRANSFORM_S3_PATH_OUTPUT', 'my-bucket-path') %} 3 | COPY ( 4 | SELECT 5 | * 6 | FROM {{ table }} 7 | ) 8 | TO '{{ s3_path }}{{ table }}.parquet' 9 | (FORMAT PARQUET); 10 | {% endmacro %} -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # AWS VARIABLES 2 | variable "AWS_ACCESS_KEY" { 3 | description = "AWS Access Key." 4 | type = string 5 | } 6 | 7 | variable "AWS_SECRET_KEY" { 8 | description = "AWS Secret Key." 9 | type = string 10 | } 11 | 12 | variable "AWS_REGION" { 13 | description = "AWS Region." 14 | type = string 15 | } 16 | -------------------------------------------------------------------------------- /models/related_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class RelatedUris(BaseModel): 6 | tcgplayer_infinite_articles: Optional[str] = "" 7 | tcgplayer_infinite_decks: Optional[str] = "" 8 | edhrec: Optional[str] = "" 9 | 10 | def __getitem__(self, item): 11 | return getattr(self, item) 12 | -------------------------------------------------------------------------------- /astro/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 quay.io/astronomer/astro-runtime:10.6.0 2 | 3 | # install dbt into a venv to avoid package dependency conflicts 4 | WORKDIR "/usr/local/airflow" 5 | COPY dbt-requirements.txt ./ 6 | RUN python -m virtualenv dbt_venv && source dbt_venv/bin/activate && \ 7 | pip install --no-cache-dir -r dbt-requirements.txt && deactivate 8 | 9 | -------------------------------------------------------------------------------- /astro/include/models/related_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class RelatedUris(BaseModel): 6 | tcgplayer_infinite_articles: Optional[str] = "" 7 | tcgplayer_infinite_decks: Optional[str] = "" 8 | edhrec: Optional[str] = "" 9 | 10 | def __getitem__(self, item): 11 | return getattr(self, item) 12 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/macros/export_cards_data.sql: -------------------------------------------------------------------------------- 1 | {% macro export_cards_data(schema, table) %} 2 | {% set s3_path = env_var('TRANSFORM_S3_PATH_OUTPUT') %} 3 | -- {% set schema = 'gold' %} 4 | COPY ( 5 | SELECT 6 | * 7 | FROM {{ schema }}.{{ table }} 8 | ) 9 | TO '{{ s3_path }}{{ table }}.parquet' 10 | (FORMAT PARQUET); 11 | {% endmacro %} -------------------------------------------------------------------------------- /models/prices.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class Prices(BaseModel): 6 | usd: Optional[str] = "" 7 | usd_foil: Optional[str] = "" 8 | usd_etched: Optional[str] = "" 9 | eur: Optional[str] = "" 10 | eur_foil: Optional[str] = "" 11 | tix: 
Optional[str] = "" 12 | 13 | def __getitem__(self, item): 14 | return getattr(self, item) 15 | -------------------------------------------------------------------------------- /astro/include/models/prices.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class Prices(BaseModel): 6 | usd: Optional[str] = "" 7 | usd_foil: Optional[str] = "" 8 | usd_etched: Optional[str] = "" 9 | eur: Optional[str] = "" 10 | eur_foil: Optional[str] = "" 11 | tix: Optional[str] = "" 12 | 13 | def __getitem__(self, item): 14 | return getattr(self, item) 15 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/profiles.yml: -------------------------------------------------------------------------------- 1 | magic_the_gathering: 2 | outputs: 3 | dev: 4 | type: duckdb 5 | path: temp/dbt.duckdb 6 | extensions: 7 | - httpfs 8 | - parquet 9 | settings: 10 | s3_region: "{{ env_var('AWS_REGION') }}" 11 | s3_access_key_id: "{{ env_var('AWS_ACCESS_KEY') }}" 12 | s3_secret_access_key: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}" 13 | prod: 14 | type: duckdb 15 | schema: gold 16 | path: "{{ env_var('MOTHERDUCK_DATABASE') }}" 17 | target: dev -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Change from "local" to "gcs" (for Google Cloud Storage) or "s3" (for AWS S3) if you want to preserve your tfstate online 4 | required_providers { 5 | aws = { # Change the required provider to AWS 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.AWS_REGION # Set AWS region using variable 13 | access_key = var.AWS_ACCESS_KEY # Set AWS access key using variable 14 | secret_key = var.AWS_SECRET_KEY # Set AWS secret key using variable 15 | } 16 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/profiles.yml: -------------------------------------------------------------------------------- 1 | magic_the_gathering: 2 | outputs: 3 | dev: 4 | type: duckdb 5 | schema: gold 6 | path: /usr/local/airflow/dags/dbt/magic_the_gathering/temp/dbt.duckdb 7 | extensions: 8 | - httpfs 9 | - parquet 10 | settings: 11 | s3_region: "{{ env_var('AWS_REGION') }}" 12 | s3_access_key_id: "{{ env_var('AWS_ACCESS_KEY') }}" 13 | s3_secret_access_key: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}" 14 | prod: 15 | type: duckdb 16 | schema: gold 17 | path: "{{ env_var('MOTHERDUCK_DATABASE') }}" 18 | target: dev 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "magic-the-gathering-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Pitta "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | pydantic = "^2.6.4" 11 | pytest = "^8.1.1" 12 | ruff = "^0.3.3" 13 | requests = "^2.31.0" 14 | loguru = "^0.7.2" 15 | boto3 = "^1.34.64" 16 | python-dotenv = "^1.0.1" 17 | duckdb = "^0.9.2" 18 | dbt-duckdb = "^1.7.3" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | ipykernel = "^6.29.4" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | 
-------------------------------------------------------------------------------- /dbt/magic_the_gathering/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /models/legalities.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Legalities(BaseModel): 5 | standard: str 6 | future: str 7 | historic: str 8 | timeless: str 9 | gladiator: str 10 | pioneer: str 11 | explorer: str 12 | modern: str 13 | legacy: str 14 | pauper: str 15 | vintage: str 16 | penny: str 17 | commander: str 18 | oathbreaker: str 19 | standardbrawl: str 20 | brawl: str 21 | alchemy: str 22 | paupercommander: str 23 | duel: str 24 | oldschool: str 25 | premodern: str 26 | predh: str 27 | 28 | def __getitem__(self, item): 29 | return getattr(self, item) 30 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /astro/include/models/legalities.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Legalities(BaseModel): 5 | standard: str 6 | future: str 7 | historic: str 8 | timeless: str 9 | gladiator: str 10 | pioneer: str 11 | explorer: str 12 | modern: str 13 | legacy: str 14 | pauper: str 15 | vintage: str 16 | penny: str 17 | commander: str 18 | oathbreaker: str 19 | standardbrawl: str 20 | brawl: str 21 | alchemy: str 22 | paupercommander: str 23 | duel: str 24 | oldschool: str 25 | premodern: str 26 | predh: str 27 | 28 | def __getitem__(self, item): 29 | return getattr(self, item) 30 | -------------------------------------------------------------------------------- /terraform/s3_bucket.tf: -------------------------------------------------------------------------------- 1 | # AWS 2 | # Ref: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket 3 | resource "aws_s3_bucket" "bucket" { 4 | bucket = "magic-the-gathering-bucket" 5 | } 6 | 7 | ## Consumes more space 8 | # resource "aws_s3_bucket_versioning" "bucket_versioning" { 9 | # bucket = aws_s3_bucket.bucket.id 10 | # versioning_configuration { 11 | # status = "Enabled" 12 | # } 13 | # } 14 | 15 | resource "aws_s3_bucket_lifecycle_configuration" "bucket_lifecycle_configuration" { 16 | bucket = aws_s3_bucket.bucket.id 17 | rule { 18 | id = "delete" 19 | expiration { 20 | days = 30 # objects will be deleted after 30 days 21 | } 22 | status = "Enabled" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/staging/stg_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | --handling deduplication 4 | WITH cards AS 5 | ( 6 | SELECT 7 | * 8 | , row_number() OVER(PARTITION BY card_id, released_at) AS rn 9 | FROM {{ source("s3-magic-the-gathering", "cards") }} 10 | ) 11 | SELECT 12 | -- identifiers 13 | CAST(({{ dbt_utils.generate_surrogate_key(['card_id', 'released_at']) }}) AS string) AS case_id, 14 | CAST(card_id AS string) AS card_id, 15 | 16 | -- Cards info 17 | CAST(name AS string) AS name, 18 | CAST(released_at AS date) AS released_at, 19 | CAST(color_identity AS string) AS color_identity, 20 | CAST(set_name AS string) AS set_name, 21 | CAST(artist AS string) AS artist, 22 | CAST(usd_price AS float) AS usd_price, 23 | 24 | FROM cards 25 | WHERE rn = 1 26 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/staging/stg_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | --handling deduplication 4 | WITH cards AS 5 | ( 6 | SELECT 7 | * 8 | , 
row_number() over(partition by card_id, released_at) AS rn 9 | FROM {{ source("s3-magic-the-gathering", "cards") }} 10 | ) 11 | SELECT 12 | -- identifiers 13 | CAST(({{ dbt_utils.generate_surrogate_key(['card_id', 'released_at']) }}) AS string) AS case_id, 14 | CAST(card_id AS string) AS card_id, 15 | 16 | -- Cards info 17 | CAST(name AS string) AS name, 18 | CAST(released_at AS date) AS released_at, 19 | CAST(color_identity AS string) AS color_identity, 20 | CAST(set_name AS string) AS set_name, 21 | CAST(artist AS string) AS artist, 22 | CAST(usd_price AS float) AS usd_price, 23 | 24 | FROM cards 25 | WHERE rn = 1 26 | 27 | -- {% if var('is_test_run', default=true) %} 28 | 29 | -- limit 100 30 | 31 | -- {% endif %} -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_cards 5 | description: "Individual Magic: The Gathering cards that players could obtain and add to their collection (with a few minor exceptions)." 6 | columns: 7 | - name: id 8 | description: > 9 | A unique ID for the card in Scryfall database. 10 | - name: name 11 | description: > 12 | The name of the card. If the card has multiple faces, this field will contain both names separated by ␣//␣. 13 | - name: released_at 14 | description: > 15 | The date the card was first released. 16 | - name: color_identity 17 | description: > 18 | The card color identity. 19 | - name: set_name 20 | description: > 21 | The card full set name. 22 | - name: artist 23 | description: > 24 | The name of the illustrator of the card face. Newly spoiled cards may not have this field yet. 25 | - name: (prices).usd 26 | description: > 27 | The card usd price. 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Arthur Pitta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/duckdb_manager.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | from loguru import logger 3 | from typing import Any 4 | 5 | 6 | class DuckDBManager: 7 | """ 8 | Manages DuckDB connection and executes queries. 
9 | """ 10 | 11 | def __init__(self): 12 | """ 13 | Initializes DuckDBManager. 14 | """ 15 | self.connection = self.create_connection() 16 | 17 | def create_connection(self) -> Any: 18 | """ 19 | Create a connection to DuckDB. 20 | 21 | Returns: 22 | duckdb.Connection: DuckDB connection object. 23 | """ 24 | try: 25 | logger.info("Creating DuckDB connection") 26 | duckdb_conn = duckdb.connect() 27 | logger.success("DuckDB connection created!") 28 | return duckdb_conn 29 | except Exception as e: 30 | logger.error(f"Error creating DuckDB connection: {e}") 31 | return None 32 | 33 | def execute_query(self, query: str) -> None: 34 | """ 35 | Executes a SQL query. 36 | 37 | Args: 38 | query (str): SQL query to execute. 39 | 40 | Returns: 41 | None 42 | """ 43 | try: 44 | logger.info("Executing query") 45 | self.connection.execute(query) 46 | logger.success("Query executed") 47 | except Exception as e: 48 | logger.error(f"Error executing query: {e}") 49 | -------------------------------------------------------------------------------- /astro/include/lib/motherduck_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class MotherDuckManager: 8 | """ 9 | Manages connection to MotherDuck. 10 | """ 11 | 12 | def __init__(self, duckdb_manager, motherduck_token: str): 13 | """ 14 | Initializes MotherDuckManager. 15 | 16 | Args: 17 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 18 | motherduck_token (str): Token for accessing MotherDuck. 19 | """ 20 | self.duckdb_manager = duckdb_manager 21 | self.connect(motherduck_token) 22 | 23 | def connect(self, motherduck_token: str) -> None: 24 | """ 25 | Connects to MotherDuck. 26 | 27 | Args: 28 | motherduck_token (str): Token for accessing MotherDuck. 29 | 30 | Returns: 31 | None 32 | """ 33 | try: 34 | logger.info("Connecting to Mother Duck") 35 | self.duckdb_manager.execute_query("INSTALL md;") 36 | self.duckdb_manager.execute_query("LOAD md;") 37 | self.duckdb_manager.execute_query( 38 | f"SET motherduck_token='{motherduck_token}'" 39 | ) 40 | self.duckdb_manager.execute_query("ATTACH 'md:'") 41 | logger.info("Connected to Mother Duck!") 42 | except Exception as e: 43 | logger.error(f"Error connecting to MotherDuck: {e}") 44 | -------------------------------------------------------------------------------- /lib/motherduck_manager.py: -------------------------------------------------------------------------------- 1 | from lib.duckdb_manager import DuckDBManager 2 | from loguru import logger 3 | 4 | 5 | class MotherDuckManager: 6 | """ 7 | Manages connection to MotherDuck. 8 | """ 9 | 10 | def __init__(self, duckdb_manager: DuckDBManager, motherduck_token: str): 11 | """ 12 | Initializes MotherDuckManager. 13 | 14 | Args: 15 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 16 | motherduck_token (str): Token for accessing MotherDuck. 17 | """ 18 | self.duckdb_manager = duckdb_manager 19 | self.connect(motherduck_token) 20 | 21 | def connect(self, motherduck_token: str) -> None: 22 | """ 23 | Connects to MotherDuck. 24 | 25 | Args: 26 | motherduck_token (str): Token for accessing MotherDuck. 
27 | 28 | Returns: 29 | None 30 | """ 31 | try: 32 | logger.info("Connecting to Mother Duck") 33 | self.duckdb_manager.execute_query("INSTALL md;") 34 | self.duckdb_manager.execute_query("LOAD md;") 35 | self.duckdb_manager.execute_query( 36 | f"SET motherduck_token='{motherduck_token}'" 37 | ) 38 | self.duckdb_manager.execute_query("ATTACH 'md:'") 39 | logger.success("Connected to Mother Duck!") 40 | except Exception as e: 41 | logger.error(f"Error connecting to MotherDuck: {e}") 42 | -------------------------------------------------------------------------------- /astro/include/lib/duckdb_manager.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | from typing import Any 3 | import logging 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class DuckDBManager: 10 | """ 11 | Manages DuckDB connection and executes queries. 12 | """ 13 | 14 | def __init__(self): 15 | """ 16 | Initializes DuckDBManager. 17 | """ 18 | self.connection = self.create_connection() 19 | 20 | def create_connection(self) -> Any: 21 | """ 22 | Create a connection to DuckDB. 23 | 24 | Returns: 25 | duckdb.Connection: DuckDB connection object. 26 | """ 27 | try: 28 | logger.info("Creating DuckDB connection") 29 | duckdb_conn = duckdb.connect() 30 | logger.info("DuckDB connection created!") 31 | return duckdb_conn 32 | except Exception as e: 33 | logger.error(f"Error creating DuckDB connection: {e}") 34 | return None 35 | 36 | def execute_query(self, query: str) -> None: 37 | """ 38 | Executes a SQL query. 39 | 40 | Args: 41 | query (str): SQL query to execute. 42 | 43 | Returns: 44 | None 45 | """ 46 | try: 47 | logger.info("Executing query") 48 | self.connection.execute(query) 49 | logger.info("Query executed") 50 | except Exception as e: 51 | logger.error(f"Error executing query: {e}") 52 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'magic_the_gathering' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'magic_the_gathering' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | magic_the_gathering: 35 | # Applies to all files under models/staging/ 36 | staging: 37 | +materialized: view 38 | # Applies to all files under models/core/ 39 | core: 40 | +materialized: table 41 | +post-hook: "{{ export_cards_data('gold', 'fact_cards') }}" -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_cards 5 | description: "Individual Magic: The Gathering cards that players could obtain and add to their collection (with a few minor exceptions)." 6 | columns: 7 | - name: id 8 | description: > 9 | A unique ID for the card in Scryfall database. 10 | tests: 11 | - unique: 12 | severity: warn 13 | - not_null: 14 | severity: warn 15 | - name: name 16 | description: > 17 | The name of the card. If the card has multiple faces, this field will contain both names separated by ␣//␣. 18 | tests: 19 | - not_null: 20 | severity: warn 21 | - name: released_at 22 | description: > 23 | The date the card was first released. 24 | tests: 25 | - not_null: 26 | severity: warn 27 | - name: color_identity 28 | description: > 29 | The card color identity 30 | tests: 31 | - not_null: 32 | severity: warn 33 | - name: set_name 34 | description: > 35 | The card full set name. 36 | tests: 37 | - not_null: 38 | severity: warn 39 | - name: artist 40 | description: > 41 | The name of the illustrator of the card face. Newly spoiled cards may not have this field yet. 42 | - name: (prices).usd 43 | description: > 44 | The card usd price. 45 | tests: 46 | - not_null: 47 | severity: warn -------------------------------------------------------------------------------- /dbt/magic_the_gathering/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'magic_the_gathering' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'magic_the_gathering' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | magic_the_gathering: 35 | # Applies to all files under models/staging/ 36 | staging: 37 | +materialized: view 38 | # Applies to all files under models/core/ 39 | core: 40 | +materialized: table 41 | +post-hook: "{% if target.name == 'dev' %}{{ export_cards_data(this.name) }}{% endif %}" -------------------------------------------------------------------------------- /astro/dags/gold_ingestion.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from cosmos import DbtDag, ProjectConfig, ProfileConfig, ExecutionConfig 4 | 5 | default_args = { 6 | 'owner': 'airflow', 7 | 'depends_on_past': True, 8 | 'start_date': datetime(2024, 4, 1), 9 | 'retries': 2, 10 | 'retry_delay': timedelta(minutes=5), 11 | } 12 | 13 | profile_config = ProfileConfig(profile_name="magic_the_gathering", 14 | target_name="dev", 15 | profiles_yml_filepath="/usr/local/airflow/dags/dbt/magic_the_gathering/profiles.yml") 16 | 17 | project_config = ProjectConfig(dbt_project_path="/usr/local/airflow/dags/dbt/magic_the_gathering") 18 | 19 | OPERATOR_ARGS = { 20 | "install_deps": True, 21 | "env": { 22 | "HOME": "/usr/local/airflow/dags/dbt/magic_the_gathering", 23 | "AWS_REGION": os.environ["AWS_REGION"], 24 | "AWS_ACCESS_KEY": os.environ["AWS_ACCESS_KEY"], 25 | "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"], 26 | "TRANSFORM_S3_PATH_INPUT": os.environ["TRANSFORM_S3_PATH_INPUT"], 27 | "TRANSFORM_S3_PATH_OUTPUT": os.environ["TRANSFORM_S3_PATH_OUTPUT"], 28 | "MOTHERDUCK_DATABASE": os.environ["MOTHERDUCK_DATABASE"] 29 | } 30 | } 31 | 32 | cards_gold_dag = DbtDag(project_config=project_config, 33 | operator_args=OPERATOR_ARGS, 34 | profile_config=profile_config, 35 | execution_config=ExecutionConfig(dbt_executable_path=f"{os.environ['AIRFLOW_HOME']}/dbt_venv/bin/dbt",), 36 | default_args=default_args, 37 | tags=['gold_cards'], 38 | dag_id='ingestor_gold') 39 | 40 | -------------------------------------------------------------------------------- /astro/include/lib/aws_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class AWSManager: 8 | """ 9 | Manages AWS credentials and operations. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | duckdb_manager, 15 | aws_region: str, 16 | aws_access_key: str, 17 | aws_secret_access_key: str, 18 | ): 19 | """ 20 | Initializes AWSManager. 21 | 22 | Args: 23 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 24 | aws_region (str): AWS region. 25 | aws_access_key (str): AWS access key ID. 26 | aws_secret_access_key (str): AWS secret access key. 27 | """ 28 | self.duckdb_manager = duckdb_manager 29 | self.load_credentials(aws_region, aws_access_key, aws_secret_access_key) 30 | 31 | def load_credentials( 32 | self, aws_region: str, aws_access_key: str, aws_secret_access_key: str 33 | ) -> None: 34 | """ 35 | Loads AWS credentials. 36 | 37 | Args: 38 | aws_region (str): AWS region. 39 | aws_access_key (str): AWS access key ID. 40 | aws_secret_access_key (str): AWS secret access key. 
41 | 42 | Returns: 43 | None 44 | """ 45 | try: 46 | logger.info("Loading AWS credentials") 47 | self.duckdb_manager.execute_query("INSTALL httpfs;") 48 | self.duckdb_manager.execute_query("LOAD httpfs;") 49 | self.duckdb_manager.execute_query(f"SET s3_region='{aws_region}'") 50 | self.duckdb_manager.execute_query( 51 | f"SET s3_access_key_id='{aws_access_key}';" 52 | ) 53 | self.duckdb_manager.execute_query( 54 | f"SET s3_secret_access_key='{aws_secret_access_key}';" 55 | ) 56 | self.duckdb_manager.execute_query("CALL load_aws_credentials();") 57 | logger.info("AWS credentials loaded!") 58 | except Exception as e: 59 | logger.error(f"Error loading AWS credentials: {e}") 60 | -------------------------------------------------------------------------------- /lib/aws_manager.py: -------------------------------------------------------------------------------- 1 | from lib.duckdb_manager import DuckDBManager 2 | from loguru import logger 3 | 4 | 5 | class AWSManager: 6 | """ 7 | Manages AWS credentials and operations. 8 | """ 9 | 10 | def __init__( 11 | self, 12 | duckdb_manager: DuckDBManager, 13 | aws_region: str, 14 | aws_access_key: str, 15 | aws_secret_access_key: str, 16 | ): 17 | """ 18 | Initializes AWSManager. 19 | 20 | Args: 21 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 22 | aws_region (str): AWS region. 23 | aws_access_key (str): AWS access key ID. 24 | aws_secret_access_key (str): AWS secret access key. 25 | """ 26 | self.duckdb_manager = duckdb_manager 27 | self.load_credentials(aws_region, aws_access_key, aws_secret_access_key) 28 | 29 | def load_credentials( 30 | self, aws_region: str, aws_access_key: str, aws_secret_access_key: str 31 | ) -> None: 32 | """ 33 | Loads AWS credentials. 34 | 35 | Args: 36 | aws_region (str): AWS region. 37 | aws_access_key (str): AWS access key ID. 38 | aws_secret_access_key (str): AWS secret access key. 
39 | 40 | Returns: 41 | None 42 | """ 43 | try: 44 | logger.info("Loading AWS credentials") 45 | self.duckdb_manager.execute_query("INSTALL httpfs;") 46 | self.duckdb_manager.execute_query("LOAD httpfs;") 47 | self.duckdb_manager.execute_query(f"SET s3_region='{aws_region}'") 48 | self.duckdb_manager.execute_query( 49 | f"SET s3_access_key_id='{aws_access_key}';" 50 | ) 51 | self.duckdb_manager.execute_query( 52 | f"SET s3_secret_access_key='{aws_secret_access_key}';" 53 | ) 54 | self.duckdb_manager.execute_query("CALL load_aws_credentials();") 55 | logger.success("AWS credentials loaded!") 56 | except Exception as e: 57 | logger.error(f"Error loading AWS credentials: {e}") 58 | -------------------------------------------------------------------------------- /models/card.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | 4 | from models.image_uris import ImageUris 5 | from models.all_parts import AllParts 6 | from models.legalities import Legalities 7 | from models.prices import Prices 8 | from models.related_uris import RelatedUris 9 | from models.purchase_uris import PurchaseUris 10 | 11 | 12 | class Card(BaseModel): 13 | object: str 14 | id: str 15 | oracle_id: Optional[str] = "" 16 | multiverse_ids: Optional[List] = None 17 | mtgo_id: Optional[int] = 0 18 | mtgo_foil_id: Optional[int] = 0 19 | tcgplayer_id: Optional[int] = 0 20 | cardmarket_id: Optional[int] = 0 21 | name: str 22 | lang: str 23 | released_at: str 24 | uri: str 25 | scryfall_uri: str 26 | layout: str 27 | highres_image: bool 28 | image_status: str 29 | image_uris: Optional[ImageUris] = None 30 | mana_cost: Optional[str] = "" 31 | cmc: Optional[float] = 0.0 32 | type_line: Optional[str] = "" 33 | oracle_text: Optional[str] = "" 34 | power: Optional[str] = "" 35 | toughness: Optional[str] = "" 36 | colors: Optional[List[str]] = None 37 | color_identity: List[str] 38 | keywords: List[str] 39 | all_parts: Optional[List[AllParts]] = None 40 | legalities: Legalities 41 | games: List[str] 42 | reserved: bool 43 | foil: Optional[bool] = None 44 | nonfoil: Optional[bool] = None 45 | finishes: List[str] 46 | oversized: bool 47 | promo: bool 48 | reprint: bool 49 | variation: bool 50 | set_id: str 51 | set: str 52 | set_name: str 53 | set_type: str 54 | set_uri: str 55 | set_search_uri: str 56 | scryfall_set_uri: str 57 | rulings_uri: str 58 | prints_search_uri: str 59 | collector_number: str 60 | digital: bool 61 | rarity: str 62 | flavor_text: Optional[str] = "" 63 | card_back_id: Optional[str] = "" 64 | artist: Optional[str] = "" 65 | artist_ids: Optional[List[str]] = None 66 | illustration_id: Optional[str] = "" 67 | border_color: str 68 | frame: str 69 | full_art: bool 70 | textless: bool 71 | booster: bool 72 | story_spotlight: bool 73 | edhrec_rank: Optional[int] = 0 74 | penny_rank: Optional[int] = 0 75 | prices: Prices 76 | related_uris: RelatedUris 77 | purchase_uris: Optional[PurchaseUris] = None 78 | 79 | def __getitem__(self, item): 80 | return getattr(self, item) 81 | -------------------------------------------------------------------------------- /astro/include/models/card.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | 4 | from models.image_uris import ImageUris 5 | from models.all_parts import AllParts 6 | from models.legalities import Legalities 7 | from models.prices import Prices 8 | from 
models.related_uris import RelatedUris 9 | from models.purchase_uris import PurchaseUris 10 | 11 | 12 | class Card(BaseModel): 13 | object: str 14 | id: str 15 | oracle_id: Optional[str] = "" 16 | multiverse_ids: Optional[List] = None 17 | mtgo_id: Optional[int] = 0 18 | mtgo_foil_id: Optional[int] = 0 19 | tcgplayer_id: Optional[int] = 0 20 | cardmarket_id: Optional[int] = 0 21 | name: str 22 | lang: str 23 | released_at: str 24 | uri: str 25 | scryfall_uri: str 26 | layout: str 27 | highres_image: bool 28 | image_status: str 29 | image_uris: Optional[ImageUris] = None 30 | mana_cost: Optional[str] = "" 31 | cmc: Optional[float] = 0.0 32 | type_line: Optional[str] = "" 33 | oracle_text: Optional[str] = "" 34 | power: Optional[str] = "" 35 | toughness: Optional[str] = "" 36 | colors: Optional[List[str]] = None 37 | color_identity: List[str] 38 | keywords: List[str] 39 | all_parts: Optional[List[AllParts]] = None 40 | legalities: Legalities 41 | games: List[str] 42 | reserved: bool 43 | foil: Optional[bool] = None 44 | nonfoil: Optional[bool] = None 45 | finishes: List[str] 46 | oversized: bool 47 | promo: bool 48 | reprint: bool 49 | variation: bool 50 | set_id: str 51 | set: str 52 | set_name: str 53 | set_type: str 54 | set_uri: str 55 | set_search_uri: str 56 | scryfall_set_uri: str 57 | rulings_uri: str 58 | prints_search_uri: str 59 | collector_number: str 60 | digital: bool 61 | rarity: str 62 | flavor_text: Optional[str] = "" 63 | card_back_id: Optional[str] = "" 64 | artist: Optional[str] = "" 65 | artist_ids: Optional[List[str]] = None 66 | illustration_id: Optional[str] = "" 67 | border_color: str 68 | frame: str 69 | full_art: bool 70 | textless: bool 71 | booster: bool 72 | story_spotlight: bool 73 | edhrec_rank: Optional[int] = 0 74 | penny_rank: Optional[int] = 0 75 | prices: Prices 76 | related_uris: RelatedUris 77 | purchase_uris: Optional[PurchaseUris] = None 78 | 79 | def __getitem__(self, item): 80 | return getattr(self, item) 81 | -------------------------------------------------------------------------------- /astro/dags/raw_ingestion.py: -------------------------------------------------------------------------------- 1 | # from datetime import datetime, timedelta 2 | from datetime import datetime, timedelta 3 | import os 4 | import logging 5 | from airflow.decorators import dag, task 6 | from airflow.operators.dagrun_operator import TriggerDagRunOperator 7 | from dotenv import load_dotenv 8 | 9 | from include.ingestion.raw.cards import ( 10 | APIClient, 11 | DataParser, 12 | DataSaver, 13 | ) 14 | 15 | # Load environment variables 16 | load_dotenv() 17 | 18 | # Configuration 19 | API_BASE_URL = "https://api.scryfall.com/bulk-data/" 20 | DATASET_NAME = "default_cards" 21 | TABLE_NAME = "cards" 22 | TABLE_PATH = "data/raw/" 23 | AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME") 24 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 25 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 26 | 27 | default_args = { 28 | 'owner': 'airflow', 29 | 'depends_on_past': False, 30 | 'start_date': datetime(2024, 4, 1), 31 | 'retries': 2, 32 | 'retry_delay': timedelta(minutes=5), 33 | } 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | @dag(default_args=default_args, schedule_interval='@daily', catchup=False, tags=['raw_cards']) 39 | def ingestor_raw(): 40 | """ 41 | Airflow DAG for ingesting data from an API, parsing it, and saving it. 
42 | """ 43 | 44 | @task 45 | def raw_cards(): 46 | try: 47 | api_client = APIClient(API_BASE_URL, DATASET_NAME) 48 | data_parser = DataParser() 49 | data_saver = DataSaver( 50 | TABLE_PATH, TABLE_NAME, AWS_BUCKET_NAME, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 51 | ) 52 | 53 | logger.info("START FETCHING BULK DATA") 54 | bulk_data = api_client.fetch_bulk_data() 55 | logger.info("START FETCHING CARDS DATA") 56 | cards_data = api_client.fetch_cards_data(bulk_data) 57 | logger.info("START PARSING DATA") 58 | parsed_data = data_parser.parse_cards(cards_data) 59 | logger.info("START SAVING DATA LOCALLY") 60 | data_saver.save_local(parsed_data) 61 | logger.info("START SAVING DATA TO S3") 62 | data_saver.save_s3(parsed_data) 63 | except Exception as e: 64 | logger.error(f"An error occurred: {e}") 65 | 66 | # Define task dependencies 67 | raw_cards_task = raw_cards() 68 | 69 | # Define task to trigger the Bronze DAG 70 | trigger_bronze_task = TriggerDagRunOperator( 71 | task_id='trigger_bronze', 72 | trigger_dag_id='ingestor_bronze', 73 | wait_for_completion=True, 74 | deferrable=True, 75 | ) 76 | 77 | # Define task dependencies 78 | raw_cards_task >> trigger_bronze_task 79 | 80 | 81 | # Instantiate the DAG 82 | ingestor_raw() 83 | -------------------------------------------------------------------------------- /astro/tests/dags/test_dag_example.py: -------------------------------------------------------------------------------- 1 | """Example DAGs test. This test ensures that all Dags have tags, retries set to two, and no import errors. This is an example pytest and may not fit the context of your DAGs. Feel free to add and remove tests.""" 2 | 3 | import os 4 | import logging 5 | from contextlib import contextmanager 6 | import pytest 7 | from airflow.models import DagBag 8 | 9 | 10 | @contextmanager 11 | def suppress_logging(namespace): 12 | logger = logging.getLogger(namespace) 13 | old_value = logger.disabled 14 | logger.disabled = True 15 | try: 16 | yield 17 | finally: 18 | logger.disabled = old_value 19 | 20 | 21 | def get_import_errors(): 22 | """ 23 | Generate a tuple for import errors in the dag bag 24 | """ 25 | with suppress_logging("airflow"): 26 | dag_bag = DagBag(include_examples=False) 27 | 28 | def strip_path_prefix(path): 29 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 30 | 31 | # prepend "(None,None)" to ensure that a test object is always created even if it's a no op. 
32 | return [(None, None)] + [ 33 | (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items() 34 | ] 35 | 36 | 37 | def get_dags(): 38 | """ 39 | Generate a tuple of dag_id, in the DagBag 40 | """ 41 | with suppress_logging("airflow"): 42 | dag_bag = DagBag(include_examples=False) 43 | 44 | def strip_path_prefix(path): 45 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 46 | 47 | return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()] 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 52 | ) 53 | def test_file_imports(rel_path, rv): 54 | """Test for import errors on a file""" 55 | if rel_path and rv: 56 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 57 | 58 | 59 | APPROVED_TAGS = {} 60 | 61 | 62 | @pytest.mark.parametrize( 63 | "dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()] 64 | ) 65 | def test_dag_tags(dag_id, dag, fileloc): 66 | """ 67 | test if a DAG is tagged and if those TAGs are in the approved list 68 | """ 69 | assert dag.tags, f"{dag_id} in {fileloc} has no tags" 70 | if APPROVED_TAGS: 71 | assert not set(dag.tags) - APPROVED_TAGS 72 | 73 | 74 | @pytest.mark.parametrize( 75 | "dag_id,dag, fileloc", get_dags(), ids=[x[2] for x in get_dags()] 76 | ) 77 | def test_dag_retries(dag_id, dag, fileloc): 78 | """ 79 | test if a DAG has retries set 80 | """ 81 | assert ( 82 | dag.default_args.get("retries", None) >= 2 83 | ), f"{dag_id} in {fileloc} must have task retries >= 2." 84 | -------------------------------------------------------------------------------- /astro/dags/bronze_ingestion.py: -------------------------------------------------------------------------------- 1 | # from datetime import datetime, timedelta 2 | from datetime import datetime, timedelta 3 | import os 4 | import logging 5 | from airflow.decorators import dag, task 6 | from airflow.operators.dagrun_operator import TriggerDagRunOperator 7 | from dotenv import load_dotenv 8 | 9 | from include.lib.aws_manager import AWSManager 10 | from include.lib.duckdb_manager import DuckDBManager 11 | from include.lib.motherduck_manager import MotherDuckManager 12 | from include.ingestion.bronze.cards import DataManager 13 | 14 | 15 | # Load environment variables 16 | load_dotenv() 17 | 18 | # Configuration 19 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 20 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 21 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 22 | AWS_REGION = os.getenv("AWS_REGION") 23 | RAW_S3_PATH = os.getenv("RAW_S3_PATH") 24 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 25 | LOCAL_PATH = "data/bronze/" 26 | TABLE_NAME = "cards" 27 | BRONZE_SCHEMA = "bronze" 28 | LOCAL_DATABASE = "memory" 29 | REMOTE_DATABASE = "magic_the_gathering" 30 | 31 | default_args = { 32 | 'owner': 'airflow', 33 | 'depends_on_past': True, 34 | 'start_date': datetime(2024, 4, 1), 35 | 'retries': 2, 36 | 'retry_delay': timedelta(minutes=5), 37 | } 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | @dag(default_args=default_args, catchup=False, tags=['bronze_cards']) 43 | def ingestor_bronze(): 44 | """ 45 | Airflow DAG for ingesting data from an API, parsing it, and saving it. 
46 | """ 47 | 48 | @task 49 | def bronze_cards(): 50 | try: 51 | duckdb_manager = DuckDBManager() 52 | MotherDuckManager( 53 | duckdb_manager, MOTHERDUCK_TOKEN 54 | ) 55 | AWSManager( 56 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 57 | ) 58 | data_manager = DataManager( 59 | duckdb_manager, 60 | LOCAL_DATABASE, 61 | REMOTE_DATABASE, 62 | BRONZE_SCHEMA, 63 | TABLE_NAME, 64 | LOCAL_PATH, 65 | RAW_S3_PATH, 66 | BRONZE_S3_PATH, 67 | ) 68 | 69 | logger.info("START CREATING TABLE FROM S3") 70 | data_manager.create_table_from_json_file() 71 | logger.info("START SAVING DATA LOCALLY") 72 | data_manager.save_to_local() 73 | logger.info("START SAVING DATA TO S3") 74 | data_manager.save_to_s3() 75 | logger.info("START SAVING DATA TO MD") 76 | data_manager.save_to_md() 77 | except Exception as e: 78 | logger.error(f"An error occurred: {e}") 79 | 80 | bronze_cards_task = bronze_cards() 81 | 82 | # Define task to trigger the Silver DAG 83 | trigger_silver_task = TriggerDagRunOperator( 84 | task_id='trigger_silver', 85 | trigger_dag_id='ingestor_silver', 86 | wait_for_completion=True, 87 | deferrable=True, 88 | ) 89 | 90 | # Define task dependencies 91 | bronze_cards_task >> trigger_silver_task 92 | 93 | 94 | # Instantiate the DAG 95 | ingestor_bronze_dag = ingestor_bronze() 96 | -------------------------------------------------------------------------------- /astro/dags/silver_ingestion.py: -------------------------------------------------------------------------------- 1 | # from datetime import datetime, timedelta 2 | from datetime import datetime, timedelta 3 | import os 4 | import logging 5 | from airflow.decorators import dag, task 6 | from airflow.operators.dagrun_operator import TriggerDagRunOperator 7 | from dotenv import load_dotenv 8 | 9 | from include.lib.aws_manager import AWSManager 10 | from include.lib.duckdb_manager import DuckDBManager 11 | from include.lib.motherduck_manager import MotherDuckManager 12 | from include.ingestion.silver.cards import DataManager 13 | 14 | 15 | # Load environment variables 16 | load_dotenv() 17 | 18 | # Configuration 19 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 20 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 21 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 22 | AWS_REGION = os.getenv("AWS_REGION") 23 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 24 | SILVER_S3_PATH = os.getenv("SILVER_S3_PATH") 25 | LOCAL_PATH = "data/silver/" 26 | TABLE_NAME = "cards" 27 | SILVER_SCHEMA = "silver" 28 | LOCAL_DATABASE = "memory" 29 | REMOTE_DATABASE = "magic_the_gathering" 30 | 31 | default_args = { 32 | 'owner': 'airflow', 33 | 'depends_on_past': True, 34 | 'start_date': datetime(2024, 4, 1), 35 | 'retries': 2, 36 | 'retry_delay': timedelta(minutes=5), 37 | } 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | @dag(default_args=default_args, catchup=False, tags=['silver_cards']) 43 | def ingestor_silver(): 44 | """ 45 | Airflow DAG for ingesting data from an API, parsing it, and saving it.
46 | """ 47 | 48 | @task 49 | def silver_cards(): 50 | try: 51 | duckdb_manager = DuckDBManager() 52 | MotherDuckManager(duckdb_manager, MOTHERDUCK_TOKEN) 53 | AWSManager( 54 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 55 | ) 56 | data_manager = DataManager( 57 | duckdb_manager, 58 | LOCAL_DATABASE, 59 | REMOTE_DATABASE, 60 | SILVER_SCHEMA, 61 | TABLE_NAME, 62 | LOCAL_PATH, 63 | BRONZE_S3_PATH, 64 | SILVER_S3_PATH, 65 | ) 66 | 67 | logger.info("START CREATING TABLE FROM S3") 68 | data_manager.create_table_from_bronze() 69 | logger.info("START SAVING DATA LOCALLY") 70 | data_manager.save_to_local() 71 | logger.info("START SAVING DATA TO S3") 72 | data_manager.save_to_s3() 73 | logger.info("START SAVING DATA TO MD") 74 | data_manager.save_to_md() 75 | except Exception as e: 76 | logger.error(f"An error occurred: {e}") 77 | 78 | 79 | # Define the silver ingestion task 80 | silver_cards_task = silver_cards() 81 | 82 | # Define task to trigger the Gold DAG 83 | trigger_gold_task = TriggerDagRunOperator( 84 | task_id='trigger_gold', 85 | trigger_dag_id='ingestor_gold', 86 | wait_for_completion=True, 87 | deferrable=True, 88 | ) 89 | 90 | # Define task dependencies 91 | silver_cards_task >> trigger_gold_task 92 | 93 | 94 | # Instantiate the DAG 95 | ingestor_silver_dag = ingestor_silver() 96 | 97 | -------------------------------------------------------------------------------- /astro/README.md: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | Welcome to Astronomer! This project was generated after you ran 'astro dev init' using the Astronomer CLI. This readme describes the contents of the project, as well as how to run Apache Airflow on your local machine. 5 | 6 | Project Contents 7 | ================ 8 | 9 | Your Astro project contains the following files and folders: 10 | 11 | - dags: This folder contains the Python files for your Airflow DAGs. By default, this directory includes two example DAGs: 12 | - `example_dag_basic`: This DAG shows a simple ETL data pipeline example with three TaskFlow API tasks that run daily. 13 | - `example_dag_advanced`: This advanced DAG showcases a variety of Airflow features like branching, Jinja templates, task groups and several Airflow operators. 14 | - Dockerfile: This file contains a versioned Astro Runtime Docker image that provides a differentiated Airflow experience. If you want to execute other commands or overrides at runtime, specify them here. 15 | - include: This folder contains any additional files that you want to include as part of your project. It is empty by default. 16 | - packages.txt: Install OS-level packages needed for your project by adding them to this file. It is empty by default. 17 | - requirements.txt: Install Python packages needed for your project by adding them to this file. It is empty by default. 18 | - plugins: Add custom or community plugins for your project to this file. It is empty by default. 19 | - airflow_settings.yaml: Use this local-only file to specify Airflow Connections, Variables, and Pools instead of entering them in the Airflow UI as you develop DAGs in this project. 20 | 21 | Deploy Your Project Locally 22 | =========================== 23 | 24 | 1. Start Airflow on your local machine by running 'astro dev start'.
25 | 26 | This command will spin up 4 Docker containers on your machine, each for a different Airflow component: 27 | 28 | - Postgres: Airflow's Metadata Database 29 | - Webserver: The Airflow component responsible for rendering the Airflow UI 30 | - Scheduler: The Airflow component responsible for monitoring and triggering tasks 31 | - Triggerer: The Airflow component responsible for triggering deferred tasks 32 | 33 | 2. Verify that all 4 Docker containers were created by running 'docker ps'. 34 | 35 | Note: Running 'astro dev start' will start your project with the Airflow Webserver exposed at port 8080 and Postgres exposed at port 5432. If you already have either of those ports allocated, you can either [stop your existing Docker containers or change the port](https://docs.astronomer.io/astro/test-and-troubleshoot-locally#ports-are-not-available). 36 | 37 | 3. Access the Airflow UI for your local Airflow project. To do so, go to http://localhost:8080/ and log in with 'admin' for both your Username and Password. 38 | 39 | You should also be able to access your Postgres Database at 'localhost:5432/postgres'. 40 | 41 | Deploy Your Project to Astronomer 42 | ================================= 43 | 44 | If you have an Astronomer account, pushing code to a Deployment on Astronomer is simple. For deploying instructions, refer to Astronomer documentation: https://docs.astronomer.io/cloud/deploy-code/ 45 | 46 | Contact 47 | ======= 48 | 49 | The Astronomer CLI is maintained with love by the Astronomer team. To report a bug or suggest a change, reach out to our support. 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | # lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Data Folder 163 | data/ 164 | 165 | # Terraform environment variables 166 | terraform/.terraform* 167 | terraform/terraform.tfvars 168 | terraform/terraform.tfstate 169 | terraform/*.backup 170 | 171 | # Ruff 172 | .ruff_cache 173 | 174 | # Database 175 | *.db* -------------------------------------------------------------------------------- /docs/info_dataset.md: -------------------------------------------------------------------------------- 1 | # Information about the dataset fields 2 | 3 | Datasets columns info is listed below and also on the official site [here](https://scryfall.com/docs/api/cards). 4 | 5 | ## Cards Table 6 | 7 | | Column Name | Description | Type | 8 | |-------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| 9 | | cards_id | A unique ID for this card in Scryfall’s database. | String | 10 | | name | The name of this card. If this card has multiple faces, this field will contain both names separated by ␣//␣. 
| String | 11 | | released_at | The date this card was first released. | Date | 12 | | color_identity | This card’s color identity. | String | 13 | | set_name | This card’s full set name. | String | 14 | | artist | The name of the illustrator of this card face. Newly spoiled cards may not have this field yet. | String | 15 | | usd_price | Daily price information for this card. | Float | -------------------------------------------------------------------------------- /astro/include/ingestion/silver/cards.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class DataManager: 8 | """ 9 | Manages data operations. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | duckdb_manager, 15 | local_database: str, 16 | remote_database: str, 17 | silver_schema: str, 18 | table_name: str, 19 | local_path: str, 20 | bronze_s3_path: str, 21 | silver_s3_path: str, 22 | ): 23 | """ 24 | Initializes DataManager. 25 | 26 | Args: 27 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 28 | """ 29 | self.duckdb_manager = duckdb_manager 30 | self.local_database = local_database 31 | self.remote_database = remote_database 32 | self.silver_schema = silver_schema 33 | self.table_name = table_name 34 | self.local_path = local_path 35 | self.bronze_s3_path = bronze_s3_path 36 | self.silver_s3_path = silver_s3_path 37 | 38 | def create_table_from_bronze(self) -> None: 39 | """ 40 | Creates a table from a JSON file stored in S3. 41 | 42 | Returns: 43 | None 44 | """ 45 | try: 46 | logger.info("Creating cards table locally") 47 | query = f""" 48 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 49 | SELECT 50 | id AS card_id 51 | , name 52 | , released_at 53 | , color_identity 54 | , set_name 55 | , artist 56 | , (prices).usd AS usd_price 57 | FROM read_parquet('{self.bronze_s3_path}{self.table_name}.parquet'); 58 | """ 59 | self.duckdb_manager.execute_query(query) 60 | logger.info("Cards table created!") 61 | except Exception as e: 62 | logger.error(f"Error creating table from JSON file: {e}") 63 | 64 | def save_to_local(self) -> None: 65 | """ 66 | Saves data to local disk. 67 | 68 | Returns: 69 | None 70 | """ 71 | try: 72 | logger.info("Saving cards table as parquet format locally") 73 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 74 | query = f""" 75 | COPY ( 76 | SELECT 77 | * 78 | FROM {self.local_database}.{self.table_name} 79 | ) 80 | TO '{self.local_path}{self.table_name}.parquet' 81 | (FORMAT PARQUET) 82 | """ 83 | self.duckdb_manager.execute_query(query) 84 | logger.info("Cards table saved locally!") 85 | except Exception as e: 86 | logger.error(f"Error saving to local: {e}") 87 | 88 | def save_to_s3(self) -> None: 89 | """ 90 | Saves data to Amazon S3. 91 | 92 | Returns: 93 | None 94 | """ 95 | try: 96 | logger.info("Saving cards table to s3 as parquet") 97 | query = f""" 98 | COPY ( 99 | SELECT 100 | * 101 | FROM {self.local_database}.{self.table_name} 102 | ) 103 | TO '{self.silver_s3_path}{self.table_name}.parquet' 104 | (FORMAT PARQUET) 105 | """ 106 | self.duckdb_manager.execute_query(query) 107 | logger.info("Cards table saved to s3!") 108 | except Exception as e: 109 | logger.error(f"Error saving to S3: {e}") 110 | 111 | def save_to_md(self) -> None: 112 | """ 113 | Saves data to MotherDuck. 
114 | 115 | Returns: 116 | None 117 | """ 118 | try: 119 | logger.info("Saving cards table to Mother Duck") 120 | self.duckdb_manager.execute_query( 121 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.silver_schema};" 122 | ) 123 | query = f""" 124 | CREATE OR REPLACE TABLE {self.remote_database}.{self.silver_schema}.{self.table_name} AS 125 | SELECT 126 | * 127 | FROM {self.local_database}.{self.table_name}; 128 | """ 129 | self.duckdb_manager.execute_query(query) 130 | logger.info("Cards table saved!") 131 | except Exception as e: 132 | logger.error(f"Error saving to MotherDuck: {e}") 133 | -------------------------------------------------------------------------------- /astro/.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. **USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | 3 | from contextlib import contextmanager 4 | import logging 5 | import os 6 | 7 | import pytest 8 | 9 | from airflow.models import DagBag, Variable, Connection 10 | from airflow.hooks.base import BaseHook 11 | from airflow.utils.db import initdb 12 | 13 | # init airflow database 14 | initdb() 15 | 16 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 17 | 18 | 19 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 20 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 21 | print( 22 | f"Attempted to fetch connection during parse returning an empty Connection object for {key}" 23 | ) 24 | return Connection(key) 25 | 26 | 27 | BaseHook.get_connection = basehook_get_connection_monkeypatch 28 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 29 | 30 | 31 | # =========== MONKEYPATCH OS.GETENV() =========== 32 | def os_getenv_monkeypatch(key: str, *args, **kwargs): 33 | default = None 34 | if args: 35 | default = args[0] # os.getenv should get at most 1 arg after the key 36 | if kwargs: 37 | default = kwargs.get( 38 | "default", None 39 | ) # and sometimes kwarg if people are using the sig 40 | 41 | env_value = os.environ.get(key, None) 42 | 43 | if env_value: 44 | return env_value # if the env_value is set, return it 45 | if ( 46 | key == "JENKINS_HOME" and default is None 47 | ): # fix https://github.com/astronomer/astro-cli/issues/601 48 | return None 49 | if default: 50 | return default # otherwise return whatever default has been passed 51 | return f"MOCKED_{key.upper()}_VALUE" # if absolutely nothing has been passed - return the mocked value 52 | 53 | 54 | os.getenv = os_getenv_monkeypatch 55 | # # =========== /MONKEYPATCH OS.GETENV() =========== 56 | 57 | # =========== MONKEYPATCH VARIABLE.GET() =========== 58 | 59 | 60 | class magic_dict(dict): 61 | def __init__(self, *args, **kwargs): 62 | self.update(*args, **kwargs) 63 | 64 | def __getitem__(self, key): 65 | return {}.get(key, "MOCKED_KEY_VALUE") 66 | 67 | 68 | _no_default = object() # allow falsey defaults 69 | 70 | 71 | def variable_get_monkeypatch(key: str, default_var=_no_default, deserialize_json=False): 72 | print( 73 | f"Attempted to get Variable value during parse, returning a mocked value for {key}" 74 | ) 75 | 76 | if default_var is not _no_default: 77 | return default_var 78 | if deserialize_json: 79 | return magic_dict() 80 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 81 | 82 | 83 | Variable.get = variable_get_monkeypatch 84 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 85 | 86 | 87 | @contextmanager 88 | def 
suppress_logging(namespace): 89 | """ 90 | Suppress logging within a specific namespace to keep tests "clean" during build 91 | """ 92 | logger = logging.getLogger(namespace) 93 | old_value = logger.disabled 94 | logger.disabled = True 95 | try: 96 | yield 97 | finally: 98 | logger.disabled = old_value 99 | 100 | 101 | def get_import_errors(): 102 | """ 103 | Generate a tuple for import errors in the dag bag, and include DAGs without errors. 104 | """ 105 | with suppress_logging("airflow"): 106 | dag_bag = DagBag(include_examples=False) 107 | 108 | def strip_path_prefix(path): 109 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 110 | 111 | # Initialize an empty list to store the tuples 112 | result = [] 113 | 114 | # Iterate over the items in import_errors 115 | for k, v in dag_bag.import_errors.items(): 116 | result.append((strip_path_prefix(k), v.strip())) 117 | 118 | # Check if there are DAGs without errors 119 | for file_path in dag_bag.dags: 120 | # Check if the file_path is not in import_errors, meaning no errors 121 | if file_path not in dag_bag.import_errors: 122 | result.append((strip_path_prefix(file_path), "No import errors")) 123 | 124 | return result 125 | 126 | 127 | @pytest.mark.parametrize( 128 | "rel_path, rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 129 | ) 130 | def test_file_imports(rel_path, rv): 131 | """Test for import errors on a file""" 132 | if rv != "No import errors": 133 | # If rv is not "No import errors," consider it a failed test 134 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 135 | else: 136 | # If rv is "No import errors," consider it a passed test 137 | print(f"{rel_path} passed the import test") 138 | -------------------------------------------------------------------------------- /astro/include/ingestion/bronze/cards.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class DataManager: 8 | """ 9 | Manages data operations. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | duckdb_manager, 15 | local_database: str, 16 | remote_database: str, 17 | bronze_schema: str, 18 | table_name: str, 19 | local_path: str, 20 | raw_s3_path: str, 21 | bronze_s3_path: str, 22 | ): 23 | """ 24 | Initializes DataManager. 25 | 26 | Args: 27 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 28 | """ 29 | self.duckdb_manager = duckdb_manager 30 | self.local_database = local_database 31 | self.remote_database = remote_database 32 | self.bronze_schema = bronze_schema 33 | self.table_name = table_name 34 | self.local_path = local_path 35 | self.raw_s3_path = raw_s3_path 36 | self.bronze_s3_path = bronze_s3_path 37 | 38 | def create_table_from_json_file(self) -> None: 39 | """ 40 | Creates a table from a JSON file stored in S3. 
41 | 42 | Returns: 43 | None 44 | """ 45 | try: 46 | logger.info("Creating cards table locally") 47 | query = f""" 48 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 49 | WITH ranked_cards AS ( 50 | SELECT 51 | * 52 | , ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC) AS row_num 53 | FROM read_json_auto('{self.raw_s3_path}{self.table_name}.json') 54 | ) 55 | SELECT 56 | * EXCLUDE (row_num) 57 | FROM ranked_cards 58 | WHERE row_num = 1; 59 | """ 60 | self.duckdb_manager.execute_query(query) 61 | logger.info("Cards table created!") 62 | except Exception as e: 63 | logger.error(f"Error creating table from JSON file: {e}") 64 | 65 | def save_to_local(self) -> None: 66 | """ 67 | Saves data to local disk. 68 | 69 | Returns: 70 | None 71 | """ 72 | try: 73 | logger.info("Saving cards table as parquet format locally") 74 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 75 | query = f""" 76 | COPY ( 77 | SELECT 78 | * 79 | FROM {self.local_database}.{self.table_name} 80 | ) 81 | TO '{self.local_path}{self.table_name}.parquet' 82 | (FORMAT PARQUET) 83 | """ 84 | self.duckdb_manager.execute_query(query) 85 | logger.info("Cards table saved locally!") 86 | except Exception as e: 87 | logger.error(f"Error saving to local: {e}") 88 | 89 | def save_to_s3(self) -> None: 90 | """ 91 | Saves data to Amazon S3. 92 | 93 | Returns: 94 | None 95 | """ 96 | try: 97 | logger.info("Saving cards table to s3 as parquet") 98 | query = f""" 99 | COPY ( 100 | SELECT 101 | * 102 | FROM {self.local_database}.{self.table_name} 103 | ) 104 | TO '{self.bronze_s3_path}{self.table_name}.parquet' 105 | (FORMAT PARQUET) 106 | """ 107 | self.duckdb_manager.execute_query(query) 108 | logger.info("Cards table saved to s3!") 109 | except Exception as e: 110 | logger.error(f"Error saving to S3: {e}") 111 | 112 | def save_to_md(self) -> None: 113 | """ 114 | Saves data to MotherDuck. 115 | 116 | Returns: 117 | None 118 | """ 119 | try: 120 | logger.info("Saving cards table to Mother Duck") 121 | self.duckdb_manager.execute_query( 122 | f"CREATE DATABASE IF NOT EXISTS {self.remote_database}" 123 | ) 124 | self.duckdb_manager.execute_query( 125 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.bronze_schema};" 126 | ) 127 | query = f""" 128 | CREATE OR REPLACE TABLE {self.remote_database}.{self.bronze_schema}.{self.table_name} AS 129 | SELECT 130 | * 131 | FROM {self.local_database}.{self.table_name}; 132 | """ 133 | self.duckdb_manager.execute_query(query) 134 | logger.info("Cards table saved!") 135 | except Exception as e: 136 | logger.error(f"Error saving to MotherDuck: {e}") 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Magic: The Gathering Data Pipeline - A Data Engineering Project 2 | --- 3 | 4 | ![BATCH ELT Architecture](images/magic_the_gathering.jpg) 5 | 6 | ### About the Game 7 | 8 | **Magic: The Gathering** (colloquially known as **Magic** or **MTG**) is a tabletop and digital collectible card game created by Richard Garfield. Released in 1993 by Wizards of the Coast, Magic was the first trading card game and had approximately fifty million players as of February 2023. Over twenty billion Magic cards were produced in the period from 2008 to 2016, during which time it grew in popularity. 
As of the 2022 fiscal year, Magic generates over $1 billion in revenue annually - [Wikipedia](https://en.wikipedia.org/wiki/Magic:_The_Gathering) 9 | 10 | The goal of this project is to build an end-to-end batch data pipeline on Magic: The Gathering Data available at [Scryfall](https://scryfall.com/). In addition, ELT (Extract, Load, Transform) is performed daily in order to analyze the cards information available from the game's beginnings up to the present. 11 | 12 | ### Table of contents 13 | 14 | - [Problem statement](#problem-statement) 15 | - [Dataset](#dataset) 16 | - [Proposed Solution](#proposed-solution) 17 | - [Data Pipeline Overview](#data-pipeline-overview) 18 | - [Technologies](#technologies) 19 | - [Architecture](#architecture) 20 | - [ELT Steps](#elt-steps) 21 | - [The Dashboard](#the-dashboard) 22 | - [Conclusion](#conclusion) 23 | 24 | ## Problem statement 25 | --- 26 | * ### ***Data***: 27 | The Data selected for this project is the `Magic: The Gathering` card data obtained from [Scryfall](https://scryfall.com/). It includes the latest cards information available, with data going back to 1993, and is extracted via the Scryfall API. 28 | 29 | The columns in the Datasets and their descriptions are available [here](docs/info_dataset.md) 30 | 31 | * ### ***Proposed Solution***: 32 | This project aims at extracting this data from the source via API and building a BATCH ELT which will be scheduled to run daily and update the connected Dashboard for daily Analytics & Reporting. 33 | 34 | 35 | ## Data Pipeline Overview 36 | --- 37 | This is a Batch Pipeline which will perform ELT every day at 09:00 am. 38 | 39 | The ELT steps include: 40 | 41 | * **Extract** dataset from Scryfall via API and load the data into the Datalake 42 | * **Clean** data and load the data into Datalake 43 | * **Load** the data from Datalake into external tables in the Datawarehouse 44 | * **Transform** the data in the Datawarehouse 45 | * **Visualize** the data by creating a Dashboard 46 | 47 | ## Data Pipeline with Medallion Architecture 48 | --- 49 | * **RAW:** where the raw data is placed as soon as it is collected 50 | * **BRONZE:** data treated and ready to be consumed 51 | * **SILVER:** data processed and can be consumed easily 52 | * **GOLD:** data made available from analyses and models, which can be consumed by BI or DataViz tools 53 | 54 | ## Technologies 55 | --- 56 | * Cloud: ***AWS*** 57 | * Infrastructure as code (IaC): ***Terraform*** 58 | * Workflow orchestration: ***Astronomer + Airflow*** 59 | * Data Warehouse: ***MotherDuck*** 60 | * Batch processing: ***DuckDb*** 61 | * Data Transformation: ***dbt-core*** 62 | * DataViz: ***Preset*** 63 | * Virtual Environment: ***Poetry*** 64 | * CICD: ***Git*** 65 | 66 | ## Architecture 67 | --- 68 | Pipeline 69 | 70 | 71 | ![BATCH ELT Architecture](images/magic_the_gathering_pipeline_etl.png) 72 | 73 | 74 | ## ELT Steps 75 | 76 | Steps in the ELT are as follows: 77 | 78 | 1. A Project is created on ***GitHub*** 79 | 2. Infrastructure for the Project is created using ***Terraform*** which creates the following: 80 | * Datalake: ***S3 Bucket*** where the raw and cleaned data will be stored 81 | 3. The Pipeline for ELT is created and is scheduled for daily execution.
It is orchestrated via ***Astronomer + Airflow***, which does the following tasks: 82 | * Extracts raw data from source via ***Scryfall API*** 83 | * Loads raw data as json file to ***S3 Bucket*** 84 | * Cleans the raw data using ***DuckDb*** 85 | * Loads the cleaned data as parquet files to ***S3*** 86 | * Creates External tables in the Datasets in ***MotherDuck*** by pulling data from ***S3***. 87 | * Transforms Data from ***S3*** using ***dbt-core*** and creates the following in the dev/prod Dataset (along with Tests and Documentation) 88 | - The view: `stg_cards` 89 | - The fact table: `fact_cards` 90 | 4. Transformed Data from ***MotherDuck*** is used for Reporting and Visualization using ***Preset*** to produce Dashboards 91 | 92 | ## The Dashboard: 93 | --- 94 | 95 | ![image](images/magic_the_gathering_dashboard.jpg) 96 | 97 | ## Conclusion 98 | --- 99 | Through this project we were able to successfully build an end-to-end ELT pipeline which is scheduled to run daily. As a result we have a daily updated **MTG cards** dataset which can be visualized via the Dashboard on **Preset**. This helps us get useful insights on the latest cards information. 100 | 101 | [Back To Top](#magic-the-gathering-data-pipeline-a-data-engineering-project) 102 | -------------------------------------------------------------------------------- /astro/include/ingestion/raw/cards.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import os 4 | import sys 5 | from typing import List, Optional 6 | import requests 7 | import boto3 8 | import logging 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 11 | 12 | from models.card import Card 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class APIClient: 18 | """ 19 | Class for interacting with an API. 20 | """ 21 | 22 | def __init__(self, base_url: str, dataset: str) -> None: 23 | self.base_url = base_url 24 | self.dataset = dataset 25 | 26 | def fetch_bulk_data(self) -> Optional[requests.Response]: 27 | """ 28 | Fetches data from the API. 29 | 30 | Args: 31 | dataset (str): Name of the dataset to fetch. 32 | 33 | Returns: 34 | dict: Response JSON data. 35 | """ 36 | try: 37 | logger.info("Fetching bulk data") 38 | # Construct the URL for fetching data 39 | url = f"{self.base_url}{self.dataset}" 40 | # Send HTTP GET request to fetch data 41 | response = requests.get(url, timeout=5) 42 | # Raise an exception for HTTP errors 43 | response.raise_for_status() 44 | logger.info(f"Data fetched successfully from {url}") 45 | return response 46 | except requests.exceptions.RequestException as e: 47 | logger.error(f"Failed to fetch data: {e}") 48 | return None 49 | 50 | def fetch_cards_data( 51 | self, response: requests.Response 52 | ) -> Optional[requests.Response]: 53 | """ 54 | Fetches data from the bulk data download URI 55 | 56 | Args: 57 | response (Response): Response object from the API. 58 | 59 | Returns: 60 | dict: Response JSON data.
61 | """ 62 | try: 63 | logger.info("Fetching cards data") 64 | # Extract relevant data and update timestamp from the API response 65 | download_uri = response.json()["download_uri"] 66 | data = requests.get(download_uri) 67 | update_timestamp = response.json()["updated_at"] 68 | # Parse the update timestamp to extract the date 69 | update_date = datetime.strptime( 70 | update_timestamp, "%Y-%m-%dT%H:%M:%S.%f%z" 71 | ).date() 72 | logger.info( 73 | f"Data fetched successfully from {download_uri} - Update date: {update_date}" 74 | ) 75 | return data 76 | except KeyError as e: 77 | logger.error(f"Failed to extract data from API response: {e}") 78 | return None 79 | except ValueError as e: 80 | logger.error(f"Failed to parse timestamp: {e}") 81 | return None 82 | 83 | 84 | class DataParser: 85 | """ 86 | Class for parsing data. 87 | """ 88 | 89 | @staticmethod 90 | def parse_cards(data: requests.Response) -> Optional[List[Card]]: 91 | """ 92 | Parses JSON data into instances of the Card model. 93 | 94 | Args: 95 | data (dict): JSON data to parse. 96 | 97 | Returns: 98 | List[Card]: List of parsed Card instances. 99 | """ 100 | try: 101 | logger.info("Parsing data") 102 | cards_data = data.json() 103 | parsed_cards = [Card(**card) for card in cards_data] 104 | logger.info("Data parsing successful!") 105 | return parsed_cards 106 | except Exception as e: 107 | logger.error(f"An unexpected error occurred: {e}") 108 | return None 109 | 110 | 111 | class DataSaver: 112 | """ 113 | Class for saving data. 114 | """ 115 | 116 | def __init__( 117 | self, 118 | table_path: str, 119 | table_name: str, 120 | bucket_name: Optional[str], 121 | access_key_id: Optional[str], 122 | secret_access_key: Optional[str], 123 | ): 124 | self.table_path = table_path 125 | self.table_name = table_name 126 | self.bucket_name = bucket_name 127 | if bucket_name: 128 | self.s3_client = boto3.client( 129 | "s3", 130 | aws_access_key_id=access_key_id, 131 | aws_secret_access_key=secret_access_key, 132 | ) 133 | 134 | def save_local(self, data: List) -> None: 135 | """ 136 | Saves parsed data to a local file. 137 | 138 | Args: 139 | data (list): List of parsed data. 140 | table_path (str): Path to store local data. 141 | table_name (str): Name of the table. 142 | 143 | Returns: 144 | None 145 | """ 146 | try: 147 | logger.info("Saving data locally") 148 | path = f"{self.table_path}{self.table_name}.json" 149 | os.makedirs(os.path.dirname(path), exist_ok=True) 150 | with open(path, "w") as file: 151 | json.dump([item.dict() for item in data], file, indent=4) 152 | logger.info(f"Data saved locally to {path}") 153 | except Exception as e: 154 | logger.error(f"An error occurred while saving data locally: {e}") 155 | 156 | def save_s3(self, data: List) -> None: 157 | """ 158 | Saves parsed data to AWS S3 bucket. 159 | 160 | Args: 161 | data (list): List of parsed data. 162 | s3_client: Boto3 S3 client. 163 | bucket_name (str): Name of the AWS S3 bucket. 164 | table_path (str): Path to store data in S3. 165 | table_name (str): Name of the table. 
166 | 167 | Returns: 168 | None 169 | """ 170 | try: 171 | logger.info("Saving data to S3 bucket") 172 | json_bytes = json.dumps([item.dict() for item in data], indent=4).encode( 173 | "utf-8" 174 | ) 175 | key = f"{self.table_path}{self.table_name}.json" 176 | self.s3_client.put_object(Body=json_bytes, Bucket=self.bucket_name, Key=key) 177 | logger.info(f"Data saved successfully to S3 bucket: {self.bucket_name}") 178 | except Exception as e: 179 | logger.error(f"An error occurred while saving data to S3 bucket: {e}") 180 | -------------------------------------------------------------------------------- /ingestion/silver/cards.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import os 3 | import sys 4 | from dotenv import load_dotenv 5 | from loguru import logger 6 | from typing import Any 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 9 | 10 | from lib import duckdb_manager, motherduck_manager, aws_manager 11 | 12 | load_dotenv() 13 | 14 | # Load environment variables 15 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 16 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 17 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 18 | AWS_REGION = os.getenv("AWS_REGION") 19 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 20 | SILVER_S3_PATH = os.getenv("SILVER_S3_PATH") 21 | LOCAL_PATH = "data/silver/" 22 | TABLE_NAME = "cards" 23 | SILVER_SCHEMA = "silver" 24 | LOCAL_DATABASE = "memory" 25 | REMOTE_DATABASE = "magic_the_gathering" 26 | 27 | 28 | class DataManager: 29 | """ 30 | Manages data operations. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | duckdb_manager, 36 | local_database: str, 37 | remote_database: str, 38 | silver_schema: str, 39 | table_name: str, 40 | local_path: str, 41 | bronze_s3_path: str, 42 | silver_s3_path: str, 43 | ): 44 | """ 45 | Initializes DataManager. 46 | 47 | Args: 48 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 49 | """ 50 | self.duckdb_manager = duckdb_manager 51 | self.local_database = local_database 52 | self.remote_database = remote_database 53 | self.silver_schema = silver_schema 54 | self.table_name = table_name 55 | self.local_path = local_path 56 | self.bronze_s3_path = bronze_s3_path 57 | self.silver_s3_path = silver_s3_path 58 | 59 | def create_table_from_bronze(self) -> None: 60 | """ 61 | Creates a table from a JSON file stored in S3. 62 | 63 | Returns: 64 | None 65 | """ 66 | try: 67 | logger.info("Creating cards table locally") 68 | query = f""" 69 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 70 | SELECT 71 | id AS card_id 72 | , name 73 | , released_at 74 | , color_identity 75 | , set_name 76 | , artist 77 | , (prices).usd AS usd_price 78 | FROM read_parquet('{self.bronze_s3_path}{self.table_name}.parquet'); 79 | """ 80 | self.duckdb_manager.execute_query(query) 81 | logger.success("Cards table created!") 82 | except Exception as e: 83 | logger.error(f"Error creating table from JSON file: {e}") 84 | 85 | def save_to_local(self) -> None: 86 | """ 87 | Saves data to local disk. 
88 | 89 | Returns: 90 | None 91 | """ 92 | try: 93 | logger.info("Saving cards table as parquet format locally") 94 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 95 | query = f""" 96 | COPY ( 97 | SELECT 98 | * 99 | FROM {self.local_database}.{self.table_name} 100 | ) 101 | TO '{self.local_path}{self.table_name}.parquet' 102 | (FORMAT PARQUET) 103 | """ 104 | self.duckdb_manager.execute_query(query) 105 | logger.success("Cards table saved locally!") 106 | except Exception as e: 107 | logger.error(f"Error saving to local: {e}") 108 | 109 | def save_to_s3(self) -> None: 110 | """ 111 | Saves data to Amazon S3. 112 | 113 | Returns: 114 | None 115 | """ 116 | try: 117 | logger.info("Saving cards table to s3 as parquet") 118 | query = f""" 119 | COPY ( 120 | SELECT 121 | * 122 | FROM {self.local_database}.{self.table_name} 123 | ) 124 | TO '{self.silver_s3_path}{self.table_name}.parquet' 125 | (FORMAT PARQUET) 126 | """ 127 | self.duckdb_manager.execute_query(query) 128 | logger.success("Cards table saved to s3!") 129 | except Exception as e: 130 | logger.error(f"Error saving to S3: {e}") 131 | 132 | def save_to_md(self) -> None: 133 | """ 134 | Saves data to MotherDuck. 135 | 136 | Returns: 137 | None 138 | """ 139 | try: 140 | logger.info("Saving cards table to Mother Duck") 141 | self.duckdb_manager.execute_query( 142 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.silver_schema};" 143 | ) 144 | query = f""" 145 | CREATE OR REPLACE TABLE {self.remote_database}.{self.silver_schema}.{self.table_name} AS 146 | SELECT 147 | * 148 | FROM {self.local_database}.{self.table_name}; 149 | """ 150 | self.duckdb_manager.execute_query(query) 151 | logger.info("Cards table saved!") 152 | except Exception as e: 153 | logger.error(f"Error saving to MotherDuck: {e}") 154 | 155 | 156 | class Ingestor: 157 | """ 158 | Orchestrates the entire data ingestion process. 159 | """ 160 | 161 | def __init__( 162 | self, 163 | duckdb_manager, 164 | motherduck_manager, 165 | aws_manager, 166 | data_manager, 167 | ): 168 | """ 169 | Initializes Ingestor. 170 | """ 171 | self.duckdb_manager = duckdb_manager 172 | self.motherduck_manager = motherduck_manager 173 | self.aws_manager = aws_manager 174 | self.data_manager = data_manager 175 | 176 | def execute(self) -> None: 177 | """ 178 | Ingests data by executing the entire data ingestion process. 
179 | 180 | Returns: 181 | None 182 | """ 183 | try: 184 | logger.info("Starting ingestion") 185 | self.data_manager.create_table_from_bronze() 186 | self.data_manager.save_to_local() 187 | self.data_manager.save_to_s3() 188 | self.data_manager.save_to_md() 189 | logger.success("Ingestion completed!") 190 | except Exception as e: 191 | logger.error(f"Error executing data ingestion process: {e}") 192 | 193 | 194 | if __name__ == "__main__": 195 | # Create instances of classed 196 | duckdb_manager = duckdb_manager.DuckDBManager() 197 | motherduck_manager = motherduck_manager.MotherDuckManager(duckdb_manager, MOTHERDUCK_TOKEN) 198 | aws_manager = aws_manager.AWSManager( 199 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 200 | ) 201 | data_manager = DataManager( 202 | duckdb_manager, 203 | LOCAL_DATABASE, 204 | REMOTE_DATABASE, 205 | SILVER_SCHEMA, 206 | TABLE_NAME, 207 | LOCAL_PATH, 208 | BRONZE_S3_PATH, 209 | SILVER_S3_PATH, 210 | ) 211 | 212 | # Creating an instance of DataIngestor and execute the ingestion process 213 | ingestor = Ingestor(duckdb_manager, motherduck_manager, aws_manager, data_manager) 214 | ingestor.execute() 215 | -------------------------------------------------------------------------------- /ingestion/bronze/cards.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import os 3 | import sys 4 | from dotenv import load_dotenv 5 | from loguru import logger 6 | from typing import Any 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 9 | 10 | from lib import duckdb_manager, motherduck_manager, aws_manager 11 | 12 | load_dotenv() 13 | 14 | # Load environment variables 15 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 16 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 17 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 18 | AWS_REGION = os.getenv("AWS_REGION") 19 | RAW_S3_PATH = os.getenv("RAW_S3_PATH") 20 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 21 | LOCAL_PATH = "data/bronze/" 22 | TABLE_NAME = "cards" 23 | BRONZE_SCHEMA = "bronze" 24 | LOCAL_DATABASE = "memory" 25 | REMOTE_DATABASE = "magic_the_gathering" 26 | 27 | 28 | class DataManager: 29 | """ 30 | Manages data operations. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | duckdb_manager, 36 | local_database: str, 37 | remote_database: str, 38 | bronze_schema: str, 39 | table_name: str, 40 | local_path: str, 41 | raw_s3_path: str, 42 | bronze_s3_path: str, 43 | ): 44 | """ 45 | Initializes DataManager. 46 | 47 | Args: 48 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 49 | """ 50 | self.duckdb_manager = duckdb_manager 51 | self.local_database = local_database 52 | self.remote_database = remote_database 53 | self.bronze_schema = bronze_schema 54 | self.table_name = table_name 55 | self.local_path = local_path 56 | self.raw_s3_path = raw_s3_path 57 | self.bronze_s3_path = bronze_s3_path 58 | 59 | def create_table_from_json_file(self) -> None: 60 | """ 61 | Creates a table from a JSON file stored in S3. 
62 | 63 | Returns: 64 | None 65 | """ 66 | try: 67 | logger.info("Creating cards table locally") 68 | query = f""" 69 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 70 | WITH ranked_cards AS ( 71 | SELECT 72 | * 73 | , ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC) AS row_num 74 | FROM read_json_auto('{self.raw_s3_path}{self.table_name}.json') 75 | ) 76 | SELECT 77 | * EXCLUDE (row_num) 78 | FROM ranked_cards 79 | WHERE row_num = 1; 80 | """ 81 | self.duckdb_manager.execute_query(query) 82 | logger.success("Cards table created!") 83 | except Exception as e: 84 | logger.error(f"Error creating table from JSON file: {e}") 85 | 86 | def save_to_local(self) -> None: 87 | """ 88 | Saves data to local disk. 89 | 90 | Returns: 91 | None 92 | """ 93 | try: 94 | logger.info("Saving cards table as parquet format locally") 95 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 96 | query = f""" 97 | COPY ( 98 | SELECT 99 | * 100 | FROM {self.local_database}.{self.table_name} 101 | ) 102 | TO '{self.local_path}{self.table_name}.parquet' 103 | (FORMAT PARQUET) 104 | """ 105 | self.duckdb_manager.execute_query(query) 106 | logger.success("Cards table saved locally!") 107 | except Exception as e: 108 | logger.error(f"Error saving to local: {e}") 109 | 110 | def save_to_s3(self) -> None: 111 | """ 112 | Saves data to Amazon S3. 113 | 114 | Returns: 115 | None 116 | """ 117 | try: 118 | logger.info("Saving cards table to s3 as parquet") 119 | query = f""" 120 | COPY ( 121 | SELECT 122 | * 123 | FROM {self.local_database}.{self.table_name} 124 | ) 125 | TO '{self.bronze_s3_path}{self.table_name}.parquet' 126 | (FORMAT PARQUET) 127 | """ 128 | self.duckdb_manager.execute_query(query) 129 | logger.success("Cards table saved to s3!") 130 | except Exception as e: 131 | logger.error(f"Error saving to S3: {e}") 132 | 133 | def save_to_md(self) -> None: 134 | """ 135 | Saves data to MotherDuck. 136 | 137 | Returns: 138 | None 139 | """ 140 | try: 141 | logger.info("Saving cards table to Mother Duck") 142 | self.duckdb_manager.execute_query( 143 | f"CREATE DATABASE IF NOT EXISTS {self.remote_database}" 144 | ) 145 | self.duckdb_manager.execute_query( 146 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.bronze_schema};" 147 | ) 148 | query = f""" 149 | CREATE OR REPLACE TABLE {self.remote_database}.{self.bronze_schema}.{self.table_name} AS 150 | SELECT 151 | * 152 | FROM {self.local_database}.{self.table_name}; 153 | """ 154 | self.duckdb_manager.execute_query(query) 155 | logger.info("Cards table saved!") 156 | except Exception as e: 157 | logger.error(f"Error saving to MotherDuck: {e}") 158 | 159 | 160 | class Ingestor: 161 | """ 162 | Orchestrates the entire data ingestion process. 163 | """ 164 | 165 | def __init__( 166 | self, 167 | duckdb_manager, 168 | motherduck_manager, 169 | aws_manager, 170 | data_manager, 171 | ): 172 | """ 173 | Initializes Ingestor. 174 | """ 175 | self.duckdb_manager = duckdb_manager 176 | self.motherduck_manager = motherduck_manager 177 | self.aws_manager = aws_manager 178 | self.data_manager = data_manager 179 | 180 | def execute(self) -> None: 181 | """ 182 | Ingests data by executing the entire data ingestion process. 
183 | 184 | Returns: 185 | None 186 | """ 187 | try: 188 | logger.info("Starting ingestion") 189 | self.data_manager.create_table_from_json_file() 190 | self.data_manager.save_to_local() 191 | self.data_manager.save_to_s3() 192 | self.data_manager.save_to_md() 193 | logger.success("Ingestion completed!") 194 | except Exception as e: 195 | logger.error(f"Error executing data ingestion process: {e}") 196 | 197 | 198 | if __name__ == "__main__": 199 | # Create instances of classed 200 | duckdb_manager = duckdb_manager.DuckDBManager() 201 | motherduck_manager = motherduck_manager.MotherDuckManager( 202 | duckdb_manager, MOTHERDUCK_TOKEN 203 | ) 204 | aws_manager = aws_manager.AWSManager( 205 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 206 | ) 207 | data_manager = DataManager( 208 | duckdb_manager, 209 | LOCAL_DATABASE, 210 | REMOTE_DATABASE, 211 | BRONZE_SCHEMA, 212 | TABLE_NAME, 213 | LOCAL_PATH, 214 | RAW_S3_PATH, 215 | BRONZE_S3_PATH, 216 | ) 217 | 218 | # Creating an instance of DataIngestor and execute the ingestion process 219 | ingestor = Ingestor(duckdb_manager, motherduck_manager, aws_manager, data_manager) 220 | ingestor.execute() 221 | -------------------------------------------------------------------------------- /ingestion/raw/cards.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import os 4 | import sys 5 | from typing import Any, Dict, List, Optional 6 | import requests 7 | import boto3 8 | from dotenv import load_dotenv 9 | from loguru import logger 10 | 11 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 12 | 13 | from models.card import Card 14 | 15 | 16 | # Load environment variables 17 | load_dotenv() 18 | 19 | # Configuration 20 | API_BASE_URL = "https://api.scryfall.com/bulk-data/" 21 | DATASET_NAME = "default_cards" 22 | TABLE_NAME = "cards" 23 | TABLE_PATH = "data/raw/" 24 | AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME") 25 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 26 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 27 | 28 | 29 | class APIClient: 30 | """ 31 | Class for interacting with an API. 32 | """ 33 | 34 | def __init__(self, base_url: str, dataset: str) -> None: 35 | self.base_url = base_url 36 | self.dataset = dataset 37 | 38 | def fetch_bulk_data(self) -> Optional[requests.Response]: 39 | """ 40 | Fetches data from the API. 41 | 42 | Args: 43 | dataset (str): Name of the dataset to fetch. 44 | 45 | Returns: 46 | dict: Response JSON data. 47 | """ 48 | try: 49 | logger.info("Fetching bulk data") 50 | # Construct the URL for fetching data 51 | url = f"{self.base_url}{self.dataset}" 52 | # Send HTTP GET request to fetch data 53 | response = requests.get(url, timeout=5) 54 | # Raise an exception for HTTP errors 55 | response.raise_for_status() 56 | logger.success(f"Data fetched successfully from {url}") 57 | return response 58 | except requests.exceptions.RequestException as e: 59 | logger.error(f"Failed to fetch data: {e}") 60 | return None 61 | 62 | def fetch_cards_data( 63 | self, response: requests.Response 64 | ) -> Optional[requests.Response]: 65 | """ 66 | Fetchs data from the bulk data download uri 67 | 68 | Args: 69 | response (Response): Response object from the API. 70 | 71 | Returns: 72 | dict: Response JSON data. 
73 | """ 74 | try: 75 | logger.info("Fetching cards data") 76 | # Extract relevant data and update timestamp from the API response 77 | download_uri = response.json()["download_uri"] 78 | data = requests.get(download_uri) 79 | update_timestamp = response.json()["updated_at"] 80 | # Parse the update timestamp to extract the date 81 | update_date = datetime.strptime( 82 | update_timestamp, "%Y-%m-%dT%H:%M:%S.%f%z" 83 | ).date() 84 | logger.success( 85 | f"Data fetched successfully from {download_uri} - Update date: {update_date}" 86 | ) 87 | return data 88 | except KeyError as e: 89 | logger.error(f"Failed to extract data from API response: {e}") 90 | return None 91 | except ValueError as e: 92 | logger.error(f"Failed to parse timestamp: {e}") 93 | return None 94 | 95 | 96 | class DataParser: 97 | """ 98 | Class for parsing data. 99 | """ 100 | 101 | @staticmethod 102 | def parse_cards(data: requests.Response) -> Optional[List[Card]]: 103 | """ 104 | Parses JSON data into instances of the Card model. 105 | 106 | Args: 107 | data (dict): JSON data to parse. 108 | 109 | Returns: 110 | List[Card]: List of parsed Card instances. 111 | """ 112 | try: 113 | logger.info("Parsing data") 114 | cards_data = data.json() 115 | parsed_cards = [Card(**card) for card in cards_data] 116 | logger.success("Data parsing successful!") 117 | return parsed_cards 118 | except Exception as e: 119 | logger.error(f"An unexpected error occurred: {e}") 120 | return None 121 | 122 | 123 | class DataSaver: 124 | """ 125 | Class for saving data. 126 | """ 127 | 128 | def __init__( 129 | self, 130 | table_path: str, 131 | table_name: str, 132 | bucket_name: Optional[str], 133 | access_key_id: Optional[str], 134 | secret_access_key: Optional[str], 135 | ): 136 | self.table_path = table_path 137 | self.table_name = table_name 138 | self.bucket_name = bucket_name 139 | if bucket_name: 140 | self.s3_client = boto3.client( 141 | "s3", 142 | aws_access_key_id=access_key_id, 143 | aws_secret_access_key=secret_access_key, 144 | ) 145 | 146 | def save_local(self, data: List) -> None: 147 | """ 148 | Saves parsed data to a local file. 149 | 150 | Args: 151 | data (list): List of parsed data. 152 | table_path (str): Path to store local data. 153 | table_name (str): Name of the table. 154 | 155 | Returns: 156 | None 157 | """ 158 | try: 159 | logger.info("Saving data locally") 160 | path = f"{self.table_path}{self.table_name}.json" 161 | os.makedirs(os.path.dirname(path), exist_ok=True) 162 | with open(path, "w") as file: 163 | json.dump([item.dict() for item in data], file, indent=4) 164 | logger.success(f"Data saved locally to {path}") 165 | except Exception as e: 166 | logger.error(f"An error occurred while saving data locally: {e}") 167 | 168 | def save_s3(self, data: List) -> None: 169 | """ 170 | Saves parsed data to AWS S3 bucket. 171 | 172 | Args: 173 | data (list): List of parsed data. 174 | s3_client: Boto3 S3 client. 175 | bucket_name (str): Name of the AWS S3 bucket. 176 | table_path (str): Path to store data in S3. 177 | table_name (str): Name of the table. 
178 | 179 | Returns: 180 | None 181 | """ 182 | try: 183 | logger.info("Saving data to S3 bucket") 184 | json_bytes = json.dumps([item.dict() for item in data], indent=4).encode( 185 | "utf-8" 186 | ) 187 | key = f"{self.table_path}{self.table_name}.json" 188 | self.s3_client.put_object(Body=json_bytes, Bucket=self.bucket_name, Key=key) 189 | logger.success(f"Data saved successfully to S3 bucket: {self.bucket_name}") 190 | except Exception as e: 191 | logger.error(f"An error occurred while saving data to S3 bucket: {e}") 192 | 193 | 194 | class Ingestor: 195 | """ 196 | Class for ingesting data from an API, parsing it, and saving it. 197 | """ 198 | 199 | def __init__(self, api_client, data_parser, data_saver): 200 | self.api_client = api_client 201 | self.data_parser = data_parser 202 | self.data_saver = data_saver 203 | 204 | def execute(self) -> None: 205 | """ 206 | Executes the ingestion process. 207 | 208 | Args: 209 | dataset (str): Name of the dataset to fetch. 210 | """ 211 | # Fetch data from the API 212 | try: 213 | logger.info("Starting ingestion") 214 | bulk_data = self.api_client.fetch_bulk_data() 215 | cards_data = self.api_client.fetch_cards_data(bulk_data) 216 | 217 | if cards_data: 218 | # Parse data 219 | parsed_data = self.data_parser.parse_cards(cards_data) 220 | if parsed_data: 221 | # Save data locally 222 | self.data_saver.save_local(parsed_data) 223 | # Save data to S3 224 | self.data_saver.save_s3(parsed_data) 225 | logger.success("Ingestion completed!") 226 | except Exception as e: 227 | logger.error(f"Error executing data ingestion process: {e}") 228 | 229 | 230 | if __name__ == "__main__": 231 | # Create instances of classes 232 | api_client = APIClient(API_BASE_URL, DATASET_NAME) 233 | data_parser = DataParser() 234 | data_saver = DataSaver( 235 | TABLE_PATH, TABLE_NAME, AWS_BUCKET_NAME, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 236 | ) 237 | 238 | # Create an instance of Ingestor and execute the ingestion process 239 | ingestor = Ingestor(api_client, data_parser, data_saver) 240 | ingestor.execute() 241 | --------------------------------------------------------------------------------
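
As a companion to the ingestion code above, here is a minimal, self-contained sketch of the raw-to-bronze deduplication step that `ingestion/bronze/cards.py` performs with DuckDB. It is illustrative only: the card sample is made up, local files stand in for the project's `RAW_S3_PATH`/`BRONZE_S3_PATH` and MotherDuck targets, and it assumes nothing beyond the `duckdb` Python package being installed.

```python
import json

import duckdb

# Tiny made-up sample standing in for the raw Scryfall dump (data/raw/cards.json).
sample_cards = [
    {"id": "abc-1", "name": "Llanowar Elves", "released_at": "1993-08-05", "set_name": "Limited Edition Alpha"},
    {"id": "abc-1", "name": "Llanowar Elves", "released_at": "2018-07-13", "set_name": "Core Set 2019"},
    {"id": "def-2", "name": "Counterspell", "released_at": "1993-08-05", "set_name": "Limited Edition Alpha"},
]
with open("cards.json", "w") as file:
    json.dump(sample_cards, file, indent=4)

# In-memory DuckDB connection, mirroring LOCAL_DATABASE = "memory" in the ingestion scripts.
con = duckdb.connect()

# Same pattern as DataManager.create_table_from_json_file: keep one row per card id,
# preferring the most recent released_at value.
con.execute("""
    CREATE OR REPLACE TABLE cards AS
    WITH ranked_cards AS (
        SELECT
            *
            , ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC) AS row_num
        FROM read_json_auto('cards.json')
    )
    SELECT * EXCLUDE (row_num)
    FROM ranked_cards
    WHERE row_num = 1;
""")

# Same pattern as DataManager.save_to_local / save_to_s3: export the table as Parquet.
con.execute("COPY (SELECT * FROM cards) TO 'cards.parquet' (FORMAT PARQUET)")

print(con.execute("SELECT id, name, released_at FROM cards ORDER BY id").fetchall())
```

The `ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC)` window is what guarantees the bronze table keeps a single, most recent row per card id before the data moves on to the silver layer and the dbt models.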