├── .python-version ├── astro ├── dags │ ├── .airflowignore │ ├── dbt │ │ └── magic_the_gathering │ │ │ ├── analyses │ │ │ └── .gitkeep │ │ │ ├── macros │ │ │ ├── .gitkeep │ │ │ └── export_cards_data.sql │ │ │ ├── seeds │ │ │ └── .gitkeep │ │ │ ├── snapshots │ │ │ └── .gitkeep │ │ │ ├── tests │ │ │ └── .gitkeep │ │ │ ├── .user.yml │ │ │ ├── .gitignore │ │ │ ├── packages.yml │ │ │ ├── package-lock.yml │ │ │ ├── models │ │ │ ├── core │ │ │ │ └── fact_cards.sql │ │ │ └── staging │ │ │ │ ├── sources.yml │ │ │ │ ├── stg_cards.sql │ │ │ │ └── schema.yml │ │ │ ├── profiles.yml │ │ │ ├── README.md │ │ │ └── dbt_project.yml │ ├── gold_ingestion.py │ ├── raw_ingestion.py │ ├── bronze_ingestion.py │ └── silver_ingestion.py ├── packages.txt ├── .astro │ ├── config.yaml │ └── test_dag_integrity_default.py ├── dbt-requirements.txt ├── .dockerignore ├── .gitignore ├── include │ ├── models │ │ ├── purchase_uris.py │ │ ├── all_parts.py │ │ ├── image_uris.py │ │ ├── related_uris.py │ │ ├── prices.py │ │ ├── legalities.py │ │ └── card.py │ ├── lib │ │ ├── motherduck_manager.py │ │ ├── duckdb_manager.py │ │ └── aws_manager.py │ └── ingestion │ │ ├── silver │ │ └── cards.py │ │ ├── bronze │ │ └── cards.py │ │ └── raw │ │ └── cards.py ├── requirements.txt ├── Dockerfile ├── tests │ └── dags │ │ └── test_dag_example.py └── README.md ├── dbt └── magic_the_gathering │ ├── seeds │ └── .gitkeep │ ├── tests │ └── .gitkeep │ ├── analyses │ └── .gitkeep │ ├── macros │ ├── .gitkeep │ └── export_cards_data.sql │ ├── snapshots │ └── .gitkeep │ ├── .user.yml │ ├── .gitignore │ ├── packages.yml │ ├── temp │ └── dbt.duckdb │ ├── package-lock.yml │ ├── models │ ├── core │ │ └── fact_cards.sql │ └── staging │ │ ├── sources.yml │ │ ├── stg_cards.sql │ │ └── schema.yml │ ├── profiles.yml │ ├── README.md │ └── dbt_project.yml ├── images ├── magic_the_gathering.jpg ├── magic_the_gathering_dashboard.jpg └── magic_the_gathering_pipeline_etl.png ├── models ├── purchase_uris.py ├── all_parts.py ├── image_uris.py ├── related_uris.py ├── prices.py ├── legalities.py └── card.py ├── terraform ├── variables.tf ├── main.tf └── s3_bucket.tf ├── pyproject.toml ├── LICENSE ├── lib ├── duckdb_manager.py ├── motherduck_manager.py └── aws_manager.py ├── .gitignore ├── docs └── info_dataset.md ├── README.md └── ingestion ├── silver └── cards.py ├── bronze └── cards.py └── raw └── cards.py /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /astro/dags/.airflowignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/packages.txt: -------------------------------------------------------------------------------- 1 | gcc 2 | python3-dev -------------------------------------------------------------------------------- /dbt/magic_the_gathering/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /dbt/magic_the_gathering/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /astro/.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: astro 3 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/.user.yml: -------------------------------------------------------------------------------- 1 | id: e0fed478-152d-44aa-84d0-430a4900cd71 2 | -------------------------------------------------------------------------------- /astro/dbt-requirements.txt: -------------------------------------------------------------------------------- 1 | dbt-core==1.5.0 2 | dbt-duckdb==1.5.2 3 | duckdb==0.9.2 -------------------------------------------------------------------------------- /dbt/magic_the_gathering/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/.user.yml: -------------------------------------------------------------------------------- 1 | id: e0fed478-152d-44aa-84d0-430a4900cd71 2 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | .env 6 | .duckdb/ 7 | temp/ -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: 
dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /astro/.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | .venv 7 | airflow.db 8 | airflow.cfg 9 | -------------------------------------------------------------------------------- /images/magic_the_gathering.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/images/magic_the_gathering.jpg -------------------------------------------------------------------------------- /dbt/magic_the_gathering/temp/dbt.duckdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/dbt/magic_the_gathering/temp/dbt.duckdb -------------------------------------------------------------------------------- /images/magic_the_gathering_dashboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/images/magic_the_gathering_dashboard.jpg -------------------------------------------------------------------------------- /images/magic_the_gathering_pipeline_etl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apittaa/magic-the-gathering-pipeline/HEAD/images/magic_the_gathering_pipeline_etl.png -------------------------------------------------------------------------------- /dbt/magic_the_gathering/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/core/fact_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | SELECT 4 | * 5 | FROM {{ ref('stg_cards') }} 6 | ORDER BY usd_price DESC -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/core/fact_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | SELECT 4 | * 5 | FROM {{ ref("stg_cards") }} 6 | ORDER BY usd_price DESC 7 | -------------------------------------------------------------------------------- /astro/.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | .DS_Store 4 | airflow_settings.yaml 5 | __pycache__/ 6 | astro 7 | .venv 8 | airflow-webserver.pid 9 | webserver_config.py 10 | airflow.cfg 11 | airflow.db 12 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/staging/sources.yml: -------------------------------------------------------------------------------- 1 | version: 
2 2 | 3 | sources: 4 | - name: s3-magic-the-gathering 5 | meta: 6 | external_location: "{{ env_var('TRANSFORM_S3_PATH_INPUT') }}" 7 | tables: 8 | - name: cards -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/staging/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: s3-magic-the-gathering 5 | meta: 6 | external_location: "{{ env_var('TRANSFORM_S3_PATH_INPUT') }}" 7 | tables: 8 | - name: cards -------------------------------------------------------------------------------- /models/purchase_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class PurchaseUris(BaseModel): 5 | tcgplayer: str 6 | cardmarket: str 7 | cardhoarder: str 8 | 9 | def __getitem__(self, item): 10 | return getattr(self, item) 11 | -------------------------------------------------------------------------------- /astro/include/models/purchase_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class PurchaseUris(BaseModel): 5 | tcgplayer: str 6 | cardmarket: str 7 | cardhoarder: str 8 | 9 | def __getitem__(self, item): 10 | return getattr(self, item) 11 | -------------------------------------------------------------------------------- /astro/requirements.txt: -------------------------------------------------------------------------------- 1 | # Astro Runtime includes the following pre-installed providers packages: https://docs.astronomer.io/astro/runtime-image-architecture#provider-packages 2 | duckdb==0.9.2 3 | airflow-provider-duckdb==0.2.0 4 | astro-sdk-python[duckdb]==1.6.1 5 | astronomer-cosmos 6 | pydantic 7 | -------------------------------------------------------------------------------- /models/all_parts.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class AllParts(BaseModel): 5 | object: str 6 | id: str 7 | component: str 8 | name: str 9 | type_line: str 10 | uri: str 11 | 12 | def __getitem__(self, item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /models/image_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ImageUris(BaseModel): 5 | small: str 6 | normal: str 7 | large: str 8 | png: str 9 | art_crop: str 10 | border_crop: str 11 | 12 | def __getitem__(self, item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /astro/include/models/all_parts.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class AllParts(BaseModel): 5 | object: str 6 | id: str 7 | component: str 8 | name: str 9 | type_line: str 10 | uri: str 11 | 12 | def __getitem__(self, item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /astro/include/models/image_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ImageUris(BaseModel): 5 | small: str 6 | normal: str 7 | large: str 8 | png: str 9 | art_crop: str 10 | border_crop: str 11 | 12 | def __getitem__(self, 
item): 13 | return getattr(self, item) 14 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/macros/export_cards_data.sql: -------------------------------------------------------------------------------- 1 | {% macro export_cards_data(table) %} 2 | {% set s3_path = env_var('TRANSFORM_S3_PATH_OUTPUT', 'my-bucket-path') %} 3 | COPY ( 4 | SELECT 5 | * 6 | FROM {{ table }} 7 | ) 8 | TO '{{ s3_path }}{{ table }}.parquet' 9 | (FORMAT PARQUET); 10 | {% endmacro %} -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # AWS VARIABLES 2 | variable "AWS_ACCESS_KEY" { 3 | description = "AWS Access Key." 4 | type = string 5 | } 6 | 7 | variable "AWS_SECRET_KEY" { 8 | description = "AWS Secret Key." 9 | type = string 10 | } 11 | 12 | variable "AWS_REGION" { 13 | description = "AWS Region." 14 | type = string 15 | } 16 | -------------------------------------------------------------------------------- /models/related_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class RelatedUris(BaseModel): 6 | tcgplayer_infinite_articles: Optional[str] = "" 7 | tcgplayer_infinite_decks: Optional[str] = "" 8 | edhrec: Optional[str] = "" 9 | 10 | def __getitem__(self, item): 11 | return getattr(self, item) 12 | -------------------------------------------------------------------------------- /astro/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 quay.io/astronomer/astro-runtime:10.6.0 2 | 3 | # install dbt into a venv to avoid package dependency conflicts 4 | WORKDIR "/usr/local/airflow" 5 | COPY dbt-requirements.txt ./ 6 | RUN python -m virtualenv dbt_venv && source dbt_venv/bin/activate && \ 7 | pip install --no-cache-dir -r dbt-requirements.txt && deactivate 8 | 9 | -------------------------------------------------------------------------------- /astro/include/models/related_uris.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class RelatedUris(BaseModel): 6 | tcgplayer_infinite_articles: Optional[str] = "" 7 | tcgplayer_infinite_decks: Optional[str] = "" 8 | edhrec: Optional[str] = "" 9 | 10 | def __getitem__(self, item): 11 | return getattr(self, item) 12 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/macros/export_cards_data.sql: -------------------------------------------------------------------------------- 1 | {% macro export_cards_data(schema, table) %} 2 | {% set s3_path = env_var('TRANSFORM_S3_PATH_OUTPUT') %} 3 | -- {% set schema = 'gold' %} 4 | COPY ( 5 | SELECT 6 | * 7 | FROM {{ schema }}.{{ table }} 8 | ) 9 | TO '{{ s3_path }}{{ table }}.parquet' 10 | (FORMAT PARQUET); 11 | {% endmacro %} -------------------------------------------------------------------------------- /models/prices.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class Prices(BaseModel): 6 | usd: Optional[str] = "" 7 | usd_foil: Optional[str] = "" 8 | usd_etched: Optional[str] = "" 9 | eur: Optional[str] = "" 10 | eur_foil: Optional[str] = "" 11 | tix: 
Optional[str] = "" 12 | 13 | def __getitem__(self, item): 14 | return getattr(self, item) 15 | -------------------------------------------------------------------------------- /astro/include/models/prices.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | 4 | 5 | class Prices(BaseModel): 6 | usd: Optional[str] = "" 7 | usd_foil: Optional[str] = "" 8 | usd_etched: Optional[str] = "" 9 | eur: Optional[str] = "" 10 | eur_foil: Optional[str] = "" 11 | tix: Optional[str] = "" 12 | 13 | def __getitem__(self, item): 14 | return getattr(self, item) 15 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/profiles.yml: -------------------------------------------------------------------------------- 1 | magic_the_gathering: 2 | outputs: 3 | dev: 4 | type: duckdb 5 | path: temp/dbt.duckdb 6 | extensions: 7 | - httpfs 8 | - parquet 9 | settings: 10 | s3_region: "{{ env_var('AWS_REGION') }}" 11 | s3_access_key_id: "{{ env_var('AWS_ACCESS_KEY') }}" 12 | s3_secret_access_key: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}" 13 | prod: 14 | type: duckdb 15 | schema: gold 16 | path: "{{ env_var('MOTHERDUCK_DATABASE') }}" 17 | target: dev -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0" 3 | backend "local" {} # Change from "local" to "gcs" (for Google Cloud Storage) or "s3" (for AWS S3) if you want to preserve your tfstate online 4 | required_providers { 5 | aws = { # Change the required provider to AWS 6 | source = "hashicorp/aws" 7 | } 8 | } 9 | } 10 | 11 | provider "aws" { 12 | region = var.AWS_REGION # Set AWS region using variable 13 | access_key = var.AWS_ACCESS_KEY # Set AWS access key using variable 14 | secret_key = var.AWS_SECRET_KEY # Set AWS secret key using variable 15 | } 16 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/profiles.yml: -------------------------------------------------------------------------------- 1 | magic_the_gathering: 2 | outputs: 3 | dev: 4 | type: duckdb 5 | schema: gold 6 | path: /usr/local/airflow/dags/dbt/magic_the_gathering/temp/dbt.duckdb 7 | extensions: 8 | - httpfs 9 | - parquet 10 | settings: 11 | s3_region: "{{ env_var('AWS_REGION') }}" 12 | s3_access_key_id: "{{ env_var('AWS_ACCESS_KEY') }}" 13 | s3_secret_access_key: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}" 14 | prod: 15 | type: duckdb 16 | schema: gold 17 | path: "{{ env_var('MOTHERDUCK_DATABASE') }}" 18 | target: dev 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "magic-the-gathering-pipeline" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Pitta "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | pydantic = "^2.6.4" 11 | pytest = "^8.1.1" 12 | ruff = "^0.3.3" 13 | requests = "^2.31.0" 14 | loguru = "^0.7.2" 15 | boto3 = "^1.34.64" 16 | python-dotenv = "^1.0.1" 17 | duckdb = "^0.9.2" 18 | dbt-duckdb = "^1.7.3" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | ipykernel = "^6.29.4" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | 
-------------------------------------------------------------------------------- /dbt/magic_the_gathering/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /models/legalities.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Legalities(BaseModel): 5 | standard: str 6 | future: str 7 | historic: str 8 | timeless: str 9 | gladiator: str 10 | pioneer: str 11 | explorer: str 12 | modern: str 13 | legacy: str 14 | pauper: str 15 | vintage: str 16 | penny: str 17 | commander: str 18 | oathbreaker: str 19 | standardbrawl: str 20 | brawl: str 21 | alchemy: str 22 | paupercommander: str 23 | duel: str 24 | oldschool: str 25 | premodern: str 26 | predh: str 27 | 28 | def __getitem__(self, item): 29 | return getattr(self, item) 30 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /astro/include/models/legalities.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class Legalities(BaseModel): 5 | standard: str 6 | future: str 7 | historic: str 8 | timeless: str 9 | gladiator: str 10 | pioneer: str 11 | explorer: str 12 | modern: str 13 | legacy: str 14 | pauper: str 15 | vintage: str 16 | penny: str 17 | commander: str 18 | oathbreaker: str 19 | standardbrawl: str 20 | brawl: str 21 | alchemy: str 22 | paupercommander: str 23 | duel: str 24 | oldschool: str 25 | premodern: str 26 | predh: str 27 | 28 | def __getitem__(self, item): 29 | return getattr(self, item) 30 | -------------------------------------------------------------------------------- /terraform/s3_bucket.tf: -------------------------------------------------------------------------------- 1 | # AWS 2 | # Ref: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket 3 | resource "aws_s3_bucket" "bucket" { 4 | bucket = "magic-the-gathering-bucket" 5 | } 6 | 7 | ## Consumes more space 8 | # resource "aws_s3_bucket_versioning" "bucket_versioning" { 9 | # bucket = aws_s3_bucket.bucket.id 10 | # versioning_configuration { 11 | # status = "Enabled" 12 | # } 13 | # } 14 | 15 | resource "aws_s3_bucket_lifecycle_configuration" "bucket_lifecycle_configuration" { 16 | bucket = aws_s3_bucket.bucket.id 17 | rule { 18 | id = "delete" 19 | expiration { 20 | days = 30 # objects will be deleted after 30 days 21 | } 22 | status = "Enabled" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/staging/stg_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | --handling deduplication 4 | WITH cards AS 5 | ( 6 | SELECT 7 | * 8 | , row_number() OVER(PARTITION BY card_id, released_at) AS rn 9 | FROM {{ source("s3-magic-the-gathering", "cards") }} 10 | ) 11 | SELECT 12 | -- identifiers 13 | CAST(({{ dbt_utils.generate_surrogate_key(['card_id', 'released_at']) }}) AS string) AS case_id, 14 | CAST(card_id AS string) AS card_id, 15 | 16 | -- Cards info 17 | CAST(name AS string) AS name, 18 | CAST(released_at AS date) AS released_at, 19 | CAST(color_identity AS string) AS color_identity, 20 | CAST(set_name AS string) AS set_name, 21 | CAST(artist AS string) AS artist, 22 | CAST(usd_price AS float) AS usd_price, 23 | 24 | FROM cards 25 | WHERE rn = 1 26 | -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/staging/stg_cards.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | --handling deduplication 4 | WITH cards AS 5 | ( 6 | SELECT 7 | * 8 | , 
row_number() over(partition by card_id, released_at) AS rn 9 | FROM {{ source("s3-magic-the-gathering", "cards") }} 10 | ) 11 | SELECT 12 | -- identifiers 13 | CAST(({{ dbt_utils.generate_surrogate_key(['card_id', 'released_at']) }}) AS string) AS case_id, 14 | CAST(card_id AS string) AS card_id, 15 | 16 | -- Cards info 17 | CAST(name AS string) AS name, 18 | CAST(released_at AS date) AS released_at, 19 | CAST(color_identity AS string) AS color_identity, 20 | CAST(set_name AS string) AS set_name, 21 | CAST(artist AS string) AS artist, 22 | CAST(usd_price AS float) AS usd_price, 23 | 24 | FROM cards 25 | WHERE rn = 1 26 | 27 | -- {% if var('is_test_run', default=true) %} 28 | 29 | -- limit 100 30 | 31 | -- {% endif %} -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_cards 5 | description: "Individual Magic: The Gathering cards that players could obtain and add to their collection (with a few minor exceptions)." 6 | columns: 7 | - name: id 8 | description: > 9 | A unique ID for the card in Scryfall database. 10 | - name: name 11 | description: > 12 | The name of the card. If the card has multiple faces, this field will contain both names separated by ␣//␣. 13 | - name: released_at 14 | description: > 15 | The date the card was first released. 16 | - name: color_identity 17 | description: > 18 | The card color identity. 19 | - name: set_name 20 | description: > 21 | The card full set name. 22 | - name: artist 23 | description: > 24 | The name of the illustrator of the card face. Newly spoiled cards may not have this field yet. 25 | - name: (prices).usd 26 | description: > 27 | The card usd price. 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Arthur Pitta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/duckdb_manager.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | from loguru import logger 3 | from typing import Any 4 | 5 | 6 | class DuckDBManager: 7 | """ 8 | Manages DuckDB connection and executes queries. 
9 | """ 10 | 11 | def __init__(self): 12 | """ 13 | Initializes DuckDBManager. 14 | """ 15 | self.connection = self.create_connection() 16 | 17 | def create_connection(self) -> Any: 18 | """ 19 | Create a connection to DuckDB. 20 | 21 | Returns: 22 | duckdb.Connection: DuckDB connection object. 23 | """ 24 | try: 25 | logger.info("Creating DuckDB connection") 26 | duckdb_conn = duckdb.connect() 27 | logger.success("DuckDB connection created!") 28 | return duckdb_conn 29 | except Exception as e: 30 | logger.error(f"Error creating DuckDB connection: {e}") 31 | return None 32 | 33 | def execute_query(self, query: str) -> None: 34 | """ 35 | Executes a SQL query. 36 | 37 | Args: 38 | query (str): SQL query to execute. 39 | 40 | Returns: 41 | None 42 | """ 43 | try: 44 | logger.info("Executing query") 45 | self.connection.execute(query) 46 | logger.success("Query executed") 47 | except Exception as e: 48 | logger.error(f"Error executing query: {e}") 49 | -------------------------------------------------------------------------------- /astro/include/lib/motherduck_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class MotherDuckManager: 8 | """ 9 | Manages connection to MotherDuck. 10 | """ 11 | 12 | def __init__(self, duckdb_manager, motherduck_token: str): 13 | """ 14 | Initializes MotherDuckManager. 15 | 16 | Args: 17 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 18 | motherduck_token (str): Token for accessing MotherDuck. 19 | """ 20 | self.duckdb_manager = duckdb_manager 21 | self.connect(motherduck_token) 22 | 23 | def connect(self, motherduck_token: str) -> None: 24 | """ 25 | Connects to MotherDuck. 26 | 27 | Args: 28 | motherduck_token (str): Token for accessing MotherDuck. 29 | 30 | Returns: 31 | None 32 | """ 33 | try: 34 | logger.info("Connecting to Mother Duck") 35 | self.duckdb_manager.execute_query("INSTALL md;") 36 | self.duckdb_manager.execute_query("LOAD md;") 37 | self.duckdb_manager.execute_query( 38 | f"SET motherduck_token='{motherduck_token}'" 39 | ) 40 | self.duckdb_manager.execute_query("ATTACH 'md:'") 41 | logger.info("Connected to Mother Duck!") 42 | except Exception as e: 43 | logger.error(f"Error connecting to MotherDuck: {e}") 44 | -------------------------------------------------------------------------------- /lib/motherduck_manager.py: -------------------------------------------------------------------------------- 1 | from lib.duckdb_manager import DuckDBManager 2 | from loguru import logger 3 | 4 | 5 | class MotherDuckManager: 6 | """ 7 | Manages connection to MotherDuck. 8 | """ 9 | 10 | def __init__(self, duckdb_manager: DuckDBManager, motherduck_token: str): 11 | """ 12 | Initializes MotherDuckManager. 13 | 14 | Args: 15 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 16 | motherduck_token (str): Token for accessing MotherDuck. 17 | """ 18 | self.duckdb_manager = duckdb_manager 19 | self.connect(motherduck_token) 20 | 21 | def connect(self, motherduck_token: str) -> None: 22 | """ 23 | Connects to MotherDuck. 24 | 25 | Args: 26 | motherduck_token (str): Token for accessing MotherDuck. 
27 | 28 | Returns: 29 | None 30 | """ 31 | try: 32 | logger.info("Connecting to Mother Duck") 33 | self.duckdb_manager.execute_query("INSTALL md;") 34 | self.duckdb_manager.execute_query("LOAD md;") 35 | self.duckdb_manager.execute_query( 36 | f"SET motherduck_token='{motherduck_token}'" 37 | ) 38 | self.duckdb_manager.execute_query("ATTACH 'md:'") 39 | logger.success("Connected to Mother Duck!") 40 | except Exception as e: 41 | logger.error(f"Error connecting to MotherDuck: {e}") 42 | -------------------------------------------------------------------------------- /astro/include/lib/duckdb_manager.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | from typing import Any 3 | import logging 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class DuckDBManager: 10 | """ 11 | Manages DuckDB connection and executes queries. 12 | """ 13 | 14 | def __init__(self): 15 | """ 16 | Initializes DuckDBManager. 17 | """ 18 | self.connection = self.create_connection() 19 | 20 | def create_connection(self) -> Any: 21 | """ 22 | Create a connection to DuckDB. 23 | 24 | Returns: 25 | duckdb.Connection: DuckDB connection object. 26 | """ 27 | try: 28 | logger.info("Creating DuckDB connection") 29 | duckdb_conn = duckdb.connect() 30 | logger.info("DuckDB connection created!") 31 | return duckdb_conn 32 | except Exception as e: 33 | logger.error(f"Error creating DuckDB connection: {e}") 34 | return None 35 | 36 | def execute_query(self, query: str) -> None: 37 | """ 38 | Executes a SQL query. 39 | 40 | Args: 41 | query (str): SQL query to execute. 42 | 43 | Returns: 44 | None 45 | """ 46 | try: 47 | logger.info("Executing query") 48 | self.connection.execute(query) 49 | logger.info("Query executed") 50 | except Exception as e: 51 | logger.error(f"Error executing query: {e}") 52 | -------------------------------------------------------------------------------- /astro/dags/dbt/magic_the_gathering/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'magic_the_gathering' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'magic_the_gathering' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | magic_the_gathering: 35 | # Applies to all files under models/staging/ 36 | staging: 37 | +materialized: view 38 | # Applies to all files under models/core/ 39 | core: 40 | +materialized: table 41 | +post-hook: "{{ export_cards_data('gold', 'fact_cards') }}" -------------------------------------------------------------------------------- /dbt/magic_the_gathering/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_cards 5 | description: "Individual Magic: The Gathering cards that players could obtain and add to their collection (with a few minor exceptions)." 6 | columns: 7 | - name: id 8 | description: > 9 | A unique ID for the card in Scryfall database. 10 | tests: 11 | - unique: 12 | severity: warn 13 | - not_null: 14 | severity: warn 15 | - name: name 16 | description: > 17 | The name of the card. If the card has multiple faces, this field will contain both names separated by ␣//␣. 18 | tests: 19 | - not_null: 20 | severity: warn 21 | - name: released_at 22 | description: > 23 | The date the card was first released. 24 | tests: 25 | - not_null: 26 | severity: warn 27 | - name: color_identity 28 | description: > 29 | The card color identity 30 | tests: 31 | - not_null: 32 | severity: warn 33 | - name: set_name 34 | description: > 35 | The card full set name. 36 | tests: 37 | - not_null: 38 | severity: warn 39 | - name: artist 40 | description: > 41 | The name of the illustrator of the card face. Newly spoiled cards may not have this field yet. 42 | - name: (prices).usd 43 | description: > 44 | The card usd price. 45 | tests: 46 | - not_null: 47 | severity: warn -------------------------------------------------------------------------------- /dbt/magic_the_gathering/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'magic_the_gathering' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'magic_the_gathering' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `model-paths` config, for example, states that models in this project can be 14 | # found in the "models/" directory. You probably won't need to change these! 15 | model-paths: ["models"] 16 | analysis-paths: ["analyses"] 17 | test-paths: ["tests"] 18 | seed-paths: ["seeds"] 19 | macro-paths: ["macros"] 20 | snapshot-paths: ["snapshots"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | magic_the_gathering: 35 | # Applies to all files under models/staging/ 36 | staging: 37 | +materialized: view 38 | # Applies to all files under models/core/ 39 | core: 40 | +materialized: table 41 | +post-hook: "{% if target.name == 'dev' %}{{ export_cards_data(this.name) }}{% endif %}" -------------------------------------------------------------------------------- /astro/dags/gold_ingestion.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from cosmos import DbtDag, ProjectConfig, ProfileConfig, ExecutionConfig 4 | 5 | default_args = { 6 | 'owner': 'airflow', 7 | 'depends_on_past': True, 8 | 'start_date': datetime(2024, 4, 1), 9 | 'retries': 2, 10 | 'retry_delay': timedelta(minutes=5), 11 | } 12 | 13 | profile_config = ProfileConfig(profile_name="magic_the_gathering", 14 | target_name="dev", 15 | profiles_yml_filepath="/usr/local/airflow/dags/dbt/magic_the_gathering/profiles.yml") 16 | 17 | project_config = ProjectConfig(dbt_project_path="/usr/local/airflow/dags/dbt/magic_the_gathering") 18 | 19 | OPERATOR_ARGS = { 20 | "install_deps": True, 21 | "env": { 22 | "HOME": "/usr/local/airflow/dags/dbt/magic_the_gathering", 23 | "AWS_REGION": os.environ["AWS_REGION"], 24 | "AWS_ACCESS_KEY": os.environ["AWS_ACCESS_KEY"], 25 | "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"], 26 | "TRANSFORM_S3_PATH_INPUT": os.environ["TRANSFORM_S3_PATH_INPUT"], 27 | "TRANSFORM_S3_PATH_OUTPUT": os.environ["TRANSFORM_S3_PATH_OUTPUT"], 28 | "MOTHERDUCK_DATABASE": os.environ["MOTHERDUCK_DATABASE"] 29 | } 30 | } 31 | 32 | cards_gold_dag = DbtDag(project_config=project_config, 33 | operator_args=OPERATOR_ARGS, 34 | profile_config=profile_config, 35 | execution_config=ExecutionConfig(dbt_executable_path=f"{os.environ['AIRFLOW_HOME']}/dbt_venv/bin/dbt",), 36 | default_args=default_args, 37 | tags=['gold_cards'], 38 | dag_id='ingestor_gold') 39 | 40 | -------------------------------------------------------------------------------- /astro/include/lib/aws_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class AWSManager: 8 | """ 9 | Manages AWS credentials and operations. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | duckdb_manager, 15 | aws_region: str, 16 | aws_access_key: str, 17 | aws_secret_access_key: str, 18 | ): 19 | """ 20 | Initializes AWSManager. 21 | 22 | Args: 23 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 24 | aws_region (str): AWS region. 25 | aws_access_key (str): AWS access key ID. 26 | aws_secret_access_key (str): AWS secret access key. 27 | """ 28 | self.duckdb_manager = duckdb_manager 29 | self.load_credentials(aws_region, aws_access_key, aws_secret_access_key) 30 | 31 | def load_credentials( 32 | self, aws_region: str, aws_access_key: str, aws_secret_access_key: str 33 | ) -> None: 34 | """ 35 | Loads AWS credentials. 36 | 37 | Args: 38 | aws_region (str): AWS region. 39 | aws_access_key (str): AWS access key ID. 40 | aws_secret_access_key (str): AWS secret access key. 
41 | 42 | Returns: 43 | None 44 | """ 45 | try: 46 | logger.info("Loading AWS credentials") 47 | self.duckdb_manager.execute_query("INSTALL httpfs;") 48 | self.duckdb_manager.execute_query("LOAD httpfs;") 49 | self.duckdb_manager.execute_query(f"SET s3_region='{aws_region}'") 50 | self.duckdb_manager.execute_query( 51 | f"SET s3_access_key_id='{aws_access_key}';" 52 | ) 53 | self.duckdb_manager.execute_query( 54 | f"SET s3_secret_access_key='{aws_secret_access_key}';" 55 | ) 56 | self.duckdb_manager.execute_query("CALL load_aws_credentials();") 57 | logger.info("AWS credentials loaded!") 58 | except Exception as e: 59 | logger.error(f"Error loading AWS credentials: {e}") 60 | -------------------------------------------------------------------------------- /lib/aws_manager.py: -------------------------------------------------------------------------------- 1 | from lib.duckdb_manager import DuckDBManager 2 | from loguru import logger 3 | 4 | 5 | class AWSManager: 6 | """ 7 | Manages AWS credentials and operations. 8 | """ 9 | 10 | def __init__( 11 | self, 12 | duckdb_manager: DuckDBManager, 13 | aws_region: str, 14 | aws_access_key: str, 15 | aws_secret_access_key: str, 16 | ): 17 | """ 18 | Initializes AWSManager. 19 | 20 | Args: 21 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 22 | aws_region (str): AWS region. 23 | aws_access_key (str): AWS access key ID. 24 | aws_secret_access_key (str): AWS secret access key. 25 | """ 26 | self.duckdb_manager = duckdb_manager 27 | self.load_credentials(aws_region, aws_access_key, aws_secret_access_key) 28 | 29 | def load_credentials( 30 | self, aws_region: str, aws_access_key: str, aws_secret_access_key: str 31 | ) -> None: 32 | """ 33 | Loads AWS credentials. 34 | 35 | Args: 36 | aws_region (str): AWS region. 37 | aws_access_key (str): AWS access key ID. 38 | aws_secret_access_key (str): AWS secret access key. 
39 | 40 | Returns: 41 | None 42 | """ 43 | try: 44 | logger.info("Loading AWS credentials") 45 | self.duckdb_manager.execute_query("INSTALL httpfs;") 46 | self.duckdb_manager.execute_query("LOAD httpfs;") 47 | self.duckdb_manager.execute_query(f"SET s3_region='{aws_region}'") 48 | self.duckdb_manager.execute_query( 49 | f"SET s3_access_key_id='{aws_access_key}';" 50 | ) 51 | self.duckdb_manager.execute_query( 52 | f"SET s3_secret_access_key='{aws_secret_access_key}';" 53 | ) 54 | self.duckdb_manager.execute_query("CALL load_aws_credentials();") 55 | logger.success("AWS credentials loaded!") 56 | except Exception as e: 57 | logger.error(f"Error loading AWS credentials: {e}") 58 | -------------------------------------------------------------------------------- /models/card.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | 4 | from models.image_uris import ImageUris 5 | from models.all_parts import AllParts 6 | from models.legalities import Legalities 7 | from models.prices import Prices 8 | from models.related_uris import RelatedUris 9 | from models.purchase_uris import PurchaseUris 10 | 11 | 12 | class Card(BaseModel): 13 | object: str 14 | id: str 15 | oracle_id: Optional[str] = "" 16 | multiverse_ids: Optional[List] = None 17 | mtgo_id: Optional[int] = 0 18 | mtgo_foil_id: Optional[int] = 0 19 | tcgplayer_id: Optional[int] = 0 20 | cardmarket_id: Optional[int] = 0 21 | name: str 22 | lang: str 23 | released_at: str 24 | uri: str 25 | scryfall_uri: str 26 | layout: str 27 | highres_image: bool 28 | image_status: str 29 | image_uris: Optional[ImageUris] = None 30 | mana_cost: Optional[str] = "" 31 | cmc: Optional[float] = 0.0 32 | type_line: Optional[str] = "" 33 | oracle_text: Optional[str] = "" 34 | power: Optional[str] = "" 35 | toughness: Optional[str] = "" 36 | colors: Optional[List[str]] = None 37 | color_identity: List[str] 38 | keywords: List[str] 39 | all_parts: Optional[List[AllParts]] = None 40 | legalities: Legalities 41 | games: List[str] 42 | reserved: bool 43 | foil: Optional[bool] = None 44 | nonfoil: Optional[bool] = None 45 | finishes: List[str] 46 | oversized: bool 47 | promo: bool 48 | reprint: bool 49 | variation: bool 50 | set_id: str 51 | set: str 52 | set_name: str 53 | set_type: str 54 | set_uri: str 55 | set_search_uri: str 56 | scryfall_set_uri: str 57 | rulings_uri: str 58 | prints_search_uri: str 59 | collector_number: str 60 | digital: bool 61 | rarity: str 62 | flavor_text: Optional[str] = "" 63 | card_back_id: Optional[str] = "" 64 | artist: Optional[str] = "" 65 | artist_ids: Optional[List[str]] = None 66 | illustration_id: Optional[str] = "" 67 | border_color: str 68 | frame: str 69 | full_art: bool 70 | textless: bool 71 | booster: bool 72 | story_spotlight: bool 73 | edhrec_rank: Optional[int] = 0 74 | penny_rank: Optional[int] = 0 75 | prices: Prices 76 | related_uris: RelatedUris 77 | purchase_uris: Optional[PurchaseUris] = None 78 | 79 | def __getitem__(self, item): 80 | return getattr(self, item) 81 | -------------------------------------------------------------------------------- /astro/include/models/card.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | 4 | from models.image_uris import ImageUris 5 | from models.all_parts import AllParts 6 | from models.legalities import Legalities 7 | from models.prices import Prices 8 | from 
models.related_uris import RelatedUris 9 | from models.purchase_uris import PurchaseUris 10 | 11 | 12 | class Card(BaseModel): 13 | object: str 14 | id: str 15 | oracle_id: Optional[str] = "" 16 | multiverse_ids: Optional[List] = None 17 | mtgo_id: Optional[int] = 0 18 | mtgo_foil_id: Optional[int] = 0 19 | tcgplayer_id: Optional[int] = 0 20 | cardmarket_id: Optional[int] = 0 21 | name: str 22 | lang: str 23 | released_at: str 24 | uri: str 25 | scryfall_uri: str 26 | layout: str 27 | highres_image: bool 28 | image_status: str 29 | image_uris: Optional[ImageUris] = None 30 | mana_cost: Optional[str] = "" 31 | cmc: Optional[float] = 0.0 32 | type_line: Optional[str] = "" 33 | oracle_text: Optional[str] = "" 34 | power: Optional[str] = "" 35 | toughness: Optional[str] = "" 36 | colors: Optional[List[str]] = None 37 | color_identity: List[str] 38 | keywords: List[str] 39 | all_parts: Optional[List[AllParts]] = None 40 | legalities: Legalities 41 | games: List[str] 42 | reserved: bool 43 | foil: Optional[bool] = None 44 | nonfoil: Optional[bool] = None 45 | finishes: List[str] 46 | oversized: bool 47 | promo: bool 48 | reprint: bool 49 | variation: bool 50 | set_id: str 51 | set: str 52 | set_name: str 53 | set_type: str 54 | set_uri: str 55 | set_search_uri: str 56 | scryfall_set_uri: str 57 | rulings_uri: str 58 | prints_search_uri: str 59 | collector_number: str 60 | digital: bool 61 | rarity: str 62 | flavor_text: Optional[str] = "" 63 | card_back_id: Optional[str] = "" 64 | artist: Optional[str] = "" 65 | artist_ids: Optional[List[str]] = None 66 | illustration_id: Optional[str] = "" 67 | border_color: str 68 | frame: str 69 | full_art: bool 70 | textless: bool 71 | booster: bool 72 | story_spotlight: bool 73 | edhrec_rank: Optional[int] = 0 74 | penny_rank: Optional[int] = 0 75 | prices: Prices 76 | related_uris: RelatedUris 77 | purchase_uris: Optional[PurchaseUris] = None 78 | 79 | def __getitem__(self, item): 80 | return getattr(self, item) 81 | -------------------------------------------------------------------------------- /astro/dags/raw_ingestion.py: -------------------------------------------------------------------------------- 1 | # from datetime import datetime, timedelta 2 | from datetime import datetime, timedelta 3 | import os 4 | import logging 5 | from airflow.decorators import dag, task 6 | from airflow.operators.dagrun_operator import TriggerDagRunOperator 7 | from dotenv import load_dotenv 8 | 9 | from include.ingestion.raw.cards import ( 10 | APIClient, 11 | DataParser, 12 | DataSaver, 13 | ) 14 | 15 | # Load environment variables 16 | load_dotenv() 17 | 18 | # Configuration 19 | API_BASE_URL = "https://api.scryfall.com/bulk-data/" 20 | DATASET_NAME = "default_cards" 21 | TABLE_NAME = "cards" 22 | TABLE_PATH = "data/raw/" 23 | AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME") 24 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 25 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 26 | 27 | default_args = { 28 | 'owner': 'airflow', 29 | 'depends_on_past': False, 30 | 'start_date': datetime(2024, 4, 1), 31 | 'retries': 2, 32 | 'retry_delay': timedelta(minutes=5), 33 | } 34 | 35 | logger = logging.getLogger(__name__) 36 | 37 | 38 | @dag(default_args=default_args, schedule_interval='@daily', catchup=False, tags=['raw_cards']) 39 | def ingestor_raw(): 40 | """ 41 | Airflow DAG for ingesting data from an API, parsing it, and saving it. 
42 | """ 43 | 44 | @task 45 | def raw_cards(): 46 | try: 47 | api_client = APIClient(API_BASE_URL, DATASET_NAME) 48 | data_parser = DataParser() 49 | data_saver = DataSaver( 50 | TABLE_PATH, TABLE_NAME, AWS_BUCKET_NAME, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 51 | ) 52 | 53 | logger.info("START FETCHING BULK DATA") 54 | bulk_data = api_client.fetch_bulk_data() 55 | logger.info("START FETCHING CARDS DATA") 56 | cards_data = api_client.fetch_cards_data(bulk_data) 57 | logger.info("START PARSING DATA") 58 | parsed_data = data_parser.parse_cards(cards_data) 59 | logger.info("START SAVING DATA LOCALLY") 60 | data_saver.save_local(parsed_data) 61 | logger.info("START SAVING DATA TO S3") 62 | data_saver.save_s3(parsed_data) 63 | except Exception as e: 64 | logger.error(f"An error occurred: {e}") 65 | 66 | # Define task dependencies 67 | raw_cards_task = raw_cards() 68 | 69 | # Define task to trigger the Bronze DAG 70 | trigger_bronze_task = TriggerDagRunOperator( 71 | task_id='trigger_bronze', 72 | trigger_dag_id='ingestor_bronze', 73 | wait_for_completion=True, 74 | deferrable=True, 75 | ) 76 | 77 | # Define task dependencies 78 | raw_cards_task >> trigger_bronze_task 79 | 80 | 81 | # Instantiate the DAG 82 | ingestor_raw() 83 | -------------------------------------------------------------------------------- /astro/tests/dags/test_dag_example.py: -------------------------------------------------------------------------------- 1 | """Example DAGs test. This test ensures that all Dags have tags, retries set to two, and no import errors. This is an example pytest and may not fit the context of your DAGs. Feel free to add and remove tests.""" 2 | 3 | import os 4 | import logging 5 | from contextlib import contextmanager 6 | import pytest 7 | from airflow.models import DagBag 8 | 9 | 10 | @contextmanager 11 | def suppress_logging(namespace): 12 | logger = logging.getLogger(namespace) 13 | old_value = logger.disabled 14 | logger.disabled = True 15 | try: 16 | yield 17 | finally: 18 | logger.disabled = old_value 19 | 20 | 21 | def get_import_errors(): 22 | """ 23 | Generate a tuple for import errors in the dag bag 24 | """ 25 | with suppress_logging("airflow"): 26 | dag_bag = DagBag(include_examples=False) 27 | 28 | def strip_path_prefix(path): 29 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 30 | 31 | # prepend "(None,None)" to ensure that a test object is always created even if it's a no op. 
32 | return [(None, None)] + [ 33 | (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items() 34 | ] 35 | 36 | 37 | def get_dags(): 38 | """ 39 | Generate a tuple of dag_id, in the DagBag 40 | """ 41 | with suppress_logging("airflow"): 42 | dag_bag = DagBag(include_examples=False) 43 | 44 | def strip_path_prefix(path): 45 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 46 | 47 | return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()] 48 | 49 | 50 | @pytest.mark.parametrize( 51 | "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 52 | ) 53 | def test_file_imports(rel_path, rv): 54 | """Test for import errors on a file""" 55 | if rel_path and rv: 56 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 57 | 58 | 59 | APPROVED_TAGS = {} 60 | 61 | 62 | @pytest.mark.parametrize( 63 | "dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()] 64 | ) 65 | def test_dag_tags(dag_id, dag, fileloc): 66 | """ 67 | test if a DAG is tagged and if those TAGs are in the approved list 68 | """ 69 | assert dag.tags, f"{dag_id} in {fileloc} has no tags" 70 | if APPROVED_TAGS: 71 | assert not set(dag.tags) - APPROVED_TAGS 72 | 73 | 74 | @pytest.mark.parametrize( 75 | "dag_id,dag, fileloc", get_dags(), ids=[x[2] for x in get_dags()] 76 | ) 77 | def test_dag_retries(dag_id, dag, fileloc): 78 | """ 79 | test if a DAG has retries set 80 | """ 81 | assert ( 82 | dag.default_args.get("retries", None) >= 2 83 | ), f"{dag_id} in {fileloc} must have task retries >= 2." 84 | -------------------------------------------------------------------------------- /astro/dags/bronze_ingestion.py: -------------------------------------------------------------------------------- 1 | # from datetime import datetime, timedelta 2 | from datetime import datetime, timedelta 3 | import os 4 | import logging 5 | from airflow.decorators import dag, task 6 | from airflow.operators.dagrun_operator import TriggerDagRunOperator 7 | from dotenv import load_dotenv 8 | 9 | from include.lib.aws_manager import AWSManager 10 | from include.lib.duckdb_manager import DuckDBManager 11 | from include.lib.motherduck_manager import MotherDuckManager 12 | from include.ingestion.bronze.cards import DataManager 13 | 14 | 15 | # Load environment variables 16 | load_dotenv() 17 | 18 | # Configuration 19 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 20 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 21 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 22 | AWS_REGION = os.getenv("AWS_REGION") 23 | RAW_S3_PATH = os.getenv("RAW_S3_PATH") 24 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 25 | LOCAL_PATH = "data/bronze/" 26 | TABLE_NAME = "cards" 27 | BRONZE_SCHEMA = "bronze" 28 | LOCAL_DATABASE = "memory" 29 | REMOTE_DATABASE = "magic_the_gathering" 30 | 31 | default_args = { 32 | 'owner': 'airflow', 33 | 'depends_on_past': True, 34 | 'start_date': datetime(2024, 4, 1), 35 | 'retries': 2, 36 | 'retry_delay': timedelta(minutes=5), 37 | } 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | @dag(default_args=default_args, catchup=False, tags=['bronze_cards']) 43 | def ingestor_bronze(): 44 | """ 45 | Airflow DAG for ingesting data from an API, parsing it, and saving it. 
46 | """ 47 | 48 | @task 49 | def bronze_cards(): 50 | try: 51 | duckdb_manager = DuckDBManager() 52 | MotherDuckManager( 53 | duckdb_manager, MOTHERDUCK_TOKEN 54 | ) 55 | AWSManager( 56 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 57 | ) 58 | data_manager = DataManager( 59 | duckdb_manager, 60 | LOCAL_DATABASE, 61 | REMOTE_DATABASE, 62 | BRONZE_SCHEMA, 63 | TABLE_NAME, 64 | LOCAL_PATH, 65 | RAW_S3_PATH, 66 | BRONZE_S3_PATH, 67 | ) 68 | 69 | logger.info("START CREATING TABLE FROM S3") 70 | data_manager.create_table_from_json_file() 71 | logger.info("START SAVING DATA LOCALLY") 72 | data_manager.save_to_local() 73 | logger.info("START SAVING DATA TO S3") 74 | data_manager.save_to_s3() 75 | logger.info("START SAVING DATA TO MD") 76 | data_manager.save_to_md() 77 | except Exception as e: 78 | logger.error(f"An error occurred: {e}") 79 | 80 | bronze_cards_task = bronze_cards() 81 | 82 | # Define task to trigger the Silver DAG 83 | trigger_silver_task = TriggerDagRunOperator( 84 | task_id='trigger_silver', 85 | trigger_dag_id='ingestor_silver', 86 | wait_for_completion=True, 87 | deferrable=True, 88 | ) 89 | 90 | # Define task dependencies 91 | bronze_cards_task >> trigger_silver_task 92 | 93 | 94 | # Instantiate the DAG 95 | ingestor_bronze_dag = ingestor_bronze() 96 | -------------------------------------------------------------------------------- /astro/dags/silver_ingestion.py: -------------------------------------------------------------------------------- 1 | # from datetime import datetime, timedelta 2 | from datetime import datetime, timedelta 3 | import os 4 | import logging 5 | from airflow.decorators import dag, task 6 | from airflow.operators.dagrun_operator import TriggerDagRunOperator 7 | from dotenv import load_dotenv 8 | 9 | from include.lib.aws_manager import AWSManager 10 | from include.lib.duckdb_manager import DuckDBManager 11 | from include.lib.motherduck_manager import MotherDuckManager 12 | from include.ingestion.silver.cards import DataManager 13 | 14 | 15 | # Load environment variables 16 | load_dotenv() 17 | 18 | # Configuration 19 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 20 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 21 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 22 | AWS_REGION = os.getenv("AWS_REGION") 23 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 24 | SILVER_S3_PATH = os.getenv("SILVER_S3_PATH") 25 | LOCAL_PATH = "data/silver/" 26 | TABLE_NAME = "cards" 27 | SILVER_SCHEMA = "silver" 28 | LOCAL_DATABASE = "memory" 29 | REMOTE_DATABASE = "magic_the_gathering" 30 | 31 | default_args = { 32 | 'owner': 'airflow', 33 | 'depends_on_past': True, 34 | 'start_date': datetime(2024, 4, 1), 35 | 'retries': 2, 36 | 'retry_delay': timedelta(minutes=5), 37 | } 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | @dag(default_args=default_args, catchup=False, tags=['silver_cards']) 43 | def ingestor_silver(): 44 | """ 45 | Airflow DAG for ingesting data from an API, parsing it, and saving it.
46 | """ 47 | 48 | @task 49 | def silver_cards(): 50 | try: 51 | duckdb_manager = DuckDBManager() 52 | MotherDuckManager(duckdb_manager, MOTHERDUCK_TOKEN) 53 | AWSManager( 54 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 55 | ) 56 | data_manager = DataManager( 57 | duckdb_manager, 58 | LOCAL_DATABASE, 59 | REMOTE_DATABASE, 60 | SILVER_SCHEMA, 61 | TABLE_NAME, 62 | LOCAL_PATH, 63 | BRONZE_S3_PATH, 64 | SILVER_S3_PATH, 65 | ) 66 | 67 | logger.info("START CREATING TABLE FROM S3") 68 | data_manager.create_table_from_bronze() 69 | logger.info("START SAVING DATA LOCALLY") 70 | data_manager.save_to_local() 71 | logger.info("START SAVING DATA TO S3") 72 | data_manager.save_to_s3() 73 | logger.info("START SAVING DATA TO MD") 74 | data_manager.save_to_md() 75 | except Exception as e: 76 | logger.error(f"An error occurred: {e}") 77 | 78 | 79 | # Define the silver ingestion task 80 | silver_cards_task = silver_cards() 81 | 82 | # Define task to trigger the Gold DAG 83 | trigger_gold_task = TriggerDagRunOperator( 84 | task_id='trigger_gold', 85 | trigger_dag_id='ingestor_gold', 86 | wait_for_completion=True, 87 | deferrable=True, 88 | ) 89 | 90 | # Define task dependencies 91 | silver_cards_task >> trigger_gold_task 92 | 93 | 94 | # Instantiate the DAG 95 | ingestor_silver_dag = ingestor_silver() 96 | 97 | -------------------------------------------------------------------------------- /astro/README.md: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | Welcome to Astronomer! This project was generated after you ran 'astro dev init' using the Astronomer CLI. This readme describes the contents of the project, as well as how to run Apache Airflow on your local machine. 5 | 6 | Project Contents 7 | ================ 8 | 9 | Your Astro project contains the following files and folders: 10 | 11 | - dags: This folder contains the Python files for your Airflow DAGs. By default, this directory includes two example DAGs: 12 | - `example_dag_basic`: This DAG shows a simple ETL data pipeline example with three TaskFlow API tasks that run daily. 13 | - `example_dag_advanced`: This advanced DAG showcases a variety of Airflow features like branching, Jinja templates, task groups and several Airflow operators. 14 | - Dockerfile: This file contains a versioned Astro Runtime Docker image that provides a differentiated Airflow experience. If you want to execute other commands or overrides at runtime, specify them here. 15 | - include: This folder contains any additional files that you want to include as part of your project. It is empty by default. 16 | - packages.txt: Install OS-level packages needed for your project by adding them to this file. It is empty by default. 17 | - requirements.txt: Install Python packages needed for your project by adding them to this file. It is empty by default. 18 | - plugins: Add custom or community plugins for your project to this file. It is empty by default. 19 | - airflow_settings.yaml: Use this local-only file to specify Airflow Connections, Variables, and Pools instead of entering them in the Airflow UI as you develop DAGs in this project. 20 | 21 | Deploy Your Project Locally 22 | =========================== 23 | 24 | 1. Start Airflow on your local machine by running 'astro dev start'.
25 | 26 | This command will spin up 4 Docker containers on your machine, each for a different Airflow component: 27 | 28 | - Postgres: Airflow's Metadata Database 29 | - Webserver: The Airflow component responsible for rendering the Airflow UI 30 | - Scheduler: The Airflow component responsible for monitoring and triggering tasks 31 | - Triggerer: The Airflow component responsible for triggering deferred tasks 32 | 33 | 2. Verify that all 4 Docker containers were created by running 'docker ps'. 34 | 35 | Note: Running 'astro dev start' will start your project with the Airflow Webserver exposed at port 8080 and Postgres exposed at port 5432. If you already have either of those ports allocated, you can either [stop your existing Docker containers or change the port](https://docs.astronomer.io/astro/test-and-troubleshoot-locally#ports-are-not-available). 36 | 37 | 3. Access the Airflow UI for your local Airflow project. To do so, go to http://localhost:8080/ and log in with 'admin' for both your Username and Password. 38 | 39 | You should also be able to access your Postgres Database at 'localhost:5432/postgres'. 40 | 41 | Deploy Your Project to Astronomer 42 | ================================= 43 | 44 | If you have an Astronomer account, pushing code to a Deployment on Astronomer is simple. For deploying instructions, refer to Astronomer documentation: https://docs.astronomer.io/cloud/deploy-code/ 45 | 46 | Contact 47 | ======= 48 | 49 | The Astronomer CLI is maintained with love by the Astronomer team. To report a bug or suggest a change, reach out to our support. 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | # lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Data Folder 163 | data/ 164 | 165 | # Terraform environment variables 166 | terraform/.terraform* 167 | terraform/terraform.tfvars 168 | terraform/terraform.tfstate 169 | terraform/*.backup 170 | 171 | # Ruff 172 | .ruff_cache 173 | 174 | # Database 175 | *.db* -------------------------------------------------------------------------------- /docs/info_dataset.md: -------------------------------------------------------------------------------- 1 | # Information about the dataset fields 2 | 3 | Datasets columns info is listed below and also on the official site [here](https://scryfall.com/docs/api/cards). 4 | 5 | ## Cards Table 6 | 7 | | Column Name | Description | Type | 8 | |-------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| 9 | | cards_id | A unique ID for this card in Scryfall’s database. | String | 10 | | name | The name of this card. If this card has multiple faces, this field will contain both names separated by ␣//␣. 
| String | 11 | | released_at | The date this card was first released. | Date | 12 | | color_identity | This card’s color identity. | String | 13 | | set_name | This card’s full set name. | String | 14 | | artist | The name of the illustrator of this card face. Newly spoiled cards may not have this field yet. | String | 15 | | usd_price | Daily price information for this card. | Float | -------------------------------------------------------------------------------- /astro/include/ingestion/silver/cards.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class DataManager: 8 | """ 9 | Manages data operations. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | duckdb_manager, 15 | local_database: str, 16 | remote_database: str, 17 | silver_schema: str, 18 | table_name: str, 19 | local_path: str, 20 | bronze_s3_path: str, 21 | silver_s3_path: str, 22 | ): 23 | """ 24 | Initializes DataManager. 25 | 26 | Args: 27 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 28 | """ 29 | self.duckdb_manager = duckdb_manager 30 | self.local_database = local_database 31 | self.remote_database = remote_database 32 | self.silver_schema = silver_schema 33 | self.table_name = table_name 34 | self.local_path = local_path 35 | self.bronze_s3_path = bronze_s3_path 36 | self.silver_s3_path = silver_s3_path 37 | 38 | def create_table_from_bronze(self) -> None: 39 | """ 40 | Creates a table from a JSON file stored in S3. 41 | 42 | Returns: 43 | None 44 | """ 45 | try: 46 | logger.info("Creating cards table locally") 47 | query = f""" 48 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 49 | SELECT 50 | id AS card_id 51 | , name 52 | , released_at 53 | , color_identity 54 | , set_name 55 | , artist 56 | , (prices).usd AS usd_price 57 | FROM read_parquet('{self.bronze_s3_path}{self.table_name}.parquet'); 58 | """ 59 | self.duckdb_manager.execute_query(query) 60 | logger.info("Cards table created!") 61 | except Exception as e: 62 | logger.error(f"Error creating table from JSON file: {e}") 63 | 64 | def save_to_local(self) -> None: 65 | """ 66 | Saves data to local disk. 67 | 68 | Returns: 69 | None 70 | """ 71 | try: 72 | logger.info("Saving cards table as parquet format locally") 73 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 74 | query = f""" 75 | COPY ( 76 | SELECT 77 | * 78 | FROM {self.local_database}.{self.table_name} 79 | ) 80 | TO '{self.local_path}{self.table_name}.parquet' 81 | (FORMAT PARQUET) 82 | """ 83 | self.duckdb_manager.execute_query(query) 84 | logger.info("Cards table saved locally!") 85 | except Exception as e: 86 | logger.error(f"Error saving to local: {e}") 87 | 88 | def save_to_s3(self) -> None: 89 | """ 90 | Saves data to Amazon S3. 91 | 92 | Returns: 93 | None 94 | """ 95 | try: 96 | logger.info("Saving cards table to s3 as parquet") 97 | query = f""" 98 | COPY ( 99 | SELECT 100 | * 101 | FROM {self.local_database}.{self.table_name} 102 | ) 103 | TO '{self.silver_s3_path}{self.table_name}.parquet' 104 | (FORMAT PARQUET) 105 | """ 106 | self.duckdb_manager.execute_query(query) 107 | logger.info("Cards table saved to s3!") 108 | except Exception as e: 109 | logger.error(f"Error saving to S3: {e}") 110 | 111 | def save_to_md(self) -> None: 112 | """ 113 | Saves data to MotherDuck. 
114 | 115 | Returns: 116 | None 117 | """ 118 | try: 119 | logger.info("Saving cards table to Mother Duck") 120 | self.duckdb_manager.execute_query( 121 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.silver_schema};" 122 | ) 123 | query = f""" 124 | CREATE OR REPLACE TABLE {self.remote_database}.{self.silver_schema}.{self.table_name} AS 125 | SELECT 126 | * 127 | FROM {self.local_database}.{self.table_name}; 128 | """ 129 | self.duckdb_manager.execute_query(query) 130 | logger.info("Cards table saved!") 131 | except Exception as e: 132 | logger.error(f"Error saving to MotherDuck: {e}") 133 | -------------------------------------------------------------------------------- /astro/.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. **USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | 3 | from contextlib import contextmanager 4 | import logging 5 | import os 6 | 7 | import pytest 8 | 9 | from airflow.models import DagBag, Variable, Connection 10 | from airflow.hooks.base import BaseHook 11 | from airflow.utils.db import initdb 12 | 13 | # init airflow database 14 | initdb() 15 | 16 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 17 | 18 | 19 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 20 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 21 | print( 22 | f"Attempted to fetch connection during parse returning an empty Connection object for {key}" 23 | ) 24 | return Connection(key) 25 | 26 | 27 | BaseHook.get_connection = basehook_get_connection_monkeypatch 28 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 29 | 30 | 31 | # =========== MONKEYPATCH OS.GETENV() =========== 32 | def os_getenv_monkeypatch(key: str, *args, **kwargs): 33 | default = None 34 | if args: 35 | default = args[0] # os.getenv should get at most 1 arg after the key 36 | if kwargs: 37 | default = kwargs.get( 38 | "default", None 39 | ) # and sometimes kwarg if people are using the sig 40 | 41 | env_value = os.environ.get(key, None) 42 | 43 | if env_value: 44 | return env_value # if the env_value is set, return it 45 | if ( 46 | key == "JENKINS_HOME" and default is None 47 | ): # fix https://github.com/astronomer/astro-cli/issues/601 48 | return None 49 | if default: 50 | return default # otherwise return whatever default has been passed 51 | return f"MOCKED_{key.upper()}_VALUE" # if absolutely nothing has been passed - return the mocked value 52 | 53 | 54 | os.getenv = os_getenv_monkeypatch 55 | # # =========== /MONKEYPATCH OS.GETENV() =========== 56 | 57 | # =========== MONKEYPATCH VARIABLE.GET() =========== 58 | 59 | 60 | class magic_dict(dict): 61 | def __init__(self, *args, **kwargs): 62 | self.update(*args, **kwargs) 63 | 64 | def __getitem__(self, key): 65 | return {}.get(key, "MOCKED_KEY_VALUE") 66 | 67 | 68 | _no_default = object() # allow falsey defaults 69 | 70 | 71 | def variable_get_monkeypatch(key: str, default_var=_no_default, deserialize_json=False): 72 | print( 73 | f"Attempted to get Variable value during parse, returning a mocked value for {key}" 74 | ) 75 | 76 | if default_var is not _no_default: 77 | return default_var 78 | if deserialize_json: 79 | return magic_dict() 80 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 81 | 82 | 83 | Variable.get = variable_get_monkeypatch 84 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 85 | 86 | 87 | @contextmanager 88 | def 
suppress_logging(namespace): 89 | """ 90 | Suppress logging within a specific namespace to keep tests "clean" during build 91 | """ 92 | logger = logging.getLogger(namespace) 93 | old_value = logger.disabled 94 | logger.disabled = True 95 | try: 96 | yield 97 | finally: 98 | logger.disabled = old_value 99 | 100 | 101 | def get_import_errors(): 102 | """ 103 | Generate a tuple for import errors in the dag bag, and include DAGs without errors. 104 | """ 105 | with suppress_logging("airflow"): 106 | dag_bag = DagBag(include_examples=False) 107 | 108 | def strip_path_prefix(path): 109 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 110 | 111 | # Initialize an empty list to store the tuples 112 | result = [] 113 | 114 | # Iterate over the items in import_errors 115 | for k, v in dag_bag.import_errors.items(): 116 | result.append((strip_path_prefix(k), v.strip())) 117 | 118 | # Check if there are DAGs without errors 119 | for file_path in dag_bag.dags: 120 | # Check if the file_path is not in import_errors, meaning no errors 121 | if file_path not in dag_bag.import_errors: 122 | result.append((strip_path_prefix(file_path), "No import errors")) 123 | 124 | return result 125 | 126 | 127 | @pytest.mark.parametrize( 128 | "rel_path, rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 129 | ) 130 | def test_file_imports(rel_path, rv): 131 | """Test for import errors on a file""" 132 | if rv != "No import errors": 133 | # If rv is not "No import errors," consider it a failed test 134 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 135 | else: 136 | # If rv is "No import errors," consider it a passed test 137 | print(f"{rel_path} passed the import test") 138 | -------------------------------------------------------------------------------- /astro/include/ingestion/bronze/cards.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class DataManager: 8 | """ 9 | Manages data operations. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | duckdb_manager, 15 | local_database: str, 16 | remote_database: str, 17 | bronze_schema: str, 18 | table_name: str, 19 | local_path: str, 20 | raw_s3_path: str, 21 | bronze_s3_path: str, 22 | ): 23 | """ 24 | Initializes DataManager. 25 | 26 | Args: 27 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 28 | """ 29 | self.duckdb_manager = duckdb_manager 30 | self.local_database = local_database 31 | self.remote_database = remote_database 32 | self.bronze_schema = bronze_schema 33 | self.table_name = table_name 34 | self.local_path = local_path 35 | self.raw_s3_path = raw_s3_path 36 | self.bronze_s3_path = bronze_s3_path 37 | 38 | def create_table_from_json_file(self) -> None: 39 | """ 40 | Creates a table from a JSON file stored in S3. 
41 | 42 | Returns: 43 | None 44 | """ 45 | try: 46 | logger.info("Creating cards table locally") 47 | query = f""" 48 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 49 | WITH ranked_cards AS ( 50 | SELECT 51 | * 52 | , ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC) AS row_num 53 | FROM read_json_auto('{self.raw_s3_path}{self.table_name}.json') 54 | ) 55 | SELECT 56 | * EXCLUDE (row_num) 57 | FROM ranked_cards 58 | WHERE row_num = 1; 59 | """ 60 | self.duckdb_manager.execute_query(query) 61 | logger.info("Cards table created!") 62 | except Exception as e: 63 | logger.error(f"Error creating table from JSON file: {e}") 64 | 65 | def save_to_local(self) -> None: 66 | """ 67 | Saves data to local disk. 68 | 69 | Returns: 70 | None 71 | """ 72 | try: 73 | logger.info("Saving cards table as parquet format locally") 74 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 75 | query = f""" 76 | COPY ( 77 | SELECT 78 | * 79 | FROM {self.local_database}.{self.table_name} 80 | ) 81 | TO '{self.local_path}{self.table_name}.parquet' 82 | (FORMAT PARQUET) 83 | """ 84 | self.duckdb_manager.execute_query(query) 85 | logger.info("Cards table saved locally!") 86 | except Exception as e: 87 | logger.error(f"Error saving to local: {e}") 88 | 89 | def save_to_s3(self) -> None: 90 | """ 91 | Saves data to Amazon S3. 92 | 93 | Returns: 94 | None 95 | """ 96 | try: 97 | logger.info("Saving cards table to s3 as parquet") 98 | query = f""" 99 | COPY ( 100 | SELECT 101 | * 102 | FROM {self.local_database}.{self.table_name} 103 | ) 104 | TO '{self.bronze_s3_path}{self.table_name}.parquet' 105 | (FORMAT PARQUET) 106 | """ 107 | self.duckdb_manager.execute_query(query) 108 | logger.info("Cards table saved to s3!") 109 | except Exception as e: 110 | logger.error(f"Error saving to S3: {e}") 111 | 112 | def save_to_md(self) -> None: 113 | """ 114 | Saves data to MotherDuck. 115 | 116 | Returns: 117 | None 118 | """ 119 | try: 120 | logger.info("Saving cards table to Mother Duck") 121 | self.duckdb_manager.execute_query( 122 | f"CREATE DATABASE IF NOT EXISTS {self.remote_database}" 123 | ) 124 | self.duckdb_manager.execute_query( 125 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.bronze_schema};" 126 | ) 127 | query = f""" 128 | CREATE OR REPLACE TABLE {self.remote_database}.{self.bronze_schema}.{self.table_name} AS 129 | SELECT 130 | * 131 | FROM {self.local_database}.{self.table_name}; 132 | """ 133 | self.duckdb_manager.execute_query(query) 134 | logger.info("Cards table saved!") 135 | except Exception as e: 136 | logger.error(f"Error saving to MotherDuck: {e}") 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Magic: The Gathering Data Pipeline - A Data Engineering Project 2 | --- 3 | 4 | ![BATCH ELT Architecture](images/magic_the_gathering.jpg) 5 | 6 | ### About the Game 7 | 8 | **Magic: The Gathering** (colloquially known as **Magic** or **MTG**) is a tabletop and digital collectible card game created by Richard Garfield. Released in 1993 by Wizards of the Coast, Magic was the first trading card game and had approximately fifty million players as of February 2023. Over twenty billion Magic cards were produced in the period from 2008 to 2016, during which time it grew in popularity. 
As of the 2022 fiscal year, Magic generates over $1 billion in revenue annually - [Wikipedia](https://en.wikipedia.org/wiki/Magic:_The_Gathering) 9 | 10 | The goal of this project is to build an end-to-end batch data pipeline on Magic: The Gathering Data available at [Scryfall](https://scryfall.com/). In addition, ELT (Extract, Load, Transform) is performed daily in order to analyze the cards information available from the game's beginnings up to the present. 11 | 12 | ### Table of contents 13 | 14 | - [Problem statement](#problem-statement) 15 | - [Dataset](#dataset) 16 | - [Proposed Solution](#proposed-solution) 17 | - [Data Pipeline Overview](#data-pipeline-overview) 18 | - [Technologies](#technologies) 19 | - [Architecture](#architecture) 20 | - [ELT Steps](#elt-steps) 21 | - [The Dashboard](#the-dashboard) 22 | - [Conclusion](#conclusion) 23 | 24 | ## Problem statement 25 | --- 26 | * ### ***Data***: 27 | The Data selected for this project is the `Magic: The Gathering` card data obtained from [Scryfall](https://scryfall.com/). It includes the latest cards information available, with data going back to 1993, and is extracted via the Scryfall API. 28 | 29 | The columns in the Datasets and their descriptions are available [here](docs/info_dataset.md) 30 | 31 | * ### ***Proposed Solution***: 32 | This project aims at extracting this data from the source via API and building a BATCH ELT which will be scheduled to run daily and update the connected Dashboard for daily Analytics & Reporting. 33 | 34 | 35 | ## Data Pipeline Overview 36 | --- 37 | This is a Batch Pipeline which will perform ELT every day at 09:00 am. 38 | 39 | The ELT steps include: 40 | 41 | * **Extract** dataset from Scryfall via API and load the data into the Datalake 42 | * **Clean** data and load the data into Datalake 43 | * **Load** the data from Datalake into external tables in the Datawarehouse 44 | * **Transform** the data in the Datawarehouse 45 | * **Visualize** the data by creating a Dashboard 46 | 47 | ## Data Pipeline with Medallion Architecture 48 | --- 49 | * **RAW:** where the raw data is placed as soon as it is collected 50 | * **BRONZE:** data treated and ready to be consumed 51 | * **SILVER:** data processed and can be consumed easily 52 | * **GOLD:** data made available from analyses and models, which can be consumed by BI or DataViz tools 53 | 54 | ## Technologies 55 | --- 56 | * Cloud: ***AWS*** 57 | * Infrastructure as code (IaC): ***Terraform*** 58 | * Workflow orchestration: ***Astronomer + Airflow*** 59 | * Data Warehouse: ***MotherDuck*** 60 | * Batch processing: ***DuckDb*** 61 | * Data Transformation: ***dbt-core*** 62 | * DataViz: ***Preset*** 63 | * Virtual Environment: ***Poetry*** 64 | * CICD: ***Git*** 65 | 66 | ## Architecture 67 | --- 68 | Pipeline 69 | 70 | 71 | ![BATCH ELT Architecture](images/magic_the_gathering_pipeline_etl.png) 72 | 73 | 74 | ## ELT Steps 75 | 76 | Steps in the ELT are as follows: 77 | 78 | 1. A Project is created on ***GitHub*** 79 | 2. Infrastructure for the Project is created using ***Terraform*** which creates the following: 80 | * Datalake: ***S3 Bucket*** where the raw and cleaned data will be stored 81 | 3. The Pipeline for ELT is created and is scheduled for daily execution.
It is orchestrated via ***Astronomer + Airflow***, which does the following tasks: 82 | * Extracts raw data from source via ***Scryfall API*** 83 | * Loads raw data as json file to ***S3 Bucket*** 84 | * Cleans the raw data using ***DuckDb*** 85 | * Loads the cleaned data as parquet files to ***S3*** 86 | * Creates External tables in the Datasets in ***MotherDuck*** by pulling data from ***S3***. 87 | * Transforms Data from ***S3*** using ***dbt-core*** and creates the following in the dev/prod Dataset (along with Tests and Documentation) 88 | - The view: `stg_cards` 89 | - The fact table: `fact_cards` 90 | 4. Transformed Data from ***MotherDuck*** is used for Reporting and Visualization using ***Preset*** to produce Dashboards 91 | 92 | ## The Dashboard: 93 | --- 94 | 95 | ![image](images/magic_the_gathering_dashboard.jpg) 96 | 97 | ## Conclusion 98 | --- 99 | Through this project we were able to successfully build an end-to-end ELT pipeline which is scheduled to run daily. As a result we have a daily updated **MTG cards** dataset which can be visualized via the Dashboard on **Preset**. This helps us get useful insights on the latest cards information. 100 | 101 | [Back To Top](#magic-the-gathering-data-pipeline-a-data-engineering-project) 102 | -------------------------------------------------------------------------------- /astro/include/ingestion/raw/cards.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import os 4 | import sys 5 | from typing import List, Optional 6 | import requests 7 | import boto3 8 | import logging 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 11 | 12 | from models.card import Card 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class APIClient: 18 | """ 19 | Class for interacting with an API. 20 | """ 21 | 22 | def __init__(self, base_url: str, dataset: str) -> None: 23 | self.base_url = base_url 24 | self.dataset = dataset 25 | 26 | def fetch_bulk_data(self) -> Optional[requests.Response]: 27 | """ 28 | Fetches data from the API. 29 | 30 | Args: 31 | dataset (str): Name of the dataset to fetch. 32 | 33 | Returns: 34 | dict: Response JSON data. 35 | """ 36 | try: 37 | logger.info("Fetching bulk data") 38 | # Construct the URL for fetching data 39 | url = f"{self.base_url}{self.dataset}" 40 | # Send HTTP GET request to fetch data 41 | response = requests.get(url, timeout=5) 42 | # Raise an exception for HTTP errors 43 | response.raise_for_status() 44 | logger.info(f"Data fetched successfully from {url}") 45 | return response 46 | except requests.exceptions.RequestException as e: 47 | logger.error(f"Failed to fetch data: {e}") 48 | return None 49 | 50 | def fetch_cards_data( 51 | self, response: requests.Response 52 | ) -> Optional[requests.Response]: 53 | """ 54 | Fetches data from the bulk data download URI 55 | 56 | Args: 57 | response (Response): Response object from the API. 58 | 59 | Returns: 60 | dict: Response JSON data.
61 | """ 62 | try: 63 | logger.info("Fetching cards data") 64 | # Extract relevant data and update timestamp from the API response 65 | download_uri = response.json()["download_uri"] 66 | data = requests.get(download_uri) 67 | update_timestamp = response.json()["updated_at"] 68 | # Parse the update timestamp to extract the date 69 | update_date = datetime.strptime( 70 | update_timestamp, "%Y-%m-%dT%H:%M:%S.%f%z" 71 | ).date() 72 | logger.info( 73 | f"Data fetched successfully from {download_uri} - Update date: {update_date}" 74 | ) 75 | return data 76 | except KeyError as e: 77 | logger.error(f"Failed to extract data from API response: {e}") 78 | return None 79 | except ValueError as e: 80 | logger.error(f"Failed to parse timestamp: {e}") 81 | return None 82 | 83 | 84 | class DataParser: 85 | """ 86 | Class for parsing data. 87 | """ 88 | 89 | @staticmethod 90 | def parse_cards(data: requests.Response) -> Optional[List[Card]]: 91 | """ 92 | Parses JSON data into instances of the Card model. 93 | 94 | Args: 95 | data (dict): JSON data to parse. 96 | 97 | Returns: 98 | List[Card]: List of parsed Card instances. 99 | """ 100 | try: 101 | logger.info("Parsing data") 102 | cards_data = data.json() 103 | parsed_cards = [Card(**card) for card in cards_data] 104 | logger.info("Data parsing successful!") 105 | return parsed_cards 106 | except Exception as e: 107 | logger.error(f"An unexpected error occurred: {e}") 108 | return None 109 | 110 | 111 | class DataSaver: 112 | """ 113 | Class for saving data. 114 | """ 115 | 116 | def __init__( 117 | self, 118 | table_path: str, 119 | table_name: str, 120 | bucket_name: Optional[str], 121 | access_key_id: Optional[str], 122 | secret_access_key: Optional[str], 123 | ): 124 | self.table_path = table_path 125 | self.table_name = table_name 126 | self.bucket_name = bucket_name 127 | if bucket_name: 128 | self.s3_client = boto3.client( 129 | "s3", 130 | aws_access_key_id=access_key_id, 131 | aws_secret_access_key=secret_access_key, 132 | ) 133 | 134 | def save_local(self, data: List) -> None: 135 | """ 136 | Saves parsed data to a local file. 137 | 138 | Args: 139 | data (list): List of parsed data. 140 | table_path (str): Path to store local data. 141 | table_name (str): Name of the table. 142 | 143 | Returns: 144 | None 145 | """ 146 | try: 147 | logger.info("Saving data locally") 148 | path = f"{self.table_path}{self.table_name}.json" 149 | os.makedirs(os.path.dirname(path), exist_ok=True) 150 | with open(path, "w") as file: 151 | json.dump([item.dict() for item in data], file, indent=4) 152 | logger.info(f"Data saved locally to {path}") 153 | except Exception as e: 154 | logger.error(f"An error occurred while saving data locally: {e}") 155 | 156 | def save_s3(self, data: List) -> None: 157 | """ 158 | Saves parsed data to AWS S3 bucket. 159 | 160 | Args: 161 | data (list): List of parsed data. 162 | s3_client: Boto3 S3 client. 163 | bucket_name (str): Name of the AWS S3 bucket. 164 | table_path (str): Path to store data in S3. 165 | table_name (str): Name of the table. 
166 | 167 | Returns: 168 | None 169 | """ 170 | try: 171 | logger.info("Saving data to S3 bucket") 172 | json_bytes = json.dumps([item.dict() for item in data], indent=4).encode( 173 | "utf-8" 174 | ) 175 | key = f"{self.table_path}{self.table_name}.json" 176 | self.s3_client.put_object(Body=json_bytes, Bucket=self.bucket_name, Key=key) 177 | logger.info(f"Data saved successfully to S3 bucket: {self.bucket_name}") 178 | except Exception as e: 179 | logger.error(f"An error occurred while saving data to S3 bucket: {e}") 180 | -------------------------------------------------------------------------------- /ingestion/silver/cards.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import os 3 | import sys 4 | from dotenv import load_dotenv 5 | from loguru import logger 6 | from typing import Any 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 9 | 10 | from lib import duckdb_manager, motherduck_manager, aws_manager 11 | 12 | load_dotenv() 13 | 14 | # Load environment variables 15 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 16 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 17 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 18 | AWS_REGION = os.getenv("AWS_REGION") 19 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 20 | SILVER_S3_PATH = os.getenv("SILVER_S3_PATH") 21 | LOCAL_PATH = "data/silver/" 22 | TABLE_NAME = "cards" 23 | SILVER_SCHEMA = "silver" 24 | LOCAL_DATABASE = "memory" 25 | REMOTE_DATABASE = "magic_the_gathering" 26 | 27 | 28 | class DataManager: 29 | """ 30 | Manages data operations. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | duckdb_manager, 36 | local_database: str, 37 | remote_database: str, 38 | silver_schema: str, 39 | table_name: str, 40 | local_path: str, 41 | bronze_s3_path: str, 42 | silver_s3_path: str, 43 | ): 44 | """ 45 | Initializes DataManager. 46 | 47 | Args: 48 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 49 | """ 50 | self.duckdb_manager = duckdb_manager 51 | self.local_database = local_database 52 | self.remote_database = remote_database 53 | self.silver_schema = silver_schema 54 | self.table_name = table_name 55 | self.local_path = local_path 56 | self.bronze_s3_path = bronze_s3_path 57 | self.silver_s3_path = silver_s3_path 58 | 59 | def create_table_from_bronze(self) -> None: 60 | """ 61 | Creates a table from a JSON file stored in S3. 62 | 63 | Returns: 64 | None 65 | """ 66 | try: 67 | logger.info("Creating cards table locally") 68 | query = f""" 69 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 70 | SELECT 71 | id AS card_id 72 | , name 73 | , released_at 74 | , color_identity 75 | , set_name 76 | , artist 77 | , (prices).usd AS usd_price 78 | FROM read_parquet('{self.bronze_s3_path}{self.table_name}.parquet'); 79 | """ 80 | self.duckdb_manager.execute_query(query) 81 | logger.success("Cards table created!") 82 | except Exception as e: 83 | logger.error(f"Error creating table from JSON file: {e}") 84 | 85 | def save_to_local(self) -> None: 86 | """ 87 | Saves data to local disk. 
88 | 89 | Returns: 90 | None 91 | """ 92 | try: 93 | logger.info("Saving cards table as parquet format locally") 94 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 95 | query = f""" 96 | COPY ( 97 | SELECT 98 | * 99 | FROM {self.local_database}.{self.table_name} 100 | ) 101 | TO '{self.local_path}{self.table_name}.parquet' 102 | (FORMAT PARQUET) 103 | """ 104 | self.duckdb_manager.execute_query(query) 105 | logger.success("Cards table saved locally!") 106 | except Exception as e: 107 | logger.error(f"Error saving to local: {e}") 108 | 109 | def save_to_s3(self) -> None: 110 | """ 111 | Saves data to Amazon S3. 112 | 113 | Returns: 114 | None 115 | """ 116 | try: 117 | logger.info("Saving cards table to s3 as parquet") 118 | query = f""" 119 | COPY ( 120 | SELECT 121 | * 122 | FROM {self.local_database}.{self.table_name} 123 | ) 124 | TO '{self.silver_s3_path}{self.table_name}.parquet' 125 | (FORMAT PARQUET) 126 | """ 127 | self.duckdb_manager.execute_query(query) 128 | logger.success("Cards table saved to s3!") 129 | except Exception as e: 130 | logger.error(f"Error saving to S3: {e}") 131 | 132 | def save_to_md(self) -> None: 133 | """ 134 | Saves data to MotherDuck. 135 | 136 | Returns: 137 | None 138 | """ 139 | try: 140 | logger.info("Saving cards table to Mother Duck") 141 | self.duckdb_manager.execute_query( 142 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.silver_schema};" 143 | ) 144 | query = f""" 145 | CREATE OR REPLACE TABLE {self.remote_database}.{self.silver_schema}.{self.table_name} AS 146 | SELECT 147 | * 148 | FROM {self.local_database}.{self.table_name}; 149 | """ 150 | self.duckdb_manager.execute_query(query) 151 | logger.info("Cards table saved!") 152 | except Exception as e: 153 | logger.error(f"Error saving to MotherDuck: {e}") 154 | 155 | 156 | class Ingestor: 157 | """ 158 | Orchestrates the entire data ingestion process. 159 | """ 160 | 161 | def __init__( 162 | self, 163 | duckdb_manager, 164 | motherduck_manager, 165 | aws_manager, 166 | data_manager, 167 | ): 168 | """ 169 | Initializes Ingestor. 170 | """ 171 | self.duckdb_manager = duckdb_manager 172 | self.motherduck_manager = motherduck_manager 173 | self.aws_manager = aws_manager 174 | self.data_manager = data_manager 175 | 176 | def execute(self) -> None: 177 | """ 178 | Ingests data by executing the entire data ingestion process. 
179 | 180 | Returns: 181 | None 182 | """ 183 | try: 184 | logger.info("Starting ingestion") 185 | self.data_manager.create_table_from_bronze() 186 | self.data_manager.save_to_local() 187 | self.data_manager.save_to_s3() 188 | self.data_manager.save_to_md() 189 | logger.success("Ingestion completed!") 190 | except Exception as e: 191 | logger.error(f"Error executing data ingestion process: {e}") 192 | 193 | 194 | if __name__ == "__main__": 195 | # Create instances of classed 196 | duckdb_manager = duckdb_manager.DuckDBManager() 197 | motherduck_manager = motherduck_manager.MotherDuckManager(duckdb_manager, MOTHERDUCK_TOKEN) 198 | aws_manager = aws_manager.AWSManager( 199 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 200 | ) 201 | data_manager = DataManager( 202 | duckdb_manager, 203 | LOCAL_DATABASE, 204 | REMOTE_DATABASE, 205 | SILVER_SCHEMA, 206 | TABLE_NAME, 207 | LOCAL_PATH, 208 | BRONZE_S3_PATH, 209 | SILVER_S3_PATH, 210 | ) 211 | 212 | # Creating an instance of DataIngestor and execute the ingestion process 213 | ingestor = Ingestor(duckdb_manager, motherduck_manager, aws_manager, data_manager) 214 | ingestor.execute() 215 | -------------------------------------------------------------------------------- /ingestion/bronze/cards.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import os 3 | import sys 4 | from dotenv import load_dotenv 5 | from loguru import logger 6 | from typing import Any 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 9 | 10 | from lib import duckdb_manager, motherduck_manager, aws_manager 11 | 12 | load_dotenv() 13 | 14 | # Load environment variables 15 | MOTHERDUCK_TOKEN = os.getenv("MOTHERDUCK_TOKEN") 16 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 17 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 18 | AWS_REGION = os.getenv("AWS_REGION") 19 | RAW_S3_PATH = os.getenv("RAW_S3_PATH") 20 | BRONZE_S3_PATH = os.getenv("BRONZE_S3_PATH") 21 | LOCAL_PATH = "data/bronze/" 22 | TABLE_NAME = "cards" 23 | BRONZE_SCHEMA = "bronze" 24 | LOCAL_DATABASE = "memory" 25 | REMOTE_DATABASE = "magic_the_gathering" 26 | 27 | 28 | class DataManager: 29 | """ 30 | Manages data operations. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | duckdb_manager, 36 | local_database: str, 37 | remote_database: str, 38 | bronze_schema: str, 39 | table_name: str, 40 | local_path: str, 41 | raw_s3_path: str, 42 | bronze_s3_path: str, 43 | ): 44 | """ 45 | Initializes DataManager. 46 | 47 | Args: 48 | duckdb_manager (DuckDBManager): Instance of DuckDBManager. 49 | """ 50 | self.duckdb_manager = duckdb_manager 51 | self.local_database = local_database 52 | self.remote_database = remote_database 53 | self.bronze_schema = bronze_schema 54 | self.table_name = table_name 55 | self.local_path = local_path 56 | self.raw_s3_path = raw_s3_path 57 | self.bronze_s3_path = bronze_s3_path 58 | 59 | def create_table_from_json_file(self) -> None: 60 | """ 61 | Creates a table from a JSON file stored in S3. 
62 | 63 | Returns: 64 | None 65 | """ 66 | try: 67 | logger.info("Creating cards table locally") 68 | query = f""" 69 | CREATE OR REPLACE TABLE {self.local_database}.{self.table_name} AS 70 | WITH ranked_cards AS ( 71 | SELECT 72 | * 73 | , ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC) AS row_num 74 | FROM read_json_auto('{self.raw_s3_path}{self.table_name}.json') 75 | ) 76 | SELECT 77 | * EXCLUDE (row_num) 78 | FROM ranked_cards 79 | WHERE row_num = 1; 80 | """ 81 | self.duckdb_manager.execute_query(query) 82 | logger.success("Cards table created!") 83 | except Exception as e: 84 | logger.error(f"Error creating table from JSON file: {e}") 85 | 86 | def save_to_local(self) -> None: 87 | """ 88 | Saves data to local disk. 89 | 90 | Returns: 91 | None 92 | """ 93 | try: 94 | logger.info("Saving cards table as parquet format locally") 95 | os.makedirs(os.path.dirname(self.local_path), exist_ok=True) 96 | query = f""" 97 | COPY ( 98 | SELECT 99 | * 100 | FROM {self.local_database}.{self.table_name} 101 | ) 102 | TO '{self.local_path}{self.table_name}.parquet' 103 | (FORMAT PARQUET) 104 | """ 105 | self.duckdb_manager.execute_query(query) 106 | logger.success("Cards table saved locally!") 107 | except Exception as e: 108 | logger.error(f"Error saving to local: {e}") 109 | 110 | def save_to_s3(self) -> None: 111 | """ 112 | Saves data to Amazon S3. 113 | 114 | Returns: 115 | None 116 | """ 117 | try: 118 | logger.info("Saving cards table to s3 as parquet") 119 | query = f""" 120 | COPY ( 121 | SELECT 122 | * 123 | FROM {self.local_database}.{self.table_name} 124 | ) 125 | TO '{self.bronze_s3_path}{self.table_name}.parquet' 126 | (FORMAT PARQUET) 127 | """ 128 | self.duckdb_manager.execute_query(query) 129 | logger.success("Cards table saved to s3!") 130 | except Exception as e: 131 | logger.error(f"Error saving to S3: {e}") 132 | 133 | def save_to_md(self) -> None: 134 | """ 135 | Saves data to MotherDuck. 136 | 137 | Returns: 138 | None 139 | """ 140 | try: 141 | logger.info("Saving cards table to Mother Duck") 142 | self.duckdb_manager.execute_query( 143 | f"CREATE DATABASE IF NOT EXISTS {self.remote_database}" 144 | ) 145 | self.duckdb_manager.execute_query( 146 | f"CREATE SCHEMA IF NOT EXISTS {self.remote_database}.{self.bronze_schema};" 147 | ) 148 | query = f""" 149 | CREATE OR REPLACE TABLE {self.remote_database}.{self.bronze_schema}.{self.table_name} AS 150 | SELECT 151 | * 152 | FROM {self.local_database}.{self.table_name}; 153 | """ 154 | self.duckdb_manager.execute_query(query) 155 | logger.info("Cards table saved!") 156 | except Exception as e: 157 | logger.error(f"Error saving to MotherDuck: {e}") 158 | 159 | 160 | class Ingestor: 161 | """ 162 | Orchestrates the entire data ingestion process. 163 | """ 164 | 165 | def __init__( 166 | self, 167 | duckdb_manager, 168 | motherduck_manager, 169 | aws_manager, 170 | data_manager, 171 | ): 172 | """ 173 | Initializes Ingestor. 174 | """ 175 | self.duckdb_manager = duckdb_manager 176 | self.motherduck_manager = motherduck_manager 177 | self.aws_manager = aws_manager 178 | self.data_manager = data_manager 179 | 180 | def execute(self) -> None: 181 | """ 182 | Ingests data by executing the entire data ingestion process. 
183 | 184 | Returns: 185 | None 186 | """ 187 | try: 188 | logger.info("Starting ingestion") 189 | self.data_manager.create_table_from_json_file() 190 | self.data_manager.save_to_local() 191 | self.data_manager.save_to_s3() 192 | self.data_manager.save_to_md() 193 | logger.success("Ingestion completed!") 194 | except Exception as e: 195 | logger.error(f"Error executing data ingestion process: {e}") 196 | 197 | 198 | if __name__ == "__main__": 199 | # Create instances of classed 200 | duckdb_manager = duckdb_manager.DuckDBManager() 201 | motherduck_manager = motherduck_manager.MotherDuckManager( 202 | duckdb_manager, MOTHERDUCK_TOKEN 203 | ) 204 | aws_manager = aws_manager.AWSManager( 205 | duckdb_manager, AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 206 | ) 207 | data_manager = DataManager( 208 | duckdb_manager, 209 | LOCAL_DATABASE, 210 | REMOTE_DATABASE, 211 | BRONZE_SCHEMA, 212 | TABLE_NAME, 213 | LOCAL_PATH, 214 | RAW_S3_PATH, 215 | BRONZE_S3_PATH, 216 | ) 217 | 218 | # Creating an instance of DataIngestor and execute the ingestion process 219 | ingestor = Ingestor(duckdb_manager, motherduck_manager, aws_manager, data_manager) 220 | ingestor.execute() 221 | -------------------------------------------------------------------------------- /ingestion/raw/cards.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import os 4 | import sys 5 | from typing import Any, Dict, List, Optional 6 | import requests 7 | import boto3 8 | from dotenv import load_dotenv 9 | from loguru import logger 10 | 11 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) 12 | 13 | from models.card import Card 14 | 15 | 16 | # Load environment variables 17 | load_dotenv() 18 | 19 | # Configuration 20 | API_BASE_URL = "https://api.scryfall.com/bulk-data/" 21 | DATASET_NAME = "default_cards" 22 | TABLE_NAME = "cards" 23 | TABLE_PATH = "data/raw/" 24 | AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME") 25 | AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY") 26 | AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") 27 | 28 | 29 | class APIClient: 30 | """ 31 | Class for interacting with an API. 32 | """ 33 | 34 | def __init__(self, base_url: str, dataset: str) -> None: 35 | self.base_url = base_url 36 | self.dataset = dataset 37 | 38 | def fetch_bulk_data(self) -> Optional[requests.Response]: 39 | """ 40 | Fetches data from the API. 41 | 42 | Args: 43 | dataset (str): Name of the dataset to fetch. 44 | 45 | Returns: 46 | dict: Response JSON data. 47 | """ 48 | try: 49 | logger.info("Fetching bulk data") 50 | # Construct the URL for fetching data 51 | url = f"{self.base_url}{self.dataset}" 52 | # Send HTTP GET request to fetch data 53 | response = requests.get(url, timeout=5) 54 | # Raise an exception for HTTP errors 55 | response.raise_for_status() 56 | logger.success(f"Data fetched successfully from {url}") 57 | return response 58 | except requests.exceptions.RequestException as e: 59 | logger.error(f"Failed to fetch data: {e}") 60 | return None 61 | 62 | def fetch_cards_data( 63 | self, response: requests.Response 64 | ) -> Optional[requests.Response]: 65 | """ 66 | Fetchs data from the bulk data download uri 67 | 68 | Args: 69 | response (Response): Response object from the API. 70 | 71 | Returns: 72 | dict: Response JSON data. 
73 | """ 74 | try: 75 | logger.info("Fetching cards data") 76 | # Extract relevant data and update timestamp from the API response 77 | download_uri = response.json()["download_uri"] 78 | data = requests.get(download_uri) 79 | update_timestamp = response.json()["updated_at"] 80 | # Parse the update timestamp to extract the date 81 | update_date = datetime.strptime( 82 | update_timestamp, "%Y-%m-%dT%H:%M:%S.%f%z" 83 | ).date() 84 | logger.success( 85 | f"Data fetched successfully from {download_uri} - Update date: {update_date}" 86 | ) 87 | return data 88 | except KeyError as e: 89 | logger.error(f"Failed to extract data from API response: {e}") 90 | return None 91 | except ValueError as e: 92 | logger.error(f"Failed to parse timestamp: {e}") 93 | return None 94 | 95 | 96 | class DataParser: 97 | """ 98 | Class for parsing data. 99 | """ 100 | 101 | @staticmethod 102 | def parse_cards(data: requests.Response) -> Optional[List[Card]]: 103 | """ 104 | Parses JSON data into instances of the Card model. 105 | 106 | Args: 107 | data (dict): JSON data to parse. 108 | 109 | Returns: 110 | List[Card]: List of parsed Card instances. 111 | """ 112 | try: 113 | logger.info("Parsing data") 114 | cards_data = data.json() 115 | parsed_cards = [Card(**card) for card in cards_data] 116 | logger.success("Data parsing successful!") 117 | return parsed_cards 118 | except Exception as e: 119 | logger.error(f"An unexpected error occurred: {e}") 120 | return None 121 | 122 | 123 | class DataSaver: 124 | """ 125 | Class for saving data. 126 | """ 127 | 128 | def __init__( 129 | self, 130 | table_path: str, 131 | table_name: str, 132 | bucket_name: Optional[str], 133 | access_key_id: Optional[str], 134 | secret_access_key: Optional[str], 135 | ): 136 | self.table_path = table_path 137 | self.table_name = table_name 138 | self.bucket_name = bucket_name 139 | if bucket_name: 140 | self.s3_client = boto3.client( 141 | "s3", 142 | aws_access_key_id=access_key_id, 143 | aws_secret_access_key=secret_access_key, 144 | ) 145 | 146 | def save_local(self, data: List) -> None: 147 | """ 148 | Saves parsed data to a local file. 149 | 150 | Args: 151 | data (list): List of parsed data. 152 | table_path (str): Path to store local data. 153 | table_name (str): Name of the table. 154 | 155 | Returns: 156 | None 157 | """ 158 | try: 159 | logger.info("Saving data locally") 160 | path = f"{self.table_path}{self.table_name}.json" 161 | os.makedirs(os.path.dirname(path), exist_ok=True) 162 | with open(path, "w") as file: 163 | json.dump([item.dict() for item in data], file, indent=4) 164 | logger.success(f"Data saved locally to {path}") 165 | except Exception as e: 166 | logger.error(f"An error occurred while saving data locally: {e}") 167 | 168 | def save_s3(self, data: List) -> None: 169 | """ 170 | Saves parsed data to AWS S3 bucket. 171 | 172 | Args: 173 | data (list): List of parsed data. 174 | s3_client: Boto3 S3 client. 175 | bucket_name (str): Name of the AWS S3 bucket. 176 | table_path (str): Path to store data in S3. 177 | table_name (str): Name of the table. 
178 | 179 | Returns: 180 | None 181 | """ 182 | try: 183 | logger.info("Saving data to S3 bucket") 184 | json_bytes = json.dumps([item.dict() for item in data], indent=4).encode( 185 | "utf-8" 186 | ) 187 | key = f"{self.table_path}{self.table_name}.json" 188 | self.s3_client.put_object(Body=json_bytes, Bucket=self.bucket_name, Key=key) 189 | logger.success(f"Data saved successfully to S3 bucket: {self.bucket_name}") 190 | except Exception as e: 191 | logger.error(f"An error occurred while saving data to S3 bucket: {e}") 192 | 193 | 194 | class Ingestor: 195 | """ 196 | Class for ingesting data from an API, parsing it, and saving it. 197 | """ 198 | 199 | def __init__(self, api_client, data_parser, data_saver): 200 | self.api_client = api_client 201 | self.data_parser = data_parser 202 | self.data_saver = data_saver 203 | 204 | def execute(self) -> None: 205 | """ 206 | Executes the ingestion process. 207 | 208 | Args: 209 | dataset (str): Name of the dataset to fetch. 210 | """ 211 | # Fetch data from the API 212 | try: 213 | logger.info("Starting ingestion") 214 | bulk_data = self.api_client.fetch_bulk_data() 215 | cards_data = self.api_client.fetch_cards_data(bulk_data) 216 | 217 | if cards_data: 218 | # Parse data 219 | parsed_data = self.data_parser.parse_cards(cards_data) 220 | if parsed_data: 221 | # Save data locally 222 | self.data_saver.save_local(parsed_data) 223 | # Save data to S3 224 | self.data_saver.save_s3(parsed_data) 225 | logger.success("Ingestion completed!") 226 | except Exception as e: 227 | logger.error(f"Error executing data ingestion process: {e}") 228 | 229 | 230 | if __name__ == "__main__": 231 | # Create instances of classes 232 | api_client = APIClient(API_BASE_URL, DATASET_NAME) 233 | data_parser = DataParser() 234 | data_saver = DataSaver( 235 | TABLE_PATH, TABLE_NAME, AWS_BUCKET_NAME, AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY 236 | ) 237 | 238 | # Create an instance of Ingestor and execute the ingestion process 239 | ingestor = Ingestor(api_client, data_parser, data_saver) 240 | ingestor.execute() 241 | --------------------------------------------------------------------------------
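
As a companion to the ingestion code above, here is a minimal, self-contained sketch of the raw-to-bronze deduplication step that `ingestion/bronze/cards.py` performs with DuckDB. It is illustrative only: the card sample is made up, local files stand in for the project's `RAW_S3_PATH`/`BRONZE_S3_PATH` and MotherDuck targets, and it assumes nothing beyond the `duckdb` Python package being installed.

```python
import json

import duckdb

# Tiny made-up sample standing in for the raw Scryfall dump (data/raw/cards.json).
sample_cards = [
    {"id": "abc-1", "name": "Llanowar Elves", "released_at": "1993-08-05", "set_name": "Limited Edition Alpha"},
    {"id": "abc-1", "name": "Llanowar Elves", "released_at": "2018-07-13", "set_name": "Core Set 2019"},
    {"id": "def-2", "name": "Counterspell", "released_at": "1993-08-05", "set_name": "Limited Edition Alpha"},
]
with open("cards.json", "w") as file:
    json.dump(sample_cards, file, indent=4)

# In-memory DuckDB connection, mirroring LOCAL_DATABASE = "memory" in the ingestion scripts.
con = duckdb.connect()

# Same pattern as DataManager.create_table_from_json_file: keep one row per card id,
# preferring the most recent released_at value.
con.execute("""
    CREATE OR REPLACE TABLE cards AS
    WITH ranked_cards AS (
        SELECT
            *
            , ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC) AS row_num
        FROM read_json_auto('cards.json')
    )
    SELECT * EXCLUDE (row_num)
    FROM ranked_cards
    WHERE row_num = 1;
""")

# Same pattern as DataManager.save_to_local / save_to_s3: export the table as Parquet.
con.execute("COPY (SELECT * FROM cards) TO 'cards.parquet' (FORMAT PARQUET)")

print(con.execute("SELECT id, name, released_at FROM cards ORDER BY id").fetchall())
```

The `ROW_NUMBER() OVER (PARTITION BY id ORDER BY released_at DESC)` window is what guarantees the bronze table keeps a single, most recent row per card id before the data moves on to the silver layer and the dbt models.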