├── src ├── __init__.py ├── transform │ ├── .gitkeep │ └── dbt_aoe │ │ ├── macros │ │ ├── .gitkeep │ │ ├── generate_schema_name.sql │ │ ├── filter_load.sql │ │ └── deduplicate_by_key.sql │ │ ├── seeds │ │ ├── .gitkeep │ │ ├── seeds.yml │ │ └── country_list.csv │ │ ├── tests │ │ ├── .gitkeep │ │ ├── assert_countrys_mapped.sql │ │ └── generic │ │ │ ├── test_within_threshold.sql │ │ │ └── test_recent_ldts.sql │ │ ├── analyses │ │ └── .gitkeep │ │ ├── snapshots │ │ └── .gitkeep │ │ ├── .gitignore │ │ ├── .user.yml │ │ ├── packages.yml │ │ ├── package-lock.yml │ │ ├── models │ │ ├── bronze │ │ │ ├── v_relic_raw.sql │ │ │ ├── v_matches_raw.sql │ │ │ ├── v_players_raw.sql │ │ │ ├── players_br.sql │ │ │ ├── ext_table_schema.yml │ │ │ ├── dim_date_br.py │ │ │ ├── matches_br.sql │ │ │ ├── statgroup_br.sql │ │ │ ├── leaderboards_br.sql │ │ │ └── bronze_schema.yml │ │ ├── gold │ │ │ ├── dim_date.sql │ │ │ ├── dim_civ.sql │ │ │ ├── dim_player.sql │ │ │ ├── dim_match.sql │ │ │ ├── fact_player_matches.sql │ │ │ └── gold_schema.yml │ │ ├── silver │ │ │ ├── player_match_sr.sql │ │ │ ├── matches_sr.sql │ │ │ ├── player_leaderboard_stats_sr.sql │ │ │ └── silver_schema.yml │ │ └── Medallion_README.md │ │ ├── profiles.yml.template │ │ └── dbt_project.yml ├── load │ ├── __init__.py │ └── loader.py ├── extract │ ├── __init__.py │ ├── filter.py │ └── models.py ├── config.yaml ├── set_load_master.py ├── elt_relic_api.py ├── elt_metadata.py ├── elt_stat_matches.py ├── elt_stat_players.py ├── project_tests.py └── utils.py ├── .astro ├── config.yaml ├── dag_integrity_exceptions.txt └── test_dag_integrity_default.py ├── .dockerignore ├── README_resources ├── aoe_counter_pick.PNG ├── aoe_leaderboard.PNG ├── dbt_dag_airflow.PNG ├── project_all_dag.PNG ├── aoe_civ_performance.PNG ├── db_diagram_aoe_model.PNG └── aoe2project_data_pipeline.PNG ├── Dockerfile ├── requirements.txt ├── .gitignore ├── dags ├── dbt_dag.py ├── elt_metadata_dag.py ├── elt_relic_api_dag.py ├── set_load_master_dag.py ├── elt_stat_matches_dag.py ├── elt_stat_players_dag.py └── all_project_dag.py ├── dbdiagram.txt ├── .github └── workflows │ ├── ci.yaml │ └── cd.yaml └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: project 3 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/.user.yml: -------------------------------------------------------------------------------- 1 | id: ab0b036d-20c8-4235-b8a8-7364d501df69 2 | -------------------------------------------------------------------------------- /.astro/dag_integrity_exceptions.txt: -------------------------------------------------------------------------------- 1 | # Add dag files to exempt from parse test below. ex: dags/ -------------------------------------------------------------------------------- /src/transform/dbt_aoe/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_external_tables 3 | version: 0.9.0 -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | .venv 7 | airflow.db 8 | airflow.cfg 9 | -------------------------------------------------------------------------------- /README_resources/aoe_counter_pick.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe_counter_pick.PNG -------------------------------------------------------------------------------- /README_resources/aoe_leaderboard.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe_leaderboard.PNG -------------------------------------------------------------------------------- /README_resources/dbt_dag_airflow.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/dbt_dag_airflow.PNG -------------------------------------------------------------------------------- /README_resources/project_all_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/project_all_dag.PNG -------------------------------------------------------------------------------- /src/load/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import load_json_data, load_parquet_data 2 | 3 | __all__ = ["load_json_data", "load_parquet_data"] 4 | -------------------------------------------------------------------------------- /README_resources/aoe_civ_performance.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe_civ_performance.PNG -------------------------------------------------------------------------------- /README_resources/db_diagram_aoe_model.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/db_diagram_aoe_model.PNG -------------------------------------------------------------------------------- /README_resources/aoe2project_data_pipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe2project_data_pipeline.PNG -------------------------------------------------------------------------------- /src/transform/dbt_aoe/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_external_tables 3 | version: 0.9.0 4 | sha1_hash: f459fa5ae8276125d72b0ae3ae38baf5847fbb81 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/astronomer/astro-runtime:12.3.0 2 | 3 | RUN python -m venv dbt_venv && source dbt_venv/bin/activate && \ 4 | pip install --no-cache-dir dbt-snowflake && deactivate -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/assert_countrys_mapped.sql: -------------------------------------------------------------------------------- 1 | --DBT Tests pass if no rows are returned. 2 | --Hence, write the test assertion to find failing rows. 3 | 4 | SELECT 5 | country_name 6 | FROM 7 | {{ ref('player_leaderboard_stats_sr') }} 8 | WHERE 9 | country_name = 'Unknown' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.0 2 | pandas==2.2.2 3 | pyarrow==18.0.0 4 | fastparquet==2024.5.0 5 | backoff==2.2.1 6 | boto3==1.24.28 7 | pydantic==2.9.2 8 | pytest==7.4.0 9 | python-dotenv==1.0.1 10 | PyYAML==6.0.2 11 | Requests==2.32.3 12 | astronomer-cosmos 13 | apache-airflow-providers-snowflake 14 | snowflake-connector-python==3.7.0 15 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {%- set default_schema = target.schema -%} 4 | {%- if custom_schema_name is none -%} 5 | 6 | {{ default_schema }} 7 | 8 | {%- else -%} 9 | 10 | {{ custom_schema_name | trim }} 11 | 12 | {%- endif -%} 13 | 14 | {%- endmacro %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/v_relic_raw.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | 4 | --NOTE: Due to the 2 list objects in the json schema, we will need to dedup the individual arrays 5 | SELECT DISTINCT 6 | * 7 | ,metadata$filename AS rsrc 8 | ,metadata$file_last_modified AS ldts 9 | ,'RELIC_LINK_API'::VARCHAR as source 10 | FROM 11 | {{ source('aoe_ext', 'relic_ext') }} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/seeds/seeds.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | seeds: 4 | - name: country_list 5 | description: 'list of country codes and names for 
aoe2 players' 6 | docs: 7 | show: true 8 | node_color: purple 9 | columns: 10 | - name: Country 11 | description: 'Name of country' 12 | - name: Country_code 13 | description: 'Country codes in lowercase' -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/v_matches_raw.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | 4 | SELECT DISTINCT 5 | * 6 | ,metadata$filename AS rsrc 7 | ,metadata$file_last_modified AS ldts 8 | ,'AOESTATS'::VARCHAR as source 9 | ,SPLIT(metadata$filename, '/')[2]::DATE AS file_date 10 | FROM 11 | {{ source('aoe_ext', 'matches_ext') }} 12 | WHERE 1=1 13 | {{ filter_load('file_date') }} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/v_players_raw.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | 4 | SELECT DISTINCT 5 | * 6 | ,metadata$filename AS rsrc 7 | ,metadata$file_last_modified AS ldts 8 | ,'AOESTATS'::VARCHAR as source 9 | ,SPLIT(metadata$filename, '/')[2]::DATE AS file_date 10 | FROM 11 | {{ source('aoe_ext', 'players_ext') }} 12 | WHERE 1=1 13 | {{ filter_load('file_date') }} 14 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_date.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='date_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | SELECT 11 | TO_VARCHAR(date, 'YYYYMMDD')::int as date_pk 12 | ,date::date as date 13 | ,year 14 | ,month 15 | ,day 16 | ,day_of_week 17 | ,is_weekend 18 | FROM 19 | {{ ref('dim_date_br') }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | metadata.json 4 | todo.txt 5 | extract_old.py 6 | .env 7 | archived_data 8 | .mypy_cache 9 | dbt_commands.txt 10 | my_dbt_ext_table.sql 11 | sf_setup.sql 12 | project/transform/logs 13 | relic_link_testing 14 | logs 15 | packages.txt 16 | test_dag_example.py 17 | include 18 | plugins 19 | .venv 20 | astro 21 | .git 22 | airflow_settings.yaml 23 | logs/ 24 | airflow.db 25 | airflow.cfg 26 | profiles.yml 27 | .pytest_cache 28 | .astro 29 | tree.py -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_civ.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='civ_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | SELECT DISTINCT 11 | MD5(civ) as civ_pk 12 | ,civ as civ_name 13 | ,''::TEXT as civ_weaknesses --placeholder, to be filled in later 14 | ,''::TEXT as civ_strengths --placeholder, to be filled in later 15 | ,CURRENT_DATE() AS load_date 16 | FROM 17 | {{ ref('players_br') }} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/filter_load.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro filter_load(file_date) %} 3 | {%- set env = target.name -%} 4 | 5 | AND {{ file_date }} > ( 6 | SELECT 7 | load_start_date 8 | FROM 9 | aoe.control.load_master 10 | WHERE 11 | environment = '{{ env 
}}' 12 | ) 13 | AND {{ file_date }} <= ( 14 | SELECT 15 | load_end_date 16 | FROM 17 | aoe.control.load_master 18 | WHERE 19 | environment = '{{ env }}' 20 | ) 21 | {% endmacro %} -------------------------------------------------------------------------------- /src/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import ApiSchema, WeeklyDump, RelicResponse, Matches, Players 2 | from .filter import ( 3 | validate_json_schema, 4 | generate_weekly_queries, 5 | create_stats_endpoints, 6 | validate_parquet_schema, 7 | fetch_relic_chunk, 8 | ) 9 | 10 | __all__ = [ 11 | "ApiSchema", 12 | "WeeklyDump", 13 | "Matches", 14 | "Players", 15 | "RelicResponse", 16 | "validate_json_schema", 17 | "generate_weekly_queries", 18 | "create_stats_endpoints", 19 | "validate_parquet_schema", 20 | "fetch_relic_chunk", 21 | ] 22 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_player.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | deduplicated AS ( 6 | {{ deduplicate_by_key('_ref(player_leaderboard_stats_sr)', 'CONCAT(profile_id,source)', 'ldts') }} 7 | ) 8 | SELECT 9 | MD5(CONCAT(profile_id::TEXT,'~',source)) as player_pk 10 | ,profile_id 11 | ,gaming_name 12 | ,country_code 13 | ,country_name 14 | ,statgroup_id 15 | ,wins 16 | ,losses 17 | ,current_rank 18 | ,current_rating 19 | ,last_match_date 20 | ,CURRENT_DATE() AS load_date 21 | ,source 22 | FROM 23 | deduplicated -------------------------------------------------------------------------------- /src/transform/dbt_aoe/profiles.yml.template: -------------------------------------------------------------------------------- 1 | your_project_name: 2 | target: dev 3 | outputs: 4 | dev: 5 | account: YOUR_ACCOUNT 6 | database: YOUR_DATABASE 7 | password: YOUR_PASSWORD 8 | role: YOUR_ROLE 9 | schema: YOUR_SCHEMA 10 | threads: 4 11 | type: snowflake 12 | user: YOUR_USERNAME 13 | warehouse: YOUR_WAREHOUSE 14 | prod: 15 | account: YOUR_ACCOUNT 16 | database: YOUR_DATABASE 17 | password: YOUR_PASSWORD 18 | role: YOUR_ROLE 19 | schema: YOUR_SCHEMA 20 | threads: 4 21 | type: snowflake 22 | user: YOUR_USERNAME 23 | warehouse: YOUR_WAREHOUSE -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/deduplicate_by_key.sql: -------------------------------------------------------------------------------- 1 | -- This macro accepts both CTE's and table references. 
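-- It keeps only the newest row per key: ROW_NUMBER() partitioned by 'pk', ordered by 'date_field' DESC.
-- e.g. deduplicate_by_key('flattened', 'profile_id', 'ldts'), as used in statgroup_br.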
2 | -- The parameter 'tbl' expects a string, however we can manipulate this to use the 'ref' macro 3 | -- We can pass a ref{} table by specifying the parameter in the format '_ref({table_name})' 4 | -- Example: '_ref(v_relic_raw)' 5 | {% macro deduplicate_by_key(tbl, pk, date_field) %} 6 | SELECT 7 | * 8 | FROM 9 | {% if tbl is string and tbl.startswith('_ref') %} 10 | {{ ref(tbl[5:-1]) }} 11 | {% else %} 12 | {{ tbl }} 13 | {% endif %} 14 | QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ pk }} ORDER BY {{ date_field }} DESC) = 1 15 | {% endmacro %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/player_match_sr.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='id', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | WITH cte as ( 11 | SELECT 12 | MD5(CONCAT(game_id,'~',profile_id)) as id 13 | ,game_id 14 | ,team 15 | ,profile_id 16 | ,civ 17 | ,winner 18 | ,match_rating_diff 19 | ,new_rating 20 | ,old_rating 21 | ,source 22 | ,file_date 23 | FROM 24 | {{ ref('players_br') }} 25 | WHERE 26 | profile_id IS NOT NULL --0.14% null from source, assume DQ issue. 27 | ) 28 | 29 | SELECT * FROM cte 30 | {% if is_incremental() %} 31 | where file_date > (select max(file_date) from {{ this }}) 32 | {% endif %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/players_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | * 8 | FROM 9 | {{ ref('v_players_raw') }} 10 | ) 11 | SELECT 12 | value: civ::VARCHAR AS civ 13 | ,value:game_id::INT AS game_id 14 | ,value:match_rating_diff::INT AS match_rating_diff 15 | ,value:new_rating::INT AS new_rating 16 | ,value:old_rating::INT AS old_rating 17 | ,value:profile_id::INT AS profile_id 18 | ,value:replay_summary_raw::ARRAY AS replay_summary_raw 19 | ,value:team::VARCHAR AS team 20 | ,value:winner::BOOLEAN AS winner 21 | ,rsrc::VARCHAR AS rsrc 22 | ,ldts::TIMESTAMP_NTZ(9) AS ldts 23 | ,source 24 | ,file_date 25 | FROM 26 | landing_data -------------------------------------------------------------------------------- /src/load/loader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import io 3 | from utils import upload_to_s3 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def load_json_data(model, file_dir, fn, bucket, s3): 10 | """Loads Pydantic model as json files in S3.""" 11 | file_name = f"{file_dir}/{fn}.json" 12 | json_data = model.model_dump_json(indent=4) 13 | json_to_bytes = io.BytesIO(json_data.encode("utf-8")) 14 | with json_to_bytes as file_obj: 15 | upload_to_s3(s3, file_obj, bucket, file_name) 16 | 17 | 18 | def load_parquet_data(data, file_dir, fn, bucket, s3): 19 | """Loads parquet data directly into S3 bucket.""" 20 | file_name = f"{file_dir}/{fn}.parquet" 21 | with data as file_obj: 22 | upload_to_s3(s3, file_obj, bucket, file_name) 23 | -------------------------------------------------------------------------------- /dags/dbt_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | from cosmos import DbtDag, ProjectConfig, ProfileConfig, ExecutionConfig 4 | 5 | profile_config = ProfileConfig( 6 | profile_name="dbt_aoe", 
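    # 'dbt_aoe' matches the profile declared in dbt_project.yml; connection details come from
    # profiles.yml (git-ignored, presumably created from profiles.yml.template).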
7 | target_name="dev", 8 | profiles_yml_filepath=f"{os.environ['AIRFLOW_HOME']}/src/transform/dbt_aoe/profiles.yml", 9 | ) 10 | 11 | dag = DbtDag( 12 | profile_config=profile_config, 13 | project_config=ProjectConfig( 14 | f"{os.environ['AIRFLOW_HOME']}/src/transform/dbt_aoe", 15 | ), 16 | operator_args={"install_deps": True}, 17 | execution_config=ExecutionConfig( 18 | dbt_executable_path=f"{os.environ['AIRFLOW_HOME']}/dbt_venv/bin/dbt", 19 | ), 20 | schedule_interval=timedelta(days=7), 21 | start_date=datetime(2023, 9, 10), 22 | catchup=False, 23 | dag_id="dbt_dag_id", 24 | ) 25 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/generic/test_within_threshold.sql: -------------------------------------------------------------------------------- 1 | {% test within_threshold(model, column_name, lower_threshold, upper_threshold, error_lim) %} 2 | 3 | 4 | WITH 5 | test_def as ( 6 | SELECT 7 | {{ column_name }} as test_field 8 | FROM 9 | {{ model }} 10 | ) 11 | ,perform_test as ( 12 | SELECT 13 | * 14 | FROM 15 | test_def 16 | WHERE 17 | test_field <= {{ lower_threshold }} 18 | OR test_field >= {{ upper_threshold }} 19 | ) 20 | ,validation_errors AS ( 21 | SELECT 22 | COUNT(*) AS n_errors 23 | FROM 24 | perform_test 25 | ) 26 | SELECT 27 | CASE WHEN n_errors > {{error_lim}} THEN 'fail' 28 | ELSE 'pass' 29 | END AS result 30 | FROM 31 | validation_errors 32 | WHERE 33 | result <> 'pass' --Test expected to pass, trigger 'failed test' otherwise 34 | 35 | {% endtest %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/generic/test_recent_ldts.sql: -------------------------------------------------------------------------------- 1 | {% test recent_ldts(model, column_name, back_date_days) %} 2 | {{ config(severity = 'warn') }} 3 | 4 | 5 | WITH 6 | test_def as ( 7 | SELECT 8 | MAX({{ column_name }}) as test_field 9 | FROM 10 | {{ model }} 11 | ) 12 | ,perform_test as ( 13 | SELECT 14 | * 15 | FROM 16 | test_def 17 | WHERE 18 | test_field >= (CURRENT_DATE()-{{back_date_days}}) 19 | ) 20 | ,validation_errors AS ( 21 | SELECT 22 | COUNT(*) AS n_errors 23 | FROM 24 | perform_test 25 | ) 26 | SELECT 27 | --If there is no max(ldts) > backdate_date, then fail 28 | CASE WHEN n_errors = 0 THEN 'fail' 29 | ELSE 'pass' 30 | END AS result 31 | FROM 32 | validation_errors 33 | WHERE 34 | result <> 'pass' --Test expected to pass, trigger 'failed test' otherwise 35 | 36 | {% endtest %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | name: 'dbt_aoe' 4 | version: '1.0.0' 5 | 6 | # This setting configures which "profile" dbt uses for this project. 7 | profile: 'dbt_aoe' 8 | 9 | # These configurations specify where dbt should look for different types of files. 
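# (These paths are resolved relative to this dbt_project.yml.)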
10 | model-paths: ["models"] 11 | analysis-paths: ["analyses"] 12 | test-paths: ["tests"] 13 | seed-paths: ["seeds"] 14 | macro-paths: ["macros"] 15 | snapshot-paths: ["snapshots"] 16 | 17 | clean-targets: # directories to be removed by `dbt clean` 18 | - "target" 19 | - "dbt_packages" 20 | 21 | models: 22 | dbt_aoe: 23 | # Config indicated by + and applies to all files 24 | bronze: 25 | +materialized: table 26 | +schema: bronze 27 | silver: 28 | +materialized: table 29 | +schema: silver 30 | gold: 31 | +materialized: table 32 | +schema: gold -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/ext_table_schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: aoe_ext 5 | loader: S3 6 | 7 | tables: 8 | - name: matches_ext 9 | description: "External table of AOE matches, stored as parquet files" 10 | external: 11 | location: "@s3_stage/{{target.name}}/matches" 12 | file_format: parquet_ff 13 | auto_refresh: true 14 | 15 | - name: players_ext 16 | description: "External table of AOE players, stored as parquet files" 17 | external: 18 | location: "@s3_stage/{{target.name}}/players" 19 | file_format: parquet_ff 20 | auto_refresh: true 21 | 22 | - name: relic_ext 23 | description: "External table of relic API data, stored as json files" 24 | external: 25 | location: "@s3_stage/{{target.name}}/rl_api" 26 | file_format: json_ff 27 | auto_refresh: true -------------------------------------------------------------------------------- /src/config.yaml: -------------------------------------------------------------------------------- 1 | # parse_date params (update here for 1 time loads or backdating) 2 | backdate_days_start: 90 3 | backdate_days_end: 1 4 | target_run_date: ~ 5 | target_run_end_date: ~ 6 | date_format: '%Y-%m-%d' 7 | 8 | # AoeStats 9 | stats_dir_url: '/media/db_dumps/date_range%3D' 10 | matches_fn_suffix: 'matches' 11 | players_fn_suffix: 'players' 12 | stats_base_url: https://aoestats.io 13 | matches_folder_name: dev/matches 14 | players_folder_name: dev/players 15 | 16 | metadata_endpoint: /api/db_dumps.json 17 | metadata_fn_suffix: metadata 18 | metadata_folder_name: dev/meta 19 | 20 | # RelicLink 21 | relic_base_url: https://aoe-api.worldsedgelink.com 22 | relic_endpoint: '/community/leaderboard/getLeaderBoard2?' 23 | relic_params: 24 | leaderboard_id: 3 25 | title: age2 26 | sort_by: 0 27 | chunk_size: 100 28 | relic_fn_suffix: relic 29 | relic_folder_name: dev/rl_api 30 | 31 | # s3 bucket. Other bucket params are in .env file. 
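# (Assumed: .env supplies standard AWS credentials, e.g. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, for the boto3 session.)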
32 | bucket: aoe2dumps -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_match.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='match_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | WITH 11 | deduplicated AS ( 12 | {{ deduplicate_by_key('_ref(matches_sr)', 'CONCAT(game_id,source)', 'ldts') }} 13 | ) 14 | ,cte AS ( 15 | SELECT 16 | MD5(CONCAT(game_id::TEXT,'~',source)) as match_pk 17 | ,game_id 18 | ,map 19 | ,avg_elo 20 | ,game_duration_secs 21 | ,actual_duration_secs 22 | ,game_started_timestamp 23 | ,game_date 24 | ,team_0_elo 25 | ,team_1_elo 26 | ,leaderboard 27 | ,mirror 28 | ,patch 29 | ,rsrc 30 | ,CURRENT_DATE() AS load_date 31 | ,source 32 | ,file_date 33 | FROM 34 | deduplicated 35 | ) 36 | 37 | SELECT * FROM cte 38 | {% if is_incremental() %} 39 | where file_date > (select max(file_date) from {{ this }}) 40 | or match_pk is NULL 41 | {% endif %} 42 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/matches_sr.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='game_id', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | with cte as ( 11 | SELECT 12 | game_id 13 | ,map 14 | ,game_type 15 | ,game_speed 16 | ,starting_age 17 | ,num_players 18 | ,avg_elo 19 | ,SUBSTR(duration::TEXT, 0, 4)::int as game_duration_secs 20 | ,SUBSTR(irl_duration::TEXT, 0, 4)::int as actual_duration_secs 21 | ,started_timestamp as game_started_timestamp 22 | ,started_timestamp::date as game_date 23 | ,team_0_elo::int as team_0_elo 24 | ,team_1_elo::int as team_1_elo 25 | ,leaderboard 26 | ,mirror 27 | ,patch 28 | ,rsrc 29 | ,ldts 30 | ,source 31 | ,file_date 32 | FROM 33 | {{ ref('matches_br') }} 34 | WHERE 35 | leaderboard = 'random_map' --filter for 1v1 RM's in this analysis 36 | ) 37 | 38 | SELECT * FROM cte 39 | {% if is_incremental() %} 40 | where file_date > (select max(file_date) from {{ this }}) 41 | {% endif %} 42 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/dim_date_br.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import datetime 3 | 4 | START_YEAR = 2020 5 | END_YEAR = 2029 6 | 7 | 8 | def model(dbt, session): 9 | """Generates a date dimension table""" 10 | 11 | # Define the start and end dates for the dimension table 12 | start_date = datetime(START_YEAR, 1, 1) 13 | end_date = datetime(END_YEAR, 12, 31) 14 | 15 | # Generate a date range 16 | date_range = pd.date_range(start=start_date, end=end_date, freq="D") 17 | 18 | dim_date_df = pd.DataFrame( 19 | { 20 | "date": date_range, 21 | "year": date_range.year, 22 | "month": date_range.month, 23 | "day": date_range.day, 24 | "day_of_week": date_range.dayofweek, # 0=Monday, 6=Sunday 25 | } 26 | ) 27 | 28 | dim_date_df["is_weekend"] = dim_date_df["day_of_week"].isin([5, 6]) 29 | 30 | # Convert all column names to uppercase, to make case insensitve in Snowflake 31 | dim_date_df = dim_date_df.rename(columns=str.upper) 32 | 33 | return dim_date_df 34 | -------------------------------------------------------------------------------- /dags/elt_metadata_dag.py: -------------------------------------------------------------------------------- 1 | from 
datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_metadata import main as run_elt_metadata 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_metadata_dag_id", 25 | default_args=default_args, 26 | description="ETL process for metadata", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "metadata"], 31 | ) as dag: 32 | run_elt_metadata_task = PythonOperator( 33 | task_id="run_elt_metadata", 34 | python_callable=run_elt_metadata, 35 | ) 36 | 37 | run_elt_metadata_task 38 | -------------------------------------------------------------------------------- /dags/elt_relic_api_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_relic_api import main as run_elt_relic_api 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_relic_api_id", 25 | default_args=default_args, 26 | description="ETL process for relic_api", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "relic"], 31 | ) as dag: 32 | run_elt_relic_api_task = PythonOperator( 33 | task_id="run_elt_relic_api", 34 | python_callable=run_elt_relic_api, 35 | ) 36 | 37 | run_elt_relic_api_task 38 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/matches_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | * 8 | FROM 9 | {{ ref('v_matches_raw') }} 10 | ) 11 | SELECT 12 | value: avg_elo::DEC(38,2) AS avg_elo 13 | ,value:duration::INT AS duration 14 | ,value:game_id::INT AS game_id 15 | ,value:game_speed::VARCHAR AS game_speed 16 | ,value:game_type::VARCHAR AS game_type 17 | ,value:irl_duration::INT AS irl_duration 18 | ,value:leaderboard::VARCHAR AS leaderboard 19 | ,value:map::VARCHAR AS map 20 | ,value:mirror::BOOLEAN AS mirror 21 | ,value:num_players::INT AS num_players 22 | ,value:patch::VARCHAR AS patch 23 | ,value:raw_match_type::VARCHAR AS raw_match_type 24 | ,value:replay_enhanced::BOOLEAN AS replay_enhanced 25 | ,value:started_timestamp::VARCHAR::TIMESTAMP_NTZ AS started_timestamp 26 | ,value:starting_age::VARCHAR AS starting_age 27 | ,value:team_0_elo::DEC(38,2) AS team_0_elo 28 | ,value:team_1_elo::DEC(38,2) AS team_1_elo 29 | ,rsrc::VARCHAR AS rsrc 30 | 
,ldts::TIMESTAMP_NTZ(9) AS ldts 31 | ,source 32 | ,file_date 33 | FROM 34 | landing_data -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/fact_player_matches.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='fact_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | WITH cte AS ( 11 | SELECT 12 | pm.id as fact_pk 13 | ,dm.match_pk as match_fk 14 | ,dp.player_pk as player_fk 15 | ,dc.civ_pk as civ_fk 16 | ,dd.date_pk as date_fk 17 | ,pm.team 18 | ,pm.winner 19 | ,pm.match_rating_diff 20 | ,pm.new_rating 21 | ,pm.old_rating 22 | ,pm.source 23 | ,pm.file_date 24 | ,CURRENT_DATE() as load_date 25 | FROM 26 | {{ ref('player_match_sr') }} as pm 27 | INNER JOIN 28 | {{ ref('dim_civ') }} as dc 29 | ON pm.civ = dc.civ_name 30 | INNER JOIN 31 | {{ ref('dim_match') }} as dm 32 | ON pm.game_id = dm.game_id 33 | INNER JOIN 34 | {{ ref('dim_player') }} as dp 35 | ON pm.profile_id = dp.profile_id 36 | INNER JOIN 37 | {{ ref('dim_date') }} as dd 38 | ON dm.game_date = dd.date 39 | ) 40 | 41 | SELECT * FROM cte 42 | {% if is_incremental() %} 43 | where file_date > (select max(file_date) from {{ this }}) 44 | or fact_pk IS NULL 45 | {% endif %} 46 | -------------------------------------------------------------------------------- /dags/set_load_master_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.set_load_master import main as run_set_load_master 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "set_load_master_id", 25 | default_args=default_args, 26 | description="ETL process for load_master", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "master"], 31 | ) as dag: 32 | run_set_load_master_task = PythonOperator( 33 | task_id="run_set_load_master", 34 | python_callable=run_set_load_master, 35 | ) 36 | 37 | run_set_load_master_task 38 | -------------------------------------------------------------------------------- /dags/elt_stat_matches_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_stat_matches import main as run_elt_stat_matches 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_stat_matches_id", 25 | 
default_args=default_args, 26 | description="ETL process for stat_matches", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "stats"], 31 | ) as dag: 32 | run_elt_stat_matches_task = PythonOperator( 33 | task_id="run_elt_stat_matches", 34 | python_callable=run_elt_stat_matches, 35 | ) 36 | 37 | run_elt_stat_matches_task 38 | -------------------------------------------------------------------------------- /dags/elt_stat_players_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_stat_players import main as run_elt_stat_players 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_stat_players_id", 25 | default_args=default_args, 26 | description="ETL process for stat_players", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "stats"], 31 | ) as dag: 32 | run_elt_stat_players_task = PythonOperator( 33 | task_id="run_elt_stat_players", 34 | python_callable=run_elt_stat_players, 35 | ) 36 | 37 | run_elt_stat_players_task 38 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/statgroup_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | VALUE AS json_col 8 | ,rsrc::VARCHAR as rsrc 9 | ,ldts::TIMESTAMP_NTZ(9) as ldts 10 | ,source 11 | FROM 12 | {{ ref('v_relic_raw') }} 13 | ) 14 | ,members_stats AS ( 15 | SELECT DISTINCT 16 | value:id::int as id 17 | ,value:name::VARCHAR as group_name 18 | ,value:type::int as type 19 | ,value:members::array as members 20 | ,rsrc 21 | ,ldts 22 | ,source 23 | FROM 24 | landing_data 25 | ,LATERAL FLATTEN(INPUT => json_col:statGroups) 26 | ) 27 | ,flattened AS ( 28 | SELECT 29 | id 30 | ,group_name 31 | ,type 32 | ,value:alias::VARCHAR as alias 33 | ,value:country::VARCHAR as country 34 | ,value:leaderboardregion_id::int as leaderboardregion_id 35 | ,value:level::int as level 36 | ,value:name::VARCHAR as name 37 | ,value:personal_statgroup_id::int as personal_statgroup_id 38 | ,value:profile_id::int as profile_id 39 | ,value:xp::int as xp 40 | ,rsrc 41 | ,ldts 42 | ,source 43 | FROM 44 | members_stats 45 | ,LATERAL FLATTEN(INPUT => members) 46 | ) 47 | ,deduplicated AS ( 48 | {{ deduplicate_by_key('flattened', 'profile_id', 'ldts') }} 49 | ) 50 | SELECT 51 | * 52 | FROM 53 | deduplicated -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/gold_schema.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: dim_civ 6 | description: "Dimension table contain the attributes for civs" 7 | columns: 8 | - name: civ_pk 9 | description: "A primary key for this table" 10 | data_tests: 11 | - 
unique 12 | - not_null 13 | 14 | - name: dim_date 15 | description: "Dimension table containing the attributes for dates" 16 | columns: 17 | - name: date_pk 18 | description: "A primary key for this table" 19 | data_tests: 20 | - unique 21 | - not_null 22 | 23 | - name: dim_match 24 | description: "Dimension table containing the attributes for matches" 25 | columns: 26 | - name: match_pk 27 | description: "A primary key for this table" 28 | data_tests: 29 | - unique 30 | - not_null 31 | 32 | - name: dim_player 33 | description: "Dimension table containing the attributes for players" 34 | columns: 35 | - name: player_pk 36 | description: "A primary key for this table" 37 | data_tests: 38 | - unique 39 | - not_null 40 | 41 | - name: fact_player_matches 42 | description: "Fact table containing the joining keys and measures" 43 | columns: 44 | - name: fact_pk 45 | description: "The primary key for this table" 46 | data_tests: 47 | - unique 48 | - not_null -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/leaderboards_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | VALUE AS json_col 8 | ,rsrc::VARCHAR as rsrc 9 | ,ldts::TIMESTAMP_NTZ(9) as ldts 10 | ,source 11 | FROM 12 | {{ ref('v_relic_raw') }} 13 | ) 14 | ,flattened AS ( 15 | SELECT DISTINCT 16 | value:disputes::int as disputes 17 | ,value:drops::int as drops 18 | ,value:highestrank::int as highestrank 19 | ,value:highestranklevel::int as highestranklevel 20 | ,value:highestrating::int as highestrating 21 | ,value:lastmatchdate::int as lastmatchdate 22 | ,value:leaderboard_id::int as leaderboard_id 23 | ,value:losses::int as losses 24 | ,value:rank::int as rank 25 | ,value:ranklevel::int as ranklevel 26 | ,value:ranktotal::int as ranktotal 27 | ,value:rating::int as rating 28 | ,value:regionrank::int as regionrank 29 | ,value:regionranktotal::int as regionranktotal 30 | ,value:statgroup_id::int as statgroup_id 31 | ,value:streak::int as streak 32 | ,value:wins::int as wins 33 | ,rsrc 34 | ,ldts 35 | ,source 36 | FROM 37 | landing_data 38 | ,LATERAL FLATTEN(INPUT => json_col:leaderboardStats) 39 | ) 40 | ,deduplicated AS ( 41 | {{ deduplicate_by_key('flattened', 'statgroup_id', 'ldts') }} 42 | ) 43 | SELECT 44 | * 45 | FROM 46 | deduplicated -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/player_leaderboard_stats_sr.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | stat_group_data AS ( 6 | SELECT 7 | profile_id 8 | ,alias as gaming_name 9 | ,personal_statgroup_id 10 | ,country 11 | FROM 12 | {{ ref('statgroup_br') }} 13 | ) 14 | ,country_codes AS ( 15 | SELECT 16 | Country AS country_name 17 | ,Country_code 18 | FROM 19 | {{ ref('country_list') }} 20 | ) 21 | ,leaderboard_data AS ( 22 | SELECT 23 | statgroup_id 24 | ,wins 25 | ,losses 26 | ,rank as current_rank 27 | ,rating as current_rating 28 | ,TO_TIMESTAMP(lastmatchdate) as last_match_date 29 | ,ldts 30 | ,rsrc 31 | ,source 32 | FROM 33 | {{ ref('leaderboards_br') }} 34 | ) 35 | SELECT 36 | sgd.profile_id 37 | ,sgd.gaming_name 38 | ,sgd.country AS country_code 39 | ,COALESCE(cc.country_name, 'Unknown') AS country_name 40 | ,ld.statgroup_id 41 | ,ld.wins 42 | ,ld.losses 43 | ,ld.current_rank 44 | ,ld.current_rating 45 | 
,ld.last_match_date 46 | ,ld.ldts 47 | ,ld.rsrc 48 | ,ld.source 49 | FROM 50 | leaderboard_data as ld 51 | INNER JOIN 52 | stat_group_data as sgd 53 | ON ld.statgroup_id = sgd.personal_statgroup_id 54 | LEFT JOIN 55 | country_codes AS cc 56 | ON sgd.country = cc.country_code 57 | -------------------------------------------------------------------------------- /src/set_load_master.py: -------------------------------------------------------------------------------- 1 | from utils import Config, sf_connect 2 | import logging 3 | import os 4 | from pathlib import Path 5 | 6 | # Configure logging 7 | logging.basicConfig( 8 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 9 | ) 10 | logger = logging.getLogger(__name__) 11 | 12 | # Get the directory of the current script 13 | script_dir = Path(__file__).resolve().parent 14 | 15 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 16 | 17 | # Set up script to update only 'dev' file date ranges. 18 | # 'prod' runs will be bring in all data by default, 19 | # unless control table manually overridden or updated here (ENV = "prod"). 20 | ENV = "dev" 21 | 22 | 23 | def update_load_master(connection, env: str, from_date: str, to_date: str): 24 | query = f""" 25 | UPDATE load_master 26 | SET 27 | load_start_date = '{from_date}' 28 | ,load_end_date = '{to_date}' 29 | WHERE 30 | environment = '{env}' 31 | ; 32 | """ 33 | cursor = connection.cursor() 34 | cursor.execute(query) 35 | result = cursor.fetchone()[0] 36 | cursor.close() 37 | return result 38 | 39 | 40 | def main(*args, **kwargs): 41 | config = Config(YAML_CONFIG) 42 | run_date_from = config.run_date 43 | run_date_to = config.run_end_date 44 | 45 | connection = sf_connect(db="aoe", schema="control") 46 | update_load_master(connection, ENV, run_date_from, run_date_to) 47 | connection.close() 48 | logger.info("Script complete.") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /src/elt_relic_api.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, create_s3_session 2 | from extract import RelicResponse, validate_json_schema, fetch_relic_chunk 3 | from load import load_json_data 4 | import logging 5 | import os 6 | from pathlib import Path 7 | 8 | # Configure logging 9 | logging.basicConfig( 10 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 11 | ) 12 | logger = logging.getLogger(__name__) 13 | 14 | # Get the directory of the current script 15 | script_dir = Path(__file__).resolve().parent 16 | 17 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 18 | 19 | 20 | @timer 21 | def main(*args, **kwargs): 22 | config = Config(YAML_CONFIG) 23 | 24 | # Setup: 25 | s3 = create_s3_session() 26 | _base_url = config.relic_base_url 27 | _endpoint = config.relic_endpoint 28 | _params = config.relic_params 29 | _validation_schema = RelicResponse 30 | _fn = config.relic_fn_suffix 31 | _file_dir = config.relic_folder_name 32 | _s3_bucket = config.bucket 33 | 34 | # Extract phase 35 | logger.info("Starting data extraction.") 36 | content_chunk = fetch_relic_chunk(_base_url, _endpoint, _params) 37 | 38 | for i, json_data in enumerate(content_chunk): 39 | fn = f"{_fn}_{i+1}" 40 | 41 | # Validate phase 42 | validated_data = validate_json_schema(json_data, _validation_schema) 43 | 44 | # Load phase 45 | load_json_data(validated_data, _file_dir, fn, _s3_bucket, s3) 46 | logger.info("Script complete.") 47 | 48 
| 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/bronze_schema.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: leaderboards_br 6 | description: "Schema of flattened leaderboard data from Relic API" 7 | columns: 8 | - name: statgroup_id 9 | description: "The primary key for this table" 10 | data_tests: 11 | - unique 12 | - not_null 13 | - name: ldts 14 | data_tests: 15 | - recent_ldts: 16 | 'back_date_days': 7 17 | 18 | - name: statgroup_br 19 | description: "Schema of flattened player stats data from Relic API" 20 | columns: 21 | - name: profile_id 22 | description: "The primary key for this table" 23 | data_tests: 24 | - unique 25 | - not_null 26 | - name: personal_statgroup_id 27 | description: "Joining key to leaderboards table" 28 | - name: ldts 29 | data_tests: 30 | - recent_ldts: 31 | 'back_date_days': 7 32 | 33 | - name: matches_br 34 | description: "Individual match data from Aoestats.io" 35 | columns: 36 | - name: game_id 37 | description: "The primary key for this table" 38 | data_tests: 39 | - unique 40 | - not_null 41 | 42 | - name: ldts 43 | data_tests: 44 | - recent_ldts: 45 | 'back_date_days': 7 46 | 47 | - name: players_br 48 | description: "Individual player data from Aoestats.io" 49 | columns: 50 | - name: game_id 51 | description: "Joining key for the table when combined with profile_id" 52 | - name: profile_id 53 | description: "Joining key for the table when combined with game_id" 54 | - name: ldts 55 | data_tests: 56 | - recent_ldts: 57 | 'back_date_days': 7 58 | -------------------------------------------------------------------------------- /src/elt_metadata.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, fetch_api_json, create_s3_session 2 | from extract import validate_json_schema, ApiSchema 3 | from load import load_json_data 4 | import logging 5 | import os 6 | from pathlib import Path 7 | 8 | # Configure logging 9 | logging.basicConfig( 10 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 11 | ) 12 | logger = logging.getLogger(__name__) 13 | 14 | # Get the directory of the current script 15 | script_dir = Path(__file__).resolve().parent 16 | 17 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 18 | 19 | 20 | @timer 21 | def main(*args, **kwargs): 22 | config = Config(YAML_CONFIG) 23 | 24 | # Setup: 25 | s3 = create_s3_session() 26 | _base_url = config.stats_base_url 27 | _endpoint = config.metadata_endpoint 28 | _params = None 29 | _validation_schema = ApiSchema 30 | _fn = f"{config.run_end_date}_{config.metadata_fn_suffix}" 31 | _file_dir = config.metadata_folder_name 32 | _s3_bucket = config.bucket 33 | 34 | try: 35 | # Extract phase 36 | logger.info("Starting data extraction.") 37 | json_data = fetch_api_json(_base_url, _endpoint, _params) 38 | 39 | if json_data is None: 40 | logger.error( 41 | f"Failed to fetch data after 3 attempts for endpoint: {_endpoint}" 42 | ) 43 | raise 44 | 45 | # Validate phase 46 | logger.info("Validating data.") 47 | validated_data = validate_json_schema(json_data, _validation_schema) 48 | 49 | # Load phase 50 | logger.info("Starting data loading.") 51 | load_json_data(validated_data, _file_dir, _fn, _s3_bucket, s3) 52 | 53 | logger.info("ELT process completed successfully.") 54 | except Exception as e: 55 | logger.error(f"ELT 
process failed: {e}") 56 | raise 57 | logger.info("Script complete.") 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/silver_schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: matches_sr 5 | description: "Cleaned matches table filtered for 1v1RM matches only" 6 | columns: 7 | - name: game_duration_secs 8 | data_tests: 9 | - within_threshold: 10 | 'lower_threshold': 0 11 | 'upper_threshold': 10000 12 | 'error_lim': 10 13 | - name: actual_duration_secs 14 | data_tests: 15 | - within_threshold: 16 | 'lower_threshold': 0 17 | 'upper_threshold': 10000 18 | 'error_lim': 10 19 | - name: game_started_timestamp 20 | data_tests: 21 | - within_threshold: 22 | 'lower_threshold': "'2022-01-01'" 23 | 'upper_threshold': CURRENT_DATE()+1 24 | 'error_lim': 1 25 | - name: game_date 26 | data_tests: 27 | - within_threshold: 28 | 'lower_threshold': "'2022-01-01'" 29 | 'upper_threshold': CURRENT_DATE()+1 30 | 'error_lim': 1 31 | - name: ldts 32 | data_tests: 33 | - recent_ldts: 34 | 'back_date_days': 7 35 | 36 | - name: player_leaderboard_stats_sr 37 | description: "Schema of flattened player stats data from Relic API" 38 | columns: 39 | - name: last_match_date 40 | description: "The last time the player has played" 41 | data_tests: 42 | - within_threshold: 43 | 'lower_threshold': "'2022-01-01'" 44 | 'upper_threshold': CURRENT_DATE()+1 45 | 'error_lim': 1 46 | - name: ldts 47 | data_tests: 48 | - recent_ldts: 49 | 'back_date_days': 7 50 | 51 | - name: player_match_sr 52 | description: "pre-fact table containing mainly business keys and measures" 53 | columns: 54 | - name: id 55 | description: "The primary key for this table" 56 | data_tests: 57 | - unique 58 | - not_null -------------------------------------------------------------------------------- /dbdiagram.txt: -------------------------------------------------------------------------------- 1 | Table "DIM_CIV" { 2 | "CIV_PK" VARCHAR(32) 3 | "CIV_NAME" VARCHAR(16777216) 4 | "CIV_WEAKNESSES" VARCHAR(16777216) 5 | "CIV_STRENGTHS" VARCHAR(16777216) 6 | "LOAD_DATE" DATE 7 | } 8 | 9 | Table "DIM_DATE" { 10 | "DATE_PK" NUMBER(38,0) 11 | "DATE" DATE 12 | "YEAR" NUMBER(38,0) 13 | "MONTH" NUMBER(38,0) 14 | "DAY" NUMBER(38,0) 15 | "DAY_OF_WEEK" NUMBER(38,0) 16 | "IS_WEEKEND" BOOLEAN 17 | } 18 | 19 | Table "DIM_MATCH" { 20 | "MATCH_PK" VARCHAR(32) 21 | "GAME_ID" NUMBER(38,0) 22 | "MAP" VARCHAR(16777216) 23 | "AVG_ELO" NUMBER(38,2) 24 | "GAME_DURATION_SECS" NUMBER(38,0) 25 | "ACTUAL_DURATION_SECS" NUMBER(38,0) 26 | "GAME_STARTED_TIMESTAMP" TIMESTAMP_NTZ(6) 27 | "GAME_DATE" DATE 28 | "TEAM_0_ELO" NUMBER(38,0) 29 | "TEAM_1_ELO" NUMBER(38,0) 30 | "LEADERBOARD" VARCHAR(16777216) 31 | "MIRROR" BOOLEAN 32 | "PATCH" VARCHAR(16777216) 33 | "RSRC" VARCHAR(16777216) 34 | "LOAD_DATE" DATE 35 | "SOURCE" VARCHAR(16777216) 36 | "FILE_DATE" DATE 37 | } 38 | 39 | Table "DIM_PLAYER" { 40 | "PLAYER_PK" VARCHAR(32) 41 | "PROFILE_ID" NUMBER(38,0) 42 | "GAMING_NAME" VARCHAR(16777216) 43 | "COUNTRY_CODE" VARCHAR(16777216) 44 | "COUNTRY_NAME" VARCHAR(16777216) 45 | "STATGROUP_ID" NUMBER(38,0) 46 | "WINS" NUMBER(38,0) 47 | "LOSSES" NUMBER(38,0) 48 | "CURRENT_RANK" NUMBER(38,0) 49 | "CURRENT_RATING" NUMBER(38,0) 50 | "LAST_MATCH_DATE" TIMESTAMP_NTZ(0) 51 | "LOAD_DATE" DATE 52 | "SOURCE" VARCHAR(16777216) 53 | } 54 | 55 | Table "FACT_PLAYER_MATCHES" { 56 | "FACT_PK" VARCHAR(32) 57 | 
"MATCH_FK" VARCHAR(32) 58 | "PLAYER_FK" VARCHAR(32) 59 | "CIV_FK" VARCHAR(32) 60 | "DATE_FK" NUMBER(38,0) 61 | "TEAM" VARCHAR(16777216) 62 | "WINNER" BOOLEAN 63 | "MATCH_RATING_DIFF" NUMBER(38,0) 64 | "NEW_RATING" NUMBER(38,0) 65 | "OLD_RATING" NUMBER(38,0) 66 | "SOURCE" VARCHAR(16777216) 67 | "FILE_DATE" DATE 68 | "LOAD_DATE" DATE 69 | } 70 | 71 | 72 | Ref: FACT_PLAYER_MATCHES.MATCH_FK < DIM_MATCH.MATCH_PK 73 | Ref: FACT_PLAYER_MATCHES.PLAYER_FK < DIM_PLAYER.PLAYER_PK 74 | Ref: FACT_PLAYER_MATCHES.CIV_FK < DIM_CIV.CIV_PK 75 | Ref: FACT_PLAYER_MATCHES.DATE_FK < DIM_DATE.DATE_PK 76 | -------------------------------------------------------------------------------- /src/elt_stat_matches.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, fetch_api_file, create_s3_session 2 | from extract import ( 3 | Matches, 4 | generate_weekly_queries, 5 | create_stats_endpoints, 6 | validate_parquet_schema, 7 | ) 8 | from load import load_parquet_data 9 | import logging 10 | import os 11 | from pathlib import Path 12 | 13 | # Configure logging 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | # Get the directory of the current script 20 | script_dir = Path(__file__).resolve().parent 21 | 22 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 23 | 24 | 25 | @timer 26 | def main(*args, **kwargs): 27 | config = Config(YAML_CONFIG) 28 | 29 | # Setup: 30 | s3 = create_s3_session() 31 | _fn = config.matches_fn_suffix 32 | _file_dir = config.matches_folder_name 33 | _validation_schema = Matches 34 | 35 | _start_date = config.run_date 36 | _end_date = config.run_end_date 37 | _base_url = config.stats_base_url + config.stats_dir_url 38 | _params = None 39 | _s3_bucket = config.bucket 40 | 41 | # Pre-extract phase 42 | weekly_querys = generate_weekly_queries(_start_date, _end_date) 43 | endpoints = create_stats_endpoints(_fn, weekly_querys) 44 | 45 | for i, endpoint in enumerate(endpoints): 46 | endpoint_url = endpoint["endpoint_str"] 47 | dated_filename = endpoint["file_date"] 48 | file_dir = f"{_file_dir}/{dated_filename.split('_')[0]}" 49 | 50 | # Extract phase 51 | content = fetch_api_file(_base_url, endpoint_url, _params) 52 | 53 | if content is None: 54 | logger.error( 55 | f"Failed to fetch data after 3 attempts for endpoint: {endpoint_url}" 56 | ) 57 | continue # Skip to the next iteration of the loop 58 | 59 | # Validate phase 60 | validated_data = validate_parquet_schema(content, _validation_schema) 61 | 62 | # Load phase 63 | load_parquet_data(validated_data, file_dir, dated_filename, _s3_bucket, s3) 64 | logger.info(f"{i+1}/{len(endpoints)} loaded.") 65 | logger.info("Script complete.") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /src/elt_stat_players.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, fetch_api_file, create_s3_session 2 | from extract import ( 3 | Players, 4 | generate_weekly_queries, 5 | create_stats_endpoints, 6 | validate_parquet_schema, 7 | ) 8 | from load import load_parquet_data 9 | import logging 10 | import os 11 | from pathlib import Path 12 | 13 | # Configure logging 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | # Get 
the directory of the current script 20 | script_dir = Path(__file__).resolve().parent 21 | 22 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 23 | 24 | 25 | @timer 26 | def main(*args, **kwargs): 27 | config = Config(YAML_CONFIG) 28 | 29 | # Setup: 30 | s3 = create_s3_session() 31 | _fn = config.players_fn_suffix 32 | _file_dir = config.players_folder_name 33 | _validation_schema = Players 34 | 35 | _start_date = config.run_date 36 | _end_date = config.run_end_date 37 | _base_url = config.stats_base_url + config.stats_dir_url 38 | _params = None 39 | _s3_bucket = config.bucket 40 | 41 | # Pre-extract phase 42 | weekly_querys = generate_weekly_queries(_start_date, _end_date) 43 | endpoints = create_stats_endpoints(_fn, weekly_querys) 44 | 45 | for i, endpoint in enumerate(endpoints): 46 | endpoint_url = endpoint["endpoint_str"] 47 | dated_filename = endpoint["file_date"] 48 | file_dir = f"{_file_dir}/{dated_filename.split('_')[0]}" 49 | 50 | # Extract phase 51 | content = fetch_api_file(_base_url, endpoint_url, _params) 52 | 53 | if content is None: 54 | logger.error( 55 | f"Failed to fetch data after 3 attempts for endpoint: {endpoint_url}" 56 | ) 57 | continue # Skip to the next iteration of the loop 58 | 59 | # Validate phase 60 | validated_data = validate_parquet_schema(content, _validation_schema) 61 | 62 | # Load phase 63 | load_parquet_data(validated_data, file_dir, dated_filename, _s3_bucket, s3) 64 | logger.info(f"{i+1}/{len(endpoints)} loaded.") 65 | logger.info("Script complete.") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /src/project_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from datetime import date 3 | import os 4 | import sys 5 | import pandas as pd 6 | import io 7 | from pydantic import BaseModel 8 | 9 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 10 | sys.path.append(os.path.join(project_root, "src")) 11 | 12 | from src.extract.filter import * 13 | from src.load.loader import * 14 | 15 | 16 | @pytest.fixture 17 | def sample_dates(): 18 | return {"start_date": date(2023, 1, 1), "end_date": date(2023, 1, 14)} 19 | 20 | 21 | # Test Data and Fixtures 22 | @pytest.fixture 23 | def sample_validation_schema(): 24 | class TestSchema(BaseModel): 25 | id: int 26 | name: str 27 | 28 | return TestSchema 29 | 30 | 31 | # Tests for validate_parquet_schema 32 | def test_validate_parquet_schema(sample_validation_schema): 33 | # Create a sample DataFrame 34 | df = pd.DataFrame([{"id": 1, "name": "test1"}, {"id": 2, "name": "test2"}]) 35 | 36 | # Create a parquet file in memory 37 | parquet_buffer = io.BytesIO() 38 | df.to_parquet(parquet_buffer) 39 | parquet_buffer.seek(0) 40 | 41 | result = validate_parquet_schema(parquet_buffer, sample_validation_schema) 42 | assert result is not None 43 | 44 | 45 | # Tests for validate_json_schema 46 | def test_validate_json_schema_valid(sample_validation_schema): 47 | valid_data = {"id": 1, "name": "test"} 48 | result = validate_json_schema(valid_data, sample_validation_schema) 49 | assert result.id == 1 50 | assert result.name == "test" 51 | 52 | 53 | def test_validate_json_schema_invalid(sample_validation_schema): 54 | invalid_data = {"id": "not_an_integer", "name": "test"} 55 | result = validate_json_schema(invalid_data, sample_validation_schema) 56 | assert result == [] 57 | 58 | 59 | def test_generate_weekly_queries(sample_dates): 60 | 
queries = generate_weekly_queries( 61 | sample_dates["start_date"], sample_dates["end_date"] 62 | ) 63 | assert len(queries) == 2 # Should generate 2 weeks worth of queries 64 | assert all("dated" in q and "query_str" in q for q in queries) 65 | assert queries[0]["query_str"].startswith("2023-01-01") 66 | 67 | 68 | def test_create_stats_endpoints(): 69 | weekly_queries = [{"dated": date(2023, 1, 1), "query_str": "2023-01-01_2023-01-07"}] 70 | extract_file = "stats" 71 | endpoints = create_stats_endpoints(extract_file, weekly_queries) 72 | assert len(endpoints) == 1 73 | assert endpoints[0]["endpoint_str"] == "2023-01-01_2023-01-07/stats.parquet" 74 | assert endpoints[0]["file_date"] == "2023-01-01_stats" 75 | -------------------------------------------------------------------------------- /dags/all_project_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python import PythonOperator 3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 4 | 5 | from datetime import datetime 6 | 7 | default_args = {"start_date": datetime(2021, 1, 1)} 8 | 9 | 10 | def _dag_start(): 11 | print("Starting Airflow Pipeline to run all DAGS") 12 | 13 | 14 | def _dag_end(): 15 | print("Finished running Airflow Pipeline for all DAGS") 16 | 17 | 18 | with DAG( 19 | "trigger_dag", schedule_interval="@daily", default_args=default_args, catchup=False 20 | ) as dag: 21 | start_trigger = PythonOperator(task_id="run_all_start", python_callable=_dag_start) 22 | 23 | end_trigger = PythonOperator(task_id="run_all_end", python_callable=_dag_end) 24 | 25 | trigger_set_load_master = TriggerDagRunOperator( 26 | task_id="trigger_load_master", 27 | trigger_dag_id="set_load_master_id", 28 | wait_for_completion=True, 29 | poke_interval=30, 30 | ) 31 | 32 | trigger_elt_metadata = TriggerDagRunOperator( 33 | task_id="trigger_metadata", 34 | trigger_dag_id="elt_metadata_dag_id", 35 | wait_for_completion=True, 36 | poke_interval=30, 37 | ) 38 | 39 | trigger_elt_stat_matches = TriggerDagRunOperator( 40 | task_id="trigger_stat_matches", 41 | trigger_dag_id="elt_stat_matches_id", 42 | wait_for_completion=True, 43 | poke_interval=30, 44 | ) 45 | 46 | trigger_elt_stat_players = TriggerDagRunOperator( 47 | task_id="trigger_stat_players", 48 | trigger_dag_id="elt_stat_players_id", 49 | wait_for_completion=True, 50 | poke_interval=30, 51 | ) 52 | 53 | trigger_elt_relic_api = TriggerDagRunOperator( 54 | task_id="trigger_relic_api", 55 | trigger_dag_id="elt_relic_api_id", 56 | wait_for_completion=True, 57 | poke_interval=30, 58 | ) 59 | 60 | trigger_dbt_dag = TriggerDagRunOperator( 61 | task_id="trigger_dbt_dag", 62 | trigger_dag_id="dbt_dag_id", 63 | wait_for_completion=True, 64 | poke_interval=60, 65 | ) 66 | 67 | # Run order: Check for metadata, then load stats datasets. Can run relic_api simultaneously. 68 | start_trigger >> trigger_elt_metadata 69 | start_trigger >> trigger_set_load_master 70 | start_trigger >> trigger_elt_relic_api 71 | trigger_elt_metadata >> trigger_elt_stat_matches 72 | trigger_elt_metadata >> trigger_elt_stat_players 73 | 74 | # Only trigger dbt_dag if all 3 datasets + load_master are finished. 75 | trigger_set_load_master >> trigger_dbt_dag 76 | trigger_elt_stat_matches >> trigger_dbt_dag 77 | trigger_elt_stat_players >> trigger_dbt_dag 78 | trigger_elt_relic_api >> trigger_dbt_dag 79 | 80 | # Finally close dag run after dbt_dag has finished. 
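    # (Each TriggerDagRunOperator above sets wait_for_completion=True, so
    # end_trigger only runs once the triggered dbt_dag run, and therefore every
    # upstream dataset load it depends on, has actually completed.)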
81 | trigger_dbt_dag >> end_trigger 82 | -------------------------------------------------------------------------------- /src/extract/filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from datetime import timedelta 4 | from pydantic import ValidationError 5 | from utils import fetch_api_json 6 | 7 | import pandas as pd 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def fetch_relic_chunk(base_url: str, endpoint: str, params: dict): 13 | """Fetches all data from Relic API in chunks of 100/request (API limit)""" 14 | start = 1 15 | chunk = params["chunk_size"] 16 | logger.info(f"Processing data in chunks of {chunk} from {endpoint}") 17 | while True: 18 | params["start"] = start 19 | response = fetch_api_json(base_url, endpoint, params) 20 | 21 | if not response: 22 | break 23 | api_end = response["rankTotal"] + chunk 24 | if start > api_end: 25 | break 26 | logger.info(f"Processing chunk {start}/{api_end}") 27 | 28 | yield response 29 | start += chunk 30 | time.sleep(0.2) 31 | 32 | 33 | def validate_json_schema(json_data, validation_schema): 34 | try: 35 | data = json_data 36 | validated_data = validation_schema.model_validate(data) 37 | return validated_data 38 | except ValidationError as e: 39 | logger.error(f"Validation Error: {e}") 40 | return [] 41 | 42 | 43 | def validate_parquet_schema(content, validation_schema): 44 | df = pd.read_parquet(content) 45 | records = df.to_dict(orient="records") 46 | for record in records: 47 | try: 48 | validation_schema.model_validate(record) 49 | except ValidationError as e: 50 | logger.error(f"Validation error: {e}") 51 | 52 | # Reset the pointer to start of file: 53 | content.seek(0) 54 | return content 55 | 56 | 57 | def generate_weekly_queries(start_date, end_date): 58 | # Move start date to the next Sunday if it's not already a Sunday 59 | while start_date.weekday() != 6: 60 | start_date += timedelta(days=1) 61 | 62 | # Move end date to the previous Saturday if it's not already a Saturday 63 | while end_date.weekday() != 5: 64 | end_date -= timedelta(days=1) 65 | 66 | queries = [] 67 | print(f"Finding all files between {start_date} and {end_date}.") 68 | while start_date <= end_date: 69 | # Calculate the end date of the current week (the next Saturday) 70 | end_date_saturday = start_date + timedelta(days=6) 71 | query = { 72 | "dated": start_date, 73 | "query_str": f"{start_date.strftime('%Y-%m-%d')}_{end_date_saturday.strftime('%Y-%m-%d')}", 74 | } 75 | queries.append(query) 76 | # Move to the next week 77 | start_date += timedelta(days=7) 78 | return queries 79 | 80 | 81 | def create_stats_endpoints(extract_file: str, weekly_querys: list): 82 | endpoints = [] 83 | for weekly_query in weekly_querys: 84 | result_dated = f"{weekly_query['dated']}_{extract_file}" 85 | result_query = f"{weekly_query['query_str']}/{extract_file}.parquet" 86 | endpoints.append({"file_date": result_dated, "endpoint_str": result_query}) 87 | logger.info(f"{len(endpoints)} found.") 88 | return endpoints 89 | -------------------------------------------------------------------------------- /src/extract/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, model_validator 2 | from typing import List, Optional 3 | from datetime import date, timedelta, datetime 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | # ----------------------------------------------------------------------------- 9 
| # Aoedumps schema 10 | # ----------------------------------------------------------------------------- 11 | 12 | 13 | class WeeklyDump(BaseModel): 14 | start_date: date 15 | end_date: Optional[date] = None 16 | num_matches: int 17 | num_players: Optional[int] = None 18 | matches_url: str 19 | players_url: str 20 | match_checksum: Optional[str] = None 21 | player_checksum: Optional[str] = None 22 | 23 | @model_validator(mode="after") 24 | def validate_and_get_message(self) -> "WeeklyDump": 25 | if self.num_matches == 0: 26 | msg = f"File {self.start_date} is present but is empty!" 27 | logger.warning(msg) 28 | return self 29 | 30 | 31 | class ApiSchema(BaseModel): 32 | db_dumps: List[WeeklyDump] 33 | total_matches: Optional[int] = None 34 | total_players: Optional[int] = None 35 | 36 | 37 | # ----------------------------------------------------------------------------- 38 | # RelicAPI schema 39 | # ----------------------------------------------------------------------------- 40 | 41 | 42 | class RelicResult(BaseModel): 43 | code: int 44 | message: str 45 | 46 | 47 | class Member(BaseModel): 48 | profile_id: Optional[int | float] = None 49 | name: Optional[str] = None 50 | alias: str 51 | personal_statgroup_id: Optional[int] = None 52 | xp: Optional[int] = None 53 | level: Optional[int] = None 54 | leaderboardregion_id: Optional[int] = None 55 | country: str 56 | 57 | 58 | class StatGroup(BaseModel): 59 | id: int 60 | name: Optional[str] = None 61 | type: Optional[int] = None 62 | members: List[Member] 63 | 64 | 65 | class LeaderboardStat(BaseModel): 66 | statgroup_id: int 67 | leaderboard_id: Optional[int] = None 68 | wins: Optional[int] = None 69 | losses: Optional[int] = None 70 | streak: Optional[int] = None 71 | disputes: Optional[int] = None 72 | drops: Optional[int] = None 73 | rank: int 74 | ranktotal: Optional[int] = None 75 | ranklevel: Optional[int] = None 76 | rating: Optional[int] = None 77 | regionrank: Optional[int] = None 78 | regionranktotal: Optional[int] = None 79 | lastmatchdate: int 80 | highestrank: Optional[int] = None 81 | highestranklevel: Optional[int] = None 82 | highestrating: Optional[int] = None 83 | 84 | 85 | class RelicResponse(BaseModel): 86 | statGroups: List[StatGroup] 87 | leaderboardStats: List[LeaderboardStat] 88 | 89 | 90 | # -------------------------------Aoestats .parquet schemas 91 | 92 | 93 | class Players(BaseModel): 94 | civ: str 95 | game_id: int 96 | match_rating_diff: float 97 | new_rating: int 98 | old_rating: int 99 | profile_id: Optional[int | float] = None 100 | replay_summary_raw: Optional[str | float] = None 101 | team: int 102 | winner: bool 103 | 104 | 105 | class Matches(BaseModel): 106 | avg_elo: float 107 | duration: timedelta 108 | game_id: int 109 | game_speed: str 110 | game_type: str 111 | irl_duration: timedelta 112 | leaderboard: str 113 | map: str 114 | mirror: bool 115 | num_players: int 116 | patch: int 117 | raw_match_type: Optional[int | float] = None 118 | replay_enhanced: bool 119 | started_timestamp: datetime 120 | starting_age: str 121 | team_0_elo: float 122 | team_1_elo: float 123 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI Workflow 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | lint_and_test: 10 | name: Lint & Format Codebase 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 
| - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.12' 20 | 21 | - name: Install dependencies 22 | run: | 23 | pip install --upgrade pip 24 | pip install -r requirements.txt 25 | 26 | - name: Run_linting_with_Black 27 | uses: psf/black@stable 28 | with: 29 | src: '.' 30 | options: '--check' 31 | env: 32 | DEFAULT_BRANCH: main 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | - name: Check for linting errors 36 | if: steps.Run_linting_with_Black.outcome == 'failure' 37 | run: echo "Linting errors found. Please fix and re-commit." 38 | continue-on-error: true 39 | 40 | - name: Run tests with pytest 41 | run: | 42 | pytest src/project_tests.py 43 | 44 | dev_dbt_deployment: 45 | name: Slim CI dbt-run in dev environment 46 | needs: lint_and_test 47 | runs-on: ubuntu-latest 48 | steps: 49 | - name: Check out repository 50 | uses: actions/checkout@v3 51 | 52 | - name: Set up Python 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: '3.12' 56 | 57 | - name: Install dependencies 58 | run: pip install dbt-core dbt-snowflake 59 | 60 | - name: Write profiles.yml 61 | env: 62 | DBT_PROFILES: ${{ secrets.DBT_PROFILES }} 63 | run: echo "$DBT_PROFILES" > src/transform/dbt_aoe/profiles.yml 64 | 65 | - name: Configure AWS Credentials 66 | uses: aws-actions/configure-aws-credentials@v2 67 | with: 68 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 69 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 70 | aws-region: ${{ secrets.AWS_REGION }} 71 | 72 | - name: Download manifest.json from S3 73 | run: | 74 | aws s3 cp s3://aoe2dumps/dev_artifacts/manifest.json src/transform/dbt_aoe/manifest.json 75 | 76 | - name: Run dbt debug 77 | run: | 78 | cd src/transform/dbt_aoe 79 | dbt debug 80 | 81 | - name: Run dbt deps 82 | run: | 83 | cd src/transform/dbt_aoe 84 | dbt deps 85 | 86 | - name: Run dbt compile 87 | run: | 88 | # This creates the current manifest for the dbt project, based on merged files 89 | cd src/transform/dbt_aoe 90 | dbt compile 91 | 92 | - name: Dbt list changed files 93 | run: | 94 | # This will list the changes between our current files, and what was last merged to prod 95 | cd src/transform/dbt_aoe 96 | dbt ls --models state:modified --state . 97 | 98 | - name: Refresh dbt external tables 99 | run: | 100 | cd src/transform/dbt_aoe 101 | dbt run-operation --target dev stage_external_sources --vars "ext_full_refresh: true" 102 | 103 | - name: Run dbt run 104 | run: | 105 | cd src/transform/dbt_aoe 106 | dbt run --target dev --models state:modified+ --state . 107 | 108 | - name: Run dbt tests 109 | run: | 110 | cd src/transform/dbt_aoe 111 | dbt test --target dev --models state:modified+ --state . 
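          # Slim CI note: "state:modified+" selects only the models that differ from the
          # manifest.json downloaded from S3 earlier in this job (plus their downstream
          # models), so unchanged models are skipped in the dev run and tests.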
112 | 113 | - name: Upload updated manifest.json to S3 114 | run: | 115 | aws s3 cp src/transform/dbt_aoe/target/manifest.json s3://aoe2dumps/dev_artifacts/manifest.json 116 | -------------------------------------------------------------------------------- /.github/workflows/cd.yaml: -------------------------------------------------------------------------------- 1 | name: CD Workflow 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | 9 | jobs: 10 | copy_s3_data: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v3 16 | 17 | - name: Configure AWS Credentials 18 | uses: aws-actions/configure-aws-credentials@v2 19 | with: 20 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 21 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 22 | aws-region: ${{ secrets.AWS_REGION }} 23 | 24 | - name: Install AWS CLI (if not already installed) 25 | run: | 26 | if ! command -v aws &> /dev/null; then 27 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 28 | unzip awscliv2.zip 29 | sudo ./aws/install 30 | fi 31 | 32 | - name: Sync S3 Data 33 | run: | 34 | aws s3 sync s3://aoe2dumps/dev/ s3://aoe2dumps/prod/ 35 | 36 | - name: Verify Sync Operation 37 | run: | 38 | # Example verification: List objects in the destination prefix 39 | aws s3 ls s3://aoe2dumps/prod/ --recursive 40 | 41 | 42 | prod_dbt_deployment: 43 | name: Slim CI dbt-run in prod environment 44 | needs: copy_s3_data 45 | runs-on: ubuntu-latest 46 | steps: 47 | - name: Check out repository 48 | uses: actions/checkout@v3 49 | 50 | - name: Set up Python 51 | uses: actions/setup-python@v4 52 | with: 53 | python-version: '3.12' 54 | 55 | - name: Install dependencies 56 | run: pip install dbt-core dbt-snowflake 57 | 58 | - name: Write profiles.yml 59 | env: 60 | DBT_PROFILES: ${{ secrets.DBT_PROFILES }} 61 | run: echo "$DBT_PROFILES" > src/transform/dbt_aoe/profiles.yml 62 | 63 | - name: Configure AWS Credentials 64 | uses: aws-actions/configure-aws-credentials@v2 65 | with: 66 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 67 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 68 | aws-region: ${{ secrets.AWS_REGION }} 69 | 70 | - name: Download manifest.json from S3 71 | run: | 72 | aws s3 cp s3://aoe2dumps/prod_artifacts/manifest.json src/transform/dbt_aoe/manifest.json 73 | 74 | - name: Run dbt debug 75 | run: | 76 | cd src/transform/dbt_aoe 77 | dbt debug 78 | 79 | - name: Run dbt deps 80 | run: | 81 | cd src/transform/dbt_aoe 82 | dbt deps 83 | 84 | - name: Run dbt compile 85 | run: | 86 | # This creates the current manifest for the dbt project, based on merged files 87 | cd src/transform/dbt_aoe 88 | dbt compile 89 | 90 | - name: Dbt list changed files 91 | run: | 92 | # This will list the changes between our current files, and what was last merged to prod 93 | cd src/transform/dbt_aoe 94 | dbt ls --models state:modified --state . 95 | 96 | - name: Refresh dbt external tables 97 | run: | 98 | cd src/transform/dbt_aoe 99 | dbt run-operation --target prod stage_external_sources --vars "ext_full_refresh: true" 100 | 101 | - name: Run dbt run 102 | run: | 103 | cd src/transform/dbt_aoe 104 | dbt run --target prod --models state:modified+ --state . 105 | 106 | - name: Run dbt tests 107 | run: | 108 | cd src/transform/dbt_aoe 109 | dbt test --target prod --models state:modified+ --state . 
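          # The target/manifest.json produced by this build is uploaded to S3 in the next
          # step, so the next Slim CI comparison is made against the state just deployed.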
110 | 111 | - name: Upload updated manifest.json to S3 112 | run: | 113 | aws s3 cp src/transform/dbt_aoe/target/manifest.json s3://aoe2dumps/prod_artifacts/manifest.json 114 | 115 | - name: Run post-merge actions 116 | run: | 117 | echo "Pull request has been approved and merged to main" 118 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/seeds/country_list.csv: -------------------------------------------------------------------------------- 1 | Country,Country_code 2 | Afghanistan,af 3 | Ã…land Islands,ax 4 | Albania,al 5 | Algeria,dz 6 | American Samoa,as 7 | Andorra,ad 8 | Angola,ao 9 | Anguilla,ai 10 | Antarctica,aq 11 | Antigua and Barbuda,ag 12 | Argentina,ar 13 | Armenia,am 14 | Aruba,aw 15 | Australia,au 16 | Austria,at 17 | Azerbaijan,az 18 | Bahamas,bs 19 | Bahrain,bh 20 | Bangladesh,bd 21 | Barbados,bb 22 | Belarus,by 23 | Belgium,be 24 | Belize,bz 25 | Benin,bj 26 | Bermuda,bm 27 | Bhutan,bt 28 | Bolivia,bo 29 | Bonaire,bq 30 | Bosnia and Herzegovina,ba 31 | Botswana,bw 32 | Bouvet Island,bv 33 | Brazil,br 34 | British Indian Ocean Territory,io 35 | British Virgin Islands,vg 36 | Brunei,bn 37 | Bulgaria,bg 38 | Burkina Faso,bf 39 | Burundi,bi 40 | Cabo Verde,cv 41 | Cambodia,kh 42 | Cameroon,cm 43 | Canada,ca 44 | Cayman Islands,ky 45 | Central African Republic,cf 46 | Chad,td 47 | Chile,cl 48 | China,cn 49 | Christmas Island,cx 50 | Cocos (Keeling) Islands,cc 51 | Colombia,co 52 | Comoros,km 53 | Congo,cg 54 | Congo (DRC),cd 55 | Cook Islands,ck 56 | Costa Rica,cr 57 | Côte d’Ivoire,ci 58 | Croatia,hr 59 | Cuba,cu 60 | Curaçao,cw 61 | Cyprus,cy 62 | Czechia,cz 63 | Denmark,dk 64 | Djibouti,dj 65 | Dominica,dm 66 | Dominican Republic,do 67 | Ecuador,ec 68 | Egypt,eg 69 | El Salvador,sv 70 | Equatorial Guinea,gq 71 | Eritrea,er 72 | Estonia,ee 73 | Eswatini,sz 74 | Ethiopia,et 75 | Falkland Islands,fk 76 | Faroe Islands,fo 77 | Fiji,fj 78 | Finland,fi 79 | France,fr 80 | French Guiana,gf 81 | French Polynesia,pf 82 | French Southern Territories,tf 83 | Gabon,ga 84 | Gambia,gm 85 | Georgia,ge 86 | Germany,de 87 | Ghana,gh 88 | Gibraltar,gi 89 | Greece,gr 90 | Greenland,gl 91 | Grenada,gd 92 | Guadeloupe,gp 93 | Guam,gu 94 | Guatemala,gt 95 | Guernsey,gg 96 | Guinea,gn 97 | Guinea-Bissau,gw 98 | Guyana,gy 99 | Haiti,ht 100 | Heard Island and McDonald Islands,hm 101 | Honduras,hn 102 | Hong Kong SAR,hk 103 | Hungary,hu 104 | Iceland,is 105 | India,in 106 | Indonesia,id 107 | Iran,ir 108 | Iraq,iq 109 | Ireland,ie 110 | Isle of Man,im 111 | Israel,il 112 | Italy,it 113 | Jamaica,jm 114 | Jan Mayen,xj 115 | Japan,jp 116 | Jersey,je 117 | Jordan,jo 118 | Kazakhstan,kz 119 | Kenya,ke 120 | Kiribati,ki 121 | Korea,kr 122 | Kosovo,xk 123 | Kuwait,kw 124 | Kyrgyzstan,kg 125 | Laos,la 126 | Latvia,lv 127 | Lebanon,lb 128 | Lesotho,ls 129 | Liberia,lr 130 | Libya,ly 131 | Liechtenstein,li 132 | Lithuania,lt 133 | Luxembourg,lu 134 | Macao SAR,mo 135 | Madagascar,mg 136 | Malawi,mw 137 | Malaysia,my 138 | Maldives,mv 139 | Mali,ml 140 | Malta,mt 141 | Marshall Islands,mh 142 | Martinique,mq 143 | Mauritania,mr 144 | Mauritius,mu 145 | Mayotte,yt 146 | Mexico,mx 147 | Micronesia,fm 148 | Moldova,md 149 | Monaco,mc 150 | Mongolia,mn 151 | Montenegro,me 152 | Montserrat,ms 153 | Morocco,ma 154 | Mozambique,mz 155 | Myanmar,mm 156 | Namibia,na 157 | Nauru,nr 158 | Nepal,np 159 | Netherlands,nl 160 | New Caledonia,nc 161 | New Zealand,nz 162 | Nicaragua,ni 163 | Niger,ne 164 | Nigeria,ng 165 | Niue,nu 166 | Norfolk Island,nf 167 | North 
Korea,kp 168 | North Macedonia,mk 169 | Northern Mariana Islands,mp 170 | Norway,no 171 | Oman,om 172 | Pakistan,pk 173 | Palau,pw 174 | Palestinian Authority,ps 175 | Panama,pa 176 | Papua New Guinea,pg 177 | Paraguay,py 178 | Peru,pe 179 | Philippines,ph 180 | Pitcairn Islands,pn 181 | Poland,pl 182 | Portugal,pt 183 | Puerto Rico,pr 184 | Qatar,qa 185 | Réunion,re 186 | Romania,ro 187 | Russia,ru 188 | Rwanda,rw 189 | Saba,xs 190 | Saint Barthélemy,bl 191 | Saint Kitts and Nevis,kn 192 | Saint Lucia,lc 193 | Saint Martin,mf 194 | Saint Pierre and Miquelon,pm 195 | Saint Vincent and the Grenadines,vc 196 | Samoa,ws 197 | San Marino,sm 198 | São Tomé and Príncipe,st 199 | Saudi Arabia,sa 200 | Senegal,sn 201 | Serbia,rs 202 | Seychelles,sc 203 | Sierra Leone,sl 204 | Singapore,sg 205 | Sint Eustatius,xe 206 | Sint Maarten,sx 207 | Slovakia,sk 208 | Slovenia,si 209 | Solomon Islands,sb 210 | Somalia,so 211 | South Africa,za 212 | South Georgia and South Sandwich Islands,gs 213 | South Sudan,ss 214 | Spain,es 215 | Sri Lanka,lk 216 | "St Helena, Ascension, Tristan da Cunha",sh 217 | Sudan,sd 218 | Suriname,sr 219 | Svalbard,sj 220 | Sweden,se 221 | Switzerland,ch 222 | Syria,sy 223 | Taiwan,tw 224 | Tajikistan,tj 225 | Tanzania,tz 226 | Thailand,th 227 | Timor-Leste,tl 228 | Togo,tg 229 | Tokelau,tk 230 | Tonga,to 231 | Trinidad and Tobago,tt 232 | Tunisia,tn 233 | Turkey,tr 234 | Turkmenistan,tm 235 | Turks and Caicos Islands,tc 236 | Tuvalu,tv 237 | U.S. Outlying Islands,um 238 | U.S. Virgin Islands,vi 239 | Uganda,ug 240 | Ukraine,ua 241 | United Arab Emirates,ae 242 | United Kingdom,gb 243 | United States,us 244 | Uruguay,uy 245 | Uzbekistan,uz 246 | Vanuatu,vu 247 | Vatican City,va 248 | Venezuela,ve 249 | Vietnam,vn 250 | Wallis and Futuna,wf 251 | Yemen,ye 252 | Zambia,zm 253 | Zimbabwe,zw 254 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/Medallion_README.md: -------------------------------------------------------------------------------- 1 | # Medallion Logical Layer Design 2 | 3 | For this project, I have structured the tables to use the Medallion framework, sorting the data pipeline into the logical layers: Bronze -> Silver -> Gold. This approach keeps the data logically separated into distinct schemas as it moves from 'dirty' raw data into 'clean' conformed dimensional models. Note: I have overridden the dbt macro `generate_schema_name.sql` so that each model's medallion layer is used as its schema name within the Snowflake data warehouse. 4 | 5 | --- 6 | 7 | ## Bronze Layer 8 | 9 | ### Overview 10 | 11 | The Bronze layer represents the cleaned landing data reconfigured into a database-usable form. It is expected to hold temporary data that is prepared to be loaded into Silver Tables. 12 | 13 | #### Components 14 | 15 | - **External Tables** 16 | 17 | External tables are defined in `ext_table_schema.yml` and represent the data within the S3 bucket path. Loading and refreshing of the external tables are handled by the dbt package `dbt_external_tables`. No transformation takes place here; fields are shown 'as is' from the underlying data. We add the `_ext` suffix to easily identify these within Snowflake. 18 | 19 | - **Raw Views** 20 | 21 | The Raw Views are Snowflake views built on top of the external tables. They contain all fields from the external table, plus metadata fields such as source name and load timestamp.

Data is filtered on its `file_date` according to the `control_master_table`, such that we only bring in the relevant rows of data needed for consumption. This filtering is applied by the user-defined macro `filter_load.sql`, which can be found in the macros folder. We add the `v_` prefix to denote it is a view, and the `_raw` suffix to identify this is a raw reflection of the source data. 22 | 23 | - **Bronze Tables** 24 | 25 | The Bronze Tables read the data from the Raw Views and apply light transformations. This includes flattening JSON, applying correct data types, and renaming fields. Data is also de-duplicated based on the defined primary key and timestamp, using the user-defined macro `deduplicate_by_key.sql`. A Python model `dim_date_br.py` is also used to create the `dim_date` table, as it is easy and efficient to do so with the pandas library. We add the `_br` suffix to denote the bronze tables. 26 | 27 | ### Data Retention 28 | 29 | The Bronze layer's purpose is to reflect a cleaned version of the staging area and hence is expected to contain temporarily held tables/views. 30 | 31 | - **Object Type**: Transient Tables or Views. 32 | - **Loading Type**: Truncate & Load or Create & Replace. 33 | - **Keeps Historical Data**: No. Data is not expected to be retained in the Bronze layer. Reloading data will overwrite the previously held data in the table. History will be retained in the Silver/Gold layers. 34 | 35 | --- 36 | 37 | ## Silver Layer 38 | 39 | ### Overview 40 | 41 | The Silver layer represents the area in which we run further transformations and apply business logic to the data. Transformations can involve creating new surrogate keys, joining new datasets together, applying conditional logic, etc. 42 | 43 | #### Silver Tables 44 | 45 | In this project, the data is already in a mostly usable form, so we have a light Silver layer with minor transformations. We create a surrogate key on the `player_match_sr` table using an MD5 hash, to uniquely identify rows with a single field. This makes it easier for Snowflake to perform UPSERT commands, rather than relying on a composite key. We also ingest a simple 'seed' file (`country_list.csv`) into the table `player_leaderboard_stats_sr`, to provide the mapping between country codes and names. We add the `_sr` suffix to denote the silver tables. 46 | 47 | ### Data Retention 48 | 49 | The Silver layer's purpose is to hold a record and history of previously loaded bronze data. Hence, data is expected to be held in permanent tables. 50 | 51 | - **Object Type**: Permanent and/or incrementally loaded Tables. 52 | - **Loading Type**: Insert & Update (mainly). 53 | - **Keeps Historical Data**: Yes. Key Silver tables should not be dropped or truncated unless a full reload is required or specified otherwise. 54 | 55 | --- 56 | 57 | ## Gold Layer 58 | 59 | ### Overview 60 | 61 | The Gold layer represents the area in which data is modeled into a dimensional model in preparation for reporting. As such, this entails: 62 | 63 | - **Fact Tables**: The primary table that stores the measures and foreign keys to the dimension tables. 64 | - **Dimension Tables**: Tables that contain the attributes of a certain area, with a primary key that joins to the Fact table. 65 | 66 | ### Data Retention 67 | 68 | Gold layer tables should retain data and keep a historical view. As such, inserts should be done as 'delta' loads, only adding in new data. 69 | 70 | - **Object Type**: Permanent and/or incrementally loaded Tables.
71 | - **Loading Type**: Insert & Update (mainly). 72 | - **Keeps Historical Data**: Yes. Gold tables should not be dropped or truncated unless a full reload is required or specified otherwise. -------------------------------------------------------------------------------- /.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. **USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | 3 | from contextlib import contextmanager 4 | import logging 5 | import os 6 | 7 | import pytest 8 | 9 | from airflow.models import DagBag, Variable, Connection 10 | from airflow.hooks.base import BaseHook 11 | from airflow.utils.db import initdb 12 | 13 | # init airflow database 14 | initdb() 15 | 16 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 17 | 18 | 19 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 20 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 21 | print( 22 | f"Attempted to fetch connection during parse returning an empty Connection object for {key}" 23 | ) 24 | return Connection(key) 25 | 26 | 27 | BaseHook.get_connection = basehook_get_connection_monkeypatch 28 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 29 | 30 | 31 | # =========== MONKEYPATCH OS.GETENV() =========== 32 | def os_getenv_monkeypatch(key: str, *args, **kwargs): 33 | default = None 34 | if args: 35 | default = args[0] # os.getenv should get at most 1 arg after the key 36 | if kwargs: 37 | default = kwargs.get( 38 | "default", None 39 | ) # and sometimes kwarg if people are using the sig 40 | 41 | env_value = os.environ.get(key, None) 42 | 43 | if env_value: 44 | return env_value # if the env_value is set, return it 45 | if ( 46 | key == "JENKINS_HOME" and default is None 47 | ): # fix https://github.com/astronomer/astro-cli/issues/601 48 | return None 49 | if default: 50 | return default # otherwise return whatever default has been passed 51 | return f"MOCKED_{key.upper()}_VALUE" # if absolutely nothing has been passed - return the mocked value 52 | 53 | 54 | os.getenv = os_getenv_monkeypatch 55 | # # =========== /MONKEYPATCH OS.GETENV() =========== 56 | 57 | # =========== MONKEYPATCH VARIABLE.GET() =========== 58 | 59 | 60 | class magic_dict(dict): 61 | def __init__(self, *args, **kwargs): 62 | self.update(*args, **kwargs) 63 | 64 | def __getitem__(self, key): 65 | return {}.get(key, "MOCKED_KEY_VALUE") 66 | 67 | 68 | _no_default = object() # allow falsey defaults 69 | 70 | 71 | def variable_get_monkeypatch(key: str, default_var=_no_default, deserialize_json=False): 72 | print( 73 | f"Attempted to get Variable value during parse, returning a mocked value for {key}" 74 | ) 75 | 76 | if default_var is not _no_default: 77 | return default_var 78 | if deserialize_json: 79 | return magic_dict() 80 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 81 | 82 | 83 | Variable.get = variable_get_monkeypatch 84 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 85 | 86 | 87 | @contextmanager 88 | def suppress_logging(namespace): 89 | """ 90 | Suppress logging within a specific namespace to keep tests "clean" during build 91 | """ 92 | logger = logging.getLogger(namespace) 93 | old_value = logger.disabled 94 | logger.disabled = True 95 | try: 96 | yield 97 | finally: 98 | logger.disabled = old_value 99 | 100 | 101 | def get_import_errors(): 102 | """ 103 | Generate a tuple for import errors in the dag bag, and include 
DAGs without errors. 104 | """ 105 | with suppress_logging("airflow"): 106 | dag_bag = DagBag(include_examples=False) 107 | 108 | def strip_path_prefix(path): 109 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 110 | 111 | # Initialize an empty list to store the tuples 112 | result = [] 113 | 114 | # Iterate over the items in import_errors 115 | for k, v in dag_bag.import_errors.items(): 116 | result.append((strip_path_prefix(k), v.strip())) 117 | 118 | # Check if there are DAGs without errors 119 | for file_path in dag_bag.dags: 120 | # Check if the file_path is not in import_errors, meaning no errors 121 | if file_path not in dag_bag.import_errors: 122 | result.append((strip_path_prefix(file_path), "No import errors")) 123 | 124 | return result 125 | 126 | 127 | @pytest.mark.parametrize( 128 | "rel_path, rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 129 | ) 130 | def test_file_imports(rel_path, rv): 131 | """Test for import errors on a file""" 132 | if os.path.exists(".astro/dag_integrity_exceptions.txt"): 133 | with open(".astro/dag_integrity_exceptions.txt", "r") as f: 134 | exceptions = f.readlines() 135 | print(f"Exceptions: {exceptions}") 136 | if (rv != "No import errors") and rel_path not in exceptions: 137 | # If rv is not "No import errors," consider it a failed test 138 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 139 | else: 140 | # If rv is "No import errors," consider it a passed test 141 | print(f"{rel_path} passed the import test") 142 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from datetime import datetime, timedelta 3 | import requests 4 | import os 5 | import boto3 6 | from typing import Dict, Optional, BinaryIO 7 | import io 8 | import time 9 | import logging 10 | import backoff 11 | import snowflake.connector 12 | from dotenv import load_dotenv, find_dotenv 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | # Load environment variables from .env file 18 | load_dotenv(find_dotenv()) 19 | 20 | # ----------------------------------------------------------------------------- 21 | # Classes 22 | # ----------------------------------------------------------------------------- 23 | 24 | 25 | class Config: 26 | def __init__(self, yaml_file: str): 27 | with open(yaml_file, "r") as f: 28 | self.__dict__.update(yaml.safe_load(f)) 29 | self.run_date = self.parse_date( 30 | self.backdate_days_start, self.target_run_date, self.date_format 31 | ) 32 | self.run_end_date = self.parse_date( 33 | self.backdate_days_end, self.target_run_end_date, self.date_format 34 | ) 35 | 36 | @staticmethod 37 | def parse_date(backdate_days: int, specific_date: str, date_format: str): 38 | """Creates a date object on initialisation. If target_run_date is specified, 39 | it takes priority, otherwise uses a number of backdated days from current date. 
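
        Illustrative examples (the date_format value here is an assumption, e.g. "%Y-%m-%d"):
            parse_date(7, None, "%Y-%m-%d")          -> the date 7 days before today
            parse_date(0, "2024-01-01", "%Y-%m-%d")  -> date(2024, 1, 1)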
40 | """ 41 | if specific_date: 42 | result = datetime.strptime(specific_date, date_format).date() 43 | else: 44 | result = (datetime.now() - timedelta(days=backdate_days)).date() 45 | return result 46 | 47 | 48 | # ----------------------------------------------------------------------------- 49 | # Functions 50 | # ----------------------------------------------------------------------------- 51 | 52 | 53 | @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=3) 54 | def fetch_api_file( 55 | base_url: str, endpoint: str, params: Optional[Dict] = None 56 | ) -> BinaryIO | None: 57 | """Fetches a file from an API endpoint and returns it as a BytesIO object.""" 58 | try: 59 | url = base_url + endpoint 60 | response = requests.get(url, params=params) 61 | response.raise_for_status() 62 | if not response.content: 63 | logger.warning("Received empty response from API.") 64 | return None 65 | content = io.BytesIO(response.content) 66 | return content 67 | except requests.RequestException as e: 68 | logger.error(f"Error fetching data: {e}") 69 | return None 70 | 71 | 72 | @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=3) 73 | def fetch_api_json(base_url: str, endpoint: str, params: dict) -> dict | None: 74 | """Fetches JSON data from an API endpoint with retry logic.""" 75 | try: 76 | url = base_url + endpoint 77 | response = requests.get(url, params=params) 78 | response.raise_for_status() 79 | return response.json() 80 | except requests.RequestException as e: 81 | logger.warning(f"API request failed: {e}. Retrying...") 82 | raise # Re-raise to trigger backoff 83 | 84 | 85 | def create_s3_session(s3=None): 86 | if s3 == None: 87 | logger.info("Authenticating to S3.") 88 | s3 = boto3.client( 89 | "s3", 90 | aws_access_key_id=os.getenv("AWS_ACCESS_KEY"), 91 | aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), 92 | region_name=os.getenv("AWS_REGION"), 93 | ) 94 | return s3 95 | 96 | 97 | def upload_to_s3(s3, data, bucket_name, path_key): 98 | try: 99 | s3.upload_fileobj(data, bucket_name, path_key) 100 | logger.info( 101 | f"File '{path_key}' uploaded to S3 bucket '{bucket_name}' successfully!" 102 | ) 103 | except Exception as e: 104 | logger.error(f"Error uploading file: {e}") 105 | 106 | 107 | def timer(func): 108 | """A simple timer decorator to record how long a function took to run.""" 109 | 110 | def wrapper(*args, **kwargs): 111 | start_time = time.time() 112 | result = func(*args, **kwargs) 113 | end_time = time.time() 114 | elapsed_time = end_time - start_time 115 | logger.info( 116 | f"Function '{func.__name__}' took {elapsed_time:.1f} seconds to run." 117 | ) 118 | return result 119 | 120 | return wrapper 121 | 122 | 123 | def sf_connect( 124 | db: str | None = os.getenv("SF_DATABASE"), 125 | schema: str | None = os.getenv("SF_SCHEMA"), 126 | ) -> snowflake.connector.connection.SnowflakeConnection: 127 | """ 128 | Uses environment variables (store in .env) to connect to Snowflake. 129 | Optional: to specify database or schema. 130 | Returns: A snowflake connection object. 
131 | """ 132 | 133 | logger.info("Connecting to Snowflake...") 134 | connection_parameters = { 135 | "account": os.getenv("SF_ACCOUNT_NAME"), 136 | "user": os.getenv("SF_USERNAME"), 137 | "password": os.getenv("SF_PASSWORD"), 138 | "warehouse": os.getenv("SF_WAREHOUSE"), 139 | "role": os.getenv("SF_ROLE"), 140 | "database": db, 141 | "schema": schema, 142 | } 143 | connection = snowflake.connector.connect(**connection_parameters) 144 | logger.info("Connection successfully created") 145 | return connection 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Age of Empires 2 Project 2 | 3 | ## 1. Overview 4 | 5 | The Age of Empires 2 (aoe2) project dashboard provides a comprehensive summary view of player and match statistics from the video game 'Age of Empires 2 DE'. Data is automatically pulled and refreshed weekly in a PowerBI report, enabling in-depth analysis and data slicing/dicing. The dashboard aims to answer questions such as: 6 | 7 | * Who are the top performing players currently? 8 | * What civilization should I counter-pick to maximize my chances of beating my opponent? 9 | * How has my favorite civilization performed over time? 10 | 11 | ## 2. Features 12 | 13 | * Automated weekly data extraction, loading, and transformation. 14 | * Pipeline incorporates numerous tests to ensure high data quality. 15 | * User-friendly dashboards for data and insights visualization. 16 | 17 | ## 3. Project Structure 18 | 19 | ### High level view 20 | 21 | Data Pipeline 22 | 23 | ### Pipeline DAG - project all 24 | 25 | Airflow project DAG 26 | 27 | ### Pipeline DAG - dbt models 28 | 29 | Airflow dbt DAG 30 | 31 | ### Dimensional Model 32 | 33 | Dimensional Model 34 | 35 | ## 4. Dashboard Example 36 | 37 | ### Leaderboard 38 | 39 | Player Leaderboard 40 | 41 | ### Counter-Civ picker 42 | 43 | Civ Counter Picker 44 | 45 | ### Civ Performance over time 46 | 47 | Civ Performance 48 | 49 | ## 5. Tools 50 | 51 | * **a) Python (Data Extract & Load)** 52 | * Custom-built modules (API data extraction) 53 | * Pydantic (schema validation) 54 | * Pytest (unit testing) 55 | * Logging & API retries (error handling) 56 | * **b) Apache Airflow (Data Orchestration)** 57 | * Astronomer Cosmos library (via Docker container) 58 | * **c) Snowflake (Data Warehouse)** 59 | * **d) dbt core (Data Transformation)** 60 | * User-defined Macros 61 | * Seeds 62 | * User-defined Generic and Custom tests 63 | * `dbt_external_tables` package 64 | * SQL & Python models 65 | * Incremental modeling (performance optimization) 66 | * **e) Git/Github Actions (Version Control)** 67 | * CI/CD pipeline (linting, testing, replication) 68 | * Slim CI (optimization) 69 | * Dev & Prod environments (software development) 70 | * **f) PowerBI (Data Visualization)** 71 | * **g) Other** 72 | * AWS S3 buckets (data storage) 73 | * Medallion architecture (logical data modeling) 74 | * Star Schema (dimensional data modeling) 75 | * `.env` & `config.yaml` files (Configuration as Code) 76 | * `README.md` files & Dbt docs (documentation) 77 | * `requirements.txt` (package management) 78 | 79 | ## 6. Project Methodology & Technical Details 80 | 81 | ### Data Extraction and Load 82 | 83 | The data pipeline uses the ELT framework, extracting and loading data "as-is" from APIs into an AWS S3 bucket. Data is sourced from two APIs: 84 | 85 | 1. 
**Aoestats.io API (`https://aoestats.io`)** 86 | This API provides historical player and match data. Two endpoints are used: one for dataset metadata (JSON) and another for direct download of match and player data (Parquet). Custom Python functions generate API endpoint strings, query the API, validate schemas using Pydantic, and load data into S3. 87 | 88 | 2. **Relic-link API (now WorldsEdge)** 89 | This unofficial community API provides the latest leaderboard data (JSON). Due to a 100-row request limit, data is retrieved in chunks. Each chunk is validated and loaded as a separate JSON file into S3 to avoid exceeding Snowflake's Variant column limit on ingestion to Snowflake External Tables. 90 | 91 | Each API endpoint has dedicated Python scripts following a consistent template: 92 | 93 | * a. Import functions from helper modules (`utils`, `filter`, `loader`). 94 | * b. Ingest parameters from the configuration file. 95 | * c. Establish an AWS S3 connection. 96 | * d. Submit GET requests to retrieve data. 97 | * e. Validate data against the expected schema (Pydantic). 98 | * f. Load data into the S3 bucket. 99 | 100 | Unit tests using `pytest` ensure function correctness. Airflow DAGs orchestrate script execution. An `all_project_dag.py` script runs all individual DAGs, including a `dbt_dag.py` for transformation steps. 101 | 102 | ### Data Transformation 103 | 104 | Data transformation occurs in dbt on Snowflake, using the Medallion architecture (bronze -> silver -> gold). The gold layer uses a star schema optimized for visualization tools like PowerBI. Each schema has a corresponding `_schema.yml` file for documentation. Further details are available in `Medallion_README.md` under the dbt `models` folder. 105 | 106 | ### Workflow Environment 107 | 108 | Development and production environments are separated using distinct S3 buckets (`dev`, `prod`) and Snowflake databases (`aoe_dev`, `aoe_prod`). Dbt profiles switch between targets. Production data is synced from development. 109 | 110 | ### Github Workflows 111 | 112 | CI workflows (`ci.yaml`) on pull requests run linting (Black), `pytest`, and Slim CI for dbt. CD workflows (`cd.yaml`) on merge to main sync S3 data to `prod` and run Slim CI on production tables (`aoe_prod` db in Snowflake). 113 | 114 | ### Reducing Compute Costs 115 | 116 | Date-driven directory structures in S3 enable delta loading in Snowflake, processing only new data. The dbt models on `dbt run` only process the date/date-range specified from the `load_master` control table. Dbt's incremental models and Slim CI further minimize compute costs. 117 | 118 | ## 7. Future Direction 119 | 120 | * Enhanced dbt testing (unit tests, improved thresholds). 121 | * Data quality dashboards for dbt/Airflow runs. 122 | * Improved Airflow failure notifications. 123 | * Incorporating additional AOE data (civilization strengths/weaknesses, logos). 124 | * Infrastructure as Code (IaC) for Snowflake/AWS. 125 | * Migrating processing to AWS (managed Airflow, EC2). 126 | * Utilizing RelicLink API for live data. 127 | 128 | ## 8. 
Miscellaneous 129 | 130 | ### Project Structure 131 | 132 | ```bash 133 | ├── Dockerfile 134 | ├── .env 135 | ├── README.md 136 | ├── requirements.txt 137 | ├── .github 138 | │ └── workflows 139 | │ ├── ci.yaml 140 | │ └── cd.yaml 141 | ├── dags 142 | │ ├── all_project_dag.py 143 | │ ├── dbt_dag.py 144 | │ ├── elt_metadata_dag.py 145 | │ ├── elt_relic_api_dag.py 146 | │ ├── elt_stat_matches_dag.py 147 | │ ├── elt_stat_players_dag.py 148 | │ └── set_load_master_dag.py 149 | ├── src 150 | │ ├── __init__.py 151 | │ ├── config.yaml 152 | │ ├── elt_metadata.py 153 | │ ├── elt_relic_api.py 154 | │ ├── elt_stat_matches.py 155 | │ ├── elt_stat_players.py 156 | │ ├── project_tests.py 157 | │ ├── set_load_master.py 158 | │ ├── utils.py 159 | │ ├── extract 160 | │ │ ├── __init__.py 161 | │ │ ├── filter.py 162 | │ │ └── models.py 163 | │ ├── load 164 | │ │ ├── __init__.py 165 | │ │ └── loader.py 166 | │ └── transform 167 | │ └── dbt_aoe 168 | │ ├── dbt_project.yml 169 | │ ├── package-lock.yml 170 | │ ├── packages.yml 171 | │ ├── profiles.yml.template 172 | │ ├── analyses 173 | │ ├── dbt_packages 174 | │ │ └── dbt_external_tables 175 | │ ├── macros 176 | │ │ ├── deduplicate_by_key.sql 177 | │ │ ├── filter_load.sql 178 | │ │ └── generate_schema_name.sql 179 | │ ├── models 180 | │ │ ├── Medallion_README.md 181 | │ │ ├── bronze 182 | │ │ │ ├── bronze_schema.yml 183 | │ │ │ ├── dim_date_br.py 184 | │ │ │ ├── ext_table_schema.yml 185 | │ │ │ ├── leaderboards_br.sql 186 | │ │ │ ├── matches_br.sql 187 | │ │ │ ├── players_br.sql 188 | │ │ │ ├── statgroup_br.sql 189 | │ │ │ ├── v_matches_raw.sql 190 | │ │ │ ├── v_players_raw.sql 191 | │ │ │ └── v_relic_raw.sql 192 | │ │ ├── gold 193 | │ │ │ ├── dim_civ.sql 194 | │ │ │ ├── dim_date.sql 195 | │ │ │ ├── dim_match.sql 196 | │ │ │ ├── dim_player.sql 197 | │ │ │ ├── fact_player_matches.sql 198 | │ │ │ └── gold_schema.yml 199 | │ │ └── silver 200 | │ │ ├── matches_sr.sql 201 | │ │ ├── player_leaderboard_stats_sr.sql 202 | │ │ ├── player_match_sr.sql 203 | │ │ └── silver_schema.yml 204 | │ ├── seeds 205 | │ │ ├── country_list.csv 206 | │ │ └── seeds.yml 207 | │ ├── snapshots 208 | │ └── tests 209 | │ ├── assert_countrys_mapped.sql 210 | │ └── generic 211 | │ ├── test_recent_ldts.sql 212 | │ └── test_within_threshold.sql 213 | ``` --------------------------------------------------------------------------------