├── src ├── __init__.py ├── transform │ ├── .gitkeep │ └── dbt_aoe │ │ ├── macros │ │ ├── .gitkeep │ │ ├── generate_schema_name.sql │ │ ├── filter_load.sql │ │ └── deduplicate_by_key.sql │ │ ├── seeds │ │ ├── .gitkeep │ │ ├── seeds.yml │ │ └── country_list.csv │ │ ├── tests │ │ ├── .gitkeep │ │ ├── assert_countrys_mapped.sql │ │ └── generic │ │ │ ├── test_within_threshold.sql │ │ │ └── test_recent_ldts.sql │ │ ├── analyses │ │ └── .gitkeep │ │ ├── snapshots │ │ └── .gitkeep │ │ ├── .gitignore │ │ ├── .user.yml │ │ ├── packages.yml │ │ ├── package-lock.yml │ │ ├── models │ │ ├── bronze │ │ │ ├── v_relic_raw.sql │ │ │ ├── v_matches_raw.sql │ │ │ ├── v_players_raw.sql │ │ │ ├── players_br.sql │ │ │ ├── ext_table_schema.yml │ │ │ ├── dim_date_br.py │ │ │ ├── matches_br.sql │ │ │ ├── statgroup_br.sql │ │ │ ├── leaderboards_br.sql │ │ │ └── bronze_schema.yml │ │ ├── gold │ │ │ ├── dim_date.sql │ │ │ ├── dim_civ.sql │ │ │ ├── dim_player.sql │ │ │ ├── dim_match.sql │ │ │ ├── fact_player_matches.sql │ │ │ └── gold_schema.yml │ │ ├── silver │ │ │ ├── player_match_sr.sql │ │ │ ├── matches_sr.sql │ │ │ ├── player_leaderboard_stats_sr.sql │ │ │ └── silver_schema.yml │ │ └── Medallion_README.md │ │ ├── profiles.yml.template │ │ └── dbt_project.yml ├── load │ ├── __init__.py │ └── loader.py ├── extract │ ├── __init__.py │ ├── filter.py │ └── models.py ├── config.yaml ├── set_load_master.py ├── elt_relic_api.py ├── elt_metadata.py ├── elt_stat_matches.py ├── elt_stat_players.py ├── project_tests.py └── utils.py ├── .astro ├── config.yaml ├── dag_integrity_exceptions.txt └── test_dag_integrity_default.py ├── .dockerignore ├── README_resources ├── aoe_counter_pick.PNG ├── aoe_leaderboard.PNG ├── dbt_dag_airflow.PNG ├── project_all_dag.PNG ├── aoe_civ_performance.PNG ├── db_diagram_aoe_model.PNG └── aoe2project_data_pipeline.PNG ├── Dockerfile ├── requirements.txt ├── .gitignore ├── dags ├── dbt_dag.py ├── elt_metadata_dag.py ├── elt_relic_api_dag.py ├── set_load_master_dag.py ├── elt_stat_matches_dag.py ├── elt_stat_players_dag.py └── all_project_dag.py ├── dbdiagram.txt ├── .github └── workflows │ ├── ci.yaml │ └── cd.yaml └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: project 3 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/.user.yml: -------------------------------------------------------------------------------- 1 | id: ab0b036d-20c8-4235-b8a8-7364d501df69 2 | -------------------------------------------------------------------------------- /.astro/dag_integrity_exceptions.txt: -------------------------------------------------------------------------------- 1 | # Add dag files to exempt from parse test below. ex: dags/ -------------------------------------------------------------------------------- /src/transform/dbt_aoe/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_external_tables 3 | version: 0.9.0 -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | logs/ 6 | .venv 7 | airflow.db 8 | airflow.cfg 9 | -------------------------------------------------------------------------------- /README_resources/aoe_counter_pick.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe_counter_pick.PNG -------------------------------------------------------------------------------- /README_resources/aoe_leaderboard.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe_leaderboard.PNG -------------------------------------------------------------------------------- /README_resources/dbt_dag_airflow.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/dbt_dag_airflow.PNG -------------------------------------------------------------------------------- /README_resources/project_all_dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/project_all_dag.PNG -------------------------------------------------------------------------------- /src/load/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import load_json_data, load_parquet_data 2 | 3 | __all__ = ["load_json_data", "load_parquet_data"] 4 | -------------------------------------------------------------------------------- /README_resources/aoe_civ_performance.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe_civ_performance.PNG -------------------------------------------------------------------------------- /README_resources/db_diagram_aoe_model.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/db_diagram_aoe_model.PNG -------------------------------------------------------------------------------- /README_resources/aoe2project_data_pipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JonathanEnright/aoe_project/HEAD/README_resources/aoe2project_data_pipeline.PNG -------------------------------------------------------------------------------- /src/transform/dbt_aoe/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_external_tables 3 | version: 0.9.0 4 | sha1_hash: f459fa5ae8276125d72b0ae3ae38baf5847fbb81 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/astronomer/astro-runtime:12.3.0 2 | 3 | RUN python -m venv dbt_venv && source dbt_venv/bin/activate && \ 4 | pip install --no-cache-dir dbt-snowflake && deactivate -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/assert_countrys_mapped.sql: -------------------------------------------------------------------------------- 1 | --DBT Tests pass if no rows are returned. 2 | --Hence, write the test assertion to find failing rows. 3 | 4 | SELECT 5 | country_name 6 | FROM 7 | {{ ref('player_leaderboard_stats_sr') }} 8 | WHERE 9 | country_name = 'Unknown' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==2.0 2 | pandas==2.2.2 3 | pyarrow==18.0.0 4 | fastparquet==2024.5.0 5 | backoff==2.2.1 6 | boto3==1.24.28 7 | pydantic==2.9.2 8 | pytest==7.4.0 9 | python-dotenv==1.0.1 10 | PyYAML==6.0.2 11 | Requests==2.32.3 12 | astronomer-cosmos 13 | apache-airflow-providers-snowflake 14 | snowflake-connector-python==3.7.0 15 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {%- set default_schema = target.schema -%} 4 | {%- if custom_schema_name is none -%} 5 | 6 | {{ default_schema }} 7 | 8 | {%- else -%} 9 | 10 | {{ custom_schema_name | trim }} 11 | 12 | {%- endif -%} 13 | 14 | {%- endmacro %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/v_relic_raw.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | 4 | --NOTE: Due to the 2 list objects in the json schema, we will need to dedup the individual arrays 5 | SELECT DISTINCT 6 | * 7 | ,metadata$filename AS rsrc 8 | ,metadata$file_last_modified AS ldts 9 | ,'RELIC_LINK_API'::VARCHAR as source 10 | FROM 11 | {{ source('aoe_ext', 'relic_ext') }} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/seeds/seeds.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | seeds: 4 | - name: country_list 5 | description: 'list of country codes and names for 
aoe2 players' 6 | docs: 7 | show: true 8 | node_color: purple 9 | columns: 10 | - name: Country 11 | description: 'Name of country' 12 | - name: Country_code 13 | description: 'Country codes in lowercase' -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/v_matches_raw.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | 4 | SELECT DISTINCT 5 | * 6 | ,metadata$filename AS rsrc 7 | ,metadata$file_last_modified AS ldts 8 | ,'AOESTATS'::VARCHAR as source 9 | ,SPLIT(metadata$filename, '/')[2]::DATE AS file_date 10 | FROM 11 | {{ source('aoe_ext', 'matches_ext') }} 12 | WHERE 1=1 13 | {{ filter_load('file_date') }} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/v_players_raw.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} 2 | 3 | 4 | SELECT DISTINCT 5 | * 6 | ,metadata$filename AS rsrc 7 | ,metadata$file_last_modified AS ldts 8 | ,'AOESTATS'::VARCHAR as source 9 | ,SPLIT(metadata$filename, '/')[2]::DATE AS file_date 10 | FROM 11 | {{ source('aoe_ext', 'players_ext') }} 12 | WHERE 1=1 13 | {{ filter_load('file_date') }} 14 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_date.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='date_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | SELECT 11 | TO_VARCHAR(date, 'YYYYMMDD')::int as date_pk 12 | ,date::date as date 13 | ,year 14 | ,month 15 | ,day 16 | ,day_of_week 17 | ,is_weekend 18 | FROM 19 | {{ ref('dim_date_br') }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | metadata.json 4 | todo.txt 5 | extract_old.py 6 | .env 7 | archived_data 8 | .mypy_cache 9 | dbt_commands.txt 10 | my_dbt_ext_table.sql 11 | sf_setup.sql 12 | project/transform/logs 13 | relic_link_testing 14 | logs 15 | packages.txt 16 | test_dag_example.py 17 | include 18 | plugins 19 | .venv 20 | astro 21 | .git 22 | airflow_settings.yaml 23 | logs/ 24 | airflow.db 25 | airflow.cfg 26 | profiles.yml 27 | .pytest_cache 28 | .astro 29 | tree.py -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_civ.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='civ_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | SELECT DISTINCT 11 | MD5(civ) as civ_pk 12 | ,civ as civ_name 13 | ,''::TEXT as civ_weaknesses --placeholder, to be filled in later 14 | ,''::TEXT as civ_strengths --placeholder, to be filled in later 15 | ,CURRENT_DATE() AS load_date 16 | FROM 17 | {{ ref('players_br') }} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/filter_load.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro filter_load(file_date) %} 3 | {%- set env = target.name -%} 4 | 5 | AND {{ file_date }} > ( 6 | SELECT 7 | load_start_date 8 | FROM 9 | aoe.control.load_master 10 | WHERE 11 | environment = '{{ env 
}}' 12 | ) 13 | AND {{ file_date }} <= ( 14 | SELECT 15 | load_end_date 16 | FROM 17 | aoe.control.load_master 18 | WHERE 19 | environment = '{{ env }}' 20 | ) 21 | {% endmacro %} -------------------------------------------------------------------------------- /src/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import ApiSchema, WeeklyDump, RelicResponse, Matches, Players 2 | from .filter import ( 3 | validate_json_schema, 4 | generate_weekly_queries, 5 | create_stats_endpoints, 6 | validate_parquet_schema, 7 | fetch_relic_chunk, 8 | ) 9 | 10 | __all__ = [ 11 | "ApiSchema", 12 | "WeeklyDump", 13 | "Matches", 14 | "Players", 15 | "RelicResponse", 16 | "validate_json_schema", 17 | "generate_weekly_queries", 18 | "create_stats_endpoints", 19 | "validate_parquet_schema", 20 | "fetch_relic_chunk", 21 | ] 22 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_player.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | deduplicated AS ( 6 | {{ deduplicate_by_key('_ref(player_leaderboard_stats_sr)', 'CONCAT(profile_id,source)', 'ldts') }} 7 | ) 8 | SELECT 9 | MD5(CONCAT(profile_id::TEXT,'~',source)) as player_pk 10 | ,profile_id 11 | ,gaming_name 12 | ,country_code 13 | ,country_name 14 | ,statgroup_id 15 | ,wins 16 | ,losses 17 | ,current_rank 18 | ,current_rating 19 | ,last_match_date 20 | ,CURRENT_DATE() AS load_date 21 | ,source 22 | FROM 23 | deduplicated -------------------------------------------------------------------------------- /src/transform/dbt_aoe/profiles.yml.template: -------------------------------------------------------------------------------- 1 | your_project_name: 2 | target: dev 3 | outputs: 4 | dev: 5 | account: YOUR_ACCOUNT 6 | database: YOUR_DATABASE 7 | password: YOUR_PASSWORD 8 | role: YOUR_ROLE 9 | schema: YOUR_SCHEMA 10 | threads: 4 11 | type: snowflake 12 | user: YOUR_USERNAME 13 | warehouse: YOUR_WAREHOUSE 14 | prod: 15 | account: YOUR_ACCOUNT 16 | database: YOUR_DATABASE 17 | password: YOUR_PASSWORD 18 | role: YOUR_ROLE 19 | schema: YOUR_SCHEMA 20 | threads: 4 21 | type: snowflake 22 | user: YOUR_USERNAME 23 | warehouse: YOUR_WAREHOUSE -------------------------------------------------------------------------------- /src/transform/dbt_aoe/macros/deduplicate_by_key.sql: -------------------------------------------------------------------------------- 1 | -- This macro accepts both CTE's and table references. 
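-- It keeps only the newest row per key: ROW_NUMBER() partitioned by 'pk', ordered by 'date_field' DESC.
-- e.g. deduplicate_by_key('flattened', 'profile_id', 'ldts'), as used in statgroup_br.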
2 | -- The parameter 'tbl' expects a string, however we can manipulate this to use the 'ref' macro 3 | -- We can pass a ref{} table by specifying the parameter in the format '_ref({table_name})' 4 | -- Example: '_ref(v_relic_raw)' 5 | {% macro deduplicate_by_key(tbl, pk, date_field) %} 6 | SELECT 7 | * 8 | FROM 9 | {% if tbl is string and tbl.startswith('_ref') %} 10 | {{ ref(tbl[5:-1]) }} 11 | {% else %} 12 | {{ tbl }} 13 | {% endif %} 14 | QUALIFY ROW_NUMBER() OVER (PARTITION BY {{ pk }} ORDER BY {{ date_field }} DESC) = 1 15 | {% endmacro %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/player_match_sr.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='id', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | WITH cte as ( 11 | SELECT 12 | MD5(CONCAT(game_id,'~',profile_id)) as id 13 | ,game_id 14 | ,team 15 | ,profile_id 16 | ,civ 17 | ,winner 18 | ,match_rating_diff 19 | ,new_rating 20 | ,old_rating 21 | ,source 22 | ,file_date 23 | FROM 24 | {{ ref('players_br') }} 25 | WHERE 26 | profile_id IS NOT NULL --0.14% null from source, assume DQ issue. 27 | ) 28 | 29 | SELECT * FROM cte 30 | {% if is_incremental() %} 31 | where file_date > (select max(file_date) from {{ this }}) 32 | {% endif %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/players_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | * 8 | FROM 9 | {{ ref('v_players_raw') }} 10 | ) 11 | SELECT 12 | value: civ::VARCHAR AS civ 13 | ,value:game_id::INT AS game_id 14 | ,value:match_rating_diff::INT AS match_rating_diff 15 | ,value:new_rating::INT AS new_rating 16 | ,value:old_rating::INT AS old_rating 17 | ,value:profile_id::INT AS profile_id 18 | ,value:replay_summary_raw::ARRAY AS replay_summary_raw 19 | ,value:team::VARCHAR AS team 20 | ,value:winner::BOOLEAN AS winner 21 | ,rsrc::VARCHAR AS rsrc 22 | ,ldts::TIMESTAMP_NTZ(9) AS ldts 23 | ,source 24 | ,file_date 25 | FROM 26 | landing_data -------------------------------------------------------------------------------- /src/load/loader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import io 3 | from utils import upload_to_s3 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def load_json_data(model, file_dir, fn, bucket, s3): 10 | """Loads Pydantic model as json files in S3.""" 11 | file_name = f"{file_dir}/{fn}.json" 12 | json_data = model.model_dump_json(indent=4) 13 | json_to_bytes = io.BytesIO(json_data.encode("utf-8")) 14 | with json_to_bytes as file_obj: 15 | upload_to_s3(s3, file_obj, bucket, file_name) 16 | 17 | 18 | def load_parquet_data(data, file_dir, fn, bucket, s3): 19 | """Loads parquet data directly into S3 bucket.""" 20 | file_name = f"{file_dir}/{fn}.parquet" 21 | with data as file_obj: 22 | upload_to_s3(s3, file_obj, bucket, file_name) 23 | -------------------------------------------------------------------------------- /dags/dbt_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | from cosmos import DbtDag, ProjectConfig, ProfileConfig, ExecutionConfig 4 | 5 | profile_config = ProfileConfig( 6 | profile_name="dbt_aoe", 
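    # 'dbt_aoe' matches the profile declared in dbt_project.yml; connection details come from
    # profiles.yml (git-ignored, presumably created from profiles.yml.template).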
7 | target_name="dev", 8 | profiles_yml_filepath=f"{os.environ['AIRFLOW_HOME']}/src/transform/dbt_aoe/profiles.yml", 9 | ) 10 | 11 | dag = DbtDag( 12 | profile_config=profile_config, 13 | project_config=ProjectConfig( 14 | f"{os.environ['AIRFLOW_HOME']}/src/transform/dbt_aoe", 15 | ), 16 | operator_args={"install_deps": True}, 17 | execution_config=ExecutionConfig( 18 | dbt_executable_path=f"{os.environ['AIRFLOW_HOME']}/dbt_venv/bin/dbt", 19 | ), 20 | schedule_interval=timedelta(days=7), 21 | start_date=datetime(2023, 9, 10), 22 | catchup=False, 23 | dag_id="dbt_dag_id", 24 | ) 25 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/generic/test_within_threshold.sql: -------------------------------------------------------------------------------- 1 | {% test within_threshold(model, column_name, lower_threshold, upper_threshold, error_lim) %} 2 | 3 | 4 | WITH 5 | test_def as ( 6 | SELECT 7 | {{ column_name }} as test_field 8 | FROM 9 | {{ model }} 10 | ) 11 | ,perform_test as ( 12 | SELECT 13 | * 14 | FROM 15 | test_def 16 | WHERE 17 | test_field <= {{ lower_threshold }} 18 | OR test_field >= {{ upper_threshold }} 19 | ) 20 | ,validation_errors AS ( 21 | SELECT 22 | COUNT(*) AS n_errors 23 | FROM 24 | perform_test 25 | ) 26 | SELECT 27 | CASE WHEN n_errors > {{error_lim}} THEN 'fail' 28 | ELSE 'pass' 29 | END AS result 30 | FROM 31 | validation_errors 32 | WHERE 33 | result <> 'pass' --Test expected to pass, trigger 'failed test' otherwise 34 | 35 | {% endtest %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/tests/generic/test_recent_ldts.sql: -------------------------------------------------------------------------------- 1 | {% test recent_ldts(model, column_name, back_date_days) %} 2 | {{ config(severity = 'warn') }} 3 | 4 | 5 | WITH 6 | test_def as ( 7 | SELECT 8 | MAX({{ column_name }}) as test_field 9 | FROM 10 | {{ model }} 11 | ) 12 | ,perform_test as ( 13 | SELECT 14 | * 15 | FROM 16 | test_def 17 | WHERE 18 | test_field >= (CURRENT_DATE()-{{back_date_days}}) 19 | ) 20 | ,validation_errors AS ( 21 | SELECT 22 | COUNT(*) AS n_errors 23 | FROM 24 | perform_test 25 | ) 26 | SELECT 27 | --If there is no max(ldts) > backdate_date, then fail 28 | CASE WHEN n_errors = 0 THEN 'fail' 29 | ELSE 'pass' 30 | END AS result 31 | FROM 32 | validation_errors 33 | WHERE 34 | result <> 'pass' --Test expected to pass, trigger 'failed test' otherwise 35 | 36 | {% endtest %} -------------------------------------------------------------------------------- /src/transform/dbt_aoe/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | name: 'dbt_aoe' 4 | version: '1.0.0' 5 | 6 | # This setting configures which "profile" dbt uses for this project. 7 | profile: 'dbt_aoe' 8 | 9 | # These configurations specify where dbt should look for different types of files. 
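# (These paths are resolved relative to this dbt_project.yml.)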
10 | model-paths: ["models"] 11 | analysis-paths: ["analyses"] 12 | test-paths: ["tests"] 13 | seed-paths: ["seeds"] 14 | macro-paths: ["macros"] 15 | snapshot-paths: ["snapshots"] 16 | 17 | clean-targets: # directories to be removed by `dbt clean` 18 | - "target" 19 | - "dbt_packages" 20 | 21 | models: 22 | dbt_aoe: 23 | # Config indicated by + and applies to all files 24 | bronze: 25 | +materialized: table 26 | +schema: bronze 27 | silver: 28 | +materialized: table 29 | +schema: silver 30 | gold: 31 | +materialized: table 32 | +schema: gold -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/ext_table_schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: aoe_ext 5 | loader: S3 6 | 7 | tables: 8 | - name: matches_ext 9 | description: "External table of AOE matches, stored as parquet files" 10 | external: 11 | location: "@s3_stage/{{target.name}}/matches" 12 | file_format: parquet_ff 13 | auto_refresh: true 14 | 15 | - name: players_ext 16 | description: "External table of AOE players, stored as parquet files" 17 | external: 18 | location: "@s3_stage/{{target.name}}/players" 19 | file_format: parquet_ff 20 | auto_refresh: true 21 | 22 | - name: relic_ext 23 | description: "External table of relic API data, stored as json files" 24 | external: 25 | location: "@s3_stage/{{target.name}}/rl_api" 26 | file_format: json_ff 27 | auto_refresh: true -------------------------------------------------------------------------------- /src/config.yaml: -------------------------------------------------------------------------------- 1 | # parse_date params (update here for 1 time loads or backdating) 2 | backdate_days_start: 90 3 | backdate_days_end: 1 4 | target_run_date: ~ 5 | target_run_end_date: ~ 6 | date_format: '%Y-%m-%d' 7 | 8 | # AoeStats 9 | stats_dir_url: '/media/db_dumps/date_range%3D' 10 | matches_fn_suffix: 'matches' 11 | players_fn_suffix: 'players' 12 | stats_base_url: https://aoestats.io 13 | matches_folder_name: dev/matches 14 | players_folder_name: dev/players 15 | 16 | metadata_endpoint: /api/db_dumps.json 17 | metadata_fn_suffix: metadata 18 | metadata_folder_name: dev/meta 19 | 20 | # RelicLink 21 | relic_base_url: https://aoe-api.worldsedgelink.com 22 | relic_endpoint: '/community/leaderboard/getLeaderBoard2?' 23 | relic_params: 24 | leaderboard_id: 3 25 | title: age2 26 | sort_by: 0 27 | chunk_size: 100 28 | relic_fn_suffix: relic 29 | relic_folder_name: dev/rl_api 30 | 31 | # s3 bucket. Other bucket params are in .env file. 
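# (Assumed: .env supplies standard AWS credentials, e.g. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, for the boto3 session.)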
32 | bucket: aoe2dumps -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/dim_match.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='match_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | WITH 11 | deduplicated AS ( 12 | {{ deduplicate_by_key('_ref(matches_sr)', 'CONCAT(game_id,source)', 'ldts') }} 13 | ) 14 | ,cte AS ( 15 | SELECT 16 | MD5(CONCAT(game_id::TEXT,'~',source)) as match_pk 17 | ,game_id 18 | ,map 19 | ,avg_elo 20 | ,game_duration_secs 21 | ,actual_duration_secs 22 | ,game_started_timestamp 23 | ,game_date 24 | ,team_0_elo 25 | ,team_1_elo 26 | ,leaderboard 27 | ,mirror 28 | ,patch 29 | ,rsrc 30 | ,CURRENT_DATE() AS load_date 31 | ,source 32 | ,file_date 33 | FROM 34 | deduplicated 35 | ) 36 | 37 | SELECT * FROM cte 38 | {% if is_incremental() %} 39 | where file_date > (select max(file_date) from {{ this }}) 40 | or match_pk is NULL 41 | {% endif %} 42 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/matches_sr.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='game_id', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | with cte as ( 11 | SELECT 12 | game_id 13 | ,map 14 | ,game_type 15 | ,game_speed 16 | ,starting_age 17 | ,num_players 18 | ,avg_elo 19 | ,SUBSTR(duration::TEXT, 0, 4)::int as game_duration_secs 20 | ,SUBSTR(irl_duration::TEXT, 0, 4)::int as actual_duration_secs 21 | ,started_timestamp as game_started_timestamp 22 | ,started_timestamp::date as game_date 23 | ,team_0_elo::int as team_0_elo 24 | ,team_1_elo::int as team_1_elo 25 | ,leaderboard 26 | ,mirror 27 | ,patch 28 | ,rsrc 29 | ,ldts 30 | ,source 31 | ,file_date 32 | FROM 33 | {{ ref('matches_br') }} 34 | WHERE 35 | leaderboard = 'random_map' --filter for 1v1 RM's in this analysis 36 | ) 37 | 38 | SELECT * FROM cte 39 | {% if is_incremental() %} 40 | where file_date > (select max(file_date) from {{ this }}) 41 | {% endif %} 42 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/dim_date_br.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import datetime 3 | 4 | START_YEAR = 2020 5 | END_YEAR = 2029 6 | 7 | 8 | def model(dbt, session): 9 | """Generates a date dimension table""" 10 | 11 | # Define the start and end dates for the dimension table 12 | start_date = datetime(START_YEAR, 1, 1) 13 | end_date = datetime(END_YEAR, 12, 31) 14 | 15 | # Generate a date range 16 | date_range = pd.date_range(start=start_date, end=end_date, freq="D") 17 | 18 | dim_date_df = pd.DataFrame( 19 | { 20 | "date": date_range, 21 | "year": date_range.year, 22 | "month": date_range.month, 23 | "day": date_range.day, 24 | "day_of_week": date_range.dayofweek, # 0=Monday, 6=Sunday 25 | } 26 | ) 27 | 28 | dim_date_df["is_weekend"] = dim_date_df["day_of_week"].isin([5, 6]) 29 | 30 | # Convert all column names to uppercase, to make case insensitve in Snowflake 31 | dim_date_df = dim_date_df.rename(columns=str.upper) 32 | 33 | return dim_date_df 34 | -------------------------------------------------------------------------------- /dags/elt_metadata_dag.py: -------------------------------------------------------------------------------- 1 | from 
datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_metadata import main as run_elt_metadata 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_metadata_dag_id", 25 | default_args=default_args, 26 | description="ETL process for metadata", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "metadata"], 31 | ) as dag: 32 | run_elt_metadata_task = PythonOperator( 33 | task_id="run_elt_metadata", 34 | python_callable=run_elt_metadata, 35 | ) 36 | 37 | run_elt_metadata_task 38 | -------------------------------------------------------------------------------- /dags/elt_relic_api_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_relic_api import main as run_elt_relic_api 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_relic_api_id", 25 | default_args=default_args, 26 | description="ETL process for relic_api", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "relic"], 31 | ) as dag: 32 | run_elt_relic_api_task = PythonOperator( 33 | task_id="run_elt_relic_api", 34 | python_callable=run_elt_relic_api, 35 | ) 36 | 37 | run_elt_relic_api_task 38 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/matches_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | * 8 | FROM 9 | {{ ref('v_matches_raw') }} 10 | ) 11 | SELECT 12 | value: avg_elo::DEC(38,2) AS avg_elo 13 | ,value:duration::INT AS duration 14 | ,value:game_id::INT AS game_id 15 | ,value:game_speed::VARCHAR AS game_speed 16 | ,value:game_type::VARCHAR AS game_type 17 | ,value:irl_duration::INT AS irl_duration 18 | ,value:leaderboard::VARCHAR AS leaderboard 19 | ,value:map::VARCHAR AS map 20 | ,value:mirror::BOOLEAN AS mirror 21 | ,value:num_players::INT AS num_players 22 | ,value:patch::VARCHAR AS patch 23 | ,value:raw_match_type::VARCHAR AS raw_match_type 24 | ,value:replay_enhanced::BOOLEAN AS replay_enhanced 25 | ,value:started_timestamp::VARCHAR::TIMESTAMP_NTZ AS started_timestamp 26 | ,value:starting_age::VARCHAR AS starting_age 27 | ,value:team_0_elo::DEC(38,2) AS team_0_elo 28 | ,value:team_1_elo::DEC(38,2) AS team_1_elo 29 | ,rsrc::VARCHAR AS rsrc 30 | 
,ldts::TIMESTAMP_NTZ(9) AS ldts 31 | ,source 32 | ,file_date 33 | FROM 34 | landing_data -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/fact_player_matches.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key='fact_pk', 5 | on_schema_change='fail' 6 | ) 7 | }} 8 | 9 | 10 | WITH cte AS ( 11 | SELECT 12 | pm.id as fact_pk 13 | ,dm.match_pk as match_fk 14 | ,dp.player_pk as player_fk 15 | ,dc.civ_pk as civ_fk 16 | ,dd.date_pk as date_fk 17 | ,pm.team 18 | ,pm.winner 19 | ,pm.match_rating_diff 20 | ,pm.new_rating 21 | ,pm.old_rating 22 | ,pm.source 23 | ,pm.file_date 24 | ,CURRENT_DATE() as load_date 25 | FROM 26 | {{ ref('player_match_sr') }} as pm 27 | INNER JOIN 28 | {{ ref('dim_civ') }} as dc 29 | ON pm.civ = dc.civ_name 30 | INNER JOIN 31 | {{ ref('dim_match') }} as dm 32 | ON pm.game_id = dm.game_id 33 | INNER JOIN 34 | {{ ref('dim_player') }} as dp 35 | ON pm.profile_id = dp.profile_id 36 | INNER JOIN 37 | {{ ref('dim_date') }} as dd 38 | ON dm.game_date = dd.date 39 | ) 40 | 41 | SELECT * FROM cte 42 | {% if is_incremental() %} 43 | where file_date > (select max(file_date) from {{ this }}) 44 | or fact_pk IS NULL 45 | {% endif %} 46 | -------------------------------------------------------------------------------- /dags/set_load_master_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.set_load_master import main as run_set_load_master 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "set_load_master_id", 25 | default_args=default_args, 26 | description="ETL process for load_master", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "master"], 31 | ) as dag: 32 | run_set_load_master_task = PythonOperator( 33 | task_id="run_set_load_master", 34 | python_callable=run_set_load_master, 35 | ) 36 | 37 | run_set_load_master_task 38 | -------------------------------------------------------------------------------- /dags/elt_stat_matches_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_stat_matches import main as run_elt_stat_matches 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_stat_matches_id", 25 | 
default_args=default_args, 26 | description="ETL process for stat_matches", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "stats"], 31 | ) as dag: 32 | run_elt_stat_matches_task = PythonOperator( 33 | task_id="run_elt_stat_matches", 34 | python_callable=run_elt_stat_matches, 35 | ) 36 | 37 | run_elt_stat_matches_task 38 | -------------------------------------------------------------------------------- /dags/elt_stat_players_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.python import PythonOperator 4 | import sys 5 | import os 6 | 7 | # Add the 'src' directory to the Python path 8 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 9 | sys.path.append(os.path.join(project_root, "src")) 10 | 11 | from src.elt_stat_players import main as run_elt_stat_players 12 | 13 | default_args = { 14 | "owner": "Jono", 15 | "depends_on_past": False, 16 | "email": ["test@live.com"], 17 | "email_on_failure": False, 18 | "email_on_retry": False, 19 | "retries": 1, 20 | "retry_delay": timedelta(minutes=1), 21 | } 22 | 23 | with DAG( 24 | "elt_stat_players_id", 25 | default_args=default_args, 26 | description="ETL process for stat_players", 27 | schedule_interval=timedelta(days=7), 28 | start_date=datetime(2023, 1, 1), 29 | catchup=False, 30 | tags=["etl", "stats"], 31 | ) as dag: 32 | run_elt_stat_players_task = PythonOperator( 33 | task_id="run_elt_stat_players", 34 | python_callable=run_elt_stat_players, 35 | ) 36 | 37 | run_elt_stat_players_task 38 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/statgroup_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | VALUE AS json_col 8 | ,rsrc::VARCHAR as rsrc 9 | ,ldts::TIMESTAMP_NTZ(9) as ldts 10 | ,source 11 | FROM 12 | {{ ref('v_relic_raw') }} 13 | ) 14 | ,members_stats AS ( 15 | SELECT DISTINCT 16 | value:id::int as id 17 | ,value:name::VARCHAR as group_name 18 | ,value:type::int as type 19 | ,value:members::array as members 20 | ,rsrc 21 | ,ldts 22 | ,source 23 | FROM 24 | landing_data 25 | ,LATERAL FLATTEN(INPUT => json_col:statGroups) 26 | ) 27 | ,flattened AS ( 28 | SELECT 29 | id 30 | ,group_name 31 | ,type 32 | ,value:alias::VARCHAR as alias 33 | ,value:country::VARCHAR as country 34 | ,value:leaderboardregion_id::int as leaderboardregion_id 35 | ,value:level::int as level 36 | ,value:name::VARCHAR as name 37 | ,value:personal_statgroup_id::int as personal_statgroup_id 38 | ,value:profile_id::int as profile_id 39 | ,value:xp::int as xp 40 | ,rsrc 41 | ,ldts 42 | ,source 43 | FROM 44 | members_stats 45 | ,LATERAL FLATTEN(INPUT => members) 46 | ) 47 | ,deduplicated AS ( 48 | {{ deduplicate_by_key('flattened', 'profile_id', 'ldts') }} 49 | ) 50 | SELECT 51 | * 52 | FROM 53 | deduplicated -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/gold/gold_schema.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: dim_civ 6 | description: "Dimension table contain the attributes for civs" 7 | columns: 8 | - name: civ_pk 9 | description: "A primary key for this table" 10 | data_tests: 11 | - 
unique 12 | - not_null 13 | 14 | - name: dim_date 15 | description: "Dimension table containing the attributes for dates" 16 | columns: 17 | - name: date_pk 18 | description: "A primary key for this table" 19 | data_tests: 20 | - unique 21 | - not_null 22 | 23 | - name: dim_match 24 | description: "Dimension table containing the attributes for matches" 25 | columns: 26 | - name: match_pk 27 | description: "A primary key for this table" 28 | data_tests: 29 | - unique 30 | - not_null 31 | 32 | - name: dim_player 33 | description: "Dimension table containing the attributes for players" 34 | columns: 35 | - name: player_pk 36 | description: "A primary key for this table" 37 | data_tests: 38 | - unique 39 | - not_null 40 | 41 | - name: fact_player_matches 42 | description: "Fact table containing the joining keys and measures" 43 | columns: 44 | - name: fact_pk 45 | description: "The primary key for this table" 46 | data_tests: 47 | - unique 48 | - not_null -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/leaderboards_br.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | landing_data AS ( 6 | SELECT 7 | VALUE AS json_col 8 | ,rsrc::VARCHAR as rsrc 9 | ,ldts::TIMESTAMP_NTZ(9) as ldts 10 | ,source 11 | FROM 12 | {{ ref('v_relic_raw') }} 13 | ) 14 | ,flattened AS ( 15 | SELECT DISTINCT 16 | value:disputes::int as disputes 17 | ,value:drops::int as drops 18 | ,value:highestrank::int as highestrank 19 | ,value:highestranklevel::int as highestranklevel 20 | ,value:highestrating::int as highestrating 21 | ,value:lastmatchdate::int as lastmatchdate 22 | ,value:leaderboard_id::int as leaderboard_id 23 | ,value:losses::int as losses 24 | ,value:rank::int as rank 25 | ,value:ranklevel::int as ranklevel 26 | ,value:ranktotal::int as ranktotal 27 | ,value:rating::int as rating 28 | ,value:regionrank::int as regionrank 29 | ,value:regionranktotal::int as regionranktotal 30 | ,value:statgroup_id::int as statgroup_id 31 | ,value:streak::int as streak 32 | ,value:wins::int as wins 33 | ,rsrc 34 | ,ldts 35 | ,source 36 | FROM 37 | landing_data 38 | ,LATERAL FLATTEN(INPUT => json_col:leaderboardStats) 39 | ) 40 | ,deduplicated AS ( 41 | {{ deduplicate_by_key('flattened', 'statgroup_id', 'ldts') }} 42 | ) 43 | SELECT 44 | * 45 | FROM 46 | deduplicated -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/player_leaderboard_stats_sr.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table') }} 2 | 3 | 4 | WITH 5 | stat_group_data AS ( 6 | SELECT 7 | profile_id 8 | ,alias as gaming_name 9 | ,personal_statgroup_id 10 | ,country 11 | FROM 12 | {{ ref('statgroup_br') }} 13 | ) 14 | ,country_codes AS ( 15 | SELECT 16 | Country AS country_name 17 | ,Country_code 18 | FROM 19 | {{ ref('country_list') }} 20 | ) 21 | ,leaderboard_data AS ( 22 | SELECT 23 | statgroup_id 24 | ,wins 25 | ,losses 26 | ,rank as current_rank 27 | ,rating as current_rating 28 | ,TO_TIMESTAMP(lastmatchdate) as last_match_date 29 | ,ldts 30 | ,rsrc 31 | ,source 32 | FROM 33 | {{ ref('leaderboards_br') }} 34 | ) 35 | SELECT 36 | sgd.profile_id 37 | ,sgd.gaming_name 38 | ,sgd.country AS country_code 39 | ,COALESCE(cc.country_name, 'Unknown') AS country_name 40 | ,ld.statgroup_id 41 | ,ld.wins 42 | ,ld.losses 43 | ,ld.current_rank 44 | ,ld.current_rating 45 | 
,ld.last_match_date 46 | ,ld.ldts 47 | ,ld.rsrc 48 | ,ld.source 49 | FROM 50 | leaderboard_data as ld 51 | INNER JOIN 52 | stat_group_data as sgd 53 | ON ld.statgroup_id = sgd.personal_statgroup_id 54 | LEFT JOIN 55 | country_codes AS cc 56 | ON sgd.country = cc.country_code 57 | -------------------------------------------------------------------------------- /src/set_load_master.py: -------------------------------------------------------------------------------- 1 | from utils import Config, sf_connect 2 | import logging 3 | import os 4 | from pathlib import Path 5 | 6 | # Configure logging 7 | logging.basicConfig( 8 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 9 | ) 10 | logger = logging.getLogger(__name__) 11 | 12 | # Get the directory of the current script 13 | script_dir = Path(__file__).resolve().parent 14 | 15 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 16 | 17 | # Set up script to update only 'dev' file date ranges. 18 | # 'prod' runs will be bring in all data by default, 19 | # unless control table manually overridden or updated here (ENV = "prod"). 20 | ENV = "dev" 21 | 22 | 23 | def update_load_master(connection, env: str, from_date: str, to_date: str): 24 | query = f""" 25 | UPDATE load_master 26 | SET 27 | load_start_date = '{from_date}' 28 | ,load_end_date = '{to_date}' 29 | WHERE 30 | environment = '{env}' 31 | ; 32 | """ 33 | cursor = connection.cursor() 34 | cursor.execute(query) 35 | result = cursor.fetchone()[0] 36 | cursor.close() 37 | return result 38 | 39 | 40 | def main(*args, **kwargs): 41 | config = Config(YAML_CONFIG) 42 | run_date_from = config.run_date 43 | run_date_to = config.run_end_date 44 | 45 | connection = sf_connect(db="aoe", schema="control") 46 | update_load_master(connection, ENV, run_date_from, run_date_to) 47 | connection.close() 48 | logger.info("Script complete.") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /src/elt_relic_api.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, create_s3_session 2 | from extract import RelicResponse, validate_json_schema, fetch_relic_chunk 3 | from load import load_json_data 4 | import logging 5 | import os 6 | from pathlib import Path 7 | 8 | # Configure logging 9 | logging.basicConfig( 10 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 11 | ) 12 | logger = logging.getLogger(__name__) 13 | 14 | # Get the directory of the current script 15 | script_dir = Path(__file__).resolve().parent 16 | 17 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 18 | 19 | 20 | @timer 21 | def main(*args, **kwargs): 22 | config = Config(YAML_CONFIG) 23 | 24 | # Setup: 25 | s3 = create_s3_session() 26 | _base_url = config.relic_base_url 27 | _endpoint = config.relic_endpoint 28 | _params = config.relic_params 29 | _validation_schema = RelicResponse 30 | _fn = config.relic_fn_suffix 31 | _file_dir = config.relic_folder_name 32 | _s3_bucket = config.bucket 33 | 34 | # Extract phase 35 | logger.info("Starting data extraction.") 36 | content_chunk = fetch_relic_chunk(_base_url, _endpoint, _params) 37 | 38 | for i, json_data in enumerate(content_chunk): 39 | fn = f"{_fn}_{i+1}" 40 | 41 | # Validate phase 42 | validated_data = validate_json_schema(json_data, _validation_schema) 43 | 44 | # Load phase 45 | load_json_data(validated_data, _file_dir, fn, _s3_bucket, s3) 46 | logger.info("Script complete.") 47 | 48 
| 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/bronze/bronze_schema.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: leaderboards_br 6 | description: "Schema of flattened leaderboard data from Relic API" 7 | columns: 8 | - name: statgroup_id 9 | description: "The primary key for this table" 10 | data_tests: 11 | - unique 12 | - not_null 13 | - name: ldts 14 | data_tests: 15 | - recent_ldts: 16 | 'back_date_days': 7 17 | 18 | - name: statgroup_br 19 | description: "Schema of flattened player stats data from Relic API" 20 | columns: 21 | - name: profile_id 22 | description: "The primary key for this table" 23 | data_tests: 24 | - unique 25 | - not_null 26 | - name: personal_statgroup_id 27 | description: "Joining key to leaderboards table" 28 | - name: ldts 29 | data_tests: 30 | - recent_ldts: 31 | 'back_date_days': 7 32 | 33 | - name: matches_br 34 | description: "Individual match data from Aoestats.io" 35 | columns: 36 | - name: game_id 37 | description: "The primary key for this table" 38 | data_tests: 39 | - unique 40 | - not_null 41 | 42 | - name: ldts 43 | data_tests: 44 | - recent_ldts: 45 | 'back_date_days': 7 46 | 47 | - name: players_br 48 | description: "Individual player data from Aoestats.io" 49 | columns: 50 | - name: game_id 51 | description: "Joining key for the table when combined with profile_id" 52 | - name: profile_id 53 | description: "Joining key for the table when combined with game_id" 54 | - name: ldts 55 | data_tests: 56 | - recent_ldts: 57 | 'back_date_days': 7 58 | -------------------------------------------------------------------------------- /src/elt_metadata.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, fetch_api_json, create_s3_session 2 | from extract import validate_json_schema, ApiSchema 3 | from load import load_json_data 4 | import logging 5 | import os 6 | from pathlib import Path 7 | 8 | # Configure logging 9 | logging.basicConfig( 10 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 11 | ) 12 | logger = logging.getLogger(__name__) 13 | 14 | # Get the directory of the current script 15 | script_dir = Path(__file__).resolve().parent 16 | 17 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 18 | 19 | 20 | @timer 21 | def main(*args, **kwargs): 22 | config = Config(YAML_CONFIG) 23 | 24 | # Setup: 25 | s3 = create_s3_session() 26 | _base_url = config.stats_base_url 27 | _endpoint = config.metadata_endpoint 28 | _params = None 29 | _validation_schema = ApiSchema 30 | _fn = f"{config.run_end_date}_{config.metadata_fn_suffix}" 31 | _file_dir = config.metadata_folder_name 32 | _s3_bucket = config.bucket 33 | 34 | try: 35 | # Extract phase 36 | logger.info("Starting data extraction.") 37 | json_data = fetch_api_json(_base_url, _endpoint, _params) 38 | 39 | if json_data is None: 40 | logger.error( 41 | f"Failed to fetch data after 3 attempts for endpoint: {_endpoint}" 42 | ) 43 | raise 44 | 45 | # Validate phase 46 | logger.info("Validating data.") 47 | validated_data = validate_json_schema(json_data, _validation_schema) 48 | 49 | # Load phase 50 | logger.info("Starting data loading.") 51 | load_json_data(validated_data, _file_dir, _fn, _s3_bucket, s3) 52 | 53 | logger.info("ELT process completed successfully.") 54 | except Exception as e: 55 | logger.error(f"ELT 
process failed: {e}") 56 | raise 57 | logger.info("Script complete.") 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/silver/silver_schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: matches_sr 5 | description: "Cleaned matches table filtered for 1v1RM matches only" 6 | columns: 7 | - name: game_duration_secs 8 | data_tests: 9 | - within_threshold: 10 | 'lower_threshold': 0 11 | 'upper_threshold': 10000 12 | 'error_lim': 10 13 | - name: actual_duration_secs 14 | data_tests: 15 | - within_threshold: 16 | 'lower_threshold': 0 17 | 'upper_threshold': 10000 18 | 'error_lim': 10 19 | - name: game_started_timestamp 20 | data_tests: 21 | - within_threshold: 22 | 'lower_threshold': "'2022-01-01'" 23 | 'upper_threshold': CURRENT_DATE()+1 24 | 'error_lim': 1 25 | - name: game_date 26 | data_tests: 27 | - within_threshold: 28 | 'lower_threshold': "'2022-01-01'" 29 | 'upper_threshold': CURRENT_DATE()+1 30 | 'error_lim': 1 31 | - name: ldts 32 | data_tests: 33 | - recent_ldts: 34 | 'back_date_days': 7 35 | 36 | - name: player_leaderboard_stats_sr 37 | description: "Schema of flattened player stats data from Relic API" 38 | columns: 39 | - name: last_match_date 40 | description: "The last time the player has played" 41 | data_tests: 42 | - within_threshold: 43 | 'lower_threshold': "'2022-01-01'" 44 | 'upper_threshold': CURRENT_DATE()+1 45 | 'error_lim': 1 46 | - name: ldts 47 | data_tests: 48 | - recent_ldts: 49 | 'back_date_days': 7 50 | 51 | - name: player_match_sr 52 | description: "pre-fact table containing mainly business keys and measures" 53 | columns: 54 | - name: id 55 | description: "The primary key for this table" 56 | data_tests: 57 | - unique 58 | - not_null -------------------------------------------------------------------------------- /dbdiagram.txt: -------------------------------------------------------------------------------- 1 | Table "DIM_CIV" { 2 | "CIV_PK" VARCHAR(32) 3 | "CIV_NAME" VARCHAR(16777216) 4 | "CIV_WEAKNESSES" VARCHAR(16777216) 5 | "CIV_STRENGTHS" VARCHAR(16777216) 6 | "LOAD_DATE" DATE 7 | } 8 | 9 | Table "DIM_DATE" { 10 | "DATE_PK" NUMBER(38,0) 11 | "DATE" DATE 12 | "YEAR" NUMBER(38,0) 13 | "MONTH" NUMBER(38,0) 14 | "DAY" NUMBER(38,0) 15 | "DAY_OF_WEEK" NUMBER(38,0) 16 | "IS_WEEKEND" BOOLEAN 17 | } 18 | 19 | Table "DIM_MATCH" { 20 | "MATCH_PK" VARCHAR(32) 21 | "GAME_ID" NUMBER(38,0) 22 | "MAP" VARCHAR(16777216) 23 | "AVG_ELO" NUMBER(38,2) 24 | "GAME_DURATION_SECS" NUMBER(38,0) 25 | "ACTUAL_DURATION_SECS" NUMBER(38,0) 26 | "GAME_STARTED_TIMESTAMP" TIMESTAMP_NTZ(6) 27 | "GAME_DATE" DATE 28 | "TEAM_0_ELO" NUMBER(38,0) 29 | "TEAM_1_ELO" NUMBER(38,0) 30 | "LEADERBOARD" VARCHAR(16777216) 31 | "MIRROR" BOOLEAN 32 | "PATCH" VARCHAR(16777216) 33 | "RSRC" VARCHAR(16777216) 34 | "LOAD_DATE" DATE 35 | "SOURCE" VARCHAR(16777216) 36 | "FILE_DATE" DATE 37 | } 38 | 39 | Table "DIM_PLAYER" { 40 | "PLAYER_PK" VARCHAR(32) 41 | "PROFILE_ID" NUMBER(38,0) 42 | "GAMING_NAME" VARCHAR(16777216) 43 | "COUNTRY_CODE" VARCHAR(16777216) 44 | "COUNTRY_NAME" VARCHAR(16777216) 45 | "STATGROUP_ID" NUMBER(38,0) 46 | "WINS" NUMBER(38,0) 47 | "LOSSES" NUMBER(38,0) 48 | "CURRENT_RANK" NUMBER(38,0) 49 | "CURRENT_RATING" NUMBER(38,0) 50 | "LAST_MATCH_DATE" TIMESTAMP_NTZ(0) 51 | "LOAD_DATE" DATE 52 | "SOURCE" VARCHAR(16777216) 53 | } 54 | 55 | Table "FACT_PLAYER_MATCHES" { 56 | "FACT_PK" VARCHAR(32) 57 | 
"MATCH_FK" VARCHAR(32) 58 | "PLAYER_FK" VARCHAR(32) 59 | "CIV_FK" VARCHAR(32) 60 | "DATE_FK" NUMBER(38,0) 61 | "TEAM" VARCHAR(16777216) 62 | "WINNER" BOOLEAN 63 | "MATCH_RATING_DIFF" NUMBER(38,0) 64 | "NEW_RATING" NUMBER(38,0) 65 | "OLD_RATING" NUMBER(38,0) 66 | "SOURCE" VARCHAR(16777216) 67 | "FILE_DATE" DATE 68 | "LOAD_DATE" DATE 69 | } 70 | 71 | 72 | Ref: FACT_PLAYER_MATCHES.MATCH_FK < DIM_MATCH.MATCH_PK 73 | Ref: FACT_PLAYER_MATCHES.PLAYER_FK < DIM_PLAYER.PLAYER_PK 74 | Ref: FACT_PLAYER_MATCHES.CIV_FK < DIM_CIV.CIV_PK 75 | Ref: FACT_PLAYER_MATCHES.DATE_FK < DIM_DATE.DATE_PK 76 | -------------------------------------------------------------------------------- /src/elt_stat_matches.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, fetch_api_file, create_s3_session 2 | from extract import ( 3 | Matches, 4 | generate_weekly_queries, 5 | create_stats_endpoints, 6 | validate_parquet_schema, 7 | ) 8 | from load import load_parquet_data 9 | import logging 10 | import os 11 | from pathlib import Path 12 | 13 | # Configure logging 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | # Get the directory of the current script 20 | script_dir = Path(__file__).resolve().parent 21 | 22 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 23 | 24 | 25 | @timer 26 | def main(*args, **kwargs): 27 | config = Config(YAML_CONFIG) 28 | 29 | # Setup: 30 | s3 = create_s3_session() 31 | _fn = config.matches_fn_suffix 32 | _file_dir = config.matches_folder_name 33 | _validation_schema = Matches 34 | 35 | _start_date = config.run_date 36 | _end_date = config.run_end_date 37 | _base_url = config.stats_base_url + config.stats_dir_url 38 | _params = None 39 | _s3_bucket = config.bucket 40 | 41 | # Pre-extract phase 42 | weekly_querys = generate_weekly_queries(_start_date, _end_date) 43 | endpoints = create_stats_endpoints(_fn, weekly_querys) 44 | 45 | for i, endpoint in enumerate(endpoints): 46 | endpoint_url = endpoint["endpoint_str"] 47 | dated_filename = endpoint["file_date"] 48 | file_dir = f"{_file_dir}/{dated_filename.split('_')[0]}" 49 | 50 | # Extract phase 51 | content = fetch_api_file(_base_url, endpoint_url, _params) 52 | 53 | if content is None: 54 | logger.error( 55 | f"Failed to fetch data after 3 attempts for endpoint: {endpoint_url}" 56 | ) 57 | continue # Skip to the next iteration of the loop 58 | 59 | # Validate phase 60 | validated_data = validate_parquet_schema(content, _validation_schema) 61 | 62 | # Load phase 63 | load_parquet_data(validated_data, file_dir, dated_filename, _s3_bucket, s3) 64 | logger.info(f"{i+1}/{len(endpoints)} loaded.") 65 | logger.info("Script complete.") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /src/elt_stat_players.py: -------------------------------------------------------------------------------- 1 | from utils import Config, timer, fetch_api_file, create_s3_session 2 | from extract import ( 3 | Players, 4 | generate_weekly_queries, 5 | create_stats_endpoints, 6 | validate_parquet_schema, 7 | ) 8 | from load import load_parquet_data 9 | import logging 10 | import os 11 | from pathlib import Path 12 | 13 | # Configure logging 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 16 | ) 17 | logger = logging.getLogger(__name__) 18 | 19 | # Get 
the directory of the current script 20 | script_dir = Path(__file__).resolve().parent 21 | 22 | YAML_CONFIG = os.path.join(script_dir, "config.yaml") 23 | 24 | 25 | @timer 26 | def main(*args, **kwargs): 27 | config = Config(YAML_CONFIG) 28 | 29 | # Setup: 30 | s3 = create_s3_session() 31 | _fn = config.players_fn_suffix 32 | _file_dir = config.players_folder_name 33 | _validation_schema = Players 34 | 35 | _start_date = config.run_date 36 | _end_date = config.run_end_date 37 | _base_url = config.stats_base_url + config.stats_dir_url 38 | _params = None 39 | _s3_bucket = config.bucket 40 | 41 | # Pre-extract phase 42 | weekly_querys = generate_weekly_queries(_start_date, _end_date) 43 | endpoints = create_stats_endpoints(_fn, weekly_querys) 44 | 45 | for i, endpoint in enumerate(endpoints): 46 | endpoint_url = endpoint["endpoint_str"] 47 | dated_filename = endpoint["file_date"] 48 | file_dir = f"{_file_dir}/{dated_filename.split('_')[0]}" 49 | 50 | # Extract phase 51 | content = fetch_api_file(_base_url, endpoint_url, _params) 52 | 53 | if content is None: 54 | logger.error( 55 | f"Failed to fetch data after 3 attempts for endpoint: {endpoint_url}" 56 | ) 57 | continue # Skip to the next iteration of the loop 58 | 59 | # Validate phase 60 | validated_data = validate_parquet_schema(content, _validation_schema) 61 | 62 | # Load phase 63 | load_parquet_data(validated_data, file_dir, dated_filename, _s3_bucket, s3) 64 | logger.info(f"{i+1}/{len(endpoints)} loaded.") 65 | logger.info("Script complete.") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /src/project_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from datetime import date 3 | import os 4 | import sys 5 | import pandas as pd 6 | import io 7 | from pydantic import BaseModel 8 | 9 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 10 | sys.path.append(os.path.join(project_root, "src")) 11 | 12 | from src.extract.filter import * 13 | from src.load.loader import * 14 | 15 | 16 | @pytest.fixture 17 | def sample_dates(): 18 | return {"start_date": date(2023, 1, 1), "end_date": date(2023, 1, 14)} 19 | 20 | 21 | # Test Data and Fixtures 22 | @pytest.fixture 23 | def sample_validation_schema(): 24 | class TestSchema(BaseModel): 25 | id: int 26 | name: str 27 | 28 | return TestSchema 29 | 30 | 31 | # Tests for validate_parquet_schema 32 | def test_validate_parquet_schema(sample_validation_schema): 33 | # Create a sample DataFrame 34 | df = pd.DataFrame([{"id": 1, "name": "test1"}, {"id": 2, "name": "test2"}]) 35 | 36 | # Create a parquet file in memory 37 | parquet_buffer = io.BytesIO() 38 | df.to_parquet(parquet_buffer) 39 | parquet_buffer.seek(0) 40 | 41 | result = validate_parquet_schema(parquet_buffer, sample_validation_schema) 42 | assert result is not None 43 | 44 | 45 | # Tests for validate_json_schema 46 | def test_validate_json_schema_valid(sample_validation_schema): 47 | valid_data = {"id": 1, "name": "test"} 48 | result = validate_json_schema(valid_data, sample_validation_schema) 49 | assert result.id == 1 50 | assert result.name == "test" 51 | 52 | 53 | def test_validate_json_schema_invalid(sample_validation_schema): 54 | invalid_data = {"id": "not_an_integer", "name": "test"} 55 | result = validate_json_schema(invalid_data, sample_validation_schema) 56 | assert result == [] 57 | 58 | 59 | def test_generate_weekly_queries(sample_dates): 60 | 
queries = generate_weekly_queries( 61 | sample_dates["start_date"], sample_dates["end_date"] 62 | ) 63 | assert len(queries) == 2 # Should generate 2 weeks worth of queries 64 | assert all("dated" in q and "query_str" in q for q in queries) 65 | assert queries[0]["query_str"].startswith("2023-01-01") 66 | 67 | 68 | def test_create_stats_endpoints(): 69 | weekly_queries = [{"dated": date(2023, 1, 1), "query_str": "2023-01-01_2023-01-07"}] 70 | extract_file = "stats" 71 | endpoints = create_stats_endpoints(extract_file, weekly_queries) 72 | assert len(endpoints) == 1 73 | assert endpoints[0]["endpoint_str"] == "2023-01-01_2023-01-07/stats.parquet" 74 | assert endpoints[0]["file_date"] == "2023-01-01_stats" 75 | -------------------------------------------------------------------------------- /dags/all_project_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python import PythonOperator 3 | from airflow.operators.trigger_dagrun import TriggerDagRunOperator 4 | 5 | from datetime import datetime 6 | 7 | default_args = {"start_date": datetime(2021, 1, 1)} 8 | 9 | 10 | def _dag_start(): 11 | print("Starting Airflow Pipeline to run all DAGS") 12 | 13 | 14 | def _dag_end(): 15 | print("Finished running Airflow Pipeline for all DAGS") 16 | 17 | 18 | with DAG( 19 | "trigger_dag", schedule_interval="@daily", default_args=default_args, catchup=False 20 | ) as dag: 21 | start_trigger = PythonOperator(task_id="run_all_start", python_callable=_dag_start) 22 | 23 | end_trigger = PythonOperator(task_id="run_all_end", python_callable=_dag_end) 24 | 25 | trigger_set_load_master = TriggerDagRunOperator( 26 | task_id="trigger_load_master", 27 | trigger_dag_id="set_load_master_id", 28 | wait_for_completion=True, 29 | poke_interval=30, 30 | ) 31 | 32 | trigger_elt_metadata = TriggerDagRunOperator( 33 | task_id="trigger_metadata", 34 | trigger_dag_id="elt_metadata_dag_id", 35 | wait_for_completion=True, 36 | poke_interval=30, 37 | ) 38 | 39 | trigger_elt_stat_matches = TriggerDagRunOperator( 40 | task_id="trigger_stat_matches", 41 | trigger_dag_id="elt_stat_matches_id", 42 | wait_for_completion=True, 43 | poke_interval=30, 44 | ) 45 | 46 | trigger_elt_stat_players = TriggerDagRunOperator( 47 | task_id="trigger_stat_players", 48 | trigger_dag_id="elt_stat_players_id", 49 | wait_for_completion=True, 50 | poke_interval=30, 51 | ) 52 | 53 | trigger_elt_relic_api = TriggerDagRunOperator( 54 | task_id="trigger_relic_api", 55 | trigger_dag_id="elt_relic_api_id", 56 | wait_for_completion=True, 57 | poke_interval=30, 58 | ) 59 | 60 | trigger_dbt_dag = TriggerDagRunOperator( 61 | task_id="trigger_dbt_dag", 62 | trigger_dag_id="dbt_dag_id", 63 | wait_for_completion=True, 64 | poke_interval=60, 65 | ) 66 | 67 | # Run order: Check for metadata, then load stats datasets. Can run relic_api simultaneously. 68 | start_trigger >> trigger_elt_metadata 69 | start_trigger >> trigger_set_load_master 70 | start_trigger >> trigger_elt_relic_api 71 | trigger_elt_metadata >> trigger_elt_stat_matches 72 | trigger_elt_metadata >> trigger_elt_stat_players 73 | 74 | # Only trigger dbt_dag if all 3 datasets + load_master are finished. 75 | trigger_set_load_master >> trigger_dbt_dag 76 | trigger_elt_stat_matches >> trigger_dbt_dag 77 | trigger_elt_stat_players >> trigger_dbt_dag 78 | trigger_elt_relic_api >> trigger_dbt_dag 79 | 80 | # Finally close dag run after dbt_dag has finished. 
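    # (Each TriggerDagRunOperator above sets wait_for_completion=True, so
    # end_trigger only runs once the triggered dbt_dag run, and therefore every
    # upstream dataset load it depends on, has actually completed.)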
81 | trigger_dbt_dag >> end_trigger 82 | -------------------------------------------------------------------------------- /src/extract/filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from datetime import timedelta 4 | from pydantic import ValidationError 5 | from utils import fetch_api_json 6 | 7 | import pandas as pd 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def fetch_relic_chunk(base_url: str, endpoint: str, params: dict): 13 | """Fetches all data from Relic API in chunks of 100/request (API limit)""" 14 | start = 1 15 | chunk = params["chunk_size"] 16 | logger.info(f"Processing data in chunks of {chunk} from {endpoint}") 17 | while True: 18 | params["start"] = start 19 | response = fetch_api_json(base_url, endpoint, params) 20 | 21 | if not response: 22 | break 23 | api_end = response["rankTotal"] + chunk 24 | if start > api_end: 25 | break 26 | logger.info(f"Processing chunk {start}/{api_end}") 27 | 28 | yield response 29 | start += chunk 30 | time.sleep(0.2) 31 | 32 | 33 | def validate_json_schema(json_data, validation_schema): 34 | try: 35 | data = json_data 36 | validated_data = validation_schema.model_validate(data) 37 | return validated_data 38 | except ValidationError as e: 39 | logger.error(f"Validation Error: {e}") 40 | return [] 41 | 42 | 43 | def validate_parquet_schema(content, validation_schema): 44 | df = pd.read_parquet(content) 45 | records = df.to_dict(orient="records") 46 | for record in records: 47 | try: 48 | validation_schema.model_validate(record) 49 | except ValidationError as e: 50 | logger.error(f"Validation error: {e}") 51 | 52 | # Reset the pointer to start of file: 53 | content.seek(0) 54 | return content 55 | 56 | 57 | def generate_weekly_queries(start_date, end_date): 58 | # Move start date to the next Sunday if it's not already a Sunday 59 | while start_date.weekday() != 6: 60 | start_date += timedelta(days=1) 61 | 62 | # Move end date to the previous Saturday if it's not already a Saturday 63 | while end_date.weekday() != 5: 64 | end_date -= timedelta(days=1) 65 | 66 | queries = [] 67 | print(f"Finding all files between {start_date} and {end_date}.") 68 | while start_date <= end_date: 69 | # Calculate the end date of the current week (the next Saturday) 70 | end_date_saturday = start_date + timedelta(days=6) 71 | query = { 72 | "dated": start_date, 73 | "query_str": f"{start_date.strftime('%Y-%m-%d')}_{end_date_saturday.strftime('%Y-%m-%d')}", 74 | } 75 | queries.append(query) 76 | # Move to the next week 77 | start_date += timedelta(days=7) 78 | return queries 79 | 80 | 81 | def create_stats_endpoints(extract_file: str, weekly_querys: list): 82 | endpoints = [] 83 | for weekly_query in weekly_querys: 84 | result_dated = f"{weekly_query['dated']}_{extract_file}" 85 | result_query = f"{weekly_query['query_str']}/{extract_file}.parquet" 86 | endpoints.append({"file_date": result_dated, "endpoint_str": result_query}) 87 | logger.info(f"{len(endpoints)} found.") 88 | return endpoints 89 | -------------------------------------------------------------------------------- /src/extract/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, model_validator 2 | from typing import List, Optional 3 | from datetime import date, timedelta, datetime 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | # ----------------------------------------------------------------------------- 9 
| # Aoedumps schema 10 | # ----------------------------------------------------------------------------- 11 | 12 | 13 | class WeeklyDump(BaseModel): 14 | start_date: date 15 | end_date: Optional[date] = None 16 | num_matches: int 17 | num_players: Optional[int] = None 18 | matches_url: str 19 | players_url: str 20 | match_checksum: Optional[str] = None 21 | player_checksum: Optional[str] = None 22 | 23 | @model_validator(mode="after") 24 | def validate_and_get_message(self) -> "WeeklyDump": 25 | if self.num_matches == 0: 26 | msg = f"File {self.start_date} is present but is empty!" 27 | logger.warning(msg) 28 | return self 29 | 30 | 31 | class ApiSchema(BaseModel): 32 | db_dumps: List[WeeklyDump] 33 | total_matches: Optional[int] = None 34 | total_players: Optional[int] = None 35 | 36 | 37 | # ----------------------------------------------------------------------------- 38 | # RelicAPI schema 39 | # ----------------------------------------------------------------------------- 40 | 41 | 42 | class RelicResult(BaseModel): 43 | code: int 44 | message: str 45 | 46 | 47 | class Member(BaseModel): 48 | profile_id: Optional[int | float] = None 49 | name: Optional[str] = None 50 | alias: str 51 | personal_statgroup_id: Optional[int] = None 52 | xp: Optional[int] = None 53 | level: Optional[int] = None 54 | leaderboardregion_id: Optional[int] = None 55 | country: str 56 | 57 | 58 | class StatGroup(BaseModel): 59 | id: int 60 | name: Optional[str] = None 61 | type: Optional[int] = None 62 | members: List[Member] 63 | 64 | 65 | class LeaderboardStat(BaseModel): 66 | statgroup_id: int 67 | leaderboard_id: Optional[int] = None 68 | wins: Optional[int] = None 69 | losses: Optional[int] = None 70 | streak: Optional[int] = None 71 | disputes: Optional[int] = None 72 | drops: Optional[int] = None 73 | rank: int 74 | ranktotal: Optional[int] = None 75 | ranklevel: Optional[int] = None 76 | rating: Optional[int] = None 77 | regionrank: Optional[int] = None 78 | regionranktotal: Optional[int] = None 79 | lastmatchdate: int 80 | highestrank: Optional[int] = None 81 | highestranklevel: Optional[int] = None 82 | highestrating: Optional[int] = None 83 | 84 | 85 | class RelicResponse(BaseModel): 86 | statGroups: List[StatGroup] 87 | leaderboardStats: List[LeaderboardStat] 88 | 89 | 90 | # -------------------------------Aoestats .parquet schemas 91 | 92 | 93 | class Players(BaseModel): 94 | civ: str 95 | game_id: int 96 | match_rating_diff: float 97 | new_rating: int 98 | old_rating: int 99 | profile_id: Optional[int | float] = None 100 | replay_summary_raw: Optional[str | float] = None 101 | team: int 102 | winner: bool 103 | 104 | 105 | class Matches(BaseModel): 106 | avg_elo: float 107 | duration: timedelta 108 | game_id: int 109 | game_speed: str 110 | game_type: str 111 | irl_duration: timedelta 112 | leaderboard: str 113 | map: str 114 | mirror: bool 115 | num_players: int 116 | patch: int 117 | raw_match_type: Optional[int | float] = None 118 | replay_enhanced: bool 119 | started_timestamp: datetime 120 | starting_age: str 121 | team_0_elo: float 122 | team_1_elo: float 123 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI Workflow 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | lint_and_test: 10 | name: Lint & Format Codebase 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 
| - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.12' 20 | 21 | - name: Install dependencies 22 | run: | 23 | pip install --upgrade pip 24 | pip install -r requirements.txt 25 | 26 | - name: Run_linting_with_Black 27 | uses: psf/black@stable 28 | with: 29 | src: '.' 30 | options: '--check' 31 | env: 32 | DEFAULT_BRANCH: main 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | - name: Check for linting errors 36 | if: steps.Run_linting_with_Black.outcome == 'failure' 37 | run: echo "Linting errors found. Please fix and re-commit." 38 | continue-on-error: true 39 | 40 | - name: Run tests with pytest 41 | run: | 42 | pytest src/project_tests.py 43 | 44 | dev_dbt_deployment: 45 | name: Slim CI dbt-run in dev environment 46 | needs: lint_and_test 47 | runs-on: ubuntu-latest 48 | steps: 49 | - name: Check out repository 50 | uses: actions/checkout@v3 51 | 52 | - name: Set up Python 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: '3.12' 56 | 57 | - name: Install dependencies 58 | run: pip install dbt-core dbt-snowflake 59 | 60 | - name: Write profiles.yml 61 | env: 62 | DBT_PROFILES: ${{ secrets.DBT_PROFILES }} 63 | run: echo "$DBT_PROFILES" > src/transform/dbt_aoe/profiles.yml 64 | 65 | - name: Configure AWS Credentials 66 | uses: aws-actions/configure-aws-credentials@v2 67 | with: 68 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 69 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 70 | aws-region: ${{ secrets.AWS_REGION }} 71 | 72 | - name: Download manifest.json from S3 73 | run: | 74 | aws s3 cp s3://aoe2dumps/dev_artifacts/manifest.json src/transform/dbt_aoe/manifest.json 75 | 76 | - name: Run dbt debug 77 | run: | 78 | cd src/transform/dbt_aoe 79 | dbt debug 80 | 81 | - name: Run dbt deps 82 | run: | 83 | cd src/transform/dbt_aoe 84 | dbt deps 85 | 86 | - name: Run dbt compile 87 | run: | 88 | # This creates the current manifest for the dbt project, based on merged files 89 | cd src/transform/dbt_aoe 90 | dbt compile 91 | 92 | - name: Dbt list changed files 93 | run: | 94 | # This will list the changes between our current files, and what was last merged to prod 95 | cd src/transform/dbt_aoe 96 | dbt ls --models state:modified --state . 97 | 98 | - name: Refresh dbt external tables 99 | run: | 100 | cd src/transform/dbt_aoe 101 | dbt run-operation --target dev stage_external_sources --vars "ext_full_refresh: true" 102 | 103 | - name: Run dbt run 104 | run: | 105 | cd src/transform/dbt_aoe 106 | dbt run --target dev --models state:modified+ --state . 107 | 108 | - name: Run dbt tests 109 | run: | 110 | cd src/transform/dbt_aoe 111 | dbt test --target dev --models state:modified+ --state . 
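          # Slim CI note: "state:modified+" selects only the models that differ from the
          # manifest.json downloaded from S3 earlier in this job (plus their downstream
          # models), so unchanged models are skipped in the dev run and tests.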
112 | 113 | - name: Upload updated manifest.json to S3 114 | run: | 115 | aws s3 cp src/transform/dbt_aoe/target/manifest.json s3://aoe2dumps/dev_artifacts/manifest.json 116 | -------------------------------------------------------------------------------- /.github/workflows/cd.yaml: -------------------------------------------------------------------------------- 1 | name: CD Workflow 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | 9 | jobs: 10 | copy_s3_data: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v3 16 | 17 | - name: Configure AWS Credentials 18 | uses: aws-actions/configure-aws-credentials@v2 19 | with: 20 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 21 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 22 | aws-region: ${{ secrets.AWS_REGION }} 23 | 24 | - name: Install AWS CLI (if not already installed) 25 | run: | 26 | if ! command -v aws &> /dev/null; then 27 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 28 | unzip awscliv2.zip 29 | sudo ./aws/install 30 | fi 31 | 32 | - name: Sync S3 Data 33 | run: | 34 | aws s3 sync s3://aoe2dumps/dev/ s3://aoe2dumps/prod/ 35 | 36 | - name: Verify Sync Operation 37 | run: | 38 | # Example verification: List objects in the destination prefix 39 | aws s3 ls s3://aoe2dumps/prod/ --recursive 40 | 41 | 42 | prod_dbt_deployment: 43 | name: Slim CI dbt-run in prod environment 44 | needs: copy_s3_data 45 | runs-on: ubuntu-latest 46 | steps: 47 | - name: Check out repository 48 | uses: actions/checkout@v3 49 | 50 | - name: Set up Python 51 | uses: actions/setup-python@v4 52 | with: 53 | python-version: '3.12' 54 | 55 | - name: Install dependencies 56 | run: pip install dbt-core dbt-snowflake 57 | 58 | - name: Write profiles.yml 59 | env: 60 | DBT_PROFILES: ${{ secrets.DBT_PROFILES }} 61 | run: echo "$DBT_PROFILES" > src/transform/dbt_aoe/profiles.yml 62 | 63 | - name: Configure AWS Credentials 64 | uses: aws-actions/configure-aws-credentials@v2 65 | with: 66 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 67 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 68 | aws-region: ${{ secrets.AWS_REGION }} 69 | 70 | - name: Download manifest.json from S3 71 | run: | 72 | aws s3 cp s3://aoe2dumps/prod_artifacts/manifest.json src/transform/dbt_aoe/manifest.json 73 | 74 | - name: Run dbt debug 75 | run: | 76 | cd src/transform/dbt_aoe 77 | dbt debug 78 | 79 | - name: Run dbt deps 80 | run: | 81 | cd src/transform/dbt_aoe 82 | dbt deps 83 | 84 | - name: Run dbt compile 85 | run: | 86 | # This creates the current manifest for the dbt project, based on merged files 87 | cd src/transform/dbt_aoe 88 | dbt compile 89 | 90 | - name: Dbt list changed files 91 | run: | 92 | # This will list the changes between our current files, and what was last merged to prod 93 | cd src/transform/dbt_aoe 94 | dbt ls --models state:modified --state . 95 | 96 | - name: Refresh dbt external tables 97 | run: | 98 | cd src/transform/dbt_aoe 99 | dbt run-operation --target prod stage_external_sources --vars "ext_full_refresh: true" 100 | 101 | - name: Run dbt run 102 | run: | 103 | cd src/transform/dbt_aoe 104 | dbt run --target prod --models state:modified+ --state . 105 | 106 | - name: Run dbt tests 107 | run: | 108 | cd src/transform/dbt_aoe 109 | dbt test --target prod --models state:modified+ --state . 
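          # The target/manifest.json produced by this build is uploaded to S3 in the next
          # step, so the next Slim CI comparison is made against the state just deployed.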
110 | 111 | - name: Upload updated manifest.json to S3 112 | run: | 113 | aws s3 cp src/transform/dbt_aoe/target/manifest.json s3://aoe2dumps/prod_artifacts/manifest.json 114 | 115 | - name: Run post-merge actions 116 | run: | 117 | echo "Pull request has been approved and merged to main" 118 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/seeds/country_list.csv: -------------------------------------------------------------------------------- 1 | Country,Country_code 2 | Afghanistan,af 3 | Ã…land Islands,ax 4 | Albania,al 5 | Algeria,dz 6 | American Samoa,as 7 | Andorra,ad 8 | Angola,ao 9 | Anguilla,ai 10 | Antarctica,aq 11 | Antigua and Barbuda,ag 12 | Argentina,ar 13 | Armenia,am 14 | Aruba,aw 15 | Australia,au 16 | Austria,at 17 | Azerbaijan,az 18 | Bahamas,bs 19 | Bahrain,bh 20 | Bangladesh,bd 21 | Barbados,bb 22 | Belarus,by 23 | Belgium,be 24 | Belize,bz 25 | Benin,bj 26 | Bermuda,bm 27 | Bhutan,bt 28 | Bolivia,bo 29 | Bonaire,bq 30 | Bosnia and Herzegovina,ba 31 | Botswana,bw 32 | Bouvet Island,bv 33 | Brazil,br 34 | British Indian Ocean Territory,io 35 | British Virgin Islands,vg 36 | Brunei,bn 37 | Bulgaria,bg 38 | Burkina Faso,bf 39 | Burundi,bi 40 | Cabo Verde,cv 41 | Cambodia,kh 42 | Cameroon,cm 43 | Canada,ca 44 | Cayman Islands,ky 45 | Central African Republic,cf 46 | Chad,td 47 | Chile,cl 48 | China,cn 49 | Christmas Island,cx 50 | Cocos (Keeling) Islands,cc 51 | Colombia,co 52 | Comoros,km 53 | Congo,cg 54 | Congo (DRC),cd 55 | Cook Islands,ck 56 | Costa Rica,cr 57 | Côte d’Ivoire,ci 58 | Croatia,hr 59 | Cuba,cu 60 | Curaçao,cw 61 | Cyprus,cy 62 | Czechia,cz 63 | Denmark,dk 64 | Djibouti,dj 65 | Dominica,dm 66 | Dominican Republic,do 67 | Ecuador,ec 68 | Egypt,eg 69 | El Salvador,sv 70 | Equatorial Guinea,gq 71 | Eritrea,er 72 | Estonia,ee 73 | Eswatini,sz 74 | Ethiopia,et 75 | Falkland Islands,fk 76 | Faroe Islands,fo 77 | Fiji,fj 78 | Finland,fi 79 | France,fr 80 | French Guiana,gf 81 | French Polynesia,pf 82 | French Southern Territories,tf 83 | Gabon,ga 84 | Gambia,gm 85 | Georgia,ge 86 | Germany,de 87 | Ghana,gh 88 | Gibraltar,gi 89 | Greece,gr 90 | Greenland,gl 91 | Grenada,gd 92 | Guadeloupe,gp 93 | Guam,gu 94 | Guatemala,gt 95 | Guernsey,gg 96 | Guinea,gn 97 | Guinea-Bissau,gw 98 | Guyana,gy 99 | Haiti,ht 100 | Heard Island and McDonald Islands,hm 101 | Honduras,hn 102 | Hong Kong SAR,hk 103 | Hungary,hu 104 | Iceland,is 105 | India,in 106 | Indonesia,id 107 | Iran,ir 108 | Iraq,iq 109 | Ireland,ie 110 | Isle of Man,im 111 | Israel,il 112 | Italy,it 113 | Jamaica,jm 114 | Jan Mayen,xj 115 | Japan,jp 116 | Jersey,je 117 | Jordan,jo 118 | Kazakhstan,kz 119 | Kenya,ke 120 | Kiribati,ki 121 | Korea,kr 122 | Kosovo,xk 123 | Kuwait,kw 124 | Kyrgyzstan,kg 125 | Laos,la 126 | Latvia,lv 127 | Lebanon,lb 128 | Lesotho,ls 129 | Liberia,lr 130 | Libya,ly 131 | Liechtenstein,li 132 | Lithuania,lt 133 | Luxembourg,lu 134 | Macao SAR,mo 135 | Madagascar,mg 136 | Malawi,mw 137 | Malaysia,my 138 | Maldives,mv 139 | Mali,ml 140 | Malta,mt 141 | Marshall Islands,mh 142 | Martinique,mq 143 | Mauritania,mr 144 | Mauritius,mu 145 | Mayotte,yt 146 | Mexico,mx 147 | Micronesia,fm 148 | Moldova,md 149 | Monaco,mc 150 | Mongolia,mn 151 | Montenegro,me 152 | Montserrat,ms 153 | Morocco,ma 154 | Mozambique,mz 155 | Myanmar,mm 156 | Namibia,na 157 | Nauru,nr 158 | Nepal,np 159 | Netherlands,nl 160 | New Caledonia,nc 161 | New Zealand,nz 162 | Nicaragua,ni 163 | Niger,ne 164 | Nigeria,ng 165 | Niue,nu 166 | Norfolk Island,nf 167 | North 
Korea,kp 168 | North Macedonia,mk 169 | Northern Mariana Islands,mp 170 | Norway,no 171 | Oman,om 172 | Pakistan,pk 173 | Palau,pw 174 | Palestinian Authority,ps 175 | Panama,pa 176 | Papua New Guinea,pg 177 | Paraguay,py 178 | Peru,pe 179 | Philippines,ph 180 | Pitcairn Islands,pn 181 | Poland,pl 182 | Portugal,pt 183 | Puerto Rico,pr 184 | Qatar,qa 185 | Réunion,re 186 | Romania,ro 187 | Russia,ru 188 | Rwanda,rw 189 | Saba,xs 190 | Saint Barthélemy,bl 191 | Saint Kitts and Nevis,kn 192 | Saint Lucia,lc 193 | Saint Martin,mf 194 | Saint Pierre and Miquelon,pm 195 | Saint Vincent and the Grenadines,vc 196 | Samoa,ws 197 | San Marino,sm 198 | São Tomé and Príncipe,st 199 | Saudi Arabia,sa 200 | Senegal,sn 201 | Serbia,rs 202 | Seychelles,sc 203 | Sierra Leone,sl 204 | Singapore,sg 205 | Sint Eustatius,xe 206 | Sint Maarten,sx 207 | Slovakia,sk 208 | Slovenia,si 209 | Solomon Islands,sb 210 | Somalia,so 211 | South Africa,za 212 | South Georgia and South Sandwich Islands,gs 213 | South Sudan,ss 214 | Spain,es 215 | Sri Lanka,lk 216 | "St Helena, Ascension, Tristan da Cunha",sh 217 | Sudan,sd 218 | Suriname,sr 219 | Svalbard,sj 220 | Sweden,se 221 | Switzerland,ch 222 | Syria,sy 223 | Taiwan,tw 224 | Tajikistan,tj 225 | Tanzania,tz 226 | Thailand,th 227 | Timor-Leste,tl 228 | Togo,tg 229 | Tokelau,tk 230 | Tonga,to 231 | Trinidad and Tobago,tt 232 | Tunisia,tn 233 | Turkey,tr 234 | Turkmenistan,tm 235 | Turks and Caicos Islands,tc 236 | Tuvalu,tv 237 | U.S. Outlying Islands,um 238 | U.S. Virgin Islands,vi 239 | Uganda,ug 240 | Ukraine,ua 241 | United Arab Emirates,ae 242 | United Kingdom,gb 243 | United States,us 244 | Uruguay,uy 245 | Uzbekistan,uz 246 | Vanuatu,vu 247 | Vatican City,va 248 | Venezuela,ve 249 | Vietnam,vn 250 | Wallis and Futuna,wf 251 | Yemen,ye 252 | Zambia,zm 253 | Zimbabwe,zw 254 | -------------------------------------------------------------------------------- /src/transform/dbt_aoe/models/Medallion_README.md: -------------------------------------------------------------------------------- 1 | # Medallion Logical Layer Design 2 | 3 | For this project, I have structured the tables to use the Medallion framework, sorting the data pipeline into the logical layers: Bronze -> Silver -> Gold. This approach keeps the data logically separated into distinct schemas as it moves from 'dirty' raw data into 'clean' conformed dimensional models. Note: I have overridden the dbt macro `generate_schema_name.sql` so that each model's medallion layer is used as its schema name within the Snowflake data warehouse. 4 | 5 | --- 6 | 7 | ## Bronze Layer 8 | 9 | ### Overview 10 | 11 | The Bronze layer represents the cleaned landing data reconfigured into a database-usable form. It is expected to hold temporary data that is prepared to be loaded into Silver Tables. 12 | 13 | #### Components 14 | 15 | - **External Tables** 16 | 17 | External tables are defined in `ext_table_schema.yml` and represent the data within the S3 bucket path. Loading and refreshing of the external tables are handled by the dbt package `dbt_external_tables`. No transformation takes place here; fields are shown 'as is' from the underlying data. We add the `_ext` suffix to easily identify these within Snowflake. 18 | 19 | - **Raw Views** 20 | 21 | The Raw Views are Snowflake views built on top of the external tables. They contain all fields from the external table, plus metadata fields such as source name and load timestamp.

Data is filtered on its `file_date` according to the `control_master_table`, such that we only bring in the relevant rows of data needed for consumption. This filtering is applied by the user-defined macro `filter_load.sql`, which can be found in the macros folder. We add the `v_` prefix to denote it is a view, and the `_raw` suffix to identify this is a raw reflection of the source data. 22 | 23 | - **Bronze Tables** 24 | 25 | The Bronze Tables read the data from the Raw Views and apply light transformations. This includes flattening JSON, applying correct data types, and renaming fields. Data is also de-duplicated based on the defined primary key and timestamp, using the user-defined macro `deduplicate_by_key.sql`. A Python model `dim_date_br.py` is also used to create the `dim_date` table, as it is easy and efficient to do so with the pandas library. We add the `_br` suffix to denote the bronze tables. 26 | 27 | ### Data Retention 28 | 29 | The Bronze layer's purpose is to reflect a cleaned version of the staging area and hence is expected to contain temporarily held tables/views. 30 | 31 | - **Object Type**: Transient Tables or Views. 32 | - **Loading Type**: Truncate & Load or Create & Replace. 33 | - **Keeps Historical Data**: No. Data is not expected to be retained in the Bronze layer. Reloading data will overwrite the previously held data in the table. History will be retained in the Silver/Gold layers. 34 | 35 | --- 36 | 37 | ## Silver Layer 38 | 39 | ### Overview 40 | 41 | The Silver layer represents the area in which we run further transformations and apply business logic to the data. Transformations can involve creating new surrogate keys, joining new datasets together, applying conditional logic, etc. 42 | 43 | #### Silver Tables 44 | 45 | In this project, the data is already in a mostly usable form, so we have a light Silver layer with minor transformations. We create a surrogate key on the `player_match_sr` table using an MD5 hash, to uniquely identify rows with a single field. This makes it easier for Snowflake to perform UPSERT commands, rather than relying on a composite key. We also ingest a simple 'seed' file (`country_list.csv`) into the table `player_leaderboard_stats_sr`, to provide the mapping between country codes and names. We add the `_sr` suffix to denote the silver tables. 46 | 47 | ### Data Retention 48 | 49 | The Silver layer's purpose is to hold a record and history of previously loaded bronze data. Hence, data is expected to be held in permanent tables. 50 | 51 | - **Object Type**: Permanent and/or incrementally loaded Tables. 52 | - **Loading Type**: Insert & Update (mainly). 53 | - **Keeps Historical Data**: Yes. Key Silver tables should not be dropped or truncated unless a full reload is required or specified otherwise. 54 | 55 | --- 56 | 57 | ## Gold Layer 58 | 59 | ### Overview 60 | 61 | The Gold layer represents the area in which data is modeled into a dimensional model in preparation for reporting. As such, this entails: 62 | 63 | - **Fact Tables**: The primary table that stores the measures and foreign keys to the dimension tables. 64 | - **Dimension Tables**: Tables that contain the attributes of a certain area, with a primary key that joins to the Fact table. 65 | 66 | ### Data Retention 67 | 68 | Gold layer tables should retain data and keep a historical view. As such, inserts should be done as 'delta' loads, only adding in new data. 69 | 70 | - **Object Type**: Permanent and/or incrementally loaded Tables.
71 | - **Loading Type**: Insert & Update (mainly). 72 | - **Keeps Historical Data**: Yes. Gold tables should not be dropped or truncated unless a full reload is required or specified otherwise. -------------------------------------------------------------------------------- /.astro/test_dag_integrity_default.py: -------------------------------------------------------------------------------- 1 | """Test the validity of all DAGs. **USED BY DEV PARSE COMMAND DO NOT EDIT**""" 2 | 3 | from contextlib import contextmanager 4 | import logging 5 | import os 6 | 7 | import pytest 8 | 9 | from airflow.models import DagBag, Variable, Connection 10 | from airflow.hooks.base import BaseHook 11 | from airflow.utils.db import initdb 12 | 13 | # init airflow database 14 | initdb() 15 | 16 | # The following code patches errors caused by missing OS Variables, Airflow Connections, and Airflow Variables 17 | 18 | 19 | # =========== MONKEYPATCH BaseHook.get_connection() =========== 20 | def basehook_get_connection_monkeypatch(key: str, *args, **kwargs): 21 | print( 22 | f"Attempted to fetch connection during parse returning an empty Connection object for {key}" 23 | ) 24 | return Connection(key) 25 | 26 | 27 | BaseHook.get_connection = basehook_get_connection_monkeypatch 28 | # # =========== /MONKEYPATCH BASEHOOK.GET_CONNECTION() =========== 29 | 30 | 31 | # =========== MONKEYPATCH OS.GETENV() =========== 32 | def os_getenv_monkeypatch(key: str, *args, **kwargs): 33 | default = None 34 | if args: 35 | default = args[0] # os.getenv should get at most 1 arg after the key 36 | if kwargs: 37 | default = kwargs.get( 38 | "default", None 39 | ) # and sometimes kwarg if people are using the sig 40 | 41 | env_value = os.environ.get(key, None) 42 | 43 | if env_value: 44 | return env_value # if the env_value is set, return it 45 | if ( 46 | key == "JENKINS_HOME" and default is None 47 | ): # fix https://github.com/astronomer/astro-cli/issues/601 48 | return None 49 | if default: 50 | return default # otherwise return whatever default has been passed 51 | return f"MOCKED_{key.upper()}_VALUE" # if absolutely nothing has been passed - return the mocked value 52 | 53 | 54 | os.getenv = os_getenv_monkeypatch 55 | # # =========== /MONKEYPATCH OS.GETENV() =========== 56 | 57 | # =========== MONKEYPATCH VARIABLE.GET() =========== 58 | 59 | 60 | class magic_dict(dict): 61 | def __init__(self, *args, **kwargs): 62 | self.update(*args, **kwargs) 63 | 64 | def __getitem__(self, key): 65 | return {}.get(key, "MOCKED_KEY_VALUE") 66 | 67 | 68 | _no_default = object() # allow falsey defaults 69 | 70 | 71 | def variable_get_monkeypatch(key: str, default_var=_no_default, deserialize_json=False): 72 | print( 73 | f"Attempted to get Variable value during parse, returning a mocked value for {key}" 74 | ) 75 | 76 | if default_var is not _no_default: 77 | return default_var 78 | if deserialize_json: 79 | return magic_dict() 80 | return "NON_DEFAULT_MOCKED_VARIABLE_VALUE" 81 | 82 | 83 | Variable.get = variable_get_monkeypatch 84 | # # =========== /MONKEYPATCH VARIABLE.GET() =========== 85 | 86 | 87 | @contextmanager 88 | def suppress_logging(namespace): 89 | """ 90 | Suppress logging within a specific namespace to keep tests "clean" during build 91 | """ 92 | logger = logging.getLogger(namespace) 93 | old_value = logger.disabled 94 | logger.disabled = True 95 | try: 96 | yield 97 | finally: 98 | logger.disabled = old_value 99 | 100 | 101 | def get_import_errors(): 102 | """ 103 | Generate a tuple for import errors in the dag bag, and include 
DAGs without errors. 104 | """ 105 | with suppress_logging("airflow"): 106 | dag_bag = DagBag(include_examples=False) 107 | 108 | def strip_path_prefix(path): 109 | return os.path.relpath(path, os.environ.get("AIRFLOW_HOME")) 110 | 111 | # Initialize an empty list to store the tuples 112 | result = [] 113 | 114 | # Iterate over the items in import_errors 115 | for k, v in dag_bag.import_errors.items(): 116 | result.append((strip_path_prefix(k), v.strip())) 117 | 118 | # Check if there are DAGs without errors 119 | for file_path in dag_bag.dags: 120 | # Check if the file_path is not in import_errors, meaning no errors 121 | if file_path not in dag_bag.import_errors: 122 | result.append((strip_path_prefix(file_path), "No import errors")) 123 | 124 | return result 125 | 126 | 127 | @pytest.mark.parametrize( 128 | "rel_path, rv", get_import_errors(), ids=[x[0] for x in get_import_errors()] 129 | ) 130 | def test_file_imports(rel_path, rv): 131 | """Test for import errors on a file""" 132 | if os.path.exists(".astro/dag_integrity_exceptions.txt"): 133 | with open(".astro/dag_integrity_exceptions.txt", "r") as f: 134 | exceptions = f.readlines() 135 | print(f"Exceptions: {exceptions}") 136 | if (rv != "No import errors") and rel_path not in exceptions: 137 | # If rv is not "No import errors," consider it a failed test 138 | raise Exception(f"{rel_path} failed to import with message \n {rv}") 139 | else: 140 | # If rv is "No import errors," consider it a passed test 141 | print(f"{rel_path} passed the import test") 142 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from datetime import datetime, timedelta 3 | import requests 4 | import os 5 | import boto3 6 | from typing import Dict, Optional, BinaryIO 7 | import io 8 | import time 9 | import logging 10 | import backoff 11 | import snowflake.connector 12 | from dotenv import load_dotenv, find_dotenv 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | # Load environment variables from .env file 18 | load_dotenv(find_dotenv()) 19 | 20 | # ----------------------------------------------------------------------------- 21 | # Classes 22 | # ----------------------------------------------------------------------------- 23 | 24 | 25 | class Config: 26 | def __init__(self, yaml_file: str): 27 | with open(yaml_file, "r") as f: 28 | self.__dict__.update(yaml.safe_load(f)) 29 | self.run_date = self.parse_date( 30 | self.backdate_days_start, self.target_run_date, self.date_format 31 | ) 32 | self.run_end_date = self.parse_date( 33 | self.backdate_days_end, self.target_run_end_date, self.date_format 34 | ) 35 | 36 | @staticmethod 37 | def parse_date(backdate_days: int, specific_date: str, date_format: str): 38 | """Creates a date object on initialisation. If target_run_date is specified, 39 | it takes priority, otherwise uses a number of backdated days from current date. 
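
        Illustrative examples (the date_format value here is an assumption, e.g. "%Y-%m-%d"):
            parse_date(7, None, "%Y-%m-%d")          -> the date 7 days before today
            parse_date(0, "2024-01-01", "%Y-%m-%d")  -> date(2024, 1, 1)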
40 | """ 41 | if specific_date: 42 | result = datetime.strptime(specific_date, date_format).date() 43 | else: 44 | result = (datetime.now() - timedelta(days=backdate_days)).date() 45 | return result 46 | 47 | 48 | # ----------------------------------------------------------------------------- 49 | # Functions 50 | # ----------------------------------------------------------------------------- 51 | 52 | 53 | @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=3) 54 | def fetch_api_file( 55 | base_url: str, endpoint: str, params: Optional[Dict] = None 56 | ) -> BinaryIO | None: 57 | """Fetches a file from an API endpoint and returns it as a BytesIO object.""" 58 | try: 59 | url = base_url + endpoint 60 | response = requests.get(url, params=params) 61 | response.raise_for_status() 62 | if not response.content: 63 | logger.warning("Received empty response from API.") 64 | return None 65 | content = io.BytesIO(response.content) 66 | return content 67 | except requests.RequestException as e: 68 | logger.error(f"Error fetching data: {e}") 69 | return None 70 | 71 | 72 | @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=3) 73 | def fetch_api_json(base_url: str, endpoint: str, params: dict) -> dict | None: 74 | """Fetches JSON data from an API endpoint with retry logic.""" 75 | try: 76 | url = base_url + endpoint 77 | response = requests.get(url, params=params) 78 | response.raise_for_status() 79 | return response.json() 80 | except requests.RequestException as e: 81 | logger.warning(f"API request failed: {e}. Retrying...") 82 | raise # Re-raise to trigger backoff 83 | 84 | 85 | def create_s3_session(s3=None): 86 | if s3 == None: 87 | logger.info("Authenticating to S3.") 88 | s3 = boto3.client( 89 | "s3", 90 | aws_access_key_id=os.getenv("AWS_ACCESS_KEY"), 91 | aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), 92 | region_name=os.getenv("AWS_REGION"), 93 | ) 94 | return s3 95 | 96 | 97 | def upload_to_s3(s3, data, bucket_name, path_key): 98 | try: 99 | s3.upload_fileobj(data, bucket_name, path_key) 100 | logger.info( 101 | f"File '{path_key}' uploaded to S3 bucket '{bucket_name}' successfully!" 102 | ) 103 | except Exception as e: 104 | logger.error(f"Error uploading file: {e}") 105 | 106 | 107 | def timer(func): 108 | """A simple timer decorator to record how long a function took to run.""" 109 | 110 | def wrapper(*args, **kwargs): 111 | start_time = time.time() 112 | result = func(*args, **kwargs) 113 | end_time = time.time() 114 | elapsed_time = end_time - start_time 115 | logger.info( 116 | f"Function '{func.__name__}' took {elapsed_time:.1f} seconds to run." 117 | ) 118 | return result 119 | 120 | return wrapper 121 | 122 | 123 | def sf_connect( 124 | db: str | None = os.getenv("SF_DATABASE"), 125 | schema: str | None = os.getenv("SF_SCHEMA"), 126 | ) -> snowflake.connector.connection.SnowflakeConnection: 127 | """ 128 | Uses environment variables (store in .env) to connect to Snowflake. 129 | Optional: to specify database or schema. 130 | Returns: A snowflake connection object. 
131 | """ 132 | 133 | logger.info("Connecting to Snowflake...") 134 | connection_parameters = { 135 | "account": os.getenv("SF_ACCOUNT_NAME"), 136 | "user": os.getenv("SF_USERNAME"), 137 | "password": os.getenv("SF_PASSWORD"), 138 | "warehouse": os.getenv("SF_WAREHOUSE"), 139 | "role": os.getenv("SF_ROLE"), 140 | "database": db, 141 | "schema": schema, 142 | } 143 | connection = snowflake.connector.connect(**connection_parameters) 144 | logger.info("Connection successfully created") 145 | return connection 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Age of Empires 2 Project 2 | 3 | ## 1. Overview 4 | 5 | The Age of Empires 2 (aoe2) project dashboard provides a comprehensive summary view of player and match statistics from the video game 'Age of Empires 2 DE'. Data is automatically pulled and refreshed weekly in a PowerBI report, enabling in-depth analysis and data slicing/dicing. The dashboard aims to answer questions such as: 6 | 7 | * Who are the top performing players currently? 8 | * What civilization should I counter-pick to maximize my chances of beating my opponent? 9 | * How has my favorite civilization performed over time? 10 | 11 | ## 2. Features 12 | 13 | * Automated weekly data extraction, loading, and transformation. 14 | * Pipeline incorporates numerous tests to ensure high data quality. 15 | * User-friendly dashboards for data and insights visualization. 16 | 17 | ## 3. Project Structure 18 | 19 | ### High level view 20 | 21 | Data Pipeline 22 | 23 | ### Pipeline DAG - project all 24 | 25 | Airflow project DAG 26 | 27 | ### Pipeline DAG - dbt models 28 | 29 | Airflow dbt DAG 30 | 31 | ### Dimensional Model 32 | 33 | Dimensional Model 34 | 35 | ## 4. Dashboard Example 36 | 37 | ### Leaderboard 38 | 39 | Player Leaderboard 40 | 41 | ### Counter-Civ picker 42 | 43 | Civ Counter Picker 44 | 45 | ### Civ Performance over time 46 | 47 | Civ Performance 48 | 49 | ## 5. Tools 50 | 51 | * **a) Python (Data Extract & Load)** 52 | * Custom-built modules (API data extraction) 53 | * Pydantic (schema validation) 54 | * Pytest (unit testing) 55 | * Logging & API retries (error handling) 56 | * **b) Apache Airflow (Data Orchestration)** 57 | * Astronomer Cosmos library (via Docker container) 58 | * **c) Snowflake (Data Warehouse)** 59 | * **d) dbt core (Data Transformation)** 60 | * User-defined Macros 61 | * Seeds 62 | * User-defined Generic and Custom tests 63 | * `dbt_external_tables` package 64 | * SQL & Python models 65 | * Incremental modeling (performance optimization) 66 | * **e) Git/Github Actions (Version Control)** 67 | * CI/CD pipeline (linting, testing, replication) 68 | * Slim CI (optimization) 69 | * Dev & Prod environments (software development) 70 | * **f) PowerBI (Data Visualization)** 71 | * **g) Other** 72 | * AWS S3 buckets (data storage) 73 | * Medallion architecture (logical data modeling) 74 | * Star Schema (dimensional data modeling) 75 | * `.env` & `config.yaml` files (Configuration as Code) 76 | * `README.md` files & Dbt docs (documentation) 77 | * `requirements.txt` (package management) 78 | 79 | ## 6. Project Methodology & Technical Details 80 | 81 | ### Data Extraction and Load 82 | 83 | The data pipeline uses the ELT framework, extracting and loading data "as-is" from APIs into an AWS S3 bucket. Data is sourced from two APIs: 84 | 85 | 1. 
**Aoestats.io API (`https://aoestats.io`)** 86 | This API provides historical player and match data. Two endpoints are used: one for dataset metadata (JSON) and another for direct download of match and player data (Parquet). Custom Python functions generate API endpoint strings, query the API, validate schemas using Pydantic, and load data into S3. 87 | 88 | 2. **Relic-link API (now WorldsEdge)** 89 | This unofficial community API provides the latest leaderboard data (JSON). Due to a 100-row request limit, data is retrieved in chunks. Each chunk is validated and loaded as a separate JSON file into S3 to avoid exceeding Snowflake's Variant column limit on ingestion to Snowflake External Tables. 90 | 91 | Each API endpoint has dedicated Python scripts following a consistent template: 92 | 93 | * a. Import functions from helper modules (`utils`, `filter`, `loader`). 94 | * b. Ingest parameters from the configuration file. 95 | * c. Establish an AWS S3 connection. 96 | * d. Submit GET requests to retrieve data. 97 | * e. Validate data against the expected schema (Pydantic). 98 | * f. Load data into the S3 bucket. 99 | 100 | Unit tests using `pytest` ensure function correctness. Airflow DAGs orchestrate script execution. An `all_project_dag.py` script runs all individual DAGs, including a `dbt_dag.py` for transformation steps. 101 | 102 | ### Data Transformation 103 | 104 | Data transformation occurs in dbt on Snowflake, using the Medallion architecture (bronze -> silver -> gold). The gold layer uses a star schema optimized for visualization tools like PowerBI. Each schema has a corresponding `_schema.yml` file for documentation. Further details are available in `Medallion_README.md` under the dbt `models` folder. 105 | 106 | ### Workflow Environment 107 | 108 | Development and production environments are separated using distinct S3 buckets (`dev`, `prod`) and Snowflake databases (`aoe_dev`, `aoe_prod`). Dbt profiles switch between targets. Production data is synced from development. 109 | 110 | ### Github Workflows 111 | 112 | CI workflows (`ci.yaml`) on pull requests run linting (Black), `pytest`, and Slim CI for dbt. CD workflows (`cd.yaml`) on merge to main sync S3 data to `prod` and run Slim CI on production tables (`aoe_prod` db in Snowflake). 113 | 114 | ### Reducing Compute Costs 115 | 116 | Date-driven directory structures in S3 enable delta loading in Snowflake, processing only new data. The dbt models on `dbt run` only process the date/date-range specified from the `load_master` control table. Dbt's incremental models and Slim CI further minimize compute costs. 117 | 118 | ## 7. Future Direction 119 | 120 | * Enhanced dbt testing (unit tests, improved thresholds). 121 | * Data quality dashboards for dbt/Airflow runs. 122 | * Improved Airflow failure notifications. 123 | * Incorporating additional AOE data (civilization strengths/weaknesses, logos). 124 | * Infrastructure as Code (IaC) for Snowflake/AWS. 125 | * Migrating processing to AWS (managed Airflow, EC2). 126 | * Utilizing RelicLink API for live data. 127 | 128 | ## 8. 
Miscellaneous 129 | 130 | ### Project Structure 131 | 132 | ```bash 133 | ├── Dockerfile 134 | ├── .env 135 | ├── README.md 136 | ├── requirements.txt 137 | ├── .github 138 | │ └── workflows 139 | │ ├── ci.yaml 140 | │ └── cd.yaml 141 | ├── dags 142 | │ ├── all_project_dag.py 143 | │ ├── dbt_dag.py 144 | │ ├── elt_metadata_dag.py 145 | │ ├── elt_relic_api_dag.py 146 | │ ├── elt_stat_matches_dag.py 147 | │ ├── elt_stat_players_dag.py 148 | │ └── set_load_master_dag.py 149 | ├── src 150 | │ ├── __init__.py 151 | │ ├── config.yaml 152 | │ ├── elt_metadata.py 153 | │ ├── elt_relic_api.py 154 | │ ├── elt_stat_matches.py 155 | │ ├── elt_stat_players.py 156 | │ ├── project_tests.py 157 | │ ├── set_load_master.py 158 | │ ├── utils.py 159 | │ ├── extract 160 | │ │ ├── __init__.py 161 | │ │ ├── filter.py 162 | │ │ └── models.py 163 | │ ├── load 164 | │ │ ├── __init__.py 165 | │ │ └── loader.py 166 | │ └── transform 167 | │ └── dbt_aoe 168 | │ ├── dbt_project.yml 169 | │ ├── package-lock.yml 170 | │ ├── packages.yml 171 | │ ├── profiles.yml.template 172 | │ ├── analyses 173 | │ ├── dbt_packages 174 | │ │ └── dbt_external_tables 175 | │ ├── macros 176 | │ │ ├── deduplicate_by_key.sql 177 | │ │ ├── filter_load.sql 178 | │ │ └── generate_schema_name.sql 179 | │ ├── models 180 | │ │ ├── Medallion_README.md 181 | │ │ ├── bronze 182 | │ │ │ ├── bronze_schema.yml 183 | │ │ │ ├── dim_date_br.py 184 | │ │ │ ├── ext_table_schema.yml 185 | │ │ │ ├── leaderboards_br.sql 186 | │ │ │ ├── matches_br.sql 187 | │ │ │ ├── players_br.sql 188 | │ │ │ ├── statgroup_br.sql 189 | │ │ │ ├── v_matches_raw.sql 190 | │ │ │ ├── v_players_raw.sql 191 | │ │ │ └── v_relic_raw.sql 192 | │ │ ├── gold 193 | │ │ │ ├── dim_civ.sql 194 | │ │ │ ├── dim_date.sql 195 | │ │ │ ├── dim_match.sql 196 | │ │ │ ├── dim_player.sql 197 | │ │ │ ├── fact_player_matches.sql 198 | │ │ │ └── gold_schema.yml 199 | │ │ └── silver 200 | │ │ ├── matches_sr.sql 201 | │ │ ├── player_leaderboard_stats_sr.sql 202 | │ │ ├── player_match_sr.sql 203 | │ │ └── silver_schema.yml 204 | │ ├── seeds 205 | │ │ ├── country_list.csv 206 | │ │ └── seeds.yml 207 | │ ├── snapshots 208 | │ └── tests 209 | │ ├── assert_countrys_mapped.sql 210 | │ └── generic 211 | │ ├── test_recent_ldts.sql 212 | │ └── test_within_threshold.sql 213 | ``` --------------------------------------------------------------------------------