├── src ├── strava_data │ ├── db │ │ ├── __init__.py │ │ ├── models.py │ │ └── dao.py │ ├── strava_api │ │ ├── visualisation │ │ │ ├── __init__.py │ │ │ ├── graphs_effort.py │ │ │ ├── graphs_pace.py │ │ │ ├── graphs_distribution.py │ │ │ ├── graphs_distance.py │ │ │ └── utils.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ └── transform.py │ │ ├── __init__.py │ │ └── client.py │ ├── ml │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── run_type_classifier.py │ │ ├── pace_forecast.py │ │ └── training_advisor.py │ ├── __init__.py │ ├── config.py │ └── auth.py ├── utils │ └── logger.py ├── get_tokens.py ├── generate_readme.py └── main.py ├── strava.sqlite ├── .bandit ├── Pace_by_Day.png ├── Activity_Heatmap.png ├── Cadence_Over_Time.png ├── Pace_Distribution.png ├── Rest_Days_Heatmap.png ├── Run_Days_Heatmap.png ├── Run_Type_Clusters.png ├── Cumulative_Distance.png ├── Time_Taken_Distance.png ├── VO2_Proxy_Over_Time.png ├── Forecast_Weekly_Pace.png ├── Longest_Run_per_Month.png ├── Run_Rest_Ratio_Heatmap.png ├── Running_Pace_over_Time.png ├── Median_1k_Pace_over_Time.png ├── Monthly_Distance_by_Year.png ├── Pace_Consistency_by_Run.png ├── Run_Start_Time_by_Month.png ├── Training_Load_Over_Time.png ├── A.I._Recommended_Training.png ├── Elevation_Gain_Distribution.png ├── Fastest_1k_Pace_over_Time.png ├── Number_of_Runs_per_Distance.png ├── Rolling_30_Day_Comparison.png ├── Run_Distance_Distribution.png ├── Total_Distance_Ran_by_Month.png ├── Run_Type_Distribution_By_Year.png ├── Elevation_Gain_per_KM_by_Month.png ├── Running_Pace_vs_Elevation_Change.png ├── Running_Pace_vs_Total_Distance.png ├── Time_Taken_Distance_Recent_Years.png ├── Training_Intensity_by_HeartRate_Zone.png ├── .github ├── dependabot.yml └── workflows │ ├── cleanup-runs.yml │ ├── test-code.yml │ ├── generate-stats.yml │ └── codeql-analysis.yml ├── LICENSE ├── pyproject.toml ├── .gitignore └── README.md /src/strava_data/db/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_data db package. 
3 | """ 4 | -------------------------------------------------------------------------------- /strava.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/strava.sqlite -------------------------------------------------------------------------------- /.bandit: -------------------------------------------------------------------------------- 1 | [bandit] 2 | # B105 - Not a hardcoded password, it's a secrets passed in 3 | skips = B105 -------------------------------------------------------------------------------- /Pace_by_Day.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Pace_by_Day.png -------------------------------------------------------------------------------- /Activity_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Activity_Heatmap.png -------------------------------------------------------------------------------- /Cadence_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Cadence_Over_Time.png -------------------------------------------------------------------------------- /Pace_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Pace_Distribution.png -------------------------------------------------------------------------------- /Rest_Days_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Rest_Days_Heatmap.png -------------------------------------------------------------------------------- /Run_Days_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Days_Heatmap.png -------------------------------------------------------------------------------- /Run_Type_Clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Type_Clusters.png -------------------------------------------------------------------------------- /Cumulative_Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Cumulative_Distance.png -------------------------------------------------------------------------------- /Time_Taken_Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Time_Taken_Distance.png -------------------------------------------------------------------------------- /VO2_Proxy_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/VO2_Proxy_Over_Time.png -------------------------------------------------------------------------------- /Forecast_Weekly_Pace.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Forecast_Weekly_Pace.png -------------------------------------------------------------------------------- /Longest_Run_per_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Longest_Run_per_Month.png -------------------------------------------------------------------------------- /Run_Rest_Ratio_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Rest_Ratio_Heatmap.png -------------------------------------------------------------------------------- /Running_Pace_over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Running_Pace_over_Time.png -------------------------------------------------------------------------------- /Median_1k_Pace_over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Median_1k_Pace_over_Time.png -------------------------------------------------------------------------------- /Monthly_Distance_by_Year.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Monthly_Distance_by_Year.png -------------------------------------------------------------------------------- /Pace_Consistency_by_Run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Pace_Consistency_by_Run.png -------------------------------------------------------------------------------- /Run_Start_Time_by_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Start_Time_by_Month.png -------------------------------------------------------------------------------- /Training_Load_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Training_Load_Over_Time.png -------------------------------------------------------------------------------- /A.I._Recommended_Training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/A.I._Recommended_Training.png -------------------------------------------------------------------------------- /Elevation_Gain_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Elevation_Gain_Distribution.png -------------------------------------------------------------------------------- /Fastest_1k_Pace_over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Fastest_1k_Pace_over_Time.png -------------------------------------------------------------------------------- /Number_of_Runs_per_Distance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Number_of_Runs_per_Distance.png -------------------------------------------------------------------------------- /Rolling_30_Day_Comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Rolling_30_Day_Comparison.png -------------------------------------------------------------------------------- /Run_Distance_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Distance_Distribution.png -------------------------------------------------------------------------------- /Total_Distance_Ran_by_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Total_Distance_Ran_by_Month.png -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualisation package for generating required charts. 3 | """ 4 | -------------------------------------------------------------------------------- /Run_Type_Distribution_By_Year.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Type_Distribution_By_Year.png -------------------------------------------------------------------------------- /src/strava_data/strava_api/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package for data processing, cleanup, and transformation logic. 
3 | """ 4 | -------------------------------------------------------------------------------- /Elevation_Gain_per_KM_by_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Elevation_Gain_per_KM_by_Month.png -------------------------------------------------------------------------------- /Running_Pace_vs_Elevation_Change.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Running_Pace_vs_Elevation_Change.png -------------------------------------------------------------------------------- /Running_Pace_vs_Total_Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Running_Pace_vs_Total_Distance.png -------------------------------------------------------------------------------- /Time_Taken_Distance_Recent_Years.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Time_Taken_Distance_Recent_Years.png -------------------------------------------------------------------------------- /Training_Intensity_by_HeartRate_Zone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Training_Intensity_by_HeartRate_Zone.png -------------------------------------------------------------------------------- /src/strava_data/strava_api/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_api package containing logic to call the Strava endpoints and handle rate-limiting. 3 | """ 4 | -------------------------------------------------------------------------------- /src/strava_data/ml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_data.ml package. 3 | 4 | Contains all machine learning functionality for: 5 | - Forecasting future running pace 6 | - Model training and evaluation 7 | - ML visualisations 8 | """ 9 | -------------------------------------------------------------------------------- /src/strava_data/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_data package. 
3 | 4 | Houses all core functionality for: 5 | - Authentication & config 6 | - Database models and data access 7 | - Strava API calls 8 | - Data processing 9 | - Visualization 10 | """ 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | - package-ecosystem: "pip" 8 | directory: "/" 9 | schedule: 10 | interval: "daily" 11 | -------------------------------------------------------------------------------- /.github/workflows/cleanup-runs.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup All Old Workflow Runs 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | workflow_dispatch: 7 | 8 | jobs: 9 | cleanup: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Delete workflow runs 14 | uses: Mattraks/delete-workflow-runs@v2.1.0 15 | with: 16 | token: ${{ secrets.TOKEN_GITHUB }} 17 | repository: ${{ github.repository }} 18 | retain_days: 1 19 | -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Central logger configuration. 3 | """ 4 | 5 | import logging 6 | import sys 7 | 8 | LOGGER = logging.getLogger("StravaDataAnalysis") 9 | LOGGER.setLevel(logging.INFO) 10 | 11 | handler = logging.StreamHandler(sys.stdout) 12 | handler.setLevel(logging.INFO) 13 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 14 | handler.setFormatter(formatter) 15 | LOGGER.addHandler(handler) 16 | 17 | 18 | def get_logger() -> logging.Logger: 19 | """ 20 | Returns the shared logger instance. 21 | """ 22 | return LOGGER 23 | -------------------------------------------------------------------------------- /.github/workflows/test-code.yml: -------------------------------------------------------------------------------- 1 | name: CodeTest 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '25 11 * * *' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | test-and-lint: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v6 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v6 22 | with: 23 | python-version: "3.11" 24 | 25 | - name: Install Poetry 26 | uses: abatilo/actions-poetry@v4 27 | 28 | - name: Install dependencies 29 | run: poetry install 30 | 31 | - name: Run pylint 32 | run: poetry run pylint src 33 | 34 | - name: Check formatting with black 35 | run: poetry run black --check src 36 | 37 | # - name: Run tests with pytest 38 | # run: poetry run pytest 39 | -------------------------------------------------------------------------------- /src/strava_data/ml/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utilities for machine learning feature engineering. 3 | """ 4 | 5 | import pandas as pd 6 | 7 | 8 | def prepare_pace_summary(splits_df: pd.DataFrame, group_cols: list[str]) -> pd.DataFrame: 9 | """ 10 | Aggregates pace-based metrics by the given group columns (e.g. weekly or per run). 
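
    Example: a single 1,000 m split covered in 300 s gives
    pace_sec_km = 300 / (1000 / 1000) = 300, i.e. a 5:00/km pace. Splits
    outside the 950-1050 m band are discarded before aggregation.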
11 | """ 12 | data = splits_df.copy() 13 | data = data[(data["distance_m"] >= 950) & (data["distance_m"] <= 1050)] 14 | data["pace_sec_km"] = data["elapsed_time_s"] / (data["distance_m"] / 1000) 15 | data["distance_km"] = data["distance_m"] / 1000 16 | 17 | grouped = ( 18 | data.groupby(group_cols) 19 | .agg( 20 | distance_km=("distance_km", "sum"), 21 | pace_median=("pace_sec_km", "median"), 22 | pace_std=("pace_sec_km", "std"), 23 | split_count=("pace_sec_km", "count"), 24 | ) 25 | .reset_index() 26 | .dropna() 27 | ) 28 | 29 | return grouped 30 | -------------------------------------------------------------------------------- /src/strava_data/db/models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data classes or schemas representing Strava data in Python. 3 | """ 4 | 5 | from dataclasses import dataclass 6 | from typing import Optional 7 | 8 | 9 | @dataclass 10 | class Activity: 11 | # pylint: disable=too-many-instance-attributes 12 | """ 13 | Represents a single Strava activity row. 14 | """ 15 | 16 | activity_id: int 17 | name: str 18 | activity_type: str 19 | distance_m: float 20 | moving_time_s: int 21 | average_speed_m_s: float 22 | max_speed_m_s: float 23 | total_elevation_gain_m: float 24 | start_date_local: str 25 | average_cadence: float 26 | 27 | 28 | @dataclass 29 | class Split: 30 | # pylint: disable=too-many-instance-attributes 31 | """ 32 | Represents a single 1 km split from a Strava activity. 33 | """ 34 | 35 | split_id: int 36 | activity_id: int 37 | distance_m: float 38 | elapsed_time_s: int 39 | elevation_difference_m: float 40 | moving_time_s: int 41 | pace_zone: int 42 | split_index: int 43 | average_grade_adjusted_speed_m_s: float 44 | average_heartrate: Optional[float] 45 | start_date_local: str 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 |
24 | For more information, please refer to <https://unlicense.org>
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "strava_project"
3 | version = "0.1.1"
4 | description = "Codebase for Strava data analysis."
5 | authors = ["Craig Wilkinson"]
6 | readme = "README.md"
7 | packages = [{ include = "src" }]
8 |
9 | [tool.poetry.dependencies]
10 | python = "^3.11"
11 | pandas = "^2.3"
12 | matplotlib = "^3.10"
13 | seaborn = "^0.13"
14 | requests = "^2.32"
15 | pyAesCrypt = "^6.0"
16 | cryptography = ">=44.0.1"
17 | numpy = "^2.3"
18 | scikit-learn = "^1.8"
19 |
20 | [tool.poetry.group.dev.dependencies]
21 | pylint = "^4.0"
22 | pytest = "^9.0"
23 | black = "^25.12"
24 |
25 | [build-system]
26 | requires = ["poetry-core>=1.0.0"]
27 | build-backend = "poetry.core.masonry.api"
28 |
29 | [tool.black]
30 | line-length = 100
31 | target-version = ['py39']
32 | include = 'src/.*\.pyi?$'
33 | exclude = '''
34 | /(
35 |     \.venv
36 |   | \.git
37 |   | \.mypy_cache
38 |   | \.pytest_cache
39 |   | \.tox
40 |   | \.eggs
41 |   | \.idea
42 |   | __pycache__
43 |   | build
44 |   | dist
45 |   | tests
46 | )/
47 | '''
48 |
49 | [tool.pytest.ini_options]
50 | minversion = "6.0"
51 | addopts = "-ra -q"
52 | testpaths = ["src/tests"]
53 |
54 | [tool.pylint.'MAIN']
55 | max-line-length = 100
56 | disable = [
57 |     "missing-docstring",
58 |     "too-few-public-methods",
59 |     "too-many-arguments"
60 | ]
--------------------------------------------------------------------------------
/src/strava_data/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Global configuration and environment variable handling for Strava secrets
3 | and optional encryption.
4 | """
5 |
6 | import os
7 |
8 | CONFIG_FILE = "config.txt"
9 |
10 | # pylint: disable=C0103
11 | # Attempt to read password / buffer size from local config file
12 | if os.path.isfile(CONFIG_FILE):
13 |     with open(CONFIG_FILE, "r", encoding="utf-8") as file_handle:
14 |         lines = file_handle.read().splitlines()
15 |         BUFFER_SIZE = int(lines[0].strip())  # First line
16 |         ENCRYPTION_PASSWORD = lines[1].strip()  # Second line
17 |         CLIENT_ID = lines[2].strip()  # Third line
18 |         CLIENT_SECRET = lines[3].strip()  # Fourth line
19 | else:
20 |     # Fallback to environment variables
21 |     BUFFER_SIZE = int(os.environ.get("BUFFERSIZE", 65536))  # default 64KB
22 |     ENCRYPTION_PASSWORD = os.environ.get("ENCRYPTIONPASSWORD", "default_password")
23 |     CLIENT_ID = os.environ.get("CLIENTID", "")
24 |     CLIENT_SECRET = os.environ.get("CLIENTSECRET", "")
25 | # pylint: enable=C0103
26 |
27 |
28 | def get_buffer_size() -> int:
29 |     """
30 |     Returns the buffer size used for file encryption/decryption.
31 |     Reads from config.txt if present, otherwise from environment variables.
32 |     """
33 |     return BUFFER_SIZE
34 |
35 |
36 | def get_encryption_password() -> str:
37 |     """
38 |     Returns the encryption password used for securing the database file.
39 |     Reads from config.txt if present, otherwise from environment variables.
40 |     """
41 |     return ENCRYPTION_PASSWORD
42 |
43 |
44 | def get_client_id() -> str:
45 |     """
46 |     Retrieves the Strava client ID, from config.txt if present, else the environment.
47 |     """
48 |     return CLIENT_ID
49 |
50 |
51 | def get_client_secret() -> str:
52 |     """
53 |     Retrieves the Strava client secret, from config.txt if present, else the environment.
54 |     """
55 |     return CLIENT_SECRET
56 |
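`config.py` supplies `get_buffer_size()` and `get_encryption_password()` to the database encryption helpers in `src/strava_data/db/dao.py`, which this section does not include. A minimal sketch of what those two helpers could look like with pyAesCrypt, assuming the temp-file name listed in `.gitignore`; the actual dao.py implementation may differ:

```python
import os

import pyAesCrypt

from strava_data.config import get_buffer_size, get_encryption_password

ENCRYPTED_DB = "strava.sqlite"       # encrypted copy committed to the repo
PLAINTEXT_DB = "strava_temp.sqlite"  # assumed: matches the temp name in .gitignore


def decrypt_database() -> None:
    """Decrypt the AES-encrypted database into a temporary plaintext file."""
    if os.path.isfile(ENCRYPTED_DB):
        pyAesCrypt.decryptFile(
            ENCRYPTED_DB, PLAINTEXT_DB, get_encryption_password(), get_buffer_size()
        )


def encrypt_database() -> None:
    """Re-encrypt the plaintext database and delete the temporary file."""
    if os.path.isfile(PLAINTEXT_DB):
        pyAesCrypt.encryptFile(
            PLAINTEXT_DB, ENCRYPTED_DB, get_encryption_password(), get_buffer_size()
        )
        os.remove(PLAINTEXT_DB)
```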
54 | """ 55 | return CLIENT_SECRET 56 | -------------------------------------------------------------------------------- /.github/workflows/generate-stats.yml: -------------------------------------------------------------------------------- 1 | name: Generate Stats and Update README 2 | 3 | on: 4 | schedule: 5 | - cron: '2,17,32,47 1,4,7,10,13,16,19,22 * * *' 6 | workflow_dispatch: 7 | jobs: 8 | Stats: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: write 12 | 13 | steps: 14 | - name: Check out code 15 | uses: actions/checkout@v6 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v6 19 | with: 20 | python-version: "3.11" 21 | 22 | - name: Install Poetry 23 | uses: abatilo/actions-poetry@v4 24 | 25 | - name: Install dependencies 26 | run: poetry install 27 | 28 | - name: Generate Stats 29 | run: | 30 | poetry run python src/main.py 31 | continue-on-error: true 32 | env: 33 | BUFFERSIZE: ${{ secrets.BUFFERSIZE }} 34 | ENCRYPTIONPASSWORD: ${{ secrets.ENCRYPTIONPASSWORD }} 35 | CLIENTID: ${{ secrets.CLIENTID }} 36 | CLIENTSECRET: ${{ secrets.CLIENTSECRET }} 37 | 38 | - name: Generate README 39 | run: | 40 | poetry run python src/generate_readme.py 41 | continue-on-error: true 42 | env: 43 | BUFFERSIZE: ${{ secrets.BUFFERSIZE }} 44 | ENCRYPTIONPASSWORD: ${{ secrets.ENCRYPTIONPASSWORD }} 45 | CLIENTID: ${{ secrets.CLIENTID }} 46 | CLIENTSECRET: ${{ secrets.CLIENTSECRET }} 47 | 48 | - name: Commit and push changes 49 | run: | 50 | git config --global user.name 'GithubBot' 51 | git config --global user.email 'GithubBot@9bc0ff44ae664378ab0252851a8954ad.com' 52 | git remote set-url origin https://x-access-token:${{ secrets.TOKEN_GITHUB }}@github.com/${{ github.repository }} 53 | git diff-index --quiet HEAD || git commit --allow-empty -am "Automated changes" 54 | git push 55 | env: 56 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 57 | -------------------------------------------------------------------------------- /src/strava_data/auth.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handles Strava OAuth token retrieval and refresh logic. 3 | """ 4 | 5 | import json 6 | import time 7 | import requests 8 | from strava_data.config import get_client_id, get_client_secret 9 | from strava_data.db.dao import read_tokens, store_tokens 10 | from utils.logger import get_logger 11 | 12 | LOGGER = get_logger() 13 | 14 | 15 | def get_or_refresh_tokens() -> None: 16 | """ 17 | Reads existing tokens from the database. 18 | If expired, refreshes them via Strava OAuth. 19 | If none exist, the user must initially obtain them with a manual OAuth flow. 20 | """ 21 | tokens = read_tokens() 22 | if not tokens: 23 | LOGGER.info("No tokens found in the database. Please obtain them initially.") 24 | return 25 | 26 | expires_at = tokens.get("expires_at", 0) 27 | if expires_at < time.time(): 28 | LOGGER.info("Tokens expired. Refreshing now.") 29 | refresh_token = tokens.get("refresh_token", "") 30 | new_tokens = refresh_strava_tokens(refresh_token) 31 | if not new_tokens: 32 | raise RuntimeError("Token refresh failed") 33 | store_tokens(new_tokens) 34 | else: 35 | LOGGER.info("Tokens are still valid.") 36 | 37 | 38 | def refresh_strava_tokens(refresh_token: str) -> dict: 39 | """ 40 | Calls Strava's /oauth/token endpoint to refresh an expired token. 41 | 42 | :param refresh_token: The user's current refresh token from the DB. 43 | :return: Dictionary of new tokens, or an empty dict if refresh fails. 
44 | """ 45 | url = "https://www.strava.com/oauth/token" 46 | payload = { 47 | "client_id": get_client_id(), 48 | "client_secret": get_client_secret(), 49 | "grant_type": "refresh_token", 50 | "refresh_token": refresh_token, 51 | } 52 | 53 | payload_str = json.dumps(payload) 54 | LOGGER.info(payload_str) 55 | try: 56 | response = requests.post(url, data=payload, timeout=10) 57 | except requests.exceptions.Timeout: 58 | LOGGER.info("Token refresh request timed out.") 59 | return {} 60 | 61 | if response.ok: 62 | return response.json() 63 | 64 | LOGGER.info( 65 | "Failed to refresh tokens. Status: %s Response: %s", response.status_code, response.text 66 | ) 67 | return {} 68 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '25 11 * * *' 22 | workflow_dispatch: 23 | 24 | jobs: 25 | analyze: 26 | name: Analyze 27 | runs-on: ubuntu-latest 28 | permissions: 29 | actions: read 30 | contents: read 31 | security-events: write 32 | 33 | strategy: 34 | fail-fast: false 35 | matrix: 36 | language: [ 'python' ] 37 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 38 | # Learn more: 39 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@v6 44 | 45 | # Initializes the CodeQL tools for scanning. 46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@v4 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 51 | # By default, queries listed here will override any specified in a config file. 52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 54 | 55 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 56 | # If this step fails, then you should remove it and run the build manually (see below) 57 | - name: Autobuild 58 | uses: github/codeql-action/autobuild@v4 59 | 60 | # ℹ️ Command-line programs to run using the OS shell. 
61 | # 📚 https://git.io/JvXDl 62 | 63 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 64 | # and modify them (or add more) to build your code if your project 65 | # uses a compiled language 66 | 67 | #- run: | 68 | # make bootstrap 69 | # make release 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v4 73 | -------------------------------------------------------------------------------- /src/get_tokens.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allows a one-time, manual retrieval of Strava tokens using an authorization code. 3 | Usage: 4 | 1. Run: python get_tokens.py 5 | 2. Paste the authorization code when prompted. 6 | 3. The script will request tokens from Strava and store them in the encrypted database. 7 | """ 8 | 9 | import requests 10 | 11 | from strava_data.db.dao import decrypt_database, encrypt_database, store_tokens 12 | 13 | 14 | def main() -> None: 15 | """ 16 | Steps: 17 | 1. Prompt user for the 'authorization code' from the redirect URL. 18 | 2. Fetch tokens from Strava's /oauth/token endpoint. 19 | 3. Store tokens in the DB (encrypting at the end). 20 | """ 21 | print("=== Strava Token Retrieval ===") 22 | print( 23 | "After creating a Strava application and authorizing it, you obtain a code " 24 | "in the redirect URL." 25 | ) 26 | print( 27 | "Example redirect URL: http://localhost/exchange_token?state=&code=LONGCODEHERE" 28 | "&scope=read,activity:read_all,profile:read_all" 29 | ) 30 | print("Enter your LONGCODEHERE value below.\n") 31 | 32 | auth_code = input("Paste your Strava authorization code: ").strip() 33 | if not auth_code: 34 | print("No authorization code provided. Exiting.") 35 | return 36 | 37 | client_id = input("Paste your Strava client id: ").strip() 38 | if not client_id: 39 | print("No client id provided. Exiting.") 40 | return 41 | 42 | client_secret = input("Paste your Strava client secret: ").strip() 43 | if not client_secret: 44 | print("No client secret provided. Exiting.") 45 | return 46 | 47 | print("\nRequesting tokens from Strava...") 48 | try: 49 | response = requests.post( 50 | url="https://www.strava.com/oauth/token", 51 | data={ 52 | "client_id": client_id, 53 | "client_secret": client_secret, 54 | "code": auth_code, 55 | "grant_type": "authorization_code", 56 | }, 57 | timeout=10, 58 | ) 59 | except requests.exceptions.Timeout: 60 | print("Timeout occurred while requesting Strava tokens.") 61 | return 62 | 63 | strava_tokens = response.json() 64 | if "errors" in strava_tokens or "message" in strava_tokens: 65 | print("Failed to retrieve tokens. Strava responded with:") 66 | print(strava_tokens) 67 | return 68 | 69 | print("Successfully retrieved tokens!") 70 | print(strava_tokens) 71 | 72 | print("\nStoring tokens in the database...") 73 | decrypt_database() 74 | store_tokens(strava_tokens) 75 | encrypt_database() 76 | print("Tokens stored successfully. 
Database re-encrypted.\n") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config.txt 2 | strava_temp.sqlite 3 | *.lock 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ -------------------------------------------------------------------------------- /src/strava_data/strava_api/processing/transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data transformation utilities for activities and splits. 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | 9 | def transform_activities(activities_df: pd.DataFrame) -> pd.DataFrame: 10 | """ 11 | Cleans and enriches raw Strava activities. 
12 | 13 | :param activities_df: Raw DataFrame from Strava's /activities endpoint. 14 | :return: DataFrame with standardized columns and transformations. 15 | """ 16 | if activities_df.empty: 17 | return pd.DataFrame() 18 | 19 | activities_clean = activities_df.copy() 20 | 21 | activities_clean["distance_m"] = activities_clean["distance"] 22 | activities_clean["moving_time_s"] = activities_clean["moving_time"] 23 | activities_clean["average_speed_m_s"] = activities_clean["average_speed"] 24 | activities_clean["max_speed_m_s"] = activities_clean["max_speed"] 25 | activities_clean["total_elevation_gain_m"] = activities_clean["total_elevation_gain"] 26 | 27 | if "average_cadence" not in activities_clean.columns: 28 | activities_clean["average_cadence"] = 0.0 29 | 30 | activities_clean["start_date_local"] = activities_clean["start_date_local"] 31 | activities_clean["activity_type"] = np.where( 32 | activities_clean["type"].str.lower() == "run", "Run", activities_clean["type"] 33 | ) 34 | 35 | final_cols = [ 36 | "id", 37 | "name", 38 | "type", 39 | "distance_m", 40 | "moving_time_s", 41 | "average_speed_m_s", 42 | "max_speed_m_s", 43 | "total_elevation_gain_m", 44 | "start_date_local", 45 | "average_cadence", 46 | ] 47 | 48 | return activities_clean[final_cols].copy() 49 | 50 | 51 | def transform_splits(splits_df: pd.DataFrame) -> pd.DataFrame: 52 | """ 53 | Cleans and enriches splits data from Strava activities. 54 | 55 | :param splits_df: DataFrame from activity detail calls. 56 | :return: DataFrame with standardized columns for splits. 57 | """ 58 | if splits_df.empty: 59 | return pd.DataFrame() 60 | 61 | splits_clean = splits_df.copy() 62 | 63 | splits_clean["distance_m"] = splits_clean["distance"] 64 | splits_clean["elapsed_time_s"] = splits_clean["elapsed_time"] 65 | splits_clean["elevation_difference_m"] = splits_clean["elevation_difference"] 66 | splits_clean["moving_time_s"] = splits_clean["moving_time"] 67 | splits_clean["average_grade_adjusted_speed_m_s"] = splits_clean["average_grade_adjusted_speed"] 68 | splits_clean["average_heartrate"] = splits_clean.get("average_heartrate", np.nan) 69 | splits_clean["split_index"] = splits_clean["split"] 70 | splits_clean["start_date_local"] = splits_clean["start_date_local"] 71 | 72 | final_cols = [ 73 | "activity_id", 74 | "distance_m", 75 | "elapsed_time_s", 76 | "elevation_difference_m", 77 | "moving_time_s", 78 | "pace_zone", 79 | "split_index", 80 | "average_grade_adjusted_speed_m_s", 81 | "average_heartrate", 82 | "start_date_local", 83 | ] 84 | 85 | return splits_clean[final_cols].copy() 86 | -------------------------------------------------------------------------------- /src/strava_data/ml/run_type_classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Machine learning to classify run types (e.g. 
Easy, Tempo, Intervals, Long) 3 | """ 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | from sklearn.cluster import KMeans 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.metrics import silhouette_score 11 | 12 | from strava_data.ml.utils import prepare_pace_summary 13 | from strava_data.strava_api.visualisation.utils import ( 14 | prepare_dated_activities, 15 | save_and_close_plot, 16 | format_pace, 17 | add_title_with_attribution, 18 | TitleBoxConfig, 19 | ) 20 | from utils.logger import get_logger 21 | 22 | LOGGER = get_logger() 23 | 24 | RUN_TYPE_LABELS = { 25 | 0: "Easy", 26 | 1: "Tempo", 27 | 2: "Intervals", 28 | 3: "Long", 29 | } 30 | 31 | 32 | def build_run_features(splits_df: pd.DataFrame) -> pd.DataFrame: 33 | """ 34 | Aggregates split data into per-run features for clustering. 35 | """ 36 | data = prepare_dated_activities(splits_df) 37 | data["start_date"] = pd.to_datetime(data["start_date_local"]).dt.tz_localize(None) 38 | 39 | # Group by activity ID and start date to represent each run 40 | summary = prepare_pace_summary(data, group_cols=["activity_id", "start_date_local"]) 41 | 42 | summary["start_date"] = pd.to_datetime(summary["start_date_local"]) 43 | summary["day_of_week"] = summary["start_date"].dt.dayofweek 44 | summary["month"] = summary["start_date"].dt.month 45 | summary["year"] = summary["start_date"].dt.year 46 | 47 | return summary 48 | 49 | 50 | def cluster_run_types(data: pd.DataFrame, n_clusters: int = 4) -> pd.DataFrame: 51 | """ 52 | Applies KMeans clustering to classify run types. 53 | """ 54 | features = data[["distance_km", "pace_median", "pace_std", "split_count"]] 55 | scaler = StandardScaler() 56 | features_scaled = scaler.fit_transform(features) 57 | 58 | model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) 59 | cluster_labels = model.fit_predict(features_scaled) 60 | 61 | score = silhouette_score(features_scaled, cluster_labels) 62 | LOGGER.info("Silhouette Score: %.3f", score) 63 | 64 | data["run_type_cluster"] = cluster_labels 65 | data["run_type"] = data["run_type_cluster"].map(RUN_TYPE_LABELS) 66 | return data 67 | 68 | 69 | def plot_clusters(data: pd.DataFrame, output_path: str) -> None: 70 | """ 71 | Scatterplot of distance vs. pace coloured by run type. 72 | """ 73 | plt.figure(figsize=(10, 6)) 74 | sns.scatterplot( 75 | data=data, 76 | x="distance_km", 77 | y="pace_median", 78 | hue="run_type", 79 | palette="tab10", 80 | alpha=0.8, 81 | ) 82 | plt.xlabel("Distance (km)") 83 | plt.ylabel("Pace (mm:ss per km)") 84 | plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(format_pace)) 85 | plt.grid(True) 86 | plt.legend(title="Run Type") 87 | add_title_with_attribution( 88 | plt.gcf(), 89 | "Run Type Clustering", 90 | TitleBoxConfig(), 91 | ) 92 | save_and_close_plot(output_path) 93 | 94 | 95 | def plot_run_type_distribution_by_year(data: pd.DataFrame, output_path: str) -> None: 96 | """ 97 | Bar chart showing count of run types per year. 
98 | """ 99 | counts = data.groupby(["year", "run_type"]).size().reset_index(name="count") 100 | pivot = counts.pivot(index="year", columns="run_type", values="count").fillna(0) 101 | 102 | pivot.plot(kind="bar", stacked=True, figsize=(10, 6), colormap="tab10") 103 | plt.xlabel("Year") 104 | plt.ylabel("Number of Runs") 105 | plt.xticks(rotation=45) 106 | plt.legend(title="Run Type") 107 | plt.grid(True, axis="y") 108 | add_title_with_attribution( 109 | plt.gcf(), 110 | "Run Type Distribution by Year", 111 | TitleBoxConfig(), 112 | ) 113 | save_and_close_plot(output_path) 114 | 115 | 116 | def run_clustering_pipeline(splits_df: pd.DataFrame) -> pd.DataFrame: 117 | """ 118 | Runs the full clustering pipeline: feature prep, clustering, and visualisation. 119 | """ 120 | LOGGER.info("Loading and building features...") 121 | feature_data = build_run_features(splits_df) 122 | 123 | LOGGER.info("Running KMeans clustering...") 124 | clustered = cluster_run_types(feature_data, n_clusters=4) 125 | 126 | LOGGER.info("Plotting clusters...") 127 | plot_clusters(clustered, "Run_Type_Clusters.png") 128 | 129 | LOGGER.info("Plotting run type distribution...") 130 | plot_run_type_distribution_by_year(clustered, "Run_Type_Distribution_By_Year.png") 131 | 132 | return clustered 133 | -------------------------------------------------------------------------------- /src/strava_data/ml/pace_forecast.py: -------------------------------------------------------------------------------- 1 | """ 2 | Machine learning to forecast pace 3 | """ 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from sklearn.linear_model import Ridge 8 | from sklearn.model_selection import TimeSeriesSplit, cross_val_score 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from strava_data.ml.utils import prepare_pace_summary 13 | from strava_data.strava_api.visualisation.utils import ( 14 | prepare_dated_activities, 15 | format_pace, 16 | save_and_close_plot, 17 | add_title_with_attribution, 18 | TitleBoxConfig, 19 | ) 20 | from utils.logger import get_logger 21 | 22 | LOGGER = get_logger() 23 | 24 | 25 | def build_weekly_pace_features(splits_df: pd.DataFrame) -> pd.DataFrame: 26 | """ 27 | Aggregates ~1 km splits into weekly median pace and rolling stats. 28 | """ 29 | data = prepare_dated_activities(splits_df) 30 | data["start_date"] = pd.to_datetime(data["start_date_local"]).dt.tz_localize(None) 31 | data["week"] = data["start_date"].dt.to_period("W").apply(lambda r: r.start_time) 32 | summary = prepare_pace_summary(data, group_cols=["week"]) 33 | 34 | summary["pace_7d_avg"] = summary["pace_median"].rolling(window=2).mean() 35 | summary["pace_7d_std"] = summary["pace_median"].rolling(window=2).std() 36 | return summary.dropna() 37 | 38 | 39 | def train_forecast_model(data: pd.DataFrame): 40 | """ 41 | Trains a Ridge regression model using time-based cross-validation. 
42 | """ 43 | features = data[["pace_7d_avg", "pace_7d_std", "split_count"]] 44 | target = data["pace_median"] 45 | 46 | model = Pipeline([("scale", StandardScaler()), ("ridge", Ridge(alpha=1.0))]) 47 | 48 | tscv = TimeSeriesSplit(n_splits=5) 49 | scores = cross_val_score( 50 | model, features, target, cv=tscv, scoring="neg_root_mean_squared_error" 51 | ) 52 | LOGGER.info("CV RMSE: %.2f seconds", -scores.mean()) 53 | 54 | model.fit(features, target) 55 | return model 56 | 57 | 58 | def predict_next_week(model, latest_row: pd.Series): 59 | """ 60 | Uses trained model to predict next week's average pace. 61 | """ 62 | next_features = latest_row[["pace_7d_avg", "pace_7d_std", "split_count"]].to_frame().T 63 | predicted_pace = model.predict(next_features)[0] 64 | minutes = int(predicted_pace // 60) 65 | seconds = int(predicted_pace % 60) 66 | LOGGER.info("Forecasted pace for next week: %d:%02d per km", minutes, seconds) 67 | return predicted_pace 68 | 69 | 70 | def plot_forecast(weekly_data: pd.DataFrame, forecast_value: float, output_path: str) -> None: 71 | """ 72 | Plots weekly median pace and overlays the next week's forecast as an X with RMSE band. 73 | """ 74 | plt.figure(figsize=(10, 6)) 75 | plt.plot(weekly_data["week"], weekly_data["pace_median"], label="Actual Pace", marker="o") 76 | 77 | true_values = weekly_data["pace_median"] 78 | feature_values = weekly_data[["pace_7d_avg", "pace_7d_std", "split_count"]] 79 | model = Pipeline([("scale", StandardScaler()), ("ridge", Ridge(alpha=1.0))]) 80 | model.fit(feature_values, true_values) 81 | residuals = true_values - model.predict(feature_values) 82 | rmse = residuals.std() 83 | 84 | forecast_week = weekly_data["week"].max() + pd.Timedelta(weeks=1) 85 | plt.scatter( 86 | forecast_week, forecast_value, marker="x", color="red", s=100, label="Forecast Next Week" 87 | ) 88 | 89 | plt.fill_between( 90 | weekly_data["week"].astype("datetime64[ns]"), 91 | weekly_data["pace_median"] - rmse, 92 | weekly_data["pace_median"] + rmse, 93 | color="blue", 94 | alpha=0.1, 95 | label="±1 RMSE Band", 96 | ) 97 | 98 | plt.xlabel("Week") 99 | plt.ylabel("Pace (mm:ss)") 100 | plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(format_pace)) 101 | plt.legend() 102 | plt.grid(True) 103 | add_title_with_attribution( 104 | plt.gcf(), 105 | "Weekly Median Pace with Forecast", 106 | TitleBoxConfig(), 107 | ) 108 | save_and_close_plot(output_path) 109 | 110 | 111 | def run_forecast_pipeline(splits_df: pd.DataFrame) -> None: 112 | """ 113 | Orchestrates weekly pace forecast: feature prep, training, prediction, plotting. 114 | """ 115 | LOGGER.info("Building features from splits...") 116 | weekly_data = build_weekly_pace_features(splits_df) 117 | 118 | LOGGER.info("Training forecast model...") 119 | model = train_forecast_model(weekly_data) 120 | 121 | LOGGER.info("Predicting future pace...") 122 | latest_features = weekly_data.iloc[-1] 123 | forecast_value = predict_next_week(model, latest_features) 124 | 125 | LOGGER.info("Generating forecast chart...") 126 | plot_forecast(weekly_data, forecast_value, "Forecast_Weekly_Pace.png") 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StravaDataAnalysis 2 | This repository extracts data from the Strava API, which is downstream of Garmin devices, stores it locally (encrypted), and generates visualizations. 
3 | 4 | If other people start using this, I'll try and streamline this process as much as I can. 5 | 6 | [![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](http://unlicense.org/) 7 | [![CodeFactor](https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis/badge)](https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis) 8 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/9f08e367a5594645aa30c1e31c54dbb8)](https://app.codacy.com/gh/c-wilkinson/StravaDataAnalysis?utm_source=github.com&utm_medium=referral&utm_content=c-wilkinson/StravaDataAnalysis&utm_campaign=Badge_Grade) 9 | [![CodeTest](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/test-code.yml/badge.svg)](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/test-code.yml) 10 | [![GenerateStats](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/generate-stats.yml/badge.svg)](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/generate-stats.yml) 11 | [![CodeQL](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/codeql-analysis.yml) 12 | 13 | ## Generated Content 14 | 📅 Stats last updated on: **2025-12-19 02:50:13** 15 | 16 | 🏃‍♂️ Most recent run: 0 years, 0 months, 3 days, 20 hours and 50 minutes 17 | 18 | ![A.I. Recommended Training](A.I._Recommended_Training.png?raw=true "A.I. Recommended Training") 19 | 20 | ![Activity Heatmap](Activity_Heatmap.png?raw=true "Activity Heatmap") 21 | 22 | ![Cadence Over Time](Cadence_Over_Time.png?raw=true "Cadence Over Time") 23 | 24 | ![Cumulative Distance](Cumulative_Distance.png?raw=true "Cumulative Distance") 25 | 26 | ![Elevation Gain Distribution](Elevation_Gain_Distribution.png?raw=true "Elevation Gain Distribution") 27 | 28 | ![Elevation Gain Per Km By Month](Elevation_Gain_per_KM_by_Month.png?raw=true "Elevation Gain Per Km By Month") 29 | 30 | ![Fastest 1K Pace Over Time](Fastest_1k_Pace_over_Time.png?raw=true "Fastest 1K Pace Over Time") 31 | 32 | ![Forecast Weekly Pace](Forecast_Weekly_Pace.png?raw=true "Forecast Weekly Pace") 33 | 34 | ![Longest Run Per Month](Longest_Run_per_Month.png?raw=true "Longest Run Per Month") 35 | 36 | ![Median 1K Pace Over Time](Median_1k_Pace_over_Time.png?raw=true "Median 1K Pace Over Time") 37 | 38 | ![Monthly Distance By Year](Monthly_Distance_by_Year.png?raw=true "Monthly Distance By Year") 39 | 40 | ![Number Of Runs Per Distance](Number_of_Runs_per_Distance.png?raw=true "Number Of Runs Per Distance") 41 | 42 | ![Pace Consistency By Run](Pace_Consistency_by_Run.png?raw=true "Pace Consistency By Run") 43 | 44 | ![Pace Distribution](Pace_Distribution.png?raw=true "Pace Distribution") 45 | 46 | ![Pace By Day](Pace_by_Day.png?raw=true "Pace By Day") 47 | 48 | ![Rest Days Heatmap](Rest_Days_Heatmap.png?raw=true "Rest Days Heatmap") 49 | 50 | ![Rolling 30 Day Comparison](Rolling_30_Day_Comparison.png?raw=true "Rolling 30 Day Comparison") 51 | 52 | ![Run Days Heatmap](Run_Days_Heatmap.png?raw=true "Run Days Heatmap") 53 | 54 | ![Run Distance Distribution](Run_Distance_Distribution.png?raw=true "Run Distance Distribution") 55 | 56 | ![Run Rest Ratio Heatmap](Run_Rest_Ratio_Heatmap.png?raw=true "Run Rest Ratio Heatmap") 57 | 58 | ![Run Start Time By Month](Run_Start_Time_by_Month.png?raw=true "Run Start Time By Month") 59 | 60 | ![Run Type Clusters](Run_Type_Clusters.png?raw=true "Run Type Clusters") 61 | 
62 | ![Run Type Distribution By Year](Run_Type_Distribution_By_Year.png?raw=true "Run Type Distribution By Year")
63 |
64 | ![Running Pace Over Time](Running_Pace_over_Time.png?raw=true "Running Pace Over Time")
65 |
66 | ![Running Pace Vs Elevation Change](Running_Pace_vs_Elevation_Change.png?raw=true "Running Pace Vs Elevation Change")
67 |
68 | ![Running Pace Vs Total Distance](Running_Pace_vs_Total_Distance.png?raw=true "Running Pace Vs Total Distance")
69 |
70 | ![Time Taken Distance](Time_Taken_Distance.png?raw=true "Time Taken Distance")
71 |
72 | ![Time Taken Distance Recent Years](Time_Taken_Distance_Recent_Years.png?raw=true "Time Taken Distance Recent Years")
73 |
74 | ![Total Distance Ran By Month](Total_Distance_Ran_by_Month.png?raw=true "Total Distance Ran By Month")
75 |
76 | ![Training Intensity By Heartrate Zone](Training_Intensity_by_HeartRate_Zone.png?raw=true "Training Intensity By Heartrate Zone")
77 |
78 | ![Training Load Over Time](Training_Load_Over_Time.png?raw=true "Training Load Over Time")
79 |
80 | ![Vo2 Proxy Over Time](VO2_Proxy_Over_Time.png?raw=true "Vo2 Proxy Over Time")
81 |
82 | ## Instructions
83 | As I'm sure is obvious, I'm teaching myself Python as I go, so the code quality is not likely to be great. Do with it as you wish.
84 |
85 | 1. To use, create an Application on Strava. This can be done here: https://www.strava.com/settings/api
86 |
87 | Give it a name, a website and an 'Authorization Callback Domain'. The 'Authorization Callback Domain' should be 'localhost'.
88 |
89 | 2. Copy and paste the following link into your browser, replacing {CLIENTIDHERE} with your numeric Client ID found on your Strava application settings page.
90 |
91 | > http://www.strava.com/oauth/authorize?client_id={CLIENTIDHERE}&response_type=code&redirect_uri=http://localhost/exchange_token&approval_prompt=force&scope=profile:read_all,activity:read_all
92 |
93 | Click authorise when you visit the above link.
94 |
95 | 3. You will go to a 404 not found page with a link that looks like this: -
96 |
97 | > http://localhost/exchange_token?state=&code={LONGCODEHERE}&scope=read,activity:read_all,profile:read_all
98 |
99 | Copy the code after '&code=' to save for step 4. You will also need your client ID and client secret found on your Strava application settings page.
100 |
101 | 4. Run 'get_tokens.py'. This will create the initial tokens required for the script.
102 |
103 | Once this has been completed, you can run 'main.py' which uses the tokens to get the data points. If the access_token has expired, it will refresh its tokens automatically during run time.
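As an aside to step 2, the same authorization URL can be assembled programmatically. A throwaway sketch using only the standard library; the client ID is a placeholder, and the parameters simply mirror the URL quoted above:

```python
from urllib.parse import urlencode

CLIENT_ID = "12345"  # placeholder - use the numeric ID from your Strava settings page

params = {
    "client_id": CLIENT_ID,
    "response_type": "code",
    "redirect_uri": "http://localhost/exchange_token",
    "approval_prompt": "force",
    "scope": "profile:read_all,activity:read_all",
}
# Prints the same authorization URL as step 2 of the instructions above
print("https://www.strava.com/oauth/authorize?" + urlencode(params))
```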
--------------------------------------------------------------------------------
/src/generate_readme.py:
--------------------------------------------------------------------------------
1 | """
2 | Generates an updated README.md at the top-level of the repository.
3 | """
4 |
5 | import os
6 | from datetime import datetime
7 | from dateutil.relativedelta import relativedelta
8 |
9 | from strava_data.db.dao import decrypt_database, encrypt_database, get_last_run_time
10 | from utils.logger import get_logger
11 |
12 | LOGGER = get_logger()
13 |
14 | # Build a path to the README.md in the top-level directory
15 | README_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "README.md")
16 |
17 |
18 | def generate_readme() -> None:
19 |     """
20 |     1. Decrypts the DB if needed.
21 |     2. Fetches the last run from the activities table.
22 |     3. Calculates how long ago it was.
23 |     4. Rebuilds README.md in the top-level directory with embedded graphs.
24 |     5. Encrypts DB again if desired.
25 |     """
26 |     LOGGER.info("Start generate_readme.")
27 |     decrypt_database()
28 |
29 |     last_updated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
30 |     last_run_time = get_last_run_time()
31 |     time_string = "No runs found!"
32 |     if last_run_time is not None:
33 |         delta = relativedelta(datetime.now(), last_run_time)
34 |         time_string = (
35 |             f"{delta.years} years, "
36 |             f"{delta.months} months, "
37 |             f"{delta.days} days, "
38 |             f"{delta.hours} hours and "
39 |             f"{delta.minutes} minutes"
40 |         )
41 |
42 |     encrypt_database()
43 |     if os.path.exists(README_PATH):
44 |         os.remove(README_PATH)
45 |
46 |     readme_dir = os.path.dirname(README_PATH)
47 |
48 |     with open(README_PATH, "w", encoding="utf-8") as handle:
49 |         handle.write("# StravaDataAnalysis\n")
50 |         handle.write(
51 |             "This repository extracts data from the Strava API, which is downstream of Garmin "
52 |             "devices, stores it locally (encrypted), and generates visualizations.\n\n"
53 |             "If other people start using this, I'll try and streamline this process as much as I "
54 |             "can.\n\n"
55 |             "[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)]"
56 |             "(http://unlicense.org/)\n"
57 |             "[![CodeFactor]("
58 |             "https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis/badge)]"
59 |             "(https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis)\n"
60 |             "[![Codacy Badge]("
61 |             "https://api.codacy.com/project/badge/Grade/9f08e367a5594645aa30c1e31c54dbb8)]"
62 |             "(https://app.codacy.com/gh/c-wilkinson/StravaDataAnalysis?"
63 |             "utm_source=github.com&utm_medium=referral"
64 |             "&utm_content=c-wilkinson/StravaDataAnalysis&utm_campaign=Badge_Grade)\n"
65 |             "[![CodeTest](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/"
66 |             "test-code.yml/badge.svg)]"
67 |             "(https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/test-code.yml)\n"
68 |             "[![GenerateStats](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/"
69 |             "generate-stats.yml/badge.svg)]"
70 |             "(https://github.com/c-wilkinson/StravaDataAnalysis/actions"
71 |             "/workflows/generate-stats.yml)\n"
72 |             "[![CodeQL](https://github.com/"
73 |             "c-wilkinson/StravaDataAnalysis/actions/workflows/codeql-analysis.yml/"
74 |             "badge.svg)]"
75 |             "(https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/"
76 |             "codeql-analysis.yml)\n\n"
77 |         )
78 |         handle.write("## Generated Content\n")
79 |         handle.write(f"📅 Stats last updated on: **{last_updated}**\n\n")
80 |         handle.write(f"🏃‍♂️ Most recent run: {time_string}\n\n")
81 |
82 |         # Dynamically insert all PNG images
83 |         image_files = sorted(f for f in os.listdir(readme_dir) if f.endswith(".png"))
84 |         for image in image_files:
85 |             title = image.replace("_", " ").replace(".png", "").title()
86 |             LOGGER.info("Adding %s to readme.md", title)
87 |             handle.write(f'![{title}]({image}?raw=true "{title}")\n\n')
88 |
89 |         handle.write("## Instructions\n")
90 |         handle.write(
91 |             "As I'm sure is obvious, I'm teaching myself Python as I go, so the code "
92 |             "quality is not "
93 |             "likely to be great. Do with it as you wish.\n\n"
94 |             "1. To use, create an Application on Strava. This can be done here: "
95 |             "https://www.strava.com/settings/api\n\n"
96 |             "Give it a name, a website and an 'Authorization Callback Domain'. The "
97 |             "'Authorization Callback "
98 |             "Domain' should be 'localhost'.\n\n"
99 |             "2. Copy and paste the following link into your browser, replacing {CLIENTIDHERE} "
100 |             "with your numeric "
101 |             "Client ID found on your Strava application settings page.\n\n"
102 |             "> http://www.strava.com/oauth/authorize?client_id={CLIENTIDHERE}&"
103 |             "response_type=code&redirect_uri="
104 |             "http://localhost/exchange_token&approval_prompt=force&scope="
105 |             "profile:read_all,activity:read_all\n\n"
106 |             "Click authorise when you visit the above link.\n\n"
107 |             "3. You will go to a 404 not found page with a link that looks like this: -\n\n"
108 |             "> http://localhost/exchange_token?state=&code={LONGCODEHERE}"
109 |             "&scope=read,activity:read_all,"
110 |             "profile:read_all\n\n"
111 |             "Copy the code after '&code=' to save for step 4. You will also need your "
112 |             "client ID and client secret "
113 |             "found on your Strava application settings page.\n\n"
114 |             "4. Run 'get_tokens.py'. This will create the initial tokens required for "
115 |             "the script.\n\n"
116 |             "Once this has been completed, you can run 'main.py' which uses the tokens "
117 |             "to get the data points. "
118 |             "If the access_token has expired, it will refresh its tokens automatically "
119 |             "during run time."
120 |         )
121 |
122 |
123 | if __name__ == "__main__":
124 |     generate_readme()
125 |
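One subtlety in the image loop above: `str.title()` lowercases every character after the first letter of each word, which is why `VO2_Proxy_Over_Time.png` shows up in the generated README as "Vo2 Proxy Over Time". A quick demonstration:

```python
# Reproduces the title derivation from generate_readme.py
image = "VO2_Proxy_Over_Time.png"
title = image.replace("_", " ").replace(".png", "").title()
print(title)  # -> "Vo2 Proxy Over Time", not "VO2 Proxy Over Time"
```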
Copy and paste the following link into your browser, replacing {CLIENTIDHERE} " 100 | "with your numeric " 101 | "Client ID found on your Strava application settings page.\n\n" 102 | "> https://www.strava.com/oauth/authorize?client_id={CLIENTIDHERE}&" 103 | "response_type=code&redirect_uri=" 104 | "http://localhost/exchange_token&approval_prompt=force&scope=" 105 | "profile:read_all,activity:read_all\n\n" 106 | "Click authorise when you visit the above link.\n\n" 107 | "3. You will go to a 404 not found page with a link that looks like this:\n\n" 108 | "> http://localhost/exchange_token?state=&code={LONGCODEHERE}" 109 | "&scope=read,activity:read_all," 110 | "profile:read_all\n\n" 111 | "Copy the code after '&code=' and save it for step 4. You will also need your " 112 | "client ID and client secret " 113 | "found on your Strava application settings page.\n\n" 114 | "4. Run 'get_tokens.py'. This will create the initial tokens required for " 115 | "the script.\n\n" 116 | "Once this has been completed, you can run 'main.py', which uses the tokens " 117 | "to get the data points. " 118 | "If the access_token has expired, it will refresh its tokens automatically " 119 | "during run time." 120 | ) 121 | 122 | 123 | if __name__ == "__main__": 124 | generate_readme() 125 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_effort.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the effort chart functions, each saving a PNG file. 3 | """ 4 | 5 | import matplotlib.dates as mdates 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | 10 | from strava_data.strava_api.visualisation import utils 11 | 12 | 13 | def plot_elevation_gain_per_km_by_month(activities_df: pd.DataFrame, output_path: str) -> None: 14 | """ 15 | Plots average elevation gain per km for each month, per year. 16 | - X-axis: Month (Jan–Dec) 17 | - Y-axis: Elevation gain per km 18 | - Line series: one per year 19 | """ 20 | data = utils.prepare_dated_activities(activities_df) 21 | 22 | monthly_stats = ( 23 | data.groupby(["year", "month"]) 24 | .agg({"distance_km": "sum", "total_elevation_gain_m": "sum"}) 25 | .reset_index() 26 | ) 27 | 28 | monthly_stats = monthly_stats[monthly_stats["distance_km"] > 0] 29 | monthly_stats["elev_gain_per_km"] = ( 30 | monthly_stats["total_elevation_gain_m"] / monthly_stats["distance_km"] 31 | ) 32 | 33 | def plot_fn(axis): 34 | for year in sorted(monthly_stats["year"].unique()): 35 | year_data = monthly_stats[monthly_stats["year"] == year].sort_values("month") 36 | axis.plot( 37 | year_data["month"], year_data["elev_gain_per_km"], marker="o", label=str(year) 38 | ) 39 | utils.label_month_axis(axis) 40 | axis.legend(title="Year") 41 | 42 | # pylint: disable=R0801 43 | utils.plot_with_common_setup( 44 | title="Elevation Gain per km by Month", 45 | xlabel="Month", 46 | ylabel="Elevation Gain (m/km)", 47 | output_path=output_path, 48 | plot_func=plot_fn, 49 | ) 50 | # pylint: enable=R0801 51 | 52 | 53 | def plot_cadence_over_time(activities_df: pd.DataFrame, output_path: str) -> None: 54 | """ 55 | Scatter plot of average cadence over time with trend line. 
56 | - Filters to activities with cadence > 0 57 | """ 58 | data = utils.prepare_dated_activities(activities_df) 59 | data = data[data["average_cadence"] > 0] 60 | if data.empty: 61 | return 62 | 63 | data["start_date"] = pd.to_datetime(data["start_date_local"]) 64 | data = data.sort_values("start_date") 65 | data["start_date_num"] = mdates.date2num(data["start_date"]) 66 | 67 | def plot_fn(axis): 68 | sns.scatterplot(data=data, x="start_date", y="average_cadence", alpha=0.5, ax=axis) 69 | sns.regplot( 70 | data=data, 71 | x="start_date_num", 72 | y="average_cadence", 73 | scatter=False, 74 | color="black", 75 | line_kws={"linestyle": "--"}, 76 | ax=axis, 77 | ) 78 | for label in axis.get_xticklabels(): 79 | label.set_rotation(45) 80 | 81 | # pylint: disable=R0801 82 | utils.plot_with_common_setup( 83 | title="Average Cadence Over Time", 84 | xlabel="Date", 85 | ylabel="Cadence (steps per minute)", 86 | output_path=output_path, 87 | plot_func=plot_fn, 88 | ) 89 | # pylint: enable=R0801 90 | 91 | 92 | def plot_effort_score_over_time(activities_df: pd.DataFrame, output_path: str) -> None: 93 | """ 94 | Line plot showing calculated effort score over time. 95 | effort = (distance_km * 10) + (elevation_gain_m * 1.5) 96 | """ 97 | data = utils.prepare_dated_activities(activities_df) 98 | data["effort_score"] = (data["distance_km"] * 10) + (data["total_elevation_gain_m"] * 1.5) 99 | data["rolling_effort"] = data["effort_score"].rolling(window=7).mean()  # mean over the last 7 runs, not 7 calendar days 100 | 101 | def plot_fn(axis): 102 | axis.plot( 103 | data["start_date"], data["rolling_effort"], label="7-run Avg Effort", color="blue" 104 | ) 105 | axis.legend() 106 | axis.grid(True) 107 | 108 | # pylint: disable=R0801 109 | utils.plot_with_common_setup( 110 | title="Training Load (Effort Score) Over Time", 111 | xlabel="Date", 112 | ylabel="Effort Score", 113 | output_path=output_path, 114 | plot_func=plot_fn, 115 | ) 116 | # pylint: enable=R0801 117 | 118 | 119 | def plot_vo2_proxy_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 120 | """ 121 | Estimates a VO₂ max–style fitness proxy using 1 km split pace over time. 122 | 123 | VO₂ proxy = 15.0 × (speed in m/s), where speed = distance / time for fastest split per month. 124 | 125 | Produces a line chart per year showing how top-end aerobic fitness changes across months. 
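    Worked example with illustrative numbers (not taken from the data): a
    fastest monthly 1 km split of 250 s gives speed 1000 / 250 = 4.0 m/s,
    so the proxy is 15.0 * 4.0 = 60.0.

        >>> 15.0 * (1000.0 / 250.0)
        60.0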
126 | """ 127 | data = utils.prepare_dated_activities(splits_df) 128 | if data.empty: 129 | return 130 | 131 | data["pace_sec_km"] = data["elapsed_time_s"] / data["distance_km"] 132 | data["speed_mps"] = data["distance_m"] / data["elapsed_time_s"] 133 | data["vo2_proxy"] = 15.0 * data["speed_mps"] 134 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 135 | data["month"] = pd.to_datetime(data["start_date_local"]).dt.month 136 | 137 | monthly = data.groupby(["year", "month"])["vo2_proxy"].max().reset_index() 138 | 139 | rows = [] 140 | for year in sorted(monthly["year"].unique()): 141 | for month in range(1, 13): 142 | match = monthly[(monthly["year"] == year) & (monthly["month"] == month)] 143 | value = match["vo2_proxy"].values[0] if not match.empty else np.nan 144 | rows.append({"year": year, "month": month, "vo2_proxy": value}) 145 | 146 | plot_df = pd.DataFrame(rows) 147 | plot_df["vo2_proxy"] = plot_df.groupby("year")["vo2_proxy"].ffill() 148 | 149 | def plot_fn(axis): 150 | for year in sorted(plot_df["year"].unique()): 151 | sub = plot_df[plot_df["year"] == year] 152 | axis.plot(sub["month"], sub["vo2_proxy"], marker="o", label=str(year)) 153 | utils.label_month_axis(axis) 154 | axis.legend(title="Year") 155 | axis.grid(True) 156 | 157 | # pylint: disable=R0801 158 | utils.plot_with_common_setup( 159 | title="Estimated VO₂ Max Proxy Over Time", 160 | xlabel="Month", 161 | ylabel="VO₂ Proxy", 162 | output_path=output_path, 163 | plot_func=plot_fn, 164 | ) 165 | # pylint: enable=R0801 166 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Client code to call Strava's API endpoints, with rate-limiting control. 3 | """ 4 | 5 | import time 6 | from typing import Optional 7 | from datetime import datetime 8 | 9 | import requests 10 | import pandas as pd 11 | 12 | from strava_data.db.dao import read_tokens, insert_activities, get_latest_activity_date 13 | from strava_data.strava_api.processing.transform import transform_activities 14 | from utils.logger import get_logger 15 | 16 | LOGGER = get_logger() 17 | 18 | MAX_REQUESTS_15_MIN = 100 19 | MAX_REQUESTS_DAY = 1000 20 | RATE_LIMIT_15_MIN_SECONDS = 15 * 60 21 | RATE_LIMIT_24_HOURS_SECONDS = 24 * 60 * 60 22 | 23 | 24 | class RateLimiter: 25 | def __init__(self): 26 | self.last_request_time = None 27 | self.request_count = 0 28 | 29 | def update(self): 30 | self.last_request_time = time.time() 31 | self.request_count += 1 32 | 33 | def reset(self): 34 | self.last_request_time = None 35 | self.request_count = 0 36 | 37 | def should_wait(self) -> bool: 38 | if not self.last_request_time: 39 | return False 40 | elapsed = time.time() - self.last_request_time 41 | return elapsed < RATE_LIMIT_15_MIN_SECONDS and self.request_count >= MAX_REQUESTS_15_MIN 42 | 43 | def wait_if_needed(self): 44 | if self.should_wait(): 45 | wait_time = RATE_LIMIT_15_MIN_SECONDS - (time.time() - self.last_request_time) 46 | LOGGER.warning("15-min rate limit reached. Waiting %f seconds.", wait_time) 47 | time.sleep(wait_time) 48 | self.reset() 49 | 50 | 51 | rate_limiter = RateLimiter() 52 | 53 | 54 | def fetch_activities(per_page: int = 30) -> pd.DataFrame: 55 | tokens = read_tokens() 56 | if not tokens: 57 | LOGGER.warning("No stored tokens. 
Returning empty DataFrame.") 58 | return pd.DataFrame() 59 | 60 | headers = {"Authorization": f"Bearer {tokens.get('access_token', '')}"} 61 | 62 | latest_str = get_latest_activity_date() 63 | if latest_str: 64 | latest_dt = datetime.strptime(latest_str, "%Y-%m-%dT%H:%M:%SZ") 65 | after_unix = int(latest_dt.timestamp()) 66 | LOGGER.info("Fetching activities after %s (UNIX %d)", latest_str, after_unix) 67 | else: 68 | after_unix = 0 69 | LOGGER.info("No existing activities in DB, fetching all from start.") 70 | 71 | all_activities = pd.DataFrame() 72 | page = 1 73 | 74 | while True: 75 | LOGGER.info("Fetching page %d of activities", page) 76 | params = {"per_page": per_page, "page": page, "after": after_unix} 77 | response_data = _make_api_request( 78 | "https://www.strava.com/api/v3/athlete/activities", headers, params 79 | ) 80 | 81 | if response_data is None: 82 | LOGGER.error("No data returned (None). Stopping fetch.") 83 | break 84 | 85 | if ( 86 | isinstance(response_data, dict) 87 | and response_data.get("message") == "Rate Limit Exceeded" 88 | ): 89 | LOGGER.warning( 90 | "Strava rate limit exceeded. Waiting %d seconds.", RATE_LIMIT_15_MIN_SECONDS 91 | ) 92 | time.sleep(RATE_LIMIT_15_MIN_SECONDS) 93 | rate_limiter.reset() 94 | continue 95 | 96 | if isinstance(response_data, list): 97 | page_df = pd.DataFrame(response_data) 98 | if page_df.empty: 99 | LOGGER.info("No more activities on page %d. Ending fetch.", page) 100 | break 101 | 102 | all_activities = pd.concat([all_activities, page_df], ignore_index=True) 103 | transformed_df = transform_activities(page_df) 104 | insert_activities(transformed_df) 105 | 106 | page += 1 107 | else: 108 | LOGGER.error("Unexpected response data type: %s", type(response_data)) 109 | break 110 | 111 | LOGGER.info("Fetched a total of %d activities", len(all_activities)) 112 | return all_activities 113 | 114 | 115 | def fetch_splits_if_needed(activities_df: pd.DataFrame) -> pd.DataFrame: 116 | tokens = read_tokens() 117 | if not tokens: 118 | return pd.DataFrame() 119 | 120 | headers = {"Authorization": f"Bearer {tokens.get('access_token', '')}"} 121 | all_splits = pd.DataFrame() 122 | 123 | for _, row in activities_df.iterrows(): 124 | if str(row.get("type", "")).lower() != "run": 125 | continue 126 | 127 | activity_id = row.get("id") 128 | if not activity_id: 129 | continue 130 | 131 | splits_url = f"https://www.strava.com/api/v3/activities/{activity_id}" 132 | splits_data = _make_api_request(splits_url, headers, None) 133 | 134 | if isinstance(splits_data, dict) and splits_data.get("message") == "Rate Limit Exceeded": 135 | LOGGER.warning( 136 | "Hit 429 fetching splits for activity %d. 
Waiting %d seconds, then retrying once.", 137 | activity_id, 138 | RATE_LIMIT_15_MIN_SECONDS, 139 | ) 140 | time.sleep(RATE_LIMIT_15_MIN_SECONDS) 141 | rate_limiter.reset() 142 | splits_data = _make_api_request(splits_url, headers, None)  # the single retry promised above 143 | 144 | if not splits_data or "splits_metric" not in splits_data: 145 | continue 146 | 147 | df_splits = pd.DataFrame(splits_data["splits_metric"]) 148 | df_splits["activity_id"] = activity_id 149 | df_splits["start_date_local"] = splits_data.get("start_date_local", "") 150 | all_splits = pd.concat([all_splits, df_splits], ignore_index=True) 151 | 152 | return all_splits 153 | 154 | 155 | def _make_api_request(url: str, headers: dict, params: Optional[dict]) -> "list | dict | None": 156 | rate_limiter.wait_if_needed() 157 | 158 | try: 159 | response = requests.get(url, headers=headers, params=params, timeout=10) 160 | except requests.exceptions.Timeout: 161 | LOGGER.error("Timeout occurred while making API request to %s", url) 162 | return None 163 | 164 | if response.status_code == 429: 165 | LOGGER.error("HTTP 429: Rate Limit Exceeded by Strava") 166 | rate_limiter.reset() 167 | return {"message": "Rate Limit Exceeded"} 168 | 169 | if not response.ok: 170 | LOGGER.error( 171 | "Request failed. Status: %d, Response: %s", 172 | response.status_code, 173 | response.text, 174 | ) 175 | return None 176 | 177 | rate_limiter.update() 178 | 179 | try: 180 | return response.json() 181 | except ValueError: 182 | LOGGER.error("Invalid JSON response from %s", url) 183 | return None 184 | 185 | 186 | def get_latest_activity_unix_timestamp() -> int: 187 | latest_str = get_latest_activity_date() 188 | if not latest_str: 189 | return 0 190 | 191 | latest_dt = datetime.strptime(latest_str, "%Y-%m-%dT%H:%M:%SZ") 192 | return int(latest_dt.timestamp()) 193 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entry point for running Strava data retrieval, processing, and visualization. 3 | """ 4 | 5 | import argparse 6 | import pandas as pd 7 | 8 | from utils.logger import get_logger 9 | from strava_data.auth import get_or_refresh_tokens 10 | from strava_data.db.dao import ( 11 | decrypt_database, 12 | encrypt_database, 13 | init_database, 14 | insert_activities, 15 | insert_splits, 16 | load_all_activities, 17 | load_all_splits, 18 | ) 19 | from strava_data.strava_api.client import fetch_activities, fetch_splits_if_needed 20 | from strava_data.strava_api.processing.transform import transform_activities, transform_splits 21 | from strava_data.strava_api.visualisation import ( 22 | graphs_distribution, 23 | graphs_distance, 24 | graphs_pace, 25 | graphs_effort, 26 | ) 27 | from strava_data.strava_api.visualisation.utils import configure_matplotlib_styles 28 | from strava_data.ml.pace_forecast import run_forecast_pipeline 29 | from strava_data.ml.run_type_classifier import run_clustering_pipeline 30 | from strava_data.ml.training_advisor import generate_training_plan_chart 31 | 32 | configure_matplotlib_styles() 33 | LOGGER = get_logger() 34 | 35 | 36 | def main(skip_fetch: bool = False) -> None: 37 | """ 38 | Orchestrates the full flow: auth, DB prep, fetch, transform, visualize. 39 | """ 40 | LOGGER.info("Start main.") 41 | decrypt_database() 42 | init_database() 43 | 44 | if not skip_fetch: 45 | process_new_activities() 46 | else: 47 | LOGGER.info("Skipping fetch. 
Using existing database contents.") 48 | 49 | LOGGER.info("Running chart generation...") 50 | generate_charts_from_db() 51 | encrypt_database() 52 | LOGGER.info("Done.") 53 | 54 | 55 | def process_new_activities() -> None: 56 | """ 57 | Authenticates and processes newly fetched Strava activities and splits. 58 | """ 59 | get_or_refresh_tokens() 60 | new_activities = fetch_activities(per_page=50) 61 | 62 | if new_activities.empty: 63 | LOGGER.info("No new activities detected") 64 | return 65 | 66 | LOGGER.info("New activities detected, processing...") 67 | new_splits = fetch_splits_if_needed(new_activities) 68 | transformed_activities = transform_activities(new_activities) 69 | transformed_splits = transform_splits(new_splits) 70 | insert_activities(transformed_activities) 71 | insert_splits(transformed_splits) 72 | LOGGER.info("New activities processed") 73 | 74 | 75 | def generate_charts_from_db() -> None: 76 | """ 77 | Loads all data from the database and triggers chart generation. 78 | """ 79 | all_activities = load_all_activities() 80 | all_splits = load_all_splits() 81 | generate_required_charts(all_activities, all_splits) 82 | LOGGER.info("Running pace forecast pipeline...") 83 | run_forecast_pipeline(all_splits) 84 | LOGGER.info("Running run type clustering pipeline...") 85 | run_clustering_pipeline(all_splits) 86 | LOGGER.info("Generating training plan...") 87 | generate_training_plan_chart(all_activities, all_splits, "A.I._Recommended_Training.png") 88 | 89 | 90 | def generate_required_charts(activities_df: pd.DataFrame, splits_df: pd.DataFrame) -> None: 91 | """ 92 | Produces visualisations from activity and split data. 93 | """ 94 | generate_pace_and_distance_charts(activities_df, splits_df) 95 | generate_distribution_and_heatmaps(activities_df, splits_df) 96 | generate_time_series_and_trends(activities_df, splits_df) 97 | 98 | 99 | def generate_pace_and_distance_charts(activities_df: pd.DataFrame, splits_df: pd.DataFrame) -> None: 100 | LOGGER.info("Generate Running_Pace_vs_Elevation_Change") 101 | graphs_pace.plot_pace_vs_elevation_change(splits_df, "Running_Pace_vs_Elevation_Change.png") 102 | LOGGER.info("Generate Time_Taken_Distance") 103 | graphs_distance.plot_time_taken_over_distances(activities_df, "Time_Taken_Distance.png") 104 | LOGGER.info("Generate Time_Taken_Distance_Recent_Years") 105 | graphs_distance.plot_time_taken_over_distances_recent_years( 106 | activities_df, "Time_Taken_Distance_Recent_Years.png" 107 | ) 108 | LOGGER.info("Generate Running_Pace_over_Time") 109 | graphs_pace.plot_running_pace_over_time(splits_df, "Running_Pace_over_Time.png") 110 | LOGGER.info("Generate Running_Pace_vs_Total_Distance") 111 | graphs_distance.plot_pace_vs_total_distance(splits_df, "Running_Pace_vs_Total_Distance.png") 112 | LOGGER.info("Generate Number_of_Runs_per_Distance") 113 | graphs_distance.plot_number_of_runs_per_distance( 114 | activities_df, "Number_of_Runs_per_Distance.png" 115 | ) 116 | LOGGER.info("Generate Fastest_1k_Pace_over_Time") 117 | graphs_pace.plot_fastest_1km_pace_over_time(splits_df, "Fastest_1k_Pace_over_Time.png") 118 | LOGGER.info("Generate Median_1k_Pace_over_Time") 119 | graphs_pace.plot_median_1km_pace_over_time(splits_df, "Median_1k_Pace_over_Time.png") 120 | LOGGER.info("Generate Total_Distance_Ran_by_Month") 121 | graphs_distance.plot_total_distance_by_month(activities_df, "Total_Distance_Ran_by_Month.png") 122 | LOGGER.info("Generate Pace_by_Day") 123 | graphs_pace.plot_pace_by_day_of_week(splits_df, "Pace_by_Day.png") 124 | 125 | 126 | def 
generate_distribution_and_heatmaps( 127 | activities_df: pd.DataFrame, splits_df: pd.DataFrame 128 | ) -> None: 129 | LOGGER.info("Generate Activity_Heatmap") 130 | graphs_distribution.plot_heatmap_activities(activities_df, "Activity_Heatmap.png") 131 | LOGGER.info("Generate Run_Distance_Distribution") 132 | graphs_distribution.plot_run_distance_distribution( 133 | activities_df, "Run_Distance_Distribution.png" 134 | ) 135 | LOGGER.info("Generate Pace_Distribution") 136 | graphs_distribution.plot_pace_distribution(splits_df, "Pace_Distribution.png") 137 | LOGGER.info("Generate Elevation_Gain_Distribution") 138 | graphs_distribution.plot_elevation_gain_distribution( 139 | activities_df, "Elevation_Gain_Distribution.png" 140 | ) 141 | LOGGER.info("Generate Run_Days_Heatmap") 142 | graphs_distribution.plot_run_days_heatmap(activities_df, "Run_Days_Heatmap.png") 143 | LOGGER.info("Generate Rest_Days_Heatmap") 144 | graphs_distribution.plot_rest_days_heatmap(activities_df, "Rest_Days_Heatmap.png") 145 | LOGGER.info("Generate Run_Rest_Ratio_Heatmap") 146 | graphs_distribution.plot_run_rest_ratio_heatmap(activities_df, "Run_Rest_Ratio_Heatmap.png") 147 | 148 | 149 | def generate_time_series_and_trends(activities_df: pd.DataFrame, splits_df: pd.DataFrame) -> None: 150 | LOGGER.info("Generate Cumulative_Distance") 151 | graphs_distance.plot_cumulative_distance_over_time(activities_df, "Cumulative_Distance.png") 152 | LOGGER.info("Generate Longest_Run_per_Month") 153 | graphs_distance.plot_longest_run_per_month(activities_df, "Longest_Run_per_Month.png") 154 | LOGGER.info("Generate Elevation_Gain_per_KM_by_Month") 155 | graphs_effort.plot_elevation_gain_per_km_by_month( 156 | activities_df, "Elevation_Gain_per_KM_by_Month.png" 157 | ) 158 | LOGGER.info("Generate Run_Start_Time_by_Month") 159 | graphs_distribution.plot_run_start_time_distribution( 160 | activities_df, "Run_Start_Time_by_Month.png" 161 | ) 162 | LOGGER.info("Generate Monthly_Distance_by_Year") 163 | graphs_distance.plot_monthly_distance_by_year_grouped( 164 | activities_df, "Monthly_Distance_by_Year.png" 165 | ) 166 | LOGGER.info("Generate Rolling_30_Day_Comparison") 167 | graphs_distance.plot_rolling_distance(activities_df, "Rolling_30_Day_Comparison.png", window=30) 168 | LOGGER.info("Generate Cadence_Over_Time") 169 | graphs_effort.plot_cadence_over_time(activities_df, "Cadence_Over_Time.png") 170 | LOGGER.info("Generate Training_Intensity_by_HeartRate_Zone") 171 | graphs_distribution.plot_heart_rate_zone_distribution( 172 | splits_df, "Training_Intensity_by_HeartRate_Zone.png" 173 | ) 174 | LOGGER.info("Generate Pace_Consistency_by_Run") 175 | graphs_pace.plot_pace_variability_per_run(splits_df, "Pace_Consistency_by_Run.png") 176 | LOGGER.info("Generate Training_Load_Over_Time") 177 | graphs_effort.plot_effort_score_over_time(activities_df, "Training_Load_Over_Time.png") 178 | LOGGER.info("Generate VO2_Proxy_Over_Time") 179 | graphs_effort.plot_vo2_proxy_over_time(splits_df, "VO2_Proxy_Over_Time.png") 180 | 181 | 182 | if __name__ == "__main__": 183 | parser = argparse.ArgumentParser(description="Process and visualize Strava data.") 184 | parser.add_argument("--skip-fetch", action="store_true", help="Skip fetching new activities.") 185 | args = parser.parse_args() 186 | main(skip_fetch=args.skip_fetch) 187 | -------------------------------------------------------------------------------- /src/strava_data/db/dao.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data Access Object (DAO) 
layer for reading/writing tokens and activity data. 3 | Uses pyAesCrypt to decrypt/encrypt the SQLite database file, but only once at program start/end. 4 | """ 5 | 6 | import os 7 | import sqlite3 8 | from os import stat 9 | from typing import Any, Dict, Optional 10 | from datetime import datetime 11 | 12 | import pandas as pd 13 | import pyAesCrypt 14 | 15 | from strava_data.config import get_buffer_size, get_encryption_password 16 | from utils.logger import get_logger 17 | 18 | LOGGER = get_logger() 19 | 20 | ENCRYPTED_DB_FILE = "strava.sqlite" 21 | TEMP_DB_FILE = "strava_temp.sqlite" 22 | 23 | 24 | def decrypt_database() -> None: 25 | """ 26 | Decrypts strava.sqlite into strava_temp.sqlite (if strava.sqlite exists). 27 | If strava_temp.sqlite already exists, skip to avoid double-decryption. 28 | Call this once at program start. 29 | """ 30 | if os.path.exists(TEMP_DB_FILE): 31 | LOGGER.warning("Database appears already decrypted. Skipping decryption.") 32 | return 33 | 34 | if not os.path.exists(ENCRYPTED_DB_FILE): 35 | LOGGER.warning( 36 | "Encrypted database file %s not found. Creating a new database.", 37 | ENCRYPTED_DB_FILE, 38 | ) 39 | init_database() 40 | return 41 | 42 | enc_file_size = stat(ENCRYPTED_DB_FILE).st_size 43 | password = get_encryption_password() 44 | buffer_size = get_buffer_size() 45 | 46 | LOGGER.info("Decrypting database...") 47 | with open(ENCRYPTED_DB_FILE, "rb") as f_in, open(TEMP_DB_FILE, "wb") as f_out: 48 | pyAesCrypt.decryptStream(f_in, f_out, password, buffer_size, enc_file_size) 49 | 50 | LOGGER.info("Decryption complete. Working with the unencrypted file now.") 51 | 52 | 53 | def encrypt_database() -> None: 54 | """ 55 | Encrypts strava_temp.sqlite back into strava.sqlite and removes strava_temp.sqlite. 56 | Call this once at program end. 57 | """ 58 | if not os.path.exists(TEMP_DB_FILE): 59 | LOGGER.warning("No decrypted DB file %s found to encrypt. Skipping.", TEMP_DB_FILE) 60 | return 61 | 62 | password = get_encryption_password() 63 | buffer_size = get_buffer_size() 64 | 65 | if os.path.exists(ENCRYPTED_DB_FILE): 66 | os.remove(ENCRYPTED_DB_FILE) 67 | 68 | LOGGER.info("Encrypting database back to strava.sqlite...") 69 | with open(TEMP_DB_FILE, "rb") as f_in, open(ENCRYPTED_DB_FILE, "wb") as f_out: 70 | pyAesCrypt.encryptStream(f_in, f_out, password, buffer_size) 71 | 72 | os.remove(TEMP_DB_FILE) 73 | LOGGER.info("Encryption complete.") 74 | 75 | 76 | def init_database() -> None: 77 | """ 78 | Creates required tables if they do not already exist in strava_temp.sqlite. 79 | Assumes decrypt_database() has already been called. 
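    A sketch of the call order this module expects (it mirrors main.py):

        decrypt_database()
        init_database()
        # ... insert/load activities and splits ...
        encrypt_database()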
80 | """ 81 | conn = sqlite3.connect(TEMP_DB_FILE) 82 | cur = conn.cursor() 83 | 84 | cur.execute( 85 | """ 86 | CREATE TABLE IF NOT EXISTS config ( 87 | token_type TEXT, 88 | access_token TEXT, 89 | expires_at INTEGER, 90 | expires_in INTEGER, 91 | refresh_token TEXT 92 | ); 93 | """ 94 | ) 95 | 96 | cur.execute( 97 | """ 98 | CREATE TABLE IF NOT EXISTS activities ( 99 | activity_id INTEGER PRIMARY KEY, 100 | name TEXT, 101 | activity_type TEXT, 102 | distance_m REAL, 103 | moving_time_s INTEGER, 104 | average_speed_m_s REAL, 105 | max_speed_m_s REAL, 106 | total_elevation_gain_m REAL, 107 | start_date_local TEXT, 108 | average_cadence REAL, 109 | is_outdoor INTEGER 110 | ); 111 | """ 112 | ) 113 | 114 | cur.execute( 115 | """ 116 | CREATE TABLE IF NOT EXISTS splits ( 117 | split_row_id INTEGER PRIMARY KEY AUTOINCREMENT, 118 | activity_id INTEGER, 119 | distance_m REAL, 120 | elapsed_time_s INTEGER, 121 | elevation_difference_m REAL, 122 | moving_time_s INTEGER, 123 | pace_zone INTEGER, 124 | split_index INTEGER, 125 | average_grade_adjusted_speed_m_s REAL, 126 | average_heartrate REAL, 127 | start_date_local TEXT, 128 | FOREIGN KEY(activity_id) REFERENCES activities(activity_id) 129 | ); 130 | """ 131 | ) 132 | 133 | conn.commit() 134 | conn.close() 135 | 136 | 137 | def store_tokens(tokens: Dict[str, Any]) -> None: 138 | conn = sqlite3.connect(TEMP_DB_FILE) 139 | cur = conn.cursor() 140 | cur.execute("DELETE FROM config;") 141 | cur.execute( 142 | """ 143 | INSERT INTO config (token_type, access_token, expires_at, expires_in, refresh_token) 144 | VALUES (?, ?, ?, ?, ?); 145 | """, 146 | ( 147 | tokens.get("token_type"), 148 | tokens.get("access_token"), 149 | tokens.get("expires_at"), 150 | tokens.get("expires_in"), 151 | tokens.get("refresh_token"), 152 | ), 153 | ) 154 | conn.commit() 155 | conn.close() 156 | 157 | 158 | def read_tokens() -> Optional[Dict[str, Any]]: 159 | conn = sqlite3.connect(TEMP_DB_FILE) 160 | cur = conn.cursor() 161 | cur.execute( 162 | """ 163 | SELECT token_type, access_token, expires_at, expires_in, refresh_token 164 | FROM config 165 | LIMIT 1; 166 | """ 167 | ) 168 | row = cur.fetchone() 169 | conn.close() 170 | 171 | if row: 172 | return { 173 | "token_type": row[0], 174 | "access_token": row[1], 175 | "expires_at": row[2], 176 | "expires_in": row[3], 177 | "refresh_token": row[4], 178 | } 179 | return None 180 | 181 | 182 | def insert_activities(activities_df: pd.DataFrame) -> None: 183 | if activities_df.empty: 184 | return 185 | 186 | conn = sqlite3.connect(TEMP_DB_FILE) 187 | cur = conn.cursor() 188 | 189 | for _, row in activities_df.iterrows(): 190 | cur.execute( 191 | """ 192 | INSERT OR IGNORE INTO activities ( 193 | activity_id, 194 | name, 195 | activity_type, 196 | distance_m, 197 | moving_time_s, 198 | average_speed_m_s, 199 | max_speed_m_s, 200 | total_elevation_gain_m, 201 | start_date_local, 202 | average_cadence, 203 | is_outdoor 204 | ) 205 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?); 206 | """, 207 | ( 208 | row.get("id"), 209 | row.get("name"), 210 | row.get("type"), 211 | row.get("distance_m", 0.0), 212 | row.get("moving_time_s", 0), 213 | row.get("average_speed_m_s", 0.0), 214 | row.get("max_speed_m_s", 0.0), 215 | row.get("total_elevation_gain_m", 0.0), 216 | row.get("start_date_local", ""), 217 | row.get("average_cadence", 0.0), 218 | row.get("is_outdoor", 1 if row.get("is_outdoor") else 0), 219 | ), 220 | ) 221 | conn.commit() 222 | conn.close() 223 | 224 | 225 | def insert_splits(splits_df: pd.DataFrame) -> None: 226 | if 
splits_df.empty: 227 | return 228 | 229 | conn = sqlite3.connect(TEMP_DB_FILE) 230 | cur = conn.cursor() 231 | 232 | for _, row in splits_df.iterrows(): 233 | cur.execute( 234 | """ 235 | INSERT INTO splits ( 236 | activity_id, 237 | distance_m, 238 | elapsed_time_s, 239 | elevation_difference_m, 240 | moving_time_s, 241 | pace_zone, 242 | split_index, 243 | average_grade_adjusted_speed_m_s, 244 | average_heartrate, 245 | start_date_local 246 | ) 247 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?); 248 | """, 249 | ( 250 | row.get("activity_id"), 251 | row.get("distance_m", 0.0), 252 | row.get("elapsed_time_s", 0), 253 | row.get("elevation_difference_m", 0.0), 254 | row.get("moving_time_s", 0), 255 | row.get("pace_zone", 0), 256 | row.get("split_index", 0), 257 | row.get("average_grade_adjusted_speed_m_s", 0.0), 258 | row.get("average_heartrate", None), 259 | row.get("start_date_local", ""), 260 | ), 261 | ) 262 | 263 | conn.commit() 264 | conn.close() 265 | 266 | 267 | def get_latest_activity_date() -> Optional[str]: 268 | conn = sqlite3.connect(TEMP_DB_FILE) 269 | cur = conn.cursor() 270 | cur.execute("SELECT MAX(start_date_local) FROM activities;") 271 | row = cur.fetchone() 272 | conn.close() 273 | 274 | if row and row[0]: 275 | return row[0] 276 | return None 277 | 278 | 279 | def load_all_activities() -> pd.DataFrame: 280 | conn = sqlite3.connect(TEMP_DB_FILE) 281 | activities_df = pd.read_sql_query("SELECT * FROM activities;", conn) 282 | conn.close() 283 | return activities_df 284 | 285 | 286 | def load_all_splits() -> pd.DataFrame: 287 | conn = sqlite3.connect(TEMP_DB_FILE) 288 | splits_df = pd.read_sql_query("SELECT * FROM splits;", conn) 289 | conn.close() 290 | return splits_df 291 | 292 | 293 | def get_last_run_time(): 294 | """ 295 | Retrieves the latest run timestamp from 'activities' in strava_temp.sqlite, 296 | returning a datetime object or None if no runs exist. 297 | """ 298 | if not os.path.exists(TEMP_DB_FILE): 299 | return None 300 | 301 | conn = sqlite3.connect(TEMP_DB_FILE) 302 | cur = conn.cursor() 303 | 304 | query = "SELECT MAX(start_date_local) FROM activities;" 305 | cur.execute(query) 306 | row = cur.fetchone() 307 | conn.close() 308 | 309 | if not row or not row[0]: 310 | return None 311 | 312 | try: 313 | return datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%SZ") 314 | except ValueError: 315 | return None 316 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_pace.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the pace chart functions, each saving a PNG file. 3 | """ 4 | 5 | import calendar 6 | import matplotlib.dates as mdates 7 | from matplotlib import ticker 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | 12 | from strava_data.strava_api.visualisation import utils 13 | 14 | 15 | def plot_pace_vs_elevation_change(splits_df: pd.DataFrame, output_path: str) -> None: 16 | """ 17 | Plot Running Pace vs. Elevation Change for 1km splits. 
18 | - y-axis: pace (mm:ss) 19 | - x-axis: elevation change (m) 20 | - Points coloured by year 21 | - Trend line included 22 | """ 23 | splits = utils.prepare_dated_activities(splits_df) 24 | splits = splits[ 25 | (splits["elevation_difference_m"] >= -100) & (splits["elevation_difference_m"] <= 100) 26 | ] 27 | splits["pace_s_km"] = splits["elapsed_time_s"] / (splits["distance_m"] / 1000) 28 | splits["year"] = pd.to_datetime(splits["start_date_local"]).dt.year 29 | 30 | def plot_fn(axis): 31 | sns.scatterplot( 32 | data=splits, 33 | x="elevation_difference_m", 34 | y="pace_s_km", 35 | hue="year", 36 | alpha=0.6, 37 | palette="viridis", 38 | ax=axis, 39 | ) 40 | sns.regplot( 41 | data=splits, 42 | x="elevation_difference_m", 43 | y="pace_s_km", 44 | scatter=False, 45 | color="black", 46 | line_kws={"linestyle": "--"}, 47 | ci=95, 48 | ax=axis, 49 | ) 50 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 51 | 52 | # pylint: disable=R0801 53 | utils.plot_with_common_setup( 54 | title="Running Pace vs. Elevation Change", 55 | xlabel="Elevation Change (m)", 56 | ylabel="Split Pace (mm:ss)", 57 | output_path=output_path, 58 | plot_func=plot_fn, 59 | ) 60 | # pylint: enable=R0801 61 | 62 | 63 | def plot_running_pace_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 64 | """ 65 | Running pace over time: 66 | - y-axis: 1 km pace (mm:ss) 67 | - x-axis: date 68 | - Points for each ~1 km split 69 | - Trend line to show changes 70 | """ 71 | data = utils.prepare_dated_activities(splits_df) 72 | if data.empty: 73 | return 74 | 75 | data["pace_sec_km"] = data["elapsed_time_s"] / data["distance_km"] 76 | data["datetime_obj"] = pd.to_datetime(data["start_date_local"], errors="coerce") 77 | data["date_numeric"] = mdates.date2num(data["datetime_obj"]) 78 | data.sort_values("date_numeric", inplace=True) 79 | 80 | def plot_fn(axis): 81 | sns.scatterplot(data=data, x="date_numeric", y="pace_sec_km", alpha=0.5, ax=axis) 82 | sns.regplot( 83 | data=data, 84 | x="date_numeric", 85 | y="pace_sec_km", 86 | scatter=False, 87 | ci=95, 88 | color="black", 89 | line_kws={"linestyle": "--"}, 90 | ax=axis, 91 | ) 92 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 93 | axis.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m")) 94 | axis.set_xticks(data["date_numeric"][:: len(data) // 10 if len(data) >= 10 else 1]) 95 | axis.set_xticklabels( 96 | [ 97 | d.strftime("%Y-%m") 98 | for d in data["datetime_obj"].iloc[:: len(data) // 10 if len(data) >= 10 else 1] 99 | ], 100 | rotation=45, 101 | ) 102 | 103 | # pylint: disable=R0801 104 | utils.plot_with_common_setup( 105 | title="Running Pace Over Time", 106 | xlabel="Date", 107 | ylabel="Pace (mm:ss)", 108 | output_path=output_path, 109 | plot_func=plot_fn, 110 | ) 111 | # pylint: enable=R0801 112 | 113 | 114 | def plot_fastest_1km_pace_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 115 | """ 116 | Plots the fastest 1km pace per month across all years. 
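    The monthly value is a plain groupby minimum; an illustrative example
    with made-up paces in seconds per km:

        >>> import pandas as pd
        >>> df = pd.DataFrame({"year": [2024, 2024], "month": [1, 1],
        ...                    "pace_sec_km": [312.0, 298.5]})
        >>> float(df.groupby(["year", "month"])["pace_sec_km"].min().iloc[0])
        298.5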
117 | """ 118 | split_data = utils.prepare_dated_activities(splits_df) 119 | if split_data.empty: 120 | return 121 | 122 | split_data["pace_sec_km"] = split_data["elapsed_time_s"] / split_data["distance_km"] 123 | split_data["year"] = pd.to_datetime(split_data["start_date_local"]).dt.year 124 | split_data["month"] = pd.to_datetime(split_data["start_date_local"]).dt.month 125 | 126 | monthly_fastest = split_data.groupby(["year", "month"])["pace_sec_km"].min().reset_index() 127 | 128 | all_years = sorted(monthly_fastest["year"].unique()) 129 | rows = [] 130 | for year in all_years: 131 | for month in range(1, 13): 132 | pace = monthly_fastest.loc[ 133 | (monthly_fastest["year"] == year) & (monthly_fastest["month"] == month), 134 | "pace_sec_km", 135 | ] 136 | pace_val = pace.values[0] if not pace.empty else np.nan 137 | rows.append({"year": year, "month": month, "pace_sec_km": pace_val}) 138 | 139 | plot_df = pd.DataFrame(rows) 140 | plot_df["pace_sec_km"] = plot_df.groupby("year")["pace_sec_km"].ffill() 141 | 142 | def plot_fn(axis): 143 | for year in sorted(plot_df["year"].unique()): 144 | year_data = plot_df[plot_df["year"] == year].sort_values("month") 145 | axis.plot(year_data["month"], year_data["pace_sec_km"], marker="o", label=str(year)) 146 | utils.label_month_axis(axis) 147 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 148 | axis.legend(title="Year") 149 | 150 | # pylint: disable=R0801 151 | utils.plot_with_common_setup( 152 | title="Fastest 1 km Pace Over Time", 153 | xlabel="Month", 154 | ylabel="Fastest Pace (mm:ss)", 155 | output_path=output_path, 156 | plot_func=plot_fn, 157 | ) 158 | # pylint: enable=R0801 159 | 160 | 161 | def plot_median_1km_pace_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 162 | """ 163 | Plots the median 1km pace per month across all years. 
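    Months without runs are forward-filled within each year so the lines stay
    continuous; an illustrative example of the ffill step:

        >>> import pandas as pd
        >>> pd.Series([300.0, None, 280.0]).ffill().tolist()
        [300.0, 300.0, 280.0]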
164 | """ 165 | split_data = utils.prepare_dated_activities(splits_df) 166 | if split_data.empty: 167 | return 168 | 169 | split_data["pace_sec_km"] = split_data["elapsed_time_s"] / split_data["distance_km"] 170 | split_data["year"] = pd.to_datetime(split_data["start_date_local"]).dt.year 171 | split_data["month"] = pd.to_datetime(split_data["start_date_local"]).dt.month 172 | 173 | monthly_medians = split_data.groupby(["year", "month"])["pace_sec_km"].median().reset_index() 174 | 175 | all_years = monthly_medians["year"].unique() 176 | rows = [] 177 | for year in all_years: 178 | for month in range(1, 13): 179 | val = monthly_medians.loc[ 180 | (monthly_medians["year"] == year) & (monthly_medians["month"] == month), 181 | "pace_sec_km", 182 | ] 183 | pace_val = val.values[0] if not val.empty else np.nan 184 | rows.append({"year": year, "month": month, "pace_sec_km": pace_val}) 185 | 186 | plot_df = pd.DataFrame(rows) 187 | plot_df["pace_sec_km"] = plot_df.groupby("year")["pace_sec_km"].ffill() 188 | 189 | def plot_fn(axis): 190 | for year in sorted(plot_df["year"].unique()): 191 | year_data = plot_df[plot_df["year"] == year].sort_values("month") 192 | axis.plot( 193 | year_data["month"], 194 | year_data["pace_sec_km"], 195 | marker="o", 196 | linestyle="-", 197 | label=str(year), 198 | ) 199 | utils.label_month_axis(axis) 200 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 201 | axis.legend(title="Year") 202 | 203 | # pylint: disable=R0801 204 | utils.plot_with_common_setup( 205 | title="Median 1 km Pace Over Time", 206 | xlabel="Month", 207 | ylabel="Median Pace (mm:ss)", 208 | output_path=output_path, 209 | plot_func=plot_fn, 210 | ) 211 | # pylint: enable=R0801 212 | 213 | 214 | def plot_pace_by_day_of_week(splits_df: pd.DataFrame, output_path: str) -> None: 215 | """ 216 | Pace by Day of Week: 217 | - y-axis: 1 km pace (mm:ss), x-axis: day of week 218 | - Box plot filtered for ~1 km splits 219 | """ 220 | split_data = utils.prepare_dated_activities(splits_df) 221 | if split_data.empty: 222 | return 223 | 224 | split_data["pace_sec_km"] = split_data["elapsed_time_s"] / split_data["distance_km"] 225 | split_data["day_of_week"] = pd.to_datetime(split_data["start_date_local"]).dt.day_name() 226 | ordered_days = list(calendar.day_name) 227 | 228 | def plot_fn(axis): 229 | sns.boxplot(data=split_data, x="day_of_week", y="pace_sec_km", order=ordered_days, ax=axis) 230 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 231 | 232 | # pylint: disable=R0801 233 | utils.plot_with_common_setup( 234 | title="Pace by Day of Week", 235 | xlabel="Day of Week", 236 | ylabel="Pace (mm:ss)", 237 | output_path=output_path, 238 | plot_func=plot_fn, 239 | ) 240 | # pylint: enable=R0801 241 | 242 | 243 | def plot_pace_variability_per_run(splits_df: pd.DataFrame, output_path: str) -> None: 244 | """ 245 | Plots the standard deviation of pace (in sec/km) for each run over time. 246 | Only includes activities with at least 3 ~1 km splits. 
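    The plotted value is the sample standard deviation of a run's split paces;
    an illustrative example with made-up splits:

        >>> import pandas as pd
        >>> float(pd.Series([290.0, 300.0, 310.0]).std())
        10.0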
247 | """ 248 | splits = utils.prepare_dated_activities(splits_df) 249 | if splits.empty: 250 | return 251 | 252 | splits["pace_sec_km"] = splits["elapsed_time_s"] / splits["distance_km"] 253 | 254 | grouped = ( 255 | splits.groupby(["activity_id", "start_date_local"]) 256 | .agg(pace_std=("pace_sec_km", "std"), split_count=("pace_sec_km", "count")) 257 | .reset_index() 258 | ) 259 | 260 | grouped = grouped[grouped["split_count"] >= 3] 261 | grouped["date"] = pd.to_datetime(grouped["start_date_local"]) 262 | 263 | if grouped.empty: 264 | return 265 | 266 | def plot_fn(axis): 267 | sns.lineplot(data=grouped.sort_values("date"), x="date", y="pace_std", marker="o", ax=axis) 268 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 269 | for label in axis.get_xticklabels(): 270 | label.set_rotation(45) 271 | 272 | # pylint: disable=R0801 273 | utils.plot_with_common_setup( 274 | title="Pace Variability per Run (Standard Deviation)", 275 | xlabel="Date", 276 | ylabel="Pace Std Dev (mm:ss)", 277 | output_path=output_path, 278 | plot_func=plot_fn, 279 | ) 280 | # pylint: enable=R0801 281 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_distribution.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the distribution chart functions, each saving a PNG file. 3 | """ 4 | 5 | import calendar 6 | from matplotlib import ticker 7 | from matplotlib.colors import ListedColormap, BoundaryNorm 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | 12 | from strava_data.strava_api.visualisation import utils 13 | 14 | 15 | def plot_run_distance_distribution(activities_df: pd.DataFrame, output_path: str) -> None: 16 | """ 17 | KDE plot showing distribution of run distances, split by year. 18 | Highlights distance preferences and training evolution over time. 19 | """ 20 | data = utils.prepare_dated_activities(activities_df) 21 | 22 | def plot_fn(axis): 23 | for year in sorted(data["year"].unique()): 24 | year_data = data[data["year"] == year] 25 | if year_data["distance_km"].nunique() > 1: 26 | sns.kdeplot( 27 | year_data["distance_km"], 28 | fill=True, 29 | label=str(year), 30 | alpha=0.3, 31 | ax=axis, 32 | ) 33 | axis.set_xlim(left=0) 34 | axis.legend(title="Year") 35 | axis.grid(True, linestyle="--", linewidth=0.5) 36 | 37 | # pylint: disable=R0801 38 | utils.plot_with_common_setup( 39 | title="Run Distance Distribution by Year", 40 | xlabel="Distance (km)", 41 | ylabel="Density", 42 | output_path=output_path, 43 | plot_func=plot_fn, 44 | ) 45 | # pylint: enable=R0801 46 | 47 | 48 | def plot_pace_distribution(splits_df: pd.DataFrame, output_path: str) -> None: 49 | """ 50 | KDE plot showing distribution of paces (in mm:ss per km), one per year. 51 | Only includes ~1 km splits. 
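    Pace is carried as seconds per km and rendered as mm:ss by
    utils.format_pace; the conversion is a simple divmod (illustrative value):

        >>> divmod(315, 60)
        (5, 15)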
52 | """ 53 | data = utils.prepare_dated_activities(splits_df) 54 | if data.empty: 55 | return 56 | 57 | data["pace_sec_km"] = data["elapsed_time_s"] / data["distance_km"] 58 | 59 | def plot_fn(axis): 60 | for year in sorted(data["year"].unique()): 61 | year_data = data[data["year"] == year] 62 | if year_data["pace_sec_km"].nunique() > 1: 63 | sns.kdeplot( 64 | year_data["pace_sec_km"], 65 | fill=True, 66 | label=str(year), 67 | alpha=0.3, 68 | ax=axis, 69 | ) 70 | axis.xaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 71 | axis.legend(title="Year") 72 | axis.grid(True) 73 | 74 | # pylint: disable=R0801 75 | utils.plot_with_common_setup( 76 | title="Pace Distribution by Year (1 km splits)", 77 | xlabel="Pace (mm:ss)", 78 | ylabel="Density", 79 | output_path=output_path, 80 | plot_func=plot_fn, 81 | ) 82 | # pylint: enable=R0801 83 | 84 | 85 | def plot_elevation_gain_distribution(activities_df: pd.DataFrame, output_path: str) -> None: 86 | """ 87 | KDE plots showing distribution of elevation gain per run, one per year. 88 | Highlights how hilly your training was year-to-year. 89 | """ 90 | data = utils.prepare_dated_activities(activities_df) 91 | data = data[data["total_elevation_gain_m"] != 0] 92 | 93 | def plot_fn(axis): 94 | for year in sorted(data["year"].unique()): 95 | year_data = data[data["year"] == year] 96 | if year_data["total_elevation_gain_m"].nunique() > 1: 97 | sns.kdeplot( 98 | year_data["total_elevation_gain_m"], 99 | fill=True, 100 | label=str(year), 101 | alpha=0.3, 102 | ax=axis, 103 | ) 104 | axis.legend(title="Year") 105 | axis.grid(True, linestyle="--", linewidth=0.5) 106 | 107 | # pylint: disable=R0801 108 | utils.plot_with_common_setup( 109 | title="Elevation Gain per Run (by Year)", 110 | xlabel="Elevation Gain (m)", 111 | ylabel="Density", 112 | output_path=output_path, 113 | plot_func=plot_fn, 114 | ) 115 | # pylint: enable=R0801 116 | 117 | 118 | def plot_heart_rate_zone_distribution(splits_df: pd.DataFrame, output_path: str) -> None: 119 | """ 120 | Stacked bar chart showing time spent in heart rate zones per month. 121 | Only includes ~1 km splits with valid heart rate data. 
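    A sketch of the aggregation shape (zone labels are whatever
    utils.classify_zone_dynamic returns; "Z2"/"Z4" here are hypothetical):

        >>> import pandas as pd
        >>> df = pd.DataFrame({"month_label": ["2024-01", "2024-01"],
        ...                    "hr_zone": ["Z2", "Z4"],
        ...                    "time_min": [30.0, 10.0]})
        >>> df.groupby(["month_label", "hr_zone"])["time_min"].sum().unstack().shape
        (1, 2)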
122 | """ 123 | data = utils.prepare_dated_activities(splits_df) 124 | data = data[pd.notnull(data["average_heartrate"])] 125 | 126 | if data.empty: 127 | return 128 | 129 | data["month_label"] = ( 130 | pd.to_datetime(data["start_date_local"]).dt.tz_localize(None).dt.to_period("M").astype(str) 131 | ) 132 | data["hr_zone"] = data.apply( 133 | lambda row: utils.classify_zone_dynamic(row["average_heartrate"], row["start_date_local"]), 134 | axis=1, 135 | ) 136 | data["time_min"] = data["elapsed_time_s"] / 60.0 137 | grouped = data.groupby(["month_label", "hr_zone"])["time_min"].sum().unstack().fillna(0) 138 | grouped = grouped.sort_index() 139 | 140 | def plot_fn(axis): 141 | grouped.plot(kind="bar", stacked=True, figsize=(14, 6), colormap="viridis", ax=axis) 142 | axis.set_xticks(range(len(grouped.index))) 143 | axis.set_xticklabels([str(label) for label in grouped.index], rotation=45) 144 | axis.legend(title="Heart Rate Zone") 145 | 146 | # pylint: disable=R0801 147 | utils.plot_with_common_setup( 148 | title="Training Intensity by Heart Rate Zone", 149 | xlabel="Month", 150 | ylabel="Time Spent (minutes)", 151 | output_path=output_path, 152 | plot_func=plot_fn, 153 | ) 154 | # pylint: enable=R0801 155 | 156 | 157 | def plot_run_start_time_distribution(activities_df: pd.DataFrame, output_path: str) -> None: 158 | """ 159 | Box plot showing distribution of run start times by month. 160 | - X-axis: Month (Jan–Dec) 161 | - Y-axis: Hour of day (0–23) 162 | """ 163 | if activities_df.empty: 164 | return 165 | 166 | data = utils.prepare_activities_with_distance(activities_df) 167 | data["start_time"] = pd.to_datetime(data["start_date_local"], errors="coerce") 168 | data["hour"] = data["start_time"].dt.hour 169 | 170 | if data[["month", "hour"]].dropna().empty: 171 | return 172 | 173 | def plot_fn(axis): 174 | sns.boxplot(data=data, x="month", y="hour", ax=axis) 175 | axis.set_xticks(ticks=range(0, 12)) 176 | axis.set_xticklabels(labels=calendar.month_abbr[1:13]) 177 | 178 | # pylint: disable=R0801 179 | utils.plot_with_common_setup( 180 | title="Distribution of Run Start Time by Month", 181 | xlabel="Month", 182 | ylabel="Start Hour of Day", 183 | output_path=output_path, 184 | plot_func=plot_fn, 185 | ) 186 | # pylint: enable=R0801 187 | 188 | 189 | def plot_run_days_heatmap(activities_df: pd.DataFrame, output_path: str) -> None: 190 | """ 191 | Heatmap showing number of days with runs per month. 192 | Highlights how consistently you trained. 
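    Multiple runs on the same calendar day count once, thanks to
    drop_duplicates(subset="date"); an illustrative example:

        >>> import pandas as pd
        >>> df = pd.DataFrame({"date": ["2024-01-01", "2024-01-01", "2024-01-02"]})
        >>> len(df.drop_duplicates(subset="date"))
        2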
193 | """ 194 | if activities_df.empty: 195 | return 196 | 197 | data = activities_df.copy() 198 | data["date"] = pd.to_datetime(data["start_date_local"]).dt.date 199 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 200 | data["month"] = pd.to_datetime(data["start_date_local"]).dt.month 201 | 202 | run_days = data.drop_duplicates(subset="date") 203 | summary = run_days.groupby(["year", "month"]).size().reset_index(name="run_day_count") 204 | pivot = summary.pivot(index="year", columns="month", values="run_day_count") 205 | 206 | def plot_fn(axis): 207 | sns.heatmap( 208 | pivot, 209 | annot=pivot, 210 | fmt=".0f", 211 | cmap="Greens", 212 | cbar_kws={"label": "Run Days"}, 213 | mask=pivot.isna(), 214 | ax=axis, 215 | ) 216 | utils.label_month_axis_barplot(axis) 217 | axis.set_xlabel("Month") 218 | axis.set_ylabel("Year") 219 | 220 | # pylint: disable=R0801 221 | utils.plot_with_common_setup( 222 | title="Run Days per Month", 223 | xlabel="Month", 224 | ylabel="Year", 225 | output_path=output_path, 226 | plot_func=plot_fn, 227 | ) 228 | # pylint: enable=R0801 229 | 230 | 231 | def plot_rest_days_heatmap(activities_df: pd.DataFrame, output_path: str) -> None: 232 | """ 233 | Heatmap showing number of rest days per month. 234 | Only annotates months where rest days occurred. 235 | """ 236 | if activities_df.empty: 237 | return 238 | 239 | data = activities_df.copy() 240 | data["date"] = pd.to_datetime(data["start_date_local"]).dt.date 241 | 242 | start = data["date"].min() 243 | end = data["date"].max() 244 | full_dates = pd.DataFrame({"date": [d.date() for d in pd.date_range(start, end)]}) 245 | 246 | rest_days = full_dates[~full_dates["date"].isin(data["date"])].copy() 247 | rest_days["year"] = pd.to_datetime(rest_days["date"]).dt.year 248 | rest_days["month"] = pd.to_datetime(rest_days["date"]).dt.month 249 | 250 | rest_summary = rest_days.groupby(["year", "month"]).size().reset_index(name="rest_day_count") 251 | pivot = rest_summary.pivot(index="year", columns="month", values="rest_day_count") 252 | 253 | def plot_fn(axis): 254 | sns.heatmap( 255 | pivot, 256 | annot=pivot, 257 | fmt=".0f", 258 | cmap="Reds", 259 | cbar_kws={"label": "Rest Days"}, 260 | mask=pivot.isna(), 261 | ax=axis, 262 | ) 263 | utils.label_month_axis_barplot(axis) 264 | axis.set_xlabel("Month") 265 | axis.set_ylabel("Year") 266 | 267 | # pylint: disable=R0801 268 | utils.plot_with_common_setup( 269 | title="Rest Days per Month", 270 | xlabel="Month", 271 | ylabel="Year", 272 | output_path=output_path, 273 | plot_func=plot_fn, 274 | ) 275 | # pylint: enable=R0801 276 | 277 | 278 | def plot_run_rest_ratio_heatmap(activities_df: pd.DataFrame, output_path: str) -> None: 279 | """ 280 | Heatmap showing the run:rest ratio per month with colour-coded zones: 281 | - Green = Balanced (0.25–0.9) 282 | - Red = High (overtraining) 283 | - Yellow = Low (undertraining) 284 | """ 285 | if activities_df.empty: 286 | return 287 | 288 | data = activities_df.copy() 289 | data["date"] = pd.to_datetime(data["start_date_local"]).dt.date 290 | 291 | start = data["date"].min() 292 | end = data["date"].max() 293 | all_dates = pd.DataFrame({"date": [d.date() for d in pd.date_range(start, end)]}) 294 | all_dates["year"] = pd.to_datetime(all_dates["date"]).dt.year 295 | all_dates["month"] = pd.to_datetime(all_dates["date"]).dt.month 296 | 297 | run_dates = data.drop_duplicates(subset="date")[["date"]].copy() 298 | run_dates["ran"] = 1 299 | 300 | merged = all_dates.merge(run_dates, on="date", how="left") 301 | merged["ran"] = 
merged["ran"].fillna(0) 302 | 303 | summary = ( 304 | merged.groupby(["year", "month"])["ran"] 305 | .agg(run_days="sum", total_days="count") 306 | .reset_index() 307 | ) 308 | summary["run_rest_ratio"] = summary["run_days"] / summary["total_days"] 309 | pivot = summary.pivot(index="year", columns="month", values="run_rest_ratio") 310 | 311 | cmap = ListedColormap(["#FFD700", "#32CD32", "#FF6347"]) 312 | bounds = [0, 0.25, 0.9, 1.0] 313 | norm = BoundaryNorm(bounds, cmap.N) 314 | 315 | def plot_fn(axis): 316 | sns.heatmap( 317 | pivot, 318 | annot=pivot, 319 | fmt=".2f", 320 | cmap=cmap, 321 | norm=norm, 322 | cbar_kws={"label": "Run:Rest Ratio"}, 323 | mask=pivot.isna(), 324 | linewidths=0.5, 325 | linecolor="white", 326 | ax=axis, 327 | ) 328 | utils.label_month_axis_barplot(axis) 329 | axis.set_xlabel("Month") 330 | axis.set_ylabel("Year") 331 | 332 | # pylint: disable=R0801 333 | utils.plot_with_common_setup( 334 | title="Run:Rest Ratio per Month", 335 | xlabel="Month", 336 | ylabel="Year", 337 | output_path=output_path, 338 | plot_func=plot_fn, 339 | ) 340 | # pylint: enable=R0801 341 | 342 | 343 | def plot_heatmap_activities(activities_df: pd.DataFrame, output_path: str) -> None: 344 | """ 345 | Heatmap of Activities by Day and Hour: 346 | - x-axis: hour of day (0–23) 347 | - y-axis: day of week 348 | - cell = count of runs 349 | """ 350 | if activities_df.empty: 351 | return 352 | 353 | activity_data = activities_df.copy() 354 | dt_col = pd.to_datetime(activity_data["start_date_local"]) 355 | activity_data["weekday"] = dt_col.dt.weekday 356 | activity_data["hour"] = dt_col.dt.hour 357 | 358 | pivot = activity_data.groupby(["weekday", "hour"]).size().unstack(fill_value=0) 359 | 360 | def plot_fn(axis): 361 | sns.heatmap(pivot, cmap="YlGnBu", cbar_kws={"label": "Count of Runs"}, ax=axis) 362 | axis.set_xlabel("Hour of Day") 363 | axis.set_ylabel("Day of Week") 364 | ylabels = [calendar.day_name[i] for i in pivot.index] 365 | axis.set_yticks(ticks=np.arange(0.5, 7.5, 1)) 366 | axis.set_yticklabels(labels=ylabels, rotation=0) 367 | 368 | # pylint: disable=R0801 369 | utils.plot_with_common_setup( 370 | title="Heatmap of Activities by Day and Hour", 371 | xlabel="Hour of Day", 372 | ylabel="Day of Week", 373 | output_path=output_path, 374 | plot_func=plot_fn, 375 | ) 376 | # pylint: enable=R0801 377 | -------------------------------------------------------------------------------- /src/strava_data/ml/training_advisor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates a weekly training recommendation chart based on recent Strava activity. 3 | 4 | - Analyses the past 6 months to determine usual running days. 5 | - Reviews the last 8 weeks of runs to identify missing session types. 6 | - Uses historical pace data to tailor pace suggestions. 7 | - Assigns recommended runs to the days you typically train. 
8 | - Outputs a visual table chart to the supplied output_path (main.py passes A.I._Recommended_Training.png) 9 | """ 10 | 11 | from collections import Counter 12 | import pandas as pd 13 | import matplotlib.pyplot as plt 14 | from matplotlib.table import Table 15 | 16 | from strava_data.strava_api.visualisation.utils import ( 17 | prepare_dated_activities, 18 | format_pace, 19 | save_and_close_plot, 20 | ) 21 | from strava_data.ml.run_type_classifier import cluster_run_types, build_run_features 22 | from strava_data.ml.utils import prepare_pace_summary 23 | from utils.logger import get_logger 24 | 25 | LOGGER = get_logger() 26 | 27 | 28 | def generate_training_plan_chart( 29 | activities_df: pd.DataFrame, splits_df: pd.DataFrame, output_path: str 30 | ) -> None: 31 | """ 32 | Generates and saves a visual weekly training plan as a PNG file. 33 | 34 | This function analyses recent training behaviour and fitness gaps to suggest 35 | up to 5 structured runs per week. The resulting schedule balances intensity, 36 | avoids overtraining, and aligns with the user's typical running days. 37 | 38 | Parameters: 39 | activities_df (pd.DataFrame): DataFrame of all Strava activity-level data. 40 | splits_df (pd.DataFrame): DataFrame of all split-level running data. 41 | output_path (str): File path where the training plan image will be saved. 42 | """ 43 | LOGGER.info("Generating training recommendation chart...") 44 | 45 | ( 46 | preferred_days, 47 | run_counts, 48 | fast_pace, 49 | median_pace, 50 | slow_pace, 51 | max_recommendations, 52 | ) = _get_recent_data(activities_df, splits_df) 53 | 54 | recommendations = _generate_recommendations( 55 | run_counts, fast_pace, median_pace, slow_pace, max_recommendations 56 | ) 57 | 58 | assigned = _assign_runs_to_days(recommendations, preferred_days) 59 | _render_training_table(assigned, output_path) 60 | 61 | 62 | def _get_recent_data(activities_df, splits_df): 63 | """ 64 | Extracts recent training metadata needed to generate a personalised training plan. 65 | 66 | This includes: 67 | - The user's preferred training days (based on frequency over 6 months) 68 | - Run type distribution over the past 8 weeks 69 | - Recent representative pace metrics (fast, median, slow) 70 | - Target number of runs to recommend this week 71 | 72 | Parameters: 73 | activities_df (pd.DataFrame): DataFrame of all Strava activities. 74 | splits_df (pd.DataFrame): DataFrame of all split-level records. 75 | 76 | Returns: 77 | tuple: 78 | preferred_days (list): Ordered list of most common training days. 79 | run_counts (Counter): Frequency of run types over recent period. 80 | fast_pace (float): 25th percentile pace in seconds/km. 81 | median_pace (float): Median pace in seconds/km. 82 | slow_pace (float): 75th percentile pace in seconds/km. 83 | max_recommendations (int): Number of runs to recommend this week. 
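    The run count is clamped so the plan never suggests fewer than 3 or more
    than 5 sessions; illustrative examples of the max(3, min(5, n)) clamp
    used in the function body:

        >>> max(3, min(5, 6))
        5
        >>> max(3, min(5, 2))
        3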
84 | """ 85 | preferred_days, recent_data = _get_recent_days(activities_df) 86 | recent_splits = _get_recent_splits(splits_df) 87 | run_counts = _get_recent_run_counts(splits_df, recent_splits) 88 | pace_data = prepare_pace_summary(recent_splits, group_cols=["activity_id"]) 89 | median_pace = pace_data["pace_median"].median() 90 | fast_pace = pace_data["pace_median"].quantile(0.25) 91 | slow_pace = pace_data["pace_median"].quantile(0.75) 92 | 93 | weekly_counts = recent_data.groupby(recent_data["start_date"].dt.isocalendar().week).size() 94 | average_weekly_runs = int(round(weekly_counts.mean() + 0.5)) 95 | max_recommendations = max(3, min(5, average_weekly_runs)) 96 | 97 | return preferred_days, run_counts, fast_pace, median_pace, slow_pace, max_recommendations 98 | 99 | 100 | def _get_recent_run_counts(splits_df: pd.DataFrame, recent_splits: pd.DataFrame) -> Counter: 101 | """ 102 | Computes the frequency of run types over the recent training period. 103 | 104 | Uses clustering to classify runs into types (e.g. Long, Tempo, Recovery, Intervals), 105 | then filters to include only runs from the same window as recent_splits. 106 | 107 | Parameters: 108 | splits_df (pd.DataFrame): Full split-level dataset for building run features. 109 | recent_splits (pd.DataFrame): Date-filtered splits used to define the recent period. 110 | 111 | Returns: 112 | Counter: A mapping of run type labels to their counts over the recent period. 113 | """ 114 | run_features = build_run_features(splits_df) 115 | clustered = cluster_run_types(run_features) 116 | recent_runs = clustered[clustered["start_date"] >= recent_splits["start_date"].min()] 117 | return Counter(recent_runs["run_type"]) 118 | 119 | 120 | def _get_recent_days(activities_df): 121 | """ 122 | Identifies the user's most common training days over the past 6 months. 123 | 124 | Filters activities to the last 6 months and counts frequency of runs per day of the week. 125 | 126 | Parameters: 127 | activities_df (pd.DataFrame): DataFrame containing all activity records. 128 | 129 | Returns: 130 | tuple: 131 | preferred_days (list): Day names ordered by frequency of runs. 132 | recent_data (pd.DataFrame): Filtered DataFrame with activities from the past 6 months. 133 | """ 134 | six_months_ago = pd.Timestamp.now(tz="UTC") - pd.DateOffset(months=6) 135 | recent_data = prepare_dated_activities(activities_df) 136 | recent_data = recent_data[recent_data["start_date"] >= six_months_ago] 137 | recent_data["day"] = recent_data["start_date"].dt.day_name() 138 | day_freq = Counter(recent_data["day"]) 139 | preferred_days = [d for d, _ in day_freq.most_common()] 140 | return preferred_days, recent_data 141 | 142 | 143 | def _get_recent_splits(splits_df): 144 | """ 145 | Filters split-level data to include only entries from the past 8 weeks. 146 | 147 | Converts 'start_date_local' to UTC-naive datetime and filters by date threshold. 148 | 149 | Parameters: 150 | splits_df (pd.DataFrame): DataFrame containing all split records. 151 | 152 | Returns: 153 | pd.DataFrame: Filtered DataFrame containing only recent splits. 
154 | """ 155 | eight_weeks_ago = pd.Timestamp.now(tz="UTC") - pd.DateOffset(weeks=8) 156 | recent_splits = splits_df.copy() 157 | recent_splits["start_date"] = pd.to_datetime(recent_splits["start_date_local"]) 158 | return recent_splits[recent_splits["start_date"] >= eight_weeks_ago] 159 | 160 | 161 | def _suggest_bounds(pace: float, tolerance: float = 0.05) -> str: 162 | """ 163 | Returns a formatted pace range string based on a central pace value and tolerance. 164 | 165 | Parameters: 166 | pace (float): Central pace in seconds per kilometre. 167 | tolerance (float): Proportional margin around the pace (default is 5%). 168 | 169 | Returns: 170 | str: Formatted pace range string (e.g. "4:45 – 5:00"). 171 | """ 172 | low = pace * (1 - tolerance) 173 | high = pace * (1 + tolerance) 174 | return f"{format_pace(low, None)} – {format_pace(high, None)}" 175 | 176 | 177 | def _generate_recommendations(run_counts, fast_pace, median_pace, slow_pace, max_recommendations): 178 | """ 179 | Generates a list of recommended training sessions to improve fitness balance. 180 | 181 | Scores run types based on recent frequency and prioritises under-represented types. 182 | Uses recent pace data to personalise pace ranges for each session type. 183 | Limits output to a maximum number of recommendations. 184 | 185 | Parameters: 186 | run_counts (Counter): Frequency of each run type over the recent period. 187 | fast_pace (float): 25th percentile pace from recent runs (used for intervals). 188 | median_pace (float): Median pace from recent runs (used for tempo). 189 | slow_pace (float): 75th percentile pace from recent runs (used for long and recovery). 190 | max_recommendations (int): Maximum number of runs to recommend in the week. 191 | 192 | Returns: 193 | list: Recommended run dicts including type, distance, intensity, pace, and rationale. 
194 | """ 195 | 196 | run_scores = { 197 | "Long": 1.0 - (run_counts.get("Long", 0) / 4), 198 | "Tempo": 1.0 - (run_counts.get("Tempo", 0) / 4), 199 | "Intervals": 1.0 - (run_counts.get("Intervals", 0) / 3), 200 | "Recovery": 1.0 - (run_counts.get("Recovery", 0) / 2), 201 | } 202 | sorted_types = sorted(run_scores.items(), key=lambda x: x[1], reverse=True) 203 | recommendations = [] 204 | 205 | for run_type, _ in sorted_types: 206 | if len(recommendations) >= max_recommendations: 207 | break 208 | if run_type == "Intervals": 209 | recommendations.append( 210 | { 211 | "type": "Intervals", 212 | "intensity": "Hard", 213 | "distance": "6x400m", 214 | "pace": _suggest_bounds(fast_pace), 215 | "reason": "Include interval session to improve VO2 max.", 216 | } 217 | ) 218 | elif run_type == "Long": 219 | recommendations.append( 220 | { 221 | "type": "Long", 222 | "intensity": "Easy", 223 | "distance": "14–18 km", 224 | "pace": _suggest_bounds(slow_pace), 225 | "reason": "Endurance run builds aerobic fitness.", 226 | } 227 | ) 228 | elif run_type == "Tempo": 229 | recommendations.append( 230 | { 231 | "type": "Tempo", 232 | "intensity": "Moderate–Hard", 233 | "distance": "6–10 km", 234 | "pace": _suggest_bounds(median_pace), 235 | "reason": "Tempo runs increase lactate threshold.", 236 | } 237 | ) 238 | elif run_type == "Recovery": 239 | recommendations.append( 240 | { 241 | "type": "Recovery", 242 | "intensity": "Easy", 243 | "distance": "5 km", 244 | "pace": _suggest_bounds(slow_pace), 245 | "reason": "Recovery run supports adaptation.", 246 | } 247 | ) 248 | 249 | return recommendations 250 | 251 | 252 | def _assign_runs_to_days(recommendations, preferred_days): 253 | """ 254 | Assigns recommended training sessions to days of the week based on historical patterns. 255 | 256 | Prioritises days the user typically trains (based on recent activity), 257 | avoids back-to-back hard sessions, and fills up to the number of recommended runs. 258 | 259 | Parameters: 260 | recommendations (list): A list of run recommendation dicts, each containing type, pace, etc. 261 | preferred_days (list): Days of the week sorted by user's historical training frequency. 262 | 263 | Returns: 264 | dict: A mapping of day name to assigned run recommendation. 265 | """ 266 | full_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] 267 | assigned = {} 268 | used_days = set() 269 | previous_day = None 270 | 271 | for rec in recommendations: 272 | for day_candidate in preferred_days: 273 | if day_candidate in used_days: 274 | continue 275 | if previous_day is not None: 276 | current_idx = full_week.index(day_candidate) 277 | previous_idx = full_week.index(previous_day) 278 | if abs(current_idx - previous_idx) in (0, 1): 279 | if rec["type"] not in ("Recovery", "Easy") and assigned[previous_day][ 280 | "type" 281 | ] not in ("Recovery", "Easy"): 282 | continue 283 | assigned[day_candidate] = rec 284 | used_days.add(day_candidate) 285 | previous_day = day_candidate 286 | break 287 | 288 | return assigned 289 | 290 | 291 | def _render_training_table(assigned, output_path): 292 | """ 293 | Renders a visual training plan as a PNG table. 294 | 295 | Creates a 7-row table showing daily training recommendations using matplotlib. 296 | Each row includes the day, run type, distance, pace range, intensity, and reason. 297 | Rest days are automatically filled for unassigned days. 298 | 299 | Parameters: 300 | assigned (dict): Mapping of day name (e.g. "Monday") to a run recommendation dict. 
301 | output_path (str): File path where the output PNG image should be saved. 302 | """ 303 | full_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] 304 | _, plot_axis = plt.subplots(figsize=(12, len(full_week) * 0.8)) 305 | plot_axis.axis("off") 306 | table = Table(plot_axis, bbox=[0, 0, 1, 1]) 307 | col_labels = ["Day", "Run Type", "Distance", "Pace", "Intensity", "Reason"] 308 | cell_text = [] 309 | 310 | for day in full_week: 311 | if day in assigned: 312 | rec = assigned[day] 313 | row = [day, rec["type"], rec["distance"], rec["pace"], rec["intensity"], rec["reason"]] 314 | else: 315 | row = [day, "Rest", "–", "–", "–", "Scheduled rest day"] 316 | cell_text.append(row) 317 | 318 | table.auto_set_font_size(False) 319 | table.set_fontsize(9) 320 | for row_idx, row in enumerate([col_labels] + cell_text): 321 | for col_idx, val in enumerate(row): 322 | cell_width = [0.08, 0.12, 0.10, 0.10, 0.12, 0.48][col_idx] 323 | table.add_cell( 324 | row_idx, 325 | col_idx, 326 | cell_width, 327 | 1 / (len(cell_text) + 1), 328 | text=val, 329 | loc="center", 330 | facecolor="#f0f0f0" if row_idx == 0 else "white", 331 | ) 332 | 333 | plot_axis.add_table(table) 334 | plt.title("Suggested Training Plan (Next Week)", fontsize=14) 335 | save_and_close_plot(output_path) 336 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_distance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the distance chart functions, each saving a PNG file. 3 | """ 4 | 5 | import calendar 6 | from datetime import datetime 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from matplotlib import ticker 11 | 12 | from strava_data.strava_api.visualisation import utils 13 | 14 | 15 | def plot_time_taken_over_distances(activities_df: pd.DataFrame, output_path: str) -> None: 16 | """ 17 | Time Taken Over Distances: 18 | - y-axis: total time (hh:mm:ss) with 15-minute intervals 19 | - x-axis: total distance (km) with 5 km intervals 20 | - Points colored by year 21 | - Trend line per year (same color, not labeled) 22 | - Overall trend in dashed black, labeled 23 | - Last run marked with a red X 24 | - Filters out runs shorter than 0.5 km 25 | - Decay logic: +180s/km added at max distance + 2km 26 | """ 27 | if activities_df.empty: 28 | return 29 | 30 | data = utils.prepare_time_distance_data(activities_df) 31 | if data.empty: 32 | return 33 | 34 | decay_distance, decay_time = utils.calculate_decay_point(data) 35 | palette = sns.color_palette(n_colors=data["year"].nunique()) 36 | year_color_map = dict(zip(sorted(data["year"].unique()), palette)) 37 | 38 | def plot_fn(axis): 39 | for year in sorted(data["year"].unique()): 40 | year_data = data[data["year"] == year] 41 | sns.scatterplot( 42 | data=year_data, 43 | x="distance_km", 44 | y="time_seconds", 45 | color=year_color_map[year], 46 | alpha=0.5, 47 | label=year, 48 | ax=axis, 49 | ) 50 | 51 | last_run = data[data["is_last_run"]] 52 | if not last_run.empty: 53 | axis.plot( 54 | last_run["distance_km"], 55 | last_run["time_seconds"], 56 | "x", 57 | color="red", 58 | markersize=10, 59 | label="Last Run", 60 | ) 61 | 62 | for year in sorted(data["year"].unique()): 63 | sub = data[data["year"] == year][["distance_km", "time_seconds"]].copy() 64 | sub = pd.concat( 65 | [pd.DataFrame.from_records([{"distance_km": 0, "time_seconds": 0}]), sub] 66 | ) 67 | sns.regplot( 68 | data=sub, 
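                # The (0, 0) record prepended above pulls each year's fitted line toward the origin.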
69 | x="distance_km", 70 | y="time_seconds", 71 | scatter=False, 72 | ci=None, 73 | truncate=False, 74 | line_kws={"color": year_color_map[year], "alpha": 0.6}, 75 | ax=axis, 76 | ) 77 | 78 | overall = pd.concat( 79 | [ 80 | pd.DataFrame.from_records([{"distance_km": 0, "time_seconds": 0}]), 81 | data[["distance_km", "time_seconds"]], 82 | pd.DataFrame.from_records( 83 | [{"distance_km": decay_distance, "time_seconds": decay_time}] 84 | ), 85 | ] 86 | ) 87 | sns.regplot( 88 | data=overall, 89 | x="distance_km", 90 | y="time_seconds", 91 | scatter=False, 92 | ci=None, 93 | color="black", 94 | line_kws={"linestyle": "--"}, 95 | ax=axis, 96 | label="Overall Trend", 97 | truncate=False, 98 | ) 99 | 100 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.seconds_to_hms)) 101 | axis.yaxis.set_major_locator(ticker.MultipleLocator(15 * 60)) 102 | axis.xaxis.set_major_locator(ticker.MultipleLocator(5)) 103 | axis.set_xlim(0, (int(decay_distance / 5) + 1) * 5) 104 | axis.set_ylim(0, (int((decay_time * 1.05) / (15 * 60)) + 1) * (15 * 60)) 105 | axis.legend(title="Year") 106 | 107 | # pylint: disable=R0801 108 | utils.plot_with_common_setup( 109 | title="Time Taken Over Distances", 110 | xlabel="Distance (km)", 111 | ylabel="Time Taken (hh:mm:ss)", 112 | output_path=output_path, 113 | plot_func=plot_fn, 114 | ) 115 | # pylint: enable=R0801 116 | 117 | 118 | def plot_time_taken_over_distances_recent_years( 119 | activities_df: pd.DataFrame, 120 | output_path: str, 121 | ) -> None: 122 | """ 123 | Time Taken Over Distances (Recent Years Only): 124 | - Same as plot_time_taken_over_distances but filtered to current and previous year. 125 | """ 126 | if activities_df.empty: 127 | return 128 | 129 | activities_df["start_date_local"] = pd.to_datetime(activities_df["start_date_local"]) 130 | 131 | current_year = datetime.now().year 132 | years_to_include = {current_year, current_year - 1} 133 | filtered_df = activities_df[activities_df["start_date_local"].dt.year.isin(years_to_include)] 134 | 135 | # Reuse original plotting function on filtered data 136 | plot_time_taken_over_distances(filtered_df, output_path) 137 | 138 | 139 | def plot_pace_vs_total_distance(splits_df: pd.DataFrame, output_path: str) -> None: 140 | """ 141 | Running pace vs total distance of that run: 142 | - x-axis: total distance (km) 143 | - y-axis: average pace (mm:ss per km) 144 | - Points colored by year 145 | - Trend lines by year (matched color, not shown in legend) 146 | """ 147 | if splits_df.empty: 148 | return 149 | 150 | data = utils.prepare_pace_distance_data(splits_df) 151 | if data.empty: 152 | return 153 | 154 | max_distance = data["distance_km"].max() 155 | palette = sns.color_palette(n_colors=data["year"].nunique()) 156 | year_color_map = dict(zip(sorted(data["year"].unique()), palette)) 157 | 158 | def plot_fn(axis): 159 | for year in sorted(data["year"].unique()): 160 | year_data = data[data["year"] == year] 161 | sns.scatterplot( 162 | data=year_data, 163 | x="distance_km", 164 | y="pace_sec", 165 | color=year_color_map[year], 166 | alpha=0.5, 167 | label=year, 168 | ax=axis, 169 | ) 170 | 171 | for year in sorted(data["year"].unique()): 172 | year_data = data[data["year"] == year].copy() 173 | if year_data.empty: 174 | continue 175 | distance_max = year_data["distance_km"].max() 176 | pace_max = year_data["pace_sec"].max() 177 | decay_distance = distance_max + 2 178 | decay_pace = pace_max + 180 179 | 180 | extended_data = pd.concat( 181 | [ 182 | year_data, 183 | pd.DataFrame.from_records( 184 | 
[{"distance_km": decay_distance, "pace_sec": decay_pace}] 185 | ), 186 | ] 187 | ) 188 | sns.regplot( 189 | data=extended_data, 190 | x="distance_km", 191 | y="pace_sec", 192 | scatter=False, 193 | ci=None, 194 | truncate=False, 195 | line_kws={"color": year_color_map[year], "alpha": 0.6}, 196 | ax=axis, 197 | ) 198 | 199 | axis.set_xlim(0, max_distance + 3) 200 | axis.yaxis.set_major_formatter(plt.FuncFormatter(utils.format_pace)) 201 | axis.legend(title="Year") 202 | 203 | # pylint: disable=R0801 204 | utils.plot_with_common_setup( 205 | title="Running Pace vs. Total Distance", 206 | xlabel="Total Distance (km)", 207 | ylabel="Average Pace (mm:ss per km)", 208 | output_path=output_path, 209 | plot_func=plot_fn, 210 | ) 211 | # pylint: enable=R0801 212 | 213 | 214 | def plot_number_of_runs_per_distance(activities_df: pd.DataFrame, output_path: str) -> None: 215 | """ 216 | Number of runs per distance: 217 | - Bar graph showing grouped distances (<5 km, 5–10 km, etc.) 218 | - Bars per year + an overall bar 219 | """ 220 | data = utils.prepare_dated_activities(activities_df) 221 | data["distance_bin"] = pd.cut( 222 | data["distance_km"], 223 | bins=[0, 5, 10, 15, 20, 25, 30, 9999], 224 | labels=["<5", "5–10", "10–15", "15–20", "20–25", "25–30", "30+"], 225 | include_lowest=True, 226 | ) 227 | 228 | grouped = data.groupby(["distance_bin", "year"]).size().reset_index(name="count") 229 | 230 | def plot_fn(axis): 231 | sns.barplot(data=grouped, x="distance_bin", y="count", hue="year", errorbar=None, ax=axis) 232 | 233 | # pylint: disable=R0801 234 | utils.plot_with_common_setup( 235 | title="Number of Runs per Distance", 236 | xlabel="Distance Range (km)", 237 | ylabel="Count of Runs", 238 | output_path=output_path, 239 | plot_func=plot_fn, 240 | ) 241 | # pylint: enable=R0801 242 | 243 | 244 | def plot_total_distance_by_month(activities_df: pd.DataFrame, output_path: str) -> None: 245 | """ 246 | Total distance run by month: 247 | - x-axis: months (Jan–Dec) 248 | - y-axis: total distance run (km) 249 | - Separate line graph for each year 250 | """ 251 | data = utils.prepare_dated_activities(activities_df) 252 | monthly_totals = data.groupby(["year", "month"])["distance_km"].sum().reset_index() 253 | 254 | def plot_fn(axis): 255 | for year in sorted(monthly_totals["year"].unique()): 256 | year_data = monthly_totals[monthly_totals["year"] == year].sort_values("month") 257 | axis.plot( 258 | year_data["month"], 259 | year_data["distance_km"], 260 | marker="o", 261 | linestyle="-", 262 | label=str(year), 263 | ) 264 | utils.label_month_axis(axis) 265 | axis.legend(title="Year") 266 | 267 | # pylint: disable=R0801 268 | utils.plot_with_common_setup( 269 | title="Total Distance Run by Month", 270 | xlabel="Month", 271 | ylabel="Total Distance (km)", 272 | output_path=output_path, 273 | plot_func=plot_fn, 274 | ) 275 | # pylint: enable=R0801 276 | 277 | 278 | def plot_cumulative_distance_over_time(activities_df: pd.DataFrame, output_path: str) -> None: 279 | """ 280 | Cumulative distance per month: 281 | - x-axis: ['Jan', 'Feb', ..., 'Dec'] 282 | - y-axis: cumulative distance (km) 283 | - Separate line per year 284 | """ 285 | data = utils.prepare_dated_activities(activities_df) 286 | monthly_df = data.groupby(["year", "month"])["distance_km"].sum().reset_index() 287 | 288 | def plot_fn(axis): 289 | for year in sorted(monthly_df["year"].unique()): 290 | sub = monthly_df[monthly_df["year"] == year].copy() 291 | sub = sub.set_index("month").reindex(range(1, 13), fill_value=0).reset_index() 292 | 
sub["cum_dist"] = sub["distance_km"].cumsum() 293 | axis.plot(sub["month"], sub["cum_dist"], marker="o", label=str(year)) 294 | utils.label_month_axis(axis) 295 | axis.legend(title="Year") 296 | 297 | # pylint: disable=R0801 298 | utils.plot_with_common_setup( 299 | title="Cumulative Distance per Year", 300 | xlabel="Month", 301 | ylabel="Cumulative Distance (km)", 302 | output_path=output_path, 303 | plot_func=plot_fn, 304 | ) 305 | # pylint: enable=R0801 306 | 307 | 308 | def plot_monthly_distance_by_year_grouped(activities_df: pd.DataFrame, output_path: str) -> None: 309 | """ 310 | Clustered bar chart comparing total monthly distance by year. 311 | - X-axis: Month (Jan–Dec) 312 | - Y-axis: Total distance (km) 313 | - Grouped by year 314 | """ 315 | data = utils.prepare_dated_activities(activities_df) 316 | grouped = data.groupby(["month", "year"])["distance_km"].sum().reset_index() 317 | 318 | pivot = grouped.pivot(index="month", columns="year", values="distance_km").fillna(0) 319 | pivot = pivot.sort_index() 320 | month_labels = [calendar.month_abbr[m] for m in pivot.index] 321 | 322 | def plot_fn(axis): 323 | pivot.plot(kind="bar", width=0.8, ax=axis) 324 | axis.set_xticks(range(len(month_labels))) 325 | axis.set_xticklabels(month_labels, rotation=45) 326 | axis.legend(title="Year") 327 | 328 | # pylint: disable=R0801 329 | utils.plot_with_common_setup( 330 | title="Year-over-Year Monthly Distance Comparison", 331 | xlabel="Month", 332 | ylabel="Total Distance (km)", 333 | output_path=output_path, 334 | plot_func=plot_fn, 335 | ) 336 | # pylint: enable=R0801 337 | 338 | 339 | def plot_rolling_distance(activities_df: pd.DataFrame, output_path: str, window: int = 30) -> None: 340 | """ 341 | Line graph showing rolling X-day distance total. 342 | Default window = 30 days. 343 | """ 344 | data = utils.prepare_dated_activities(activities_df) 345 | daily = data.groupby("start_date")["distance_km"].sum().reset_index() 346 | daily["rolling_distance_km"] = daily["distance_km"].rolling(window=window).sum() 347 | 348 | def plot_fn(axis): 349 | axis.plot(daily["start_date"], daily["rolling_distance_km"], color="blue", linewidth=2) 350 | 351 | # pylint: disable=R0801 352 | utils.plot_with_common_setup( 353 | title=f"Rolling {window}-Day Distance", 354 | xlabel="Date", 355 | ylabel="Distance (km)", 356 | output_path=output_path, 357 | plot_func=plot_fn, 358 | ) 359 | # pylint: enable=R0801 360 | 361 | 362 | def plot_longest_run_per_month(activities_df: pd.DataFrame, output_path: str) -> None: 363 | """ 364 | Scatter plot of longest run per month across all years. 
365 | - X-axis: month (Jan–Dec) 366 | - Y-axis: longest run (km) 367 | - Points: one per year-month, only if a run occurred 368 | - Colour-coded by year 369 | """ 370 | data = utils.prepare_dated_activities(activities_df) 371 | longest = data.groupby(["year", "month"])["distance_km"].max().reset_index() 372 | 373 | def plot_fn(axis): 374 | for year in sorted(longest["year"].unique()): 375 | year_data = longest[longest["year"] == year] 376 | axis.scatter( 377 | year_data["month"], 378 | year_data["distance_km"], 379 | label=str(year), 380 | alpha=0.7, 381 | s=60, 382 | ) 383 | utils.label_month_axis(axis) 384 | axis.legend(title="Year") 385 | 386 | # pylint: disable=R0801 387 | utils.plot_with_common_setup( 388 | title="Longest Run per Month", 389 | xlabel="Month", 390 | ylabel="Distance (km)", 391 | output_path=output_path, 392 | plot_func=plot_fn, 393 | ) 394 | # pylint: enable=R0801 395 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for chart styling or other shared visualisation helpers. 3 | """ 4 | 5 | import calendar 6 | from dataclasses import dataclass 7 | import datetime 8 | from typing import Callable, Optional, Tuple, List 9 | 10 | import matplotlib.pyplot as plt 11 | from matplotlib.patches import FancyBboxPatch 12 | from matplotlib.text import Text 13 | import numpy as np 14 | import pandas as pd 15 | 16 | DOB = datetime.datetime(1985, 1, 26) 17 | 18 | 19 | @dataclass(frozen=True) 20 | class TitleBoxConfig: 21 | """Configuration for the title + attribution banner box.""" 22 | 23 | attribution: Optional[str] = "Data sourced from Garmin (synced via Strava)" 24 | fontsizes: Tuple[int, int] = (14, 9) # (title_fontsize, subtitle_fontsize) 25 | offsets: Tuple[float, float] = (0.03, 1.3) # (top_offset, line_height_scale) 26 | gap_and_pad: Tuple[float, float] = (0.02, 0.006) # (min_gap, box_pad) 27 | box_lr: Tuple[float, float] = (0.05, 0.95) # (box_left, box_right) 28 | 29 | 30 | def _occupied_content_top(fig: plt.Figure, renderer) -> float: 31 | """ 32 | Highest occupied y in figure coords considering axes *tight* bboxes 33 | (includes tick labels/rotations) and any visible legends. 34 | Falls back to _axes_top(fig) if nothing measurable is found. 
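
    Illustrative use (sketch; the figure must already be drawn so a renderer exists):

        renderer = fig.canvas.get_renderer()
        top = _occupied_content_top(fig, renderer)  # e.g. 0.93, in figure coords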
35 | """ 36 | tops: List[float] = [] 37 | try: 38 | inv = fig.transFigure.inverted() 39 | except (AttributeError, ValueError): 40 | inv = None 41 | 42 | for axis in getattr(fig, "axes", []): 43 | try: 44 | if not axis.get_visible(): 45 | continue 46 | except (AttributeError, ValueError): 47 | pass 48 | 49 | # Axes tight bbox (ticks/labels included) 50 | try: 51 | tight_bbox = axis.get_tightbbox(renderer) 52 | if tight_bbox is not None: 53 | tops.append(tight_bbox.transformed(inv).y1 if inv is not None else tight_bbox.y1) 54 | except (AttributeError, ValueError): 55 | pass 56 | 57 | # Legend bbox (if present) 58 | try: 59 | legend = axis.get_legend() 60 | if legend is not None and legend.get_visible(): 61 | legend_bbox = legend.get_window_extent(renderer=renderer) 62 | if legend_bbox is not None: 63 | tops.append( 64 | legend_bbox.transformed(inv).y1 if inv is not None else legend_bbox.y1 65 | ) 66 | except (AttributeError, ValueError): 67 | pass 68 | 69 | return max(tops) if tops else _axes_top(fig) 70 | 71 | 72 | def _reserve_space_above_axes( 73 | fig: plt.Figure, top_limit: float, *, min_bottom: float = 0.05 74 | ) -> None: 75 | """ 76 | Ensure no axes extend above top_limit (figure coords). 77 | Prefer shifting axes down; if that would push the bottom below min_bottom, 78 | trim from the top instead. 79 | """ 80 | for axis in fig.axes: 81 | if not axis.get_visible(): 82 | continue 83 | 84 | pos = axis.get_position() 85 | if pos.y1 <= top_limit + 1e-6: 86 | continue 87 | 88 | excess = pos.y1 - top_limit 89 | new_y0 = pos.y0 - excess 90 | new_y1 = top_limit 91 | 92 | # If we can't shift without going below min_bottom, shrink from the top 93 | if new_y0 < min_bottom: 94 | new_y0 = pos.y0 95 | new_y1 = top_limit 96 | 97 | axis.set_position([pos.x0, new_y0, pos.width, new_y1 - new_y0]) 98 | 99 | 100 | def _finalise_and_get_renderer(fig: plt.Figure): 101 | """Draw once so constrained layout is final and return the renderer.""" 102 | fig.canvas.draw() 103 | return fig.canvas.get_renderer() 104 | 105 | 106 | def _axes_top(fig: plt.Figure) -> float: 107 | """Top (y1) of the highest visible axes in figure coords.""" 108 | return max(ax.get_position().y1 for ax in fig.axes if ax.get_visible()) 109 | 110 | 111 | def _line_height(fig: plt.Figure, fontsize_pt: int, scale: float) -> float: 112 | """ 113 | Convert a font size in points to a figure-coordinate line height. 114 | 115 | Args: 116 | fig: Matplotlib figure. 117 | fontsize_pt: Font size in points. 118 | scale: Multiplier to adjust line spacing. 119 | 120 | Returns: 121 | Line height in figure coordinates. 122 | """ 123 | fig_h_in = fig.get_size_inches()[1] 124 | return (fontsize_pt / 72.0) / fig_h_in * scale 125 | 126 | 127 | def _place_texts( 128 | fig: plt.Figure, 129 | title: str, 130 | attribution: Optional[str], 131 | *, 132 | title_y: float, 133 | subtitle_y: float, 134 | title_fontsize: int, 135 | subtitle_fontsize: int, 136 | ) -> Tuple[Text, Optional[Text]]: 137 | """ 138 | Create title and subtitle Text artists at the given y positions. 139 | 140 | Returns: 141 | Tuple of (title_text, attribution_text_or_None). 
142 | """ 143 | title_txt = fig.text( 144 | 0.5, 145 | title_y, 146 | title, 147 | ha="center", 148 | va="bottom", 149 | fontsize=title_fontsize, 150 | color="black", 151 | zorder=3, 152 | ) 153 | attr_txt = None 154 | if attribution: 155 | attr_txt = fig.text( 156 | 0.5, 157 | subtitle_y, 158 | attribution, 159 | ha="center", 160 | va="bottom", 161 | fontsize=subtitle_fontsize, 162 | color="gray", 163 | zorder=3, 164 | ) 165 | return title_txt, attr_txt 166 | 167 | 168 | def _measure_text_bounds(fig: plt.Figure, renderer, artists: List[Text]) -> Tuple[float, float]: 169 | """ 170 | Return the vertical span of the given text elements in **figure coordinates**. 171 | 172 | This computes the minimum and maximum y values across all provided text objects 173 | after transforming their bounding boxes into the figure's coordinate system 174 | (0–1 in both x and y). 175 | 176 | Returns: 177 | (ymin, ymax) of all provided text artists. 178 | """ 179 | y_bounds: List[Tuple[float, float]] = [] 180 | for artist in artists: 181 | if artist is None: 182 | continue 183 | bounding_box = artist.get_window_extent(renderer).transformed(fig.transFigure.inverted()) 184 | y_bounds.append((bounding_box.ymin, bounding_box.ymax)) 185 | ymin = min(b[0] for b in y_bounds) 186 | ymax = max(b[1] for b in y_bounds) 187 | return ymin, ymax 188 | 189 | 190 | def _shift_texts(title_txt: Text, attr_txt: Optional[Text], shift: float) -> None: 191 | """ 192 | Shift title and attribution texts upward by a given amount. 193 | 194 | Args: 195 | title_txt: The title Text artist. 196 | attr_txt: The attribution Text artist (or None). 197 | shift: Amount to add to Y position in figure coords. 198 | """ 199 | x_title, y_title = title_txt.get_position() 200 | title_txt.set_position((x_title, y_title + shift)) 201 | if attr_txt is not None: 202 | x_attr, y_attr = attr_txt.get_position() 203 | attr_txt.set_position((x_attr, y_attr + shift)) 204 | 205 | 206 | def _lift_if_needed( 207 | fig: plt.Figure, 208 | *, 209 | min_gap: float, 210 | box_pad: float, 211 | title_txt: Text, 212 | attr_txt: Optional[Text], 213 | ) -> Tuple[float, float]: 214 | """ 215 | Ensure the box bottom clears the axes by min_gap. 216 | If needed, shift both texts upward (clamped to the figure top). 217 | 218 | Returns: 219 | (ymin, ymax) of the final text union after any shift. 
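
    Example (illustrative, not a doctest):

        ymin, ymax = _lift_if_needed(
            fig, min_gap=0.02, box_pad=0.006, title_txt=title_txt, attr_txt=attr_txt
        )
        # The returned bounds already reflect any upward shift of the banner.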
220 | """ 221 | renderer = _finalise_and_get_renderer(fig) 222 | 223 | ymin, ymax = _measure_text_bounds(fig, renderer, [title_txt, attr_txt]) 224 | box_bottom = ymin - box_pad 225 | occupied_top = _occupied_content_top(fig, renderer) 226 | 227 | # Small extra buffer if any legend is visible (avoids near misses) 228 | extra = ( 229 | 0.01 230 | if any((legend := ax.get_legend()) is not None and legend.get_visible() for ax in fig.axes) 231 | else 0.0 232 | ) 233 | 234 | required_bottom = occupied_top + (min_gap + extra) 235 | if box_bottom >= required_bottom: 236 | return ymin, ymax 237 | 238 | # Need to lift the banner 239 | shift = min( 240 | required_bottom - box_bottom, 241 | max(0.0, 0.995 - title_txt.get_position()[1]), 242 | ) 243 | _shift_texts(title_txt, attr_txt, shift) 244 | 245 | fig.canvas.draw() # re-measure after moving 246 | return _measure_text_bounds(fig, fig.canvas.get_renderer(), [title_txt, attr_txt]) 247 | 248 | 249 | def _draw_background_box( 250 | fig: plt.Figure, 251 | *, 252 | ymin: float, 253 | ymax: float, 254 | box_left: float, 255 | box_right: float, 256 | box_pad: float, 257 | ) -> None: 258 | """ 259 | Draw a white rounded rectangle behind the title and subtitle. 260 | 261 | Args: 262 | fig: Matplotlib figure. 263 | ymin: Lower y-bound of text union (figure coords). 264 | ymax: Upper y-bound of text union (figure coords). 265 | box_left: Left x-position of box (figure coords). 266 | box_right: Right x-position of box (figure coords). 267 | box_pad: Padding applied above/below the text union (figure coords). 268 | """ 269 | fig.patches.append( 270 | FancyBboxPatch( 271 | (box_left, ymin - box_pad), 272 | (box_right - box_left), 273 | (ymax - ymin) + 2 * box_pad, 274 | transform=fig.transFigure, 275 | boxstyle="round,pad=0.004,rounding_size=0.01", 276 | facecolor="white", 277 | edgecolor="lightgray", 278 | linewidth=0.8, 279 | alpha=0.95, 280 | zorder=2, 281 | ) 282 | ) 283 | 284 | 285 | def add_title_with_attribution( 286 | fig: plt.Figure, 287 | title: str, 288 | config: TitleBoxConfig = TitleBoxConfig(), 289 | ) -> None: 290 | """ 291 | Add a title and optional attribution above the plot area, automatically 292 | lifting them if they would overlap the axes, and drawing a rounded 293 | background box behind both lines. 294 | """ 295 | if not fig.axes: 296 | return 297 | 298 | axes_top = _axes_top(fig) 299 | subtitle_y = axes_top + config.offsets[0] 300 | title_y = subtitle_y + _line_height(fig, config.fontsizes[1], config.offsets[1]) 301 | 302 | title_txt, attr_txt = _place_texts( 303 | fig, 304 | title, 305 | config.attribution, 306 | title_y=title_y, 307 | subtitle_y=subtitle_y, 308 | title_fontsize=config.fontsizes[0], 309 | subtitle_fontsize=config.fontsizes[1], 310 | ) 311 | 312 | ymin, ymax = _lift_if_needed( 313 | fig, 314 | min_gap=config.gap_and_pad[0], 315 | box_pad=config.gap_and_pad[1], 316 | title_txt=title_txt, 317 | attr_txt=attr_txt, 318 | ) 319 | 320 | # Push axes down so they clear the banner by at least min_gap 321 | header_bottom = ymin - config.gap_and_pad[1] 322 | top_limit = header_bottom - config.gap_and_pad[0] 323 | _reserve_space_above_axes(fig, top_limit) 324 | fig.canvas.draw() 325 | 326 | _draw_background_box( 327 | fig, 328 | ymin=ymin, 329 | ymax=ymax, 330 | box_left=config.box_lr[0], 331 | box_right=config.box_lr[1], 332 | box_pad=config.gap_and_pad[1], 333 | ) 334 | 335 | 336 | def configure_matplotlib_styles() -> None: 337 | """ 338 | Apply consistent style settings across all charts. 
339 | """ 340 | plt.rcParams["figure.figsize"] = (10, 6) 341 | plt.rcParams["axes.labelsize"] = 12 342 | plt.rcParams["axes.titlesize"] = 14 343 | plt.rcParams["legend.fontsize"] = 12 344 | plt.rcParams["axes.grid"] = True 345 | 346 | 347 | def format_pace(value: float, _) -> str: 348 | """ 349 | Convert a time value in seconds into 'minutes:seconds' format. 350 | """ 351 | if not np.isfinite(value): 352 | return "" 353 | minutes = int(value // 60) 354 | seconds = int(value % 60) 355 | return f"{minutes}:{seconds:02d}" 356 | 357 | 358 | def classify_zone_dynamic(heart_rate: float, date_str: str) -> str: 359 | """ 360 | Classify heart rate into a dynamic training zone based on age at the run date. 361 | 362 | Zones are computed from a max HR of (220 - age) on the given date. 363 | """ 364 | try: 365 | run_date = pd.to_datetime(date_str) 366 | except (ValueError, TypeError): 367 | return "Unknown" 368 | 369 | age = run_date.year - DOB.year - ((run_date.month, run_date.day) < (DOB.month, DOB.day)) 370 | max_hr = 220 - age 371 | heart_pct = heart_rate / max_hr 372 | 373 | if heart_pct < 0.60: 374 | return "Z1 (<60%)" 375 | if heart_pct < 0.70: 376 | return "Z2 (60–70%)" 377 | if heart_pct < 0.80: 378 | return "Z3 (70–80%)" 379 | if heart_pct < 0.90: 380 | return "Z4 (80–90%)" 381 | return "Z5 (90–100%)" 382 | 383 | 384 | def prepare_pace_distance_data(splits_df: pd.DataFrame) -> pd.DataFrame: 385 | """ 386 | Aggregate and derive per-run pace metrics from individual split data. 387 | 388 | Adds: 389 | - pace_sec_km: seconds per kilometre for the run 390 | - distance_km, pace_sec, year 391 | """ 392 | splits = splits_df.copy() 393 | splits["pace_sec_km"] = splits["elapsed_time_s"] / (splits["distance_m"] / 1000) 394 | grouped_df = ( 395 | splits.groupby(["activity_id", "start_date_local"]) 396 | .agg({"distance_m": "sum", "elapsed_time_s": "sum"}) 397 | .reset_index() 398 | ) 399 | grouped_df["pace_sec_km"] = grouped_df["elapsed_time_s"] / (grouped_df["distance_m"] / 1000) 400 | grouped_df["distance_km"] = grouped_df["distance_m"] / 1000 401 | grouped_df["pace_sec"] = grouped_df["pace_sec_km"] 402 | grouped_df["year"] = pd.to_datetime(grouped_df["start_date_local"]).dt.year 403 | return grouped_df 404 | 405 | 406 | def prepare_time_distance_data(activities_df: pd.DataFrame) -> pd.DataFrame: 407 | """ 408 | Clean and enrich raw activities data for plotting time vs. distance trends. 409 | 410 | Adds: 411 | - distance_km, time_seconds, year, is_last_run 412 | and filters out very short activities (< 0.5 km). 413 | """ 414 | data = activities_df.copy() 415 | data["distance_km"] = data["distance_m"] / 1000.0 416 | data = data[data["distance_km"] >= 0.5] 417 | data["time_seconds"] = data["moving_time_s"] 418 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 419 | last_run_date = pd.to_datetime(data["start_date_local"]).max() 420 | data["is_last_run"] = pd.to_datetime(data["start_date_local"]) == last_run_date 421 | return data 422 | 423 | 424 | def calculate_decay_point(data: pd.DataFrame) -> Tuple[float, float]: 425 | """ 426 | Compute an extrapolated decay point for visualising projected pacing trends. 
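
    Worked example (illustrative): with a maximum distance of 20 km and a maximum
    time of 7200 s (not necessarily from the same run), the average pace is
    7200 / 20 = 360 s/km, so the decay point is 22 km at 22 * (360 + 180) = 11880 s.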
427 | 428 | Returns: 429 | (decay_distance_km, decay_time_seconds) 430 | """ 431 | max_distance = data["distance_km"].max() 432 | max_time = data["time_seconds"].max() 433 | decay_distance = max_distance + 2 434 | average_pace = max_time / max_distance 435 | decay_time = decay_distance * (average_pace + 180) 436 | return decay_distance, decay_time 437 | 438 | 439 | def seconds_to_hms(value, _): 440 | """ 441 | Convert a numeric value (in seconds) to a HH:MM:SS formatted string. 442 | """ 443 | return str(datetime.timedelta(seconds=int(value))) 444 | 445 | 446 | def save_and_close_plot(output_path: str) -> None: 447 | """ 448 | Common helper to save matplotlib plots without switching layout engines. 449 | """ 450 | fig = plt.gcf() 451 | fig.savefig(output_path, dpi=150, bbox_inches="tight") 452 | plt.close(fig) 453 | 454 | 455 | def extract_year_month(dataframe: pd.DataFrame) -> pd.DataFrame: 456 | """ 457 | Add 'year' and 'month' columns based on 'start_date_local'. 458 | """ 459 | data = dataframe.copy() 460 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 461 | data["month"] = pd.to_datetime(data["start_date_local"]).dt.month 462 | return data 463 | 464 | 465 | def prepare_activities_with_distance(activities_df: pd.DataFrame) -> pd.DataFrame: 466 | """ 467 | Copy and derive 'distance_km', 'year', 'month' from raw activities. 468 | """ 469 | if activities_df.empty: 470 | return pd.DataFrame() 471 | 472 | data = activities_df.copy() 473 | data["distance_km"] = data["distance_m"] / 1000.0 474 | data = extract_year_month(data) 475 | return data 476 | 477 | 478 | def prepare_1km_splits(splits_df: pd.DataFrame) -> pd.DataFrame: 479 | """ 480 | Filter splits to ~1 km and add 'distance_km' and 'year'. 481 | """ 482 | if splits_df.empty: 483 | return pd.DataFrame() 484 | 485 | data = splits_df.copy() 486 | data["distance_km"] = data["distance_m"] / 1000.0 487 | data = data[(data["distance_km"] >= 0.95) & (data["distance_km"] <= 1.05)] 488 | if data.empty: 489 | return pd.DataFrame() 490 | 491 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 492 | return data 493 | 494 | 495 | def plot_with_common_setup( 496 | title: str, 497 | xlabel: str, 498 | ylabel: str, 499 | output_path: str, 500 | plot_func: Callable, 501 | *, 502 | attribution: Optional[str] = "Data sourced from Garmin (synced via Strava)", 503 | figsize: tuple[int, int] = (10, 5), 504 | ): 505 | """ 506 | Reusable wrapper to set up common plot structure and call the provided plot_func. 507 | 508 | Args: 509 | title: Figure title. 510 | xlabel: X-axis label. 511 | ylabel: Y-axis label. 512 | output_path: File path to save the figure. 513 | plot_func: Callable that accepts a Matplotlib axis and draws the plot. 514 | attribution: Optional attribution text for data source. 515 | figsize: Figure size in inches (width, height). 516 | """ 517 | fig, axis = plt.subplots(figsize=figsize, constrained_layout=True) 518 | plot_func(axis) 519 | axis.set_xlabel(xlabel) 520 | axis.set_ylabel(ylabel) 521 | axis.grid(True) 522 | add_title_with_attribution( 523 | fig, 524 | title, 525 | TitleBoxConfig(attribution=attribution), 526 | ) 527 | save_and_close_plot(output_path) 528 | 529 | 530 | def prepare_dated_activities(activities_df: pd.DataFrame) -> pd.DataFrame: 531 | """ 532 | Prepare an activities DataFrame for time series plotting. 533 | 534 | Adds a sorted 'start_date' timestamp column. 
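
    Example (illustrative, not a doctest):

        data = prepare_dated_activities(activities_df)
        # Rows come back sorted chronologically with 'distance_km', 'year',
        # 'month' and a parsed 'start_date' column available for plotting.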
535 | """ 536 | if activities_df.empty: 537 | return pd.DataFrame() 538 | data = prepare_activities_with_distance(activities_df) 539 | data["start_date"] = pd.to_datetime(data["start_date_local"]) 540 | return data.sort_values("start_date") 541 | 542 | 543 | def label_month_axis(axis): 544 | """ 545 | Apply consistent x-axis formatting for month-based plots. 546 | """ 547 | axis.set_xticks(range(1, 13)) 548 | axis.set_xticklabels(calendar.month_abbr[1:13], rotation=45) 549 | 550 | 551 | def label_month_axis_barplot(axis): 552 | """ 553 | Apply consistent x-axis formatting for month-based (bar) plots. 554 | """ 555 | axis.set_xticks(np.arange(12) + 0.5) 556 | axis.set_xticklabels(calendar.month_abbr[1:13], rotation=45) 557 | --------------------------------------------------------------------------------